# HG changeset patch # User kvn # Date 1347935947 25200 # Node ID 137868b7aa6f5a36886537099fccf60221b9a9a5 # Parent 8d3cc6612bd1f72fc2dd9116dd5c6fc7edd915c6 7196199: java/text/Bidi/Bug6665028.java failed: Bidi run count incorrect Summary: Save whole XMM/YMM registers in safepoint interrupt handler. Reviewed-by: roland, twisti diff -r 8d3cc6612bd1 -r 137868b7aa6f src/cpu/sparc/vm/sharedRuntime_sparc.cpp --- a/src/cpu/sparc/vm/sharedRuntime_sparc.cpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/cpu/sparc/vm/sharedRuntime_sparc.cpp Mon Sep 17 19:39:07 2012 -0700 @@ -313,6 +313,14 @@ } +// Is vector's size (in bytes) bigger than a size saved by default? +// 8 bytes FP registers are saved by default on SPARC. +bool SharedRuntime::is_wide_vector(int size) { + // Note, MaxVectorSize == 8 on SPARC. + assert(size <= 8, err_msg_res("%d bytes vectors are not supported", size)); + return size > 8; +} + // The java_calling_convention describes stack locations as ideal slots on // a frame with no abi restrictions. Since we must observe abi restrictions // (like the placement of the register window) the slots must be biased by @@ -3734,7 +3742,7 @@ // the 64-bit %o's, then do a save, then fixup the caller's SP (our FP). // Tricky, tricky, tricky... 
-SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) { +SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) { assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before"); // allocate space for the code @@ -3752,6 +3760,7 @@ int start = __ offset(); + bool cause_return = (poll_type == POLL_AT_RETURN); // If this causes a return before the processing, then do a "restore" if (cause_return) { __ restore(); diff -r 8d3cc6612bd1 -r 137868b7aa6f src/cpu/x86/vm/assembler_x86.cpp --- a/src/cpu/x86/vm/assembler_x86.cpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/cpu/x86/vm/assembler_x86.cpp Mon Sep 17 19:39:07 2012 -0700 @@ -3496,6 +3496,33 @@ emit_byte(0x01); } +void Assembler::vinsertf128h(XMMRegister dst, Address src) { + assert(VM_Version::supports_avx(), ""); + InstructionMark im(this); + bool vector256 = true; + assert(dst != xnoreg, "sanity"); + int dst_enc = dst->encoding(); + // swap src<->dst for encoding + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256); + emit_byte(0x18); + emit_operand(dst, src); + // 0x01 - insert into upper 128 bits + emit_byte(0x01); +} + +void Assembler::vextractf128h(Address dst, XMMRegister src) { + assert(VM_Version::supports_avx(), ""); + InstructionMark im(this); + bool vector256 = true; + assert(src != xnoreg, "sanity"); + int src_enc = src->encoding(); + vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256); + emit_byte(0x19); + emit_operand(src, dst); + // 0x01 - extract from upper 128 bits + emit_byte(0x01); +} + void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx2(), ""); bool vector256 = true; @@ -3507,6 +3534,33 @@ emit_byte(0x01); } +void Assembler::vinserti128h(XMMRegister dst, Address src) { + assert(VM_Version::supports_avx2(), ""); + InstructionMark im(this); + bool vector256 = true; + assert(dst != xnoreg, 
"sanity"); + int dst_enc = dst->encoding(); + // swap src<->dst for encoding + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256); + emit_byte(0x38); + emit_operand(dst, src); + // 0x01 - insert into upper 128 bits + emit_byte(0x01); +} + +void Assembler::vextracti128h(Address dst, XMMRegister src) { + assert(VM_Version::supports_avx2(), ""); + InstructionMark im(this); + bool vector256 = true; + assert(src != xnoreg, "sanity"); + int src_enc = src->encoding(); + vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256); + emit_byte(0x39); + emit_operand(src, dst); + // 0x01 - extract from upper 128 bits + emit_byte(0x01); +} + void Assembler::vzeroupper() { assert(VM_Version::supports_avx(), ""); (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE); @@ -8907,11 +8961,9 @@ pusha(); // if we are coming from c1, xmm registers may be live - if (UseSSE >= 1) { - subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8)); - } int off = 0; if (UseSSE == 1) { + subptr(rsp, sizeof(jdouble)*8); movflt(Address(rsp,off++*sizeof(jdouble)),xmm0); movflt(Address(rsp,off++*sizeof(jdouble)),xmm1); movflt(Address(rsp,off++*sizeof(jdouble)),xmm2); @@ -8921,23 +8973,50 @@ movflt(Address(rsp,off++*sizeof(jdouble)),xmm6); movflt(Address(rsp,off++*sizeof(jdouble)),xmm7); } else if (UseSSE >= 2) { - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7); +#ifdef COMPILER2 + if (MaxVectorSize > 16) { + assert(UseAVX > 0, "256bit vectors are supported only with AVX"); + // Save upper half of YMM registers + subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8)); + vextractf128h(Address(rsp, 0),xmm0); + 
vextractf128h(Address(rsp, 16),xmm1); + vextractf128h(Address(rsp, 32),xmm2); + vextractf128h(Address(rsp, 48),xmm3); + vextractf128h(Address(rsp, 64),xmm4); + vextractf128h(Address(rsp, 80),xmm5); + vextractf128h(Address(rsp, 96),xmm6); + vextractf128h(Address(rsp,112),xmm7); #ifdef _LP64 - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14); - movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15); + vextractf128h(Address(rsp,128),xmm8); + vextractf128h(Address(rsp,144),xmm9); + vextractf128h(Address(rsp,160),xmm10); + vextractf128h(Address(rsp,176),xmm11); + vextractf128h(Address(rsp,192),xmm12); + vextractf128h(Address(rsp,208),xmm13); + vextractf128h(Address(rsp,224),xmm14); + vextractf128h(Address(rsp,240),xmm15); +#endif + } +#endif + // Save whole 128bit (16 bytes) XMM registers + subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8)); + movdqu(Address(rsp,off++*16),xmm0); + movdqu(Address(rsp,off++*16),xmm1); + movdqu(Address(rsp,off++*16),xmm2); + movdqu(Address(rsp,off++*16),xmm3); + movdqu(Address(rsp,off++*16),xmm4); + movdqu(Address(rsp,off++*16),xmm5); + movdqu(Address(rsp,off++*16),xmm6); + movdqu(Address(rsp,off++*16),xmm7); +#ifdef _LP64 + movdqu(Address(rsp,off++*16),xmm8); + movdqu(Address(rsp,off++*16),xmm9); + movdqu(Address(rsp,off++*16),xmm10); + movdqu(Address(rsp,off++*16),xmm11); + movdqu(Address(rsp,off++*16),xmm12); + movdqu(Address(rsp,off++*16),xmm13); + movdqu(Address(rsp,off++*16),xmm14); + movdqu(Address(rsp,off++*16),xmm15); #endif } @@ -9015,28 +9094,52 @@ movflt(xmm5, Address(rsp,off++*sizeof(jdouble))); movflt(xmm6, Address(rsp,off++*sizeof(jdouble))); movflt(xmm7, Address(rsp,off++*sizeof(jdouble))); + addptr(rsp, sizeof(jdouble)*8); } else if 
(UseSSE >= 2) { - movdbl(xmm0, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm1, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm2, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm3, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm4, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm5, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm6, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm7, Address(rsp,off++*sizeof(jdouble))); + // Restore whole 128bit (16 bytes) XMM registers + movdqu(xmm0, Address(rsp,off++*16)); + movdqu(xmm1, Address(rsp,off++*16)); + movdqu(xmm2, Address(rsp,off++*16)); + movdqu(xmm3, Address(rsp,off++*16)); + movdqu(xmm4, Address(rsp,off++*16)); + movdqu(xmm5, Address(rsp,off++*16)); + movdqu(xmm6, Address(rsp,off++*16)); + movdqu(xmm7, Address(rsp,off++*16)); #ifdef _LP64 - movdbl(xmm8, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm9, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm10, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm11, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm12, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm13, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm14, Address(rsp,off++*sizeof(jdouble))); - movdbl(xmm15, Address(rsp,off++*sizeof(jdouble))); + movdqu(xmm8, Address(rsp,off++*16)); + movdqu(xmm9, Address(rsp,off++*16)); + movdqu(xmm10, Address(rsp,off++*16)); + movdqu(xmm11, Address(rsp,off++*16)); + movdqu(xmm12, Address(rsp,off++*16)); + movdqu(xmm13, Address(rsp,off++*16)); + movdqu(xmm14, Address(rsp,off++*16)); + movdqu(xmm15, Address(rsp,off++*16)); #endif - } - if (UseSSE >= 1) { - addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8)); + addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8)); +#ifdef COMPILER2 + if (MaxVectorSize > 16) { + // Restore upper half of YMM registers. 
+ vinsertf128h(xmm0, Address(rsp, 0)); + vinsertf128h(xmm1, Address(rsp, 16)); + vinsertf128h(xmm2, Address(rsp, 32)); + vinsertf128h(xmm3, Address(rsp, 48)); + vinsertf128h(xmm4, Address(rsp, 64)); + vinsertf128h(xmm5, Address(rsp, 80)); + vinsertf128h(xmm6, Address(rsp, 96)); + vinsertf128h(xmm7, Address(rsp,112)); +#ifdef _LP64 + vinsertf128h(xmm8, Address(rsp,128)); + vinsertf128h(xmm9, Address(rsp,144)); + vinsertf128h(xmm10, Address(rsp,160)); + vinsertf128h(xmm11, Address(rsp,176)); + vinsertf128h(xmm12, Address(rsp,192)); + vinsertf128h(xmm13, Address(rsp,208)); + vinsertf128h(xmm14, Address(rsp,224)); + vinsertf128h(xmm15, Address(rsp,240)); +#endif + addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8)); + } +#endif } popa(); } diff -r 8d3cc6612bd1 -r 137868b7aa6f src/cpu/x86/vm/assembler_x86.hpp --- a/src/cpu/x86/vm/assembler_x86.hpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/cpu/x86/vm/assembler_x86.hpp Mon Sep 17 19:39:07 2012 -0700 @@ -1743,6 +1743,12 @@ void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src); void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src); + // Load/store high 128bit of YMM registers which does not destroy other half. + void vinsertf128h(XMMRegister dst, Address src); + void vinserti128h(XMMRegister dst, Address src); + void vextractf128h(Address dst, XMMRegister src); + void vextracti128h(Address dst, XMMRegister src); + // AVX instruction which is used to clear upper 128 bits of YMM registers and // to avoid transaction penalty between AVX and SSE states. 
There is no // penalty if legacy SSE instructions are encoded using VEX prefix because diff -r 8d3cc6612bd1 -r 137868b7aa6f src/cpu/x86/vm/sharedRuntime_x86_32.cpp --- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp Mon Sep 17 19:39:07 2012 -0700 @@ -46,11 +46,11 @@ const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size; class RegisterSaver { - enum { FPU_regs_live = 8 /*for the FPU stack*/+8/*eight more for XMM registers*/ }; // Capture info about frame layout +#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off enum layout { fpu_state_off = 0, - fpu_state_end = fpu_state_off+FPUStateSizeInWords-1, + fpu_state_end = fpu_state_off+FPUStateSizeInWords, st0_off, st0H_off, st1_off, st1H_off, st2_off, st2H_off, @@ -59,16 +59,16 @@ st5_off, st5H_off, st6_off, st6H_off, st7_off, st7H_off, - - xmm0_off, xmm0H_off, - xmm1_off, xmm1H_off, - xmm2_off, xmm2H_off, - xmm3_off, xmm3H_off, - xmm4_off, xmm4H_off, - xmm5_off, xmm5H_off, - xmm6_off, xmm6H_off, - xmm7_off, xmm7H_off, - flags_off, + xmm_off, + DEF_XMM_OFFS(0), + DEF_XMM_OFFS(1), + DEF_XMM_OFFS(2), + DEF_XMM_OFFS(3), + DEF_XMM_OFFS(4), + DEF_XMM_OFFS(5), + DEF_XMM_OFFS(6), + DEF_XMM_OFFS(7), + flags_off = xmm7_off + 16/BytesPerInt + 1, // 16-byte stack alignment fill word rdi_off, rsi_off, ignore_off, // extra copy of rbp, @@ -83,13 +83,13 @@ rbp_off, return_off, // slot for return address reg_save_size }; - + enum { FPU_regs_live = flags_off - fpu_state_end }; public: static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, - int* total_frame_words, bool verify_fpu = true); - static void restore_live_registers(MacroAssembler* masm); + int* total_frame_words, bool verify_fpu = true, bool save_vectors = false); + static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false); static int rax_offset() { return 
rax_off; } static int rbx_offset() { return rbx_off; } @@ -113,9 +113,20 @@ }; OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, - int* total_frame_words, bool verify_fpu) { - - int frame_size_in_bytes = (reg_save_size + additional_frame_words) * wordSize; + int* total_frame_words, bool verify_fpu, bool save_vectors) { + int vect_words = 0; +#ifdef COMPILER2 + if (save_vectors) { + assert(UseAVX > 0, "256bit vectors are supported only with AVX"); + assert(MaxVectorSize == 32, "only 256bit vectors are supported now"); + // Save upper half of YMM registers + vect_words = 8 * 16 / wordSize; + additional_frame_words += vect_words; + } +#else + assert(!save_vectors, "vectors are generated only by C2"); +#endif + int frame_size_in_bytes = (reg_save_size + additional_frame_words) * wordSize; int frame_words = frame_size_in_bytes / wordSize; *total_frame_words = frame_words; @@ -129,7 +140,7 @@ __ enter(); __ pusha(); __ pushf(); - __ subptr(rsp,FPU_regs_live*sizeof(jdouble)); // Push FPU registers space + __ subptr(rsp,FPU_regs_live*wordSize); // Push FPU registers space __ push_FPU_state(); // Save FPU state & init if (verify_fpu) { @@ -183,14 +194,28 @@ __ movflt(Address(rsp,xmm6_off*wordSize),xmm6); __ movflt(Address(rsp,xmm7_off*wordSize),xmm7); } else if( UseSSE >= 2 ) { - __ movdbl(Address(rsp,xmm0_off*wordSize),xmm0); - __ movdbl(Address(rsp,xmm1_off*wordSize),xmm1); - __ movdbl(Address(rsp,xmm2_off*wordSize),xmm2); - __ movdbl(Address(rsp,xmm3_off*wordSize),xmm3); - __ movdbl(Address(rsp,xmm4_off*wordSize),xmm4); - __ movdbl(Address(rsp,xmm5_off*wordSize),xmm5); - __ movdbl(Address(rsp,xmm6_off*wordSize),xmm6); - __ movdbl(Address(rsp,xmm7_off*wordSize),xmm7); + // Save whole 128bit (16 bytes) XMM registers + __ movdqu(Address(rsp,xmm0_off*wordSize),xmm0); + __ movdqu(Address(rsp,xmm1_off*wordSize),xmm1); + __ movdqu(Address(rsp,xmm2_off*wordSize),xmm2); + __ movdqu(Address(rsp,xmm3_off*wordSize),xmm3); + __ 
movdqu(Address(rsp,xmm4_off*wordSize),xmm4); + __ movdqu(Address(rsp,xmm5_off*wordSize),xmm5); + __ movdqu(Address(rsp,xmm6_off*wordSize),xmm6); + __ movdqu(Address(rsp,xmm7_off*wordSize),xmm7); + } + + if (vect_words > 0) { + assert(vect_words*wordSize == 128, ""); + __ subptr(rsp, 128); // Save upper half of YMM registers + __ vextractf128h(Address(rsp, 0),xmm0); + __ vextractf128h(Address(rsp, 16),xmm1); + __ vextractf128h(Address(rsp, 32),xmm2); + __ vextractf128h(Address(rsp, 48),xmm3); + __ vextractf128h(Address(rsp, 64),xmm4); + __ vextractf128h(Address(rsp, 80),xmm5); + __ vextractf128h(Address(rsp, 96),xmm6); + __ vextractf128h(Address(rsp,112),xmm7); } // Set an oopmap for the call site. This oopmap will map all @@ -253,10 +278,20 @@ } -void RegisterSaver::restore_live_registers(MacroAssembler* masm) { - +void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) { // Recover XMM & FPU state - if( UseSSE == 1 ) { + int additional_frame_bytes = 0; +#ifdef COMPILER2 + if (restore_vectors) { + assert(UseAVX > 0, "256bit vectors are supported only with AVX"); + assert(MaxVectorSize == 32, "only 256bit vectors are supported now"); + additional_frame_bytes = 128; + } +#else + assert(!restore_vectors, "vectors are generated only by C2"); +#endif + if (UseSSE == 1) { + assert(additional_frame_bytes == 0, ""); __ movflt(xmm0,Address(rsp,xmm0_off*wordSize)); __ movflt(xmm1,Address(rsp,xmm1_off*wordSize)); __ movflt(xmm2,Address(rsp,xmm2_off*wordSize)); @@ -265,18 +300,33 @@ __ movflt(xmm5,Address(rsp,xmm5_off*wordSize)); __ movflt(xmm6,Address(rsp,xmm6_off*wordSize)); __ movflt(xmm7,Address(rsp,xmm7_off*wordSize)); - } else if( UseSSE >= 2 ) { - __ movdbl(xmm0,Address(rsp,xmm0_off*wordSize)); - __ movdbl(xmm1,Address(rsp,xmm1_off*wordSize)); - __ movdbl(xmm2,Address(rsp,xmm2_off*wordSize)); - __ movdbl(xmm3,Address(rsp,xmm3_off*wordSize)); - __ movdbl(xmm4,Address(rsp,xmm4_off*wordSize)); - __ movdbl(xmm5,Address(rsp,xmm5_off*wordSize)); 
- __ movdbl(xmm6,Address(rsp,xmm6_off*wordSize)); - __ movdbl(xmm7,Address(rsp,xmm7_off*wordSize)); + } else if (UseSSE >= 2) { +#define STACK_ADDRESS(x) Address(rsp,(x)*wordSize + additional_frame_bytes) + __ movdqu(xmm0,STACK_ADDRESS(xmm0_off)); + __ movdqu(xmm1,STACK_ADDRESS(xmm1_off)); + __ movdqu(xmm2,STACK_ADDRESS(xmm2_off)); + __ movdqu(xmm3,STACK_ADDRESS(xmm3_off)); + __ movdqu(xmm4,STACK_ADDRESS(xmm4_off)); + __ movdqu(xmm5,STACK_ADDRESS(xmm5_off)); + __ movdqu(xmm6,STACK_ADDRESS(xmm6_off)); + __ movdqu(xmm7,STACK_ADDRESS(xmm7_off)); +#undef STACK_ADDRESS + } + if (restore_vectors) { + // Restore upper half of YMM registers. + assert(additional_frame_bytes == 128, ""); + __ vinsertf128h(xmm0, Address(rsp, 0)); + __ vinsertf128h(xmm1, Address(rsp, 16)); + __ vinsertf128h(xmm2, Address(rsp, 32)); + __ vinsertf128h(xmm3, Address(rsp, 48)); + __ vinsertf128h(xmm4, Address(rsp, 64)); + __ vinsertf128h(xmm5, Address(rsp, 80)); + __ vinsertf128h(xmm6, Address(rsp, 96)); + __ vinsertf128h(xmm7, Address(rsp,112)); + __ addptr(rsp, additional_frame_bytes); } __ pop_FPU_state(); - __ addptr(rsp, FPU_regs_live*sizeof(jdouble)); // Pop FPU registers + __ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers __ popf(); __ popa(); @@ -308,6 +358,13 @@ __ addptr(rsp, return_off * wordSize); } +// Is vector's size (in bytes) bigger than a size saved by default? +// 16 bytes XMM registers are saved by default using SSE2 movdqu instructions. +// Note, MaxVectorSize == 0 with UseSSE < 2 and vectors are not generated. +bool SharedRuntime::is_wide_vector(int size) { + return size > 16; +} + // The java_calling_convention describes stack locations as ideal slots on // a frame with no abi restrictions. 
Since we must observe abi restrictions // (like the placement of the register window) the slots must be biased by @@ -2732,7 +2789,6 @@ return 0; } - //------------------------------generate_deopt_blob---------------------------- void SharedRuntime::generate_deopt_blob() { // allocate space for the code @@ -3270,7 +3326,7 @@ // setup oopmap, and calls safepoint code to stop the compiled code for // a safepoint. // -SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) { +SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) { // Account for thread arg in our frame const int additional_words = 1; @@ -3290,17 +3346,18 @@ const Register java_thread = rdi; // callee-saved for VC++ address start = __ pc(); address call_pc = NULL; - + bool cause_return = (poll_type == POLL_AT_RETURN); + bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP); // If cause_return is true we are at a poll_return and there is // the return address on the stack to the caller on the nmethod // that is safepoint. We can leave this return on the stack and // effectively complete the return and safepoint in the caller. // Otherwise we push space for a return address that the safepoint // handler will install later to make the stack walking sensible. - if( !cause_return ) - __ push(rbx); // Make room for return address (or push it again) - - map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, false); + if (!cause_return) + __ push(rbx); // Make room for return address (or push it again) + + map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, false, save_vectors); // The following is basically a call_VM. However, we need the precise // address of the call in order to generate an oopmap. Hence, we do all the @@ -3312,7 +3369,7 @@ __ set_last_Java_frame(java_thread, noreg, noreg, NULL); // if this was not a poll_return then we need to correct the return address now. 
- if( !cause_return ) { + if (!cause_return) { __ movptr(rax, Address(java_thread, JavaThread::saved_exception_pc_offset())); __ movptr(Address(rbp, wordSize), rax); } @@ -3340,15 +3397,14 @@ __ jcc(Assembler::equal, noException); // Exception pending - - RegisterSaver::restore_live_registers(masm); + RegisterSaver::restore_live_registers(masm, save_vectors); __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); __ bind(noException); // Normal exit, register restoring and exit - RegisterSaver::restore_live_registers(masm); + RegisterSaver::restore_live_registers(masm, save_vectors); __ ret(0); diff -r 8d3cc6612bd1 -r 137868b7aa6f src/cpu/x86/vm/sharedRuntime_x86_64.cpp --- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Mon Sep 17 19:39:07 2012 -0700 @@ -116,8 +116,8 @@ }; public: - static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words); - static void restore_live_registers(MacroAssembler* masm); + static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false); + static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false); // Offsets into the register save area // Used by deoptimization when it is managing result register @@ -134,7 +134,19 @@ static void restore_result_registers(MacroAssembler* masm); }; -OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words) { +OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) { + int vect_words = 0; +#ifdef COMPILER2 + if (save_vectors) { + assert(UseAVX > 0, "256bit vectors are supported only with AVX"); + assert(MaxVectorSize == 32, "only 256bit vectors are supported now"); + // Save upper half of YMM registers + vect_words = 16 * 16 / wordSize; + 
additional_frame_words += vect_words; + } +#else + assert(!save_vectors, "vectors are generated only by C2"); +#endif // Always make the frame size 16-byte aligned int frame_size_in_bytes = round_to(additional_frame_words*wordSize + @@ -155,6 +167,27 @@ __ enter(); // rsp becomes 16-byte aligned here __ push_CPU_state(); // Push a multiple of 16 bytes + + if (vect_words > 0) { + assert(vect_words*wordSize == 256, ""); + __ subptr(rsp, 256); // Save upper half of YMM registers + __ vextractf128h(Address(rsp, 0),xmm0); + __ vextractf128h(Address(rsp, 16),xmm1); + __ vextractf128h(Address(rsp, 32),xmm2); + __ vextractf128h(Address(rsp, 48),xmm3); + __ vextractf128h(Address(rsp, 64),xmm4); + __ vextractf128h(Address(rsp, 80),xmm5); + __ vextractf128h(Address(rsp, 96),xmm6); + __ vextractf128h(Address(rsp,112),xmm7); + __ vextractf128h(Address(rsp,128),xmm8); + __ vextractf128h(Address(rsp,144),xmm9); + __ vextractf128h(Address(rsp,160),xmm10); + __ vextractf128h(Address(rsp,176),xmm11); + __ vextractf128h(Address(rsp,192),xmm12); + __ vextractf128h(Address(rsp,208),xmm13); + __ vextractf128h(Address(rsp,224),xmm14); + __ vextractf128h(Address(rsp,240),xmm15); + } if (frame::arg_reg_save_area_bytes != 0) { // Allocate argument register save area __ subptr(rsp, frame::arg_reg_save_area_bytes); @@ -167,112 +200,111 @@ OopMapSet *oop_maps = new OopMapSet(); OopMap* map = new OopMap(frame_size_in_slots, 0); - map->set_callee_saved(VMRegImpl::stack2reg( rax_off + additional_frame_slots), rax->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( rcx_off + additional_frame_slots), rcx->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( rdx_off + additional_frame_slots), rdx->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( rbx_off + additional_frame_slots), rbx->as_VMReg()); + +#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots) + + map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg()); + 
map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg()); // rbp location is known implicitly by the frame sender code, needs no oopmap // and the location where rbp was saved by is ignored - map->set_callee_saved(VMRegImpl::stack2reg( rsi_off + additional_frame_slots), rsi->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( rdi_off + additional_frame_slots), rdi->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( r8_off + additional_frame_slots), r8->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( r9_off + additional_frame_slots), r9->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( r10_off + additional_frame_slots), r10->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( r11_off + additional_frame_slots), r11->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( r12_off + additional_frame_slots), r12->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( r13_off + additional_frame_slots), r13->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( r14_off + additional_frame_slots), r14->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg( r15_off + additional_frame_slots), r15->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm0_off + additional_frame_slots), xmm0->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm1_off + additional_frame_slots), xmm1->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm2_off + additional_frame_slots), xmm2->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm3_off + additional_frame_slots), xmm3->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm4_off + additional_frame_slots), xmm4->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm5_off + additional_frame_slots), xmm5->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm6_off + additional_frame_slots), 
xmm6->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm7_off + additional_frame_slots), xmm7->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm8_off + additional_frame_slots), xmm8->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm9_off + additional_frame_slots), xmm9->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm10_off + additional_frame_slots), xmm10->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm11_off + additional_frame_slots), xmm11->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm12_off + additional_frame_slots), xmm12->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm13_off + additional_frame_slots), xmm13->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm14_off + additional_frame_slots), xmm14->as_VMReg()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm15_off + additional_frame_slots), xmm15->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); + map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm0_off ), xmm0->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm1_off ), xmm1->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm2_off ), xmm2->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm3_off ), xmm3->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm4_off ), xmm4->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm5_off ), 
xmm5->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm6_off ), xmm6->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm7_off ), xmm7->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm8_off ), xmm8->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm9_off ), xmm9->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm10_off), xmm10->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm11_off), xmm11->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm12_off), xmm12->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg()); // %%% These should all be a waste but we'll keep things as they were for now if (true) { - map->set_callee_saved(VMRegImpl::stack2reg( raxH_off + additional_frame_slots), - rax->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( rcxH_off + additional_frame_slots), - rcx->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( rdxH_off + additional_frame_slots), - rdx->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( rbxH_off + additional_frame_slots), - rbx->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next()); // rbp location is known implicitly by the frame sender code, needs no oopmap - map->set_callee_saved(VMRegImpl::stack2reg( rsiH_off + additional_frame_slots), - rsi->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( rdiH_off + additional_frame_slots), - rdi->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( r8H_off + additional_frame_slots), - r8->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( r9H_off 
+ additional_frame_slots), - r9->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( r10H_off + additional_frame_slots), - r10->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( r11H_off + additional_frame_slots), - r11->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( r12H_off + additional_frame_slots), - r12->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( r13H_off + additional_frame_slots), - r13->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( r14H_off + additional_frame_slots), - r14->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg( r15H_off + additional_frame_slots), - r15->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm0H_off + additional_frame_slots), - xmm0->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm1H_off + additional_frame_slots), - xmm1->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm2H_off + additional_frame_slots), - xmm2->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm3H_off + additional_frame_slots), - xmm3->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm4H_off + additional_frame_slots), - xmm4->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm5H_off + additional_frame_slots), - xmm5->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm6H_off + additional_frame_slots), - xmm6->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm7H_off + additional_frame_slots), - xmm7->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm8H_off + additional_frame_slots), - xmm8->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm9H_off + additional_frame_slots), - xmm9->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm10H_off + additional_frame_slots), - xmm10->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm11H_off + 
additional_frame_slots), - xmm11->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm12H_off + additional_frame_slots), - xmm12->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm13H_off + additional_frame_slots), - xmm13->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm14H_off + additional_frame_slots), - xmm14->as_VMReg()->next()); - map->set_callee_saved(VMRegImpl::stack2reg(xmm15H_off + additional_frame_slots), - xmm15->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm0H_off ), xmm0->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm1H_off ), xmm1->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm2H_off ), xmm2->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm3H_off ), xmm3->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm4H_off ), xmm4->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm5H_off ), xmm5->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm6H_off ), xmm6->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm7H_off ), xmm7->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm8H_off ), xmm8->as_VMReg()->next()); + 
map->set_callee_saved(STACK_OFFSET(xmm9H_off ), xmm9->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm10H_off), xmm10->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm11H_off), xmm11->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm12H_off), xmm12->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next()); + map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next()); } return map; } -void RegisterSaver::restore_live_registers(MacroAssembler* masm) { +void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) { if (frame::arg_reg_save_area_bytes != 0) { // Pop arg register save area __ addptr(rsp, frame::arg_reg_save_area_bytes); } +#ifdef COMPILER2 + if (restore_vectors) { + // Restore upper half of YMM registers. + assert(UseAVX > 0, "256bit vectors are supported only with AVX"); + assert(MaxVectorSize == 32, "only 256bit vectors are supported now"); + __ vinsertf128h(xmm0, Address(rsp, 0)); + __ vinsertf128h(xmm1, Address(rsp, 16)); + __ vinsertf128h(xmm2, Address(rsp, 32)); + __ vinsertf128h(xmm3, Address(rsp, 48)); + __ vinsertf128h(xmm4, Address(rsp, 64)); + __ vinsertf128h(xmm5, Address(rsp, 80)); + __ vinsertf128h(xmm6, Address(rsp, 96)); + __ vinsertf128h(xmm7, Address(rsp,112)); + __ vinsertf128h(xmm8, Address(rsp,128)); + __ vinsertf128h(xmm9, Address(rsp,144)); + __ vinsertf128h(xmm10, Address(rsp,160)); + __ vinsertf128h(xmm11, Address(rsp,176)); + __ vinsertf128h(xmm12, Address(rsp,192)); + __ vinsertf128h(xmm13, Address(rsp,208)); + __ vinsertf128h(xmm14, Address(rsp,224)); + __ vinsertf128h(xmm15, Address(rsp,240)); + __ addptr(rsp, 256); + } +#else + assert(!restore_vectors, "vectors are generated only by C2"); +#endif // Recover CPU state __ pop_CPU_state(); // Get the rbp described implicitly by the calling convention (no oopMap) @@ -297,6 +329,12 @@
__ addptr(rsp, return_offset_in_bytes()); } +// Is vector's size (in bytes) bigger than a size saved by default? +// 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions. +bool SharedRuntime::is_wide_vector(int size) { + return size > 16; +} + // The java_calling_convention describes stack locations as ideal slots on // a frame with no abi restrictions. Since we must observe abi restrictions // (like the placement of the register window) the slots must be biased by @@ -3235,7 +3273,6 @@ return 0; } - //------------------------------generate_deopt_blob---------------------------- void SharedRuntime::generate_deopt_blob() { // Allocate space for the code @@ -3740,7 +3777,7 @@ // Generate a special Compile2Runtime blob that saves all registers, // and setup oopmap. // -SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) { +SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) { assert(StubRoutines::forward_exception_entry() != NULL, "must be generated before"); @@ -3755,6 +3792,8 @@ address start = __ pc(); address call_pc = NULL; int frame_size_in_words; + bool cause_return = (poll_type == POLL_AT_RETURN); + bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP); // Make room for return address (or push it again) if (!cause_return) { @@ -3762,7 +3801,7 @@ } // Save registers, fpu state, and flags - map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words); + map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors); // The following is basically a call_VM. However, we need the precise // address of the call in order to generate an oopmap. 
Hence, we do all the @@ -3799,7 +3838,7 @@ // Exception pending - RegisterSaver::restore_live_registers(masm); + RegisterSaver::restore_live_registers(masm, save_vectors); __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); @@ -3807,7 +3846,7 @@ __ bind(noException); // Normal exit, restore registers and exit. - RegisterSaver::restore_live_registers(masm); + RegisterSaver::restore_live_registers(masm, save_vectors); __ ret(0); diff -r 8d3cc6612bd1 -r 137868b7aa6f src/cpu/x86/vm/x86.ad --- a/src/cpu/x86/vm/x86.ad Mon Sep 17 17:02:10 2012 -0700 +++ b/src/cpu/x86/vm/x86.ad Mon Sep 17 19:39:07 2012 -0700 @@ -498,6 +498,7 @@ case Op_PopCountL: if (!UsePopCountInstruction) return false; + break; case Op_MulVI: if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX return false; diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/c1/c1_Compilation.cpp --- a/src/share/vm/c1/c1_Compilation.cpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/c1/c1_Compilation.cpp Mon Sep 17 19:39:07 2012 -0700 @@ -346,7 +346,8 @@ implicit_exception_table(), compiler(), _env->comp_level(), - has_unsafe_access() + has_unsafe_access(), + SharedRuntime::is_wide_vector(max_vector_size()) ); } diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/c1/c1_Compilation.hpp --- a/src/share/vm/c1/c1_Compilation.hpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/c1/c1_Compilation.hpp Mon Sep 17 19:39:07 2012 -0700 @@ -127,6 +127,7 @@ bool has_exception_handlers() const { return _has_exception_handlers; } bool has_fpu_code() const { return _has_fpu_code; } bool has_unsafe_access() const { return _has_unsafe_access; } + int max_vector_size() const { return 0; } ciMethod* method() const { return _method; } int osr_bci() const { return _osr_bci; } bool is_osr_compile() const { return osr_bci() >= 0; } diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/ci/ciEnv.cpp --- a/src/share/vm/ci/ciEnv.cpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/ci/ciEnv.cpp Mon Sep 17 19:39:07 2012 -0700 @@ 
-921,7 +921,8 @@ ImplicitExceptionTable* inc_table, AbstractCompiler* compiler, int comp_level, - bool has_unsafe_access) { + bool has_unsafe_access, + bool has_wide_vectors) { VM_ENTRY_MARK; nmethod* nm = NULL; { @@ -1016,6 +1017,7 @@ } } else { nm->set_has_unsafe_access(has_unsafe_access); + nm->set_has_wide_vectors(has_wide_vectors); // Record successful registration. // (Put nm into the task handle *before* publishing to the Java heap.) diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/ci/ciEnv.hpp --- a/src/share/vm/ci/ciEnv.hpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/ci/ciEnv.hpp Mon Sep 17 19:39:07 2012 -0700 @@ -362,7 +362,8 @@ ImplicitExceptionTable* inc_table, AbstractCompiler* compiler, int comp_level, - bool has_unsafe_access); + bool has_unsafe_access, + bool has_wide_vectors); // Access to certain well known ciObjects. diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/code/nmethod.cpp --- a/src/share/vm/code/nmethod.cpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/code/nmethod.cpp Mon Sep 17 19:39:07 2012 -0700 @@ -463,6 +463,7 @@ _has_unsafe_access = 0; _has_method_handle_invokes = 0; _lazy_critical_native = 0; + _has_wide_vectors = 0; _marked_for_deoptimization = 0; _lock_count = 0; _stack_traversal_mark = 0; diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/code/nmethod.hpp --- a/src/share/vm/code/nmethod.hpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/code/nmethod.hpp Mon Sep 17 19:39:07 2012 -0700 @@ -177,6 +177,7 @@ unsigned int _has_unsafe_access:1; // May fault due to unsafe access. unsigned int _has_method_handle_invokes:1; // Has this method MethodHandle invokes? 
unsigned int _lazy_critical_native:1; // Lazy JNI critical native + unsigned int _has_wide_vectors:1; // Preserve wide vectors at safepoints // Protected by Patching_lock unsigned char _state; // {alive, not_entrant, zombie, unloaded} @@ -442,6 +443,9 @@ bool is_lazy_critical_native() const { return _lazy_critical_native; } void set_lazy_critical_native(bool z) { _lazy_critical_native = z; } + bool has_wide_vectors() const { return _has_wide_vectors; } + void set_has_wide_vectors(bool z) { _has_wide_vectors = z; } + int comp_level() const { return _comp_level; } // Support for oops in scopes and relocs: diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/opto/compile.cpp --- a/src/share/vm/opto/compile.cpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/opto/compile.cpp Mon Sep 17 19:39:07 2012 -0700 @@ -825,7 +825,8 @@ &_handler_table, &_inc_table, compiler, env()->comp_level(), - has_unsafe_access() + has_unsafe_access(), + SharedRuntime::is_wide_vector(max_vector_size()) ); } } @@ -963,6 +964,7 @@ _trap_can_recompile = false; // no traps emitted yet _major_progress = true; // start out assuming good things will happen set_has_unsafe_access(false); + set_max_vector_size(0); Copy::zero_to_bytes(_trap_hist, sizeof(_trap_hist)); set_decompile_count(0); diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/opto/compile.hpp --- a/src/share/vm/opto/compile.hpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/opto/compile.hpp Mon Sep 17 19:39:07 2012 -0700 @@ -279,6 +279,7 @@ bool _has_split_ifs; // True if the method _may_ have some split-if bool _has_unsafe_access; // True if the method _may_ produce faults in unsafe loads or stores. bool _has_stringbuilder; // True StringBuffers or StringBuilders are allocated + int _max_vector_size; // Maximum size of generated vectors uint _trap_hist[trapHistLength]; // Cumulative traps bool _trap_can_recompile; // Have we emitted a recompiling trap? uint _decompile_count; // Cumulative decompilation counts. 
@@ -443,6 +444,8 @@ void set_has_unsafe_access(bool z) { _has_unsafe_access = z; } bool has_stringbuilder() const { return _has_stringbuilder; } void set_has_stringbuilder(bool z) { _has_stringbuilder = z; } + int max_vector_size() const { return _max_vector_size; } + void set_max_vector_size(int s) { _max_vector_size = s; } void set_trap_count(uint r, uint c) { assert(r < trapHistLength, "oob"); _trap_hist[r] = c; } uint trap_count(uint r) const { assert(r < trapHistLength, "oob"); return _trap_hist[r]; } bool trap_can_recompile() const { return _trap_can_recompile; } diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/opto/output.cpp --- a/src/share/vm/opto/output.cpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/opto/output.cpp Mon Sep 17 19:39:07 2012 -0700 @@ -1869,7 +1869,9 @@ if (!do_scheduling()) return; - assert(MaxVectorSize <= 8, "scheduling code works only with pairs"); + // Scheduling code works only with pairs (8 bytes) maximum. + if (max_vector_size() > 8) + return; NOT_PRODUCT( TracePhase t2("isched", &_t_instrSched, TimeCompiler); ) diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/opto/superword.cpp --- a/src/share/vm/opto/superword.cpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/opto/superword.cpp Mon Sep 17 19:39:07 2012 -0700 @@ -1350,11 +1350,14 @@ insert_extracts(_packset.at(i)); } + Compile* C = _phase->C; + uint max_vlen_in_bytes = 0; for (int i = 0; i < _block.length(); i++) { Node* n = _block.at(i); Node_List* p = my_pack(n); if (p && n == executed_last(p)) { uint vlen = p->size(); + uint vlen_in_bytes = 0; Node* vn = NULL; Node* low_adr = p->at(0); Node* first = executed_first(p); @@ -1364,7 +1367,8 @@ Node* mem = first->in(MemNode::Memory); Node* adr = low_adr->in(MemNode::Address); const TypePtr* atyp = n->adr_type(); - vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n)); + vn = LoadVectorNode::make(C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n)); + vlen_in_bytes = 
vn->as_LoadVector()->memory_size(); } else if (n->is_Store()) { // Promote value to be stored to vector Node* val = vector_opd(p, MemNode::ValueIn); @@ -1372,7 +1376,8 @@ Node* mem = first->in(MemNode::Memory); Node* adr = low_adr->in(MemNode::Address); const TypePtr* atyp = n->adr_type(); - vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen); + vn = StoreVectorNode::make(C, opc, ctl, mem, adr, atyp, val, vlen); + vlen_in_bytes = vn->as_StoreVector()->memory_size(); } else if (n->req() == 3) { // Promote operands to vector Node* in1 = vector_opd(p, 1); @@ -1383,7 +1388,8 @@ in1 = in2; in2 = tmp; } - vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n)); + vn = VectorNode::make(C, opc, in1, in2, vlen, velt_basic_type(n)); + vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } else { ShouldNotReachHere(); } @@ -1395,6 +1401,10 @@ _igvn.replace_node(pm, vn); } _igvn._worklist.push(vn); + + if (vlen_in_bytes > max_vlen_in_bytes) { + max_vlen_in_bytes = vlen_in_bytes; + } #ifdef ASSERT if (TraceNewVectors) { tty->print("new Vector node: "); @@ -1403,6 +1413,7 @@ #endif } } + C->set_max_vector_size(max_vlen_in_bytes); } //------------------------------vector_opd--------------------------- @@ -1439,7 +1450,7 @@ } assert(opd->bottom_type()->isa_int(), "int type only"); // Move non constant shift count into XMM register. 
- cnt = new (_phase->C, 2) MoveI2FNode(cnt); + cnt = new (C, 2) MoveI2FNode(cnt); } if (cnt != opd) { _phase->_igvn.register_new_node_with_optimizer(cnt); @@ -1480,10 +1491,10 @@ _phase->_igvn.register_new_node_with_optimizer(pk); _phase->set_ctrl(pk, _phase->get_ctrl(opd)); #ifdef ASSERT - if (TraceNewVectors) { - tty->print("new Vector node: "); - pk->dump(); - } + if (TraceNewVectors) { + tty->print("new Vector node: "); + pk->dump(); + } #endif return pk; } diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/runtime/sharedRuntime.cpp --- a/src/share/vm/runtime/sharedRuntime.cpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/runtime/sharedRuntime.cpp Mon Sep 17 19:39:07 2012 -0700 @@ -88,6 +88,7 @@ RuntimeStub* SharedRuntime::_resolve_static_call_blob; DeoptimizationBlob* SharedRuntime::_deopt_blob; +SafepointBlob* SharedRuntime::_polling_page_vectors_safepoint_handler_blob; SafepointBlob* SharedRuntime::_polling_page_safepoint_handler_blob; SafepointBlob* SharedRuntime::_polling_page_return_handler_blob; @@ -104,8 +105,14 @@ _resolve_virtual_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_virtual_call_C), "resolve_virtual_call"); _resolve_static_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_static_call_C), "resolve_static_call"); - _polling_page_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), false); - _polling_page_return_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), true); +#ifdef COMPILER2 + // Vectors are generated only by C2. 
+ if (is_wide_vector(MaxVectorSize)) { + _polling_page_vectors_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_VECTOR_LOOP); + } +#endif // COMPILER2 + _polling_page_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_LOOP); + _polling_page_return_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_RETURN); generate_deopt_blob(); @@ -535,10 +542,15 @@ "Only polling locations are used for safepoint"); bool at_poll_return = ((nmethod*)cb)->is_at_poll_return(pc); + bool has_wide_vectors = ((nmethod*)cb)->has_wide_vectors(); if (at_poll_return) { assert(SharedRuntime::polling_page_return_handler_blob() != NULL, "polling page return stub not created yet"); stub = SharedRuntime::polling_page_return_handler_blob()->entry_point(); + } else if (has_wide_vectors) { + assert(SharedRuntime::polling_page_vectors_safepoint_handler_blob() != NULL, + "polling page vectors safepoint stub not created yet"); + stub = SharedRuntime::polling_page_vectors_safepoint_handler_blob()->entry_point(); } else { assert(SharedRuntime::polling_page_safepoint_handler_blob() != NULL, "polling page safepoint stub not created yet"); diff -r 8d3cc6612bd1 -r 137868b7aa6f src/share/vm/runtime/sharedRuntime.hpp --- a/src/share/vm/runtime/sharedRuntime.hpp Mon Sep 17 17:02:10 2012 -0700 +++ b/src/share/vm/runtime/sharedRuntime.hpp Mon Sep 17 19:39:07 2012 -0700 @@ -62,6 +62,7 @@ static DeoptimizationBlob* _deopt_blob; + static SafepointBlob* _polling_page_vectors_safepoint_handler_blob; static SafepointBlob* _polling_page_safepoint_handler_blob; static SafepointBlob* _polling_page_return_handler_blob; @@ -75,7 +76,8 @@ #endif // !PRODUCT private: - static SafepointBlob* generate_handler_blob(address call_ptr, bool cause_return); + enum { POLL_AT_RETURN, POLL_AT_LOOP, 
POLL_AT_VECTOR_LOOP }; + static SafepointBlob* generate_handler_blob(address call_ptr, int poll_type); static RuntimeStub* generate_resolve_blob(address destination, const char* name); public: @@ -223,6 +225,7 @@ static SafepointBlob* polling_page_return_handler_blob() { return _polling_page_return_handler_blob; } static SafepointBlob* polling_page_safepoint_handler_blob() { return _polling_page_safepoint_handler_blob; } + static SafepointBlob* polling_page_vectors_safepoint_handler_blob() { return _polling_page_vectors_safepoint_handler_blob; } // Counters #ifndef PRODUCT @@ -416,6 +419,10 @@ // when an interrupt occurs. static uint out_preserve_stack_slots(); + // Is vector's size (in bytes) bigger than a size saved by default? + // For example, on x86 16 bytes XMM registers are saved by default. + static bool is_wide_vector(int size); + // Save and restore a native result static void save_native_result(MacroAssembler *_masm, BasicType ret_type, int frame_slots ); static void restore_native_result(MacroAssembler *_masm, BasicType ret_type, int frame_slots ); diff -r 8d3cc6612bd1 -r 137868b7aa6f test/compiler/7196199/Test7196199.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/compiler/7196199/Test7196199.java Mon Sep 17 19:39:07 2012 -0700 @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +/** + * @test + * @bug 7196199 + * @summary java/text/Bidi/Bug6665028.java failed: Bidi run count incorrect + * + * @run main/othervm/timeout=400 -Xmx32m -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:CompileCommand=exclude,Test7196199.test -XX:+SafepointALot -XX:GuaranteedSafepointInterval=100 Test7196199 + */ + + +public class Test7196199 { + private static final int ARRLEN = 97; + private static final int ITERS = 5000; + private static final int INI_ITERS = 1000; + private static final int SFP_ITERS = 10000; + private static final float SFP_ITERS_F = 10000.f; + private static final float VALUE = 15.f; + public static void main(String args[]) { + int errn = test(); + if (errn > 0) { + System.err.println("FAILED: " + errn + " errors"); + System.exit(97); + } + System.out.println("PASSED"); + } + + static int test() { + float[] a0 = new float[ARRLEN]; + float[] a1 = new float[ARRLEN]; + // Initialize + for (int i=0; i 0) + return errn; + + System.out.println("Time"); + long start, end; + + start = System.currentTimeMillis(); + for (int i=0; i