# HG changeset patch # User kvn # Date 1357254595 28800 # Node ID 00af3a3a8df4d5afdf21209da0176bfc390dd070 # Parent d092d1b3122937df88d43a46b0c2c5fff1bbabb8 8005522: use fast-string instructions on x86 for zeroing Summary: use 'rep stosb' instead of 'rep stosq' when fast-string operations are available. Reviewed-by: twisti, roland diff -r d092d1b31229 -r 00af3a3a8df4 src/cpu/x86/vm/assembler_x86.cpp --- a/src/cpu/x86/vm/assembler_x86.cpp Sun Dec 23 17:08:22 2012 +0100 +++ b/src/cpu/x86/vm/assembler_x86.cpp Thu Jan 03 15:09:55 2013 -0800 @@ -2544,12 +2544,18 @@ emit_int8((unsigned char)0xA5); } +// sets rcx bytes with rax, value at [edi] +void Assembler::rep_stosb() { + emit_int8((unsigned char)0xF3); // REP + LP64_ONLY(prefix(REX_W)); + emit_int8((unsigned char)0xAA); // STOSB +} + // sets rcx pointer sized words with rax, value at [edi] // generic -void Assembler::rep_set() { // rep_set - emit_int8((unsigned char)0xF3); - // STOSQ - LP64_ONLY(prefix(REX_W)); +void Assembler::rep_stos() { + emit_int8((unsigned char)0xF3); // REP + LP64_ONLY(prefix(REX_W)); // LP64:STOSQ, LP32:STOSD emit_int8((unsigned char)0xAB); } diff -r d092d1b31229 -r 00af3a3a8df4 src/cpu/x86/vm/assembler_x86.hpp --- a/src/cpu/x86/vm/assembler_x86.hpp Sun Dec 23 17:08:22 2012 +0100 +++ b/src/cpu/x86/vm/assembler_x86.hpp Thu Jan 03 15:09:55 2013 -0800 @@ -832,7 +832,8 @@ // These do register sized moves/scans void rep_mov(); - void rep_set(); + void rep_stos(); + void rep_stosb(); void repne_scan(); #ifdef _LP64 void repne_scanl(); diff -r d092d1b31229 -r 00af3a3a8df4 src/cpu/x86/vm/globals_x86.hpp --- a/src/cpu/x86/vm/globals_x86.hpp Sun Dec 23 17:08:22 2012 +0100 +++ b/src/cpu/x86/vm/globals_x86.hpp Thu Jan 03 15:09:55 2013 -0800 @@ -120,6 +120,9 @@ product(bool, UseUnalignedLoadStores, false, \ "Use SSE2 MOVDQU instruction for Arraycopy") \ \ + product(bool, UseFastStosb, false, \ + "Use fast-string operation for zeroing: rep stosb") \ + \ /* assembler */ \ product(bool, Use486InstrsOnly, false, \ "Use 80486 Compliant instruction subset") \ diff -r d092d1b31229 -r 00af3a3a8df4 src/cpu/x86/vm/macroAssembler_x86.cpp --- a/src/cpu/x86/vm/macroAssembler_x86.cpp Sun Dec 23 17:08:22 2012 +0100 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp Thu Jan 03 15:09:55 2013 -0800 @@ -5224,6 +5224,22 @@ } +void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) { + // cnt - number of qwords (8-byte words). + // base - start address, qword aligned. + assert(base==rdi, "base register must be edi for rep stos"); + assert(tmp==rax, "tmp register must be eax for rep stos"); + assert(cnt==rcx, "cnt register must be ecx for rep stos"); + + xorptr(tmp, tmp); + if (UseFastStosb) { + shlptr(cnt,3); // convert to number of bytes + rep_stosb(); + } else { + NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM + rep_stos(); + } +} // IndexOf for constant substrings with size >= 8 chars // which don't need to be loaded through stack. diff -r d092d1b31229 -r 00af3a3a8df4 src/cpu/x86/vm/macroAssembler_x86.hpp --- a/src/cpu/x86/vm/macroAssembler_x86.hpp Sun Dec 23 17:08:22 2012 +0100 +++ b/src/cpu/x86/vm/macroAssembler_x86.hpp Thu Jan 03 15:09:55 2013 -0800 @@ -1096,6 +1096,9 @@ // C2 compiled method's prolog code. void verified_entry(int framesize, bool stack_bang, bool fp_mode_24b); + // clear memory of size 'cnt' qwords, starting at 'base'. + void clear_mem(Register base, Register cnt, Register rtmp); + // IndexOf strings. // Small strings are loaded through stack if they cross page boundary. void string_indexof(Register str1, Register str2, diff -r d092d1b31229 -r 00af3a3a8df4 src/cpu/x86/vm/vm_version_x86.cpp --- a/src/cpu/x86/vm/vm_version_x86.cpp Sun Dec 23 17:08:22 2012 +0100 +++ b/src/cpu/x86/vm/vm_version_x86.cpp Thu Jan 03 15:09:55 2013 -0800 @@ -429,7 +429,7 @@ } char buf[256]; - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", cores_per_cpu(), threads_per_core(), cpu_family(), _model, _stepping, (supports_cmov() ? ", cmov" : ""), @@ -446,6 +446,7 @@ (supports_avx() ? ", avx" : ""), (supports_avx2() ? ", avx2" : ""), (supports_aes() ? ", aes" : ""), + (supports_erms() ? ", erms" : ""), (supports_mmx_ext() ? ", mmxext" : ""), (supports_3dnow_prefetch() ? ", 3dnowpref" : ""), (supports_lzcnt() ? ", lzcnt": ""), @@ -671,6 +672,16 @@ FLAG_SET_DEFAULT(UsePopCountInstruction, false); } + // Use fast-string operations if available. + if (supports_erms()) { + if (FLAG_IS_DEFAULT(UseFastStosb)) { + UseFastStosb = true; + } + } else if (UseFastStosb) { + warning("fast-string operations are not available on this CPU"); + FLAG_SET_DEFAULT(UseFastStosb, false); + } + #ifdef COMPILER2 if (FLAG_IS_DEFAULT(AlignVector)) { // Modern processors allow misaligned memory operations for vectors. diff -r d092d1b31229 -r 00af3a3a8df4 src/cpu/x86/vm/vm_version_x86.hpp --- a/src/cpu/x86/vm/vm_version_x86.hpp Sun Dec 23 17:08:22 2012 +0100 +++ b/src/cpu/x86/vm/vm_version_x86.hpp Thu Jan 03 15:09:55 2013 -0800 @@ -204,7 +204,8 @@ avx2 : 1, : 2, bmi2 : 1, - : 23; + erms : 1, + : 22; } bits; }; @@ -247,7 +248,8 @@ CPU_TSCINV = (1 << 16), CPU_AVX = (1 << 17), CPU_AVX2 = (1 << 18), - CPU_AES = (1 << 19) + CPU_AES = (1 << 19), + CPU_ERMS = (1 << 20) // enhanced 'rep movsb/stosb' instructions } cpuFeatureFlags; enum { @@ -425,6 +427,8 @@ result |= CPU_TSCINV; if (_cpuid_info.std_cpuid1_ecx.bits.aes != 0) result |= CPU_AES; + if (_cpuid_info.sef_cpuid7_ebx.bits.erms != 0) + result |= CPU_ERMS; // AMD features. if (is_amd()) { @@ -489,7 +493,7 @@ return (_cpuid_info.std_max_function >= 0xB) && // eax[4:0] | ebx[0:15] == 0 indicates invalid topology level. // Some cpus have max cpuid >= 0xB but do not support processor topology. - ((_cpuid_info.tpl_cpuidB0_eax & 0x1f | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus) != 0); + (((_cpuid_info.tpl_cpuidB0_eax & 0x1f) | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus) != 0); } static uint cores_per_cpu() { @@ -550,6 +554,7 @@ static bool supports_avx2() { return (_cpuFeatures & CPU_AVX2) != 0; } static bool supports_tsc() { return (_cpuFeatures & CPU_TSC) != 0; } static bool supports_aes() { return (_cpuFeatures & CPU_AES) != 0; } + static bool supports_erms() { return (_cpuFeatures & CPU_ERMS) != 0; } // Intel features static bool is_intel_family_core() { return is_intel() && diff -r d092d1b31229 -r 00af3a3a8df4 src/cpu/x86/vm/x86_32.ad --- a/src/cpu/x86/vm/x86_32.ad Sun Dec 23 17:08:22 2012 +0100 +++ b/src/cpu/x86/vm/x86_32.ad Thu Jan 03 15:09:55 2013 -0800 @@ -11572,15 +11572,28 @@ // ======================================================================= // fast clearing of an array instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ + predicate(!UseFastStosb); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); - format %{ "SHL ECX,1\t# Convert doublewords to words\n\t" - "XOR EAX,EAX\n\t" + format %{ "XOR EAX,EAX\t# ClearArray:\n\t" + "SHL ECX,1\t# Convert doublewords to words\n\t" "REP STOS\t# store EAX into [EDI++] while ECX--" %} - opcode(0,0x4); - ins_encode( Opcode(0xD1), RegOpc(ECX), - OpcRegReg(0x33,EAX,EAX), - Opcode(0xF3), Opcode(0xAB) ); + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ + predicate(UseFastStosb); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + format %{ "XOR EAX,EAX\t# ClearArray:\n\t" + "SHL ECX,3\t# Convert doublewords to bytes\n\t" + "REP STOSB\t# store EAX into [EDI++] while ECX--" %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); + %} ins_pipe( pipe_slow ); %} diff -r d092d1b31229 -r 00af3a3a8df4 src/cpu/x86/vm/x86_64.ad --- a/src/cpu/x86/vm/x86_64.ad Sun Dec 23 17:08:22 2012 +0100 +++ b/src/cpu/x86/vm/x86_64.ad Thu Jan 03 15:09:55 2013 -0800 @@ -10374,16 +10374,33 @@ instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, rFlagsReg cr) %{ + predicate(!UseFastStosb); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); - format %{ "xorl rax, rax\t# ClearArray:\n\t" - "rep stosq\t# Store rax to *rdi++ while rcx--" %} - ins_encode(opc_reg_reg(0x33, RAX, RAX), // xorl %eax, %eax - Opcode(0xF3), Opcode(0x48), Opcode(0xAB)); // rep REX_W stos + format %{ "xorq rax, rax\t# ClearArray:\n\t" + "rep stosq\t# Store rax to *rdi++ while rcx--" %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); + %} ins_pipe(pipe_slow); %} +instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, + rFlagsReg cr) +%{ + predicate(UseFastStosb); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + format %{ "xorq rax, rax\t# ClearArray:\n\t" + "shlq rcx,3\t# Convert doublewords to bytes\n\t" + "rep stosb\t# Store rax to *rdi++ while rcx--" %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); + %} + ins_pipe( pipe_slow ); +%} + instruct string_compare(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2, rax_RegI result, regD tmp1, rFlagsReg cr) %{ diff -r d092d1b31229 -r 00af3a3a8df4 src/share/vm/opto/memnode.cpp --- a/src/share/vm/opto/memnode.cpp Sun Dec 23 17:08:22 2012 +0100 +++ b/src/share/vm/opto/memnode.cpp Thu Jan 03 15:09:55 2013 -0800 @@ -2725,10 +2725,8 @@ zend = phase->transform( new(C) URShiftXNode(zend, shift) ); } + // Bulk clear double-words Node* zsize = phase->transform( new(C) SubXNode(zend, zbase) ); - Node* zinit = phase->zerocon((unit == BytesPerLong) ? T_LONG : T_INT); - - // Bulk clear double-words Node* adr = phase->transform( new(C) AddPNode(dest, dest, start_offset) ); mem = new (C) ClearArrayNode(ctl, mem, zsize, adr); return phase->transform(mem);