# HG changeset patch # User kvn # Date 1288713637 25200 # Node ID ae065c367d9305538ffce8dcd7e5671d3f74ff37 # Parent 3b2dea75431e6cfd0a79e8b1da72cce0114708f3 6987135: Performance regression on Intel platform with 32-bits edition between 6u13 and 6u14. Summary: Use hardware DIV instruction for long division by constant when it is faster than code with multiply. Reviewed-by: never diff -r 3b2dea75431e -r ae065c367d93 src/cpu/sparc/vm/sparc.ad --- a/src/cpu/sparc/vm/sparc.ad Sat Oct 30 13:08:23 2010 -0700 +++ b/src/cpu/sparc/vm/sparc.ad Tue Nov 02 09:00:37 2010 -0700 @@ -1843,6 +1843,12 @@ return can_be_java_arg(reg); } +bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) { + // Use hardware SDIVX instruction when it is + // faster than a code which use multiply. + return VM_Version::has_fast_idiv(); +} + // Register for DIVI projection of divmodI RegMask Matcher::divI_proj_mask() { ShouldNotReachHere(); diff -r 3b2dea75431e -r ae065c367d93 src/cpu/sparc/vm/vm_version_sparc.cpp --- a/src/cpu/sparc/vm/vm_version_sparc.cpp Sat Oct 30 13:08:23 2010 -0700 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Nov 02 09:00:37 2010 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -80,7 +80,8 @@ FLAG_SET_DEFAULT(InteriorEntryAlignment, 4); } if (is_niagara1_plus()) { - if (AllocatePrefetchStyle > 0 && FLAG_IS_DEFAULT(AllocatePrefetchStyle)) { + if (has_blk_init() && AllocatePrefetchStyle > 0 && + FLAG_IS_DEFAULT(AllocatePrefetchStyle)) { // Use BIS instruction for allocation prefetch. 
FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3); if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) { @@ -118,16 +119,18 @@ #endif char buf[512]; - jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s", (has_v8() ? ", has_v8" : ""), (has_v9() ? ", has_v9" : ""), (has_hardware_popc() ? ", popc" : ""), (has_vis1() ? ", has_vis1" : ""), (has_vis2() ? ", has_vis2" : ""), + (has_blk_init() ? ", has_blk_init" : ""), (is_ultra3() ? ", is_ultra3" : ""), (is_sun4v() ? ", is_sun4v" : ""), (is_niagara1() ? ", is_niagara1" : ""), (is_niagara1_plus() ? ", is_niagara1_plus" : ""), + (is_sparc64() ? ", is_sparc64" : ""), (!has_hardware_mul32() ? ", no-mul32" : ""), (!has_hardware_div32() ? ", no-div32" : ""), (!has_hardware_fsmuld() ? ", no-fsmuld" : "")); diff -r 3b2dea75431e -r ae065c367d93 src/cpu/sparc/vm/vm_version_sparc.hpp --- a/src/cpu/sparc/vm/vm_version_sparc.hpp Sat Oct 30 13:08:23 2010 -0700 +++ b/src/cpu/sparc/vm/vm_version_sparc.hpp Tue Nov 02 09:00:37 2010 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -33,7 +33,9 @@ v9_instructions = 5, vis1_instructions = 6, vis2_instructions = 7, - sun4v_instructions = 8 + sun4v_instructions = 8, + blk_init_instructions = 9, + fmaf_instructions = 10 }; enum Feature_Flag_Set { @@ -49,6 +51,8 @@ vis1_instructions_m = 1 << vis1_instructions, vis2_instructions_m = 1 << vis2_instructions, sun4v_m = 1 << sun4v_instructions, + blk_init_instructions_m = 1 << blk_init_instructions, + fmaf_instructions_m = 1 << fmaf_instructions, generic_v8_m = v8_instructions_m | hardware_mul32_m | hardware_div32_m | hardware_fsmuld_m, generic_v9_m = generic_v8_m | v9_instructions_m, @@ -67,6 +71,7 @@ static int platform_features(int features); static bool is_niagara1(int features) { return (features & sun4v_m) != 0; } + static bool is_sparc64(int features) { return (features & fmaf_instructions_m) != 0; } static int maximum_niagara1_processor_count() { return 32; } // Returns true if the platform is in the niagara line and @@ -86,6 +91,7 @@ static bool has_hardware_popc() { return (_features & hardware_popc_m) != 0; } static bool has_vis1() { return (_features & vis1_instructions_m) != 0; } static bool has_vis2() { return (_features & vis2_instructions_m) != 0; } + static bool has_blk_init() { return (_features & blk_init_instructions_m) != 0; } static bool supports_compare_and_exchange() { return has_v9(); } @@ -93,8 +99,10 @@ static bool is_ultra3() { return (_features & ultra3_m) == ultra3_m; } static bool is_sun4v() { return (_features & sun4v_m) != 0; } static bool is_niagara1() { return is_niagara1(_features); } + static bool is_sparc64() { return is_sparc64(_features); } static bool has_fast_fxtof() { return has_v9() && !is_ultra3(); } + static bool has_fast_idiv() { return is_niagara1_plus() || is_sparc64(); } static const char* cpu_features() { return _features_str; } diff -r 3b2dea75431e -r ae065c367d93 src/cpu/x86/vm/assembler_x86.cpp --- 
a/src/cpu/x86/vm/assembler_x86.cpp Sat Oct 30 13:08:23 2010 -0700 +++ b/src/cpu/x86/vm/assembler_x86.cpp Tue Nov 02 09:00:37 2010 -0700 @@ -1288,7 +1288,7 @@ if (is8bit(value)) { emit_byte(0x6B); emit_byte(0xC0 | encode); - emit_byte(value); + emit_byte(value & 0xFF); } else { emit_byte(0x69); emit_byte(0xC0 | encode); @@ -3903,7 +3903,7 @@ if (is8bit(value)) { emit_byte(0x6B); emit_byte(0xC0 | encode); - emit_byte(value); + emit_byte(value & 0xFF); } else { emit_byte(0x69); emit_byte(0xC0 | encode); diff -r 3b2dea75431e -r ae065c367d93 src/cpu/x86/vm/vm_version_x86.hpp --- a/src/cpu/x86/vm/vm_version_x86.hpp Sat Oct 30 13:08:23 2010 -0700 +++ b/src/cpu/x86/vm/vm_version_x86.hpp Tue Nov 02 09:00:37 2010 -0700 @@ -446,6 +446,10 @@ static bool supports_lzcnt() { return (_cpuFeatures & CPU_LZCNT) != 0; } static bool supports_sse4a() { return (_cpuFeatures & CPU_SSE4A) != 0; } + // Intel Core and newer cpus have fast IDIV instruction (excluding Atom). + static bool has_fast_idiv() { return is_intel() && cpu_family() == 6 && + supports_sse3() && _model != 0x1C; } + static bool supports_compare_and_exchange() { return true; } static const char* cpu_features() { return _features_str; } diff -r 3b2dea75431e -r ae065c367d93 src/cpu/x86/vm/x86_32.ad --- a/src/cpu/x86/vm/x86_32.ad Sat Oct 30 13:08:23 2010 -0700 +++ b/src/cpu/x86/vm/x86_32.ad Tue Nov 02 09:00:37 2010 -0700 @@ -1508,6 +1508,16 @@ return can_be_java_arg(reg); } +bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) { + // Use hardware integer DIV instruction when + // it is faster than a code which use multiply. + // Only when constant divisor fits into 32 bit + // (min_jint is excluded to get only correct + // positive 32 bit values from negative). 
+ return VM_Version::has_fast_idiv() && + (divisor == (int)divisor && divisor != min_jint); +} + // Register for DIVI projection of divmodI RegMask Matcher::divI_proj_mask() { return EAX_REG_mask; @@ -1546,6 +1556,9 @@ return true; } } + if (opc == Op_ConL && (n->get_long() & 0xFFFFFFFF00000000LL) == 0LL) { + return true; + } return false; } @@ -2309,9 +2322,11 @@ enc_class move_long_big_shift_sign( eRegL dst, immI_32_63 cnt ) %{ emit_opcode( cbuf, 0x8B ); // Move emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg)); - emit_d8(cbuf,$primary); - emit_rm(cbuf, 0x3, $secondary, $dst$$reg); - emit_d8(cbuf,$cnt$$constant-32); + if( $cnt$$constant > 32 ) { // Shift, if not by zero + emit_d8(cbuf,$primary); + emit_rm(cbuf, 0x3, $secondary, $dst$$reg); + emit_d8(cbuf,$cnt$$constant-32); + } emit_d8(cbuf,$primary); emit_rm(cbuf, 0x3, $secondary, HIGH_FROM_LOW($dst$$reg)); emit_d8(cbuf,31); @@ -8842,6 +8857,103 @@ ins_pipe( pipe_slow ); %} +// Divide Register Long (no special case since divisor != -1) +instruct divL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{ + match(Set dst (DivL dst imm)); + effect( TEMP tmp, TEMP tmp2, KILL cr ); + ins_cost(1000); + format %{ "MOV $tmp,abs($imm) # ldiv EDX:EAX,$imm\n\t" + "CMP $tmp,EDX\n\t" + "JA,s fast\n\t" + "MOV $tmp2,EAX\n\t" + "MOV EAX,EDX\n\t" + "SAR EDX,31\n\t" + "IDIV $tmp\n\t" + "XCHG EAX,$tmp2 \n\t" + "IDIV $tmp\n\t" + "CDQ\n\t" + "ADD EDX,$tmp2\n\t" + "JMP,s done\n" + "fast:\n\t" + "IDIV $tmp\n\t" + "XOR EDX,EDX\n" + "done:\n\t" + "NEG EDX:EAX # if $imm < 0" %} + ins_encode %{ + int con = (int)$imm$$constant; + assert(con != 0 && con != -1 && con != min_jint, "wrong divisor"); + int pcon = (con > 0) ? 
con : -con; + Label Lfast, Ldone; + + __ movl($tmp$$Register, pcon); + __ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register)); + __ jccb(Assembler::above, Lfast); + + __ movl($tmp2$$Register, $dst$$Register); // save + __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register)); + __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // src sign + __ idivl($tmp$$Register); + __ xchgl($dst$$Register, $tmp2$$Register); + __ idivl($tmp$$Register); + __ cdql(); + __ addl(HIGH_FROM_LOW($dst$$Register),$tmp2$$Register); + __ jmpb(Ldone); + + __ bind(Lfast); + // fast path: src is positive and result fits into 32 bit + __ idivl($tmp$$Register); + __ xorl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register)); + + __ bind(Ldone); + if (con < 0) { + __ lneg(HIGH_FROM_LOW($dst$$Register), $dst$$Register); + } + %} + ins_pipe( pipe_slow ); +%} + +// Remainder Register Long (remainder fit into 32 bits) +instruct modL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{ + match(Set dst (ModL dst imm)); + effect( TEMP tmp, TEMP tmp2, KILL cr ); + ins_cost(1000); + format %{ "MOV $tmp,abs($imm) # lrem EDX:EAX,$imm\n\t" + "CMP $tmp,EDX\n\t" + "JA,s fast\n\t" + "MOV $tmp2,EAX\n\t" + "MOV EAX,EDX\n\t" + "SAR EDX,31\n\t" + "IDIV $tmp\n\t" + "MOV EAX,$tmp2\n" + "fast:\n\t" + "IDIV $tmp\n\t" + "MOV EAX,EDX\n\t" + "SAR EDX,31\n\t" %} + ins_encode %{ + int con = (int)$imm$$constant; + assert(con != 0 && con != -1 && con != min_jint, "wrong divisor"); + int pcon = (con > 0) ? 
con : -con; + Label Lfast; + + __ movl($tmp$$Register, pcon); + __ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register)); + __ jccb(Assembler::above, Lfast); // src is positive and result fits into 32 bit + + __ movl($tmp2$$Register, $dst$$Register); // save + __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register)); + __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // src sign + __ idivl($tmp$$Register); + __ movl($dst$$Register, $tmp2$$Register); + + __ bind(Lfast); + __ idivl($tmp$$Register); + __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register)); + __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // result sign + + %} + ins_pipe( pipe_slow ); +%} + // Integer Shift Instructions // Shift Left by one instruct shlI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{ diff -r 3b2dea75431e -r ae065c367d93 src/cpu/x86/vm/x86_64.ad --- a/src/cpu/x86/vm/x86_64.ad Sat Oct 30 13:08:23 2010 -0700 +++ b/src/cpu/x86/vm/x86_64.ad Tue Nov 02 09:00:37 2010 -0700 @@ -2065,6 +2065,13 @@ return can_be_java_arg(reg); } +bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) { + // In 64 bit mode code which uses multiply when the + // divisor is constant is faster than the hardware + // DIV instruction (it uses MulHiL). + return false; +} + // Register for DIVI projection of divmodI RegMask Matcher::divI_proj_mask() { return INT_RAX_REG_mask; diff -r 3b2dea75431e -r ae065c367d93 src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp --- a/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp Sat Oct 30 13:08:23 2010 -0700 +++ b/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp Tue Nov 02 09:00:37 2010 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2009, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -65,10 +65,6 @@ // getisax(2), SI_ARCHITECTURE_32, and SI_ARCHITECTURE_64 are // supported on Solaris 10 and later. if (os::Solaris::supports_getisax()) { -#ifndef PRODUCT - if (PrintMiscellaneous && Verbose) - tty->print_cr("getisax(2) supported."); -#endif // Check 32-bit architecture. do_sysinfo(SI_ARCHITECTURE_32, "sparc", &features, v8_instructions_m); @@ -81,6 +77,11 @@ uint_t avn = os::Solaris::getisax(&av, 1); assert(avn == 1, "should only return one av"); +#ifndef PRODUCT + if (PrintMiscellaneous && Verbose) + tty->print_cr("getisax(2) returned: " PTR32_FORMAT, av); +#endif + if (av & AV_SPARC_MUL32) features |= hardware_mul32_m; if (av & AV_SPARC_DIV32) features |= hardware_div32_m; if (av & AV_SPARC_FSMULD) features |= hardware_fsmuld_m; @@ -88,11 +89,22 @@ if (av & AV_SPARC_POPC) features |= hardware_popc_m; if (av & AV_SPARC_VIS) features |= vis1_instructions_m; if (av & AV_SPARC_VIS2) features |= vis2_instructions_m; + + // Next values are not defined before Solaris 10 + // but Solaris 8 is used for jdk6 update builds. +#ifndef AV_SPARC_ASI_BLK_INIT +#define AV_SPARC_ASI_BLK_INIT 0x0080 /* ASI_BLK_INIT_xxx ASI */ +#endif +#ifndef AV_SPARC_FMAF +#define AV_SPARC_FMAF 0x0100 /* Sparc64 Fused Multiply-Add */ +#endif + if (av & AV_SPARC_ASI_BLK_INIT) features |= blk_init_instructions_m; + if (av & AV_SPARC_FMAF) features |= fmaf_instructions_m; } else { // getisax(2) failed, use the old legacy code. #ifndef PRODUCT if (PrintMiscellaneous && Verbose) - tty->print_cr("getisax(2) not supported."); + tty->print_cr("getisax(2) is not supported."); #endif char tmp; diff -r 3b2dea75431e -r ae065c367d93 src/share/vm/opto/divnode.cpp --- a/src/share/vm/opto/divnode.cpp Sat Oct 30 13:08:23 2010 -0700 +++ b/src/share/vm/opto/divnode.cpp Tue Nov 02 09:00:37 2010 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -388,7 +388,8 @@ if (!d_pos) { q = new (phase->C, 3) SubLNode(phase->longcon(0), phase->transform(q)); } - } else { + } else if ( !Matcher::use_asm_for_ldiv_by_con(d) ) { // Use hardware DIV instruction when + // it is faster than code generated below. // Attempt the jlong constant divide -> multiply transform found in // "Division by Invariant Integers using Multiplication" // by Granlund and Montgomery @@ -558,7 +559,7 @@ set_req(0,NULL); // Dividing by a not-zero constant; no faulting - // Dividing by MININT does not optimize as a power-of-2 shift. + // Dividing by MINLONG does not optimize as a power-of-2 shift. if( l == min_jlong ) return NULL; return transform_long_divide( phase, in(1), l ); @@ -1062,7 +1063,7 @@ // Fell thru, the unroll case is not appropriate. Transform the modulo // into a long multiply/int multiply/subtract case - // Cannot handle mod 0, and min_jint isn't handled by the transform + // Cannot handle mod 0, and min_jlong isn't handled by the transform if( con == 0 || con == min_jlong ) return NULL; // Get the absolute value of the constant; at this point, we can use this @@ -1075,7 +1076,7 @@ // If this is a power of two, then maybe we can mask it if( is_power_of_2_long(pos_con) ) { - log2_con = log2_long(pos_con); + log2_con = exact_log2_long(pos_con); const Type *dt = phase->type(in(1)); const TypeLong *dtl = dt->isa_long(); @@ -1088,7 +1089,7 @@ // Save in(1) so that it cannot be changed or deleted hook->init_req(0, in(1)); - // Divide using the transform from DivI to MulL + // Divide using the transform from DivL to MulL Node *result = transform_long_divide( phase, in(1), pos_con ); if (result != NULL) { Node *divide = phase->transform(result); diff -r 3b2dea75431e -r ae065c367d93 src/share/vm/opto/matcher.hpp --- 
a/src/share/vm/opto/matcher.hpp Sat Oct 30 13:08:23 2010 -0700 +++ b/src/share/vm/opto/matcher.hpp Tue Nov 02 09:00:37 2010 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -298,6 +298,10 @@ // Register for MODL projection of divmodL static RegMask modL_proj_mask(); + // Use hardware DIV instruction when it is faster than + // code which uses multiply for division by a constant. + static bool use_asm_for_ldiv_by_con( jlong divisor ); + static const RegMask method_handle_invoke_SP_save_mask(); // Java-Interpreter calling convention