# HG changeset patch
# User amurillo
# Date 1399615631 25200
# Node ID 28bbbecff5f08c1e343fc0c40923c05d86b7cf82
# Parent 7dd67cb4f225e2437717e6cb863247b1e6efca26
# Parent 63c5920a038d73df37c6ae3101937010f7e9659a
Merge

diff -r 7dd67cb4f225 -r 28bbbecff5f0 .hgtags
--- a/.hgtags Wed May 07 10:58:47 2014 -0700
+++ b/.hgtags Thu May 08 23:07:11 2014 -0700
@@ -462,3 +462,4 @@
 3c291bc2aa7c58efb1219701f38c41731609e595 hs25.20-b12
 18ae0dac7620474547aa1721bc3fd748af07b8b5 jdk8u20-b12
 47951595af60460a479b8574622375bfbf5c8ed2 jdk8u20-b13
+798f5b02be897151fdad44d695446088b1cca6b1 hs25.20-b13
diff -r 7dd67cb4f225 -r 28bbbecff5f0 make/hotspot_version
--- a/make/hotspot_version Wed May 07 10:58:47 2014 -0700
+++ b/make/hotspot_version Thu May 08 23:07:11 2014 -0700
@@ -35,7 +35,7 @@

 HS_MAJOR_VER=25
 HS_MINOR_VER=20
-HS_BUILD_NUMBER=12
+HS_BUILD_NUMBER=14

 JDK_MAJOR_VER=1
 JDK_MINOR_VER=8
diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/cppInterpreter_ppc.cpp
--- a/src/cpu/ppc/vm/cppInterpreter_ppc.cpp Wed May 07 10:58:47 2014 -0700
+++ b/src/cpu/ppc/vm/cppInterpreter_ppc.cpp Thu May 08 23:07:11 2014 -0700
@@ -1,3 +1,4 @@
+
 /*
  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2012, 2013 SAP AG. All rights reserved.
@@ -403,7 +404,7 @@
   BLOCK_COMMENT("compute_interpreter_state {");

   // access_flags = method->access_flags();
-  // TODO: PPC port: assert(4 == methodOopDesc::sz_access_flags(), "unexpected field size");
+  // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size");
   __ lwa(access_flags, method_(access_flags));

   // parameter_count = method->constMethod->size_of_parameters();
@@ -1055,7 +1056,7 @@
   assert(access_flags->is_nonvolatile(), "access_flags must be in a non-volatile register");

   // Type check.
-  // TODO: PPC port: assert(4 == methodOopDesc::sz_access_flags(), "unexpected field size");
+  // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size");
   __ lwz(access_flags, method_(access_flags));

   // We don't want to reload R19_method and access_flags after calls
@@ -1838,7 +1839,7 @@
   // Interpreter state fields.
   const Register msg = R24_tmp4;

-  // MethodOop fields.
+  // Method fields.
   const Register parameter_count = R25_tmp5;
   const Register result_index = R26_tmp6;

@@ -2023,7 +2024,7 @@
   __ add(R17_tos, R17_tos, parameter_count);

   // Result stub address array index
-  // TODO: PPC port: assert(4 == methodOopDesc::sz_result_index(), "unexpected field size");
+  // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size");
   __ lwa(result_index, method_(result_index));

   __ li(msg, BytecodeInterpreter::method_resume);
@@ -2709,7 +2710,7 @@
   __ ld(R3_ARG1, state_(_result._osr._osr_buf));
   __ mtctr(R12_scratch2);

-  // Load method oop, gc may move it during execution of osr'd method.
+  // Load method, gc may move it during execution of osr'd method.
   __ ld(R22_tmp2, state_(_method));
   // Load message 'call_method'.
   __ li(R23_tmp3, BytecodeInterpreter::call_method);
diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/frame_ppc.inline.hpp
--- a/src/cpu/ppc/vm/frame_ppc.inline.hpp Wed May 07 10:58:47 2014 -0700
+++ b/src/cpu/ppc/vm/frame_ppc.inline.hpp Thu May 08 23:07:11 2014 -0700
@@ -26,6 +26,8 @@
 #ifndef CPU_PPC_VM_FRAME_PPC_INLINE_HPP
 #define CPU_PPC_VM_FRAME_PPC_INLINE_HPP

+#include "code/codeCache.hpp"
+
 // Inline functions for ppc64 frames:

 // Find codeblob and set deopt_state.
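The cppInterpreter hunks above replace the old methodOopDesc::sz_access_flags() helper with sizeof(AccessFlags) in their TODO asserts: after the PermGen removal, access_flags lives in the Method* metadata and is read with a single 4-byte lwa/lwz, so the assert pins the field width those loads depend on. A minimal stand-alone sketch of that invariant; the AccessFlags definition here is a simplified stand-in, not HotSpot's:

    #include <cstdint>

    // Simplified stand-in for HotSpot's AccessFlags: a single 32-bit flag word.
    struct AccessFlags {
      uint32_t _flags;
    };

    // The PPC interpreter reads method->_access_flags with a 4-byte lwa/lwz,
    // so the field must be exactly 4 bytes wide; this is the check the
    // "TODO: PPC port: assert(4 == sizeof(AccessFlags), ...)" comments intend.
    static_assert(sizeof(AccessFlags) == 4, "unexpected field size");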
diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/interp_masm_ppc_64.hpp --- a/src/cpu/ppc/vm/interp_masm_ppc_64.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/interp_masm_ppc_64.hpp Thu May 08 23:07:11 2014 -0700 @@ -26,7 +26,7 @@ #ifndef CPU_PPC_VM_INTERP_MASM_PPC_64_HPP #define CPU_PPC_VM_INTERP_MASM_PPC_64_HPP -#include "assembler_ppc.inline.hpp" +#include "asm/macroAssembler.hpp" #include "interpreter/invocationCounter.hpp" // This file specializes the assembler with interpreter-specific macros. diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/interpreterRT_ppc.cpp --- a/src/cpu/ppc/vm/interpreterRT_ppc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/interpreterRT_ppc.cpp Thu May 08 23:07:11 2014 -0700 @@ -24,6 +24,7 @@ */ #include "precompiled.hpp" +#include "asm/assembler.inline.hpp" #include "interpreter/interpreter.hpp" #include "interpreter/interpreterRuntime.hpp" #include "memory/allocation.inline.hpp" diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/interpreter_ppc.cpp --- a/src/cpu/ppc/vm/interpreter_ppc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/interpreter_ppc.cpp Thu May 08 23:07:11 2014 -0700 @@ -139,32 +139,16 @@ // Signature is in R3_RET. Signature is callee saved. __ mr(signature, R3_RET); - // Reload method, it may have moved. -#ifdef CC_INTERP - __ ld(R19_method, state_(_method)); -#else - __ ld(R19_method, 0, target_sp); - __ ld(R19_method, _ijava_state_neg(method), R19_method); -#endif - // Get the result handler. __ call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::get_result_handler), R16_thread, R19_method); - // Reload method, it may have moved. -#ifdef CC_INTERP - __ ld(R19_method, state_(_method)); -#else - __ ld(R19_method, 0, target_sp); - __ ld(R19_method, _ijava_state_neg(method), R19_method); -#endif - { Label L; // test if static // _access_flags._flags must be at offset 0. // TODO PPC port: requires change in shared code. //assert(in_bytes(AccessFlags::flags_offset()) == 0, - // "MethodOopDesc._access_flags == MethodOopDesc._access_flags._flags"); + // "MethodDesc._access_flags == MethodDesc._access_flags._flags"); // _access_flags must be a 32 bit value. assert(sizeof(AccessFlags) == 4, "wrong size"); __ lwa(R11_scratch1/*access_flags*/, method_(access_flags)); diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/jniFastGetField_ppc.cpp --- a/src/cpu/ppc/vm/jniFastGetField_ppc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/jniFastGetField_ppc.cpp Thu May 08 23:07:11 2014 -0700 @@ -32,7 +32,7 @@ address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) { - // we don't have fast jni accessors. + // We don't have fast jni accessors. return (address) -1; } @@ -57,12 +57,12 @@ } address JNI_FastGetField::generate_fast_get_long_field() { - // we don't have fast jni accessors. + // We don't have fast jni accessors. return (address) -1; } address JNI_FastGetField::generate_fast_get_float_field0(BasicType type) { - // e don't have fast jni accessors. + // We don't have fast jni accessors. return (address) -1; } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/ppc.ad --- a/src/cpu/ppc/vm/ppc.ad Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/ppc.ad Thu May 08 23:07:11 2014 -0700 @@ -898,7 +898,7 @@ // To keep related declarations/definitions/uses close together, // we switch between source %{ }% and source_hpp %{ }% freely as needed. - // Returns true if Node n is followed by a MemBar node that + // Returns true if Node n is followed by a MemBar node that // will do an acquire. 
If so, this node must not do the acquire // operation. bool followed_by_acquire(const Node *n); @@ -908,7 +908,7 @@ // Optimize load-acquire. // -// Check if acquire is unnecessary due to following operation that does +// Check if acquire is unnecessary due to following operation that does // acquire anyways. // Walk the pattern: // @@ -919,12 +919,12 @@ // Proj(ctrl) Proj(mem) // | | // MemBarRelease/Volatile -// +// bool followed_by_acquire(const Node *load) { assert(load->is_Load(), "So far implemented only for loads."); // Find MemBarAcquire. - const Node *mba = NULL; + const Node *mba = NULL; for (DUIterator_Fast imax, i = load->fast_outs(imax); i < imax; i++) { const Node *out = load->fast_out(i); if (out->Opcode() == Op_MemBarAcquire) { @@ -937,7 +937,7 @@ // Find following MemBar node. // - // The following node must be reachable by control AND memory + // The following node must be reachable by control AND memory // edge to assure no other operations are in between the two nodes. // // So first get the Proj node, mem_proj, to use it to iterate forward. @@ -1135,6 +1135,7 @@ public: + // Emit call stub, compiled java to interpreter. static void emit_trampoline_stub(MacroAssembler &_masm, int destination_toc_offset, int insts_call_instruction_offset); // Size of call trampoline stub. @@ -2752,7 +2753,7 @@ // inputs for new nodes m1->add_req(NULL, n_toc); m2->add_req(NULL, m1); - + // operands for new nodes m1->_opnds[0] = new (C) iRegPdstOper(); // dst m1->_opnds[1] = op_src; // src @@ -2760,29 +2761,29 @@ m2->_opnds[0] = new (C) iRegPdstOper(); // dst m2->_opnds[1] = op_src; // src m2->_opnds[2] = new (C) iRegLdstOper(); // base - + // Initialize ins_attrib TOC fields. m1->_const_toc_offset = -1; m2->_const_toc_offset_hi_node = m1; - + // Register allocation for new nodes. ra_->set_pair(m1->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); ra_->set_pair(m2->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); - + nodes->push(m1); nodes->push(m2); assert(m2->bottom_type()->isa_ptr(), "must be ptr"); } else { loadConPNode *m2 = new (C) loadConPNode(); - + // inputs for new nodes m2->add_req(NULL, n_toc); - + // operands for new nodes m2->_opnds[0] = new (C) iRegPdstOper(); // dst m2->_opnds[1] = op_src; // src m2->_opnds[2] = new (C) iRegPdstOper(); // toc - + // Register allocation for new nodes. 
ra_->set_pair(m2->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); @@ -2974,17 +2975,17 @@ n_sub_base->_opnds[1] = op_crx; n_sub_base->_opnds[2] = op_src; n_sub_base->_bottom_type = _bottom_type; - + n_shift->add_req(n_region, n_sub_base); n_shift->_opnds[0] = op_dst; n_shift->_opnds[1] = op_dst; n_shift->_bottom_type = _bottom_type; - + ra_->set_pair(n_shift->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); ra_->set_pair(n_compare->_idx, ra_->get_reg_second(n_crx), ra_->get_reg_first(n_crx)); ra_->set_pair(n_sub_base->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); ra_->set_pair(n_move->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); - + nodes->push(n_move); nodes->push(n_compare); nodes->push(n_sub_base); @@ -3061,20 +3062,20 @@ } else { // before Power 7 cond_add_baseNode *n_add_base = new (C) cond_add_baseNode(); - + n_add_base->add_req(n_region, n_compare, n_shift); n_add_base->_opnds[0] = op_dst; n_add_base->_opnds[1] = op_crx; n_add_base->_opnds[2] = op_dst; n_add_base->_bottom_type = _bottom_type; - + assert(ra_->is_oop(this) == true, "A decodeN node must produce an oop!"); ra_->set_oop(n_add_base, true); - + ra_->set_pair(n_shift->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); ra_->set_pair(n_compare->_idx, ra_->get_reg_second(n_crx), ra_->get_reg_first(n_crx)); ra_->set_pair(n_add_base->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); - + nodes->push(n_compare); nodes->push(n_shift); nodes->push(n_add_base); @@ -3631,11 +3632,11 @@ // Req... for (uint i = 0; i < req(); ++i) { // The expanded node does not need toc any more. - // Add the inline cache constant here instead. This expresses the + // Add the inline cache constant here instead. This expresses the // register of the inline cache must be live at the call. // Else we would have to adapt JVMState by -1. if (i == mach_constant_base_node_input()) { - call->add_req(loadConLNodes_IC._last); + call->add_req(loadConLNodes_IC._last); } else { call->add_req(in(i)); } @@ -3663,6 +3664,8 @@ %} // Compound version of call dynamic + // Toc is only passed so that it can be used in ins_encode statement. + // In the code we have to use $constanttablebase. enc_class enc_java_dynamic_call(method meth, iRegLdst toc) %{ // TODO: PPC port $archOpcode(ppc64Opcode_compound); MacroAssembler _masm(&cbuf); @@ -3670,14 +3673,17 @@ Register Rtoc = (ra_) ? $constanttablebase : R2_TOC; #if 0 + int vtable_index = this->_vtable_index; if (_vtable_index < 0) { // Must be invalid_vtable_index, not nonvirtual_vtable_index. assert(_vtable_index == Method::invalid_vtable_index, "correct sentinel value"); Register ic_reg = as_Register(Matcher::inline_cache_reg_encode()); - AddressLiteral meta = __ allocate_metadata_address((Metadata *)Universe::non_oop_word()); - + + // Virtual call relocation will point to ic load. address virtual_call_meta_addr = __ pc(); - __ load_const_from_method_toc(ic_reg, meta, Rtoc); + // Load a clear inline cache. + AddressLiteral empty_ic((address) Universe::non_oop_word()); + __ load_const_from_method_toc(ic_reg, empty_ic, Rtoc); // CALL to fixup routine. Fixup routine uses ScopeDesc info // to determine who we intended to call. __ relocate(virtual_call_Relocation::spec(virtual_call_meta_addr)); @@ -3710,7 +3716,6 @@ "Fix constant in ret_addr_offset()"); } #endif - guarantee(0, "Fix handling of toc edge: messes up derived/base pairs."); Unimplemented(); // ret_addr_offset not yet fixed. Depends on compressed oops (load klass!). 
%} @@ -5436,7 +5441,7 @@ ins_pipe(pipe_class_memory); %} -// Match loading integer and casting it to unsigned int in +// Match loading integer and casting it to unsigned int in // long register. // LoadI + ConvI2L + AndL 0xffffffff. instruct loadUI2L(iRegLdst dst, memory mem, immL_32bits mask) %{ @@ -6078,7 +6083,7 @@ ins_pipe(pipe_class_default); %} -// This needs a match rule so that build_oop_map knows this is +// This needs a match rule so that build_oop_map knows this is // not a narrow oop. instruct loadConNKlass_lo(iRegNdst dst, immNKlass_NM src1, iRegNsrc src2) %{ match(Set dst src1); @@ -6702,7 +6707,7 @@ size(4); ins_encode %{ // This is a Power7 instruction for which no machine description exists. - // TODO: PPC port $archOpcode(ppc64Opcode_compound); + // TODO: PPC port $archOpcode(ppc64Opcode_compound); __ isel_0($dst$$Register, $crx$$CondRegister, Assembler::equal, $src1$$Register); %} ins_pipe(pipe_class_default); @@ -6847,7 +6852,7 @@ size(4); ins_encode %{ // This is a Power7 instruction for which no machine description exists. - // TODO: PPC port $archOpcode(ppc64Opcode_compound); + // TODO: PPC port $archOpcode(ppc64Opcode_compound); __ isel_0($dst$$Register, $crx$$CondRegister, Assembler::equal, $src1$$Register); %} ins_pipe(pipe_class_default); @@ -7064,7 +7069,7 @@ n1->_bottom_type = _bottom_type; decodeNKlass_shiftNode *n2 = new (C) decodeNKlass_shiftNode(); - n2->add_req(n_region, n2); + n2->add_req(n_region, n1); n2->_opnds[0] = op_dst; n2->_opnds[1] = op_dst; n2->_bottom_type = _bottom_type; @@ -7199,7 +7204,7 @@ // inline_unsafe_load_store). // // Add this node again if we found a good solution for inline_unsafe_load_store(). -// Don't forget to look at the implementation of post_store_load_barrier again, +// Don't forget to look at the implementation of post_store_load_barrier again, // we did other fixes in that method. //instruct unnecessary_membar_volatile() %{ // match(MemBarVolatile); @@ -7237,7 +7242,7 @@ // exists. Anyways, the scheduler should be off on Power7. // TODO: PPC port $archOpcode(ppc64Opcode_compound); int cc = $cmp$$cmpcode; - __ isel($dst$$Register, $crx$$CondRegister, + __ isel($dst$$Register, $crx$$CondRegister, (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); %} ins_pipe(pipe_class_default); @@ -7283,7 +7288,7 @@ // exists. Anyways, the scheduler should be off on Power7. // TODO: PPC port $archOpcode(ppc64Opcode_compound); int cc = $cmp$$cmpcode; - __ isel($dst$$Register, $crx$$CondRegister, + __ isel($dst$$Register, $crx$$CondRegister, (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); %} ins_pipe(pipe_class_default); @@ -7329,7 +7334,7 @@ // exists. Anyways, the scheduler should be off on Power7. // TODO: PPC port $archOpcode(ppc64Opcode_compound); int cc = $cmp$$cmpcode; - __ isel($dst$$Register, $crx$$CondRegister, + __ isel($dst$$Register, $crx$$CondRegister, (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); %} ins_pipe(pipe_class_default); @@ -7376,7 +7381,7 @@ // exists. Anyways, the scheduler should be off on Power7. // TODO: PPC port $archOpcode(ppc64Opcode_compound); int cc = $cmp$$cmpcode; - __ isel($dst$$Register, $crx$$CondRegister, + __ isel($dst$$Register, $crx$$CondRegister, (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); %} ins_pipe(pipe_class_default); @@ -7522,8 +7527,8 @@ ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_compound); // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'. 
- __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register, - MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(), + __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register, + MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(), $res$$Register, true); %} ins_pipe(pipe_class_default); @@ -7929,7 +7934,23 @@ // Turn the sign-bit of a long into a 64-bit mask, 0x0...0 for // positive longs and 0xF...F for negative ones. -instruct signmask64I_regI(iRegIdst dst, iRegIsrc src) %{ +instruct signmask64I_regL(iRegIdst dst, iRegLsrc src) %{ + // no match-rule, false predicate + effect(DEF dst, USE src); + predicate(false); + + format %{ "SRADI $dst, $src, #63" %} + size(4); + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_sradi); + __ sradi($dst$$Register, $src$$Register, 0x3f); + %} + ins_pipe(pipe_class_default); +%} + +// Turn the sign-bit of a long into a 64-bit mask, 0x0...0 for +// positive longs and 0xF...F for negative ones. +instruct signmask64L_regL(iRegLdst dst, iRegLsrc src) %{ // no match-rule, false predicate effect(DEF dst, USE src); predicate(false); @@ -8893,7 +8914,7 @@ size(4); ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_rlwinm); - __ rlwinm($dst$$Register, $src1$$Register, 0, + __ rlwinm($dst$$Register, $src1$$Register, 0, (31-log2_long((jlong) $src2$$constant)) & 0x1f, (31-log2_long((jlong) $src2$$constant)) & 0x1f); %} ins_pipe(pipe_class_default); @@ -9619,14 +9640,14 @@ ins_cost(DEFAULT_COST*4); expand %{ - iRegIdst src1s; - iRegIdst src2s; - iRegIdst diff; - sxtI_reg(src1s, src1); // ensure proper sign extention - sxtI_reg(src2s, src2); // ensure proper sign extention - subI_reg_reg(diff, src1s, src2s); + iRegLdst src1s; + iRegLdst src2s; + iRegLdst diff; + convI2L_reg(src1s, src1); // Ensure proper sign extension. + convI2L_reg(src2s, src2); // Ensure proper sign extension. + subL_reg_reg(diff, src1s, src2s); // Need to consider >=33 bit result, therefore we need signmaskL. - signmask64I_regI(dst, diff); + signmask64I_regL(dst, diff); %} %} @@ -10863,7 +10884,7 @@ format %{ "PartialSubtypeCheck $result = ($subklass instanceOf $superklass) tmp: $tmp_klass, $tmp_arrayptr" %} ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_compound); - __ check_klass_subtype_slow_path($subklass$$Register, $superklass$$Register, $tmp_arrayptr$$Register, + __ check_klass_subtype_slow_path($subklass$$Register, $superklass$$Register, $tmp_arrayptr$$Register, $tmp_klass$$Register, NULL, $result$$Register); %} ins_pipe(pipe_class_default); @@ -11178,18 +11199,18 @@ ins_cost(DEFAULT_COST*6); expand %{ - iRegIdst src1s; - iRegIdst src2s; - iRegIdst diff; - iRegIdst sm; - iRegIdst doz; // difference or zero - sxtI_reg(src1s, src1); // Ensure proper sign extention. - sxtI_reg(src2s, src2); // Ensure proper sign extention. - subI_reg_reg(diff, src2s, src1s); + iRegLdst src1s; + iRegLdst src2s; + iRegLdst diff; + iRegLdst sm; + iRegLdst doz; // difference or zero + convI2L_reg(src1s, src1); // Ensure proper sign extension. + convI2L_reg(src2s, src2); // Ensure proper sign extension. + subL_reg_reg(diff, src2s, src1s); // Need to consider >=33 bit result, therefore we need signmaskL. 
- signmask64I_regI(sm, diff); - andI_reg_reg(doz, diff, sm); // <=0 - addI_reg_reg(dst, doz, src1s); + signmask64L_regL(sm, diff); + andL_reg_reg(doz, diff, sm); // <=0 + addI_regL_regL(dst, doz, src1s); %} %} @@ -11198,19 +11219,18 @@ ins_cost(DEFAULT_COST*6); expand %{ - immI_minus1 m1 %{ -1 %} - iRegIdst src1s; - iRegIdst src2s; - iRegIdst diff; - iRegIdst sm; - iRegIdst doz; // difference or zero - sxtI_reg(src1s, src1); // Ensure proper sign extention. - sxtI_reg(src2s, src2); // Ensure proper sign extention. - subI_reg_reg(diff, src2s, src1s); + iRegLdst src1s; + iRegLdst src2s; + iRegLdst diff; + iRegLdst sm; + iRegLdst doz; // difference or zero + convI2L_reg(src1s, src1); // Ensure proper sign extension. + convI2L_reg(src2s, src2); // Ensure proper sign extension. + subL_reg_reg(diff, src2s, src1s); // Need to consider >=33 bit result, therefore we need signmaskL. - signmask64I_regI(sm, diff); - andcI_reg_reg(doz, sm, m1, diff); // >=0 - addI_reg_reg(dst, doz, src1s); + signmask64L_regL(sm, diff); + andcL_reg_reg(doz, diff, sm); // >=0 + addI_regL_regL(dst, doz, src1s); %} %} diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/templateInterpreter_ppc.cpp --- a/src/cpu/ppc/vm/templateInterpreter_ppc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/templateInterpreter_ppc.cpp Thu May 08 23:07:11 2014 -0700 @@ -81,24 +81,18 @@ #if 0 // Call special ClassCastException constructor taking object to cast // and target class as arguments. -address TemplateInterpreterGenerator::generate_ClassCastException_verbose_handler(const char* name) { +address TemplateInterpreterGenerator::generate_ClassCastException_verbose_handler() { address entry = __ pc(); - // Target class oop is in register R6_ARG4 by convention! - // Expression stack must be empty before entering the VM if an // exception happened. __ empty_expression_stack(); - // Setup parameters. + // Thread will be loaded to R3_ARG1. - __ load_const_optimized(R4_ARG2, (address) name); - __ mr(R5_ARG3, R17_tos); - // R6_ARG4 contains specified class. - __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_ClassCastException_verbose)); -#ifdef ASSERT + // Target class oop is in register R5_ARG3 by convention! + __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_ClassCastException_verbose, R17_tos, R5_ARG3)); // Above call must not return here since exception pending. - __ should_not_reach_here(); -#endif + DEBUG_ONLY(__ should_not_reach_here();) return entry; } #endif @@ -1535,14 +1529,32 @@ __ stw(R0, in_bytes(JavaThread::popframe_condition_offset()), R16_thread); // Get out of the current method and re-execute the call that called us. - __ merge_frames(/*top_frame_sp*/ R21_sender_SP, /*return_pc*/ return_pc, R11_scratch1, R12_scratch2); + __ merge_frames(/*top_frame_sp*/ R21_sender_SP, /*return_pc*/ noreg, R11_scratch1, R12_scratch2); __ restore_interpreter_state(R11_scratch1); __ ld(R12_scratch2, _ijava_state_neg(top_frame_sp), R11_scratch1); __ resize_frame_absolute(R12_scratch2, R11_scratch1, R0); - __ mtlr(return_pc); if (ProfileInterpreter) { __ set_method_data_pointer_for_bcp(); } +#if INCLUDE_JVMTI + Label L_done; + + __ lbz(R11_scratch1, 0, R14_bcp); + __ cmpwi(CCR0, R11_scratch1, Bytecodes::_invokestatic); + __ bne(CCR0, L_done); + + // The member name argument must be restored if _invokestatic is re-executed after a PopFrame call. + // Detect such a case in the InterpreterRuntime function and return the member name argument, or NULL. 
+ __ ld(R4_ARG2, 0, R18_locals); + __ call_VM(R11_scratch1, CAST_FROM_FN_PTR(address, InterpreterRuntime::member_name_arg_or_null), + R4_ARG2, R19_method, R14_bcp); + + __ cmpdi(CCR0, R11_scratch1, 0); + __ beq(CCR0, L_done); + + __ std(R11_scratch1, wordSize, R15_esp); + __ bind(L_done); +#endif // INCLUDE_JVMTI __ dispatch_next(vtos); } // end of JVMTI PopFrame support diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/ppc/vm/templateTable_ppc_64.cpp --- a/src/cpu/ppc/vm/templateTable_ppc_64.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/ppc/vm/templateTable_ppc_64.cpp Thu May 08 23:07:11 2014 -0700 @@ -64,7 +64,7 @@ assert_different_registers(Rtmp1, Rtmp2, Rtmp3, Rval, Rbase); switch (barrier) { -#ifndef SERIALGC +#if INCLUDE_ALL_GCS case BarrierSet::G1SATBCT: case BarrierSet::G1SATBCTLogging: { @@ -104,7 +104,7 @@ __ bind(Ldone); } break; -#endif // SERIALGC +#endif // INCLUDE_ALL_GCS case BarrierSet::CardTableModRef: case BarrierSet::CardTableExtension: { @@ -259,17 +259,17 @@ switch (value) { default: ShouldNotReachHere(); case 0: { - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0); + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0, true); __ lfs(F15_ftos, simm16_offset, R11_scratch1); break; } case 1: { - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0); + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0, true); __ lfs(F15_ftos, simm16_offset, R11_scratch1); break; } case 2: { - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&two, R0); + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&two, R0, true); __ lfs(F15_ftos, simm16_offset, R11_scratch1); break; } @@ -282,12 +282,12 @@ static double one = 1.0; switch (value) { case 0: { - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0); + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0, true); __ lfd(F15_ftos, simm16_offset, R11_scratch1); break; } case 1: { - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0); + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0, true); __ lfd(F15_ftos, simm16_offset, R11_scratch1); break; } @@ -3728,9 +3728,9 @@ transition(atos, atos); Label Ldone, Lis_null, Lquicked, Lresolved; - Register Roffset = R5_ARG3, + Register Roffset = R6_ARG4, RobjKlass = R4_ARG2, - RspecifiedKlass = R6_ARG4, // Generate_ClassCastException_verbose_handler will expect this register. + RspecifiedKlass = R5_ARG3, // Generate_ClassCastException_verbose_handler will read value from this register. Rcpool = R11_scratch1, Rtags = R12_scratch2; diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/sparc/vm/assembler_sparc.hpp --- a/src/cpu/sparc/vm/assembler_sparc.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/sparc/vm/assembler_sparc.hpp Thu May 08 23:07:11 2014 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -123,8 +123,13 @@ fpop2_op3 = 0x35, impdep1_op3 = 0x36, aes3_op3 = 0x36, + alignaddr_op3 = 0x36, + faligndata_op3 = 0x36, flog3_op3 = 0x36, + edge_op3 = 0x36, + fsrc_op3 = 0x36, impdep2_op3 = 0x37, + stpartialf_op3 = 0x37, jmpl_op3 = 0x38, rett_op3 = 0x39, trap_op3 = 0x3a, @@ -175,17 +180,23 @@ enum opfs { // selected opfs + edge8n_opf = 0x01, + fmovs_opf = 0x01, fmovd_opf = 0x02, fnegs_opf = 0x05, fnegd_opf = 0x06, + alignaddr_opf = 0x18, + fadds_opf = 0x41, faddd_opf = 0x42, fsubs_opf = 0x45, fsubd_opf = 0x46, + faligndata_opf = 0x48, + fmuls_opf = 0x49, fmuld_opf = 0x4a, fdivs_opf = 0x4d, @@ -348,6 +359,8 @@ ASI_PRIMARY = 0x80, ASI_PRIMARY_NOFAULT = 0x82, ASI_PRIMARY_LITTLE = 0x88, + // 8x8-bit partial store + ASI_PST8_PRIMARY = 0xC0, // Block initializing store ASI_ST_BLKINIT_PRIMARY = 0xE2, // Most-Recently-Used (MRU) BIS variant @@ -585,6 +598,9 @@ // instruction only in VIS1 static void vis1_only() { assert( VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); } + // instruction only in VIS2 + static void vis2_only() { assert( VM_Version::has_vis2(), "This instruction only works on SPARC with VIS2"); } + // instruction only in VIS3 static void vis3_only() { assert( VM_Version::has_vis3(), "This instruction only works on SPARC with VIS3"); } @@ -1164,6 +1180,20 @@ inline void wrfprs( Register d) { v9_only(); emit_int32( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(6, 29, 25)); } + // VIS1 instructions + + void alignaddr( Register s1, Register s2, Register d ) { vis1_only(); emit_int32( op(arith_op) | rd(d) | op3(alignaddr_op3) | rs1(s1) | opf(alignaddr_opf) | rs2(s2)); } + + void faligndata( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(faligndata_op3) | fs1(s1, FloatRegisterImpl::D) | opf(faligndata_opf) | fs2(s2, FloatRegisterImpl::D)); } + + void fsrc2( FloatRegisterImpl::Width w, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fsrc_op3) | opf(0x7A - w) | fs2(s2, w)); } + + void stpartialf( Register s1, Register s2, FloatRegister d, int ia = -1 ) { vis1_only(); emit_int32( op(ldst_op) | fd(d, FloatRegisterImpl::D) | op3(stpartialf_op3) | rs1(s1) | imm_asi(ia) | rs2(s2)); } + + // VIS2 instructions + + void edge8n( Register s1, Register s2, Register d ) { vis2_only(); emit_int32( op(arith_op) | rd(d) | op3(edge_op3) | rs1(s1) | opf(edge8n_opf) | rs2(s2)); } + // VIS3 instructions void movstosw( FloatRegister s, Register d ) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mstosw_opf) | fs2(s, FloatRegisterImpl::S)); } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/sparc/vm/stubGenerator_sparc.cpp --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Thu May 08 23:07:11 2014 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -3305,9 +3305,12 @@ } address generate_aescrypt_encryptBlock() { + // required since we read expanded key 'int' array starting first element without alignment considerations + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, + "the following code assumes that first element of an int array is aligned to 8 bytes"); __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", "aesencryptBlock"); - Label L_doLast128bit, L_storeOutput; + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); + Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output; address start = __ pc(); Register from = O0; // source byte array Register to = O1; // destination byte array @@ -3317,15 +3320,33 @@ // read expanded key length __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); - // load input into F54-F56; F30-F31 used as temp - __ ldf(FloatRegisterImpl::S, from, 0, F30); - __ ldf(FloatRegisterImpl::S, from, 4, F31); - __ fmov(FloatRegisterImpl::D, F30, F54); - __ ldf(FloatRegisterImpl::S, from, 8, F30); - __ ldf(FloatRegisterImpl::S, from, 12, F31); - __ fmov(FloatRegisterImpl::D, F30, F56); - - // load expanded key + // Method to address arbitrary alignment for load instructions: + // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary + // If zero/aligned then continue with double FP load instructions + // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata + // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address + // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address + // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs + + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); + __ delayed()->alignaddr(from, G0, from); + + // aligned case: load input into F54-F56 + __ ldf(FloatRegisterImpl::D, from, 0, F54); + __ ldf(FloatRegisterImpl::D, from, 8, F56); + __ ba_short(L_load_expanded_key); + + __ BIND(L_load_misaligned_input); + __ ldf(FloatRegisterImpl::D, from, 0, F54); + __ ldf(FloatRegisterImpl::D, from, 8, F56); + __ ldf(FloatRegisterImpl::D, from, 16, F58); + __ faligndata(F54, F56, F54); + __ faligndata(F56, F58, F56); + + __ BIND(L_load_expanded_key); + // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed for ( int i = 0; i <= 38; i += 2 ) { __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); } @@ -3365,8 +3386,7 @@ __ ldf(FloatRegisterImpl::D, key, 232, F50); __ aes_eround01(F52, F54, F56, F58); //round 13 __ aes_eround23(F46, F54, F56, F60); - __ br(Assembler::always, false, Assembler::pt, L_storeOutput); - __ delayed()->nop(); + __ ba_short(L_storeOutput); __ BIND(L_doLast128bit); __ ldf(FloatRegisterImpl::D, key, 160, F48); @@ -3377,23 +3397,62 @@ __ aes_eround01_l(F48, F58, F60, F54); //last round __ aes_eround23_l(F50, F58, F60, F56); - // store output into the destination array, F0-F1 used as temp - __ fmov(FloatRegisterImpl::D, F54, F0); - __ stf(FloatRegisterImpl::S, F0, to, 0); - __ stf(FloatRegisterImpl::S, F1, to, 4); - __ 
fmov(FloatRegisterImpl::D, F56, F0); - __ stf(FloatRegisterImpl::S, F0, to, 8); + // Method to address arbitrary alignment for store instructions: + // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary + // If zero/aligned then continue with double FP store instructions + // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case) + // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001 + // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case + // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case. + // Set GSR.align to (8-n) using alignaddr + // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf + // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address + // Store (partial) the original first (8-n) bytes starting at the original 'dest' address + // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address + // We need to execute this process for both the 8-byte result values + + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, O5); + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); + __ delayed()->edge8n(to, G0, O3); + + // aligned case: store output into the destination array + __ stf(FloatRegisterImpl::D, F54, to, 0); __ retl(); - __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); + __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8); + + __ BIND(L_store_misaligned_output); + __ add(to, 8, O4); + __ mov(8, O2); + __ sub(O2, O5, O2); + __ alignaddr(O2, G0, O2); + __ faligndata(F54, F54, F54); + __ faligndata(F56, F56, F56); + __ and3(to, -8, to); + __ and3(O4, -8, O4); + __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); + __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); + __ add(to, 8, to); + __ add(O4, 8, O4); + __ orn(G0, O3, O3); + __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); + __ retl(); + __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); return start; } address generate_aescrypt_decryptBlock() { + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, + "the following code assumes that first element of an int array is aligned to 8 bytes"); + // required since we read original key 'byte' array as well in the decryption stubs + assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, + "the following code assumes that first element of a byte array is aligned to 8 bytes"); __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock"); + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); address start = __ pc(); - Label L_expand192bit, L_expand256bit, L_common_transform; + Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input; + Label L_256bit_transform, L_common_transform, L_store_misaligned_output; Register from = O0; // source byte array Register to = O1; // destination byte array Register key = O2; // expanded key array @@ -3403,15 +3462,29 @@ // read expanded key array length __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); - // load input into F52-F54; F30,F31 used as temp - __ 
ldf(FloatRegisterImpl::S, from, 0, F30); - __ ldf(FloatRegisterImpl::S, from, 4, F31); - __ fmov(FloatRegisterImpl::D, F30, F52); - __ ldf(FloatRegisterImpl::S, from, 8, F30); - __ ldf(FloatRegisterImpl::S, from, 12, F31); - __ fmov(FloatRegisterImpl::D, F30, F54); - + // save 'from' since we may need to recheck alignment in case of 256-bit decryption + __ mov(from, G1); + + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); + __ delayed()->alignaddr(from, G0, from); + + // aligned case: load input into F52-F54 + __ ldf(FloatRegisterImpl::D, from, 0, F52); + __ ldf(FloatRegisterImpl::D, from, 8, F54); + __ ba_short(L_load_original_key); + + __ BIND(L_load_misaligned_input); + __ ldf(FloatRegisterImpl::D, from, 0, F52); + __ ldf(FloatRegisterImpl::D, from, 8, F54); + __ ldf(FloatRegisterImpl::D, from, 16, F56); + __ faligndata(F52, F54, F52); + __ faligndata(F54, F56, F54); + + __ BIND(L_load_original_key); // load original key from SunJCE expanded decryption key + // Since we load original key buffer starting first element, 8-byte alignment is guaranteed for ( int i = 0; i <= 3; i++ ) { __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); } @@ -3432,8 +3505,7 @@ // perform 128-bit key specific inverse cipher transformation __ fxor(FloatRegisterImpl::D, F42, F54, F54); __ fxor(FloatRegisterImpl::D, F40, F52, F52); - __ br(Assembler::always, false, Assembler::pt, L_common_transform); - __ delayed()->nop(); + __ ba_short(L_common_transform); __ BIND(L_expand192bit); @@ -3457,8 +3529,7 @@ __ aes_dround01(F44, F52, F54, F56); __ aes_dround23(F42, F56, F58, F54); __ aes_dround01(F40, F56, F58, F52); - __ br(Assembler::always, false, Assembler::pt, L_common_transform); - __ delayed()->nop(); + __ ba_short(L_common_transform); __ BIND(L_expand256bit); @@ -3478,14 +3549,31 @@ __ aes_kexpand2(F50, F56, F58); for ( int i = 0; i <= 6; i += 2 ) { - __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); + __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); } - // load input into F52-F54 + // reload original 'from' address + __ mov(G1, from); + + // re-check 8-byte alignment + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input); + __ delayed()->alignaddr(from, G0, from); + + // aligned case: load input into F52-F54 __ ldf(FloatRegisterImpl::D, from, 0, F52); __ ldf(FloatRegisterImpl::D, from, 8, F54); + __ ba_short(L_256bit_transform); + + __ BIND(L_reload_misaligned_input); + __ ldf(FloatRegisterImpl::D, from, 0, F52); + __ ldf(FloatRegisterImpl::D, from, 8, F54); + __ ldf(FloatRegisterImpl::D, from, 16, F56); + __ faligndata(F52, F54, F52); + __ faligndata(F54, F56, F54); // perform 256-bit key specific inverse cipher transformation + __ BIND(L_256bit_transform); __ fxor(FloatRegisterImpl::D, F0, F54, F54); __ fxor(FloatRegisterImpl::D, F2, F52, F52); __ aes_dround23(F4, F52, F54, F58); @@ -3515,43 +3603,71 @@ } } - // store output to destination array, F0-F1 used as temp - __ fmov(FloatRegisterImpl::D, F52, F0); - __ stf(FloatRegisterImpl::S, F0, to, 0); - __ stf(FloatRegisterImpl::S, F1, to, 4); - __ fmov(FloatRegisterImpl::D, F54, F0); - __ stf(FloatRegisterImpl::S, F0, to, 8); + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, O5); + __ 
br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
+    __ delayed()->edge8n(to, G0, O3);
+
+    // aligned case: store output into the destination array
+    __ stf(FloatRegisterImpl::D, F52, to, 0);
     __ retl();
-    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12);
+    __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
+
+    __ BIND(L_store_misaligned_output);
+    __ add(to, 8, O4);
+    __ mov(8, O2);
+    __ sub(O2, O5, O2);
+    __ alignaddr(O2, G0, O2);
+    __ faligndata(F52, F52, F52);
+    __ faligndata(F54, F54, F54);
+    __ and3(to, -8, to);
+    __ and3(O4, -8, O4);
+    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
+    __ add(to, 8, to);
+    __ add(O4, 8, O4);
+    __ orn(G0, O3, O3);
+    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
+    __ retl();
+    __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);

     return start;
   }

   address generate_cipherBlockChaining_encryptAESCrypt() {
+    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
+           "the following code assumes that first element of an int array is aligned to 8 bytes");
+    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
+           "the following code assumes that first element of a byte array is aligned to 8 bytes");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
-    Label L_cbcenc128, L_cbcenc192, L_cbcenc256;
+    Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
+    Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
+    Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
+    Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
     address start = __ pc();
-    Register from = O0; // source byte array
-    Register to = O1; // destination byte array
-    Register key = O2; // expanded key array
-    Register rvec = O3; // init vector
-    const Register len_reg = O4; // cipher length
-    const Register keylen = O5; // reg for storing expanded key array length
-
-    // save cipher len to return in the end
-    __ mov(len_reg, L1);
+    Register from = I0; // source byte array
+    Register to = I1; // destination byte array
+    Register key = I2; // expanded key array
+    Register rvec = I3; // init vector
+    const Register len_reg = I4; // cipher length
+    const Register keylen = I5; // reg for storing expanded key array length
+
+    // save cipher len before save_frame, to return in the end
+    __ mov(O4, L0);
+    __ save_frame(0);

     // read expanded key length
     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);

-    // load init vector
+    // load initial vector, 8-byte alignment is guaranteed
     __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
     __ ldf(FloatRegisterImpl::D, rvec, 8, F62);

+    // load key, 8-byte alignment is guaranteed
     __ ldx(key,0,G1);
-    __ ldx(key,8,G2);
-
-    // start loading expanded key
+    __ ldx(key,8,G5);
+
+    // start loading expanded key, 8-byte alignment is guaranteed
     for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) {
       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
     }
@@ -3571,15 +3687,35 @@
     }

     // 256-bit original key size
-    __ br(Assembler::always, false, Assembler::pt, L_cbcenc256);
-    __ delayed()->nop();
+    __ ba_short(L_cbcenc256);

     __ align(OptoLoopAlignment);
     __ BIND(L_cbcenc128);
+    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(from, 7, G0);
+    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
+    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
+
+    // aligned case: load input into G3 and G4
     __ ldx(from,0,G3);
     __ ldx(from,8,G4);
+    __ ba_short(L_128bit_transform);
+
+    __ BIND(L_load_misaligned_input_128bit);
+    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
+    __ alignaddr(from, G0, from);
+    __ ldf(FloatRegisterImpl::D, from, 0, F48);
+    __ ldf(FloatRegisterImpl::D, from, 8, F50);
+    __ ldf(FloatRegisterImpl::D, from, 16, F52);
+    __ faligndata(F48, F50, F48);
+    __ faligndata(F50, F52, F50);
+    __ movdtox(F48, G3);
+    __ movdtox(F50, G4);
+    __ mov(L1, from);
+
+    __ BIND(L_128bit_transform);
     __ xor3(G1,G3,G3);
-    __ xor3(G2,G4,G4);
+    __ xor3(G5,G4,G4);
     __ movxtod(G3,F56);
     __ movxtod(G4,F58);
     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
@@ -3598,24 +3734,81 @@
       }
     }

+    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(to, 7, L1);
+    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
+    __ delayed()->edge8n(to, G0, L2);
+
+    // aligned case: store output into the destination array
     __ stf(FloatRegisterImpl::D, F60, to, 0);
     __ stf(FloatRegisterImpl::D, F62, to, 8);
+    __ ba_short(L_check_loop_end_128bit);
+
+    __ BIND(L_store_misaligned_output_128bit);
+    __ add(to, 8, L3);
+    __ mov(8, L4);
+    __ sub(L4, L1, L4);
+    __ alignaddr(L4, G0, L4);
+    // save cipher text before circular right shift
+    // as it needs to be stored as iv for next block (see code before next retl)
+    __ movdtox(F60, L6);
+    __ movdtox(F62, L7);
+    __ faligndata(F60, F60, F60);
+    __ faligndata(F62, F62, F62);
+    __ mov(to, L5);
+    __ and3(to, -8, to);
+    __ and3(L3, -8, L3);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ add(to, 8, to);
+    __ add(L3, 8, L3);
+    __ orn(G0, L2, L2);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ mov(L5, to);
+    __ movxtod(L6, F60);
+    __ movxtod(L7, F62);
+
+    __ BIND(L_check_loop_end_128bit);
     __ add(from, 16, from);
     __ add(to, 16, to);
     __ subcc(len_reg, 16, len_reg);
     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
     __ delayed()->nop();
+    // re-init initial vector for next block, 8-byte alignment is guaranteed
     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
+    __ restore();
     __ retl();
-    __ delayed()->mov(L1, O0);
+    __ delayed()->mov(L0, O0);

     __ align(OptoLoopAlignment);
     __ BIND(L_cbcenc192);
+    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(from, 7, G0);
+    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
+    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
+
+    // aligned case: load input into G3 and G4
     __ ldx(from,0,G3);
     __ ldx(from,8,G4);
+    __ ba_short(L_192bit_transform);
+
+    __ BIND(L_load_misaligned_input_192bit);
+    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
+    __ alignaddr(from, G0, from);
+    __ ldf(FloatRegisterImpl::D, from, 0, F48);
+    __ ldf(FloatRegisterImpl::D, from, 8, F50);
+    __ ldf(FloatRegisterImpl::D, from, 16, F52);
+    __ faligndata(F48, F50, F48);
+    __ faligndata(F50, F52, F50);
+    __ movdtox(F48, G3);
+    __ movdtox(F50, G4);
+    __ mov(L1, from);
+
+    __ BIND(L_192bit_transform);
     __ xor3(G1,G3,G3);
-    __ xor3(G2,G4,G4);
+    __ xor3(G5,G4,G4);
     __ movxtod(G3,F56);
     __ movxtod(G4,F58);
     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
@@ -3634,24 +3827,81 @@
       }
     }

+    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(to, 7, L1);
+    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
+    __ delayed()->edge8n(to, G0, L2);
+
+    // aligned case: store output into the destination array
     __ stf(FloatRegisterImpl::D, F60, to, 0);
     __ stf(FloatRegisterImpl::D, F62, to, 8);
+    __ ba_short(L_check_loop_end_192bit);
+
+    __ BIND(L_store_misaligned_output_192bit);
+    __ add(to, 8, L3);
+    __ mov(8, L4);
+    __ sub(L4, L1, L4);
+    __ alignaddr(L4, G0, L4);
+    __ movdtox(F60, L6);
+    __ movdtox(F62, L7);
+    __ faligndata(F60, F60, F60);
+    __ faligndata(F62, F62, F62);
+    __ mov(to, L5);
+    __ and3(to, -8, to);
+    __ and3(L3, -8, L3);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ add(to, 8, to);
+    __ add(L3, 8, L3);
+    __ orn(G0, L2, L2);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ mov(L5, to);
+    __ movxtod(L6, F60);
+    __ movxtod(L7, F62);
+
+    __ BIND(L_check_loop_end_192bit);
     __ add(from, 16, from);
     __ subcc(len_reg, 16, len_reg);
     __ add(to, 16, to);
     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
     __ delayed()->nop();
+    // re-init initial vector for next block, 8-byte alignment is guaranteed
     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
+    __ restore();
     __ retl();
-    __ delayed()->mov(L1, O0);
+    __ delayed()->mov(L0, O0);

     __ align(OptoLoopAlignment);
     __ BIND(L_cbcenc256);
+    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(from, 7, G0);
+    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
+    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
+
+    // aligned case: load input into G3 and G4
     __ ldx(from,0,G3);
     __ ldx(from,8,G4);
+    __ ba_short(L_256bit_transform);
+
+    __ BIND(L_load_misaligned_input_256bit);
+    // cannot clobber F48, F50 and F52. F56, F58 can be used though
+    __ alignaddr(from, G0, from);
+    __ movdtox(F60, L2); // save F60 before overwriting
+    __ ldf(FloatRegisterImpl::D, from, 0, F56);
+    __ ldf(FloatRegisterImpl::D, from, 8, F58);
+    __ ldf(FloatRegisterImpl::D, from, 16, F60);
+    __ faligndata(F56, F58, F56);
+    __ faligndata(F58, F60, F58);
+    __ movdtox(F56, G3);
+    __ movdtox(F58, G4);
+    __ mov(L1, from);
+    __ movxtod(L2, F60);
+
+    __ BIND(L_256bit_transform);
     __ xor3(G1,G3,G3);
-    __ xor3(G2,G4,G4);
+    __ xor3(G5,G4,G4);
     __ movxtod(G3,F56);
     __ movxtod(G4,F58);
     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
@@ -3670,26 +3920,69 @@
       }
     }

+    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
+    __ andcc(to, 7, L1);
+    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
+    __ delayed()->edge8n(to, G0, L2);
+
+    // aligned case: store output into the destination array
     __ stf(FloatRegisterImpl::D, F60, to, 0);
     __ stf(FloatRegisterImpl::D, F62, to, 8);
+    __ ba_short(L_check_loop_end_256bit);
+
+    __ BIND(L_store_misaligned_output_256bit);
+    __ add(to, 8, L3);
+    __ mov(8, L4);
+    __ sub(L4, L1, L4);
+    __ alignaddr(L4, G0, L4);
+    __ movdtox(F60, L6);
+    __ movdtox(F62, L7);
+    __ faligndata(F60, F60, F60);
+    __ faligndata(F62, F62, F62);
+    __ mov(to, L5);
+    __ and3(to, -8, to);
+    __ and3(L3, -8, L3);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ add(to, 8, to);
+    __ add(L3, 8, L3);
+    __ orn(G0, L2, L2);
+    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
+    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
+    __ mov(L5, to);
+    __ movxtod(L6, F60);
+    __ movxtod(L7, F62);
+
+    __ BIND(L_check_loop_end_256bit);
     __ add(from, 16, from);
     __ subcc(len_reg, 16, len_reg);
     __ add(to, 16, to);
     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
     __ delayed()->nop();
+    // re-init initial vector for next block, 8-byte alignment is guaranteed
     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
+    __ restore();
     __ retl();
-    __ delayed()->mov(L1, O0);
+    __ delayed()->mov(L0, O0);

     return start;
   }

   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
+    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
+           "the following code assumes that first element of an int array is aligned to 8 bytes");
+    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
+           "the following code assumes that first element of a byte array is aligned to 8 bytes");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
     Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
     Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
+    Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
+    Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
+    Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
+    Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
+    Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
     address start = __ pc();
     Register from = I0; // source byte array
     Register to = I1; // destination byte array
     Register key = I2; // expanded key array
@@
-3704,11 +3997,12 @@ __ save_frame(0); //args are read from I* registers since we save the frame in the beginning // load original key from SunJCE expanded decryption key + // Since we load original key buffer starting first element, 8-byte alignment is guaranteed for ( int i = 0; i <= 3; i++ ) { __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); } - // load initial vector + // load initial vector, 8-byte alignment is guaranteed __ ldx(rvec,0,L0); __ ldx(rvec,8,L1); @@ -3733,11 +4027,10 @@ __ movdtox(F42,L3); __ and3(len_reg, 16, L4); - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128); - __ delayed()->nop(); - - __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); - __ delayed()->nop(); + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128); + __ nop(); + + __ ba_short(L_dec_first_block_start); __ BIND(L_expand192bit); // load rest of the 192-bit key @@ -3758,11 +4051,10 @@ __ movdtox(F50,L3); __ and3(len_reg, 16, L4); - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192); - __ delayed()->nop(); - - __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); - __ delayed()->nop(); + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192); + __ nop(); + + __ ba_short(L_dec_first_block_start); __ BIND(L_expand256bit); // load rest of the 256-bit key @@ -3785,12 +4077,32 @@ __ movdtox(F58,L3); __ and3(len_reg, 16, L4); - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256); - __ delayed()->nop(); + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256); __ BIND(L_dec_first_block_start); + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block); + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr + + // aligned case: load input into L4 and L5 __ ldx(from,0,L4); __ ldx(from,8,L5); + __ ba_short(L_transform_first_block); + + __ BIND(L_load_misaligned_input_first_block); + __ alignaddr(from, G0, from); + // F58, F60, F62 can be clobbered + __ ldf(FloatRegisterImpl::D, from, 0, F58); + __ ldf(FloatRegisterImpl::D, from, 8, F60); + __ ldf(FloatRegisterImpl::D, from, 16, F62); + __ faligndata(F58, F60, F58); + __ faligndata(F60, F62, F60); + __ movdtox(F58, L4); + __ movdtox(F60, L5); + __ mov(G1, from); + + __ BIND(L_transform_first_block); __ xor3(L2,L4,G1); __ movxtod(G1,F60); __ xor3(L3,L5,G1); @@ -3833,9 +4145,36 @@ __ fxor(FloatRegisterImpl::D, F56, F60, F60); __ fxor(FloatRegisterImpl::D, F58, F62, F62); + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, G1); + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block); + __ delayed()->edge8n(to, G0, G2); + + // aligned case: store output into the destination array __ stf(FloatRegisterImpl::D, F60, to, 0); __ stf(FloatRegisterImpl::D, F62, to, 8); - + __ ba_short(L_check_decrypt_end); + + __ BIND(L_store_misaligned_output_first_block); + __ add(to, 8, G3); + __ mov(8, G4); + __ sub(G4, G1, G4); + __ alignaddr(G4, G0, G4); + __ faligndata(F60, F60, F60); + __ faligndata(F62, F62, F62); + __ mov(to, G1); + __ and3(to, -8, to); + __ and3(G3, -8, G3); + __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); + __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); + __ add(to, 8, to); + __ add(G3, 8, G3); + __ orn(G0, G2, G2); + __ stpartialf(to, G2, F60, 
Assembler::ASI_PST8_PRIMARY); + __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); + __ mov(G1, to); + + __ BIND(L_check_decrypt_end); __ add(from, 16, from); __ add(to, 16, to); __ subcc(len_reg, 16, len_reg); @@ -3852,17 +4191,44 @@ __ BIND(L_dec_next2_blocks128); __ nop(); - // F40:F42 used for first 16-bytes + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128); + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr + + // aligned case: load input into G4, G5, L4 and L5 __ ldx(from,0,G4); __ ldx(from,8,G5); + __ ldx(from,16,L4); + __ ldx(from,24,L5); + __ ba_short(L_transform_next2_blocks128); + + __ BIND(L_load_misaligned_next2_blocks128); + __ alignaddr(from, G0, from); + // F40, F42, F58, F60, F62 can be clobbered + __ ldf(FloatRegisterImpl::D, from, 0, F40); + __ ldf(FloatRegisterImpl::D, from, 8, F42); + __ ldf(FloatRegisterImpl::D, from, 16, F60); + __ ldf(FloatRegisterImpl::D, from, 24, F62); + __ ldf(FloatRegisterImpl::D, from, 32, F58); + __ faligndata(F40, F42, F40); + __ faligndata(F42, F60, F42); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F58, F62); + __ movdtox(F40, G4); + __ movdtox(F42, G5); + __ movdtox(F60, L4); + __ movdtox(F62, L5); + __ mov(G1, from); + + __ BIND(L_transform_next2_blocks128); + // F40:F42 used for first 16-bytes __ xor3(L2,G4,G1); __ movxtod(G1,F40); __ xor3(L3,G5,G1); __ movxtod(G1,F42); // F60:F62 used for next 16-bytes - __ ldx(from,16,L4); - __ ldx(from,24,L5); __ xor3(L2,L4,G1); __ movxtod(G1,F60); __ xor3(L3,L5,G1); @@ -3891,9 +4257,6 @@ __ fxor(FloatRegisterImpl::D, F46, F40, F40); __ fxor(FloatRegisterImpl::D, F44, F42, F42); - __ stf(FloatRegisterImpl::D, F40, to, 0); - __ stf(FloatRegisterImpl::D, F42, to, 8); - __ movxtod(G4,F56); __ movxtod(G5,F58); __ mov(L4,L0); @@ -3901,32 +4264,93 @@ __ fxor(FloatRegisterImpl::D, F56, F60, F60); __ fxor(FloatRegisterImpl::D, F58, F62, F62); + // For mis-aligned store of 32 bytes of result we can do: + // Circular right-shift all 4 FP registers so that 'head' and 'tail' + // parts that need to be stored starting at mis-aligned address are in a FP reg + // the other 3 FP regs can thus be stored using regular store + // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts + + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, G1); + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128); + __ delayed()->edge8n(to, G0, G2); + + // aligned case: store output into the destination array + __ stf(FloatRegisterImpl::D, F40, to, 0); + __ stf(FloatRegisterImpl::D, F42, to, 8); __ stf(FloatRegisterImpl::D, F60, to, 16); __ stf(FloatRegisterImpl::D, F62, to, 24); - + __ ba_short(L_check_decrypt_loop_end128); + + __ BIND(L_store_misaligned_output_next2_blocks128); + __ mov(8, G4); + __ sub(G4, G1, G4); + __ alignaddr(G4, G0, G4); + __ faligndata(F40, F42, F56); // F56 can be clobbered + __ faligndata(F42, F60, F42); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F40, F40); + __ mov(to, G1); + __ and3(to, -8, to); + __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); + __ stf(FloatRegisterImpl::D, F56, to, 8); + __ stf(FloatRegisterImpl::D, F42, to, 16); + __ stf(FloatRegisterImpl::D, F60, to, 24); + __ add(to, 32, to); + __ orn(G0, G2, G2); + __ 
stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); + __ mov(G1, to); + + __ BIND(L_check_decrypt_loop_end128); __ add(from, 32, from); __ add(to, 32, to); __ subcc(len_reg, 32, len_reg); __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); __ delayed()->nop(); - __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); - __ delayed()->nop(); + __ ba_short(L_cbcdec_end); __ align(OptoLoopAlignment); __ BIND(L_dec_next2_blocks192); __ nop(); - // F48:F50 used for first 16-bytes + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192); + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr + + // aligned case: load input into G4, G5, L4 and L5 __ ldx(from,0,G4); __ ldx(from,8,G5); + __ ldx(from,16,L4); + __ ldx(from,24,L5); + __ ba_short(L_transform_next2_blocks192); + + __ BIND(L_load_misaligned_next2_blocks192); + __ alignaddr(from, G0, from); + // F48, F50, F52, F60, F62 can be clobbered + __ ldf(FloatRegisterImpl::D, from, 0, F48); + __ ldf(FloatRegisterImpl::D, from, 8, F50); + __ ldf(FloatRegisterImpl::D, from, 16, F60); + __ ldf(FloatRegisterImpl::D, from, 24, F62); + __ ldf(FloatRegisterImpl::D, from, 32, F52); + __ faligndata(F48, F50, F48); + __ faligndata(F50, F60, F50); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F52, F62); + __ movdtox(F48, G4); + __ movdtox(F50, G5); + __ movdtox(F60, L4); + __ movdtox(F62, L5); + __ mov(G1, from); + + __ BIND(L_transform_next2_blocks192); + // F48:F50 used for first 16-bytes __ xor3(L2,G4,G1); __ movxtod(G1,F48); __ xor3(L3,G5,G1); __ movxtod(G1,F50); // F60:F62 used for next 16-bytes - __ ldx(from,16,L4); - __ ldx(from,24,L5); __ xor3(L2,L4,G1); __ movxtod(G1,F60); __ xor3(L3,L5,G1); @@ -3955,9 +4379,6 @@ __ fxor(FloatRegisterImpl::D, F54, F48, F48); __ fxor(FloatRegisterImpl::D, F52, F50, F50); - __ stf(FloatRegisterImpl::D, F48, to, 0); - __ stf(FloatRegisterImpl::D, F50, to, 8); - __ movxtod(G4,F56); __ movxtod(G5,F58); __ mov(L4,L0); @@ -3965,32 +4386,87 @@ __ fxor(FloatRegisterImpl::D, F56, F60, F60); __ fxor(FloatRegisterImpl::D, F58, F62, F62); + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, G1); + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192); + __ delayed()->edge8n(to, G0, G2); + + // aligned case: store output into the destination array + __ stf(FloatRegisterImpl::D, F48, to, 0); + __ stf(FloatRegisterImpl::D, F50, to, 8); __ stf(FloatRegisterImpl::D, F60, to, 16); __ stf(FloatRegisterImpl::D, F62, to, 24); - + __ ba_short(L_check_decrypt_loop_end192); + + __ BIND(L_store_misaligned_output_next2_blocks192); + __ mov(8, G4); + __ sub(G4, G1, G4); + __ alignaddr(G4, G0, G4); + __ faligndata(F48, F50, F56); // F56 can be clobbered + __ faligndata(F50, F60, F50); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F48, F48); + __ mov(to, G1); + __ and3(to, -8, to); + __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); + __ stf(FloatRegisterImpl::D, F56, to, 8); + __ stf(FloatRegisterImpl::D, F50, to, 16); + __ stf(FloatRegisterImpl::D, F60, to, 24); + __ add(to, 32, to); + __ orn(G0, G2, G2); + __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); + __ mov(G1, to); + + __ BIND(L_check_decrypt_loop_end192); __ add(from, 32, from); __ add(to, 32, to); __ subcc(len_reg, 32, len_reg); 
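All of the misaligned load paths above follow the same VIS idiom: alignaddr rounds 'from' down to an 8-byte boundary and latches the low three address bits in %gsr.align, one extra doubleword is loaded past the end, and faligndata extracts each realigned doubleword from a pair of neighbouring aligned ones. A rough C++ model of that idiom, assuming big-endian byte order as on SPARC (names are illustrative, not HotSpot code):

    #include <cstdint>

    // Model of faligndata: select bytes [align, align+7] of the 16-byte
    // concatenation a:b, where align is what alignaddr latched (addr & 7).
    static inline uint64_t faligndata_model(uint64_t a, uint64_t b, unsigned align) {
      return align == 0 ? a : (a << (8 * align)) | (b >> (8 * (8 - align)));
    }

    // Load 16 misaligned bytes the way the stub does: three aligned loads,
    // then two realignments (assumes uint64_t loads are big-endian).
    static void load16_misaligned(const uint8_t* from, uint64_t out[2]) {
      unsigned align = (uintptr_t)from & 7;
      const uint64_t* p = (const uint64_t*)((uintptr_t)from & ~(uintptr_t)7);
      out[0] = faligndata_model(p[0], p[1], align);
      out[1] = faligndata_model(p[1], p[2], align);
    }

The store side is the mirror image: edge8n computes a byte mask covering the partial leading doubleword, stpartialf stores only the bytes selected by that mask, and orn inverts the mask for the trailing edge.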
__ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); __ delayed()->nop(); - __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); - __ delayed()->nop(); + __ ba_short(L_cbcdec_end); __ align(OptoLoopAlignment); __ BIND(L_dec_next2_blocks256); __ nop(); - // F0:F2 used for first 16-bytes + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero + __ andcc(from, 7, G0); + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256); + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr + + // aligned case: load input into G4, G5, L4 and L5 __ ldx(from,0,G4); __ ldx(from,8,G5); + __ ldx(from,16,L4); + __ ldx(from,24,L5); + __ ba_short(L_transform_next2_blocks256); + + __ BIND(L_load_misaligned_next2_blocks256); + __ alignaddr(from, G0, from); + // F0, F2, F4, F60, F62 can be clobbered + __ ldf(FloatRegisterImpl::D, from, 0, F0); + __ ldf(FloatRegisterImpl::D, from, 8, F2); + __ ldf(FloatRegisterImpl::D, from, 16, F60); + __ ldf(FloatRegisterImpl::D, from, 24, F62); + __ ldf(FloatRegisterImpl::D, from, 32, F4); + __ faligndata(F0, F2, F0); + __ faligndata(F2, F60, F2); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F4, F62); + __ movdtox(F0, G4); + __ movdtox(F2, G5); + __ movdtox(F60, L4); + __ movdtox(F62, L5); + __ mov(G1, from); + + __ BIND(L_transform_next2_blocks256); + // F0:F2 used for first 16-bytes __ xor3(L2,G4,G1); __ movxtod(G1,F0); __ xor3(L3,G5,G1); __ movxtod(G1,F2); // F60:F62 used for next 16-bytes - __ ldx(from,16,L4); - __ ldx(from,24,L5); __ xor3(L2,L4,G1); __ movxtod(G1,F60); __ xor3(L3,L5,G1); @@ -4043,9 +4519,6 @@ __ fxor(FloatRegisterImpl::D, F6, F0, F0); __ fxor(FloatRegisterImpl::D, F4, F2, F2); - __ stf(FloatRegisterImpl::D, F0, to, 0); - __ stf(FloatRegisterImpl::D, F2, to, 8); - __ movxtod(G4,F56); __ movxtod(G5,F58); __ mov(L4,L0); @@ -4053,9 +4526,38 @@ __ fxor(FloatRegisterImpl::D, F56, F60, F60); __ fxor(FloatRegisterImpl::D, F58, F62, F62); + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero + __ andcc(to, 7, G1); + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256); + __ delayed()->edge8n(to, G0, G2); + + // aligned case: store output into the destination array + __ stf(FloatRegisterImpl::D, F0, to, 0); + __ stf(FloatRegisterImpl::D, F2, to, 8); __ stf(FloatRegisterImpl::D, F60, to, 16); __ stf(FloatRegisterImpl::D, F62, to, 24); - + __ ba_short(L_check_decrypt_loop_end256); + + __ BIND(L_store_misaligned_output_next2_blocks256); + __ mov(8, G4); + __ sub(G4, G1, G4); + __ alignaddr(G4, G0, G4); + __ faligndata(F0, F2, F56); // F56 can be clobbered + __ faligndata(F2, F60, F2); + __ faligndata(F60, F62, F60); + __ faligndata(F62, F0, F0); + __ mov(to, G1); + __ and3(to, -8, to); + __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); + __ stf(FloatRegisterImpl::D, F56, to, 8); + __ stf(FloatRegisterImpl::D, F2, to, 16); + __ stf(FloatRegisterImpl::D, F60, to, 24); + __ add(to, 32, to); + __ orn(G0, G2, G2); + __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); + __ mov(G1, to); + + __ BIND(L_check_decrypt_loop_end256); __ add(from, 32, from); __ add(to, 32, to); __ subcc(len_reg, 32, len_reg); @@ -4063,6 +4565,7 @@ __ delayed()->nop(); __ BIND(L_cbcdec_end); + // re-init initial vector for next block, 8-byte alignment is guaranteed __ stx(L0, rvec, 0); __ stx(L1, rvec, 8); __ restore(); diff -r 7dd67cb4f225 -r 
28bbbecff5f0 src/cpu/sparc/vm/stubRoutines_sparc.hpp --- a/src/cpu/sparc/vm/stubRoutines_sparc.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/sparc/vm/stubRoutines_sparc.hpp Thu May 08 23:07:11 2014 -0700 @@ -41,7 +41,7 @@ enum /* platform_dependent_constants */ { // %%%%%%%% May be able to shrink this a lot code_size1 = 20000, // simply increase if too small (assembler will crash if too small) - code_size2 = 20000 // simply increase if too small (assembler will crash if too small) + code_size2 = 22000 // simply increase if too small (assembler will crash if too small) }; class Sparc { diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/sparc/vm/vm_version_sparc.cpp --- a/src/cpu/sparc/vm/vm_version_sparc.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp Thu May 08 23:07:11 2014 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -266,9 +266,9 @@ if (!has_vis1()) // Drop to 0 if no VIS1 support UseVIS = 0; - // T2 and above should have support for AES instructions + // SPARC T4 and above should have support for AES instructions if (has_aes()) { - if (UseVIS > 0) { // AES intrinsics use FXOR instruction which is VIS1 + if (UseVIS > 2) { // AES intrinsics use MOVxTOd/MOVdTOx which are VIS3 if (FLAG_IS_DEFAULT(UseAES)) { FLAG_SET_DEFAULT(UseAES, true); } @@ -282,7 +282,7 @@ } } else { if (UseAES || UseAESIntrinsics) { - warning("SPARC AES intrinsics require VIS1 instruction support. Intrinsics will be disabled."); + warning("SPARC AES intrinsics require VIS3 instruction support. Intrinsics will be disabled."); if (UseAES) { FLAG_SET_DEFAULT(UseAES, false); } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/x86/vm/assembler_x86.cpp --- a/src/cpu/x86/vm/assembler_x86.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/x86/vm/assembler_x86.cpp Thu May 08 23:07:11 2014 -0700 @@ -1766,7 +1766,7 @@ // Move Unaligned 256bit Vector void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) { - assert(UseAVX, ""); + assert(UseAVX > 0, ""); bool vector256 = true; int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256); emit_int8(0x6F); @@ -1774,7 +1774,7 @@ } void Assembler::vmovdqu(XMMRegister dst, Address src) { - assert(UseAVX, ""); + assert(UseAVX > 0, ""); InstructionMark im(this); bool vector256 = true; vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256); @@ -1783,7 +1783,7 @@ } void Assembler::vmovdqu(Address dst, XMMRegister src) { - assert(UseAVX, ""); + assert(UseAVX > 0, ""); InstructionMark im(this); bool vector256 = true; // swap src<->dst for encoding diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/cpu/x86/vm/vm_version_x86.cpp --- a/src/cpu/x86/vm/vm_version_x86.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/cpu/x86/vm/vm_version_x86.cpp Thu May 08 23:07:11 2014 -0700 @@ -263,6 +263,10 @@ // and check upper YMM bits after it. 
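The vm_version_x86 hunk just below has to emit AVX instructions (the ymm probe) before UseAVX and UseSSE have been ergonomically initialized, so it temporarily forces both flags on for the probe and restores the saved values afterwards; hence the saved_useavx/saved_usesse pair. The same save/restore idea expressed as a scope guard, sketched under the assumption that intx is HotSpot's pointer-sized flag type (hypothetical helper, not part of the patch):

    #include <cstdint>

    using intx = intptr_t;  // assumption: HotSpot's integer flag type

    // Scoped override: force a flag for the duration of a block, restore on exit.
    class ScopedFlagOverride {
      intx& _flag;
      intx  _saved;
     public:
      ScopedFlagOverride(intx& flag, intx value) : _flag(flag), _saved(flag) { _flag = value; }
      ~ScopedFlagOverride() { _flag = _saved; }
    };

    // Illustrative use:
    //   { ScopedFlagOverride avx(UseAVX, 1), sse(UseSSE, 2);
    //     /* generate the ymm save/check probe */ }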
// VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts + intx saved_useavx = UseAVX; + intx saved_usesse = UseSSE; + UseAVX = 1; + UseSSE = 2; // load value into all 32 bytes of ymm7 register __ movl(rcx, VM_Version::ymm_test_value()); @@ -292,6 +296,8 @@ #endif VM_Version::clean_cpuFeatures(); + UseAVX = saved_useavx; + UseSSE = saved_usesse; // // cpuid(0x7) Structured Extended Features diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/os_cpu/linux_ppc/vm/atomic_linux_ppc.inline.hpp --- a/src/os_cpu/linux_ppc/vm/atomic_linux_ppc.inline.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/os_cpu/linux_ppc/vm/atomic_linux_ppc.inline.hpp Thu May 08 23:07:11 2014 -0700 @@ -53,41 +53,41 @@ inline jlong Atomic::load(volatile jlong* src) { return *src; } -/* - machine barrier instructions: - - - sync two-way memory barrier, aka fence - - lwsync orders Store|Store, - Load|Store, - Load|Load, - but not Store|Load - - eieio orders memory accesses for device memory (only) - - isync invalidates speculatively executed instructions - From the POWER ISA 2.06 documentation: - "[...] an isync instruction prevents the execution of - instructions following the isync until instructions - preceding the isync have completed, [...]" - From IBM's AIX assembler reference: - "The isync [...] instructions causes the processor to - refetch any instructions that might have been fetched - prior to the isync instruction. The instruction isync - causes the processor to wait for all previous instructions - to complete. Then any instructions already fetched are - discarded and instruction processing continues in the - environment established by the previous instructions." - - semantic barrier instructions: - (as defined in orderAccess.hpp) - - - release orders Store|Store, (maps to lwsync) - Load|Store - - acquire orders Load|Store, (maps to lwsync) - Load|Load - - fence orders Store|Store, (maps to sync) - Load|Store, - Load|Load, - Store|Load -*/ +// +// machine barrier instructions: +// +// - sync two-way memory barrier, aka fence +// - lwsync orders Store|Store, +// Load|Store, +// Load|Load, +// but not Store|Load +// - eieio orders memory accesses for device memory (only) +// - isync invalidates speculatively executed instructions +// From the POWER ISA 2.06 documentation: +// "[...] an isync instruction prevents the execution of +// instructions following the isync until instructions +// preceding the isync have completed, [...]" +// From IBM's AIX assembler reference: +// "The isync [...] instructions causes the processor to +// refetch any instructions that might have been fetched +// prior to the isync instruction. The instruction isync +// causes the processor to wait for all previous instructions +// to complete. Then any instructions already fetched are +// discarded and instruction processing continues in the +// environment established by the previous instructions." 
+// +// semantic barrier instructions: +// (as defined in orderAccess.hpp) +// +// - release orders Store|Store, (maps to lwsync) +// Load|Store +// - acquire orders Load|Store, (maps to lwsync) +// Load|Load +// - fence orders Store|Store, (maps to sync) +// Load|Store, +// Load|Load, +// Store|Load +// #define strasm_sync "\n sync \n" #define strasm_lwsync "\n lwsync \n" diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/ci/ciReplay.cpp --- a/src/share/vm/ci/ciReplay.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/ci/ciReplay.cpp Thu May 08 23:07:11 2014 -0700 @@ -376,11 +376,15 @@ int c = getc(_stream); while(c != EOF) { c = get_line(c); - process_command(CHECK); + process_command(THREAD); if (had_error()) { tty->print_cr("Error while parsing line %d: %s\n", line_no, _error_message); - tty->print_cr("%s", _buffer); - return; + if (ReplayIgnoreInitErrors) { + CLEAR_PENDING_EXCEPTION; + _error_message = NULL; + } else { + return; + } } line_no++; } @@ -565,10 +569,14 @@ void process_ciMethodData(TRAPS) { Method* method = parse_method(CHECK); if (had_error()) return; - /* jsut copied from Method, to build interpret data*/ + /* just copied from Method, to build interpret data*/ if (InstanceRefKlass::owns_pending_list_lock((JavaThread*)THREAD)) { return; } + // To be properly initialized, some profiling in the MDO needs the + // method to be rewritten (number of arguments at a call for + // instance) + method->method_holder()->link_class(CHECK); // methodOopDesc::build_interpreter_method_data(method, CHECK); { // Grab a lock here to prevent multiple diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/classfile/vmSymbols.hpp --- a/src/share/vm/classfile/vmSymbols.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/classfile/vmSymbols.hpp Thu May 08 23:07:11 2014 -0700 @@ -774,7 +774,7 @@ /* java/lang/ref/Reference */ \ do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \ \ - /* support for com.sum.crypto.provider.AESCrypt and some of its callers */ \ + /* support for com.sun.crypto.provider.AESCrypt and some of its callers */ \ do_class(com_sun_crypto_provider_aescrypt, "com/sun/crypto/provider/AESCrypt") \ do_intrinsic(_aescrypt_encryptBlock, com_sun_crypto_provider_aescrypt, encryptBlock_name, byteArray_int_byteArray_int_signature, F_R) \ do_intrinsic(_aescrypt_decryptBlock, com_sun_crypto_provider_aescrypt, decryptBlock_name, byteArray_int_byteArray_int_signature, F_R) \ diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/code/nmethod.cpp --- a/src/share/vm/code/nmethod.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/code/nmethod.cpp Thu May 08 23:07:11 2014 -0700 @@ -771,7 +771,11 @@ _hotness_counter = NMethodSweeper::hotness_counter_reset_val(); code_buffer->copy_values_to(this); - debug_only(verify_scavenge_root_oops()); + if (ScavengeRootsInCode && detect_scavenge_root_oops()) { + CodeCache::add_scavenge_root_nmethod(this); + Universe::heap()->register_nmethod(this); + } + DEBUG_ONLY(verify_scavenge_root_oops();) CodeCache::commit(this); } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/oops/klass.cpp --- a/src/share/vm/oops/klass.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/oops/klass.cpp Thu May 08 23:07:11 2014 -0700 @@ -496,6 +496,7 @@ } void Klass::restore_unshareable_info(TRAPS) { + TRACE_INIT_ID(this); // If an exception happened during CDS restore, some of these fields may already be // set. We leave the class on the CLD list, even if incomplete so that we don't // modify the CLD list outside a safepoint. 
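In the ciReplay hunk above, changing process_command(CHECK) to process_command(THREAD) changes who deals with a pending exception: CHECK makes the caller return immediately, while passing THREAD leaves the exception pending so the parse loop can consult ReplayIgnoreInitErrors and CLEAR_PENDING_EXCEPTION itself. The approximate shape of the macros, simplified for orientation (the real definitions live in utilities/exceptions.hpp):

    // #define TRAPS  Thread* THREAD
    // #define CHECK  THREAD); if (HAS_PENDING_EXCEPTION) return; (void)(0
    //
    // so f(CHECK) expands to roughly:
    //   f(THREAD); if (HAS_PENDING_EXCEPTION) return;
    // while f(THREAD) defers the pending-exception check to the calling code.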
diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/compile.cpp --- a/src/share/vm/opto/compile.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/compile.cpp Thu May 08 23:07:11 2014 -0700 @@ -693,6 +693,7 @@ #endif set_print_inlining(PrintInlining || method()->has_option("PrintInlining") NOT_PRODUCT( || PrintOptoInlining)); set_print_intrinsics(PrintIntrinsics || method()->has_option("PrintIntrinsics")); + set_has_irreducible_loop(true); // conservative until build_loop_tree() reset it if (ProfileTraps RTM_OPT_ONLY( || UseRTMLocking )) { // Make sure the method being compiled gets its own MDO, @@ -977,6 +978,8 @@ set_print_assembly(PrintFrameConverterAssembly); set_parsed_irreducible_loop(false); #endif + set_has_irreducible_loop(false); // no loops + CompileWrapper cw(this); Init(/*AliasLevel=*/ 0); init_tf((*generator)()); @@ -1147,7 +1150,7 @@ if( start->is_Start() ) return start->as_Start(); } - ShouldNotReachHere(); + fatal("Did not find Start node!"); return NULL; } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/compile.hpp --- a/src/share/vm/opto/compile.hpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/compile.hpp Thu May 08 23:07:11 2014 -0700 @@ -319,6 +319,7 @@ bool _trace_opto_output; bool _parsed_irreducible_loop; // True if ciTypeFlow detected irreducible loops during parsing #endif + bool _has_irreducible_loop; // Found irreducible loops // JSR 292 bool _has_method_handle_invokes; // True if this method has MethodHandle invokes. RTMState _rtm_state; // State of Restricted Transactional Memory usage @@ -605,6 +606,8 @@ void set_parsed_irreducible_loop(bool z) { _parsed_irreducible_loop = z; } int _in_dump_cnt; // Required for dumping ir nodes. #endif + bool has_irreducible_loop() const { return _has_irreducible_loop; } + void set_has_irreducible_loop(bool z) { _has_irreducible_loop = z; } // JSR 292 bool has_method_handle_invokes() const { return _has_method_handle_invokes; } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/loopnode.cpp --- a/src/share/vm/opto/loopnode.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/loopnode.cpp Thu May 08 23:07:11 2014 -0700 @@ -266,9 +266,9 @@ // Counted loop head must be a good RegionNode with only 3 not NULL // control input edges: Self, Entry, LoopBack. - if (x->in(LoopNode::Self) == NULL || x->req() != 3) + if (x->in(LoopNode::Self) == NULL || x->req() != 3 || loop->_irreducible) { return false; - + } Node *init_control = x->in(LoopNode::EntryControl); Node *back_control = x->in(LoopNode::LoopBackControl); if (init_control == NULL || back_control == NULL) // Partially dead @@ -1522,11 +1522,11 @@ // If I have one hot backedge, peel off myself loop. // I better be the outermost loop. - if( _head->req() > 3 ) { + if (_head->req() > 3 && !_irreducible) { split_outer_loop( phase ); result = true; - } else if( !_head->is_Loop() && !_irreducible ) { + } else if (!_head->is_Loop() && !_irreducible) { // Make a new LoopNode to replace the old loop head Node *l = new (phase->C) LoopNode( _head->in(1), _head->in(2) ); l = igvn.register_new_node_with_optimizer(l, _head); @@ -2938,6 +2938,7 @@ return pre_order; } } + C->set_has_irreducible_loop(_has_irreducible_loops); } // This Node might be a decision point for loops. 
It is only if diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/memnode.cpp --- a/src/share/vm/opto/memnode.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/memnode.cpp Thu May 08 23:07:11 2014 -0700 @@ -306,33 +306,16 @@ int alias_idx = phase->C->get_alias_index(t_adr->is_ptr()); } -#ifdef ASSERT Node* base = NULL; - if (address->is_AddP()) + if (address->is_AddP()) { base = address->in(AddPNode::Base); + } if (base != NULL && phase->type(base)->higher_equal(TypePtr::NULL_PTR) && !t_adr->isa_rawptr()) { // Note: raw address has TOP base and top->higher_equal(TypePtr::NULL_PTR) is true. - Compile* C = phase->C; - tty->cr(); - tty->print_cr("===== NULL+offs not RAW address ====="); - if (C->is_dead_node(this->_idx)) tty->print_cr("'this' is dead"); - if ((ctl != NULL) && C->is_dead_node(ctl->_idx)) tty->print_cr("'ctl' is dead"); - if (C->is_dead_node(mem->_idx)) tty->print_cr("'mem' is dead"); - if (C->is_dead_node(address->_idx)) tty->print_cr("'address' is dead"); - if (C->is_dead_node(base->_idx)) tty->print_cr("'base' is dead"); - tty->cr(); - base->dump(1); - tty->cr(); - this->dump(2); - tty->print("this->adr_type(): "); adr_type()->dump(); tty->cr(); - tty->print("phase->type(address): "); t_adr->dump(); tty->cr(); - tty->print("phase->type(base): "); phase->type(address)->dump(); tty->cr(); - tty->cr(); + // Skip this node optimization if its address has TOP base. + return NodeSentinel; // caller will return NULL } - assert(base == NULL || t_adr->isa_rawptr() || - !phase->type(base)->higher_equal(TypePtr::NULL_PTR), "NULL+offs not RAW address?"); -#endif // Avoid independent memory operations Node* old_mem = mem; diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/node.cpp --- a/src/share/vm/opto/node.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/node.cpp Thu May 08 23:07:11 2014 -0700 @@ -27,6 +27,7 @@ #include "memory/allocation.inline.hpp" #include "opto/cfgnode.hpp" #include "opto/connode.hpp" +#include "opto/loopnode.hpp" #include "opto/machnode.hpp" #include "opto/matcher.hpp" #include "opto/node.hpp" @@ -1255,6 +1256,7 @@ Node *top = igvn->C->top(); nstack.push(dead); + bool has_irreducible_loop = igvn->C->has_irreducible_loop(); while (nstack.size() > 0) { dead = nstack.pop(); @@ -1269,13 +1271,31 @@ assert (!use->is_Con(), "Control for Con node should be Root node."); use->set_req(0, top); // Cut dead edge to prevent processing nstack.push(use); // the dead node again. + } else if (!has_irreducible_loop && // Backedge could be alive in irreducible loop + use->is_Loop() && !use->is_Root() && // Don't kill Root (RootNode extends LoopNode) + use->in(LoopNode::EntryControl) == dead) { // Dead loop if its entry is dead + use->set_req(LoopNode::EntryControl, top); // Cut dead edge to prevent processing + use->set_req(0, top); // Cut self edge + nstack.push(use); } else { // Else found a not-dead user + // Dead if all inputs are top or null + bool dead_use = !use->is_Root(); // Keep empty graph alive for (uint j = 1; j < use->req(); j++) { - if (use->in(j) == dead) { // Turn all dead inputs into TOP + Node* in = use->in(j); + if (in == dead) { // Turn all dead inputs into TOP use->set_req(j, top); + } else if (in != NULL && !in->is_top()) { + dead_use = false; } } - igvn->_worklist.push(use); + if (dead_use) { + if (use->is_Region()) { + use->set_req(0, top); // Cut self edge + } + nstack.push(use); + } else { + igvn->_worklist.push(use); + } } // Refresh the iterator, since any number of kills might have happened. 
k = dead->last_outs(kmin); diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/opto/runtime.cpp --- a/src/share/vm/opto/runtime.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/opto/runtime.cpp Thu May 08 23:07:11 2014 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -870,7 +870,7 @@ return TypeFunc::make(domain, range); } -// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning void +// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning int const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { // create input type (domain) int num_args = 5; diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/runtime/advancedThresholdPolicy.cpp --- a/src/share/vm/runtime/advancedThresholdPolicy.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/runtime/advancedThresholdPolicy.cpp Thu May 08 23:07:11 2014 -0700 @@ -53,7 +53,8 @@ } set_c1_count(MAX2(count / 3, 1)); - set_c2_count(MAX2(count - count / 3, 1)); + set_c2_count(MAX2(count - c1_count(), 1)); + FLAG_SET_ERGO(intx, CICompilerCount, c1_count() + c2_count()); // Some inlining tuning #ifdef X86 diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/runtime/arguments.cpp --- a/src/share/vm/runtime/arguments.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/runtime/arguments.cpp Thu May 08 23:07:11 2014 -0700 @@ -2383,6 +2383,10 @@ status &= verify_interval(NmethodSweepFraction, 1, ReservedCodeCacheSize/K, "NmethodSweepFraction"); status &= verify_interval(NmethodSweepActivity, 0, 2000, "NmethodSweepActivity"); + if (!FLAG_IS_DEFAULT(CICompilerCount) && !FLAG_IS_DEFAULT(CICompilerCountPerCPU) && CICompilerCountPerCPU) { + warning("The VM option CICompilerCountPerCPU overrides CICompilerCount."); + } + return status; } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/runtime/compilationPolicy.cpp --- a/src/share/vm/runtime/compilationPolicy.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/runtime/compilationPolicy.cpp Thu May 08 23:07:11 2014 -0700 @@ -182,6 +182,7 @@ // max(log2(8)-1,1) = 2 compiler threads on an 8-way machine. // May help big-app startup time. _compiler_count = MAX2(log2_intptr(os::active_processor_count())-1,1); + FLAG_SET_ERGO(intx, CICompilerCount, _compiler_count); } else { _compiler_count = CICompilerCount; } diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/runtime/sharedRuntime.cpp --- a/src/share/vm/runtime/sharedRuntime.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/runtime/sharedRuntime.cpp Thu May 08 23:07:11 2014 -0700 @@ -2690,19 +2690,20 @@ JRT_END #ifdef HAVE_DTRACE_H -// Create a dtrace nmethod for this method. The wrapper converts the -// java compiled calling convention to the native convention, makes a dummy call -// (actually nops for the size of the call instruction, which become a trap if -// probe is enabled). The returns to the caller. Since this all looks like a -// leaf no thread transition is needed. - +/** + * Create a dtrace nmethod for this method. The wrapper converts the + * Java-compiled calling convention to the native convention, makes a dummy call + * (actually nops for the size of the call instruction, which become a trap if + * probe is enabled), and finally returns to the caller. 
Since this all looks like a + * leaf, no thread transition is needed. + */ nmethod *AdapterHandlerLibrary::create_dtrace_nmethod(methodHandle method) { ResourceMark rm; nmethod* nm = NULL; if (PrintCompilation) { ttyLocker ttyl; - tty->print("--- n%s "); + tty->print("--- n "); method->print_short_name(tty); if (method->is_static()) { tty->print(" (static)"); diff -r 7dd67cb4f225 -r 28bbbecff5f0 src/share/vm/runtime/simpleThresholdPolicy.cpp --- a/src/share/vm/runtime/simpleThresholdPolicy.cpp Wed May 07 10:58:47 2014 -0700 +++ b/src/share/vm/runtime/simpleThresholdPolicy.cpp Thu May 08 23:07:11 2014 -0700 @@ -142,7 +142,8 @@ count = MAX2(log2_intptr(os::active_processor_count()), 1) * 3 / 2; } set_c1_count(MAX2(count / 3, 1)); - set_c2_count(MAX2(count - count / 3, 1)); + set_c2_count(MAX2(count - c1_count(), 1)); + FLAG_SET_ERGO(intx, CICompilerCount, c1_count() + c2_count()); } void SimpleThresholdPolicy::set_carry_if_necessary(InvocationCounter *counter) { @@ -191,6 +192,10 @@ thread->is_interp_only_mode()) { return NULL; } + if (CompileTheWorld || ReplayCompiles) { + // Don't trigger other compiles in testing mode + return NULL; + } nmethod *osr_nm = NULL; handle_counter_overflow(method()); diff -r 7dd67cb4f225 -r 28bbbecff5f0 test/compiler/7184394/TestAESBase.java --- a/test/compiler/7184394/TestAESBase.java Wed May 07 10:58:47 2014 -0700 +++ b/test/compiler/7184394/TestAESBase.java Thu May 08 23:07:11 2014 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -40,9 +40,20 @@ int msgSize = Integer.getInteger("msgSize", 646); boolean checkOutput = Boolean.getBoolean("checkOutput"); boolean noReinit = Boolean.getBoolean("noReinit"); + boolean testingMisalignment; + private static final int ALIGN = 8; + int encInputOffset = Integer.getInteger("encInputOffset", 0) % ALIGN; + int encOutputOffset = Integer.getInteger("encOutputOffset", 0) % ALIGN; + int decOutputOffset = Integer.getInteger("decOutputOffset", 0) % ALIGN; + int lastChunkSize = Integer.getInteger("lastChunkSize", 32); int keySize = Integer.getInteger("keySize", 128); + int inputLength; + int encodeLength; + int decodeLength; + int decodeMsgSize; String algorithm = System.getProperty("algorithm", "AES"); String mode = System.getProperty("mode", "CBC"); + String paddingStr = System.getProperty("paddingStr", "PKCS5Padding"); byte[] input; byte[] encode; byte[] expectedEncode; @@ -51,7 +62,6 @@ Random random = new Random(0); Cipher cipher; Cipher dCipher; - String paddingStr = "PKCS5Padding"; AlgorithmParameters algParams; SecretKey key; @@ -67,7 +77,10 @@ public void prepare() { try { - System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput); + System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", paddingStr=" + paddingStr + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput + ", encInputOffset=" + encInputOffset + ", encOutputOffset=" + encOutputOffset + ", decOutputOffset=" + decOutputOffset + ", lastChunkSize=" +lastChunkSize ); + + if (encInputOffset % ALIGN != 0 || encOutputOffset % ALIGN != 0 || decOutputOffset % ALIGN !=0 ) + testingMisalignment = true; int keyLenBytes = 
(keySize == 0 ? 16 : keySize/8); byte keyBytes[] = new byte[keyLenBytes]; @@ -81,10 +94,6 @@ System.out.println("Algorithm: " + key.getAlgorithm() + "(" + key.getEncoded().length * 8 + "bit)"); } - input = new byte[msgSize]; - for (int i=0; i 0 ? Integer.valueOf(args[0]) : 1000000); + int warmupIters = (args.length > 1 ? Integer.valueOf(args[1]) : 20000); System.out.println(iters + " iterations"); TestAESEncode etest = new TestAESEncode(); etest.prepare(); - // warm-up for 20K iterations + // warm-up System.out.println("Starting encryption warm-up"); - for (int i=0; i<20000; i++) { + for (int i=0; i