# HG changeset patch # User kvn # Date 1389750408 28800 # Node ID 00f5eff62d1823ee7ce381da3da424e626f112ea # Parent 8cdf3f43f63ef08aeec7fe8d53af5d8fe50d8c6d 8002074: Support for AES on SPARC Summary: Add intrinsics/stub routines support for single-block and multi-block (as used by Cipher Block Chaining mode) AES encryption and decryption operations on the SPARC platform. Reviewed-by: kvn, roland Contributed-by: shrinivas.joshi@oracle.com diff -r 8cdf3f43f63e -r 00f5eff62d18 src/cpu/sparc/vm/assembler_sparc.hpp --- a/src/cpu/sparc/vm/assembler_sparc.hpp Tue Jan 14 14:51:47 2014 +0100 +++ b/src/cpu/sparc/vm/assembler_sparc.hpp Tue Jan 14 17:46:48 2014 -0800 @@ -88,6 +88,7 @@ orncc_op3 = 0x16, xnorcc_op3 = 0x17, addccc_op3 = 0x18, + aes4_op3 = 0x19, umulcc_op3 = 0x1a, smulcc_op3 = 0x1b, subccc_op3 = 0x1c, @@ -121,6 +122,8 @@ fpop1_op3 = 0x34, fpop2_op3 = 0x35, impdep1_op3 = 0x36, + aes3_op3 = 0x36, + flog3_op3 = 0x36, impdep2_op3 = 0x37, jmpl_op3 = 0x38, rett_op3 = 0x39, @@ -172,41 +175,56 @@ enum opfs { // selected opfs - fmovs_opf = 0x01, - fmovd_opf = 0x02, + fmovs_opf = 0x01, + fmovd_opf = 0x02, - fnegs_opf = 0x05, - fnegd_opf = 0x06, + fnegs_opf = 0x05, + fnegd_opf = 0x06, - fadds_opf = 0x41, - faddd_opf = 0x42, - fsubs_opf = 0x45, - fsubd_opf = 0x46, + fadds_opf = 0x41, + faddd_opf = 0x42, + fsubs_opf = 0x45, + fsubd_opf = 0x46, - fmuls_opf = 0x49, - fmuld_opf = 0x4a, - fdivs_opf = 0x4d, - fdivd_opf = 0x4e, + fmuls_opf = 0x49, + fmuld_opf = 0x4a, + fdivs_opf = 0x4d, + fdivd_opf = 0x4e, + + fcmps_opf = 0x51, + fcmpd_opf = 0x52, - fcmps_opf = 0x51, - fcmpd_opf = 0x52, + fstox_opf = 0x81, + fdtox_opf = 0x82, + fxtos_opf = 0x84, + fxtod_opf = 0x88, + fitos_opf = 0xc4, + fdtos_opf = 0xc6, + fitod_opf = 0xc8, + fstod_opf = 0xc9, + fstoi_opf = 0xd1, + fdtoi_opf = 0xd2, - fstox_opf = 0x81, - fdtox_opf = 0x82, - fxtos_opf = 0x84, - fxtod_opf = 0x88, - fitos_opf = 0xc4, - fdtos_opf = 0xc6, - fitod_opf = 0xc8, - fstod_opf = 0xc9, - fstoi_opf = 0xd1, - fdtoi_opf = 0xd2, + 
mdtox_opf = 0x110, + mstouw_opf = 0x111, + mstosw_opf = 0x113, + mxtod_opf = 0x118, + mwtos_opf = 0x119, + + aes_kexpand0_opf = 0x130, + aes_kexpand2_opf = 0x131 + }; - mdtox_opf = 0x110, - mstouw_opf = 0x111, - mstosw_opf = 0x113, - mxtod_opf = 0x118, - mwtos_opf = 0x119 + enum op5s { + aes_eround01_op5 = 0x00, + aes_eround23_op5 = 0x01, + aes_dround01_op5 = 0x02, + aes_dround23_op5 = 0x03, + aes_eround01_l_op5 = 0x04, + aes_eround23_l_op5 = 0x05, + aes_dround01_l_op5 = 0x06, + aes_dround23_l_op5 = 0x07, + aes_kexpand1_op5 = 0x08 }; enum RCondition { rc_z = 1, rc_lez = 2, rc_lz = 3, rc_nz = 5, rc_gz = 6, rc_gez = 7, rc_last = rc_gez }; @@ -427,6 +445,7 @@ static int immed( bool i) { return u_field(i ? 1 : 0, 13, 13); } static int opf_low6( int w) { return u_field(w, 10, 5); } static int opf_low5( int w) { return u_field(w, 9, 5); } + static int op5( int x) { return u_field(x, 8, 5); } static int trapcc( CC cc) { return u_field(cc, 12, 11); } static int sx( int i) { return u_field(i, 12, 12); } // shift x=1 means 64-bit static int opf( int x) { return u_field(x, 13, 5); } @@ -451,6 +470,7 @@ static int fd( FloatRegister r, FloatRegisterImpl::Width fwa) { return u_field(r->encoding(fwa), 29, 25); }; static int fs1(FloatRegister r, FloatRegisterImpl::Width fwa) { return u_field(r->encoding(fwa), 18, 14); }; static int fs2(FloatRegister r, FloatRegisterImpl::Width fwa) { return u_field(r->encoding(fwa), 4, 0); }; + static int fs3(FloatRegister r, FloatRegisterImpl::Width fwa) { return u_field(r->encoding(fwa), 13, 9); }; // some float instructions use this encoding on the op3 field static int alt_op3(int op, FloatRegisterImpl::Width w) { @@ -559,6 +579,12 @@ return x & ((1 << 10) - 1); } + // AES crypto instructions supported only on certain processors + static void aes_only() { assert( VM_Version::has_aes(), "This instruction only works on SPARC with AES instructions support"); } + + // instruction only in VIS1 + static void vis1_only() { assert( 
VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); } + // instruction only in VIS3 static void vis3_only() { assert( VM_Version::has_vis3(), "This instruction only works on SPARC with VIS3"); } @@ -682,6 +708,24 @@ void addccc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(addc_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } + // 4-operand AES instructions + + void aes_eround01( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround01_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_eround23( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround23_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_dround01( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround01_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_dround23( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround23_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_eround01_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround01_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_eround23_l( FloatRegister s1, 
FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround23_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_dround01_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround01_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_dround23_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround23_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_kexpand1( FloatRegister s1, FloatRegister s2, int imm5a, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | u_field(imm5a, 13, 9) | op5(aes_kexpand1_op5) | fs2(s2, FloatRegisterImpl::D) ); } + + + // 3-operand AES instructions + + void aes_kexpand0( FloatRegister s1, FloatRegister s2, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes3_op3) | fs1(s1, FloatRegisterImpl::D) | opf(aes_kexpand0_opf) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_kexpand2( FloatRegister s1, FloatRegister s2, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes3_op3) | fs1(s1, FloatRegisterImpl::D) | opf(aes_kexpand2_opf) | fs2(s2, FloatRegisterImpl::D) ); } + // pp 136 inline void bpr(RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt = relocInfo::none); @@ -784,6 +828,10 @@ void fmul( FloatRegisterImpl::Width sw, FloatRegisterImpl::Width dw, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( 
op(arith_op) | fd(d, dw) | op3(fpop1_op3) | fs1(s1, sw) | opf(0x60 + sw + dw*4) | fs2(s2, sw)); } void fdiv( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | fs1(s1, w) | opf(0x4c + w) | fs2(s2, w)); } + // FXORs/FXORd instructions + + void fxor( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(flog3_op3) | fs1(s1, w) | opf(0x6E - w) | fs2(s2, w)); } + // pp 164 void fsqrt( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x28 + w) | fs2(s, w)); } diff -r 8cdf3f43f63e -r 00f5eff62d18 src/cpu/sparc/vm/sparc.ad --- a/src/cpu/sparc/vm/sparc.ad Tue Jan 14 14:51:47 2014 +0100 +++ b/src/cpu/sparc/vm/sparc.ad Tue Jan 14 17:46:48 2014 -0800 @@ -1848,6 +1848,12 @@ return false; } +// Current (2013) SPARC platforms need to read original key +// to construct decryption expanded key +const bool Matcher::pass_original_key_for_aes() { + return true; +} + // USII supports fxtof through the whole range of number, USIII doesn't const bool Matcher::convL2FSupported(void) { return VM_Version::has_fast_fxtof(); diff -r 8cdf3f43f63e -r 00f5eff62d18 src/cpu/sparc/vm/stubGenerator_sparc.cpp --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Tue Jan 14 14:51:47 2014 +0100 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Tue Jan 14 17:46:48 2014 -0800 @@ -3304,6 +3304,775 @@ } } + address generate_aescrypt_encryptBlock() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aesencryptBlock"); + Label L_doLast128bit, L_storeOutput; + address start = __ pc(); + Register from = O0; // source byte array + Register to = O1; // destination byte array + Register key = O2; // expanded key array + const Register keylen = O4; // reg for storing expanded key array length (in ints) + + // read expanded key length from the array header (key is addressed relative to its first int element) + __ ldsw(Address(key, 
arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); + + // load 16 input bytes into F54-F56 using 4-byte loads; F30-F31 used as temp + __ ldf(FloatRegisterImpl::S, from, 0, F30); + __ ldf(FloatRegisterImpl::S, from, 4, F31); + __ fmov(FloatRegisterImpl::D, F30, F54); + __ ldf(FloatRegisterImpl::S, from, 8, F30); + __ ldf(FloatRegisterImpl::S, from, 12, F31); + __ fmov(FloatRegisterImpl::D, F30, F56); + + // load expanded key ints 0-39 into F0-F38 (this overwrites the F30-F31 temps, which are no longer needed) + for ( int i = 0; i <= 38; i += 2 ) { + __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); + } + + // perform cipher transformation: first xor the input with the first 16 key bytes (F0, F2) + __ fxor(FloatRegisterImpl::D, F0, F54, F54); + __ fxor(FloatRegisterImpl::D, F2, F56, F56); + // rounds 1 through 8, two rounds per iteration, state ping-pongs between F54:F56 and F58:F60 + for ( int i = 4; i <= 28; i += 8 ) { + __ aes_eround01(as_FloatRegister(i), F54, F56, F58); + __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60); + __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54); + __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56); + } + __ aes_eround01(F36, F54, F56, F58); //round 9 + __ aes_eround23(F38, F54, F56, F60); + + // 128-bit original key size (expanded key length 44) + __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit); + + for ( int i = 40; i <= 50; i += 2 ) { + __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) ); + } + __ aes_eround01(F40, F58, F60, F54); //round 10 + __ aes_eround23(F42, F58, F60, F56); + __ aes_eround01(F44, F54, F56, F58); //round 11 + __ aes_eround23(F46, F54, F56, F60); + + // 192-bit original key size (expanded key length 52) + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput); + + __ ldf(FloatRegisterImpl::D, key, 208, F52); + __ aes_eround01(F48, F58, F60, F54); //round 12 + __ aes_eround23(F50, F58, F60, F56); + __ ldf(FloatRegisterImpl::D, key, 216, F46); + __ ldf(FloatRegisterImpl::D, key, 224, F48); + __ ldf(FloatRegisterImpl::D, key, 232, F50); + __ aes_eround01(F52, F54, F56, F58); //round 13 + __ aes_eround23(F46, F54, F56, F60); + __ br(Assembler::always, false, 
Assembler::pt, L_storeOutput); + __ delayed()->nop(); + + __ BIND(L_doLast128bit); + __ ldf(FloatRegisterImpl::D, key, 160, F48); + __ ldf(FloatRegisterImpl::D, key, 168, F50); + + __ BIND(L_storeOutput); + // perform last round of encryption common for all key sizes; on every path F48 and F50 hold the final 16 bytes of the expanded key + __ aes_eround01_l(F48, F58, F60, F54); //last round + __ aes_eround23_l(F50, F58, F60, F56); + + // store output into the destination array with 4-byte stores, F0-F1 used as temp + __ fmov(FloatRegisterImpl::D, F54, F0); + __ stf(FloatRegisterImpl::S, F0, to, 0); + __ stf(FloatRegisterImpl::S, F1, to, 4); + __ fmov(FloatRegisterImpl::D, F56, F0); + __ stf(FloatRegisterImpl::S, F0, to, 8); + __ retl(); + __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); + + return start; + } + + address generate_aescrypt_decryptBlock() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock"); + address start = __ pc(); + Label L_expand192bit, L_expand256bit, L_common_transform; + Register from = O0; // source byte array + Register to = O1; // destination byte array + Register key = O2; // expanded key array + Register original_key = O3; // original key array only required during decryption + const Register keylen = O4; // reg for storing expanded key array length + + // read expanded key array length + __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); + + // load input into F52-F54; F30,F31 used as temp + __ ldf(FloatRegisterImpl::S, from, 0, F30); + __ ldf(FloatRegisterImpl::S, from, 4, F31); + __ fmov(FloatRegisterImpl::D, F30, F52); + __ ldf(FloatRegisterImpl::S, from, 8, F30); + __ ldf(FloatRegisterImpl::S, from, 12, F31); + __ fmov(FloatRegisterImpl::D, F30, F54); + + // load original key from SunJCE expanded decryption key + for ( int i = 0; i <= 3; i++ ) { + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); + } + + // 256-bit original key size + __ cmp_and_brx_short(keylen, 60, Assembler::equal, 
Assembler::pn, L_expand256bit); + + // 192-bit original key size + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); + + // 128-bit original key size + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 36; i += 4 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); + } + + // perform 128-bit key specific inverse cipher transformation + __ fxor(FloatRegisterImpl::D, F42, F54, F54); + __ fxor(FloatRegisterImpl::D, F40, F52, F52); + __ br(Assembler::always, false, Assembler::pt, L_common_transform); + __ delayed()->nop(); + + __ BIND(L_expand192bit); + + // start loading rest of the 192-bit key + __ ldf(FloatRegisterImpl::S, original_key, 16, F4); + __ ldf(FloatRegisterImpl::S, original_key, 20, F5); + + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 36; i += 6 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); + __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10)); + } + __ aes_kexpand1(F42, F46, 7, F48); + __ aes_kexpand2(F44, F48, F50); + + // perform 192-bit key specific inverse cipher transformation + __ fxor(FloatRegisterImpl::D, F50, F54, F54); + __ fxor(FloatRegisterImpl::D, F48, F52, F52); + __ aes_dround23(F46, F52, F54, F58); + __ aes_dround01(F44, F52, F54, F56); + __ aes_dround23(F42, F56, F58, F54); + __ aes_dround01(F40, F56, F58, F52); + __ br(Assembler::always, false, Assembler::pt, L_common_transform); + __ delayed()->nop(); + + __ BIND(L_expand256bit); + + // load rest of the 256-bit key + for ( int i = 4; i <= 7; i++ ) { + __ ldf(FloatRegisterImpl::S, 
original_key, i*4, as_FloatRegister(i)); + } + + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 40; i += 8 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); + __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); + __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); + } + __ aes_kexpand1(F48, F54, 6, F56); + __ aes_kexpand2(F50, F56, F58); + + for ( int i = 0; i <= 6; i += 2 ) { + __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); + } + + // load input into F52-F54 + __ ldf(FloatRegisterImpl::D, from, 0, F52); + __ ldf(FloatRegisterImpl::D, from, 8, F54); + + // perform 256-bit key specific inverse cipher transformation + __ fxor(FloatRegisterImpl::D, F0, F54, F54); + __ fxor(FloatRegisterImpl::D, F2, F52, F52); + __ aes_dround23(F4, F52, F54, F58); + __ aes_dround01(F6, F52, F54, F56); + __ aes_dround23(F50, F56, F58, F54); + __ aes_dround01(F48, F56, F58, F52); + __ aes_dround23(F46, F52, F54, F58); + __ aes_dround01(F44, F52, F54, F56); + __ aes_dround23(F42, F56, F58, F54); + __ aes_dround01(F40, F56, F58, F52); + + for ( int i = 0; i <= 7; i++ ) { + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); + } + + // perform inverse cipher transformations common for all key sizes + __ BIND(L_common_transform); + for ( int i = 38; i >= 6; i -= 8 ) { + __ aes_dround23(as_FloatRegister(i), F52, F54, F58); + __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56); + if ( i != 6) { + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54); + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52); + } else { + __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54); + __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52); + } + } + + // store 
output to destination array, F0-F1 used as temp + __ fmov(FloatRegisterImpl::D, F52, F0); + __ stf(FloatRegisterImpl::S, F0, to, 0); + __ stf(FloatRegisterImpl::S, F1, to, 4); + __ fmov(FloatRegisterImpl::D, F54, F0); + __ stf(FloatRegisterImpl::S, F0, to, 8); + __ retl(); + __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); + + return start; + } + + address generate_cipherBlockChaining_encryptAESCrypt() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); + Label L_cbcenc128, L_cbcenc192, L_cbcenc256; + address start = __ pc(); + Register from = O0; // source byte array + Register to = O1; // destination byte array + Register key = O2; // expanded key array + Register rvec = O3; // init vector + const Register len_reg = O4; // cipher length + const Register keylen = O5; // reg for storing expanded key array length + + // save cipher len to return in the end; kept in volatile global G5 because this stub runs without save_frame, so the L registers still belong to the caller's register window and must not be written + __ mov(len_reg, G5); + + // read expanded key length + __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); + + // load init vector + __ ldf(FloatRegisterImpl::D, rvec, 0, F60); + __ ldf(FloatRegisterImpl::D, rvec, 8, F62); + __ ldx(key,0,G1); + __ ldx(key,8,G2); + + // start loading expanded key + for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) { + __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); + } + + // 128-bit original key size + __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128); + + for ( int i = 40, j = 176; i <= 46; i += 2, j += 8 ) { + __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); + } + + // 192-bit original key size + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192); + + for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) { + __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); + } + + // 256-bit original key size + __ br(Assembler::always, false, Assembler::pt, L_cbcenc256); + __ 
delayed()->nop(); + + __ align(OptoLoopAlignment); + __ BIND(L_cbcenc128); + __ ldx(from,0,G3); + __ ldx(from,8,G4); + __ xor3(G1,G3,G3); + __ xor3(G2,G4,G4); + __ movxtod(G3,F56); + __ movxtod(G4,F58); + __ fxor(FloatRegisterImpl::D, F60, F56, F60); + __ fxor(FloatRegisterImpl::D, F62, F58, F62); + + // TEN_EROUNDS + for ( int i = 0; i <= 32; i += 8 ) { + __ aes_eround01(as_FloatRegister(i), F60, F62, F56); + __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); + if (i != 32 ) { + __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); + } else { + __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); + } + } + + __ stf(FloatRegisterImpl::D, F60, to, 0); + __ stf(FloatRegisterImpl::D, F62, to, 8); + __ add(from, 16, from); + __ add(to, 16, to); + __ subcc(len_reg, 16, len_reg); + __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128); + __ delayed()->nop(); + __ stf(FloatRegisterImpl::D, F60, rvec, 0); + __ stf(FloatRegisterImpl::D, F62, rvec, 8); + __ retl(); + __ delayed()->mov(G5, O0); + + __ align(OptoLoopAlignment); + __ BIND(L_cbcenc192); + __ ldx(from,0,G3); + __ ldx(from,8,G4); + __ xor3(G1,G3,G3); + __ xor3(G2,G4,G4); + __ movxtod(G3,F56); + __ movxtod(G4,F58); + __ fxor(FloatRegisterImpl::D, F60, F56, F60); + __ fxor(FloatRegisterImpl::D, F62, F58, F62); + + // TWELVE_EROUNDS + for ( int i = 0; i <= 40; i += 8 ) { + __ aes_eround01(as_FloatRegister(i), F60, F62, F56); + __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); + if (i != 40 ) { + __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); + } else { + __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); + } + } + + __ stf(FloatRegisterImpl::D, F60, to, 0); + __ stf(FloatRegisterImpl::D, F62, to, 8); + __ add(from, 16, from); + __ subcc(len_reg, 16, 
len_reg); + __ add(to, 16, to); + __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192); + __ delayed()->nop(); + __ stf(FloatRegisterImpl::D, F60, rvec, 0); + __ stf(FloatRegisterImpl::D, F62, rvec, 8); + __ retl(); + __ delayed()->mov(G5, O0); + + __ align(OptoLoopAlignment); + __ BIND(L_cbcenc256); + __ ldx(from,0,G3); + __ ldx(from,8,G4); + __ xor3(G1,G3,G3); + __ xor3(G2,G4,G4); + __ movxtod(G3,F56); + __ movxtod(G4,F58); + __ fxor(FloatRegisterImpl::D, F60, F56, F60); + __ fxor(FloatRegisterImpl::D, F62, F58, F62); + + // FOURTEEN_EROUNDS + for ( int i = 0; i <= 48; i += 8 ) { + __ aes_eround01(as_FloatRegister(i), F60, F62, F56); + __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); + if (i != 48 ) { + __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); + } else { + __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); + } + } + + __ stf(FloatRegisterImpl::D, F60, to, 0); + __ stf(FloatRegisterImpl::D, F62, to, 8); + __ add(from, 16, from); + __ subcc(len_reg, 16, len_reg); + __ add(to, 16, to); + __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256); + __ delayed()->nop(); + __ stf(FloatRegisterImpl::D, F60, rvec, 0); + __ stf(FloatRegisterImpl::D, F62, rvec, 8); + __ retl(); + __ delayed()->mov(G5, O0); + + return start; + } + + address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); + Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start; + Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256; + address start = __ pc(); + Register from = I0; // source byte array + Register to = I1; // destination byte array + Register key = I2; // expanded key array + Register rvec = I3; // init vector + const Register len_reg 
= I4; // cipher length + const Register original_key = I5; // original key array only required during decryption + const Register keylen = L6; // reg for storing expanded key array length + + // save cipher len before save_frame, to return in the end + __ mov(O4, L0); + __ save_frame(0); //args are read from I* registers since we save the frame in the beginning + + // load original key from SunJCE expanded decryption key + for ( int i = 0; i <= 3; i++ ) { + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); + } + + // load initial vector + __ ldx(rvec,0,L0); + __ ldx(rvec,8,L1); + + // read expanded key array length + __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); + + // 256-bit original key size + __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit); + + // 192-bit original key size + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); + + // 128-bit original key size + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 36; i += 4 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); + } + + // load expanded key[last-1] and key[last] elements + __ movdtox(F40,L2); + __ movdtox(F42,L3); + + __ and3(len_reg, 16, L4); + __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128); + __ delayed()->nop(); + + __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); + __ delayed()->nop(); + + __ BIND(L_expand192bit); + // load rest of the 192-bit key + __ ldf(FloatRegisterImpl::S, original_key, 16, F4); + __ ldf(FloatRegisterImpl::S, original_key, 20, F5); + + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 36; 
i += 6 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); + __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10)); + } + __ aes_kexpand1(F42, F46, 7, F48); + __ aes_kexpand2(F44, F48, F50); + + // load expanded key[last-1] and key[last] elements + __ movdtox(F48,L2); + __ movdtox(F50,L3); + + __ and3(len_reg, 16, L4); + __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192); + __ delayed()->nop(); + + __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); + __ delayed()->nop(); + + __ BIND(L_expand256bit); + // load rest of the 256-bit key + for ( int i = 4; i <= 7; i++ ) { + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); + } + + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 40; i += 8 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); + __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); + __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); + } + __ aes_kexpand1(F48, F54, 6, F56); + __ aes_kexpand2(F50, F56, F58); + + // load expanded key[last-1] and key[last] elements + __ movdtox(F56,L2); + __ movdtox(F58,L3); + + __ and3(len_reg, 16, L4); + __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256); + __ delayed()->nop(); + + __ BIND(L_dec_first_block_start); + __ ldx(from,0,L4); + __ ldx(from,8,L5); + __ xor3(L2,L4,G1); + __ movxtod(G1,F60); + __ xor3(L3,L5,G1); + __ movxtod(G1,F62); + + // 128-bit original key size + __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128); + + // 192-bit original key size + __ 
cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192); + + __ aes_dround23(F54, F60, F62, F58); + __ aes_dround01(F52, F60, F62, F56); + __ aes_dround23(F50, F56, F58, F62); + __ aes_dround01(F48, F56, F58, F60); + + __ BIND(L_dec_first_block192); + __ aes_dround23(F46, F60, F62, F58); + __ aes_dround01(F44, F60, F62, F56); + __ aes_dround23(F42, F56, F58, F62); + __ aes_dround01(F40, F56, F58, F60); + + __ BIND(L_dec_first_block128); + for ( int i = 38; i >= 6; i -= 8 ) { + __ aes_dround23(as_FloatRegister(i), F60, F62, F58); + __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); + if ( i != 6) { + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); + } else { + __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); + } + } + + __ movxtod(L0,F56); + __ movxtod(L1,F58); + __ mov(L4,L0); + __ mov(L5,L1); + __ fxor(FloatRegisterImpl::D, F56, F60, F60); + __ fxor(FloatRegisterImpl::D, F58, F62, F62); + + __ stf(FloatRegisterImpl::D, F60, to, 0); + __ stf(FloatRegisterImpl::D, F62, to, 8); + + __ add(from, 16, from); + __ add(to, 16, to); + __ subcc(len_reg, 16, len_reg); + __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end); + __ delayed()->nop(); + + // 256-bit original key size + __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256); + + // 192-bit original key size + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192); + + __ align(OptoLoopAlignment); + __ BIND(L_dec_next2_blocks128); + __ nop(); + + // F40:F42 used for first 16-bytes + __ ldx(from,0,G4); + __ ldx(from,8,G5); + __ xor3(L2,G4,G1); + __ movxtod(G1,F40); + __ xor3(L3,G5,G1); + __ movxtod(G1,F42); + + // F60:F62 used for next 16-bytes + __ ldx(from,16,L4); + __ ldx(from,24,L5); + __ xor3(L2,L4,G1); + __ movxtod(G1,F60); + __ xor3(L3,L5,G1); + __ movxtod(G1,F62); + + for ( 
int i = 38; i >= 6; i -= 8 ) { + __ aes_dround23(as_FloatRegister(i), F40, F42, F44); + __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46); + __ aes_dround23(as_FloatRegister(i), F60, F62, F58); + __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); + if (i != 6 ) { + __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42); + __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40); + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); + } else { + __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42); + __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40); + __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); + } + } + + __ movxtod(L0,F46); + __ movxtod(L1,F44); + __ fxor(FloatRegisterImpl::D, F46, F40, F40); + __ fxor(FloatRegisterImpl::D, F44, F42, F42); + + __ stf(FloatRegisterImpl::D, F40, to, 0); + __ stf(FloatRegisterImpl::D, F42, to, 8); + + __ movxtod(G4,F56); + __ movxtod(G5,F58); + __ mov(L4,L0); + __ mov(L5,L1); + __ fxor(FloatRegisterImpl::D, F56, F60, F60); + __ fxor(FloatRegisterImpl::D, F58, F62, F62); + + __ stf(FloatRegisterImpl::D, F60, to, 16); + __ stf(FloatRegisterImpl::D, F62, to, 24); + + __ add(from, 32, from); + __ add(to, 32, to); + __ subcc(len_reg, 32, len_reg); + __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); + __ delayed()->nop(); + __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); + __ delayed()->nop(); + + __ align(OptoLoopAlignment); + __ BIND(L_dec_next2_blocks192); + __ nop(); + + // F48:F50 used for first 16-bytes + __ ldx(from,0,G4); + __ ldx(from,8,G5); + __ xor3(L2,G4,G1); + __ movxtod(G1,F48); + __ xor3(L3,G5,G1); + __ movxtod(G1,F50); + + // F60:F62 used for next 16-bytes + __ ldx(from,16,L4); + __ ldx(from,24,L5); + __ xor3(L2,L4,G1); + __ movxtod(G1,F60); + __ xor3(L3,L5,G1); + __ movxtod(G1,F62); + + for ( int i = 46; i >= 6; i -= 8 ) { + __ 
aes_dround23(as_FloatRegister(i), F48, F50, F52); + __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54); + __ aes_dround23(as_FloatRegister(i), F60, F62, F58); + __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); + if (i != 6 ) { + __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50); + __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48); + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); + } else { + __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50); + __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48); + __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); + } + } + + __ movxtod(L0,F54); + __ movxtod(L1,F52); + __ fxor(FloatRegisterImpl::D, F54, F48, F48); + __ fxor(FloatRegisterImpl::D, F52, F50, F50); + + __ stf(FloatRegisterImpl::D, F48, to, 0); + __ stf(FloatRegisterImpl::D, F50, to, 8); + + __ movxtod(G4,F56); + __ movxtod(G5,F58); + __ mov(L4,L0); + __ mov(L5,L1); + __ fxor(FloatRegisterImpl::D, F56, F60, F60); + __ fxor(FloatRegisterImpl::D, F58, F62, F62); + + __ stf(FloatRegisterImpl::D, F60, to, 16); + __ stf(FloatRegisterImpl::D, F62, to, 24); + + __ add(from, 32, from); + __ add(to, 32, to); + __ subcc(len_reg, 32, len_reg); + __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); + __ delayed()->nop(); + __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); + __ delayed()->nop(); + + __ align(OptoLoopAlignment); + __ BIND(L_dec_next2_blocks256); + __ nop(); + + // F0:F2 used for first 16-bytes + __ ldx(from,0,G4); + __ ldx(from,8,G5); + __ xor3(L2,G4,G1); + __ movxtod(G1,F0); + __ xor3(L3,G5,G1); + __ movxtod(G1,F2); + + // F60:F62 used for next 16-bytes + __ ldx(from,16,L4); + __ ldx(from,24,L5); + __ xor3(L2,L4,G1); + __ movxtod(G1,F60); + __ xor3(L3,L5,G1); + __ movxtod(G1,F62); + + __ aes_dround23(F54, F0, F2, F4); + __ aes_dround01(F52, F0, F2, F6); + __ aes_dround23(F54, F60, 
F62, F58); + __ aes_dround01(F52, F60, F62, F56); + __ aes_dround23(F50, F6, F4, F2); + __ aes_dround01(F48, F6, F4, F0); + __ aes_dround23(F50, F56, F58, F62); + __ aes_dround01(F48, F56, F58, F60); + // save F48:F54 in temp registers + __ movdtox(F54,G2); + __ movdtox(F52,G3); + __ movdtox(F50,G6); + __ movdtox(F48,G1); + for ( int i = 46; i >= 14; i -= 8 ) { + __ aes_dround23(as_FloatRegister(i), F0, F2, F4); + __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6); + __ aes_dround23(as_FloatRegister(i), F60, F62, F58); + __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); + __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2); + __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0); + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); + } + // init F48:F54 with F0:F6 values (original key) + __ ldf(FloatRegisterImpl::D, original_key, 0, F48); + __ ldf(FloatRegisterImpl::D, original_key, 8, F50); + __ ldf(FloatRegisterImpl::D, original_key, 16, F52); + __ ldf(FloatRegisterImpl::D, original_key, 24, F54); + __ aes_dround23(F54, F0, F2, F4); + __ aes_dround01(F52, F0, F2, F6); + __ aes_dround23(F54, F60, F62, F58); + __ aes_dround01(F52, F60, F62, F56); + __ aes_dround23_l(F50, F6, F4, F2); + __ aes_dround01_l(F48, F6, F4, F0); + __ aes_dround23_l(F50, F56, F58, F62); + __ aes_dround01_l(F48, F56, F58, F60); + // re-init F48:F54 with their original values + __ movxtod(G2,F54); + __ movxtod(G3,F52); + __ movxtod(G6,F50); + __ movxtod(G1,F48); + + __ movxtod(L0,F6); + __ movxtod(L1,F4); + __ fxor(FloatRegisterImpl::D, F6, F0, F0); + __ fxor(FloatRegisterImpl::D, F4, F2, F2); + + __ stf(FloatRegisterImpl::D, F0, to, 0); + __ stf(FloatRegisterImpl::D, F2, to, 8); + + __ movxtod(G4,F56); + __ movxtod(G5,F58); + __ mov(L4,L0); + __ mov(L5,L1); + __ fxor(FloatRegisterImpl::D, F56, F60, F60); + __ fxor(FloatRegisterImpl::D, F58, F62, F62); + + __ stf(FloatRegisterImpl::D, F60, to, 16); + __ stf(FloatRegisterImpl::D, 
F62, to, 24); + + __ add(from, 32, from); + __ add(to, 32, to); + __ subcc(len_reg, 32, len_reg); + __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256); + __ delayed()->nop(); + + __ BIND(L_cbcdec_end); + __ stx(L0, rvec, 0); + __ stx(L1, rvec, 8); + __ restore(); + __ mov(L0, O0); + __ retl(); + __ delayed()->nop(); + + return start; + } + void generate_initial() { // Generates all stubs and initializes the entry points @@ -3368,6 +4137,14 @@ generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, &StubRoutines::_safefetchN_fault_pc, &StubRoutines::_safefetchN_continuation_pc); + + // generate AES intrinsics code + if (UseAESIntrinsics) { + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); + } } diff -r 8cdf3f43f63e -r 00f5eff62d18 src/cpu/sparc/vm/vm_version_sparc.cpp --- a/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Jan 14 14:51:47 2014 +0100 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Jan 14 17:46:48 2014 -0800 @@ -234,7 +234,7 @@ assert((OptoLoopAlignment % relocInfo::addr_unit()) == 0, "alignment is not a multiple of NOP size"); char buf[512]; - jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", (has_v9() ? ", v9" : (has_v8() ? ", v8" : "")), (has_hardware_popc() ? ", popc" : ""), (has_vis1() ? ", vis1" : ""), @@ -242,6 +242,7 @@ (has_vis3() ? ", vis3" : ""), (has_blk_init() ? ", blk_init" : ""), (has_cbcond() ? ", cbcond" : ""), + (has_aes() ? ", aes" : ""), (is_ultra3() ? ", ultra3" : ""), (is_sun4v() ? ", sun4v" : ""), (is_niagara_plus() ? ", niagara_plus" : (is_niagara() ? 
", niagara" : "")), @@ -265,6 +266,41 @@ if (!has_vis1()) // Drop to 0 if no VIS1 support UseVIS = 0; + // T2 and above should have support for AES instructions + if (has_aes()) { + if (UseVIS > 0) { // AES intrinsics use FXOR instruction which is VIS1 + if (FLAG_IS_DEFAULT(UseAES)) { + FLAG_SET_DEFAULT(UseAES, true); + } + if (FLAG_IS_DEFAULT(UseAESIntrinsics)) { + FLAG_SET_DEFAULT(UseAESIntrinsics, true); + } + // we disable both the AES flags if either of them is disabled on the command line + if (!UseAES || !UseAESIntrinsics) { + FLAG_SET_DEFAULT(UseAES, false); + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + } else { + if (UseAES || UseAESIntrinsics) { + warning("SPARC AES intrinsics require VIS1 instruction support. Intrinsics will be disabled."); + if (UseAES) { + FLAG_SET_DEFAULT(UseAES, false); + } + if (UseAESIntrinsics) { + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + } + } + } else if (UseAES || UseAESIntrinsics) { + warning("AES instructions are not available on this CPU"); + if (UseAES) { + FLAG_SET_DEFAULT(UseAES, false); + } + if (UseAESIntrinsics) { + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + } + if (FLAG_IS_DEFAULT(ContendedPaddingWidth) && (cache_line_size > ContendedPaddingWidth)) ContendedPaddingWidth = cache_line_size; diff -r 8cdf3f43f63e -r 00f5eff62d18 src/cpu/sparc/vm/vm_version_sparc.hpp --- a/src/cpu/sparc/vm/vm_version_sparc.hpp Tue Jan 14 14:51:47 2014 +0100 +++ b/src/cpu/sparc/vm/vm_version_sparc.hpp Tue Jan 14 17:46:48 2014 -0800 @@ -48,7 +48,8 @@ sparc64_family = 14, M_family = 15, T_family = 16, - T1_model = 17 + T1_model = 17, + aes_instructions = 18 }; enum Feature_Flag_Set { @@ -73,6 +74,7 @@ M_family_m = 1 << M_family, T_family_m = 1 << T_family, T1_model_m = 1 << T1_model, + aes_instructions_m = 1 << aes_instructions, generic_v8_m = v8_instructions_m | hardware_mul32_m | hardware_div32_m | hardware_fsmuld_m, generic_v9_m = generic_v8_m | v9_instructions_m, @@ -123,6 +125,7 @@ static bool has_vis3() { return 
(_features & vis3_instructions_m) != 0; } static bool has_blk_init() { return (_features & blk_init_instructions_m) != 0; } static bool has_cbcond() { return (_features & cbcond_instructions_m) != 0; } + static bool has_aes() { return (_features & aes_instructions_m) != 0; } static bool supports_compare_and_exchange() { return has_v9(); } diff -r 8cdf3f43f63e -r 00f5eff62d18 src/cpu/x86/vm/stubGenerator_x86_32.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Jan 14 14:51:47 2014 +0100 +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Jan 14 17:46:48 2014 -0800 @@ -2403,6 +2403,9 @@ // c_rarg3 - r vector byte array address // c_rarg4 - input length // + // Output: + // rax - input length + // address generate_cipherBlockChaining_encryptAESCrypt() { assert(UseAES, "need AES instructions and misaligned SSE support"); __ align(CodeEntryAlignment); @@ -2483,7 +2486,7 @@ __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object handleSOERegisters(false /*restoring*/); - __ movl(rax, 0); // return 0 (why?) + __ movptr(rax, len_param); // return length __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -2557,6 +2560,9 @@ // c_rarg3 - r vector byte array address // c_rarg4 - input length // + // Output: + // rax - input length + // address generate_cipherBlockChaining_decryptAESCrypt() { assert(UseAES, "need AES instructions and misaligned SSE support"); @@ -2650,7 +2656,7 @@ __ movptr(rvec , rvec_param); // restore this since used in loop __ movdqu(Address(rvec, 0), xmm_temp); // final value of r stored in rvec of CipherBlockChaining object handleSOERegisters(false /*restoring*/); - __ movl(rax, 0); // return 0 (why?) 
+ __ movptr(rax, len_param); // return length __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); diff -r 8cdf3f43f63e -r 00f5eff62d18 src/cpu/x86/vm/stubGenerator_x86_64.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Jan 14 14:51:47 2014 +0100 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Jan 14 17:46:48 2014 -0800 @@ -3217,6 +3217,9 @@ // c_rarg3 - r vector byte array address // c_rarg4 - input length // + // Output: + // rax - input length + // address generate_cipherBlockChaining_encryptAESCrypt() { assert(UseAES, "need AES instructions and misaligned SSE support"); __ align(CodeEntryAlignment); @@ -3232,7 +3235,7 @@ #ifndef _WIN64 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) #else - const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 + const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 const Register len_reg = r10; // pick the first volatile windows register #endif const Register pos = rax; @@ -3259,6 +3262,8 @@ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { __ movdqu(xmm_save(i), as_XMMRegister(i)); } +#else + __ push(len_reg); // Save #endif const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front @@ -3301,8 +3306,10 @@ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { __ movdqu(as_XMMRegister(i), xmm_save(i)); } + __ movl(rax, len_mem); +#else + __ pop(rax); // return length #endif - __ movl(rax, 0); // return 0 (why?) 
__ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -3409,6 +3416,9 @@ // c_rarg3 - r vector byte array address // c_rarg4 - input length // + // Output: + // rax - input length + // address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { assert(UseAES, "need AES instructions and misaligned SSE support"); @@ -3427,7 +3437,7 @@ #ifndef _WIN64 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) #else - const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 + const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 const Register len_reg = r10; // pick the first volatile windows register #endif const Register pos = rax; @@ -3448,7 +3458,10 @@ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { __ movdqu(xmm_save(i), as_XMMRegister(i)); } +#else + __ push(len_reg); // Save #endif + // the java expanded key ordering is rotated one position from what we want // so we start from 0x10 here and hit 0x00 last const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front @@ -3554,8 +3567,10 @@ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { __ movdqu(as_XMMRegister(i), xmm_save(i)); } + __ movl(rax, len_mem); +#else + __ pop(rax); // return length #endif - __ movl(rax, 0); // return 0 (why?) __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); diff -r 8cdf3f43f63e -r 00f5eff62d18 src/cpu/x86/vm/x86.ad --- a/src/cpu/x86/vm/x86.ad Tue Jan 14 14:51:47 2014 +0100 +++ b/src/cpu/x86/vm/x86.ad Tue Jan 14 17:46:48 2014 -0800 @@ -581,6 +581,12 @@ return !AlignVector; // can be changed by flag } +// x86 AES instructions are compatible with SunJCE expanded +// keys, hence we do not need to pass the original key to stubs +const bool Matcher::pass_original_key_for_aes() { + return false; +} + // Helper methods for MachSpillCopyNode::implementation(). 
static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo, int src_hi, int dst_hi, uint ireg, outputStream* st) { diff -r 8cdf3f43f63e -r 00f5eff62d18 src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp --- a/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp Tue Jan 14 14:51:47 2014 +0100 +++ b/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp Tue Jan 14 17:46:48 2014 -0800 @@ -119,6 +119,11 @@ #endif if (av & AV_SPARC_CBCOND) features |= cbcond_instructions_m; +#ifndef AV_SPARC_AES +#define AV_SPARC_AES 0x00020000 /* aes instrs supported */ +#endif + if (av & AV_SPARC_AES) features |= aes_instructions_m; + } else { // getisax(2) failed, use the old legacy code. #ifndef PRODUCT diff -r 8cdf3f43f63e -r 00f5eff62d18 src/share/vm/classfile/vmSymbols.hpp --- a/src/share/vm/classfile/vmSymbols.hpp Tue Jan 14 14:51:47 2014 +0100 +++ b/src/share/vm/classfile/vmSymbols.hpp Tue Jan 14 17:46:48 2014 -0800 @@ -787,7 +787,7 @@ do_intrinsic(_cipherBlockChaining_decryptAESCrypt, com_sun_crypto_provider_cipherBlockChaining, decrypt_name, byteArray_int_int_byteArray_int_signature, F_R) \ do_name( encrypt_name, "encrypt") \ do_name( decrypt_name, "decrypt") \ - do_signature(byteArray_int_int_byteArray_int_signature, "([BII[BI)V") \ + do_signature(byteArray_int_int_byteArray_int_signature, "([BII[BI)I") \ \ /* support for java.util.zip */ \ do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \ diff -r 8cdf3f43f63e -r 00f5eff62d18 src/share/vm/opto/library_call.cpp --- a/src/share/vm/opto/library_call.cpp Tue Jan 14 14:51:47 2014 +0100 +++ b/src/share/vm/opto/library_call.cpp Tue Jan 14 17:46:48 2014 -0800 @@ -304,6 +304,7 @@ bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id); Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting); Node* get_key_start_from_aescrypt_object(Node* aescrypt_object); + Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object); bool inline_encodeISOArray(); bool 
inline_updateCRC32(); bool inline_updateBytesCRC32(); @@ -5936,10 +5937,22 @@ Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object); if (k_start == NULL) return false; - // Call the stub. - make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::aescrypt_block_Type(), - stubAddr, stubName, TypePtr::BOTTOM, - src_start, dest_start, k_start); + if (Matcher::pass_original_key_for_aes()) { + // on SPARC we need to pass the original key since key expansion needs to happen in intrinsics due to + // compatibility issues between Java key expansion and SPARC crypto instructions + Node* original_k_start = get_original_key_start_from_aescrypt_object(aescrypt_object); + if (original_k_start == NULL) return false; + + // Call the stub. + make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::aescrypt_block_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start, original_k_start); + } else { + // Call the stub. + make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::aescrypt_block_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start); + } return true; } @@ -6017,14 +6030,29 @@ if (objRvec == NULL) return false; Node* r_start = array_element_address(objRvec, intcon(0), T_BYTE); - // Call the stub, passing src_start, dest_start, k_start, r_start and src_len - make_runtime_call(RC_LEAF|RC_NO_FP, - OptoRuntime::cipherBlockChaining_aescrypt_Type(), - stubAddr, stubName, TypePtr::BOTTOM, - src_start, dest_start, k_start, r_start, len); - - // return is void so no result needs to be pushed - + Node* cbcCrypt; + if (Matcher::pass_original_key_for_aes()) { + // on SPARC we need to pass the original key since key expansion needs to happen in intrinsics due to + // compatibility issues between Java key expansion and SPARC crypto instructions + Node* original_k_start = get_original_key_start_from_aescrypt_object(aescrypt_object); + if (original_k_start == NULL) return false; + + // Call the stub, passing src_start, dest_start, k_start, 
r_start, src_len and original_k_start + cbcCrypt = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::cipherBlockChaining_aescrypt_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start, r_start, len, original_k_start); + } else { + // Call the stub, passing src_start, dest_start, k_start, r_start and src_len + cbcCrypt = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::cipherBlockChaining_aescrypt_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start, r_start, len); + } + + // return cipher length (int) + Node* retvalue = _gvn.transform(new (C) ProjNode(cbcCrypt, TypeFunc::Parms)); + set_result(retvalue); return true; } @@ -6039,6 +6067,17 @@ return k_start; } +//------------------------------get_original_key_start_from_aescrypt_object----------------------- +Node * LibraryCallKit::get_original_key_start_from_aescrypt_object(Node *aescrypt_object) { + Node* objAESCryptKey = load_field_from_object(aescrypt_object, "lastKey", "[B", /*is_exact*/ false); + assert (objAESCryptKey != NULL, "wrong version of com.sun.crypto.provider.AESCrypt"); + if (objAESCryptKey == NULL) return (Node *) NULL; + + // now have the array, need to get the start address of the lastKey array + Node* original_k_start = array_element_address(objAESCryptKey, intcon(0), T_BYTE); + return original_k_start; +} + //----------------------------inline_cipherBlockChaining_AESCrypt_predicate---------------------------- // Return node representing slow path of predicate check. // the pseudo code we want to emulate with this predicate is: diff -r 8cdf3f43f63e -r 00f5eff62d18 src/share/vm/opto/matcher.hpp --- a/src/share/vm/opto/matcher.hpp Tue Jan 14 14:51:47 2014 +0100 +++ b/src/share/vm/opto/matcher.hpp Tue Jan 14 17:46:48 2014 -0800 @@ -286,6 +286,9 @@ // CPU supports misaligned vectors store/load. 
static const bool misaligned_vectors_ok(); + // Should original key array reference be passed to AES stubs + static const bool pass_original_key_for_aes(); + // Used to determine a "low complexity" 64-bit constant. (Zero is simple.) // The standard of comparison is one (StoreL ConL) vs. two (StoreI ConI). // Depends on the details of 64-bit constant generation on the CPU. diff -r 8cdf3f43f63e -r 00f5eff62d18 src/share/vm/opto/runtime.cpp --- a/src/share/vm/opto/runtime.cpp Tue Jan 14 14:51:47 2014 +0100 +++ b/src/share/vm/opto/runtime.cpp Tue Jan 14 17:46:48 2014 -0800 @@ -814,12 +814,18 @@ const TypeFunc* OptoRuntime::aescrypt_block_Type() { // create input type (domain) int num_args = 3; + if (Matcher::pass_original_key_for_aes()) { + num_args = 4; + } int argcnt = num_args; const Type** fields = TypeTuple::fields(argcnt); int argp = TypeFunc::Parms; fields[argp++] = TypePtr::NOTNULL; // src fields[argp++] = TypePtr::NOTNULL; // dest fields[argp++] = TypePtr::NOTNULL; // k array + if (Matcher::pass_original_key_for_aes()) { + fields[argp++] = TypePtr::NOTNULL; // original k array + } assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); @@ -856,6 +862,9 @@ const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { // create input type (domain) int num_args = 5; + if (Matcher::pass_original_key_for_aes()) { + num_args = 6; + } int argcnt = num_args; const Type** fields = TypeTuple::fields(argcnt); int argp = TypeFunc::Parms; @@ -864,13 +873,16 @@ fields[argp++] = TypePtr::NOTNULL; // k array fields[argp++] = TypePtr::NOTNULL; // r array fields[argp++] = TypeInt::INT; // src len + if (Matcher::pass_original_key_for_aes()) { + fields[argp++] = TypePtr::NOTNULL; // original k array + } assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); - // no result type needed + // returning cipher len (int) 
fields = TypeTuple::fields(1); - fields[TypeFunc::Parms+0] = NULL; // void - const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + fields[TypeFunc::Parms+0] = TypeInt::INT; + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields); return TypeFunc::make(domain, range); } diff -r 8cdf3f43f63e -r 00f5eff62d18 test/compiler/7184394/TestAESMain.java --- a/test/compiler/7184394/TestAESMain.java Tue Jan 14 14:51:47 2014 +0100 +++ b/test/compiler/7184394/TestAESMain.java Tue Jan 14 17:46:48 2014 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -39,20 +39,32 @@ System.out.println(iters + " iterations"); TestAESEncode etest = new TestAESEncode(); etest.prepare(); + // warm-up for 20K iterations + System.out.println("Starting encryption warm-up"); + for (int i=0; i<20000; i++) { + etest.run(); + } + System.out.println("Finished encryption warm-up"); long start = System.nanoTime(); for (int i=0; i