Mercurial > hg > truffle
diff src/cpu/x86/vm/stubGenerator_x86_64.cpp @ 7482:989155e2d07a
Merge with hs25-b15.
author | Thomas Wuerthinger <thomas.wuerthinger@oracle.com> |
---|---|
date | Wed, 16 Jan 2013 01:34:24 +0100 |
parents | 291ffc492eb6 e2e6bf86682c |
children | b9a918201d47 |
line wrap: on
line diff
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Jan 15 18:54:02 2013 +0100 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Jan 16 01:34:24 2013 +0100 @@ -1286,23 +1286,54 @@ // end_to - destination array end address // qword_count - 64-bits element count, negative // to - scratch - // L_copy_32_bytes - entry label + // L_copy_bytes - entry label // L_copy_8_bytes - exit label // - void copy_32_bytes_forward(Register end_from, Register end_to, + void copy_bytes_forward(Register end_from, Register end_to, Register qword_count, Register to, - Label& L_copy_32_bytes, Label& L_copy_8_bytes) { + Label& L_copy_bytes, Label& L_copy_8_bytes) { DEBUG_ONLY(__ stop("enter at entry label, not here")); Label L_loop; __ align(OptoLoopAlignment); - __ BIND(L_loop); - if(UseUnalignedLoadStores) { - __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); - __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); - __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8)); - __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1); - + if (UseUnalignedLoadStores) { + Label L_end; + // Copy 64-bytes per iteration + __ BIND(L_loop); + if (UseAVX >= 2) { + __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); + __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); + __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24)); + __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1); + } else { + __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); + __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1); + __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2); + __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8)); + __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3); + } + __ BIND(L_copy_bytes); + __ addptr(qword_count, 8); + __ jcc(Assembler::lessEqual, L_loop); + __ subptr(qword_count, 4); // sub(8) and add(4) + __ jccb(Assembler::greater, L_end); + // Copy trailing 32 bytes + if (UseAVX >= 2) { + __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); + __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); + } else { + __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); + __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8)); + __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1); + } + __ addptr(qword_count, 4); + __ BIND(L_end); } else { + // Copy 32-bytes per iteration + __ BIND(L_loop); __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); __ movq(Address(end_to, qword_count, Address::times_8, -24), to); __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); @@ -1311,15 +1342,15 @@ __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); + + __ BIND(L_copy_bytes); + __ addptr(qword_count, 4); + __ jcc(Assembler::lessEqual, L_loop); } - __ BIND(L_copy_32_bytes); - __ addptr(qword_count, 4); - __ jcc(Assembler::lessEqual, L_loop); __ subptr(qword_count, 4); __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords } - // Copy big chunks backward // // Inputs: @@ -1327,23 +1358,55 @@ // dest - destination array address // qword_count - 64-bits element count // to - scratch - // L_copy_32_bytes - entry label + // L_copy_bytes - entry label // L_copy_8_bytes - exit label // - void copy_32_bytes_backward(Register from, Register dest, + void copy_bytes_backward(Register from, Register dest, Register qword_count, Register to, - Label& L_copy_32_bytes, Label& L_copy_8_bytes) { + Label& L_copy_bytes, Label& L_copy_8_bytes) { DEBUG_ONLY(__ stop("enter at entry label, not here")); Label L_loop; __ align(OptoLoopAlignment); - __ BIND(L_loop); - if(UseUnalignedLoadStores) { - __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); - __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0); - __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); - __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); - + if (UseUnalignedLoadStores) { + Label L_end; + // Copy 64-bytes per iteration + __ BIND(L_loop); + if (UseAVX >= 2) { + __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); + __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); + __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); + __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); + } else { + __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48)); + __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0); + __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32)); + __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1); + __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16)); + __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2); + __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0)); + __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3); + } + __ BIND(L_copy_bytes); + __ subptr(qword_count, 8); + __ jcc(Assembler::greaterEqual, L_loop); + + __ addptr(qword_count, 4); // add(8) and sub(4) + __ jccb(Assembler::less, L_end); + // Copy trailing 32 bytes + if (UseAVX >= 2) { + __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0)); + __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0); + } else { + __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); + __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0); + __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); + __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); + } + __ subptr(qword_count, 4); + __ BIND(L_end); } else { + // Copy 32-bytes per iteration + __ BIND(L_loop); __ movq(to, Address(from, qword_count, Address::times_8, 24)); __ movq(Address(dest, qword_count, Address::times_8, 24), to); __ movq(to, Address(from, qword_count, Address::times_8, 16)); @@ -1352,10 +1415,11 @@ __ movq(Address(dest, qword_count, Address::times_8, 8), to); __ movq(to, Address(from, qword_count, Address::times_8, 0)); __ movq(Address(dest, qword_count, Address::times_8, 0), to); + + __ BIND(L_copy_bytes); + __ subptr(qword_count, 4); + __ jcc(Assembler::greaterEqual, L_loop); } - __ BIND(L_copy_32_bytes); - __ subptr(qword_count, 4); - __ jcc(Assembler::greaterEqual, L_loop); __ addptr(qword_count, 4); __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords } @@ -1385,7 +1449,7 @@ StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); - Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; + Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; Label L_copy_byte, L_exit; const Register from = rdi; // source array address const Register to = rsi; // destination array address @@ -1417,7 +1481,7 @@ __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); __ negptr(qword_count); // make the count negative - __ jmp(L_copy_32_bytes); + __ jmp(L_copy_bytes); // Copy trailing qwords __ BIND(L_copy_8_bytes); @@ -1460,8 +1524,8 @@ __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); - // Copy in 32-bytes chunks - copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); + // Copy in multi-bytes chunks + copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); __ jmp(L_copy_4_bytes); return start; @@ -1488,7 +1552,7 @@ StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); - Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; + Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; const Register from = rdi; // source array address const Register to = rsi; // destination array address const Register count = rdx; // elements count @@ -1531,10 +1595,10 @@ // Check for and copy trailing dword __ BIND(L_copy_4_bytes); __ testl(byte_count, 4); - __ jcc(Assembler::zero, L_copy_32_bytes); + __ jcc(Assembler::zero, L_copy_bytes); __ movl(rax, Address(from, qword_count, Address::times_8)); __ movl(Address(to, qword_count, Address::times_8), rax); - __ jmp(L_copy_32_bytes); + __ jmp(L_copy_bytes); // Copy trailing qwords __ BIND(L_copy_8_bytes); @@ -1549,8 +1613,8 @@ __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); - // Copy in 32-bytes chunks - copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); + // Copy in multi-bytes chunks + copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); restore_arg_regs(); inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free @@ -1585,7 +1649,7 @@ StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); - Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit; + Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit; const Register from = rdi; // source array address const Register to = rsi; // destination array address const Register count = rdx; // elements count @@ -1616,7 +1680,7 @@ __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); __ negptr(qword_count); - __ jmp(L_copy_32_bytes); + __ jmp(L_copy_bytes); // Copy trailing qwords __ BIND(L_copy_8_bytes); @@ -1652,8 +1716,8 @@ __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); - // Copy in 32-bytes chunks - copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); + // Copy in multi-bytes chunks + copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); __ jmp(L_copy_4_bytes); return start; @@ -1700,7 +1764,7 @@ StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); - Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes; + Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes; const Register from = rdi; // source array address const Register to = rsi; // destination array address const Register count = rdx; // elements count @@ -1735,10 +1799,10 @@ // Check for and copy trailing dword __ BIND(L_copy_4_bytes); __ testl(word_count, 2); - __ jcc(Assembler::zero, L_copy_32_bytes); + __ jcc(Assembler::zero, L_copy_bytes); __ movl(rax, Address(from, qword_count, Address::times_8)); __ movl(Address(to, qword_count, Address::times_8), rax); - __ jmp(L_copy_32_bytes); + __ jmp(L_copy_bytes); // Copy trailing qwords __ BIND(L_copy_8_bytes); @@ -1753,8 +1817,8 @@ __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); - // Copy in 32-bytes chunks - copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); + // Copy in multi-bytes chunks + copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); restore_arg_regs(); inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free @@ -1790,7 +1854,7 @@ StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); - Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit; + Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit; const Register from = rdi; // source array address const Register to = rsi; // destination array address const Register count = rdx; // elements count @@ -1826,7 +1890,7 @@ __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); __ negptr(qword_count); - __ jmp(L_copy_32_bytes); + __ jmp(L_copy_bytes); // Copy trailing qwords __ BIND(L_copy_8_bytes); @@ -1853,8 +1917,8 @@ __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); - // Copy 32-bytes chunks - copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); + // Copy in multi-bytes chunks + copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); __ jmp(L_copy_4_bytes); return start; @@ -1882,7 +1946,7 @@ StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); - Label L_copy_32_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit; + Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit; const Register from = rdi; // source array address const Register to = rsi; // destination array address const Register count = rdx; // elements count @@ -1916,10 +1980,10 @@ // Check for and copy trailing dword __ testl(dword_count, 1); - __ jcc(Assembler::zero, L_copy_32_bytes); + __ jcc(Assembler::zero, L_copy_bytes); __ movl(rax, Address(from, dword_count, Address::times_4, -4)); __ movl(Address(to, dword_count, Address::times_4, -4), rax); - __ jmp(L_copy_32_bytes); + __ jmp(L_copy_bytes); // Copy trailing qwords __ BIND(L_copy_8_bytes); @@ -1937,8 +2001,8 @@ __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); - // Copy in 32-bytes chunks - copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); + // Copy in multi-bytes chunks + copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); __ bind(L_exit); if (is_oop) { @@ -1976,7 +2040,7 @@ StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); - Label L_copy_32_bytes, L_copy_8_bytes, L_exit; + Label L_copy_bytes, L_copy_8_bytes, L_exit; const Register from = rdi; // source array address const Register to = rsi; // destination array address const Register qword_count = rdx; // elements count @@ -2008,7 +2072,7 @@ __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); __ negptr(qword_count); - __ jmp(L_copy_32_bytes); + __ jmp(L_copy_bytes); // Copy trailing qwords __ BIND(L_copy_8_bytes); @@ -2027,8 +2091,8 @@ __ ret(0); } - // Copy 64-byte chunks - copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); + // Copy in multi-bytes chunks + copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); if (is_oop) { __ BIND(L_exit); @@ -2065,7 +2129,7 @@ StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); - Label L_copy_32_bytes, L_copy_8_bytes, L_exit; + Label L_copy_bytes, L_copy_8_bytes, L_exit; const Register from = rdi; // source array address const Register to = rsi; // destination array address const Register qword_count = rdx; // elements count @@ -2091,7 +2155,7 @@ gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized); } - __ jmp(L_copy_32_bytes); + __ jmp(L_copy_bytes); // Copy trailing qwords __ BIND(L_copy_8_bytes); @@ -2110,8 +2174,8 @@ __ ret(0); } - // Copy in 32-bytes chunks - copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes); + // Copy in multi-bytes chunks + copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); if (is_oop) { __ BIND(L_exit); @@ -2953,21 +3017,6 @@ } } - // aesenc using specified key+offset - // can optionally specify that the shuffle mask is already in an xmmregister - void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { - load_key(xmmtmp, key, offset, xmm_shuf_mask); - __ aesenc(xmmdst, xmmtmp); - } - - // aesdec using specified key+offset - // can optionally specify that the shuffle mask is already in an xmmregister - void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { - load_key(xmmtmp, key, offset, xmm_shuf_mask); - __ aesdec(xmmdst, xmmtmp); - } - - // Arguments: // // Inputs: @@ -2976,7 +3025,7 @@ // c_rarg2 - K (key) in little endian int array // address generate_aescrypt_encryptBlock() { - assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + assert(UseAES, "need AES instructions and misaligned SSE support"); __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); Label L_doLast; @@ -2988,15 +3037,17 @@ const Register keylen = rax; const XMMRegister xmm_result = xmm0; - const XMMRegister xmm_temp = xmm1; - const XMMRegister xmm_key_shuf_mask = xmm2; + const XMMRegister xmm_key_shuf_mask = xmm1; + // On win64 xmm6-xmm15 must be preserved so don't use them. + const XMMRegister xmm_temp1 = xmm2; + const XMMRegister xmm_temp2 = xmm3; + const XMMRegister xmm_temp3 = xmm4; + const XMMRegister xmm_temp4 = xmm5; __ enter(); // required for proper stackwalking of RuntimeStub frame + // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); - // keylen = # of 32-bit words, convert to 128-bit words - __ shrl(keylen, 2); - __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input @@ -3004,25 +3055,53 @@ // For encryption, the java expanded key ordering is just what we need // we don't know if the key is aligned, hence not using load-execute form - load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); - __ pxor(xmm_result, xmm_temp); - for (int offset = 0x10; offset <= 0x90; offset += 0x10) { - aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); - } - load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask); - __ cmpl(keylen, 0); - __ jcc(Assembler::equal, L_doLast); - __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys - aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); - load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask); - __ subl(keylen, 2); - __ jcc(Assembler::equal, L_doLast); - __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys - aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); - load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask); + load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask); + __ pxor(xmm_result, xmm_temp1); + + load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); + load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); + load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); + load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); + + __ aesenc(xmm_result, xmm_temp1); + __ aesenc(xmm_result, xmm_temp2); + __ aesenc(xmm_result, xmm_temp3); + __ aesenc(xmm_result, xmm_temp4); + + load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); + load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); + load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); + load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); + + __ aesenc(xmm_result, xmm_temp1); + __ aesenc(xmm_result, xmm_temp2); + __ aesenc(xmm_result, xmm_temp3); + __ aesenc(xmm_result, xmm_temp4); + + load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); + load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); + + __ cmpl(keylen, 44); + __ jccb(Assembler::equal, L_doLast); + + __ aesenc(xmm_result, xmm_temp1); + __ aesenc(xmm_result, xmm_temp2); + + load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); + load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); + + __ cmpl(keylen, 52); + __ jccb(Assembler::equal, L_doLast); + + __ aesenc(xmm_result, xmm_temp1); + __ aesenc(xmm_result, xmm_temp2); + + load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); + load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); __ BIND(L_doLast); - __ aesenclast(xmm_result, xmm_temp); + __ aesenc(xmm_result, xmm_temp1); + __ aesenclast(xmm_result, xmm_temp2); __ movdqu(Address(to, 0), xmm_result); // store the result __ xorptr(rax, rax); // return 0 __ leave(); // required for proper stackwalking of RuntimeStub frame @@ -3040,7 +3119,7 @@ // c_rarg2 - K (key) in little endian int array // address generate_aescrypt_decryptBlock() { - assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + assert(UseAES, "need AES instructions and misaligned SSE support"); __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); Label L_doLast; @@ -3052,15 +3131,17 @@ const Register keylen = rax; const XMMRegister xmm_result = xmm0; - const XMMRegister xmm_temp = xmm1; - const XMMRegister xmm_key_shuf_mask = xmm2; + const XMMRegister xmm_key_shuf_mask = xmm1; + // On win64 xmm6-xmm15 must be preserved so don't use them. + const XMMRegister xmm_temp1 = xmm2; + const XMMRegister xmm_temp2 = xmm3; + const XMMRegister xmm_temp3 = xmm4; + const XMMRegister xmm_temp4 = xmm5; __ enter(); // required for proper stackwalking of RuntimeStub frame + // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); - // keylen = # of 32-bit words, convert to 128-bit words - __ shrl(keylen, 2); - __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); __ movdqu(xmm_result, Address(from, 0)); @@ -3068,29 +3149,55 @@ // for decryption java expanded key ordering is rotated one position from what we want // so we start from 0x10 here and hit 0x00 last // we don't know if the key is aligned, hence not using load-execute form - load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask); - __ pxor (xmm_result, xmm_temp); - for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { - aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); - } - __ cmpl(keylen, 0); - __ jcc(Assembler::equal, L_doLast); - // only in 192 and 256 bit keys - aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); - aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask); - __ subl(keylen, 2); - __ jcc(Assembler::equal, L_doLast); - // only in 256 bit keys - aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); - aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask); + load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); + load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); + load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); + load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); + + __ pxor (xmm_result, xmm_temp1); + __ aesdec(xmm_result, xmm_temp2); + __ aesdec(xmm_result, xmm_temp3); + __ aesdec(xmm_result, xmm_temp4); + + load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); + load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); + load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); + load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); + + __ aesdec(xmm_result, xmm_temp1); + __ aesdec(xmm_result, xmm_temp2); + __ aesdec(xmm_result, xmm_temp3); + __ aesdec(xmm_result, xmm_temp4); + + load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); + load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); + load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask); + + __ cmpl(keylen, 44); + __ jccb(Assembler::equal, L_doLast); + + __ aesdec(xmm_result, xmm_temp1); + __ aesdec(xmm_result, xmm_temp2); + + load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); + load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); + + __ cmpl(keylen, 52); + __ jccb(Assembler::equal, L_doLast); + + __ aesdec(xmm_result, xmm_temp1); + __ aesdec(xmm_result, xmm_temp2); + + load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); + load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); __ BIND(L_doLast); + __ aesdec(xmm_result, xmm_temp1); + __ aesdec(xmm_result, xmm_temp2); + // for decryption the aesdeclast operation is always on key+0x00 - load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); - __ aesdeclast(xmm_result, xmm_temp); - + __ aesdeclast(xmm_result, xmm_temp3); __ movdqu(Address(to, 0), xmm_result); // store the result - __ xorptr(rax, rax); // return 0 __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -3109,7 +3216,7 @@ // c_rarg4 - input length // address generate_cipherBlockChaining_encryptAESCrypt() { - assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + assert(UseAES, "need AES instructions and misaligned SSE support"); __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); address start = __ pc(); @@ -3133,16 +3240,19 @@ const XMMRegister xmm_temp = xmm1; // keys 0-10 preloaded into xmm2-xmm12 const int XMM_REG_NUM_KEY_FIRST = 2; - const int XMM_REG_NUM_KEY_LAST = 12; + const int XMM_REG_NUM_KEY_LAST = 15; const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); - const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST); + const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10); + const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11); + const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12); + const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13); __ enter(); // required for proper stackwalking of RuntimeStub frame #ifdef _WIN64 // on win64, fill len_reg from stack position __ movl(len_reg, len_mem); - // save the xmm registers which must be preserved 6-12 + // save the xmm registers which must be preserved 6-15 __ subptr(rsp, -rsp_after_call_off * wordSize); for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { __ movdqu(xmm_save(i), as_XMMRegister(i)); @@ -3151,12 +3261,11 @@ const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); - // load up xmm regs 2 thru 12 with key 0x00 - 0xa0 - for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) { load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); offset += 0x10; } - __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) @@ -3167,16 +3276,15 @@ // 128 bit code follows here __ movptr(pos, 0); __ align(OptoLoopAlignment); + __ BIND(L_loopTop_128); __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input __ pxor (xmm_result, xmm_temp); // xor with the current r vector - __ pxor (xmm_result, xmm_key0); // do the aes rounds - for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) { __ aesenc(xmm_result, as_XMMRegister(rnum)); } __ aesenclast(xmm_result, xmm_key10); - __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit __ addptr(pos, AESBlockSize); @@ -3198,24 +3306,23 @@ __ BIND(L_key_192_256); // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) + load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask); + load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask); __ cmpl(rax, 52); __ jcc(Assembler::notEqual, L_key_256); // 192-bit code follows here (could be changed to use more xmm registers) __ movptr(pos, 0); __ align(OptoLoopAlignment); + __ BIND(L_loopTop_192); __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input __ pxor (xmm_result, xmm_temp); // xor with the current r vector - __ pxor (xmm_result, xmm_key0); // do the aes rounds - for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) { __ aesenc(xmm_result, as_XMMRegister(rnum)); } - aes_enc_key(xmm_result, xmm_temp, key, 0xb0); - load_key(xmm_temp, key, 0xc0); - __ aesenclast(xmm_result, xmm_temp); - + __ aesenclast(xmm_result, xmm_key12); __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit __ addptr(pos, AESBlockSize); @@ -3225,22 +3332,19 @@ __ BIND(L_key_256); // 256-bit code follows here (could be changed to use more xmm registers) + load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask); __ movptr(pos, 0); __ align(OptoLoopAlignment); + __ BIND(L_loopTop_256); __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input __ pxor (xmm_result, xmm_temp); // xor with the current r vector - __ pxor (xmm_result, xmm_key0); // do the aes rounds - for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) { __ aesenc(xmm_result, as_XMMRegister(rnum)); } - aes_enc_key(xmm_result, xmm_temp, key, 0xb0); - aes_enc_key(xmm_result, xmm_temp, key, 0xc0); - aes_enc_key(xmm_result, xmm_temp, key, 0xd0); load_key(xmm_temp, key, 0xe0); __ aesenclast(xmm_result, xmm_temp); - __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit __ addptr(pos, AESBlockSize); @@ -3267,7 +3371,7 @@ // address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { - assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); + assert(UseAES, "need AES instructions and misaligned SSE support"); __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); address start = __ pc(); @@ -3288,12 +3392,10 @@ #endif const Register pos = rax; - // xmm register assignments for the loops below - const XMMRegister xmm_result = xmm0; // keys 0-10 preloaded into xmm2-xmm12 const int XMM_REG_NUM_KEY_FIRST = 5; const int XMM_REG_NUM_KEY_LAST = 15; - const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); + const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST); __ enter(); // required for proper stackwalking of RuntimeStub frame @@ -3312,13 +3414,14 @@ const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 - for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { - if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00; + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) { load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); offset += 0x10; } + load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask); const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block + // registers holding the four results in the parallelized loop const XMMRegister xmm_result0 = xmm0; const XMMRegister xmm_result1 = xmm2; @@ -3376,8 +3479,12 @@ __ jmp(L_multiBlock_loopTop_128); // registers used in the non-parallelized loops + // xmm register assignments for the loops below + const XMMRegister xmm_result = xmm0; const XMMRegister xmm_prev_block_cipher_save = xmm2; - const XMMRegister xmm_temp = xmm3; + const XMMRegister xmm_key11 = xmm3; + const XMMRegister xmm_key12 = xmm4; + const XMMRegister xmm_temp = xmm4; __ align(OptoLoopAlignment); __ BIND(L_singleBlock_loopTop_128); @@ -3415,12 +3522,15 @@ __ BIND(L_key_192_256); // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) + load_key(xmm_key11, key, 0xb0); __ cmpl(rax, 52); __ jcc(Assembler::notEqual, L_key_256); // 192-bit code follows here (could be optimized to use parallelism) + load_key(xmm_key12, key, 0xc0); // 192-bit key goes up to c0 __ movptr(pos, 0); __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_192); __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector @@ -3428,14 +3538,13 @@ for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { __ aesdec(xmm_result, as_XMMRegister(rnum)); } - aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 192-bit key goes up to c0 - aes_dec_key(xmm_result, xmm_temp, key, 0xc0); + __ aesdec(xmm_result, xmm_key11); + __ aesdec(xmm_result, xmm_key12); __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector - __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit - __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block - + __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block __ addptr(pos, AESBlockSize); __ subptr(len_reg, AESBlockSize); __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); @@ -3445,23 +3554,26 @@ // 256-bit code follows here (could be optimized to use parallelism) __ movptr(pos, 0); __ align(OptoLoopAlignment); + __ BIND(L_singleBlock_loopTop_256); - __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input + __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { __ aesdec(xmm_result, as_XMMRegister(rnum)); } - aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 256-bit key goes up to e0 - aes_dec_key(xmm_result, xmm_temp, key, 0xc0); - aes_dec_key(xmm_result, xmm_temp, key, 0xd0); - aes_dec_key(xmm_result, xmm_temp, key, 0xe0); - __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 + __ aesdec(xmm_result, xmm_key11); + load_key(xmm_temp, key, 0xc0); + __ aesdec(xmm_result, xmm_temp); + load_key(xmm_temp, key, 0xd0); + __ aesdec(xmm_result, xmm_temp); + load_key(xmm_temp, key, 0xe0); // 256-bit key goes up to e0 + __ aesdec(xmm_result, xmm_temp); + __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector - __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output // no need to store r to memory until we exit - __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block - + __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block __ addptr(pos, AESBlockSize); __ subptr(len_reg, AESBlockSize); __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);