graal-compiler: src/cpu/x86/vm/stubGenerator_x86_32.cpp

comparison src/cpu/x86/vm/stubGenerator_x86_32.cpp @ 7482:989155e2d07a

Merge with hs25-b15.

author	Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
date	Wed, 16 Jan 2013 01:34:24 +0100
parents	291ffc492eb6 e2e6bf86682c
children	b9a918201d47

comparison

Legend: equal, deleted, inserted, replaced

-:6761a8f854a4
+:989155e2d07a
 // Copy 64-byte chunks
 __ jmpb(L_copy_64_bytes);
 __ align(OptoLoopAlignment);
 __ BIND(L_copy_64_bytes_loop);
-if(UseUnalignedLoadStores) {
+if (UseUnalignedLoadStores) {
-__ movdqu(xmm0, Address(from, 0));
+if (UseAVX >= 2) {
-__ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+__ vmovdqu(xmm0, Address(from,  0));
-__ movdqu(xmm1, Address(from, 16));
+__ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
-__ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+__ vmovdqu(xmm1, Address(from, 32));
-__ movdqu(xmm2, Address(from, 32));
+__ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
-__ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+} else {
-__ movdqu(xmm3, Address(from, 48));
+__ movdqu(xmm0, Address(from, 0));
-__ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+__ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+__ movdqu(xmm1, Address(from, 16));
+__ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+__ movdqu(xmm2, Address(from, 32));
+__ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+__ movdqu(xmm3, Address(from, 48));
+__ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+}
 } else {
 __ movq(xmm0, Address(from, 0));
 __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
 __ movq(xmm1, Address(from, 8));
 __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
 //   c_rarg0   - source byte array address
 //   c_rarg1   - destination byte array address
 //   c_rarg2   - K (key) in little endian int array
 //
 address generate_aescrypt_encryptBlock() {
-assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+assert(UseAES, "need AES instructions and misaligned SSE support");
 __ align(CodeEntryAlignment);
 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
 Label L_doLast;
 address start = __ pc();
-const Register from        = rsi;      // source array address
+const Register from        = rdx;      // source array address
 const Register to          = rdx;      // destination array address
 const Register key         = rcx;      // key array address
 const Register keylen      = rax;
 const Address  from_param(rbp, 8+0);
 const Address  to_param  (rbp, 8+4);
 const Address  key_param (rbp, 8+8);
 const XMMRegister xmm_result = xmm0;
-const XMMRegister xmm_temp   = xmm1;
+const XMMRegister xmm_key_shuf_mask = xmm1;
-const XMMRegister xmm_key_shuf_mask = xmm2;
+const XMMRegister xmm_temp1  = xmm2;
+const XMMRegister xmm_temp2  = xmm3;
-__ enter(); // required for proper stackwalking of RuntimeStub frame
+const XMMRegister xmm_temp3  = xmm4;
-__ push(rsi);
+const XMMRegister xmm_temp4  = xmm5;
-__ movptr(from , from_param);
-__ movptr(to   , to_param);
+__ enter();   // required for proper stackwalking of RuntimeStub frame
-__ movptr(key  , key_param);
+__ movptr(from, from_param);
+__ movptr(key, key_param);
+// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-// keylen = # of 32-bit words, convert to 128-bit words
-__ shrl(keylen, 2);
-__ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
 __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
+__ movptr(to, to_param);
 // For encryption, the java expanded key ordering is just what we need
-load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
-__ pxor(xmm_result, xmm_temp);
+__ pxor(xmm_result, xmm_temp1);
-for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
-aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
-}
+load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
-load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
+load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
-__ cmpl(keylen, 0);
+load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
-__ jcc(Assembler::equal, L_doLast);
-__ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
+__ aesenc(xmm_result, xmm_temp1);
-aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+__ aesenc(xmm_result, xmm_temp2);
-load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+__ aesenc(xmm_result, xmm_temp3);
-__ subl(keylen, 2);
+__ aesenc(xmm_result, xmm_temp4);
-__ jcc(Assembler::equal, L_doLast);
-__ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
+load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
-aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
-load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+__ aesenc(xmm_result, xmm_temp1);
+__ aesenc(xmm_result, xmm_temp2);
+__ aesenc(xmm_result, xmm_temp3);
+__ aesenc(xmm_result, xmm_temp4);
+load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+__ cmpl(keylen, 44);
+__ jccb(Assembler::equal, L_doLast);
+__ aesenc(xmm_result, xmm_temp1);
+__ aesenc(xmm_result, xmm_temp2);
+load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+__ cmpl(keylen, 52);
+__ jccb(Assembler::equal, L_doLast);
+__ aesenc(xmm_result, xmm_temp1);
+__ aesenc(xmm_result, xmm_temp2);
+load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
 __ BIND(L_doLast);
-__ aesenclast(xmm_result, xmm_temp);
+__ aesenc(xmm_result, xmm_temp1);
+__ aesenclast(xmm_result, xmm_temp2);
 __ movdqu(Address(to, 0), xmm_result);        // store the result
 __ xorptr(rax, rax); // return 0
-__ pop(rsi);
 __ leave(); // required for proper stackwalking of RuntimeStub frame
 __ ret(0);
 return start;
 }
 //   c_rarg0   - source byte array address
 //   c_rarg1   - destination byte array address
 //   c_rarg2   - K (key) in little endian int array
 //
 address generate_aescrypt_decryptBlock() {
-assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+assert(UseAES, "need AES instructions and misaligned SSE support");
 __ align(CodeEntryAlignment);
 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
 Label L_doLast;
 address start = __ pc();
-const Register from        = rsi;      // source array address
+const Register from        = rdx;      // source array address
 const Register to          = rdx;      // destination array address
 const Register key         = rcx;      // key array address
 const Register keylen      = rax;
 const Address  from_param(rbp, 8+0);
 const Address  to_param  (rbp, 8+4);
 const Address  key_param (rbp, 8+8);
 const XMMRegister xmm_result = xmm0;
-const XMMRegister xmm_temp   = xmm1;
+const XMMRegister xmm_key_shuf_mask = xmm1;
-const XMMRegister xmm_key_shuf_mask = xmm2;
+const XMMRegister xmm_temp1  = xmm2;
+const XMMRegister xmm_temp2  = xmm3;
+const XMMRegister xmm_temp3  = xmm4;
+const XMMRegister xmm_temp4  = xmm5;
 __ enter(); // required for proper stackwalking of RuntimeStub frame
-__ push(rsi);
+__ movptr(from, from_param);
-__ movptr(from , from_param);
+__ movptr(key, key_param);
-__ movptr(to   , to_param);
-__ movptr(key  , key_param);
+// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-// keylen = # of 32-bit words, convert to 128-bit words
-__ shrl(keylen, 2);
-__ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
 __ movdqu(xmm_result, Address(from, 0));
+__ movptr(to, to_param);
 // for decryption java expanded key ordering is rotated one position from what we want
 // so we start from 0x10 here and hit 0x00 last
 // we don't know if the key is aligned, hence not using load-execute form
-load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
+load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
-__ pxor  (xmm_result, xmm_temp);
+load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
-for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
+load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
-aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
-}
-__ cmpl(keylen, 0);
+__ pxor  (xmm_result, xmm_temp1);
-__ jcc(Assembler::equal, L_doLast);
+__ aesdec(xmm_result, xmm_temp2);
-// only in 192 and 256 bit keys
+__ aesdec(xmm_result, xmm_temp3);
-aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+__ aesdec(xmm_result, xmm_temp4);
-aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
-__ subl(keylen, 2);
+load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
-__ jcc(Assembler::equal, L_doLast);
+load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
-// only in 256 bit keys
+load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
-aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
-aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+__ aesdec(xmm_result, xmm_temp1);
+__ aesdec(xmm_result, xmm_temp2);
+__ aesdec(xmm_result, xmm_temp3);
+__ aesdec(xmm_result, xmm_temp4);
+load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
+__ cmpl(keylen, 44);
+__ jccb(Assembler::equal, L_doLast);
+__ aesdec(xmm_result, xmm_temp1);
+__ aesdec(xmm_result, xmm_temp2);
+load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+__ cmpl(keylen, 52);
+__ jccb(Assembler::equal, L_doLast);
+__ aesdec(xmm_result, xmm_temp1);
+__ aesdec(xmm_result, xmm_temp2);
+load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
 __ BIND(L_doLast);
+__ aesdec(xmm_result, xmm_temp1);
+__ aesdec(xmm_result, xmm_temp2);
 // for decryption the aesdeclast operation is always on key+0x00
-load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+__ aesdeclast(xmm_result, xmm_temp3);
-__ aesdeclast(xmm_result, xmm_temp);
 __ movdqu(Address(to, 0), xmm_result);  // store the result
 __ xorptr(rax, rax); // return 0
-__ pop(rsi);
 __ leave(); // required for proper stackwalking of RuntimeStub frame
 __ ret(0);
 return start;
 }
 //   c_rarg2   - K (key) in little endian int array
 //   c_rarg3   - r vector byte array address
 //   c_rarg4   - input length
 //
 address generate_cipherBlockChaining_encryptAESCrypt() {
-assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+assert(UseAES, "need AES instructions and misaligned SSE support");
 __ align(CodeEntryAlignment);
 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
 address start = __ pc();
 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 __ cmpl(rax, 44);
 __ jcc(Assembler::notEqual, L_key_192_256);
 // 128 bit code follows here
-__ movptr(pos, 0);
+__ movl(pos, 0);
 __ align(OptoLoopAlignment);
 __ BIND(L_loopTop_128);
 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
 __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
 handleSOERegisters(false /*restoring*/);
 __ movl(rax, 0);                             // return 0 (why?)
 __ leave();                                  // required for proper stackwalking of RuntimeStub frame
 __ ret(0);
 __ BIND(L_key_192_256);
 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
 __ cmpl(rax, 52);
 __ jcc(Assembler::notEqual, L_key_256);
 // 192-bit code follows here (could be changed to use more xmm registers)
-__ movptr(pos, 0);
+__ movl(pos, 0);
 __ align(OptoLoopAlignment);
 __ BIND(L_loopTop_192);
 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
 __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
 __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
 __ addptr(pos, AESBlockSize);
 __ subptr(len_reg, AESBlockSize);
 __ jcc(Assembler::notEqual, L_loopTop_192);
 __ jmp(L_exit);
 __ BIND(L_key_256);
 // 256-bit code follows here (could be changed to use more xmm registers)
-__ movptr(pos, 0);
+__ movl(pos, 0);
 __ align(OptoLoopAlignment);
 __ BIND(L_loopTop_256);
 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
 __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector
 __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
 //   c_rarg3   - r vector byte array address
 //   c_rarg4   - input length
 //
 address generate_cipherBlockChaining_decryptAESCrypt() {
-assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+assert(UseAES, "need AES instructions and misaligned SSE support");
 __ align(CodeEntryAlignment);
 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
 address start = __ pc();
 Label L_exit, L_key_192_256, L_key_256;
 __ cmpl(rax, 44);
 __ jcc(Assembler::notEqual, L_key_192_256);
 // 128-bit code follows here, parallelized
-__ movptr(pos, 0);
+__ movl(pos, 0);
 __ align(OptoLoopAlignment);
 __ BIND(L_singleBlock_loopTop_128);
 __ cmpptr(len_reg, 0);           // any blocks left??
 __ jcc(Assembler::equal, L_exit);
 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
 __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
 __ cmpl(rax, 52);
 __ jcc(Assembler::notEqual, L_key_256);
 // 192-bit code follows here (could be optimized to use parallelism)
-__ movptr(pos, 0);
+__ movl(pos, 0);
 __ align(OptoLoopAlignment);
 __ BIND(L_singleBlock_loopTop_192);
 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
 __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
 __ jmp(L_exit);
 __ BIND(L_key_256);
 // 256-bit code follows here (could be optimized to use parallelism)
-__ movptr(pos, 0);
+__ movl(pos, 0);
 __ align(OptoLoopAlignment);
 __ BIND(L_singleBlock_loopTop_256);
 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
 __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {

Mercurial > hg > graal-compiler

comparison src/cpu/x86/vm/stubGenerator_x86_32.cpp @ 7482:989155e2d07a