comparison src/cpu/x86/vm/stubGenerator_x86_32.cpp @ 7427:2c7f594145dc

8004835: Improve AES intrinsics on x86
Summary: Enable AES intrinsics on non-AVX CPUs, group together AES instructions in crypto stubs.
Reviewed-by: roland, twisti
author kvn
date Wed, 19 Dec 2012 15:40:35 -0800
parents d2f8c38e543d
children e2e6bf86682c
comparison
legend: equal | deleted | inserted | replaced
left column: 7426:65c8342f726a | right column: 7427:2c7f594145dc
2172 // c_rarg0 - source byte array address 2172 // c_rarg0 - source byte array address
2173 // c_rarg1 - destination byte array address 2173 // c_rarg1 - destination byte array address
2174 // c_rarg2 - K (key) in little endian int array 2174 // c_rarg2 - K (key) in little endian int array
2175 // 2175 //
2176 address generate_aescrypt_encryptBlock() { 2176 address generate_aescrypt_encryptBlock() {
2177 assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 2177 assert(UseAES, "need AES instructions and misaligned SSE support");
2178 __ align(CodeEntryAlignment); 2178 __ align(CodeEntryAlignment);
2179 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2179 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2180 Label L_doLast; 2180 Label L_doLast;
2181 address start = __ pc(); 2181 address start = __ pc();
2182 2182
2183 const Register from = rsi; // source array address 2183 const Register from = rdx; // source array address
2184 const Register to = rdx; // destination array address 2184 const Register to = rdx; // destination array address
2185 const Register key = rcx; // key array address 2185 const Register key = rcx; // key array address
2186 const Register keylen = rax; 2186 const Register keylen = rax;
2187 const Address from_param(rbp, 8+0); 2187 const Address from_param(rbp, 8+0);
2188 const Address to_param (rbp, 8+4); 2188 const Address to_param (rbp, 8+4);
2189 const Address key_param (rbp, 8+8); 2189 const Address key_param (rbp, 8+8);
2190 2190
2191 const XMMRegister xmm_result = xmm0; 2191 const XMMRegister xmm_result = xmm0;
2192 const XMMRegister xmm_temp = xmm1; 2192 const XMMRegister xmm_key_shuf_mask = xmm1;
2193 const XMMRegister xmm_key_shuf_mask = xmm2; 2193 const XMMRegister xmm_temp1 = xmm2;
2194 2194 const XMMRegister xmm_temp2 = xmm3;
2195 __ enter(); // required for proper stackwalking of RuntimeStub frame 2195 const XMMRegister xmm_temp3 = xmm4;
2196 __ push(rsi); 2196 const XMMRegister xmm_temp4 = xmm5;
2197 __ movptr(from , from_param); 2197
2198 __ movptr(to , to_param); 2198 __ enter(); // required for proper stackwalking of RuntimeStub frame
2199 __ movptr(key , key_param); 2199 __ movptr(from, from_param);
2200 2200 __ movptr(key, key_param);
2201
2202 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2201 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2203 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2202 // keylen = # of 32-bit words, convert to 128-bit words
2203 __ shrl(keylen, 2);
2204 __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more
2205 2204
2206 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 2205 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2207 __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input 2206 __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
2207 __ movptr(to, to_param);
2208 2208
2209 // For encryption, the java expanded key ordering is just what we need 2209 // For encryption, the java expanded key ordering is just what we need
2210 2210
2211 load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); 2211 load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
2212 __ pxor(xmm_result, xmm_temp); 2212 __ pxor(xmm_result, xmm_temp1);
2213 for (int offset = 0x10; offset <= 0x90; offset += 0x10) { 2213
2214 aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); 2214 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2215 } 2215 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2216 load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask); 2216 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2217 __ cmpl(keylen, 0); 2217 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2218 __ jcc(Assembler::equal, L_doLast); 2218
2219 __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys 2219 __ aesenc(xmm_result, xmm_temp1);
2220 aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); 2220 __ aesenc(xmm_result, xmm_temp2);
2221 load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask); 2221 __ aesenc(xmm_result, xmm_temp3);
2222 __ subl(keylen, 2); 2222 __ aesenc(xmm_result, xmm_temp4);
2223 __ jcc(Assembler::equal, L_doLast); 2223
2224 __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys 2224 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2225 aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); 2225 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2226 load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask); 2226 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2227 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2228
2229 __ aesenc(xmm_result, xmm_temp1);
2230 __ aesenc(xmm_result, xmm_temp2);
2231 __ aesenc(xmm_result, xmm_temp3);
2232 __ aesenc(xmm_result, xmm_temp4);
2233
2234 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2235 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2236
2237 __ cmpl(keylen, 44);
2238 __ jccb(Assembler::equal, L_doLast);
2239
2240 __ aesenc(xmm_result, xmm_temp1);
2241 __ aesenc(xmm_result, xmm_temp2);
2242
2243 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2244 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2245
2246 __ cmpl(keylen, 52);
2247 __ jccb(Assembler::equal, L_doLast);
2248
2249 __ aesenc(xmm_result, xmm_temp1);
2250 __ aesenc(xmm_result, xmm_temp2);
2251
2252 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2253 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2227 2254
2228 __ BIND(L_doLast); 2255 __ BIND(L_doLast);
2229 __ aesenclast(xmm_result, xmm_temp); 2256 __ aesenc(xmm_result, xmm_temp1);
2257 __ aesenclast(xmm_result, xmm_temp2);
2230 __ movdqu(Address(to, 0), xmm_result); // store the result 2258 __ movdqu(Address(to, 0), xmm_result); // store the result
2231 __ xorptr(rax, rax); // return 0 2259 __ xorptr(rax, rax); // return 0
2232 __ pop(rsi);
2233 __ leave(); // required for proper stackwalking of RuntimeStub frame 2260 __ leave(); // required for proper stackwalking of RuntimeStub frame
2234 __ ret(0); 2261 __ ret(0);
2235 2262
2236 return start; 2263 return start;
2237 } 2264 }
2243 // c_rarg0 - source byte array address 2270 // c_rarg0 - source byte array address
2244 // c_rarg1 - destination byte array address 2271 // c_rarg1 - destination byte array address
2245 // c_rarg2 - K (key) in little endian int array 2272 // c_rarg2 - K (key) in little endian int array
2246 // 2273 //
2247 address generate_aescrypt_decryptBlock() { 2274 address generate_aescrypt_decryptBlock() {
2248 assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 2275 assert(UseAES, "need AES instructions and misaligned SSE support");
2249 __ align(CodeEntryAlignment); 2276 __ align(CodeEntryAlignment);
2250 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2277 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2251 Label L_doLast; 2278 Label L_doLast;
2252 address start = __ pc(); 2279 address start = __ pc();
2253 2280
2254 const Register from = rsi; // source array address 2281 const Register from = rdx; // source array address
2255 const Register to = rdx; // destination array address 2282 const Register to = rdx; // destination array address
2256 const Register key = rcx; // key array address 2283 const Register key = rcx; // key array address
2257 const Register keylen = rax; 2284 const Register keylen = rax;
2258 const Address from_param(rbp, 8+0); 2285 const Address from_param(rbp, 8+0);
2259 const Address to_param (rbp, 8+4); 2286 const Address to_param (rbp, 8+4);
2260 const Address key_param (rbp, 8+8); 2287 const Address key_param (rbp, 8+8);
2261 2288
2262 const XMMRegister xmm_result = xmm0; 2289 const XMMRegister xmm_result = xmm0;
2263 const XMMRegister xmm_temp = xmm1; 2290 const XMMRegister xmm_key_shuf_mask = xmm1;
2264 const XMMRegister xmm_key_shuf_mask = xmm2; 2291 const XMMRegister xmm_temp1 = xmm2;
2292 const XMMRegister xmm_temp2 = xmm3;
2293 const XMMRegister xmm_temp3 = xmm4;
2294 const XMMRegister xmm_temp4 = xmm5;
2265 2295
2266 __ enter(); // required for proper stackwalking of RuntimeStub frame 2296 __ enter(); // required for proper stackwalking of RuntimeStub frame
2267 __ push(rsi); 2297 __ movptr(from, from_param);
2268 __ movptr(from , from_param); 2298 __ movptr(key, key_param);
2269 __ movptr(to , to_param); 2299
2270 __ movptr(key , key_param); 2300 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2271
2272 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2301 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2273 // keylen = # of 32-bit words, convert to 128-bit words
2274 __ shrl(keylen, 2);
2275 __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more
2276 2302
2277 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 2303 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2278 __ movdqu(xmm_result, Address(from, 0)); 2304 __ movdqu(xmm_result, Address(from, 0));
2305 __ movptr(to, to_param);
2279 2306
2280 // for decryption java expanded key ordering is rotated one position from what we want 2307 // for decryption java expanded key ordering is rotated one position from what we want
2281 // so we start from 0x10 here and hit 0x00 last 2308 // so we start from 0x10 here and hit 0x00 last
2282 // we don't know if the key is aligned, hence not using load-execute form 2309 // we don't know if the key is aligned, hence not using load-execute form
2283 load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask); 2310 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2284 __ pxor (xmm_result, xmm_temp); 2311 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2285 for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { 2312 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2286 aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); 2313 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2287 } 2314
2288 __ cmpl(keylen, 0); 2315 __ pxor (xmm_result, xmm_temp1);
2289 __ jcc(Assembler::equal, L_doLast); 2316 __ aesdec(xmm_result, xmm_temp2);
2290 // only in 192 and 256 bit keys 2317 __ aesdec(xmm_result, xmm_temp3);
2291 aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); 2318 __ aesdec(xmm_result, xmm_temp4);
2292 aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask); 2319
2293 __ subl(keylen, 2); 2320 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2294 __ jcc(Assembler::equal, L_doLast); 2321 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2295 // only in 256 bit keys 2322 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2296 aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); 2323 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2297 aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask); 2324
2325 __ aesdec(xmm_result, xmm_temp1);
2326 __ aesdec(xmm_result, xmm_temp2);
2327 __ aesdec(xmm_result, xmm_temp3);
2328 __ aesdec(xmm_result, xmm_temp4);
2329
2330 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2331 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2332 load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
2333
2334 __ cmpl(keylen, 44);
2335 __ jccb(Assembler::equal, L_doLast);
2336
2337 __ aesdec(xmm_result, xmm_temp1);
2338 __ aesdec(xmm_result, xmm_temp2);
2339
2340 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2341 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2342
2343 __ cmpl(keylen, 52);
2344 __ jccb(Assembler::equal, L_doLast);
2345
2346 __ aesdec(xmm_result, xmm_temp1);
2347 __ aesdec(xmm_result, xmm_temp2);
2348
2349 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2350 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2298 2351
2299 __ BIND(L_doLast); 2352 __ BIND(L_doLast);
2353 __ aesdec(xmm_result, xmm_temp1);
2354 __ aesdec(xmm_result, xmm_temp2);
2355
2300 // for decryption the aesdeclast operation is always on key+0x00 2356 // for decryption the aesdeclast operation is always on key+0x00
2301 load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); 2357 __ aesdeclast(xmm_result, xmm_temp3);
2302 __ aesdeclast(xmm_result, xmm_temp);
2303
2304 __ movdqu(Address(to, 0), xmm_result); // store the result 2358 __ movdqu(Address(to, 0), xmm_result); // store the result
2305
2306 __ xorptr(rax, rax); // return 0 2359 __ xorptr(rax, rax); // return 0
2307 __ pop(rsi);
2308 __ leave(); // required for proper stackwalking of RuntimeStub frame 2360 __ leave(); // required for proper stackwalking of RuntimeStub frame
2309 __ ret(0); 2361 __ ret(0);
2310 2362
2311 return start; 2363 return start;
2312 } 2364 }
2338 // c_rarg2 - K (key) in little endian int array 2390 // c_rarg2 - K (key) in little endian int array
2339 // c_rarg3 - r vector byte array address 2391 // c_rarg3 - r vector byte array address
2340 // c_rarg4 - input length 2392 // c_rarg4 - input length
2341 // 2393 //
2342 address generate_cipherBlockChaining_encryptAESCrypt() { 2394 address generate_cipherBlockChaining_encryptAESCrypt() {
2343 assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 2395 assert(UseAES, "need AES instructions and misaligned SSE support");
2344 __ align(CodeEntryAlignment); 2396 __ align(CodeEntryAlignment);
2345 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2397 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2346 address start = __ pc(); 2398 address start = __ pc();
2347 2399
2348 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; 2400 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
2391 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2443 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2392 __ cmpl(rax, 44); 2444 __ cmpl(rax, 44);
2393 __ jcc(Assembler::notEqual, L_key_192_256); 2445 __ jcc(Assembler::notEqual, L_key_192_256);
2394 2446
2395 // 128 bit code follows here 2447 // 128 bit code follows here
2396 __ movptr(pos, 0); 2448 __ movl(pos, 0);
2397 __ align(OptoLoopAlignment); 2449 __ align(OptoLoopAlignment);
2398 __ BIND(L_loopTop_128); 2450 __ BIND(L_loopTop_128);
2399 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 2451 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
2400 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 2452 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
2401 2453
2421 handleSOERegisters(false /*restoring*/); 2473 handleSOERegisters(false /*restoring*/);
2422 __ movl(rax, 0); // return 0 (why?) 2474 __ movl(rax, 0); // return 0 (why?)
2423 __ leave(); // required for proper stackwalking of RuntimeStub frame 2475 __ leave(); // required for proper stackwalking of RuntimeStub frame
2424 __ ret(0); 2476 __ ret(0);
2425 2477
2426 __ BIND(L_key_192_256); 2478 __ BIND(L_key_192_256);
2427 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 2479 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
2428 __ cmpl(rax, 52); 2480 __ cmpl(rax, 52);
2429 __ jcc(Assembler::notEqual, L_key_256); 2481 __ jcc(Assembler::notEqual, L_key_256);
2430 2482
2431 // 192-bit code follows here (could be changed to use more xmm registers) 2483 // 192-bit code follows here (could be changed to use more xmm registers)
2432 __ movptr(pos, 0); 2484 __ movl(pos, 0);
2433 __ align(OptoLoopAlignment); 2485 __ align(OptoLoopAlignment);
2434 __ BIND(L_loopTop_192); 2486 __ BIND(L_loopTop_192);
2435 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 2487 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
2436 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 2488 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
2437 2489
2438 __ pxor (xmm_result, xmm_key0); // do the aes rounds 2490 __ pxor (xmm_result, xmm_key0); // do the aes rounds
2439 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 2491 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2450 __ addptr(pos, AESBlockSize); 2502 __ addptr(pos, AESBlockSize);
2451 __ subptr(len_reg, AESBlockSize); 2503 __ subptr(len_reg, AESBlockSize);
2452 __ jcc(Assembler::notEqual, L_loopTop_192); 2504 __ jcc(Assembler::notEqual, L_loopTop_192);
2453 __ jmp(L_exit); 2505 __ jmp(L_exit);
2454 2506
2455 __ BIND(L_key_256); 2507 __ BIND(L_key_256);
2456 // 256-bit code follows here (could be changed to use more xmm registers) 2508 // 256-bit code follows here (could be changed to use more xmm registers)
2457 __ movptr(pos, 0); 2509 __ movl(pos, 0);
2458 __ align(OptoLoopAlignment); 2510 __ align(OptoLoopAlignment);
2459 __ BIND(L_loopTop_256); 2511 __ BIND(L_loopTop_256);
2460 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 2512 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
2461 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 2513 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
2462 2514
2463 __ pxor (xmm_result, xmm_key0); // do the aes rounds 2515 __ pxor (xmm_result, xmm_key0); // do the aes rounds
2464 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 2516 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2493 // c_rarg3 - r vector byte array address 2545 // c_rarg3 - r vector byte array address
2494 // c_rarg4 - input length 2546 // c_rarg4 - input length
2495 // 2547 //
2496 2548
2497 address generate_cipherBlockChaining_decryptAESCrypt() { 2549 address generate_cipherBlockChaining_decryptAESCrypt() {
2498 assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 2550 assert(UseAES, "need AES instructions and misaligned SSE support");
2499 __ align(CodeEntryAlignment); 2551 __ align(CodeEntryAlignment);
2500 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2552 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2501 address start = __ pc(); 2553 address start = __ pc();
2502 2554
2503 Label L_exit, L_key_192_256, L_key_256; 2555 Label L_exit, L_key_192_256, L_key_256;
2554 __ cmpl(rax, 44); 2606 __ cmpl(rax, 44);
2555 __ jcc(Assembler::notEqual, L_key_192_256); 2607 __ jcc(Assembler::notEqual, L_key_192_256);
2556 2608
2557 2609
2558 // 128-bit code follows here, parallelized 2610 // 128-bit code follows here, parallelized
2559 __ movptr(pos, 0); 2611 __ movl(pos, 0);
2560 __ align(OptoLoopAlignment); 2612 __ align(OptoLoopAlignment);
2561 __ BIND(L_singleBlock_loopTop_128); 2613 __ BIND(L_singleBlock_loopTop_128);
2562 __ cmpptr(len_reg, 0); // any blocks left?? 2614 __ cmpptr(len_reg, 0); // any blocks left??
2563 __ jcc(Assembler::equal, L_exit); 2615 __ jcc(Assembler::equal, L_exit);
2564 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 2616 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
2565 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 2617 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds
2566 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 2618 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2595 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 2647 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
2596 __ cmpl(rax, 52); 2648 __ cmpl(rax, 52);
2597 __ jcc(Assembler::notEqual, L_key_256); 2649 __ jcc(Assembler::notEqual, L_key_256);
2598 2650
2599 // 192-bit code follows here (could be optimized to use parallelism) 2651 // 192-bit code follows here (could be optimized to use parallelism)
2600 __ movptr(pos, 0); 2652 __ movl(pos, 0);
2601 __ align(OptoLoopAlignment); 2653 __ align(OptoLoopAlignment);
2602 __ BIND(L_singleBlock_loopTop_192); 2654 __ BIND(L_singleBlock_loopTop_192);
2603 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 2655 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
2604 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 2656 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds
2605 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 2657 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2620 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); 2672 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
2621 __ jmp(L_exit); 2673 __ jmp(L_exit);
2622 2674
2623 __ BIND(L_key_256); 2675 __ BIND(L_key_256);
2624 // 256-bit code follows here (could be optimized to use parallelism) 2676 // 256-bit code follows here (could be optimized to use parallelism)
2625 __ movptr(pos, 0); 2677 __ movl(pos, 0);
2626 __ align(OptoLoopAlignment); 2678 __ align(OptoLoopAlignment);
2627 __ BIND(L_singleBlock_loopTop_256); 2679 __ BIND(L_singleBlock_loopTop_256);
2628 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 2680 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
2629 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 2681 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds
2630 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 2682 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {