comparison src/cpu/sparc/vm/stubGenerator_sparc.cpp @ 17910:03214612e77e

8035936: SIGBUS in StubRoutines::aesencryptBlock, solaris-sparc
Summary: Fix the arbitrary alignment issue in SPARC AES crypto stub routines.
Reviewed-by: kvn, iveresov
Contributed-by: shrinivas.joshi@oracle.com
author kvn
date Wed, 30 Apr 2014 14:14:01 -0700
parents 04d32e7fad07
children 0fb5b60ab4a2
17909:85d6efcb1fa3 17910:03214612e77e
1 /* 1 /*
2 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. 2 * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 * 4 *
5 * This code is free software; you can redistribute it and/or modify it 5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as 6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
3303 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words"); 3303 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
3304 } 3304 }
3305 } 3305 }
3306 3306
3307 address generate_aescrypt_encryptBlock() { 3307 address generate_aescrypt_encryptBlock() {
3308 // required since we read expanded key 'int' array starting first element without alignment considerations
3309 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3310 "the following code assumes that first element of an int array is aligned to 8 bytes");
3308 __ align(CodeEntryAlignment); 3311 __ align(CodeEntryAlignment);
3309 StubCodeMark mark(this, "StubRoutines", "aesencryptBlock"); 3312 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3310 Label L_doLast128bit, L_storeOutput; 3313 Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
3311 address start = __ pc(); 3314 address start = __ pc();
3312 Register from = O0; // source byte array 3315 Register from = O0; // source byte array
3313 Register to = O1; // destination byte array 3316 Register to = O1; // destination byte array
3314 Register key = O2; // expanded key array 3317 Register key = O2; // expanded key array
3315 const Register keylen = O4; //reg for storing expanded key array length 3318 const Register keylen = O4; //reg for storing expanded key array length
3316 3319
3317 // read expanded key length 3320 // read expanded key length
3318 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3321 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3319 3322
3320 // load input into F54-F56; F30-F31 used as temp 3323 // Method to address arbitrary alignment for load instructions:
3321 __ ldf(FloatRegisterImpl::S, from, 0, F30); 3324 // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
3322 __ ldf(FloatRegisterImpl::S, from, 4, F31); 3325 // If zero/aligned then continue with double FP load instructions
3323 __ fmov(FloatRegisterImpl::D, F30, F54); 3326 // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
3324 __ ldf(FloatRegisterImpl::S, from, 8, F30); 3327 // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
3325 __ ldf(FloatRegisterImpl::S, from, 12, F31); 3328 // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
3326 __ fmov(FloatRegisterImpl::D, F30, F56); 3329 // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
3327 3330
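A rough C++ model of the load scheme described in the comments above (an illustrative sketch, not HotSpot code; SPARC big-endian byte order and the VIS alignaddr/faligndata semantics are simplified):

#include <cstdint>

// Big-endian 8-byte load, as on SPARC (byte 0 ends up as the MSB).
static uint64_t load_be64(const uint8_t* p) {
  uint64_t v = 0;
  for (int i = 0; i < 8; i++) v = (v << 8) | p[i];
  return v;
}

// faligndata(hi, lo): view hi:lo as 16 bytes and extract the 8 bytes
// starting at byte offset 'gsr_align' (0..7), the value alignaddr recorded.
static uint64_t faligndata_model(uint64_t hi, uint64_t lo, unsigned gsr_align) {
  if (gsr_align == 0) return hi;
  return (hi << (8 * gsr_align)) | (lo >> (8 * (8 - gsr_align)));
}

// Load 16 input bytes from an arbitrarily aligned 'from', touching memory
// only through aligned 8-byte words (the property that avoids the SIGBUS).
static void load_16_misaligned(const uint8_t* from, uint64_t out[2]) {
  unsigned gsr_align = (uintptr_t)from & 7;                     // what alignaddr records
  const uint8_t* aligned = (const uint8_t*)((uintptr_t)from & ~(uintptr_t)7);
  uint64_t d0 = load_be64(aligned);                             // F54 in the stub
  uint64_t d1 = load_be64(aligned + 8);                         // F56
  uint64_t d2 = load_be64(aligned + 16);                        // F58, only needed when misaligned
  out[0] = faligndata_model(d0, d1, gsr_align);
  out[1] = faligndata_model(d1, d2, gsr_align);
}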
3328 // load expanded key 3331 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3332 __ andcc(from, 7, G0);
3333 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3334 __ delayed()->alignaddr(from, G0, from);
3335
3336 // aligned case: load input into F54-F56
3337 __ ldf(FloatRegisterImpl::D, from, 0, F54);
3338 __ ldf(FloatRegisterImpl::D, from, 8, F56);
3339 __ ba_short(L_load_expanded_key);
3340
3341 __ BIND(L_load_misaligned_input);
3342 __ ldf(FloatRegisterImpl::D, from, 0, F54);
3343 __ ldf(FloatRegisterImpl::D, from, 8, F56);
3344 __ ldf(FloatRegisterImpl::D, from, 16, F58);
3345 __ faligndata(F54, F56, F54);
3346 __ faligndata(F56, F58, F56);
3347
3348 __ BIND(L_load_expanded_key);
3349 // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed
3329 for ( int i = 0; i <= 38; i += 2 ) { 3350 for ( int i = 0; i <= 38; i += 2 ) {
3330 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); 3351 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
3331 } 3352 }
3332 3353
3333 // perform cipher transformation 3354 // perform cipher transformation
3363 __ ldf(FloatRegisterImpl::D, key, 216, F46); 3384 __ ldf(FloatRegisterImpl::D, key, 216, F46);
3364 __ ldf(FloatRegisterImpl::D, key, 224, F48); 3385 __ ldf(FloatRegisterImpl::D, key, 224, F48);
3365 __ ldf(FloatRegisterImpl::D, key, 232, F50); 3386 __ ldf(FloatRegisterImpl::D, key, 232, F50);
3366 __ aes_eround01(F52, F54, F56, F58); //round 13 3387 __ aes_eround01(F52, F54, F56, F58); //round 13
3367 __ aes_eround23(F46, F54, F56, F60); 3388 __ aes_eround23(F46, F54, F56, F60);
3368 __ br(Assembler::always, false, Assembler::pt, L_storeOutput); 3389 __ ba_short(L_storeOutput);
3369 __ delayed()->nop();
3370 3390
3371 __ BIND(L_doLast128bit); 3391 __ BIND(L_doLast128bit);
3372 __ ldf(FloatRegisterImpl::D, key, 160, F48); 3392 __ ldf(FloatRegisterImpl::D, key, 160, F48);
3373 __ ldf(FloatRegisterImpl::D, key, 168, F50); 3393 __ ldf(FloatRegisterImpl::D, key, 168, F50);
3374 3394
3375 __ BIND(L_storeOutput); 3395 __ BIND(L_storeOutput);
3376 // perform last round of encryption common for all key sizes 3396 // perform last round of encryption common for all key sizes
3377 __ aes_eround01_l(F48, F58, F60, F54); //last round 3397 __ aes_eround01_l(F48, F58, F60, F54); //last round
3378 __ aes_eround23_l(F50, F58, F60, F56); 3398 __ aes_eround23_l(F50, F58, F60, F56);
3379 3399
3380 // store output into the destination array, F0-F1 used as temp 3400 // Method to address arbitrary alignment for store instructions:
3381 __ fmov(FloatRegisterImpl::D, F54, F0); 3401 // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
3382 __ stf(FloatRegisterImpl::S, F0, to, 0); 3402 // If zero/aligned then continue with double FP store instructions
3383 __ stf(FloatRegisterImpl::S, F1, to, 4); 3403 // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
3384 __ fmov(FloatRegisterImpl::D, F56, F0); 3404 // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
3385 __ stf(FloatRegisterImpl::S, F0, to, 8); 3405 // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
3406 // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
3407 // Set GSR.align to (8-n) using alignaddr
3408 // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
3409 // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
3410 // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
3411 // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address
3412 // We need to execute this process for both the 8-byte result values
3413
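A rough C++ model of the edge8n/stpartialf store scheme described in the comments above (an illustrative sketch, not HotSpot code; edge8n is modeled only for the form used here, with G0 as the second operand):

#include <cstdint>

// edge8n(addr, 0): byte mask selecting the bytes from (addr & 7) to the end of
// the 8-byte word; bit 7 corresponds to byte 0, matching the example above.
static unsigned edge8n_model(uintptr_t addr) {
  return 0xFFu >> (addr & 7);
}

// stpartialf with ASI_PST8_PRIMARY: store only the bytes selected by 'mask'
// to the 8-byte aligned address 'p' (byte 0 of the register value is the MSB).
static void stpartialf_model(uint8_t* p, unsigned mask, uint64_t data) {
  for (int i = 0; i < 8; i++)
    if (mask & (0x80u >> i)) p[i] = (uint8_t)(data >> (8 * (7 - i)));
}

// faligndata(x, x) after alignaddr has set GSR.align to (8 - n) is a circular
// byte shift of x to the right by n places.
static uint64_t rotate_right_bytes(uint64_t x, unsigned n) {
  return n ? (x >> (8 * n)) | (x << (8 * (8 - n))) : x;
}

// Store one 8-byte result to an arbitrarily aligned 'to'; the stub repeats
// this process for both result registers (F54 and F56).
static void store_8_misaligned(uint8_t* to, uint64_t data) {
  unsigned n = (uintptr_t)to & 7;                      // bytes skipped past the boundary
  uint64_t rot = rotate_right_bytes(data, n);          // original bytes now sit in their final slots
  uint8_t* lo = (uint8_t*)((uintptr_t)to & ~(uintptr_t)7);
  unsigned mask = edge8n_model((uintptr_t)to);
  stpartialf_model(lo, mask, rot);                     // first (8 - n) bytes, starting at 'to'
  stpartialf_model(lo + 8, ~mask & 0xFFu, rot);        // remaining n bytes (orn negates the mask)
}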
3414 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3415 __ andcc(to, 7, O5);
3416 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3417 __ delayed()->edge8n(to, G0, O3);
3418
3419 // aligned case: store output into the destination array
3420 __ stf(FloatRegisterImpl::D, F54, to, 0);
3386 __ retl(); 3421 __ retl();
3387 __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); 3422 __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
3423
3424 __ BIND(L_store_misaligned_output);
3425 __ add(to, 8, O4);
3426 __ mov(8, O2);
3427 __ sub(O2, O5, O2);
3428 __ alignaddr(O2, G0, O2);
3429 __ faligndata(F54, F54, F54);
3430 __ faligndata(F56, F56, F56);
3431 __ and3(to, -8, to);
3432 __ and3(O4, -8, O4);
3433 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3434 __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3435 __ add(to, 8, to);
3436 __ add(O4, 8, O4);
3437 __ orn(G0, O3, O3);
3438 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3439 __ retl();
3440 __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3388 3441
3389 return start; 3442 return start;
3390 } 3443 }
3391 3444
3392 address generate_aescrypt_decryptBlock() { 3445 address generate_aescrypt_decryptBlock() {
3446 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3447 "the following code assumes that first element of an int array is aligned to 8 bytes");
3448 // required since we read original key 'byte' array as well in the decryption stubs
3449 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3450 "the following code assumes that first element of a byte array is aligned to 8 bytes");
3393 __ align(CodeEntryAlignment); 3451 __ align(CodeEntryAlignment);
3394 StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock"); 3452 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3395 address start = __ pc(); 3453 address start = __ pc();
3396 Label L_expand192bit, L_expand256bit, L_common_transform; 3454 Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
3455 Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
3397 Register from = O0; // source byte array 3456 Register from = O0; // source byte array
3398 Register to = O1; // destination byte array 3457 Register to = O1; // destination byte array
3399 Register key = O2; // expanded key array 3458 Register key = O2; // expanded key array
3400 Register original_key = O3; // original key array only required during decryption 3459 Register original_key = O3; // original key array only required during decryption
3401 const Register keylen = O4; // reg for storing expanded key array length 3460 const Register keylen = O4; // reg for storing expanded key array length
3402 3461
3403 // read expanded key array length 3462 // read expanded key array length
3404 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3463 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3405 3464
3406 // load input into F52-F54; F30,F31 used as temp 3465 // save 'from' since we may need to recheck alignment in case of 256-bit decryption
3407 __ ldf(FloatRegisterImpl::S, from, 0, F30); 3466 __ mov(from, G1);
3408 __ ldf(FloatRegisterImpl::S, from, 4, F31); 3467
3409 __ fmov(FloatRegisterImpl::D, F30, F52); 3468 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3410 __ ldf(FloatRegisterImpl::S, from, 8, F30); 3469 __ andcc(from, 7, G0);
3411 __ ldf(FloatRegisterImpl::S, from, 12, F31); 3470 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3412 __ fmov(FloatRegisterImpl::D, F30, F54); 3471 __ delayed()->alignaddr(from, G0, from);
3413 3472
3473 // aligned case: load input into F52-F54
3474 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3475 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3476 __ ba_short(L_load_original_key);
3477
3478 __ BIND(L_load_misaligned_input);
3479 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3480 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3481 __ ldf(FloatRegisterImpl::D, from, 16, F56);
3482 __ faligndata(F52, F54, F52);
3483 __ faligndata(F54, F56, F54);
3484
3485 __ BIND(L_load_original_key);
3414 // load original key from SunJCE expanded decryption key 3486 // load original key from SunJCE expanded decryption key
3487 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
3415 for ( int i = 0; i <= 3; i++ ) { 3488 for ( int i = 0; i <= 3; i++ ) {
3416 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3489 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3417 } 3490 }
3418 3491
3419 // 256-bit original key size 3492 // 256-bit original key size
3430 } 3503 }
3431 3504
3432 // perform 128-bit key specific inverse cipher transformation 3505 // perform 128-bit key specific inverse cipher transformation
3433 __ fxor(FloatRegisterImpl::D, F42, F54, F54); 3506 __ fxor(FloatRegisterImpl::D, F42, F54, F54);
3434 __ fxor(FloatRegisterImpl::D, F40, F52, F52); 3507 __ fxor(FloatRegisterImpl::D, F40, F52, F52);
3435 __ br(Assembler::always, false, Assembler::pt, L_common_transform); 3508 __ ba_short(L_common_transform);
3436 __ delayed()->nop();
3437 3509
3438 __ BIND(L_expand192bit); 3510 __ BIND(L_expand192bit);
3439 3511
3440 // start loading rest of the 192-bit key 3512 // start loading rest of the 192-bit key
3441 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); 3513 __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3455 __ fxor(FloatRegisterImpl::D, F48, F52, F52); 3527 __ fxor(FloatRegisterImpl::D, F48, F52, F52);
3456 __ aes_dround23(F46, F52, F54, F58); 3528 __ aes_dround23(F46, F52, F54, F58);
3457 __ aes_dround01(F44, F52, F54, F56); 3529 __ aes_dround01(F44, F52, F54, F56);
3458 __ aes_dround23(F42, F56, F58, F54); 3530 __ aes_dround23(F42, F56, F58, F54);
3459 __ aes_dround01(F40, F56, F58, F52); 3531 __ aes_dround01(F40, F56, F58, F52);
3460 __ br(Assembler::always, false, Assembler::pt, L_common_transform); 3532 __ ba_short(L_common_transform);
3461 __ delayed()->nop();
3462 3533
3463 __ BIND(L_expand256bit); 3534 __ BIND(L_expand256bit);
3464 3535
3465 // load rest of the 256-bit key 3536 // load rest of the 256-bit key
3466 for ( int i = 4; i <= 7; i++ ) { 3537 for ( int i = 4; i <= 7; i++ ) {
3476 } 3547 }
3477 __ aes_kexpand1(F48, F54, 6, F56); 3548 __ aes_kexpand1(F48, F54, 6, F56);
3478 __ aes_kexpand2(F50, F56, F58); 3549 __ aes_kexpand2(F50, F56, F58);
3479 3550
3480 for ( int i = 0; i <= 6; i += 2 ) { 3551 for ( int i = 0; i <= 6; i += 2 ) {
3481 __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); 3552 __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
3482 } 3553 }
3483 3554
3484 // load input into F52-F54 3555 // reload original 'from' address
3556 __ mov(G1, from);
3557
3558 // re-check 8-byte alignment
3559 __ andcc(from, 7, G0);
3560 __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
3561 __ delayed()->alignaddr(from, G0, from);
3562
3563 // aligned case: load input into F52-F54
3485 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3564 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3486 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3565 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3566 __ ba_short(L_256bit_transform);
3567
3568 __ BIND(L_reload_misaligned_input);
3569 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3570 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3571 __ ldf(FloatRegisterImpl::D, from, 16, F56);
3572 __ faligndata(F52, F54, F52);
3573 __ faligndata(F54, F56, F54);
3487 3574
3488 // perform 256-bit key specific inverse cipher transformation 3575 // perform 256-bit key specific inverse cipher transformation
3576 __ BIND(L_256bit_transform);
3489 __ fxor(FloatRegisterImpl::D, F0, F54, F54); 3577 __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3490 __ fxor(FloatRegisterImpl::D, F2, F52, F52); 3578 __ fxor(FloatRegisterImpl::D, F2, F52, F52);
3491 __ aes_dround23(F4, F52, F54, F58); 3579 __ aes_dround23(F4, F52, F54, F58);
3492 __ aes_dround01(F6, F52, F54, F56); 3580 __ aes_dround01(F6, F52, F54, F56);
3493 __ aes_dround23(F50, F56, F58, F54); 3581 __ aes_dround23(F50, F56, F58, F54);
3513 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54); 3601 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
3514 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52); 3602 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
3515 } 3603 }
3516 } 3604 }
3517 3605
3518 // store output to destination array, F0-F1 used as temp 3606 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3519 __ fmov(FloatRegisterImpl::D, F52, F0); 3607 __ andcc(to, 7, O5);
3520 __ stf(FloatRegisterImpl::S, F0, to, 0); 3608 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3521 __ stf(FloatRegisterImpl::S, F1, to, 4); 3609 __ delayed()->edge8n(to, G0, O3);
3522 __ fmov(FloatRegisterImpl::D, F54, F0); 3610
3523 __ stf(FloatRegisterImpl::S, F0, to, 8); 3611 // aligned case: store output into the destination array
3612 __ stf(FloatRegisterImpl::D, F52, to, 0);
3524 __ retl(); 3613 __ retl();
3525 __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); 3614 __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
3615
3616 __ BIND(L_store_misaligned_output);
3617 __ add(to, 8, O4);
3618 __ mov(8, O2);
3619 __ sub(O2, O5, O2);
3620 __ alignaddr(O2, G0, O2);
3621 __ faligndata(F52, F52, F52);
3622 __ faligndata(F54, F54, F54);
3623 __ and3(to, -8, to);
3624 __ and3(O4, -8, O4);
3625 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3626 __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3627 __ add(to, 8, to);
3628 __ add(O4, 8, O4);
3629 __ orn(G0, O3, O3);
3630 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3631 __ retl();
3632 __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3526 3633
3527 return start; 3634 return start;
3528 } 3635 }
3529 3636
3530 address generate_cipherBlockChaining_encryptAESCrypt() { 3637 address generate_cipherBlockChaining_encryptAESCrypt() {
3638 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3639 "the following code assumes that first element of an int array is aligned to 8 bytes");
3640 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3641 "the following code assumes that first element of a byte array is aligned to 8 bytes");
3531 __ align(CodeEntryAlignment); 3642 __ align(CodeEntryAlignment);
3532 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 3643 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3533 Label L_cbcenc128, L_cbcenc192, L_cbcenc256; 3644 Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
3645 Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
3646 Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
3647 Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
3534 address start = __ pc(); 3648 address start = __ pc();
3535 Register from = O0; // source byte array 3649 Register from = I0; // source byte array
3536 Register to = O1; // destination byte array 3650 Register to = I1; // destination byte array
3537 Register key = O2; // expanded key array 3651 Register key = I2; // expanded key array
3538 Register rvec = O3; // init vector 3652 Register rvec = I3; // init vector
3539 const Register len_reg = O4; // cipher length 3653 const Register len_reg = I4; // cipher length
3540 const Register keylen = O5; // reg for storing expanded key array length 3654 const Register keylen = I5; // reg for storing expanded key array length
3541 3655
3542 // save cipher len to return in the end 3656 // save cipher len before save_frame, to return in the end
3543 __ mov(len_reg, L1); 3657 __ mov(O4, L0);
3658 __ save_frame(0);
3544 3659
3545 // read expanded key length 3660 // read expanded key length
3546 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3661 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3547 3662
3548 // load init vector 3663 // load initial vector, 8-byte alignment is guaranteed
3549 __ ldf(FloatRegisterImpl::D, rvec, 0, F60); 3664 __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
3550 __ ldf(FloatRegisterImpl::D, rvec, 8, F62); 3665 __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
3666 // load key, 8-byte alignment is guaranteed
3551 __ ldx(key,0,G1); 3667 __ ldx(key,0,G1);
3552 __ ldx(key,8,G2); 3668 __ ldx(key,8,G5);
3553 3669
3554 // start loading expanded key 3670 // start loading expanded key, 8-byte alignment is guaranteed
3555 for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) { 3671 for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) {
3556 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 3672 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3557 } 3673 }
3558 3674
3559 // 128-bit original key size 3675 // 128-bit original key size
3569 for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) { 3685 for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) {
3570 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 3686 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3571 } 3687 }
3572 3688
3573 // 256-bit original key size 3689 // 256-bit original key size
3574 __ br(Assembler::always, false, Assembler::pt, L_cbcenc256); 3690 __ ba_short(L_cbcenc256);
3575 __ delayed()->nop();
3576 3691
3577 __ align(OptoLoopAlignment); 3692 __ align(OptoLoopAlignment);
3578 __ BIND(L_cbcenc128); 3693 __ BIND(L_cbcenc128);
3694 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3695 __ andcc(from, 7, G0);
3696 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
3697 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3698
3699 // aligned case: load input into G3 and G4
3579 __ ldx(from,0,G3); 3700 __ ldx(from,0,G3);
3580 __ ldx(from,8,G4); 3701 __ ldx(from,8,G4);
3702 __ ba_short(L_128bit_transform);
3703
3704 __ BIND(L_load_misaligned_input_128bit);
3705 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3706 __ alignaddr(from, G0, from);
3707 __ ldf(FloatRegisterImpl::D, from, 0, F48);
3708 __ ldf(FloatRegisterImpl::D, from, 8, F50);
3709 __ ldf(FloatRegisterImpl::D, from, 16, F52);
3710 __ faligndata(F48, F50, F48);
3711 __ faligndata(F50, F52, F50);
3712 __ movdtox(F48, G3);
3713 __ movdtox(F50, G4);
3714 __ mov(L1, from);
3715
3716 __ BIND(L_128bit_transform);
3581 __ xor3(G1,G3,G3); 3717 __ xor3(G1,G3,G3);
3582 __ xor3(G2,G4,G4); 3718 __ xor3(G5,G4,G4);
3583 __ movxtod(G3,F56); 3719 __ movxtod(G3,F56);
3584 __ movxtod(G4,F58); 3720 __ movxtod(G4,F58);
3585 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3721 __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3586 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3722 __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3587 3723
3596 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3732 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3597 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3733 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3598 } 3734 }
3599 } 3735 }
3600 3736
3737 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3738 __ andcc(to, 7, L1);
3739 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
3740 __ delayed()->edge8n(to, G0, L2);
3741
3742 // aligned case: store output into the destination array
3601 __ stf(FloatRegisterImpl::D, F60, to, 0); 3743 __ stf(FloatRegisterImpl::D, F60, to, 0);
3602 __ stf(FloatRegisterImpl::D, F62, to, 8); 3744 __ stf(FloatRegisterImpl::D, F62, to, 8);
3745 __ ba_short(L_check_loop_end_128bit);
3746
3747 __ BIND(L_store_misaligned_output_128bit);
3748 __ add(to, 8, L3);
3749 __ mov(8, L4);
3750 __ sub(L4, L1, L4);
3751 __ alignaddr(L4, G0, L4);
3752 // save cipher text before circular right shift
3753 // as it needs to be stored as iv for next block (see code before next retl)
3754 __ movdtox(F60, L6);
3755 __ movdtox(F62, L7);
3756 __ faligndata(F60, F60, F60);
3757 __ faligndata(F62, F62, F62);
3758 __ mov(to, L5);
3759 __ and3(to, -8, to);
3760 __ and3(L3, -8, L3);
3761 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3762 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3763 __ add(to, 8, to);
3764 __ add(L3, 8, L3);
3765 __ orn(G0, L2, L2);
3766 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3767 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3768 __ mov(L5, to);
3769 __ movxtod(L6, F60);
3770 __ movxtod(L7, F62);
3771
3772 __ BIND(L_check_loop_end_128bit);
3603 __ add(from, 16, from); 3773 __ add(from, 16, from);
3604 __ add(to, 16, to); 3774 __ add(to, 16, to);
3605 __ subcc(len_reg, 16, len_reg); 3775 __ subcc(len_reg, 16, len_reg);
3606 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128); 3776 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
3607 __ delayed()->nop(); 3777 __ delayed()->nop();
3778 // re-init initial vector for next block, 8-byte alignment is guaranteed
3608 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3779 __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3609 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3780 __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3781 __ restore();
3610 __ retl(); 3782 __ retl();
3611 __ delayed()->mov(L1, O0); 3783 __ delayed()->mov(L0, O0);
3612 3784
3613 __ align(OptoLoopAlignment); 3785 __ align(OptoLoopAlignment);
3614 __ BIND(L_cbcenc192); 3786 __ BIND(L_cbcenc192);
3787 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3788 __ andcc(from, 7, G0);
3789 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
3790 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3791
3792 // aligned case: load input into G3 and G4
3615 __ ldx(from,0,G3); 3793 __ ldx(from,0,G3);
3616 __ ldx(from,8,G4); 3794 __ ldx(from,8,G4);
3795 __ ba_short(L_192bit_transform);
3796
3797 __ BIND(L_load_misaligned_input_192bit);
3798 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3799 __ alignaddr(from, G0, from);
3800 __ ldf(FloatRegisterImpl::D, from, 0, F48);
3801 __ ldf(FloatRegisterImpl::D, from, 8, F50);
3802 __ ldf(FloatRegisterImpl::D, from, 16, F52);
3803 __ faligndata(F48, F50, F48);
3804 __ faligndata(F50, F52, F50);
3805 __ movdtox(F48, G3);
3806 __ movdtox(F50, G4);
3807 __ mov(L1, from);
3808
3809 __ BIND(L_192bit_transform);
3617 __ xor3(G1,G3,G3); 3810 __ xor3(G1,G3,G3);
3618 __ xor3(G2,G4,G4); 3811 __ xor3(G5,G4,G4);
3619 __ movxtod(G3,F56); 3812 __ movxtod(G3,F56);
3620 __ movxtod(G4,F58); 3813 __ movxtod(G4,F58);
3621 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3814 __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3622 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3815 __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3623 3816
3632 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3825 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3633 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3826 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3634 } 3827 }
3635 } 3828 }
3636 3829
3830 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3831 __ andcc(to, 7, L1);
3832 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
3833 __ delayed()->edge8n(to, G0, L2);
3834
3835 // aligned case: store output into the destination array
3637 __ stf(FloatRegisterImpl::D, F60, to, 0); 3836 __ stf(FloatRegisterImpl::D, F60, to, 0);
3638 __ stf(FloatRegisterImpl::D, F62, to, 8); 3837 __ stf(FloatRegisterImpl::D, F62, to, 8);
3838 __ ba_short(L_check_loop_end_192bit);
3839
3840 __ BIND(L_store_misaligned_output_192bit);
3841 __ add(to, 8, L3);
3842 __ mov(8, L4);
3843 __ sub(L4, L1, L4);
3844 __ alignaddr(L4, G0, L4);
3845 __ movdtox(F60, L6);
3846 __ movdtox(F62, L7);
3847 __ faligndata(F60, F60, F60);
3848 __ faligndata(F62, F62, F62);
3849 __ mov(to, L5);
3850 __ and3(to, -8, to);
3851 __ and3(L3, -8, L3);
3852 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3853 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3854 __ add(to, 8, to);
3855 __ add(L3, 8, L3);
3856 __ orn(G0, L2, L2);
3857 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3858 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3859 __ mov(L5, to);
3860 __ movxtod(L6, F60);
3861 __ movxtod(L7, F62);
3862
3863 __ BIND(L_check_loop_end_192bit);
3639 __ add(from, 16, from); 3864 __ add(from, 16, from);
3640 __ subcc(len_reg, 16, len_reg); 3865 __ subcc(len_reg, 16, len_reg);
3641 __ add(to, 16, to); 3866 __ add(to, 16, to);
3642 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192); 3867 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
3643 __ delayed()->nop(); 3868 __ delayed()->nop();
3869 // re-init initial vector for next block, 8-byte alignment is guaranteed
3644 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3870 __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3645 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3871 __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3872 __ restore();
3646 __ retl(); 3873 __ retl();
3647 __ delayed()->mov(L1, O0); 3874 __ delayed()->mov(L0, O0);
3648 3875
3649 __ align(OptoLoopAlignment); 3876 __ align(OptoLoopAlignment);
3650 __ BIND(L_cbcenc256); 3877 __ BIND(L_cbcenc256);
3878 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3879 __ andcc(from, 7, G0);
3880 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
3881 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3882
3883 // aligned case: load input into G3 and G4
3651 __ ldx(from,0,G3); 3884 __ ldx(from,0,G3);
3652 __ ldx(from,8,G4); 3885 __ ldx(from,8,G4);
3886 __ ba_short(L_256bit_transform);
3887
3888 __ BIND(L_load_misaligned_input_256bit);
3889 // cannot clobber F48, F50 and F52. F56, F58 can be used though
3890 __ alignaddr(from, G0, from);
3891 __ movdtox(F60, L2); // save F60 before overwriting
3892 __ ldf(FloatRegisterImpl::D, from, 0, F56);
3893 __ ldf(FloatRegisterImpl::D, from, 8, F58);
3894 __ ldf(FloatRegisterImpl::D, from, 16, F60);
3895 __ faligndata(F56, F58, F56);
3896 __ faligndata(F58, F60, F58);
3897 __ movdtox(F56, G3);
3898 __ movdtox(F58, G4);
3899 __ mov(L1, from);
3900 __ movxtod(L2, F60);
3901
3902 __ BIND(L_256bit_transform);
3653 __ xor3(G1,G3,G3); 3903 __ xor3(G1,G3,G3);
3654 __ xor3(G2,G4,G4); 3904 __ xor3(G5,G4,G4);
3655 __ movxtod(G3,F56); 3905 __ movxtod(G3,F56);
3656 __ movxtod(G4,F58); 3906 __ movxtod(G4,F58);
3657 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3907 __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3658 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3908 __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3659 3909
3668 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3918 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3669 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3919 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3670 } 3920 }
3671 } 3921 }
3672 3922
3923 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3924 __ andcc(to, 7, L1);
3925 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
3926 __ delayed()->edge8n(to, G0, L2);
3927
3928 // aligned case: store output into the destination array
3673 __ stf(FloatRegisterImpl::D, F60, to, 0); 3929 __ stf(FloatRegisterImpl::D, F60, to, 0);
3674 __ stf(FloatRegisterImpl::D, F62, to, 8); 3930 __ stf(FloatRegisterImpl::D, F62, to, 8);
3931 __ ba_short(L_check_loop_end_256bit);
3932
3933 __ BIND(L_store_misaligned_output_256bit);
3934 __ add(to, 8, L3);
3935 __ mov(8, L4);
3936 __ sub(L4, L1, L4);
3937 __ alignaddr(L4, G0, L4);
3938 __ movdtox(F60, L6);
3939 __ movdtox(F62, L7);
3940 __ faligndata(F60, F60, F60);
3941 __ faligndata(F62, F62, F62);
3942 __ mov(to, L5);
3943 __ and3(to, -8, to);
3944 __ and3(L3, -8, L3);
3945 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3946 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3947 __ add(to, 8, to);
3948 __ add(L3, 8, L3);
3949 __ orn(G0, L2, L2);
3950 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3951 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3952 __ mov(L5, to);
3953 __ movxtod(L6, F60);
3954 __ movxtod(L7, F62);
3955
3956 __ BIND(L_check_loop_end_256bit);
3675 __ add(from, 16, from); 3957 __ add(from, 16, from);
3676 __ subcc(len_reg, 16, len_reg); 3958 __ subcc(len_reg, 16, len_reg);
3677 __ add(to, 16, to); 3959 __ add(to, 16, to);
3678 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256); 3960 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
3679 __ delayed()->nop(); 3961 __ delayed()->nop();
3962 // re-init initial vector for next block, 8-byte alignment is guaranteed
3680 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3963 __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3681 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3964 __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3965 __ restore();
3682 __ retl(); 3966 __ retl();
3683 __ delayed()->mov(L1, O0); 3967 __ delayed()->mov(L0, O0);
3684 3968
3685 return start; 3969 return start;
3686 } 3970 }
3687 3971
3688 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 3972 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3973 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3974 "the following code assumes that first element of an int array is aligned to 8 bytes");
3975 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3976 "the following code assumes that first element of a byte array is aligned to 8 bytes");
3689 __ align(CodeEntryAlignment); 3977 __ align(CodeEntryAlignment);
3690 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 3978 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3691 Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start; 3979 Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
3692 Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256; 3980 Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
3981 Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
3982 Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
3983 Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
3984 Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
3985 Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
3693 address start = __ pc(); 3986 address start = __ pc();
3694 Register from = I0; // source byte array 3987 Register from = I0; // source byte array
3695 Register to = I1; // destination byte array 3988 Register to = I1; // destination byte array
3696 Register key = I2; // expanded key array 3989 Register key = I2; // expanded key array
3697 Register rvec = I3; // init vector 3990 Register rvec = I3; // init vector
3702 // save cipher len before save_frame, to return in the end 3995 // save cipher len before save_frame, to return in the end
3703 __ mov(O4, L0); 3996 __ mov(O4, L0);
3704 __ save_frame(0); //args are read from I* registers since we save the frame in the beginning 3997 __ save_frame(0); //args are read from I* registers since we save the frame in the beginning
3705 3998
3706 // load original key from SunJCE expanded decryption key 3999 // load original key from SunJCE expanded decryption key
4000 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
3707 for ( int i = 0; i <= 3; i++ ) { 4001 for ( int i = 0; i <= 3; i++ ) {
3708 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 4002 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3709 } 4003 }
3710 4004
3711 // load initial vector 4005 // load initial vector, 8-byte alignment is guaranteed
3712 __ ldx(rvec,0,L0); 4006 __ ldx(rvec,0,L0);
3713 __ ldx(rvec,8,L1); 4007 __ ldx(rvec,8,L1);
3714 4008
3715 // read expanded key array length 4009 // read expanded key array length
3716 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 4010 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3731 // load expanded key[last-1] and key[last] elements 4025 // load expanded key[last-1] and key[last] elements
3732 __ movdtox(F40,L2); 4026 __ movdtox(F40,L2);
3733 __ movdtox(F42,L3); 4027 __ movdtox(F42,L3);
3734 4028
3735 __ and3(len_reg, 16, L4); 4029 __ and3(len_reg, 16, L4);
3736 __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128); 4030 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
3737 __ delayed()->nop(); 4031 __ nop();
3738 4032
3739 __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); 4033 __ ba_short(L_dec_first_block_start);
3740 __ delayed()->nop();
3741 4034
3742 __ BIND(L_expand192bit); 4035 __ BIND(L_expand192bit);
3743 // load rest of the 192-bit key 4036 // load rest of the 192-bit key
3744 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); 4037 __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3745 __ ldf(FloatRegisterImpl::S, original_key, 20, F5); 4038 __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3756 // load expanded key[last-1] and key[last] elements 4049 // load expanded key[last-1] and key[last] elements
3757 __ movdtox(F48,L2); 4050 __ movdtox(F48,L2);
3758 __ movdtox(F50,L3); 4051 __ movdtox(F50,L3);
3759 4052
3760 __ and3(len_reg, 16, L4); 4053 __ and3(len_reg, 16, L4);
3761 __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192); 4054 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
3762 __ delayed()->nop(); 4055 __ nop();
3763 4056
3764 __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); 4057 __ ba_short(L_dec_first_block_start);
3765 __ delayed()->nop();
3766 4058
3767 __ BIND(L_expand256bit); 4059 __ BIND(L_expand256bit);
3768 // load rest of the 256-bit key 4060 // load rest of the 256-bit key
3769 for ( int i = 4; i <= 7; i++ ) { 4061 for ( int i = 4; i <= 7; i++ ) {
3770 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 4062 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3783 // load expanded key[last-1] and key[last] elements 4075 // load expanded key[last-1] and key[last] elements
3784 __ movdtox(F56,L2); 4076 __ movdtox(F56,L2);
3785 __ movdtox(F58,L3); 4077 __ movdtox(F58,L3);
3786 4078
3787 __ and3(len_reg, 16, L4); 4079 __ and3(len_reg, 16, L4);
3788 __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256); 4080 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
3789 __ delayed()->nop();
3790 4081
3791 __ BIND(L_dec_first_block_start); 4082 __ BIND(L_dec_first_block_start);
4083 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4084 __ andcc(from, 7, G0);
4085 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
4086 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4087
4088 // aligned case: load input into L4 and L5
3792 __ ldx(from,0,L4); 4089 __ ldx(from,0,L4);
3793 __ ldx(from,8,L5); 4090 __ ldx(from,8,L5);
4091 __ ba_short(L_transform_first_block);
4092
4093 __ BIND(L_load_misaligned_input_first_block);
4094 __ alignaddr(from, G0, from);
4095 // F58, F60, F62 can be clobbered
4096 __ ldf(FloatRegisterImpl::D, from, 0, F58);
4097 __ ldf(FloatRegisterImpl::D, from, 8, F60);
4098 __ ldf(FloatRegisterImpl::D, from, 16, F62);
4099 __ faligndata(F58, F60, F58);
4100 __ faligndata(F60, F62, F60);
4101 __ movdtox(F58, L4);
4102 __ movdtox(F60, L5);
4103 __ mov(G1, from);
4104
4105 __ BIND(L_transform_first_block);
3794 __ xor3(L2,L4,G1); 4106 __ xor3(L2,L4,G1);
3795 __ movxtod(G1,F60); 4107 __ movxtod(G1,F60);
3796 __ xor3(L3,L5,G1); 4108 __ xor3(L3,L5,G1);
3797 __ movxtod(G1,F62); 4109 __ movxtod(G1,F62);
3798 4110
3831 __ mov(L4,L0); 4143 __ mov(L4,L0);
3832 __ mov(L5,L1); 4144 __ mov(L5,L1);
3833 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4145 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
3834 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4146 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
3835 4147
4148 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4149 __ andcc(to, 7, G1);
4150 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
4151 __ delayed()->edge8n(to, G0, G2);
4152
4153 // aligned case: store output into the destination array
3836 __ stf(FloatRegisterImpl::D, F60, to, 0); 4154 __ stf(FloatRegisterImpl::D, F60, to, 0);
3837 __ stf(FloatRegisterImpl::D, F62, to, 8); 4155 __ stf(FloatRegisterImpl::D, F62, to, 8);
3838 4156 __ ba_short(L_check_decrypt_end);
4157
4158 __ BIND(L_store_misaligned_output_first_block);
4159 __ add(to, 8, G3);
4160 __ mov(8, G4);
4161 __ sub(G4, G1, G4);
4162 __ alignaddr(G4, G0, G4);
4163 __ faligndata(F60, F60, F60);
4164 __ faligndata(F62, F62, F62);
4165 __ mov(to, G1);
4166 __ and3(to, -8, to);
4167 __ and3(G3, -8, G3);
4168 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4169 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4170 __ add(to, 8, to);
4171 __ add(G3, 8, G3);
4172 __ orn(G0, G2, G2);
4173 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4174 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4175 __ mov(G1, to);
4176
4177 __ BIND(L_check_decrypt_end);
3839 __ add(from, 16, from); 4178 __ add(from, 16, from);
3840 __ add(to, 16, to); 4179 __ add(to, 16, to);
3841 __ subcc(len_reg, 16, len_reg); 4180 __ subcc(len_reg, 16, len_reg);
3842 __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end); 4181 __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
3843 __ delayed()->nop(); 4182 __ delayed()->nop();
3850 4189
3851 __ align(OptoLoopAlignment); 4190 __ align(OptoLoopAlignment);
3852 __ BIND(L_dec_next2_blocks128); 4191 __ BIND(L_dec_next2_blocks128);
3853 __ nop(); 4192 __ nop();
3854 4193
3855 // F40:F42 used for first 16-bytes 4194 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4195 __ andcc(from, 7, G0);
4196 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
4197 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4198
4199 // aligned case: load input into G4, G5, L4 and L5
3856 __ ldx(from,0,G4); 4200 __ ldx(from,0,G4);
3857 __ ldx(from,8,G5); 4201 __ ldx(from,8,G5);
4202 __ ldx(from,16,L4);
4203 __ ldx(from,24,L5);
4204 __ ba_short(L_transform_next2_blocks128);
4205
4206 __ BIND(L_load_misaligned_next2_blocks128);
4207 __ alignaddr(from, G0, from);
4208 // F40, F42, F58, F60, F62 can be clobbered
4209 __ ldf(FloatRegisterImpl::D, from, 0, F40);
4210 __ ldf(FloatRegisterImpl::D, from, 8, F42);
4211 __ ldf(FloatRegisterImpl::D, from, 16, F60);
4212 __ ldf(FloatRegisterImpl::D, from, 24, F62);
4213 __ ldf(FloatRegisterImpl::D, from, 32, F58);
4214 __ faligndata(F40, F42, F40);
4215 __ faligndata(F42, F60, F42);
4216 __ faligndata(F60, F62, F60);
4217 __ faligndata(F62, F58, F62);
4218 __ movdtox(F40, G4);
4219 __ movdtox(F42, G5);
4220 __ movdtox(F60, L4);
4221 __ movdtox(F62, L5);
4222 __ mov(G1, from);
4223
4224 __ BIND(L_transform_next2_blocks128);
4225 // F40:F42 used for first 16-bytes
3858 __ xor3(L2,G4,G1); 4226 __ xor3(L2,G4,G1);
3859 __ movxtod(G1,F40); 4227 __ movxtod(G1,F40);
3860 __ xor3(L3,G5,G1); 4228 __ xor3(L3,G5,G1);
3861 __ movxtod(G1,F42); 4229 __ movxtod(G1,F42);
3862 4230
3863 // F60:F62 used for next 16-bytes 4231 // F60:F62 used for next 16-bytes
3864 __ ldx(from,16,L4);
3865 __ ldx(from,24,L5);
3866 __ xor3(L2,L4,G1); 4232 __ xor3(L2,L4,G1);
3867 __ movxtod(G1,F60); 4233 __ movxtod(G1,F60);
3868 __ xor3(L3,L5,G1); 4234 __ xor3(L3,L5,G1);
3869 __ movxtod(G1,F62); 4235 __ movxtod(G1,F62);
3870 4236
3889 __ movxtod(L0,F46); 4255 __ movxtod(L0,F46);
3890 __ movxtod(L1,F44); 4256 __ movxtod(L1,F44);
3891 __ fxor(FloatRegisterImpl::D, F46, F40, F40); 4257 __ fxor(FloatRegisterImpl::D, F46, F40, F40);
3892 __ fxor(FloatRegisterImpl::D, F44, F42, F42); 4258 __ fxor(FloatRegisterImpl::D, F44, F42, F42);
3893 4259
3894 __ stf(FloatRegisterImpl::D, F40, to, 0);
3895 __ stf(FloatRegisterImpl::D, F42, to, 8);
3896
3897 __ movxtod(G4,F56); 4260 __ movxtod(G4,F56);
3898 __ movxtod(G5,F58); 4261 __ movxtod(G5,F58);
3899 __ mov(L4,L0); 4262 __ mov(L4,L0);
3900 __ mov(L5,L1); 4263 __ mov(L5,L1);
3901 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4264 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
3902 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4265 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
3903 4266
4267 // For mis-aligned store of 32 bytes of result we can do:
4268 // Circular right-shift all 4 FP registers so that 'head' and 'tail'
4269 // parts that need to be stored starting at mis-aligned address are in a FP reg
4270 // the other 3 FP regs can thus be stored using regular store
4271 // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts
4272
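A rough C++ model of this 32-byte misaligned store (an illustrative sketch, not HotSpot code; it assumes the misaligned branch, i.e. to & 7 != 0, and big-endian byte order):

#include <cstdint>

// Big-endian store of 8 bytes (byte 0 of the register value is the MSB).
static void store8_model(uint8_t* p, uint64_t x) {
  for (int i = 0; i < 8; i++) p[i] = (uint8_t)(x >> (8 * (7 - i)));
}

// stpartialf model: store only the bytes selected by 'mask' (bit 7 = byte 0).
static void store8_masked_model(uint8_t* p, unsigned mask, uint64_t x) {
  for (int i = 0; i < 8; i++)
    if (mask & (0x80u >> i)) p[i] = (uint8_t)(x >> (8 * (7 - i)));
}

// faligndata(a, b) with GSR.align = off: 8 bytes of a:b starting at offset 'off'.
static uint64_t faligndata_model(uint64_t a, uint64_t b, unsigned off) {
  return off ? (a << (8 * off)) | (b >> (8 * (8 - off))) : a;
}

// r[0..3] are the four 8-byte results (F40, F42, F60, F62 in the 128-bit path).
static void store_32_misaligned(uint8_t* to, const uint64_t r[4]) {
  unsigned n = (uintptr_t)to & 7;                       // 1..7 on this branch
  unsigned off = 8 - n;                                 // GSR.align after alignaddr(8 - n)
  uint64_t t0   = faligndata_model(r[0], r[1], off);    // tail(r0) ++ head(r1)
  uint64_t t1   = faligndata_model(r[1], r[2], off);
  uint64_t t2   = faligndata_model(r[2], r[3], off);
  uint64_t edge = faligndata_model(r[3], r[0], off);    // tail(r3) ++ head(r0)
  uint8_t* a = (uint8_t*)((uintptr_t)to & ~(uintptr_t)7);
  unsigned mask = 0xFFu >> n;                           // edge8n(to, G0)
  store8_masked_model(a, mask, edge);                   // head of r0, the first (8 - n) bytes
  store8_model(a + 8,  t0);                             // plain stores for the middle 24 bytes
  store8_model(a + 16, t1);
  store8_model(a + 24, t2);
  store8_masked_model(a + 32, ~mask & 0xFFu, edge);     // tail of r3, the last n bytes
}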
4273 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4274 __ andcc(to, 7, G1);
4275 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
4276 __ delayed()->edge8n(to, G0, G2);
4277
4278 // aligned case: store output into the destination array
4279 __ stf(FloatRegisterImpl::D, F40, to, 0);
4280 __ stf(FloatRegisterImpl::D, F42, to, 8);
3904 __ stf(FloatRegisterImpl::D, F60, to, 16); 4281 __ stf(FloatRegisterImpl::D, F60, to, 16);
3905 __ stf(FloatRegisterImpl::D, F62, to, 24); 4282 __ stf(FloatRegisterImpl::D, F62, to, 24);
3906 4283 __ ba_short(L_check_decrypt_loop_end128);
4284
4285 __ BIND(L_store_misaligned_output_next2_blocks128);
4286 __ mov(8, G4);
4287 __ sub(G4, G1, G4);
4288 __ alignaddr(G4, G0, G4);
4289 __ faligndata(F40, F42, F56); // F56 can be clobbered
4290 __ faligndata(F42, F60, F42);
4291 __ faligndata(F60, F62, F60);
4292 __ faligndata(F62, F40, F40);
4293 __ mov(to, G1);
4294 __ and3(to, -8, to);
4295 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4296 __ stf(FloatRegisterImpl::D, F56, to, 8);
4297 __ stf(FloatRegisterImpl::D, F42, to, 16);
4298 __ stf(FloatRegisterImpl::D, F60, to, 24);
4299 __ add(to, 32, to);
4300 __ orn(G0, G2, G2);
4301 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4302 __ mov(G1, to);
4303
4304 __ BIND(L_check_decrypt_loop_end128);
3907 __ add(from, 32, from); 4305 __ add(from, 32, from);
3908 __ add(to, 32, to); 4306 __ add(to, 32, to);
3909 __ subcc(len_reg, 32, len_reg); 4307 __ subcc(len_reg, 32, len_reg);
3910 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); 4308 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
3911 __ delayed()->nop(); 4309 __ delayed()->nop();
3912 __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); 4310 __ ba_short(L_cbcdec_end);
3913 __ delayed()->nop();
3914 4311
3915 __ align(OptoLoopAlignment); 4312 __ align(OptoLoopAlignment);
3916 __ BIND(L_dec_next2_blocks192); 4313 __ BIND(L_dec_next2_blocks192);
3917 __ nop(); 4314 __ nop();
3918 4315
3919 // F48:F50 used for first 16-bytes 4316 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4317 __ andcc(from, 7, G0);
4318 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
4319 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4320
4321 // aligned case: load input into G4, G5, L4 and L5
3920 __ ldx(from,0,G4); 4322 __ ldx(from,0,G4);
3921 __ ldx(from,8,G5); 4323 __ ldx(from,8,G5);
4324 __ ldx(from,16,L4);
4325 __ ldx(from,24,L5);
4326 __ ba_short(L_transform_next2_blocks192);
4327
4328 __ BIND(L_load_misaligned_next2_blocks192);
4329 __ alignaddr(from, G0, from);
4330 // F48, F50, F52, F60, F62 can be clobbered
4331 __ ldf(FloatRegisterImpl::D, from, 0, F48);
4332 __ ldf(FloatRegisterImpl::D, from, 8, F50);
4333 __ ldf(FloatRegisterImpl::D, from, 16, F60);
4334 __ ldf(FloatRegisterImpl::D, from, 24, F62);
4335 __ ldf(FloatRegisterImpl::D, from, 32, F52);
4336 __ faligndata(F48, F50, F48);
4337 __ faligndata(F50, F60, F50);
4338 __ faligndata(F60, F62, F60);
4339 __ faligndata(F62, F52, F62);
4340 __ movdtox(F48, G4);
4341 __ movdtox(F50, G5);
4342 __ movdtox(F60, L4);
4343 __ movdtox(F62, L5);
4344 __ mov(G1, from);
4345
4346 __ BIND(L_transform_next2_blocks192);
4347 // F48:F50 used for first 16-bytes
3922 __ xor3(L2,G4,G1); 4348 __ xor3(L2,G4,G1);
3923 __ movxtod(G1,F48); 4349 __ movxtod(G1,F48);
3924 __ xor3(L3,G5,G1); 4350 __ xor3(L3,G5,G1);
3925 __ movxtod(G1,F50); 4351 __ movxtod(G1,F50);
3926 4352
3927 // F60:F62 used for next 16-bytes 4353 // F60:F62 used for next 16-bytes
3928 __ ldx(from,16,L4);
3929 __ ldx(from,24,L5);
3930 __ xor3(L2,L4,G1); 4354 __ xor3(L2,L4,G1);
3931 __ movxtod(G1,F60); 4355 __ movxtod(G1,F60);
3932 __ xor3(L3,L5,G1); 4356 __ xor3(L3,L5,G1);
3933 __ movxtod(G1,F62); 4357 __ movxtod(G1,F62);
3934 4358
3953 __ movxtod(L0,F54); 4377 __ movxtod(L0,F54);
3954 __ movxtod(L1,F52); 4378 __ movxtod(L1,F52);
3955 __ fxor(FloatRegisterImpl::D, F54, F48, F48); 4379 __ fxor(FloatRegisterImpl::D, F54, F48, F48);
3956 __ fxor(FloatRegisterImpl::D, F52, F50, F50); 4380 __ fxor(FloatRegisterImpl::D, F52, F50, F50);
3957 4381
3958 __ stf(FloatRegisterImpl::D, F48, to, 0);
3959 __ stf(FloatRegisterImpl::D, F50, to, 8);
3960
3961 __ movxtod(G4,F56); 4382 __ movxtod(G4,F56);
3962 __ movxtod(G5,F58); 4383 __ movxtod(G5,F58);
3963 __ mov(L4,L0); 4384 __ mov(L4,L0);
3964 __ mov(L5,L1); 4385 __ mov(L5,L1);
3965 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4386 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
3966 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4387 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
3967 4388
4389 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4390 __ andcc(to, 7, G1);
4391 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
4392 __ delayed()->edge8n(to, G0, G2);
4393
4394 // aligned case: store output into the destination array
4395 __ stf(FloatRegisterImpl::D, F48, to, 0);
4396 __ stf(FloatRegisterImpl::D, F50, to, 8);
3968 __ stf(FloatRegisterImpl::D, F60, to, 16); 4397 __ stf(FloatRegisterImpl::D, F60, to, 16);
3969 __ stf(FloatRegisterImpl::D, F62, to, 24); 4398 __ stf(FloatRegisterImpl::D, F62, to, 24);
3970 4399 __ ba_short(L_check_decrypt_loop_end192);
4400
4401 __ BIND(L_store_misaligned_output_next2_blocks192);
4402 __ mov(8, G4);
4403 __ sub(G4, G1, G4);
4404 __ alignaddr(G4, G0, G4);
4405 __ faligndata(F48, F50, F56); // F56 can be clobbered
4406 __ faligndata(F50, F60, F50);
4407 __ faligndata(F60, F62, F60);
4408 __ faligndata(F62, F48, F48);
4409 __ mov(to, G1);
4410 __ and3(to, -8, to);
4411 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4412 __ stf(FloatRegisterImpl::D, F56, to, 8);
4413 __ stf(FloatRegisterImpl::D, F50, to, 16);
4414 __ stf(FloatRegisterImpl::D, F60, to, 24);
4415 __ add(to, 32, to);
4416 __ orn(G0, G2, G2);
4417 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4418 __ mov(G1, to);
4419
4420 __ BIND(L_check_decrypt_loop_end192);
3971 __ add(from, 32, from); 4421 __ add(from, 32, from);
3972 __ add(to, 32, to); 4422 __ add(to, 32, to);
3973 __ subcc(len_reg, 32, len_reg); 4423 __ subcc(len_reg, 32, len_reg);
3974 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); 4424 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
3975 __ delayed()->nop(); 4425 __ delayed()->nop();
3976 __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); 4426 __ ba_short(L_cbcdec_end);
3977 __ delayed()->nop();
3978 4427
3979 __ align(OptoLoopAlignment); 4428 __ align(OptoLoopAlignment);
3980 __ BIND(L_dec_next2_blocks256); 4429 __ BIND(L_dec_next2_blocks256);
3981 __ nop(); 4430 __ nop();
3982 4431
3983 // F0:F2 used for first 16-bytes 4432 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4433 __ andcc(from, 7, G0);
4434 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
4435 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4436
4437 // aligned case: load input into G4, G5, L4 and L5
3984 __ ldx(from,0,G4); 4438 __ ldx(from,0,G4);
3985 __ ldx(from,8,G5); 4439 __ ldx(from,8,G5);
4440 __ ldx(from,16,L4);
4441 __ ldx(from,24,L5);
4442 __ ba_short(L_transform_next2_blocks256);
4443
4444 __ BIND(L_load_misaligned_next2_blocks256);
4445 __ alignaddr(from, G0, from);
4446 // F0, F2, F4, F60, F62 can be clobbered
4447 __ ldf(FloatRegisterImpl::D, from, 0, F0);
4448 __ ldf(FloatRegisterImpl::D, from, 8, F2);
4449 __ ldf(FloatRegisterImpl::D, from, 16, F60);
4450 __ ldf(FloatRegisterImpl::D, from, 24, F62);
4451 __ ldf(FloatRegisterImpl::D, from, 32, F4);
4452 __ faligndata(F0, F2, F0);
4453 __ faligndata(F2, F60, F2);
4454 __ faligndata(F60, F62, F60);
4455 __ faligndata(F62, F4, F62);
4456 __ movdtox(F0, G4);
4457 __ movdtox(F2, G5);
4458 __ movdtox(F60, L4);
4459 __ movdtox(F62, L5);
4460 __ mov(G1, from);
4461
4462 __ BIND(L_transform_next2_blocks256);
4463 // F0:F2 used for first 16-bytes
3986 __ xor3(L2,G4,G1); 4464 __ xor3(L2,G4,G1);
3987 __ movxtod(G1,F0); 4465 __ movxtod(G1,F0);
3988 __ xor3(L3,G5,G1); 4466 __ xor3(L3,G5,G1);
3989 __ movxtod(G1,F2); 4467 __ movxtod(G1,F2);
3990 4468
3991 // F60:F62 used for next 16-bytes 4469 // F60:F62 used for next 16-bytes
3992 __ ldx(from,16,L4);
3993 __ ldx(from,24,L5);
3994 __ xor3(L2,L4,G1); 4470 __ xor3(L2,L4,G1);
3995 __ movxtod(G1,F60); 4471 __ movxtod(G1,F60);
3996 __ xor3(L3,L5,G1); 4472 __ xor3(L3,L5,G1);
3997 __ movxtod(G1,F62); 4473 __ movxtod(G1,F62);
3998 4474
4041 __ movxtod(L0,F6); 4517 __ movxtod(L0,F6);
4042 __ movxtod(L1,F4); 4518 __ movxtod(L1,F4);
4043 __ fxor(FloatRegisterImpl::D, F6, F0, F0); 4519 __ fxor(FloatRegisterImpl::D, F6, F0, F0);
4044 __ fxor(FloatRegisterImpl::D, F4, F2, F2); 4520 __ fxor(FloatRegisterImpl::D, F4, F2, F2);
4045 4521
4046 __ stf(FloatRegisterImpl::D, F0, to, 0);
4047 __ stf(FloatRegisterImpl::D, F2, to, 8);
4048
4049 __ movxtod(G4,F56); 4522 __ movxtod(G4,F56);
4050 __ movxtod(G5,F58); 4523 __ movxtod(G5,F58);
4051 __ mov(L4,L0); 4524 __ mov(L4,L0);
4052 __ mov(L5,L1); 4525 __ mov(L5,L1);
4053 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4526 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4054 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4527 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4055 4528
4529 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4530 __ andcc(to, 7, G1);
4531 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
4532 __ delayed()->edge8n(to, G0, G2);
4533
4534 // aligned case: store output into the destination array
4535 __ stf(FloatRegisterImpl::D, F0, to, 0);
4536 __ stf(FloatRegisterImpl::D, F2, to, 8);
4056 __ stf(FloatRegisterImpl::D, F60, to, 16); 4537 __ stf(FloatRegisterImpl::D, F60, to, 16);
4057 __ stf(FloatRegisterImpl::D, F62, to, 24); 4538 __ stf(FloatRegisterImpl::D, F62, to, 24);
4058 4539 __ ba_short(L_check_decrypt_loop_end256);
4540
4541 __ BIND(L_store_misaligned_output_next2_blocks256);
4542 __ mov(8, G4);
4543 __ sub(G4, G1, G4);
4544 __ alignaddr(G4, G0, G4);
4545 __ faligndata(F0, F2, F56); // F56 can be clobbered
4546 __ faligndata(F2, F60, F2);
4547 __ faligndata(F60, F62, F60);
4548 __ faligndata(F62, F0, F0);
4549 __ mov(to, G1);
4550 __ and3(to, -8, to);
4551 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4552 __ stf(FloatRegisterImpl::D, F56, to, 8);
4553 __ stf(FloatRegisterImpl::D, F2, to, 16);
4554 __ stf(FloatRegisterImpl::D, F60, to, 24);
4555 __ add(to, 32, to);
4556 __ orn(G0, G2, G2);
4557 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4558 __ mov(G1, to);
4559
4560 __ BIND(L_check_decrypt_loop_end256);
4059 __ add(from, 32, from); 4561 __ add(from, 32, from);
4060 __ add(to, 32, to); 4562 __ add(to, 32, to);
4061 __ subcc(len_reg, 32, len_reg); 4563 __ subcc(len_reg, 32, len_reg);
4062 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256); 4564 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
4063 __ delayed()->nop(); 4565 __ delayed()->nop();
4064 4566
4065 __ BIND(L_cbcdec_end); 4567 __ BIND(L_cbcdec_end);
4568 // re-init initial vector for next block, 8-byte alignment is guaranteed
4066 __ stx(L0, rvec, 0); 4569 __ stx(L0, rvec, 0);
4067 __ stx(L1, rvec, 8); 4570 __ stx(L1, rvec, 8);
4068 __ restore(); 4571 __ restore();
4069 __ mov(L0, O0); 4572 __ mov(L0, O0);
4070 __ retl(); 4573 __ retl();