Mercurial > hg > graal-compiler
comparison src/cpu/sparc/vm/stubGenerator_sparc.cpp @ 17910:03214612e77e
8035936: SIGBUS in StubRoutines::aesencryptBlock, solaris-sparc
Summary: Fix the arbitrary alignment issue in SPARC AES crypto stub routines.
Reviewed-by: kvn, iveresov
Contributed-by: shrinivas.joshi@oracle.com
author | kvn |
---|---|
date | Wed, 30 Apr 2014 14:14:01 -0700 |
parents | 04d32e7fad07 |
children | 0fb5b60ab4a2 |
comparison
equal
deleted
inserted
replaced
17909:85d6efcb1fa3 | 17910:03214612e77e |
---|---|
1 /* | 1 /* |
2 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. | 2 * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. |
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 * | 4 * |
5 * This code is free software; you can redistribute it and/or modify it | 5 * This code is free software; you can redistribute it and/or modify it |
6 * under the terms of the GNU General Public License version 2 only, as | 6 * under the terms of the GNU General Public License version 2 only, as |
7 * published by the Free Software Foundation. | 7 * published by the Free Software Foundation. |
3303 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words"); | 3303 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words"); |
3304 } | 3304 } |
3305 } | 3305 } |
3306 | 3306 |
3307 address generate_aescrypt_encryptBlock() { | 3307 address generate_aescrypt_encryptBlock() { |
3308 // required since we read expanded key 'int' array starting first element without alignment considerations | |
3309 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, | |
3310 "the following code assumes that first element of an int array is aligned to 8 bytes"); | |
3308 __ align(CodeEntryAlignment); | 3311 __ align(CodeEntryAlignment); |
3309 StubCodeMark mark(this, "StubRoutines", "aesencryptBlock"); | 3312 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); |
3310 Label L_doLast128bit, L_storeOutput; | 3313 Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output; |
3311 address start = __ pc(); | 3314 address start = __ pc(); |
3312 Register from = O0; // source byte array | 3315 Register from = O0; // source byte array |
3313 Register to = O1; // destination byte array | 3316 Register to = O1; // destination byte array |
3314 Register key = O2; // expanded key array | 3317 Register key = O2; // expanded key array |
3315 const Register keylen = O4; //reg for storing expanded key array length | 3318 const Register keylen = O4; //reg for storing expanded key array length |
3316 | 3319 |
3317 // read expanded key length | 3320 // read expanded key length |
3318 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); | 3321 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); |
3319 | 3322 |
3320 // load input into F54-F56; F30-F31 used as temp | 3323 // Method to address arbitrary alignment for load instructions: |
3321 __ ldf(FloatRegisterImpl::S, from, 0, F30); | 3324 // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary |
3322 __ ldf(FloatRegisterImpl::S, from, 4, F31); | 3325 // If zero/aligned then continue with double FP load instructions |
3323 __ fmov(FloatRegisterImpl::D, F30, F54); | 3326 // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata |
3324 __ ldf(FloatRegisterImpl::S, from, 8, F30); | 3327 // alignaddr will also convert arbitrarily aligned 'from' address to nearest 8-byte aligned address |
3325 __ ldf(FloatRegisterImpl::S, from, 12, F31); | 3328 // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address |
3326 __ fmov(FloatRegisterImpl::D, F30, F56); | 3329 // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs |
3327 | 3330 |
3328 // load expanded key | 3331 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero |
3332 __ andcc(from, 7, G0); | |
3333 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); | |
3334 __ delayed()->alignaddr(from, G0, from); | |
3335 | |
3336 // aligned case: load input into F54-F56 | |
3337 __ ldf(FloatRegisterImpl::D, from, 0, F54); | |
3338 __ ldf(FloatRegisterImpl::D, from, 8, F56); | |
3339 __ ba_short(L_load_expanded_key); | |
3340 | |
3341 __ BIND(L_load_misaligned_input); | |
3342 __ ldf(FloatRegisterImpl::D, from, 0, F54); | |
3343 __ ldf(FloatRegisterImpl::D, from, 8, F56); | |
3344 __ ldf(FloatRegisterImpl::D, from, 16, F58); | |
3345 __ faligndata(F54, F56, F54); | |
3346 __ faligndata(F56, F58, F56); | |
3347 | |
3348 __ BIND(L_load_expanded_key); | |
3349 // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed | |
3329 for ( int i = 0; i <= 38; i += 2 ) { | 3350 for ( int i = 0; i <= 38; i += 2 ) { |
3330 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); | 3351 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); |
3331 } | 3352 } |
3332 | 3353 |
3333 // perform cipher transformation | 3354 // perform cipher transformation |
3363 __ ldf(FloatRegisterImpl::D, key, 216, F46); | 3384 __ ldf(FloatRegisterImpl::D, key, 216, F46); |
3364 __ ldf(FloatRegisterImpl::D, key, 224, F48); | 3385 __ ldf(FloatRegisterImpl::D, key, 224, F48); |
3365 __ ldf(FloatRegisterImpl::D, key, 232, F50); | 3386 __ ldf(FloatRegisterImpl::D, key, 232, F50); |
3366 __ aes_eround01(F52, F54, F56, F58); //round 13 | 3387 __ aes_eround01(F52, F54, F56, F58); //round 13 |
3367 __ aes_eround23(F46, F54, F56, F60); | 3388 __ aes_eround23(F46, F54, F56, F60); |
3368 __ br(Assembler::always, false, Assembler::pt, L_storeOutput); | 3389 __ ba_short(L_storeOutput); |
3369 __ delayed()->nop(); | |
3370 | 3390 |
3371 __ BIND(L_doLast128bit); | 3391 __ BIND(L_doLast128bit); |
3372 __ ldf(FloatRegisterImpl::D, key, 160, F48); | 3392 __ ldf(FloatRegisterImpl::D, key, 160, F48); |
3373 __ ldf(FloatRegisterImpl::D, key, 168, F50); | 3393 __ ldf(FloatRegisterImpl::D, key, 168, F50); |
3374 | 3394 |
3375 __ BIND(L_storeOutput); | 3395 __ BIND(L_storeOutput); |
3376 // perform last round of encryption common for all key sizes | 3396 // perform last round of encryption common for all key sizes |
3377 __ aes_eround01_l(F48, F58, F60, F54); //last round | 3397 __ aes_eround01_l(F48, F58, F60, F54); //last round |
3378 __ aes_eround23_l(F50, F58, F60, F56); | 3398 __ aes_eround23_l(F50, F58, F60, F56); |
3379 | 3399 |
3380 // store output into the destination array, F0-F1 used as temp | 3400 // Method to address arbitrary alignment for store instructions: |
3381 __ fmov(FloatRegisterImpl::D, F54, F0); | 3401 // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary |
3382 __ stf(FloatRegisterImpl::S, F0, to, 0); | 3402 // If zero/aligned then continue with double FP store instructions |
3383 __ stf(FloatRegisterImpl::S, F1, to, 4); | 3403 // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case) |
3384 __ fmov(FloatRegisterImpl::D, F56, F0); | 3404 // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001 |
3385 __ stf(FloatRegisterImpl::S, F0, to, 8); | 3405 // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case |
3406 // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case. | |
3407 // Set GSR.align to (8-n) using alignaddr | |
3408 // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf | |
3409 // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address | |
3410 // Store (partial) the original first (8-n) bytes starting at the original 'dest' address | |
3411 // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address | |
3412 // We need to execute this process for both the 8-byte result values | |
3413 | |
3414 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero | |
3415 __ andcc(to, 7, O5); | |
3416 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); | |
3417 __ delayed()->edge8n(to, G0, O3); | |
3418 | |
3419 // aligned case: store output into the destination array | |
3420 __ stf(FloatRegisterImpl::D, F54, to, 0); | |
3386 __ retl(); | 3421 __ retl(); |
3387 __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); | 3422 __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8); |
3423 | |
3424 __ BIND(L_store_misaligned_output); | |
3425 __ add(to, 8, O4); | |
3426 __ mov(8, O2); | |
3427 __ sub(O2, O5, O2); | |
3428 __ alignaddr(O2, G0, O2); | |
3429 __ faligndata(F54, F54, F54); | |
3430 __ faligndata(F56, F56, F56); | |
3431 __ and3(to, -8, to); | |
3432 __ and3(O4, -8, O4); | |
3433 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); | |
3434 __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); | |
3435 __ add(to, 8, to); | |
3436 __ add(O4, 8, O4); | |
3437 __ orn(G0, O3, O3); | |
3438 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); | |
3439 __ retl(); | |
3440 __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); | |
3388 | 3441 |
3389 return start; | 3442 return start; |
3390 } | 3443 } |
3391 | 3444 |
3392 address generate_aescrypt_decryptBlock() { | 3445 address generate_aescrypt_decryptBlock() { |
3446 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, | |
3447 "the following code assumes that first element of an int array is aligned to 8 bytes"); | |
3448 // required since we read original key 'byte' array as well in the decryption stubs | |
3449 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, | |
3450 "the following code assumes that first element of a byte array is aligned to 8 bytes"); | |
3393 __ align(CodeEntryAlignment); | 3451 __ align(CodeEntryAlignment); |
3394 StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock"); | 3452 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); |
3395 address start = __ pc(); | 3453 address start = __ pc(); |
3396 Label L_expand192bit, L_expand256bit, L_common_transform; | 3454 Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input; |
3455 Label L_256bit_transform, L_common_transform, L_store_misaligned_output; | |
3397 Register from = O0; // source byte array | 3456 Register from = O0; // source byte array |
3398 Register to = O1; // destination byte array | 3457 Register to = O1; // destination byte array |
3399 Register key = O2; // expanded key array | 3458 Register key = O2; // expanded key array |
3400 Register original_key = O3; // original key array only required during decryption | 3459 Register original_key = O3; // original key array only required during decryption |
3401 const Register keylen = O4; // reg for storing expanded key array length | 3460 const Register keylen = O4; // reg for storing expanded key array length |
3402 | 3461 |
3403 // read expanded key array length | 3462 // read expanded key array length |
3404 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); | 3463 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); |
3405 | 3464 |
3406 // load input into F52-F54; F30,F31 used as temp | 3465 // save 'from' since we may need to recheck alignment in case of 256-bit decryption |
3407 __ ldf(FloatRegisterImpl::S, from, 0, F30); | 3466 __ mov(from, G1); |
3408 __ ldf(FloatRegisterImpl::S, from, 4, F31); | 3467 |
3409 __ fmov(FloatRegisterImpl::D, F30, F52); | 3468 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero |
3410 __ ldf(FloatRegisterImpl::S, from, 8, F30); | 3469 __ andcc(from, 7, G0); |
3411 __ ldf(FloatRegisterImpl::S, from, 12, F31); | 3470 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); |
3412 __ fmov(FloatRegisterImpl::D, F30, F54); | 3471 __ delayed()->alignaddr(from, G0, from); |
3413 | 3472 |
3473 // aligned case: load input into F52-F54 | |
3474 __ ldf(FloatRegisterImpl::D, from, 0, F52); | |
3475 __ ldf(FloatRegisterImpl::D, from, 8, F54); | |
3476 __ ba_short(L_load_original_key); | |
3477 | |
3478 __ BIND(L_load_misaligned_input); | |
3479 __ ldf(FloatRegisterImpl::D, from, 0, F52); | |
3480 __ ldf(FloatRegisterImpl::D, from, 8, F54); | |
3481 __ ldf(FloatRegisterImpl::D, from, 16, F56); | |
3482 __ faligndata(F52, F54, F52); | |
3483 __ faligndata(F54, F56, F54); | |
3484 | |
3485 __ BIND(L_load_original_key); | |
3414 // load original key from SunJCE expanded decryption key | 3486 // load original key from SunJCE expanded decryption key |
3487 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed | |
3415 for ( int i = 0; i <= 3; i++ ) { | 3488 for ( int i = 0; i <= 3; i++ ) { |
3416 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); | 3489 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); |
3417 } | 3490 } |
3418 | 3491 |
3419 // 256-bit original key size | 3492 // 256-bit original key size |
3430 } | 3503 } |
3431 | 3504 |
3432 // perform 128-bit key specific inverse cipher transformation | 3505 // perform 128-bit key specific inverse cipher transformation |
3433 __ fxor(FloatRegisterImpl::D, F42, F54, F54); | 3506 __ fxor(FloatRegisterImpl::D, F42, F54, F54); |
3434 __ fxor(FloatRegisterImpl::D, F40, F52, F52); | 3507 __ fxor(FloatRegisterImpl::D, F40, F52, F52); |
3435 __ br(Assembler::always, false, Assembler::pt, L_common_transform); | 3508 __ ba_short(L_common_transform); |
3436 __ delayed()->nop(); | |
3437 | 3509 |
3438 __ BIND(L_expand192bit); | 3510 __ BIND(L_expand192bit); |
3439 | 3511 |
3440 // start loading rest of the 192-bit key | 3512 // start loading rest of the 192-bit key |
3441 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); | 3513 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); |
3455 __ fxor(FloatRegisterImpl::D, F48, F52, F52); | 3527 __ fxor(FloatRegisterImpl::D, F48, F52, F52); |
3456 __ aes_dround23(F46, F52, F54, F58); | 3528 __ aes_dround23(F46, F52, F54, F58); |
3457 __ aes_dround01(F44, F52, F54, F56); | 3529 __ aes_dround01(F44, F52, F54, F56); |
3458 __ aes_dround23(F42, F56, F58, F54); | 3530 __ aes_dround23(F42, F56, F58, F54); |
3459 __ aes_dround01(F40, F56, F58, F52); | 3531 __ aes_dround01(F40, F56, F58, F52); |
3460 __ br(Assembler::always, false, Assembler::pt, L_common_transform); | 3532 __ ba_short(L_common_transform); |
3461 __ delayed()->nop(); | |
3462 | 3533 |
3463 __ BIND(L_expand256bit); | 3534 __ BIND(L_expand256bit); |
3464 | 3535 |
3465 // load rest of the 256-bit key | 3536 // load rest of the 256-bit key |
3466 for ( int i = 4; i <= 7; i++ ) { | 3537 for ( int i = 4; i <= 7; i++ ) { |
3476 } | 3547 } |
3477 __ aes_kexpand1(F48, F54, 6, F56); | 3548 __ aes_kexpand1(F48, F54, 6, F56); |
3478 __ aes_kexpand2(F50, F56, F58); | 3549 __ aes_kexpand2(F50, F56, F58); |
3479 | 3550 |
3480 for ( int i = 0; i <= 6; i += 2 ) { | 3551 for ( int i = 0; i <= 6; i += 2 ) { |
3481 __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); | 3552 __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); |
3482 } | 3553 } |
3483 | 3554 |
3484 // load input into F52-F54 | 3555 // reload original 'from' address |
3556 __ mov(G1, from); | |
3557 | |
3558 // re-check 8-byte alignment | |
3559 __ andcc(from, 7, G0); | |
3560 __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input); | |
3561 __ delayed()->alignaddr(from, G0, from); | |
3562 | |
3563 // aligned case: load input into F52-F54 | |
3485 __ ldf(FloatRegisterImpl::D, from, 0, F52); | 3564 __ ldf(FloatRegisterImpl::D, from, 0, F52); |
3486 __ ldf(FloatRegisterImpl::D, from, 8, F54); | 3565 __ ldf(FloatRegisterImpl::D, from, 8, F54); |
3566 __ ba_short(L_256bit_transform); | |
3567 | |
3568 __ BIND(L_reload_misaligned_input); | |
3569 __ ldf(FloatRegisterImpl::D, from, 0, F52); | |
3570 __ ldf(FloatRegisterImpl::D, from, 8, F54); | |
3571 __ ldf(FloatRegisterImpl::D, from, 16, F56); | |
3572 __ faligndata(F52, F54, F52); | |
3573 __ faligndata(F54, F56, F54); | |
3487 | 3574 |
3488 // perform 256-bit key specific inverse cipher transformation | 3575 // perform 256-bit key specific inverse cipher transformation |
3576 __ BIND(L_256bit_transform); | |
3489 __ fxor(FloatRegisterImpl::D, F0, F54, F54); | 3577 __ fxor(FloatRegisterImpl::D, F0, F54, F54); |
3490 __ fxor(FloatRegisterImpl::D, F2, F52, F52); | 3578 __ fxor(FloatRegisterImpl::D, F2, F52, F52); |
3491 __ aes_dround23(F4, F52, F54, F58); | 3579 __ aes_dround23(F4, F52, F54, F58); |
3492 __ aes_dround01(F6, F52, F54, F56); | 3580 __ aes_dround01(F6, F52, F54, F56); |
3493 __ aes_dround23(F50, F56, F58, F54); | 3581 __ aes_dround23(F50, F56, F58, F54); |
3513 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54); | 3601 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54); |
3514 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52); | 3602 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52); |
3515 } | 3603 } |
3516 } | 3604 } |
3517 | 3605 |
3518 // store output to destination array, F0-F1 used as temp | 3606 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero |
3519 __ fmov(FloatRegisterImpl::D, F52, F0); | 3607 __ andcc(to, 7, O5); |
3520 __ stf(FloatRegisterImpl::S, F0, to, 0); | 3608 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); |
3521 __ stf(FloatRegisterImpl::S, F1, to, 4); | 3609 __ delayed()->edge8n(to, G0, O3); |
3522 __ fmov(FloatRegisterImpl::D, F54, F0); | 3610 |
3523 __ stf(FloatRegisterImpl::S, F0, to, 8); | 3611 // aligned case: store output into the destination array |
3612 __ stf(FloatRegisterImpl::D, F52, to, 0); | |
3524 __ retl(); | 3613 __ retl(); |
3525 __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); | 3614 __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8); |
3615 | |
3616 __ BIND(L_store_misaligned_output); | |
3617 __ add(to, 8, O4); | |
3618 __ mov(8, O2); | |
3619 __ sub(O2, O5, O2); | |
3620 __ alignaddr(O2, G0, O2); | |
3621 __ faligndata(F52, F52, F52); | |
3622 __ faligndata(F54, F54, F54); | |
3623 __ and3(to, -8, to); | |
3624 __ and3(O4, -8, O4); | |
3625 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); | |
3626 __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); | |
3627 __ add(to, 8, to); | |
3628 __ add(O4, 8, O4); | |
3629 __ orn(G0, O3, O3); | |
3630 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); | |
3631 __ retl(); | |
3632 __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); | |
3526 | 3633 |
3527 return start; | 3634 return start; |
3528 } | 3635 } |
3529 | 3636 |
3530 address generate_cipherBlockChaining_encryptAESCrypt() { | 3637 address generate_cipherBlockChaining_encryptAESCrypt() { |
3638 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, | |
3639 "the following code assumes that first element of an int array is aligned to 8 bytes"); | |
3640 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, | |
3641 "the following code assumes that first element of a byte array is aligned to 8 bytes"); | |
3531 __ align(CodeEntryAlignment); | 3642 __ align(CodeEntryAlignment); |
3532 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); | 3643 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); |
3533 Label L_cbcenc128, L_cbcenc192, L_cbcenc256; | 3644 Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit; |
3645 Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform; | |
3646 Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit; | |
3647 Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit; | |
3534 address start = __ pc(); | 3648 address start = __ pc(); |
3535 Register from = O0; // source byte array | 3649 Register from = I0; // source byte array |
3536 Register to = O1; // destination byte array | 3650 Register to = I1; // destination byte array |
3537 Register key = O2; // expanded key array | 3651 Register key = I2; // expanded key array |
3538 Register rvec = O3; // init vector | 3652 Register rvec = I3; // init vector |
3539 const Register len_reg = O4; // cipher length | 3653 const Register len_reg = I4; // cipher length |
3540 const Register keylen = O5; // reg for storing expanded key array length | 3654 const Register keylen = I5; // reg for storing expanded key array length |
3541 | 3655 |
3542 // save cipher len to return in the end | 3656 // save cipher len before save_frame, to return in the end |
3543 __ mov(len_reg, L1); | 3657 __ mov(O4, L0); |
3658 __ save_frame(0); | |
3544 | 3659 |
3545 // read expanded key length | 3660 // read expanded key length |
3546 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); | 3661 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); |
3547 | 3662 |
3548 // load init vector | 3663 // load initial vector, 8-byte alignment is guaranteed |
3549 __ ldf(FloatRegisterImpl::D, rvec, 0, F60); | 3664 __ ldf(FloatRegisterImpl::D, rvec, 0, F60); |
3550 __ ldf(FloatRegisterImpl::D, rvec, 8, F62); | 3665 __ ldf(FloatRegisterImpl::D, rvec, 8, F62); |
3666 // load key, 8-byte alignment is guaranteed | |
3551 __ ldx(key,0,G1); | 3667 __ ldx(key,0,G1); |
3552 __ ldx(key,8,G2); | 3668 __ ldx(key,8,G5); |
3553 | 3669 |
3554 // start loading expanded key | 3670 // start loading expanded key, 8-byte alignment is guaranteed |
3555 for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) { | 3671 for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) { |
3556 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); | 3672 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); |
3557 } | 3673 } |
3558 | 3674 |
3559 // 128-bit original key size | 3675 // 128-bit original key size |
3569 for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) { | 3685 for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) { |
3570 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); | 3686 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); |
3571 } | 3687 } |
3572 | 3688 |
3573 // 256-bit original key size | 3689 // 256-bit original key size |
3574 __ br(Assembler::always, false, Assembler::pt, L_cbcenc256); | 3690 __ ba_short(L_cbcenc256); |
3575 __ delayed()->nop(); | |
3576 | 3691 |
3577 __ align(OptoLoopAlignment); | 3692 __ align(OptoLoopAlignment); |
3578 __ BIND(L_cbcenc128); | 3693 __ BIND(L_cbcenc128); |
3694 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero | |
3695 __ andcc(from, 7, G0); | |
3696 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit); | |
3697 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr | |
3698 | |
3699 // aligned case: load input into G3 and G4 | |
3579 __ ldx(from,0,G3); | 3700 __ ldx(from,0,G3); |
3580 __ ldx(from,8,G4); | 3701 __ ldx(from,8,G4); |
3702 __ ba_short(L_128bit_transform); | |
3703 | |
3704 __ BIND(L_load_misaligned_input_128bit); | |
3705 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption | |
3706 __ alignaddr(from, G0, from); | |
3707 __ ldf(FloatRegisterImpl::D, from, 0, F48); | |
3708 __ ldf(FloatRegisterImpl::D, from, 8, F50); | |
3709 __ ldf(FloatRegisterImpl::D, from, 16, F52); | |
3710 __ faligndata(F48, F50, F48); | |
3711 __ faligndata(F50, F52, F50); | |
3712 __ movdtox(F48, G3); | |
3713 __ movdtox(F50, G4); | |
3714 __ mov(L1, from); | |
3715 | |
3716 __ BIND(L_128bit_transform); | |
3581 __ xor3(G1,G3,G3); | 3717 __ xor3(G1,G3,G3); |
3582 __ xor3(G2,G4,G4); | 3718 __ xor3(G5,G4,G4); |
3583 __ movxtod(G3,F56); | 3719 __ movxtod(G3,F56); |
3584 __ movxtod(G4,F58); | 3720 __ movxtod(G4,F58); |
3585 __ fxor(FloatRegisterImpl::D, F60, F56, F60); | 3721 __ fxor(FloatRegisterImpl::D, F60, F56, F60); |
3586 __ fxor(FloatRegisterImpl::D, F62, F58, F62); | 3722 __ fxor(FloatRegisterImpl::D, F62, F58, F62); |
3587 | 3723 |
3596 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); | 3732 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); |
3597 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); | 3733 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); |
3598 } | 3734 } |
3599 } | 3735 } |
3600 | 3736 |
3737 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero | |
3738 __ andcc(to, 7, L1); | |
3739 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit); | |
3740 __ delayed()->edge8n(to, G0, L2); | |
3741 | |
3742 // aligned case: store output into the destination array | |
3601 __ stf(FloatRegisterImpl::D, F60, to, 0); | 3743 __ stf(FloatRegisterImpl::D, F60, to, 0); |
3602 __ stf(FloatRegisterImpl::D, F62, to, 8); | 3744 __ stf(FloatRegisterImpl::D, F62, to, 8); |
3745 __ ba_short(L_check_loop_end_128bit); | |
3746 | |
3747 __ BIND(L_store_misaligned_output_128bit); | |
3748 __ add(to, 8, L3); | |
3749 __ mov(8, L4); | |
3750 __ sub(L4, L1, L4); | |
3751 __ alignaddr(L4, G0, L4); | |
3752 // save cipher text before circular right shift | |
3753 // as it needs to be stored as iv for next block (see code before next retl) | |
3754 __ movdtox(F60, L6); | |
3755 __ movdtox(F62, L7); | |
3756 __ faligndata(F60, F60, F60); | |
3757 __ faligndata(F62, F62, F62); | |
3758 __ mov(to, L5); | |
3759 __ and3(to, -8, to); | |
3760 __ and3(L3, -8, L3); | |
3761 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); | |
3762 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); | |
3763 __ add(to, 8, to); | |
3764 __ add(L3, 8, L3); | |
3765 __ orn(G0, L2, L2); | |
3766 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); | |
3767 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); | |
3768 __ mov(L5, to); | |
3769 __ movxtod(L6, F60); | |
3770 __ movxtod(L7, F62); | |
3771 | |
3772 __ BIND(L_check_loop_end_128bit); | |
3603 __ add(from, 16, from); | 3773 __ add(from, 16, from); |
3604 __ add(to, 16, to); | 3774 __ add(to, 16, to); |
3605 __ subcc(len_reg, 16, len_reg); | 3775 __ subcc(len_reg, 16, len_reg); |
3606 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128); | 3776 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128); |
3607 __ delayed()->nop(); | 3777 __ delayed()->nop(); |
3778 // re-init initial vector for next block, 8-byte alignment is guaranteed | |
3608 __ stf(FloatRegisterImpl::D, F60, rvec, 0); | 3779 __ stf(FloatRegisterImpl::D, F60, rvec, 0); |
3609 __ stf(FloatRegisterImpl::D, F62, rvec, 8); | 3780 __ stf(FloatRegisterImpl::D, F62, rvec, 8); |
3781 __ restore(); | |
3610 __ retl(); | 3782 __ retl(); |
3611 __ delayed()->mov(L1, O0); | 3783 __ delayed()->mov(L0, O0); |
3612 | 3784 |
3613 __ align(OptoLoopAlignment); | 3785 __ align(OptoLoopAlignment); |
3614 __ BIND(L_cbcenc192); | 3786 __ BIND(L_cbcenc192); |
3787 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero | |
3788 __ andcc(from, 7, G0); | |
3789 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit); | |
3790 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr | |
3791 | |
3792 // aligned case: load input into G3 and G4 | |
3615 __ ldx(from,0,G3); | 3793 __ ldx(from,0,G3); |
3616 __ ldx(from,8,G4); | 3794 __ ldx(from,8,G4); |
3795 __ ba_short(L_192bit_transform); | |
3796 | |
3797 __ BIND(L_load_misaligned_input_192bit); | |
3798 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption | |
3799 __ alignaddr(from, G0, from); | |
3800 __ ldf(FloatRegisterImpl::D, from, 0, F48); | |
3801 __ ldf(FloatRegisterImpl::D, from, 8, F50); | |
3802 __ ldf(FloatRegisterImpl::D, from, 16, F52); | |
3803 __ faligndata(F48, F50, F48); | |
3804 __ faligndata(F50, F52, F50); | |
3805 __ movdtox(F48, G3); | |
3806 __ movdtox(F50, G4); | |
3807 __ mov(L1, from); | |
3808 | |
3809 __ BIND(L_192bit_transform); | |
3617 __ xor3(G1,G3,G3); | 3810 __ xor3(G1,G3,G3); |
3618 __ xor3(G2,G4,G4); | 3811 __ xor3(G5,G4,G4); |
3619 __ movxtod(G3,F56); | 3812 __ movxtod(G3,F56); |
3620 __ movxtod(G4,F58); | 3813 __ movxtod(G4,F58); |
3621 __ fxor(FloatRegisterImpl::D, F60, F56, F60); | 3814 __ fxor(FloatRegisterImpl::D, F60, F56, F60); |
3622 __ fxor(FloatRegisterImpl::D, F62, F58, F62); | 3815 __ fxor(FloatRegisterImpl::D, F62, F58, F62); |
3623 | 3816 |
3632 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); | 3825 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); |
3633 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); | 3826 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); |
3634 } | 3827 } |
3635 } | 3828 } |
3636 | 3829 |
3830 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero | |
3831 __ andcc(to, 7, L1); | |
3832 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit); | |
3833 __ delayed()->edge8n(to, G0, L2); | |
3834 | |
3835 // aligned case: store output into the destination array | |
3637 __ stf(FloatRegisterImpl::D, F60, to, 0); | 3836 __ stf(FloatRegisterImpl::D, F60, to, 0); |
3638 __ stf(FloatRegisterImpl::D, F62, to, 8); | 3837 __ stf(FloatRegisterImpl::D, F62, to, 8); |
3838 __ ba_short(L_check_loop_end_192bit); | |
3839 | |
3840 __ BIND(L_store_misaligned_output_192bit); | |
3841 __ add(to, 8, L3); | |
3842 __ mov(8, L4); | |
3843 __ sub(L4, L1, L4); | |
3844 __ alignaddr(L4, G0, L4); | |
3845 __ movdtox(F60, L6); | |
3846 __ movdtox(F62, L7); | |
3847 __ faligndata(F60, F60, F60); | |
3848 __ faligndata(F62, F62, F62); | |
3849 __ mov(to, L5); | |
3850 __ and3(to, -8, to); | |
3851 __ and3(L3, -8, L3); | |
3852 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); | |
3853 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); | |
3854 __ add(to, 8, to); | |
3855 __ add(L3, 8, L3); | |
3856 __ orn(G0, L2, L2); | |
3857 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); | |
3858 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); | |
3859 __ mov(L5, to); | |
3860 __ movxtod(L6, F60); | |
3861 __ movxtod(L7, F62); | |
3862 | |
3863 __ BIND(L_check_loop_end_192bit); | |
3639 __ add(from, 16, from); | 3864 __ add(from, 16, from); |
3640 __ subcc(len_reg, 16, len_reg); | 3865 __ subcc(len_reg, 16, len_reg); |
3641 __ add(to, 16, to); | 3866 __ add(to, 16, to); |
3642 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192); | 3867 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192); |
3643 __ delayed()->nop(); | 3868 __ delayed()->nop(); |
3869 // re-init initial vector for next block, 8-byte alignment is guaranteed | |
3644 __ stf(FloatRegisterImpl::D, F60, rvec, 0); | 3870 __ stf(FloatRegisterImpl::D, F60, rvec, 0); |
3645 __ stf(FloatRegisterImpl::D, F62, rvec, 8); | 3871 __ stf(FloatRegisterImpl::D, F62, rvec, 8); |
3872 __ restore(); | |
3646 __ retl(); | 3873 __ retl(); |
3647 __ delayed()->mov(L1, O0); | 3874 __ delayed()->mov(L0, O0); |
3648 | 3875 |
3649 __ align(OptoLoopAlignment); | 3876 __ align(OptoLoopAlignment); |
3650 __ BIND(L_cbcenc256); | 3877 __ BIND(L_cbcenc256); |
3878 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero | |
3879 __ andcc(from, 7, G0); | |
3880 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit); | |
3881 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr | |
3882 | |
3883 // aligned case: load input into G3 and G4 | |
3651 __ ldx(from,0,G3); | 3884 __ ldx(from,0,G3); |
3652 __ ldx(from,8,G4); | 3885 __ ldx(from,8,G4); |
3886 __ ba_short(L_256bit_transform); | |
3887 | |
3888 __ BIND(L_load_misaligned_input_256bit); | |
3889 // cannot clobber F48, F50 and F52. F56, F58 can be used though | |
3890 __ alignaddr(from, G0, from); | |
3891 __ movdtox(F60, L2); // save F60 before overwriting | |
3892 __ ldf(FloatRegisterImpl::D, from, 0, F56); | |
3893 __ ldf(FloatRegisterImpl::D, from, 8, F58); | |
3894 __ ldf(FloatRegisterImpl::D, from, 16, F60); | |
3895 __ faligndata(F56, F58, F56); | |
3896 __ faligndata(F58, F60, F58); | |
3897 __ movdtox(F56, G3); | |
3898 __ movdtox(F58, G4); | |
3899 __ mov(L1, from); | |
3900 __ movxtod(L2, F60); | |
3901 | |
3902 __ BIND(L_256bit_transform); | |
3653 __ xor3(G1,G3,G3); | 3903 __ xor3(G1,G3,G3); |
3654 __ xor3(G2,G4,G4); | 3904 __ xor3(G5,G4,G4); |
3655 __ movxtod(G3,F56); | 3905 __ movxtod(G3,F56); |
3656 __ movxtod(G4,F58); | 3906 __ movxtod(G4,F58); |
3657 __ fxor(FloatRegisterImpl::D, F60, F56, F60); | 3907 __ fxor(FloatRegisterImpl::D, F60, F56, F60); |
3658 __ fxor(FloatRegisterImpl::D, F62, F58, F62); | 3908 __ fxor(FloatRegisterImpl::D, F62, F58, F62); |
3659 | 3909 |
3668 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); | 3918 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); |
3669 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); | 3919 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); |
3670 } | 3920 } |
3671 } | 3921 } |
3672 | 3922 |
3923 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero | |
3924 __ andcc(to, 7, L1); | |
3925 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit); | |
3926 __ delayed()->edge8n(to, G0, L2); | |
3927 | |
3928 // aligned case: store output into the destination array | |
3673 __ stf(FloatRegisterImpl::D, F60, to, 0); | 3929 __ stf(FloatRegisterImpl::D, F60, to, 0); |
3674 __ stf(FloatRegisterImpl::D, F62, to, 8); | 3930 __ stf(FloatRegisterImpl::D, F62, to, 8); |
3931 __ ba_short(L_check_loop_end_256bit); | |
3932 | |
3933 __ BIND(L_store_misaligned_output_256bit); | |
3934 __ add(to, 8, L3); | |
3935 __ mov(8, L4); | |
3936 __ sub(L4, L1, L4); | |
3937 __ alignaddr(L4, G0, L4); | |
3938 __ movdtox(F60, L6); | |
3939 __ movdtox(F62, L7); | |
3940 __ faligndata(F60, F60, F60); | |
3941 __ faligndata(F62, F62, F62); | |
3942 __ mov(to, L5); | |
3943 __ and3(to, -8, to); | |
3944 __ and3(L3, -8, L3); | |
3945 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); | |
3946 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); | |
3947 __ add(to, 8, to); | |
3948 __ add(L3, 8, L3); | |
3949 __ orn(G0, L2, L2); | |
3950 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); | |
3951 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); | |
3952 __ mov(L5, to); | |
3953 __ movxtod(L6, F60); | |
3954 __ movxtod(L7, F62); | |
3955 | |
3956 __ BIND(L_check_loop_end_256bit); | |
3675 __ add(from, 16, from); | 3957 __ add(from, 16, from); |
3676 __ subcc(len_reg, 16, len_reg); | 3958 __ subcc(len_reg, 16, len_reg); |
3677 __ add(to, 16, to); | 3959 __ add(to, 16, to); |
3678 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256); | 3960 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256); |
3679 __ delayed()->nop(); | 3961 __ delayed()->nop(); |
3962 // re-init intial vector for next block, 8-byte alignment is guaranteed | |
3680 __ stf(FloatRegisterImpl::D, F60, rvec, 0); | 3963 __ stf(FloatRegisterImpl::D, F60, rvec, 0); |
3681 __ stf(FloatRegisterImpl::D, F62, rvec, 8); | 3964 __ stf(FloatRegisterImpl::D, F62, rvec, 8); |
3965 __ restore(); | |
3682 __ retl(); | 3966 __ retl(); |
3683 __ delayed()->mov(L1, O0); | 3967 __ delayed()->mov(L0, O0); |
3684 | 3968 |
3685 return start; | 3969 return start; |
3686 } | 3970 } |
3687 | 3971 |
3688 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { | 3972 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { |
3973 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, | |
3974 "the following code assumes that first element of an int array is aligned to 8 bytes"); | |
3975 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, | |
3976 "the following code assumes that first element of a byte array is aligned to 8 bytes"); | |
3689 __ align(CodeEntryAlignment); | 3977 __ align(CodeEntryAlignment); |
3690 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); | 3978 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); |
3691 Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start; | 3979 Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start; |
3692 Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256; | 3980 Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256; |
3981 Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128; | |
3982 Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256; | |
3983 Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128; | |
3984 Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192; | |
3985 Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256; | |
3693 address start = __ pc(); | 3986 address start = __ pc(); |
3694 Register from = I0; // source byte array | 3987 Register from = I0; // source byte array |
3695 Register to = I1; // destination byte array | 3988 Register to = I1; // destination byte array |
3696 Register key = I2; // expanded key array | 3989 Register key = I2; // expanded key array |
3697 Register rvec = I3; // init vector | 3990 Register rvec = I3; // init vector |
3702 // save cipher len before save_frame, to return in the end | 3995 // save cipher len before save_frame, to return in the end |
3703 __ mov(O4, L0); | 3996 __ mov(O4, L0); |
3704 __ save_frame(0); //args are read from I* registers since we save the frame in the beginning | 3997 __ save_frame(0); //args are read from I* registers since we save the frame in the beginning |
3705 | 3998 |
3706 // load original key from SunJCE expanded decryption key | 3999 // load original key from SunJCE expanded decryption key |
4000 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed | |
3707 for ( int i = 0; i <= 3; i++ ) { | 4001 for ( int i = 0; i <= 3; i++ ) { |
3708 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); | 4002 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); |
3709 } | 4003 } |
3710 | 4004 |
3711 // load initial vector | 4005 // load initial vector, 8-byte alignment is guaranteed |
3712 __ ldx(rvec,0,L0); | 4006 __ ldx(rvec,0,L0); |
3713 __ ldx(rvec,8,L1); | 4007 __ ldx(rvec,8,L1); |
3714 | 4008 |
3715 // read expanded key array length | 4009 // read expanded key array length |
3716 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); | 4010 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); |
3731 // load expanded key[last-1] and key[last] elements | 4025 // load expanded key[last-1] and key[last] elements |
3732 __ movdtox(F40,L2); | 4026 __ movdtox(F40,L2); |
3733 __ movdtox(F42,L3); | 4027 __ movdtox(F42,L3); |
3734 | 4028 |
3735 __ and3(len_reg, 16, L4); | 4029 __ and3(len_reg, 16, L4); |
3736 __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128); | 4030 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128); |
3737 __ delayed()->nop(); | 4031 __ nop(); |
3738 | 4032 |
3739 __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); | 4033 __ ba_short(L_dec_first_block_start); |
3740 __ delayed()->nop(); | |
3741 | 4034 |
3742 __ BIND(L_expand192bit); | 4035 __ BIND(L_expand192bit); |
3743 // load rest of the 192-bit key | 4036 // load rest of the 192-bit key |
3744 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); | 4037 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); |
3745 __ ldf(FloatRegisterImpl::S, original_key, 20, F5); | 4038 __ ldf(FloatRegisterImpl::S, original_key, 20, F5); |
3756 // load expanded key[last-1] and key[last] elements | 4049 // load expanded key[last-1] and key[last] elements |
3757 __ movdtox(F48,L2); | 4050 __ movdtox(F48,L2); |
3758 __ movdtox(F50,L3); | 4051 __ movdtox(F50,L3); |
3759 | 4052 |
3760 __ and3(len_reg, 16, L4); | 4053 __ and3(len_reg, 16, L4); |
3761 __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192); | 4054 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192); |
3762 __ delayed()->nop(); | 4055 __ nop(); |
3763 | 4056 |
3764 __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); | 4057 __ ba_short(L_dec_first_block_start); |
3765 __ delayed()->nop(); | |
3766 | 4058 |
3767 __ BIND(L_expand256bit); | 4059 __ BIND(L_expand256bit); |
3768 // load rest of the 256-bit key | 4060 // load rest of the 256-bit key |
3769 for ( int i = 4; i <= 7; i++ ) { | 4061 for ( int i = 4; i <= 7; i++ ) { |
3770 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); | 4062 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); |
3783 // load expanded key[last-1] and key[last] elements | 4075 // load expanded key[last-1] and key[last] elements |
3784 __ movdtox(F56,L2); | 4076 __ movdtox(F56,L2); |
3785 __ movdtox(F58,L3); | 4077 __ movdtox(F58,L3); |
3786 | 4078 |
3787 __ and3(len_reg, 16, L4); | 4079 __ and3(len_reg, 16, L4); |
3788 __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256); | 4080 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256); |
3789 __ delayed()->nop(); | |
3790 | 4081 |
3791 __ BIND(L_dec_first_block_start); | 4082 __ BIND(L_dec_first_block_start); |
4083 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero | |
4084 __ andcc(from, 7, G0); | |
4085 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block); | |
4086 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr | |
4087 | |
4088 // aligned case: load input into L4 and L5 | |
3792 __ ldx(from,0,L4); | 4089 __ ldx(from,0,L4); |
3793 __ ldx(from,8,L5); | 4090 __ ldx(from,8,L5); |
4091 __ ba_short(L_transform_first_block); | |
4092 | |
4093 __ BIND(L_load_misaligned_input_first_block); | |
4094 __ alignaddr(from, G0, from); | |
4095 // F58, F60, F62 can be clobbered | |
4096 __ ldf(FloatRegisterImpl::D, from, 0, F58); | |
4097 __ ldf(FloatRegisterImpl::D, from, 8, F60); | |
4098 __ ldf(FloatRegisterImpl::D, from, 16, F62); | |
4099 __ faligndata(F58, F60, F58); | |
4100 __ faligndata(F60, F62, F60); | |
4101 __ movdtox(F58, L4); | |
4102 __ movdtox(F60, L5); | |
4103 __ mov(G1, from); | |
4104 | |
4105 __ BIND(L_transform_first_block); | |
3794 __ xor3(L2,L4,G1); | 4106 __ xor3(L2,L4,G1); |
3795 __ movxtod(G1,F60); | 4107 __ movxtod(G1,F60); |
3796 __ xor3(L3,L5,G1); | 4108 __ xor3(L3,L5,G1); |
3797 __ movxtod(G1,F62); | 4109 __ movxtod(G1,F62); |
3798 | 4110 |
3831 __ mov(L4,L0); | 4143 __ mov(L4,L0); |
3832 __ mov(L5,L1); | 4144 __ mov(L5,L1); |
3833 __ fxor(FloatRegisterImpl::D, F56, F60, F60); | 4145 __ fxor(FloatRegisterImpl::D, F56, F60, F60); |
3834 __ fxor(FloatRegisterImpl::D, F58, F62, F62); | 4146 __ fxor(FloatRegisterImpl::D, F58, F62, F62); |
3835 | 4147 |
4148 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero | |
4149 __ andcc(to, 7, G1); | |
4150 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block); | |
4151 __ delayed()->edge8n(to, G0, G2); | |
4152 | |
4153 // aligned case: store output into the destination array | |
3836 __ stf(FloatRegisterImpl::D, F60, to, 0); | 4154 __ stf(FloatRegisterImpl::D, F60, to, 0); |
3837 __ stf(FloatRegisterImpl::D, F62, to, 8); | 4155 __ stf(FloatRegisterImpl::D, F62, to, 8); |
3838 | 4156 __ ba_short(L_check_decrypt_end); |
4157 | |
4158 __ BIND(L_store_misaligned_output_first_block); | |
4159 __ add(to, 8, G3); | |
4160 __ mov(8, G4); | |
4161 __ sub(G4, G1, G4); | |
4162 __ alignaddr(G4, G0, G4); | |
4163 __ faligndata(F60, F60, F60); | |
4164 __ faligndata(F62, F62, F62); | |
4165 __ mov(to, G1); | |
4166 __ and3(to, -8, to); | |
4167 __ and3(G3, -8, G3); | |
4168 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); | |
4169 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); | |
4170 __ add(to, 8, to); | |
4171 __ add(G3, 8, G3); | |
4172 __ orn(G0, G2, G2); | |
4173 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); | |
4174 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); | |
4175 __ mov(G1, to); | |
4176 | |
4177 __ BIND(L_check_decrypt_end); | |
3839 __ add(from, 16, from); | 4178 __ add(from, 16, from); |
3840 __ add(to, 16, to); | 4179 __ add(to, 16, to); |
3841 __ subcc(len_reg, 16, len_reg); | 4180 __ subcc(len_reg, 16, len_reg); |
3842 __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end); | 4181 __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end); |
3843 __ delayed()->nop(); | 4182 __ delayed()->nop(); |
3850 | 4189 |
3851 __ align(OptoLoopAlignment); | 4190 __ align(OptoLoopAlignment); |
3852 __ BIND(L_dec_next2_blocks128); | 4191 __ BIND(L_dec_next2_blocks128); |
3853 __ nop(); | 4192 __ nop(); |
3854 | 4193 |
3855 // F40:F42 used for first 16-bytes | 4194 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero |
4195 __ andcc(from, 7, G0); | |
4196 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128); | |
4197 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr | |
4198 | |
4199 // aligned case: load input into G4, G5, L4 and L5 | |
3856 __ ldx(from,0,G4); | 4200 __ ldx(from,0,G4); |
3857 __ ldx(from,8,G5); | 4201 __ ldx(from,8,G5); |
4202 __ ldx(from,16,L4); | |
4203 __ ldx(from,24,L5); | |
4204 __ ba_short(L_transform_next2_blocks128); | |
4205 | |
4206 __ BIND(L_load_misaligned_next2_blocks128); | |
4207 __ alignaddr(from, G0, from); | |
4208 // F40, F42, F58, F60, F62 can be clobbered | |
4209 __ ldf(FloatRegisterImpl::D, from, 0, F40); | |
4210 __ ldf(FloatRegisterImpl::D, from, 8, F42); | |
4211 __ ldf(FloatRegisterImpl::D, from, 16, F60); | |
4212 __ ldf(FloatRegisterImpl::D, from, 24, F62); | |
4213 __ ldf(FloatRegisterImpl::D, from, 32, F58); | |
4214 __ faligndata(F40, F42, F40); | |
4215 __ faligndata(F42, F60, F42); | |
4216 __ faligndata(F60, F62, F60); | |
4217 __ faligndata(F62, F58, F62); | |
4218 __ movdtox(F40, G4); | |
4219 __ movdtox(F42, G5); | |
4220 __ movdtox(F60, L4); | |
4221 __ movdtox(F62, L5); | |
4222 __ mov(G1, from); | |
4223 | |
4224 __ BIND(L_transform_next2_blocks128); | |
4225 // F40:F42 used for first 16-bytes | |
3858 __ xor3(L2,G4,G1); | 4226 __ xor3(L2,G4,G1); |
3859 __ movxtod(G1,F40); | 4227 __ movxtod(G1,F40); |
3860 __ xor3(L3,G5,G1); | 4228 __ xor3(L3,G5,G1); |
3861 __ movxtod(G1,F42); | 4229 __ movxtod(G1,F42); |
3862 | 4230 |
3863 // F60:F62 used for next 16-bytes | 4231 // F60:F62 used for next 16-bytes |
3864 __ ldx(from,16,L4); | |
3865 __ ldx(from,24,L5); | |
3866 __ xor3(L2,L4,G1); | 4232 __ xor3(L2,L4,G1); |
3867 __ movxtod(G1,F60); | 4233 __ movxtod(G1,F60); |
3868 __ xor3(L3,L5,G1); | 4234 __ xor3(L3,L5,G1); |
3869 __ movxtod(G1,F62); | 4235 __ movxtod(G1,F62); |
3870 | 4236 |
3889 __ movxtod(L0,F46); | 4255 __ movxtod(L0,F46); |
3890 __ movxtod(L1,F44); | 4256 __ movxtod(L1,F44); |
3891 __ fxor(FloatRegisterImpl::D, F46, F40, F40); | 4257 __ fxor(FloatRegisterImpl::D, F46, F40, F40); |
3892 __ fxor(FloatRegisterImpl::D, F44, F42, F42); | 4258 __ fxor(FloatRegisterImpl::D, F44, F42, F42); |
3893 | 4259 |
3894 __ stf(FloatRegisterImpl::D, F40, to, 0); | |
3895 __ stf(FloatRegisterImpl::D, F42, to, 8); | |
3896 | |
3897 __ movxtod(G4,F56); | 4260 __ movxtod(G4,F56); |
3898 __ movxtod(G5,F58); | 4261 __ movxtod(G5,F58); |
3899 __ mov(L4,L0); | 4262 __ mov(L4,L0); |
3900 __ mov(L5,L1); | 4263 __ mov(L5,L1); |
3901 __ fxor(FloatRegisterImpl::D, F56, F60, F60); | 4264 __ fxor(FloatRegisterImpl::D, F56, F60, F60); |
3902 __ fxor(FloatRegisterImpl::D, F58, F62, F62); | 4265 __ fxor(FloatRegisterImpl::D, F58, F62, F62); |
3903 | 4266 |
4267 // For mis-aligned store of 32 bytes of result we can do: | |
4268 // Circular right-shift all 4 FP registers so that 'head' and 'tail' | |
4269 // parts that need to be stored starting at mis-aligned address are in a FP reg | |
4270 // the other 3 FP regs can thus be stored using regular store | |
4271 // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts | |
4272 | |
4273 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero | |
4274 __ andcc(to, 7, G1); | |
4275 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128); | |
4276 __ delayed()->edge8n(to, G0, G2); | |
4277 | |
4278 // aligned case: store output into the destination array | |
4279 __ stf(FloatRegisterImpl::D, F40, to, 0); | |
4280 __ stf(FloatRegisterImpl::D, F42, to, 8); | |
3904 __ stf(FloatRegisterImpl::D, F60, to, 16); | 4281 __ stf(FloatRegisterImpl::D, F60, to, 16); |
3905 __ stf(FloatRegisterImpl::D, F62, to, 24); | 4282 __ stf(FloatRegisterImpl::D, F62, to, 24); |
3906 | 4283 __ ba_short(L_check_decrypt_loop_end128); |
4284 | |
4285 __ BIND(L_store_misaligned_output_next2_blocks128); | |
4286 __ mov(8, G4); | |
4287 __ sub(G4, G1, G4); | |
4288 __ alignaddr(G4, G0, G4); | |
4289 __ faligndata(F40, F42, F56); // F56 can be clobbered | |
4290 __ faligndata(F42, F60, F42); | |
4291 __ faligndata(F60, F62, F60); | |
4292 __ faligndata(F62, F40, F40); | |
4293 __ mov(to, G1); | |
4294 __ and3(to, -8, to); | |
4295 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); | |
4296 __ stf(FloatRegisterImpl::D, F56, to, 8); | |
4297 __ stf(FloatRegisterImpl::D, F42, to, 16); | |
4298 __ stf(FloatRegisterImpl::D, F60, to, 24); | |
4299 __ add(to, 32, to); | |
4300 __ orn(G0, G2, G2); | |
4301 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); | |
4302 __ mov(G1, to); | |
4303 | |
4304 __ BIND(L_check_decrypt_loop_end128); | |
3907 __ add(from, 32, from); | 4305 __ add(from, 32, from); |
3908 __ add(to, 32, to); | 4306 __ add(to, 32, to); |
3909 __ subcc(len_reg, 32, len_reg); | 4307 __ subcc(len_reg, 32, len_reg); |
3910 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); | 4308 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); |
3911 __ delayed()->nop(); | 4309 __ delayed()->nop(); |
3912 __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); | 4310 __ ba_short(L_cbcdec_end); |
3913 __ delayed()->nop(); | |
3914 | 4311 |
3915 __ align(OptoLoopAlignment); | 4312 __ align(OptoLoopAlignment); |
3916 __ BIND(L_dec_next2_blocks192); | 4313 __ BIND(L_dec_next2_blocks192); |
3917 __ nop(); | 4314 __ nop(); |
3918 | 4315 |
3919 // F48:F50 used for first 16-bytes | 4316 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero |
4317 __ andcc(from, 7, G0); | |
4318 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192); | |
4319 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr | |
4320 | |
4321 // aligned case: load input into G4, G5, L4 and L5 | |
3920 __ ldx(from,0,G4); | 4322 __ ldx(from,0,G4); |
3921 __ ldx(from,8,G5); | 4323 __ ldx(from,8,G5); |
4324 __ ldx(from,16,L4); | |
4325 __ ldx(from,24,L5); | |
4326 __ ba_short(L_transform_next2_blocks192); | |
4327 | |
4328 __ BIND(L_load_misaligned_next2_blocks192); | |
4329 __ alignaddr(from, G0, from); | |
4330 // F48, F50, F52, F60, F62 can be clobbered | |
4331 __ ldf(FloatRegisterImpl::D, from, 0, F48); | |
4332 __ ldf(FloatRegisterImpl::D, from, 8, F50); | |
4333 __ ldf(FloatRegisterImpl::D, from, 16, F60); | |
4334 __ ldf(FloatRegisterImpl::D, from, 24, F62); | |
4335 __ ldf(FloatRegisterImpl::D, from, 32, F52); | |
4336 __ faligndata(F48, F50, F48); | |
4337 __ faligndata(F50, F60, F50); | |
4338 __ faligndata(F60, F62, F60); | |
4339 __ faligndata(F62, F52, F62); | |
4340 __ movdtox(F48, G4); | |
4341 __ movdtox(F50, G5); | |
4342 __ movdtox(F60, L4); | |
4343 __ movdtox(F62, L5); | |
4344 __ mov(G1, from); | |
4345 | |
4346 __ BIND(L_transform_next2_blocks192); | |
4347 // F48:F50 used for first 16-bytes | |
3922 __ xor3(L2,G4,G1); | 4348 __ xor3(L2,G4,G1); |
3923 __ movxtod(G1,F48); | 4349 __ movxtod(G1,F48); |
3924 __ xor3(L3,G5,G1); | 4350 __ xor3(L3,G5,G1); |
3925 __ movxtod(G1,F50); | 4351 __ movxtod(G1,F50); |
3926 | 4352 |
3927 // F60:F62 used for next 16-bytes | 4353 // F60:F62 used for next 16-bytes |
3928 __ ldx(from,16,L4); | |
3929 __ ldx(from,24,L5); | |
3930 __ xor3(L2,L4,G1); | 4354 __ xor3(L2,L4,G1); |
3931 __ movxtod(G1,F60); | 4355 __ movxtod(G1,F60); |
3932 __ xor3(L3,L5,G1); | 4356 __ xor3(L3,L5,G1); |
3933 __ movxtod(G1,F62); | 4357 __ movxtod(G1,F62); |
3934 | 4358 |
3953 __ movxtod(L0,F54); | 4377 __ movxtod(L0,F54); |
3954 __ movxtod(L1,F52); | 4378 __ movxtod(L1,F52); |
3955 __ fxor(FloatRegisterImpl::D, F54, F48, F48); | 4379 __ fxor(FloatRegisterImpl::D, F54, F48, F48); |
3956 __ fxor(FloatRegisterImpl::D, F52, F50, F50); | 4380 __ fxor(FloatRegisterImpl::D, F52, F50, F50); |
3957 | 4381 |
3958 __ stf(FloatRegisterImpl::D, F48, to, 0); | |
3959 __ stf(FloatRegisterImpl::D, F50, to, 8); | |
3960 | |
3961 __ movxtod(G4,F56); | 4382 __ movxtod(G4,F56); |
3962 __ movxtod(G5,F58); | 4383 __ movxtod(G5,F58); |
3963 __ mov(L4,L0); | 4384 __ mov(L4,L0); |
3964 __ mov(L5,L1); | 4385 __ mov(L5,L1); |
3965 __ fxor(FloatRegisterImpl::D, F56, F60, F60); | 4386 __ fxor(FloatRegisterImpl::D, F56, F60, F60); |
3966 __ fxor(FloatRegisterImpl::D, F58, F62, F62); | 4387 __ fxor(FloatRegisterImpl::D, F58, F62, F62); |
3967 | 4388 |
4389 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero | |
4390 __ andcc(to, 7, G1); | |
4391 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192); | |
4392 __ delayed()->edge8n(to, G0, G2); | |
4393 | |
4394 // aligned case: store output into the destination array | |
4395 __ stf(FloatRegisterImpl::D, F48, to, 0); | |
4396 __ stf(FloatRegisterImpl::D, F50, to, 8); | |
3968 __ stf(FloatRegisterImpl::D, F60, to, 16); | 4397 __ stf(FloatRegisterImpl::D, F60, to, 16); |
3969 __ stf(FloatRegisterImpl::D, F62, to, 24); | 4398 __ stf(FloatRegisterImpl::D, F62, to, 24); |
3970 | 4399 __ ba_short(L_check_decrypt_loop_end192); |
4400 | |
4401 __ BIND(L_store_misaligned_output_next2_blocks192); | |
4402 __ mov(8, G4); | |
4403 __ sub(G4, G1, G4); | |
4404 __ alignaddr(G4, G0, G4); | |
4405 __ faligndata(F48, F50, F56); // F56 can be clobbered | |
4406 __ faligndata(F50, F60, F50); | |
4407 __ faligndata(F60, F62, F60); | |
4408 __ faligndata(F62, F48, F48); | |
4409 __ mov(to, G1); | |
4410 __ and3(to, -8, to); | |
4411 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); | |
4412 __ stf(FloatRegisterImpl::D, F56, to, 8); | |
4413 __ stf(FloatRegisterImpl::D, F50, to, 16); | |
4414 __ stf(FloatRegisterImpl::D, F60, to, 24); | |
4415 __ add(to, 32, to); | |
4416 __ orn(G0, G2, G2); | |
4417 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); | |
4418 __ mov(G1, to); | |
4419 | |
4420 __ BIND(L_check_decrypt_loop_end192); | |
3971 __ add(from, 32, from); | 4421 __ add(from, 32, from); |
3972 __ add(to, 32, to); | 4422 __ add(to, 32, to); |
3973 __ subcc(len_reg, 32, len_reg); | 4423 __ subcc(len_reg, 32, len_reg); |
3974 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); | 4424 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); |
3975 __ delayed()->nop(); | 4425 __ delayed()->nop(); |
3976 __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); | 4426 __ ba_short(L_cbcdec_end); |
3977 __ delayed()->nop(); | |
3978 | 4427 |
3979 __ align(OptoLoopAlignment); | 4428 __ align(OptoLoopAlignment); |
3980 __ BIND(L_dec_next2_blocks256); | 4429 __ BIND(L_dec_next2_blocks256); |
3981 __ nop(); | 4430 __ nop(); |
3982 | 4431 |
3983 // F0:F2 used for first 16-bytes | 4432 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero |
4433 __ andcc(from, 7, G0); | |
4434 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256); | |
4435 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr | |
4436 | |
4437 // aligned case: load input into G4, G5, L4 and L5 | |
3984 __ ldx(from,0,G4); | 4438 __ ldx(from,0,G4); |
3985 __ ldx(from,8,G5); | 4439 __ ldx(from,8,G5); |
4440 __ ldx(from,16,L4); | |
4441 __ ldx(from,24,L5); | |
4442 __ ba_short(L_transform_next2_blocks256); | |
4443 | |
4444 __ BIND(L_load_misaligned_next2_blocks256); | |
4445 __ alignaddr(from, G0, from); | |
4446 // F0, F2, F4, F60, F62 can be clobbered | |
4447 __ ldf(FloatRegisterImpl::D, from, 0, F0); | |
4448 __ ldf(FloatRegisterImpl::D, from, 8, F2); | |
4449 __ ldf(FloatRegisterImpl::D, from, 16, F60); | |
4450 __ ldf(FloatRegisterImpl::D, from, 24, F62); | |
4451 __ ldf(FloatRegisterImpl::D, from, 32, F4); | |
4452 __ faligndata(F0, F2, F0); | |
4453 __ faligndata(F2, F60, F2); | |
4454 __ faligndata(F60, F62, F60); | |
4455 __ faligndata(F62, F4, F62); | |
4456 __ movdtox(F0, G4); | |
4457 __ movdtox(F2, G5); | |
4458 __ movdtox(F60, L4); | |
4459 __ movdtox(F62, L5); | |
4460 __ mov(G1, from); | |
4461 | |
4462 __ BIND(L_transform_next2_blocks256); | |
4463 // F0:F2 used for first 16-bytes | |
3986 __ xor3(L2,G4,G1); | 4464 __ xor3(L2,G4,G1); |
3987 __ movxtod(G1,F0); | 4465 __ movxtod(G1,F0); |
3988 __ xor3(L3,G5,G1); | 4466 __ xor3(L3,G5,G1); |
3989 __ movxtod(G1,F2); | 4467 __ movxtod(G1,F2); |
3990 | 4468 |
3991 // F60:F62 used for next 16-bytes | 4469 // F60:F62 used for next 16-bytes |
3992 __ ldx(from,16,L4); | |
3993 __ ldx(from,24,L5); | |
3994 __ xor3(L2,L4,G1); | 4470 __ xor3(L2,L4,G1); |
3995 __ movxtod(G1,F60); | 4471 __ movxtod(G1,F60); |
3996 __ xor3(L3,L5,G1); | 4472 __ xor3(L3,L5,G1); |
3997 __ movxtod(G1,F62); | 4473 __ movxtod(G1,F62); |
3998 | 4474 |
4041 __ movxtod(L0,F6); | 4517 __ movxtod(L0,F6); |
4042 __ movxtod(L1,F4); | 4518 __ movxtod(L1,F4); |
4043 __ fxor(FloatRegisterImpl::D, F6, F0, F0); | 4519 __ fxor(FloatRegisterImpl::D, F6, F0, F0); |
4044 __ fxor(FloatRegisterImpl::D, F4, F2, F2); | 4520 __ fxor(FloatRegisterImpl::D, F4, F2, F2); |
4045 | 4521 |
4046 __ stf(FloatRegisterImpl::D, F0, to, 0); | |
4047 __ stf(FloatRegisterImpl::D, F2, to, 8); | |
4048 | |
4049 __ movxtod(G4,F56); | 4522 __ movxtod(G4,F56); |
4050 __ movxtod(G5,F58); | 4523 __ movxtod(G5,F58); |
4051 __ mov(L4,L0); | 4524 __ mov(L4,L0); |
4052 __ mov(L5,L1); | 4525 __ mov(L5,L1); |
4053 __ fxor(FloatRegisterImpl::D, F56, F60, F60); | 4526 __ fxor(FloatRegisterImpl::D, F56, F60, F60); |
4054 __ fxor(FloatRegisterImpl::D, F58, F62, F62); | 4527 __ fxor(FloatRegisterImpl::D, F58, F62, F62); |
4055 | 4528 |
4529 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero | |
4530 __ andcc(to, 7, G1); | |
4531 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256); | |
4532 __ delayed()->edge8n(to, G0, G2); | |
4533 | |
4534 // aligned case: store output into the destination array | |
4535 __ stf(FloatRegisterImpl::D, F0, to, 0); | |
4536 __ stf(FloatRegisterImpl::D, F2, to, 8); | |
4056 __ stf(FloatRegisterImpl::D, F60, to, 16); | 4537 __ stf(FloatRegisterImpl::D, F60, to, 16); |
4057 __ stf(FloatRegisterImpl::D, F62, to, 24); | 4538 __ stf(FloatRegisterImpl::D, F62, to, 24); |
4058 | 4539 __ ba_short(L_check_decrypt_loop_end256); |
4540 | |
4541 __ BIND(L_store_misaligned_output_next2_blocks256); | |
4542 __ mov(8, G4); | |
4543 __ sub(G4, G1, G4); | |
4544 __ alignaddr(G4, G0, G4); | |
4545 __ faligndata(F0, F2, F56); // F56 can be clobbered | |
4546 __ faligndata(F2, F60, F2); | |
4547 __ faligndata(F60, F62, F60); | |
4548 __ faligndata(F62, F0, F0); | |
4549 __ mov(to, G1); | |
4550 __ and3(to, -8, to); | |
4551 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); | |
4552 __ stf(FloatRegisterImpl::D, F56, to, 8); | |
4553 __ stf(FloatRegisterImpl::D, F2, to, 16); | |
4554 __ stf(FloatRegisterImpl::D, F60, to, 24); | |
4555 __ add(to, 32, to); | |
4556 __ orn(G0, G2, G2); | |
4557 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); | |
4558 __ mov(G1, to); | |
4559 | |
4560 __ BIND(L_check_decrypt_loop_end256); | |
4059 __ add(from, 32, from); | 4561 __ add(from, 32, from); |
4060 __ add(to, 32, to); | 4562 __ add(to, 32, to); |
4061 __ subcc(len_reg, 32, len_reg); | 4563 __ subcc(len_reg, 32, len_reg); |
4062 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256); | 4564 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256); |
4063 __ delayed()->nop(); | 4565 __ delayed()->nop(); |
4064 | 4566 |
4065 __ BIND(L_cbcdec_end); | 4567 __ BIND(L_cbcdec_end); |
4568 // re-init intial vector for next block, 8-byte alignment is guaranteed | |
4066 __ stx(L0, rvec, 0); | 4569 __ stx(L0, rvec, 0); |
4067 __ stx(L1, rvec, 8); | 4570 __ stx(L1, rvec, 8); |
4068 __ restore(); | 4571 __ restore(); |
4069 __ mov(L0, O0); | 4572 __ mov(L0, O0); |
4070 __ retl(); | 4573 __ retl(); |