comparison src/cpu/sparc/vm/stubGenerator_sparc.cpp @ 18041:52b4284cb496

Merge with jdk8u20-b26
author Gilles Duboscq <duboscq@ssw.jku.at>
date Wed, 15 Oct 2014 16:02:50 +0200
parents 89152779163c 0342d80559e0
children 7848fc12602b
comparison
equal deleted inserted replaced
17606:45d7b2c7029d 18041:52b4284cb496
1 /* 1 /*
2 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. 2 * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 * 4 *
5 * This code is free software; you can redistribute it and/or modify it 5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as 6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
81 81
82 class StubGenerator: public StubCodeGenerator { 82 class StubGenerator: public StubCodeGenerator {
83 private: 83 private:
84 84
85 #ifdef PRODUCT 85 #ifdef PRODUCT
86 #define inc_counter_np(a,b,c) (0) 86 #define inc_counter_np(a,b,c)
87 #else 87 #else
88 #define inc_counter_np(counter, t1, t2) \ 88 #define inc_counter_np(counter, t1, t2) \
89 BLOCK_COMMENT("inc_counter " #counter); \ 89 BLOCK_COMMENT("inc_counter " #counter); \
90 __ inc_counter(&counter, t1, t2); 90 __ inc_counter(&counter, t1, t2);
91 #endif 91 #endif
1053 // 1053 //
1054 typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec, 1054 typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
1055 Label& L_loop, bool use_prefetch, bool use_bis); 1055 Label& L_loop, bool use_prefetch, bool use_bis);
1056 1056
1057 void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size, 1057 void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
1058 int iter_size, CopyLoopFunc copy_loop_func) { 1058 int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) {
1059 Label L_copy; 1059 Label L_copy;
1060 1060
1061 assert(log2_elem_size <= 3, "the following code should be changed"); 1061 assert(log2_elem_size <= 3, "the following code should be changed");
1062 int count_dec = 16>>log2_elem_size; 1062 int count_dec = 16>>log2_elem_size;
1063 1063
1204 __ andn(from, 7, from); // Align address 1204 __ andn(from, 7, from); // Align address
1205 __ ldx(from, 0, O3); 1205 __ ldx(from, 0, O3);
1206 __ inc(from, 8); 1206 __ inc(from, 8);
1207 __ sllx(O3, left_shift, O3); 1207 __ sllx(O3, left_shift, O3);
1208 1208
1209 disjoint_copy_core(from, to, count, log2_elem_size, 16, copy_16_bytes_shift_loop); 1209 disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);
1210 1210
1211 __ inccc(count, count_dec>>1 ); // + 8 bytes 1211 __ inccc(count, count_dec>>1 ); // + 8 bytes
1212 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); 1212 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1213 __ delayed()->inc(count, count_dec>>1); // restore 'count' 1213 __ delayed()->inc(count, count_dec>>1); // restore 'count'
1214 1214
2083 2083
2084 // copy with shift 4 elements (16 bytes) at a time 2084 // copy with shift 4 elements (16 bytes) at a time
2085 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 2085 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4
2086 __ sllx(O3, 32, O3); 2086 __ sllx(O3, 32, O3);
2087 2087
2088 disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop); 2088 disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop);
2089 2089
2090 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 2090 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2091 __ delayed()->inc(count, 4); // restore 'count' 2091 __ delayed()->inc(count, 4); // restore 'count'
2092 2092
2093 __ BIND(L_aligned_copy); 2093 __ BIND(L_aligned_copy);
2364 // Now we can use O4(offset0), O5(offset8) as temps 2364 // Now we can use O4(offset0), O5(offset8) as temps
2365 __ mov(O3, count); 2365 __ mov(O3, count);
2366 // count >= 0 (original count - 8) 2366 // count >= 0 (original count - 8)
2367 __ mov(from, from64); 2367 __ mov(from, from64);
2368 2368
2369 disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop); 2369 disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop);
2370 2370
2371 // Restore O4(offset0), O5(offset8) 2371 // Restore O4(offset0), O5(offset8)
2372 __ sub(from64, from, offset0); 2372 __ sub(from64, from, offset0);
2373 __ inccc(count, 6); // restore count 2373 __ inccc(count, 6); // restore count
2374 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2374 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
3302 if (UseBlockZeroing) { 3302 if (UseBlockZeroing) {
3303 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words"); 3303 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
3304 } 3304 }
3305 } 3305 }
3306 3306
// Generates the stub for single-block AES encryption using the SPARC
// crypto instructions (aes_eround01/23 and their *_l last-round forms).
// Handles arbitrary byte alignment of both source and destination arrays
// via alignaddr/faligndata (loads) and edge8n/stpartialf (stores).
// In:  O0 = from (source byte array), O1 = to (destination byte array),
//      O2 = key (expanded round-key int array)
3307 address generate_aescrypt_encryptBlock() {
3308 // required since we read expanded key 'int' array starting first element without alignment considerations
3309 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3310 "the following code assumes that first element of an int array is aligned to 8 bytes");
3311 __ align(CodeEntryAlignment);
3312 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3313 Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
3314 address start = __ pc();
3315 Register from = O0; // source byte array
3316 Register to = O1; // destination byte array
3317 Register key = O2; // expanded key array
3318 const Register keylen = O4; //reg for storing expanded key array length
3319
3320 // read expanded key length (44/52/60 ints for 128/192/256-bit keys, see checks below)
3321 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3322
3323 // Method to address arbitrary alignment for load instructions:
3324 // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
3325 // If zero/aligned then continue with double FP load instructions
3326 // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
3327 // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
3328 // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
3329 // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
3330
3331 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3332 __ andcc(from, 7, G0);
3333 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3334 __ delayed()->alignaddr(from, G0, from);
3335
3336 // aligned case: load input into F54-F56
3337 __ ldf(FloatRegisterImpl::D, from, 0, F54);
3338 __ ldf(FloatRegisterImpl::D, from, 8, F56);
3339 __ ba_short(L_load_expanded_key);
3340
3341 __ BIND(L_load_misaligned_input);
3342 __ ldf(FloatRegisterImpl::D, from, 0, F54);
3343 __ ldf(FloatRegisterImpl::D, from, 8, F56);
3344 __ ldf(FloatRegisterImpl::D, from, 16, F58);
3345 __ faligndata(F54, F56, F54);
3346 __ faligndata(F56, F58, F56);
3347
3348 __ BIND(L_load_expanded_key);
3349 // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed
// load the first 160 bytes (40 ints) of the expanded key into F0..F38
3350 for ( int i = 0; i <= 38; i += 2 ) {
3351 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
3352 }
3353
3354 // perform cipher transformation
// initial whitening: XOR the 16-byte input with the first 16 bytes of key material
3355 __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3356 __ fxor(FloatRegisterImpl::D, F2, F56, F56);
3357 // rounds 1 through 8
3358 for ( int i = 4; i <= 28; i += 8 ) {
3359 __ aes_eround01(as_FloatRegister(i), F54, F56, F58);
3360 __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60);
3361 __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54);
3362 __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56);
3363 }
3364 __ aes_eround01(F36, F54, F56, F58); //round 9
3365 __ aes_eround23(F38, F54, F56, F60);
3366
3367 // 128-bit original key size
3368 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit);
3369
// 192/256-bit keys need more round-key material: load ints 40..51 into F40..F50
3370 for ( int i = 40; i <= 50; i += 2 ) {
3371 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) );
3372 }
3373 __ aes_eround01(F40, F58, F60, F54); //round 10
3374 __ aes_eround23(F42, F58, F60, F56);
3375 __ aes_eround01(F44, F54, F56, F58); //round 11
3376 __ aes_eround23(F46, F54, F56, F60);
3377
3378 // 192-bit original key size
3379 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput);
3380
// 256-bit key: two extra rounds; F46-F50 are reloaded with the final round keys
3381 __ ldf(FloatRegisterImpl::D, key, 208, F52);
3382 __ aes_eround01(F48, F58, F60, F54); //round 12
3383 __ aes_eround23(F50, F58, F60, F56);
3384 __ ldf(FloatRegisterImpl::D, key, 216, F46);
3385 __ ldf(FloatRegisterImpl::D, key, 224, F48);
3386 __ ldf(FloatRegisterImpl::D, key, 232, F50);
3387 __ aes_eround01(F52, F54, F56, F58); //round 13
3388 __ aes_eround23(F46, F54, F56, F60);
3389 __ ba_short(L_storeOutput);
3390
3391 __ BIND(L_doLast128bit);
// 128-bit key: load the last-round keys (ints 40..43) into F48/F50
3392 __ ldf(FloatRegisterImpl::D, key, 160, F48);
3393 __ ldf(FloatRegisterImpl::D, key, 168, F50);
3394
3395 __ BIND(L_storeOutput);
3396 // perform last round of encryption common for all key sizes
3397 __ aes_eround01_l(F48, F58, F60, F54); //last round
3398 __ aes_eround23_l(F50, F58, F60, F56);
3399
3400 // Method to address arbitrary alignment for store instructions:
3401 // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
3402 // If zero/aligned then continue with double FP store instructions
3403 // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
3404 // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
3405 // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
3406 // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
3407 // Set GSR.align to (8-n) using alignaddr
3408 // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
3409 // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
3410 // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
3411 // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address
3412 // We need to execute this process for both the 8-byte result values
3413
3414 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3415 __ andcc(to, 7, O5);
3416 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3417 __ delayed()->edge8n(to, G0, O3);
3418
3419 // aligned case: store output into the destination array
3420 __ stf(FloatRegisterImpl::D, F54, to, 0);
3421 __ retl();
3422 __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
3423
3424 __ BIND(L_store_misaligned_output);
3425 __ add(to, 8, O4);
3426 __ mov(8, O2);
3427 __ sub(O2, O5, O2);
3428 __ alignaddr(O2, G0, O2);
3429 __ faligndata(F54, F54, F54);
3430 __ faligndata(F56, F56, F56);
3431 __ and3(to, -8, to);
3432 __ and3(O4, -8, O4);
3433 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3434 __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3435 __ add(to, 8, to);
3436 __ add(O4, 8, O4);
3437 __ orn(G0, O3, O3); // O3 = ~O3 (negated edge mask for the trailing bytes)
3438 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3439 __ retl();
3440 __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3441
3442 return start;
3443 }
3444
// Generates the stub for single-block AES decryption. The SunJCE expanded
// decryption key layout is not compatible with the SPARC crypto instructions
// (see comments below), so this stub re-expands the key from the original
// key material passed in O3 (128/192/256-bit variants) before running the
// inverse rounds with aes_dround01/23.
// In:  O0 = from, O1 = to, O2 = expanded key array, O3 = original key array
3445 address generate_aescrypt_decryptBlock() {
3446 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3447 "the following code assumes that first element of an int array is aligned to 8 bytes");
3448 // required since we read original key 'byte' array as well in the decryption stubs
3449 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3450 "the following code assumes that first element of a byte array is aligned to 8 bytes");
3451 __ align(CodeEntryAlignment);
3452 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3453 address start = __ pc();
3454 Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
3455 Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
3456 Register from = O0; // source byte array
3457 Register to = O1; // destination byte array
3458 Register key = O2; // expanded key array
3459 Register original_key = O3; // original key array only required during decryption
3460 const Register keylen = O4; // reg for storing expanded key array length
3461
3462 // read expanded key array length (44/52/60 ints for 128/192/256-bit keys)
3463 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3464
3465 // save 'from' since we may need to recheck alignment in case of 256-bit decryption
3466 __ mov(from, G1);
3467
3468 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3469 __ andcc(from, 7, G0);
3470 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3471 __ delayed()->alignaddr(from, G0, from);
3472
3473 // aligned case: load input into F52-F54
3474 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3475 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3476 __ ba_short(L_load_original_key);
3477
3478 __ BIND(L_load_misaligned_input);
// misaligned case: read 24 bytes and extract the 16 input bytes via faligndata
3479 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3480 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3481 __ ldf(FloatRegisterImpl::D, from, 16, F56);
3482 __ faligndata(F52, F54, F52);
3483 __ faligndata(F54, F56, F54);
3484
3485 __ BIND(L_load_original_key);
3486 // load original key from SunJCE expanded decryption key
3487 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
3488 for ( int i = 0; i <= 3; i++ ) {
3489 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3490 }
3491
3492 // 256-bit original key size
3493 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3494
3495 // 192-bit original key size
3496 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3497
3498 // 128-bit original key size
3499 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3500 for ( int i = 0; i <= 36; i += 4 ) {
3501 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3502 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3503 }
3504
3505 // perform 128-bit key specific inverse cipher transformation
3506 __ fxor(FloatRegisterImpl::D, F42, F54, F54);
3507 __ fxor(FloatRegisterImpl::D, F40, F52, F52);
3508 __ ba_short(L_common_transform);
3509
3510 __ BIND(L_expand192bit);
3511
3512 // start loading rest of the 192-bit key
3513 __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3514 __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3515
3516 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3517 for ( int i = 0; i <= 36; i += 6 ) {
3518 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3519 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3520 __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3521 }
3522 __ aes_kexpand1(F42, F46, 7, F48);
3523 __ aes_kexpand2(F44, F48, F50);
3524
3525 // perform 192-bit key specific inverse cipher transformation
// initial whitening plus the first two inverse rounds, then fall into the common tail
3526 __ fxor(FloatRegisterImpl::D, F50, F54, F54);
3527 __ fxor(FloatRegisterImpl::D, F48, F52, F52);
3528 __ aes_dround23(F46, F52, F54, F58);
3529 __ aes_dround01(F44, F52, F54, F56);
3530 __ aes_dround23(F42, F56, F58, F54);
3531 __ aes_dround01(F40, F56, F58, F52);
3532 __ ba_short(L_common_transform);
3533
3534 __ BIND(L_expand256bit);
3535
3536 // load rest of the 256-bit key
3537 for ( int i = 4; i <= 7; i++ ) {
3538 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3539 }
3540
3541 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3542 for ( int i = 0; i <= 40; i += 8 ) {
3543 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3544 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3545 __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3546 __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3547 }
3548 __ aes_kexpand1(F48, F54, 6, F56);
3549 __ aes_kexpand2(F50, F56, F58);
3550
// copy the top expanded-key doubles down: F58->F0, F56->F2, F54->F4, F52->F6
3551 for ( int i = 0; i <= 6; i += 2 ) {
3552 __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
3553 }
3554
3555 // reload original 'from' address (input was clobbered by key expansion above)
3556 __ mov(G1, from);
3557
3558 // re-check 8-byte alignment
3559 __ andcc(from, 7, G0);
3560 __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
3561 __ delayed()->alignaddr(from, G0, from);
3562
3563 // aligned case: load input into F52-F54
3564 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3565 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3566 __ ba_short(L_256bit_transform);
3567
3568 __ BIND(L_reload_misaligned_input);
3569 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3570 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3571 __ ldf(FloatRegisterImpl::D, from, 16, F56);
3572 __ faligndata(F52, F54, F52);
3573 __ faligndata(F54, F56, F54);
3574
3575 // perform 256-bit key specific inverse cipher transformation
3576 __ BIND(L_256bit_transform);
3577 __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3578 __ fxor(FloatRegisterImpl::D, F2, F52, F52);
3579 __ aes_dround23(F4, F52, F54, F58);
3580 __ aes_dround01(F6, F52, F54, F56);
3581 __ aes_dround23(F50, F56, F58, F54);
3582 __ aes_dround01(F48, F56, F58, F52);
3583 __ aes_dround23(F46, F52, F54, F58);
3584 __ aes_dround01(F44, F52, F54, F56);
3585 __ aes_dround23(F42, F56, F58, F54);
3586 __ aes_dround01(F40, F56, F58, F52);
3587
// re-expand the low rounds: reload the first 8 original key words into F0..F7
3588 for ( int i = 0; i <= 7; i++ ) {
3589 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3590 }
3591
3592 // perform inverse cipher transformations common for all key sizes
3593 __ BIND(L_common_transform);
3594 for ( int i = 38; i >= 6; i -= 8 ) {
3595 __ aes_dround23(as_FloatRegister(i), F52, F54, F58);
3596 __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56);
3597 if ( i != 6) {
3598 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54);
3599 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52);
3600 } else {
// last round uses the *_l forms
3601 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
3602 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
3603 }
3604 }
3605
3606 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3607 __ andcc(to, 7, O5);
3608 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3609 __ delayed()->edge8n(to, G0, O3);
3610
3611 // aligned case: store output into the destination array
3612 __ stf(FloatRegisterImpl::D, F52, to, 0);
3613 __ retl();
3614 __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
3615
3616 __ BIND(L_store_misaligned_output);
// misaligned store: circular-shift each 8-byte value and use partial stores
// (same scheme as documented in the encrypt stub)
3617 __ add(to, 8, O4);
3618 __ mov(8, O2);
3619 __ sub(O2, O5, O2);
3620 __ alignaddr(O2, G0, O2);
3621 __ faligndata(F52, F52, F52);
3622 __ faligndata(F54, F54, F54);
3623 __ and3(to, -8, to);
3624 __ and3(O4, -8, O4);
3625 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3626 __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3627 __ add(to, 8, to);
3628 __ add(O4, 8, O4);
3629 __ orn(G0, O3, O3); // O3 = ~O3 (negated edge mask for the trailing bytes)
3630 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3631 __ retl();
3632 __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3633
3634 return start;
3635 }
3636
// Generates the CBC-mode AES encryption stub. The input is processed in
// 16-byte blocks by one of three size-specific loops (128/192/256-bit keys);
// each plaintext block is XOR-ed with the previous ciphertext (kept live in
// F60/F62, seeded from rvec) before the AES rounds. The cipher length is
// saved in L0 on entry and returned in I0.
// In:  I0 = from, I1 = to, I2 = expanded key, I3 = init vector, I4 = length
3637 address generate_cipherBlockChaining_encryptAESCrypt() {
3638 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3639 "the following code assumes that first element of an int array is aligned to 8 bytes");
3640 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3641 "the following code assumes that first element of a byte array is aligned to 8 bytes");
3642 __ align(CodeEntryAlignment);
3643 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3644 Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
3645 Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
3646 Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
3647 Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
3648 address start = __ pc();
3649 Register from = I0; // source byte array
3650 Register to = I1; // destination byte array
3651 Register key = I2; // expanded key array
3652 Register rvec = I3; // init vector
3653 const Register len_reg = I4; // cipher length
3654 const Register keylen = I5; // reg for storing expanded key array length
3655
3656 __ save_frame(0);
3657 // save cipher len to return in the end
3658 __ mov(len_reg, L0);
3659
3660 // read expanded key length
3661 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3662
3663 // load initial vector, 8-byte alignment is guaranteed
3664 __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
3665 __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
3666 // load key, 8-byte alignment is guaranteed
// G1/G5 hold the first 16 key bytes for the integer-side whitening XOR below
3667 __ ldx(key,0,G1);
3668 __ ldx(key,8,G5);
3669
3670 // start loading expanded key, 8-byte alignment is guaranteed
3671 for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) {
3672 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3673 }
3674
3675 // 128-bit original key size
3676 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128);
3677
3678 for ( int i = 40, j = 176; i <= 46; i += 2, j += 8 ) {
3679 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3680 }
3681
3682 // 192-bit original key size
3683 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192);
3684
3685 for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) {
3686 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3687 }
3688
3689 // 256-bit original key size
3690 __ ba_short(L_cbcenc256);
3691
3692 __ align(OptoLoopAlignment);
3693 __ BIND(L_cbcenc128);
3694 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3695 __ andcc(from, 7, G0);
3696 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
3697 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3698
3699 // aligned case: load input into G3 and G4
3700 __ ldx(from,0,G3);
3701 __ ldx(from,8,G4);
3702 __ ba_short(L_128bit_transform);
3703
3704 __ BIND(L_load_misaligned_input_128bit);
3705 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3706 __ alignaddr(from, G0, from);
3707 __ ldf(FloatRegisterImpl::D, from, 0, F48);
3708 __ ldf(FloatRegisterImpl::D, from, 8, F50);
3709 __ ldf(FloatRegisterImpl::D, from, 16, F52);
3710 __ faligndata(F48, F50, F48);
3711 __ faligndata(F50, F52, F50);
3712 __ movdtox(F48, G3);
3713 __ movdtox(F50, G4);
3714 __ mov(L1, from);
3715
3716 __ BIND(L_128bit_transform);
// whitening XOR on the integer side, then CBC-chain with previous ciphertext (F60/F62)
3717 __ xor3(G1,G3,G3);
3718 __ xor3(G5,G4,G4);
3719 __ movxtod(G3,F56);
3720 __ movxtod(G4,F58);
3721 __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3722 __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3723
3724 // TEN_EROUNDS
3725 for ( int i = 0; i <= 32; i += 8 ) {
3726 __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3727 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3728 if (i != 32 ) {
3729 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3730 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3731 } else {
3732 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3733 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3734 }
3735 }
3736
3737 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3738 __ andcc(to, 7, L1);
3739 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
3740 __ delayed()->edge8n(to, G0, L2);
3741
3742 // aligned case: store output into the destination array
3743 __ stf(FloatRegisterImpl::D, F60, to, 0);
3744 __ stf(FloatRegisterImpl::D, F62, to, 8);
3745 __ ba_short(L_check_loop_end_128bit);
3746
3747 __ BIND(L_store_misaligned_output_128bit);
3748 __ add(to, 8, L3);
3749 __ mov(8, L4);
3750 __ sub(L4, L1, L4);
3751 __ alignaddr(L4, G0, L4);
3752 // save cipher text before circular right shift
3753 // as it needs to be stored as iv for next block (see code before next retl)
3754 __ movdtox(F60, L6);
3755 __ movdtox(F62, L7);
3756 __ faligndata(F60, F60, F60);
3757 __ faligndata(F62, F62, F62);
3758 __ mov(to, L5);
3759 __ and3(to, -8, to);
3760 __ and3(L3, -8, L3);
3761 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3762 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3763 __ add(to, 8, to);
3764 __ add(L3, 8, L3);
3765 __ orn(G0, L2, L2); // L2 = ~L2 (negated edge mask for the trailing bytes)
3766 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3767 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3768 __ mov(L5, to);
3769 __ movxtod(L6, F60);
3770 __ movxtod(L7, F62);
3771
3772 __ BIND(L_check_loop_end_128bit);
3773 __ add(from, 16, from);
3774 __ add(to, 16, to);
3775 __ subcc(len_reg, 16, len_reg);
3776 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
3777 __ delayed()->nop();
3778 // re-init initial vector for next block, 8-byte alignment is guaranteed
3779 __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3780 __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3781 __ mov(L0, I0);
3782 __ ret();
3783 __ delayed()->restore();
3784
3785 __ align(OptoLoopAlignment);
3786 __ BIND(L_cbcenc192);
3787 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3788 __ andcc(from, 7, G0);
3789 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
3790 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3791
3792 // aligned case: load input into G3 and G4
3793 __ ldx(from,0,G3);
3794 __ ldx(from,8,G4);
3795 __ ba_short(L_192bit_transform);
3796
3797 __ BIND(L_load_misaligned_input_192bit);
3798 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3799 __ alignaddr(from, G0, from);
3800 __ ldf(FloatRegisterImpl::D, from, 0, F48);
3801 __ ldf(FloatRegisterImpl::D, from, 8, F50);
3802 __ ldf(FloatRegisterImpl::D, from, 16, F52);
3803 __ faligndata(F48, F50, F48);
3804 __ faligndata(F50, F52, F50);
3805 __ movdtox(F48, G3);
3806 __ movdtox(F50, G4);
3807 __ mov(L1, from);
3808
3809 __ BIND(L_192bit_transform);
3810 __ xor3(G1,G3,G3);
3811 __ xor3(G5,G4,G4);
3812 __ movxtod(G3,F56);
3813 __ movxtod(G4,F58);
3814 __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3815 __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3816
3817 // TWELVE_EROUNDS
3818 for ( int i = 0; i <= 40; i += 8 ) {
3819 __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3820 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3821 if (i != 40 ) {
3822 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3823 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3824 } else {
3825 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3826 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3827 }
3828 }
3829
3830 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3831 __ andcc(to, 7, L1);
3832 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
3833 __ delayed()->edge8n(to, G0, L2);
3834
3835 // aligned case: store output into the destination array
3836 __ stf(FloatRegisterImpl::D, F60, to, 0);
3837 __ stf(FloatRegisterImpl::D, F62, to, 8);
3838 __ ba_short(L_check_loop_end_192bit);
3839
3840 __ BIND(L_store_misaligned_output_192bit);
3841 __ add(to, 8, L3);
3842 __ mov(8, L4);
3843 __ sub(L4, L1, L4);
3844 __ alignaddr(L4, G0, L4);
// save cipher text (iv for next block) before the circular shift
3845 __ movdtox(F60, L6);
3846 __ movdtox(F62, L7);
3847 __ faligndata(F60, F60, F60);
3848 __ faligndata(F62, F62, F62);
3849 __ mov(to, L5);
3850 __ and3(to, -8, to);
3851 __ and3(L3, -8, L3);
3852 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3853 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3854 __ add(to, 8, to);
3855 __ add(L3, 8, L3);
3856 __ orn(G0, L2, L2); // L2 = ~L2 (negated edge mask for the trailing bytes)
3857 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3858 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3859 __ mov(L5, to);
3860 __ movxtod(L6, F60);
3861 __ movxtod(L7, F62);
3862
3863 __ BIND(L_check_loop_end_192bit);
3864 __ add(from, 16, from);
3865 __ subcc(len_reg, 16, len_reg);
3866 __ add(to, 16, to);
3867 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
3868 __ delayed()->nop();
3869 // re-init initial vector for next block, 8-byte alignment is guaranteed
3870 __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3871 __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3872 __ mov(L0, I0);
3873 __ ret();
3874 __ delayed()->restore();
3875
3876 __ align(OptoLoopAlignment);
3877 __ BIND(L_cbcenc256);
3878 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3879 __ andcc(from, 7, G0);
3880 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
3881 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3882
3883 // aligned case: load input into G3 and G4
3884 __ ldx(from,0,G3);
3885 __ ldx(from,8,G4);
3886 __ ba_short(L_256bit_transform);
3887
3888 __ BIND(L_load_misaligned_input_256bit);
3889 // cannot clobber F48, F50 and F52. F56, F58 can be used though
3890 __ alignaddr(from, G0, from);
3891 __ movdtox(F60, L2); // save F60 before overwriting
3892 __ ldf(FloatRegisterImpl::D, from, 0, F56);
3893 __ ldf(FloatRegisterImpl::D, from, 8, F58);
3894 __ ldf(FloatRegisterImpl::D, from, 16, F60);
3895 __ faligndata(F56, F58, F56);
3896 __ faligndata(F58, F60, F58);
3897 __ movdtox(F56, G3);
3898 __ movdtox(F58, G4);
3899 __ mov(L1, from);
3900 __ movxtod(L2, F60);
3901
3902 __ BIND(L_256bit_transform);
3903 __ xor3(G1,G3,G3);
3904 __ xor3(G5,G4,G4);
3905 __ movxtod(G3,F56);
3906 __ movxtod(G4,F58);
3907 __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3908 __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3909
3910 // FOURTEEN_EROUNDS
3911 for ( int i = 0; i <= 48; i += 8 ) {
3912 __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3913 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3914 if (i != 48 ) {
3915 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3916 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3917 } else {
3918 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3919 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3920 }
3921 }
3922
3923 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3924 __ andcc(to, 7, L1);
3925 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
3926 __ delayed()->edge8n(to, G0, L2);
3927
3928 // aligned case: store output into the destination array
3929 __ stf(FloatRegisterImpl::D, F60, to, 0);
3930 __ stf(FloatRegisterImpl::D, F62, to, 8);
3931 __ ba_short(L_check_loop_end_256bit);
3932
3933 __ BIND(L_store_misaligned_output_256bit);
3934 __ add(to, 8, L3);
3935 __ mov(8, L4);
3936 __ sub(L4, L1, L4);
3937 __ alignaddr(L4, G0, L4);
// save cipher text (iv for next block) before the circular shift
3938 __ movdtox(F60, L6);
3939 __ movdtox(F62, L7);
3940 __ faligndata(F60, F60, F60);
3941 __ faligndata(F62, F62, F62);
3942 __ mov(to, L5);
3943 __ and3(to, -8, to);
3944 __ and3(L3, -8, L3);
3945 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3946 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3947 __ add(to, 8, to);
3948 __ add(L3, 8, L3);
3949 __ orn(G0, L2, L2); // L2 = ~L2 (negated edge mask for the trailing bytes)
3950 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3951 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3952 __ mov(L5, to);
3953 __ movxtod(L6, F60);
3954 __ movxtod(L7, F62);
3955
3956 __ BIND(L_check_loop_end_256bit);
3957 __ add(from, 16, from);
3958 __ subcc(len_reg, 16, len_reg);
3959 __ add(to, 16, to);
3960 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
3961 __ delayed()->nop();
3962 // re-init initial vector for next block, 8-byte alignment is guaranteed
3963 __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3964 __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3965 __ mov(L0, I0);
3966 __ ret();
3967 __ delayed()->restore();
3968
3969 return start;
3970 }
3971
3972 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
// Stub for CBC-mode AES decryption (fast path for the SunJCE provider).
// Arguments, read from the I-registers after save_frame():
//   I0 from         - source byte array (ciphertext)
//   I1 to           - destination byte array (plaintext)
//   I2 key          - SunJCE expanded key int[]; only its array length is read here
//   I3 rvec         - 16-byte initialization vector (8-byte aligned, see asserts below)
//   I4 len_reg      - cipher length in bytes; assumed a multiple of 16 — TODO confirm against caller
//   I5 original_key - original AES key bytes; the key schedule is recomputed below because
//                     the SunJCE decryption key schedule is not compatible with the SPARC
//                     AES instructions
// Returns the original cipher length in the caller's O0 (saved in L7, restored to I0 before ret).
// "Parallel": CBC decryption of distinct blocks is independent (each plaintext block only
// needs the previous *ciphertext* block), so the main loops below decrypt two 16-byte
// blocks per iteration to overlap the AES round instruction latencies.
3973 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3974 "the following code assumes that first element of an int array is aligned to 8 bytes");
3975 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3976 "the following code assumes that first element of a byte array is aligned to 8 bytes");
3977 __ align(CodeEntryAlignment);
3978 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3979 Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
3980 Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
3981 Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
3982 Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
3983 Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
3984 Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
3985 Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
3986 address start = __ pc();
3987 Register from = I0; // source byte array
3988 Register to = I1; // destination byte array
3989 Register key = I2; // expanded key array
3990 Register rvec = I3; // init vector
3991 const Register len_reg = I4; // cipher length
3992 const Register original_key = I5; // original key array only required during decryption
3993 const Register keylen = L6; // reg for storing expanded key array length
3994
3995 __ save_frame(0); //args are read from I* registers since we save the frame in the beginning
3996 // save cipher len to return in the end
3997 __ mov(len_reg, L7);
3998
3999 // load original key from SunJCE expanded decryption key
4000 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
4001 for ( int i = 0; i <= 3; i++ ) {
4002 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
4003 }
4004
4005 // load initial vector, 8-byte alignment is guaranteed
4006 __ ldx(rvec,0,L0);
4007 __ ldx(rvec,8,L1);
4008
4009 // read expanded key array length
// keylen holds the expanded-key int[] length: 44, 52 or 60 ints for
// 128-, 192- and 256-bit AES keys respectively (compared against below).
4010 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
4011
4012 // 256-bit original key size
4013 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
4014
4015 // 192-bit original key size
4016 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
4017
4018 // 128-bit original key size
4019 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
// Expanded 128-bit schedule ends up in F0..F42 (even double registers).
4020 for ( int i = 0; i <= 36; i += 4 ) {
4021 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
4022 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
4023 }
4024
4025 // load expanded key[last-1] and key[last] elements
// L2:L3 cache the final two round-key doublewords: every ciphertext block is
// XORed with them (AddRoundKey of the inverse cipher) before the dround chain.
4026 __ movdtox(F40,L2);
4027 __ movdtox(F42,L3);
4028
// If len is an even number of 16-byte blocks (bit 4 clear), go straight to the
// two-blocks-per-iteration loop; otherwise decrypt one block first so the
// remaining length becomes a multiple of 32.
4029 __ and3(len_reg, 16, L4);
4030 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
// NOTE(review): br_null_short is a short-branch form that handles its own delay
// slot; this nop looks like padding only — the 256-bit path after L_expand256bit
// omits it. Confirm before removing.
4031 __ nop();
4032
4033 __ ba_short(L_dec_first_block_start);
4034
4035 __ BIND(L_expand192bit);
4036 // load rest of the 192-bit key
4037 __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
4038 __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
4039
4040 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
// 192-bit schedule: three doublewords produced per iteration, ending in F0..F50.
4041 for ( int i = 0; i <= 36; i += 6 ) {
4042 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
4043 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
4044 __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
4045 }
4046 __ aes_kexpand1(F42, F46, 7, F48);
4047 __ aes_kexpand2(F44, F48, F50);
4048
4049 // load expanded key[last-1] and key[last] elements
4050 __ movdtox(F48,L2);
4051 __ movdtox(F50,L3);
4052
4053 __ and3(len_reg, 16, L4);
4054 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
4055 __ nop();
4056
4057 __ ba_short(L_dec_first_block_start);
4058
4059 __ BIND(L_expand256bit);
4060 // load rest of the 256-bit key
4061 for ( int i = 4; i <= 7; i++ ) {
4062 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
4063 }
4064
4065 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
// 256-bit schedule: four doublewords per iteration, ending in F0..F58 — note this
// occupies nearly the whole FP file, which forces the register juggling in the
// 256-bit loop below.
4066 for ( int i = 0; i <= 40; i += 8 ) {
4067 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
4068 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
4069 __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
4070 __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
4071 }
4072 __ aes_kexpand1(F48, F54, 6, F56);
4073 __ aes_kexpand2(F50, F56, F58);
4074
4075 // load expanded key[last-1] and key[last] elements
4076 __ movdtox(F56,L2);
4077 __ movdtox(F58,L3);
4078
4079 __ and3(len_reg, 16, L4);
4080 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
4081
// Single-block path: used once when the block count is odd, then control falls
// into one of the two-block loops for the remainder.
4082 __ BIND(L_dec_first_block_start);
4083 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4084 __ andcc(from, 7, G0);
4085 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
4086 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4087
4088 // aligned case: load input into L4 and L5
4089 __ ldx(from,0,L4);
4090 __ ldx(from,8,L5);
4091 __ ba_short(L_transform_first_block);
4092
4093 __ BIND(L_load_misaligned_input_first_block);
// Misaligned source (SPARC VIS1 idiom): alignaddr rounds 'from' down to an
// 8-byte boundary and latches the byte offset in %gsr; each faligndata then
// extracts 8 contiguous source bytes spanning two adjacent doublewords. Three
// loads are needed to cover 16 unaligned bytes.
4094 __ alignaddr(from, G0, from);
4095 // F58, F60, F62 can be clobbered
4096 __ ldf(FloatRegisterImpl::D, from, 0, F58);
4097 __ ldf(FloatRegisterImpl::D, from, 8, F60);
4098 __ ldf(FloatRegisterImpl::D, from, 16, F62);
4099 __ faligndata(F58, F60, F58);
4100 __ faligndata(F60, F62, F60);
4101 __ movdtox(F58, L4);
4102 __ movdtox(F60, L5);
4103 __ mov(G1, from);
4104
4105 __ BIND(L_transform_first_block);
// Initial AddRoundKey: XOR ciphertext with the last two expanded-key words
// (cached in L2:L3), then move into F60:F62 for the dround chain.
4106 __ xor3(L2,L4,G1);
4107 __ movxtod(G1,F60);
4108 __ xor3(L3,L5,G1);
4109 __ movxtod(G1,F62);
4110
// Round count depends on key size: 256-bit keys run the two extra round pairs
// below, 192-bit keys skip the first pair, 128-bit keys skip both.
4111 // 128-bit original key size
4112 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128);
4113
4114 // 192-bit original key size
4115 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192);
4116
4117 __ aes_dround23(F54, F60, F62, F58);
4118 __ aes_dround01(F52, F60, F62, F56);
4119 __ aes_dround23(F50, F56, F58, F62);
4120 __ aes_dround01(F48, F56, F58, F60);
4121
4122 __ BIND(L_dec_first_block192);
4123 __ aes_dround23(F46, F60, F62, F58);
4124 __ aes_dround01(F44, F60, F62, F56);
4125 __ aes_dround23(F42, F56, F58, F62);
4126 __ aes_dround01(F40, F56, F58, F60);
4127
4128 __ BIND(L_dec_first_block128);
// Remaining rounds: walk the expanded key backwards two rounds per iteration;
// the final pair uses the '_l' last-round instruction variants.
4129 for ( int i = 38; i >= 6; i -= 8 ) {
4130 __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4131 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4132 if ( i != 6) {
4133 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4134 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4135 } else {
4136 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4137 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4138 }
4139 }
4140
// CBC chaining: XOR with IV/previous ciphertext (L0:L1), then roll the saved
// ciphertext of this block (L4:L5) forward as the next block's chaining value.
4141 __ movxtod(L0,F56);
4142 __ movxtod(L1,F58);
4143 __ mov(L4,L0);
4144 __ mov(L5,L1);
4145 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4146 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4147
4148 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4149 __ andcc(to, 7, G1);
4150 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
4151 __ delayed()->edge8n(to, G0, G2); // G2 = partial-store byte mask for the unaligned start
4152
4153 // aligned case: store output into the destination array
4154 __ stf(FloatRegisterImpl::D, F60, to, 0);
4155 __ stf(FloatRegisterImpl::D, F62, to, 8);
4156 __ ba_short(L_check_decrypt_end);
4157
4158 __ BIND(L_store_misaligned_output_first_block);
// Misaligned destination: byte-rotate each result doubleword by (8 - to%8)
// via alignaddr/faligndata(reg,reg), then store the head bytes with a partial
// store (stpartialf) under the edge8n mask in G2, and the tail bytes into the
// following doubleword under the inverted mask (orn).
4159 __ add(to, 8, G3);
4160 __ mov(8, G4);
4161 __ sub(G4, G1, G4);
4162 __ alignaddr(G4, G0, G4);
4163 __ faligndata(F60, F60, F60);
4164 __ faligndata(F62, F62, F62);
4165 __ mov(to, G1);
4166 __ and3(to, -8, to);
4167 __ and3(G3, -8, G3);
4168 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4169 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4170 __ add(to, 8, to);
4171 __ add(G3, 8, G3);
4172 __ orn(G0, G2, G2);
4173 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4174 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4175 __ mov(G1, to);
4176
4177 __ BIND(L_check_decrypt_end);
4178 __ add(from, 16, from);
4179 __ add(to, 16, to);
4180 __ subcc(len_reg, 16, len_reg);
4181 __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
4182 __ delayed()->nop();
4183
// More data remains (now a multiple of 32 bytes): dispatch to the matching
// two-blocks-per-iteration loop by key size.
4184 // 256-bit original key size
4185 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256);
4186
4187 // 192-bit original key size
4188 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192);
4189
// ---- 128-bit key: decrypt 2 blocks (32 bytes) per iteration ----
4190 __ align(OptoLoopAlignment);
4191 __ BIND(L_dec_next2_blocks128);
4192 __ nop();
4193
4194 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4195 __ andcc(from, 7, G0);
4196 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
4197 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4198
4199 // aligned case: load input into G4, G5, L4 and L5
4200 __ ldx(from,0,G4);
4201 __ ldx(from,8,G5);
4202 __ ldx(from,16,L4);
4203 __ ldx(from,24,L5);
4204 __ ba_short(L_transform_next2_blocks128);
4205
4206 __ BIND(L_load_misaligned_next2_blocks128);
// Same VIS alignaddr/faligndata idiom as above, extended to 32 unaligned bytes
// (five doubleword loads, four faligndata merges).
4207 __ alignaddr(from, G0, from);
4208 // F40, F42, F58, F60, F62 can be clobbered
4209 __ ldf(FloatRegisterImpl::D, from, 0, F40);
4210 __ ldf(FloatRegisterImpl::D, from, 8, F42);
4211 __ ldf(FloatRegisterImpl::D, from, 16, F60);
4212 __ ldf(FloatRegisterImpl::D, from, 24, F62);
4213 __ ldf(FloatRegisterImpl::D, from, 32, F58);
4214 __ faligndata(F40, F42, F40);
4215 __ faligndata(F42, F60, F42);
4216 __ faligndata(F60, F62, F60);
4217 __ faligndata(F62, F58, F62);
4218 __ movdtox(F40, G4);
4219 __ movdtox(F42, G5);
4220 __ movdtox(F60, L4);
4221 __ movdtox(F62, L5);
4222 __ mov(G1, from);
4223
4224 __ BIND(L_transform_next2_blocks128);
4225 // F40:F42 used for first 16-bytes
4226 __ xor3(L2,G4,G1);
4227 __ movxtod(G1,F40);
4228 __ xor3(L3,G5,G1);
4229 __ movxtod(G1,F42);
4230
4231 // F60:F62 used for next 16-bytes
4232 __ xor3(L2,L4,G1);
4233 __ movxtod(G1,F60);
4234 __ xor3(L3,L5,G1);
4235 __ movxtod(G1,F62);
4236
// Interleave the round instructions of the two independent blocks so their
// latencies overlap; '_l' variants perform the final round.
4237 for ( int i = 38; i >= 6; i -= 8 ) {
4238 __ aes_dround23(as_FloatRegister(i), F40, F42, F44);
4239 __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46);
4240 __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4241 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4242 if (i != 6 ) {
4243 __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42);
4244 __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40);
4245 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4246 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4247 } else {
4248 __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42);
4249 __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40);
4250 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4251 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4252 }
4253 }
4254
// CBC chaining for block 1 uses L0:L1 (IV or previous ciphertext) ...
4255 __ movxtod(L0,F46);
4256 __ movxtod(L1,F44);
4257 __ fxor(FloatRegisterImpl::D, F46, F40, F40);
4258 __ fxor(FloatRegisterImpl::D, F44, F42, F42);
4259
// ... and block 2 uses block 1's ciphertext (G4:G5); block 2's ciphertext
// (L4:L5) becomes the chaining value for the next iteration.
4260 __ movxtod(G4,F56);
4261 __ movxtod(G5,F58);
4262 __ mov(L4,L0);
4263 __ mov(L5,L1);
4264 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4265 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4266
4267 // For mis-aligned store of 32 bytes of result we can do:
4268 // Circular right-shift all 4 FP registers so that 'head' and 'tail'
4269 // parts that need to be stored starting at mis-aligned address are in a FP reg
4270 // the other 3 FP regs can thus be stored using regular store
4271 // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts
4272
4273 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4274 __ andcc(to, 7, G1);
4275 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
4276 __ delayed()->edge8n(to, G0, G2);
4277
4278 // aligned case: store output into the destination array
4279 __ stf(FloatRegisterImpl::D, F40, to, 0);
4280 __ stf(FloatRegisterImpl::D, F42, to, 8);
4281 __ stf(FloatRegisterImpl::D, F60, to, 16);
4282 __ stf(FloatRegisterImpl::D, F62, to, 24);
4283 __ ba_short(L_check_decrypt_loop_end128);
4284
4285 __ BIND(L_store_misaligned_output_next2_blocks128);
4286 __ mov(8, G4);
4287 __ sub(G4, G1, G4);
4288 __ alignaddr(G4, G0, G4);
4289 __ faligndata(F40, F42, F56); // F56 can be clobbered
4290 __ faligndata(F42, F60, F42);
4291 __ faligndata(F60, F62, F60);
4292 __ faligndata(F62, F40, F40);
4293 __ mov(to, G1);
4294 __ and3(to, -8, to);
4295 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4296 __ stf(FloatRegisterImpl::D, F56, to, 8);
4297 __ stf(FloatRegisterImpl::D, F42, to, 16);
4298 __ stf(FloatRegisterImpl::D, F60, to, 24);
4299 __ add(to, 32, to);
4300 __ orn(G0, G2, G2);
4301 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4302 __ mov(G1, to);
4303
4304 __ BIND(L_check_decrypt_loop_end128);
4305 __ add(from, 32, from);
4306 __ add(to, 32, to);
4307 __ subcc(len_reg, 32, len_reg);
4308 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
4309 __ delayed()->nop();
4310 __ ba_short(L_cbcdec_end);
4311
// ---- 192-bit key: decrypt 2 blocks (32 bytes) per iteration ----
// Structurally identical to the 128-bit loop; only the scratch FP registers
// and the round count (i starts at 46) differ.
4312 __ align(OptoLoopAlignment);
4313 __ BIND(L_dec_next2_blocks192);
4314 __ nop();
4315
4316 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4317 __ andcc(from, 7, G0);
4318 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
4319 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4320
4321 // aligned case: load input into G4, G5, L4 and L5
4322 __ ldx(from,0,G4);
4323 __ ldx(from,8,G5);
4324 __ ldx(from,16,L4);
4325 __ ldx(from,24,L5);
4326 __ ba_short(L_transform_next2_blocks192);
4327
4328 __ BIND(L_load_misaligned_next2_blocks192);
4329 __ alignaddr(from, G0, from);
4330 // F48, F50, F52, F60, F62 can be clobbered
4331 __ ldf(FloatRegisterImpl::D, from, 0, F48);
4332 __ ldf(FloatRegisterImpl::D, from, 8, F50);
4333 __ ldf(FloatRegisterImpl::D, from, 16, F60);
4334 __ ldf(FloatRegisterImpl::D, from, 24, F62);
4335 __ ldf(FloatRegisterImpl::D, from, 32, F52);
4336 __ faligndata(F48, F50, F48);
4337 __ faligndata(F50, F60, F50);
4338 __ faligndata(F60, F62, F60);
4339 __ faligndata(F62, F52, F62);
4340 __ movdtox(F48, G4);
4341 __ movdtox(F50, G5);
4342 __ movdtox(F60, L4);
4343 __ movdtox(F62, L5);
4344 __ mov(G1, from);
4345
4346 __ BIND(L_transform_next2_blocks192);
4347 // F48:F50 used for first 16-bytes
4348 __ xor3(L2,G4,G1);
4349 __ movxtod(G1,F48);
4350 __ xor3(L3,G5,G1);
4351 __ movxtod(G1,F50);
4352
4353 // F60:F62 used for next 16-bytes
4354 __ xor3(L2,L4,G1);
4355 __ movxtod(G1,F60);
4356 __ xor3(L3,L5,G1);
4357 __ movxtod(G1,F62);
4358
4359 for ( int i = 46; i >= 6; i -= 8 ) {
4360 __ aes_dround23(as_FloatRegister(i), F48, F50, F52);
4361 __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54);
4362 __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4363 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4364 if (i != 6 ) {
4365 __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50);
4366 __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48);
4367 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4368 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4369 } else {
4370 __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50);
4371 __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48);
4372 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4373 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4374 }
4375 }
4376
4377 __ movxtod(L0,F54);
4378 __ movxtod(L1,F52);
4379 __ fxor(FloatRegisterImpl::D, F54, F48, F48);
4380 __ fxor(FloatRegisterImpl::D, F52, F50, F50);
4381
4382 __ movxtod(G4,F56);
4383 __ movxtod(G5,F58);
4384 __ mov(L4,L0);
4385 __ mov(L5,L1);
4386 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4387 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4388
4389 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4390 __ andcc(to, 7, G1);
4391 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
4392 __ delayed()->edge8n(to, G0, G2);
4393
4394 // aligned case: store output into the destination array
4395 __ stf(FloatRegisterImpl::D, F48, to, 0);
4396 __ stf(FloatRegisterImpl::D, F50, to, 8);
4397 __ stf(FloatRegisterImpl::D, F60, to, 16);
4398 __ stf(FloatRegisterImpl::D, F62, to, 24);
4399 __ ba_short(L_check_decrypt_loop_end192);
4400
4401 __ BIND(L_store_misaligned_output_next2_blocks192);
4402 __ mov(8, G4);
4403 __ sub(G4, G1, G4);
4404 __ alignaddr(G4, G0, G4);
4405 __ faligndata(F48, F50, F56); // F56 can be clobbered
4406 __ faligndata(F50, F60, F50);
4407 __ faligndata(F60, F62, F60);
4408 __ faligndata(F62, F48, F48);
4409 __ mov(to, G1);
4410 __ and3(to, -8, to);
4411 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4412 __ stf(FloatRegisterImpl::D, F56, to, 8);
4413 __ stf(FloatRegisterImpl::D, F50, to, 16);
4414 __ stf(FloatRegisterImpl::D, F60, to, 24);
4415 __ add(to, 32, to);
4416 __ orn(G0, G2, G2);
4417 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4418 __ mov(G1, to);
4419
4420 __ BIND(L_check_decrypt_loop_end192);
4421 __ add(from, 32, from);
4422 __ add(to, 32, to);
4423 __ subcc(len_reg, 32, len_reg);
4424 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
4425 __ delayed()->nop();
4426 __ ba_short(L_cbcdec_end);
4427
// ---- 256-bit key: decrypt 2 blocks (32 bytes) per iteration ----
// The 256-bit schedule fills F0..F58, so this loop must temporarily park
// F48..F54 in integer registers while the first round keys are replayed from
// the original key material (see comments inline).
4428 __ align(OptoLoopAlignment);
4429 __ BIND(L_dec_next2_blocks256);
4430 __ nop();
4431
4432 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4433 __ andcc(from, 7, G0);
4434 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
4435 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4436
4437 // aligned case: load input into G4, G5, L4 and L5
4438 __ ldx(from,0,G4);
4439 __ ldx(from,8,G5);
4440 __ ldx(from,16,L4);
4441 __ ldx(from,24,L5);
4442 __ ba_short(L_transform_next2_blocks256);
4443
4444 __ BIND(L_load_misaligned_next2_blocks256);
4445 __ alignaddr(from, G0, from);
4446 // F0, F2, F4, F60, F62 can be clobbered
4447 __ ldf(FloatRegisterImpl::D, from, 0, F0);
4448 __ ldf(FloatRegisterImpl::D, from, 8, F2);
4449 __ ldf(FloatRegisterImpl::D, from, 16, F60);
4450 __ ldf(FloatRegisterImpl::D, from, 24, F62);
4451 __ ldf(FloatRegisterImpl::D, from, 32, F4);
4452 __ faligndata(F0, F2, F0);
4453 __ faligndata(F2, F60, F2);
4454 __ faligndata(F60, F62, F60);
4455 __ faligndata(F62, F4, F62);
4456 __ movdtox(F0, G4);
4457 __ movdtox(F2, G5);
4458 __ movdtox(F60, L4);
4459 __ movdtox(F62, L5);
4460 __ mov(G1, from);
4461
4462 __ BIND(L_transform_next2_blocks256);
4463 // F0:F2 used for first 16-bytes
4464 __ xor3(L2,G4,G1);
4465 __ movxtod(G1,F0);
4466 __ xor3(L3,G5,G1);
4467 __ movxtod(G1,F2);
4468
4469 // F60:F62 used for next 16-bytes
4470 __ xor3(L2,L4,G1);
4471 __ movxtod(G1,F60);
4472 __ xor3(L3,L5,G1);
4473 __ movxtod(G1,F62);
4474
// First two round pairs use F48..F54 directly, before those registers are
// repurposed below.
4475 __ aes_dround23(F54, F0, F2, F4);
4476 __ aes_dround01(F52, F0, F2, F6);
4477 __ aes_dround23(F54, F60, F62, F58);
4478 __ aes_dround01(F52, F60, F62, F56);
4479 __ aes_dround23(F50, F6, F4, F2);
4480 __ aes_dround01(F48, F6, F4, F0);
4481 __ aes_dround23(F50, F56, F58, F62);
4482 __ aes_dround01(F48, F56, F58, F60);
4483 // save F48:F54 in temp registers
// These four round-key doublewords must survive while F48..F54 are reloaded
// with the original key for the final rounds below.
4484 __ movdtox(F54,G2);
4485 __ movdtox(F52,G3);
4486 __ movdtox(F50,G6);
4487 __ movdtox(F48,G1);
4488 for ( int i = 46; i >= 14; i -= 8 ) {
4489 __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
4490 __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
4491 __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4492 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4493 __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
4494 __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
4495 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4496 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4497 }
4498 // init F48:F54 with F0:F6 values (original key)
// The first four expanded-key doublewords equal the raw key words, which are
// still available in the original_key array, so reload them from memory
// (F0..F6 themselves hold live cipher state at this point).
4499 __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
4500 __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
4501 __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
4502 __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
4503 __ aes_dround23(F54, F0, F2, F4);
4504 __ aes_dround01(F52, F0, F2, F6);
4505 __ aes_dround23(F54, F60, F62, F58);
4506 __ aes_dround01(F52, F60, F62, F56);
4507 __ aes_dround23_l(F50, F6, F4, F2);
4508 __ aes_dround01_l(F48, F6, F4, F0);
4509 __ aes_dround23_l(F50, F56, F58, F62);
4510 __ aes_dround01_l(F48, F56, F58, F60);
4511 // re-init F48:F54 with their original values
4512 __ movxtod(G2,F54);
4513 __ movxtod(G3,F52);
4514 __ movxtod(G6,F50);
4515 __ movxtod(G1,F48);
4516
4517 __ movxtod(L0,F6);
4518 __ movxtod(L1,F4);
4519 __ fxor(FloatRegisterImpl::D, F6, F0, F0);
4520 __ fxor(FloatRegisterImpl::D, F4, F2, F2);
4521
4522 __ movxtod(G4,F56);
4523 __ movxtod(G5,F58);
4524 __ mov(L4,L0);
4525 __ mov(L5,L1);
4526 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4527 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4528
4529 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4530 __ andcc(to, 7, G1);
4531 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
4532 __ delayed()->edge8n(to, G0, G2);
4533
4534 // aligned case: store output into the destination array
4535 __ stf(FloatRegisterImpl::D, F0, to, 0);
4536 __ stf(FloatRegisterImpl::D, F2, to, 8);
4537 __ stf(FloatRegisterImpl::D, F60, to, 16);
4538 __ stf(FloatRegisterImpl::D, F62, to, 24);
4539 __ ba_short(L_check_decrypt_loop_end256);
4540
4541 __ BIND(L_store_misaligned_output_next2_blocks256);
4542 __ mov(8, G4);
4543 __ sub(G4, G1, G4);
4544 __ alignaddr(G4, G0, G4);
4545 __ faligndata(F0, F2, F56); // F56 can be clobbered
4546 __ faligndata(F2, F60, F2);
4547 __ faligndata(F60, F62, F60);
4548 __ faligndata(F62, F0, F0);
4549 __ mov(to, G1);
4550 __ and3(to, -8, to);
4551 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4552 __ stf(FloatRegisterImpl::D, F56, to, 8);
4553 __ stf(FloatRegisterImpl::D, F2, to, 16);
4554 __ stf(FloatRegisterImpl::D, F60, to, 24);
4555 __ add(to, 32, to);
4556 __ orn(G0, G2, G2);
4557 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4558 __ mov(G1, to);
4559
4560 __ BIND(L_check_decrypt_loop_end256);
4561 __ add(from, 32, from);
4562 __ add(to, 32, to);
4563 __ subcc(len_reg, 32, len_reg);
4564 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
4565 __ delayed()->nop();
4566
4567 __ BIND(L_cbcdec_end);
4568 // re-init intial vector for next block, 8-byte alignment is guaranteed
// L0:L1 hold the last ciphertext block processed; writing it back to rvec lets
// a subsequent call continue the CBC chain.
4569 __ stx(L0, rvec, 0);
4570 __ stx(L1, rvec, 8);
// return the saved cipher length (L7) in the caller's O0
4571 __ mov(L7, I0);
4572 __ ret();
4573 __ delayed()->restore();
4574
4575 return start;
4576 }
4577
3307 void generate_initial() { 4578 void generate_initial() {
3308 // Generates all stubs and initializes the entry points 4579 // Generates all stubs and initializes the entry points
3309 4580
3310 //------------------------------------------------------------------------------------------------------------------------ 4581 //------------------------------------------------------------------------------------------------------------------------
3311 // entry points that exist in all platforms 4582 // entry points that exist in all platforms
3366 &StubRoutines::_safefetch32_fault_pc, 4637 &StubRoutines::_safefetch32_fault_pc,
3367 &StubRoutines::_safefetch32_continuation_pc); 4638 &StubRoutines::_safefetch32_continuation_pc);
3368 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4639 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
3369 &StubRoutines::_safefetchN_fault_pc, 4640 &StubRoutines::_safefetchN_fault_pc,
3370 &StubRoutines::_safefetchN_continuation_pc); 4641 &StubRoutines::_safefetchN_continuation_pc);
4642
4643 // generate AES intrinsics code
4644 if (UseAESIntrinsics) {
4645 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4646 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4647 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4648 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
4649 }
3371 } 4650 }
3372 4651
3373 4652
3374 public: 4653 public:
3375 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4654 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {