src/cpu/x86/vm/stubGenerator_x86_64.cpp @ 6894:a3ecd773a7b9

7184394: add intrinsics to use AES instructions
Summary: Use new x86 AES instructions for AESCrypt.
Reviewed-by: twisti, kvn, roland
Contributed-by: tom.deneau@amd.com
author kvn
date Wed, 24 Oct 2012 14:33:22 -0700
parents d8ce2825b193
children e522a00b91aa f34d701e952e cd3d6a6b95d9
      __ addq(rsp, 8);
      __ ret(0);
    }
  }

  // AES intrinsic stubs
  enum {AESBlockSize = 16};

  address generate_key_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
    address start = __ pc();
    __ emit_data64( 0x0405060700010203, relocInfo::none );
    __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
    return start;
  }
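  // For illustration only: a scalar sketch of what pshufb with the mask above
  // computes (this helper is hypothetical, not part of the stub set). Each
  // 32-bit word of the 128-bit lane is byte-reversed, which puts the ints of
  // the Java expanded key into the byte order the AES instructions expect.
  static void key_shuffle_reference(unsigned char dst[16], const unsigned char src[16]) {
    for (int i = 0; i < 16; i++) {
      dst[i] = src[(i & ~3) | (3 - (i & 3))];  // mirror the bytes within each 4-byte word
    }
  }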

  // Utility routine for loading a 128-bit key word in little endian format
  // can optionally specify that the shuffle mask is already in an XMM register
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    __ movdqu(xmmdst, Address(key, offset));
    if (xmm_shuf_mask != NULL) {
      __ pshufb(xmmdst, xmm_shuf_mask);
    } else {
      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    }
  }
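  // Note: load_key uses movdqu rather than the load-execute form (aesenc/aesdec
  // with a memory operand) because legacy SSE encodings of 128-bit memory
  // operands fault unless 16-byte aligned, and the Java key array gives no
  // such alignment guarantee.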

  // aesenc using specified key+offset
  // can optionally specify that the shuffle mask is already in an XMM register
  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesenc(xmmdst, xmmtmp);
  }

  // aesdec using specified key+offset
  // can optionally specify that the shuffle mask is already in an XMM register
  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesdec(xmmdst, xmmtmp);
  }
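  // Semantics of the instructions used above: aesenc performs one full AES
  // round (ShiftRows, SubBytes, MixColumns, then XOR with the round-key
  // operand) and aesenclast the final round, which omits MixColumns; aesdec
  // and aesdeclast are the inverse-cipher counterparts and expect round keys
  // in "equivalent inverse cipher" form, which the Java expanded decryption
  // key evidently already provides, since the stubs below apply no aesimc.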


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from   = c_rarg0;  // source array address
    const Register to     = c_rarg1;  // destination array address
    const Register key    = c_rarg2;  // key array address
    const Register keylen = rax;

    const XMMRegister xmm_result        = xmm0;
    const XMMRegister xmm_temp          = xmm1;
    const XMMRegister xmm_key_shuf_mask = xmm2;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    // keylen = # of 32-bit words, convert to 128-bit words
    __ shrl(keylen, 2);
    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
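    // The expanded key holds 4*(rounds+1) ints: 44 for AES-128 (10 rounds),
    // 52 for AES-192 (12 rounds), 60 for AES-256 (14 rounds). After the shift
    // and subtract, keylen is 0, 2 or 4: the number of 128-bit round keys
    // beyond the 11 every key size has, which drives the two branches below.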

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input

    // For encryption, the java expanded key ordering is just what we need
    // we don't know if the key is aligned, hence not using load-execute form

    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp);
    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
    }
    load_key(xmm_temp, key, 0xa0, xmm_key_shuf_mask);
    __ cmpl(keylen, 0);
    __ jcc(Assembler::equal, L_doLast);
    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
    __ subl(keylen, 2);
    __ jcc(Assembler::equal, L_doLast);
    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesenclast(xmm_result, xmm_temp);
    __ movdqu(Address(to, 0), xmm_result);  // store the result
    __ xorptr(rax, rax);                    // return 0
    __ leave();                             // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from   = c_rarg0;  // source array address
    const Register to     = c_rarg1;  // destination array address
    const Register key    = c_rarg2;  // key array address
    const Register keylen = rax;

    const XMMRegister xmm_result        = xmm0;
    const XMMRegister xmm_temp          = xmm1;
    const XMMRegister xmm_key_shuf_mask = xmm2;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    // keylen = # of 32-bit words, convert to 128-bit words
    __ shrl(keylen, 2);
    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));

    // for decryption, the Java expanded key ordering is rotated one position from what we want,
    // so we start from 0x10 here and hit 0x00 last
    // we don't know if the key is aligned, hence not using load-execute form
    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
    __ pxor  (xmm_result, xmm_temp);
    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
    }
    __ cmpl(keylen, 0);
    __ jcc(Assembler::equal, L_doLast);
    // only in 192 and 256 bit keys
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
    __ subl(keylen, 2);
    __ jcc(Assembler::equal, L_doLast);
    // only in 256 bit keys
    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    // for decryption the aesdeclast operation is always on key+0x00
    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
    __ aesdeclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, 0), xmm_result);  // store the result

    __ xorptr(rax, rax);  // return 0
    __ leave();           // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg = r10;      // pick the first volatile windows register
#endif
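    // (Win64 passes only the first four integer arguments in registers and the
    // caller reserves a 32-byte home area above the return address, so once
    // enter() has pushed rbp the fifth argument sits at rsp + 6*wordSize:
    // saved rbp + return address + four home slots.)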
    const Register pos     = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    // keys 0-10 preloaded into xmm2-xmm12
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 12;
    const XMMRegister xmm_key0  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved 6-12
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 2 thru 12 with key 0x00 - 0xa0
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }

    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 44=128-bit, 52=192-bit, 60=256-bit)
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128 bit code follows here
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key10);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_128);

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);   // final value of r stored in rvec of CipherBlockChaining object

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movl(rax, 0);  // return 0 (why?)
    __ leave();       // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
    load_key(xmm_temp, key, 0xc0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
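  // A self-contained scalar sketch of the chaining implemented above, for
  // illustration only: cbc_encrypt_ref and its aes_encrypt_block callback are
  // hypothetical stand-ins for the hardware rounds, not part of HotSpot. Each
  // ciphertext block feeds the XOR of the next block, so CBC encryption is
  // inherently serial; CBC decryption (next stub) has no such dependency and
  // can run several blocks at a time.
  static void cbc_encrypt_ref(unsigned char* to, const unsigned char* from, int len,
                              unsigned char rvec[16],
                              void (*aes_encrypt_block)(unsigned char block[16])) {
    for (int pos = 0; pos < len; pos += 16) {
      for (int i = 0; i < 16; i++) rvec[i] ^= from[pos + i];  // xor plaintext with r vector
      aes_encrypt_block(rvec);                                // r = E_K(r) is the ciphertext
      for (int i = 0; i < 16; i++) to[pos + i] = rvec[i];     // store; r chains into next block
    }
  }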



  // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
  // to hide instruction latency
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //

  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256;
    Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg = r10;      // pick the first volatile windows register
#endif
    const Register pos     = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    // keys 0-10 preloaded into xmm5-xmm15
    const int XMM_REG_NUM_KEY_FIRST = 5;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved 6-15
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif
    // the java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }

    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm0;
    const XMMRegister xmm_result1 = xmm2;
    const XMMRegister xmm_result2 = xmm3;
    const XMMRegister xmm_result3 = xmm4;

    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 44=128-bit, 52=192-bit, 60=256-bit)
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);


    // 128-bit code follows here, parallelized
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_multiBlock_loopTop_128);
    __ cmpptr(len_reg, 4*AESBlockSize);   // see if at least 4 blocks left
    __ jcc(Assembler::less, L_singleBlock_loopTop_128);

    __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize));   // get next 4 blocks into xmm result registers
    __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize));
    __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize));
    __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize));

#define DoFour(opc, src_reg)        \
    __ opc(xmm_result0, src_reg);   \
    __ opc(xmm_result1, src_reg);   \
    __ opc(xmm_result2, src_reg);   \
    __ opc(xmm_result3, src_reg);
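    // DoFour interleaves the same round across four independent blocks:
    // aesdec has a multi-cycle latency but can start a new operation every
    // cycle or two (microarchitecture dependent), so keeping four dependency
    // chains in flight hides most of each round's latency.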

    DoFour(pxor, xmm_key_first);
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      DoFour(aesdec, as_XMMRegister(rnum));
    }
    DoFour(aesdeclast, xmm_key_last);
    // for each result, xor with the r vector of previous cipher block
    __ pxor(xmm_result0, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
    __ pxor(xmm_result1, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
    __ pxor(xmm_result2, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
    __ pxor(xmm_result3, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));   // this will carry over to next set of blocks

    __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);   // store 4 results into the next 64 bytes of output
    __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
    __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
    __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);

    __ addptr(pos, 4*AESBlockSize);
    __ subptr(len_reg, 4*AESBlockSize);
    __ jmp(L_multiBlock_loopTop_128);

    // registers used in the non-parallelized loops
    const XMMRegister xmm_prev_block_cipher_save = xmm2;
    const XMMRegister xmm_temp                   = xmm3;

    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_128);
    __ cmpptr(len_reg, 0);   // any blocks left??
    __ jcc(Assembler::equal, L_exit);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);                // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    __ aesdeclast(xmm_result, xmm_key_last);
    __ pxor  (xmm_result, xmm_prev_block_cipher);                     // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);     // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jmp(L_singleBlock_loopTop_128);


    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);   // final value of r stored in rvec of CipherBlockChaining object
#ifdef _WIN64
    // restore regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movl(rax, 0);  // return 0 (why?)
    __ leave();       // required for proper stackwalking of RuntimeStub frame
    __ ret(0);


    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be optimized to use parallelism)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_192);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);                // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);   // 192-bit key goes up to c0
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
    __ aesdeclast(xmm_result, xmm_key_last);        // xmm15 always came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);   // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);     // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_singleBlock_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be optimized to use parallelism)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_256);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);                // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);   // 256-bit key goes up to e0
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
    aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
    aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
    __ aesdeclast(xmm_result, xmm_key_last);        // xmm15 came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);   // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);     // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_singleBlock_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
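  // The matching scalar sketch for CBC decryption, again illustration only
  // (cbc_decrypt_ref and aes_decrypt_block are hypothetical): each plaintext
  // block depends only on two ciphertext blocks, P[i] = D_K(C[i]) ^ C[i-1],
  // which is what lets the loop above keep four blocks in flight.
  static void cbc_decrypt_ref(unsigned char* to, const unsigned char* from, int len,
                              unsigned char rvec[16],
                              void (*aes_decrypt_block)(unsigned char block[16])) {
    for (int pos = 0; pos < len; pos += 16) {
      unsigned char cipher[16], block[16];
      for (int i = 0; i < 16; i++) block[i] = cipher[i] = from[pos + i];  // save C[i] and a working copy
      aes_decrypt_block(block);                                           // block = D_K(C[i])
      for (int i = 0; i < 16; i++) to[pos + i] = block[i] ^ rvec[i];      // P[i] = D_K(C[i]) ^ C[i-1]
      for (int i = 0; i < 16; i++) rvec[i] = cipher[i];                   // C[i] becomes the next r vector
    }
  }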


#undef __
#define __ masm->

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception

  ...

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    generate_math_stubs();

    // don't bother generating these AES intrinsic stubs unless global flag is set
    if (UseAESIntrinsics) {
      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others

      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
    }
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {