comparison: src/cpu/x86/vm/stubGenerator_x86_64.cpp @ 6894:a3ecd773a7b9

7184394: add intrinsics to use AES instructions
Summary: Use new x86 AES instructions for AESCrypt.
Reviewed-by: twisti, kvn, roland
Contributed-by: tom.deneau@amd.com

author:   kvn
date:     Wed, 24 Oct 2012 14:33:22 -0700
parents:  d8ce2825b193
children: e522a00b91aa f34d701e952e cd3d6a6b95d9
comparing 6893:b2c669fd8114 with 6894:a3ecd773a7b9. The change is a pure insertion; the listing below shows the new revision's code, with unchanged context before and after the inserted AES stubs.
      __ addq(rsp, 8);
      __ ret(0);
    }
  }

  // AES intrinsic stubs
  enum {AESBlockSize = 16};

  address generate_key_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
    address start = __ pc();
    __ emit_data64( 0x0405060700010203, relocInfo::none );
    __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
    return start;
  }
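  // --------------------------------------------------------------------------
  // Illustrative sketch, not part of this changeset: what the key shuffle mask
  // above does. PSHUFB replaces destination byte i with source byte mask[i].
  // Stored little-endian, the two emit_data64 values lay down the mask bytes
  // 03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c, so the shuffle reverses
  // the bytes within each 32-bit word; load_key below uses it to bring the
  // int-array layout of the Java expanded key into the byte order the AES
  // instructions consume. The helper name pshufb16 is ours, purely for
  // illustration.
  #include <cstdint>
  #include <cstdio>

  // scalar model of PSHUFB for 16-byte operands (the zeroing bit in a mask
  // byte's high nibble is not exercised by this particular mask)
  static void pshufb16(uint8_t dst[16], const uint8_t src[16], const uint8_t mask[16]) {
    uint8_t tmp[16];
    for (int i = 0; i < 16; i++) tmp[i] = src[mask[i] & 0x0f];
    for (int i = 0; i < 16; i++) dst[i] = tmp[i];
  }

  int main() {
    const uint8_t mask[16]     = {0x03,0x02,0x01,0x00, 0x07,0x06,0x05,0x04,
                                  0x0b,0x0a,0x09,0x08, 0x0f,0x0e,0x0d,0x0c};
    const uint8_t key_word[16] = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
    uint8_t shuffled[16];
    pshufb16(shuffled, key_word, mask);
    for (int i = 0; i < 16; i++) printf("%02x ", shuffled[i]);  // 03 02 01 00 07 06 05 04 ...
    printf("\n");
    return 0;
  }
  // --------------------------------------------------------------------------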

  // Utility routine for loading a 128-bit key word in little-endian format;
  // the shuffle mask may optionally be supplied already loaded in an XMM register
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    __ movdqu(xmmdst, Address(key, offset));
    if (xmm_shuf_mask != NULL) {
      __ pshufb(xmmdst, xmm_shuf_mask);
    } else {
      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    }
  }

  // aesenc using specified key+offset;
  // the shuffle mask may optionally be supplied already loaded in an XMM register
  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesenc(xmmdst, xmmtmp);
  }

  // aesdec using specified key+offset;
  // the shuffle mask may optionally be supplied already loaded in an XMM register
  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesdec(xmmdst, xmmtmp);
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from   = c_rarg0;  // source array address
    const Register to     = c_rarg1;  // destination array address
    const Register key    = c_rarg2;  // key array address
    const Register keylen = rax;

    const XMMRegister xmm_result        = xmm0;
    const XMMRegister xmm_temp          = xmm1;
    const XMMRegister xmm_key_shuf_mask = xmm2;

    __ enter();  // required for proper stackwalking of RuntimeStub frame

    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    // keylen = # of 32-bit words, convert to 128-bit words
    __ shrl(keylen, 2);
    __ subl(keylen, 11);  // every key has at least 11 128-bit words, some have more

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input

    // For encryption, the Java expanded key ordering is just what we need;
    // we don't know if the key is aligned, hence not using the load-execute form

    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp);
    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
    }
    load_key(xmm_temp, key, 0xa0, xmm_key_shuf_mask);
    __ cmpl(keylen, 0);
    __ jcc(Assembler::equal, L_doLast);
    __ aesenc(xmm_result, xmm_temp);  // only in 192 and 256 bit keys
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
    __ subl(keylen, 2);
    __ jcc(Assembler::equal, L_doLast);
    __ aesenc(xmm_result, xmm_temp);  // only in 256 bit keys
    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesenclast(xmm_result, xmm_temp);
    __ movdqu(Address(to, 0), xmm_result);  // store the result
    __ xorptr(rax, rax);  // return 0
    __ leave();           // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
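  // --------------------------------------------------------------------------
  // Illustrative sketch, not part of this changeset: the arithmetic behind the
  // keylen branches above. The AESCrypt.KLE int array whose length is read at
  // the top of the stub has 4*(rounds+1) words: 44 ints for AES-128 (10
  // rounds), 52 for AES-192 (12), 60 for AES-256 (14). shrl(keylen, 2) followed
  // by subl(keylen, 11) therefore leaves 0, 2 or 4, which selects how many
  // extra aesenc rounds run before the final aesenclast. The helper name
  // extra_rounds is ours, purely for illustration.
  constexpr int extra_rounds(int key_array_ints) {
    return key_array_ints / 4 - 11;  // mirrors shrl(keylen, 2); subl(keylen, 11)
  }

  static_assert(extra_rounds(44) == 0, "AES-128: jump straight to L_doLast, aesenclast uses key+0xa0");
  static_assert(extra_rounds(52) == 2, "AES-192: two extra aesenc rounds (0xa0, 0xb0), aesenclast uses 0xc0");
  static_assert(extra_rounds(60) == 4, "AES-256: four extra aesenc rounds (0xa0..0xd0), aesenclast uses 0xe0");
  // --------------------------------------------------------------------------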


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from   = c_rarg0;  // source array address
    const Register to     = c_rarg1;  // destination array address
    const Register key    = c_rarg2;  // key array address
    const Register keylen = rax;

    const XMMRegister xmm_result        = xmm0;
    const XMMRegister xmm_temp          = xmm1;
    const XMMRegister xmm_key_shuf_mask = xmm2;

    __ enter();  // required for proper stackwalking of RuntimeStub frame

    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    // keylen = # of 32-bit words, convert to 128-bit words
    __ shrl(keylen, 2);
    __ subl(keylen, 11);  // every key has at least 11 128-bit words, some have more

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));

    // For decryption, the Java expanded key ordering is rotated one position from what we want,
    // so we start from 0x10 here and hit 0x00 last;
    // we don't know if the key is aligned, hence not using the load-execute form
    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
    __ pxor  (xmm_result, xmm_temp);
    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
    }
    __ cmpl(keylen, 0);
    __ jcc(Assembler::equal, L_doLast);
    // only in 192 and 256 bit keys
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
    __ subl(keylen, 2);
    __ jcc(Assembler::equal, L_doLast);
    // only in 256 bit keys
    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    // for decryption the aesdeclast operation is always on key+0x00
    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
    __ aesdeclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, 0), xmm_result);  // store the result

    __ xorptr(rax, rax);  // return 0
    __ leave();           // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
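  // --------------------------------------------------------------------------
  // Illustrative sketch, not part of this changeset: the order in which the
  // decrypt stub above consumes round-key offsets. Because the Java expanded
  // decryption key is rotated one position from what AESDEC wants, the initial
  // xor uses offset 0x10, the aesdec rounds walk upward from 0x20, and
  // aesdeclast always uses offset 0x00. The helper name decrypt_key_offsets is
  // ours, purely for illustration; rounds is 10, 12 or 14 for 128/192/256-bit
  // keys.
  #include <vector>

  static std::vector<int> decrypt_key_offsets(int rounds) {
    std::vector<int> offsets;
    offsets.push_back(0x10);                 // initial pxor
    for (int r = 1; r < rounds; r++)
      offsets.push_back(0x10 + r * 0x10);    // aesdec rounds: 0x20, 0x30, ...
    offsets.push_back(0x00);                 // aesdeclast
    return offsets;
  }

  // decrypt_key_offsets(10) yields 0x10, 0x20 .. 0xa0, 0x00, matching the
  // 128-bit path above; 12 and 14 additionally visit 0xb0/0xc0 and 0xb0 .. 0xe0
  // before the final 0x00.
  // --------------------------------------------------------------------------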


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg = r10;      // pick the first volatile Windows register
#endif
    const Register pos     = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    // keys 0-10 preloaded into xmm2-xmm12
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 12;
    const XMMRegister xmm_key0  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter();  // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved (6-12)
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 2 thru 12 with keys 0x00 - 0xa0
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }

    __ movdqu(xmm_result, Address(rvec, 0x00));  // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 44=128-bit, 52=192-bit, 60=256-bit)
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128-bit code follows here
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key10);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_128);

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);  // final value of r stored in rvec of CipherBlockChaining object

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movl(rax, 0);  // return 0 (why?)
    __ leave();       // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
    load_key(xmm_temp, key, 0xc0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
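  // --------------------------------------------------------------------------
  // Illustrative sketch, not part of this changeset: the chaining structure the
  // CBC-encrypt stub above implements. Each block's input is xor'ed with the
  // previous block's ciphertext (the "r" vector), so every iteration depends on
  // the one before it, which is why the encrypt loop stays one block at a time
  // while the decrypt stub below runs four. The names cbc_encrypt_ref,
  // block_cipher_fn and encrypt_block are ours, purely for illustration;
  // encrypt_block stands in for the chain of aesenc/aesenclast rounds.
  #include <cstddef>
  #include <cstdint>
  #include <cstring>

  typedef void (*block_cipher_fn)(const uint8_t in[16], uint8_t out[16]);

  static void cbc_encrypt_ref(const uint8_t* from, uint8_t* to, size_t len,
                              uint8_t rvec[16], block_cipher_fn encrypt_block) {
    uint8_t r[16];
    std::memcpy(r, rvec, 16);                    // xmm_result starts as the r vector
    for (size_t pos = 0; pos < len; pos += 16) { // len is a multiple of AESBlockSize
      uint8_t block[16];
      for (int i = 0; i < 16; i++)
        block[i] = from[pos + i] ^ r[i];         // xor next input block with current r
      encrypt_block(block, to + pos);            // AES rounds (serial dependency)
      std::memcpy(r, to + pos, 16);              // ciphertext becomes the next r
    }
    std::memcpy(rvec, r, 16);                    // final r written back, as at L_exit
  }
  // --------------------------------------------------------------------------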



  // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
  // to hide instruction latency
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //

  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256;
    Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg = r10;      // pick the first volatile Windows register
#endif
    const Register pos     = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    // keys 0-10 preloaded into xmm5-xmm15
    const int XMM_REG_NUM_KEY_FIRST = 5;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter();  // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved (6-15)
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif
    // the Java expanded key ordering is rotated one position from what we want,
    // so we start from 0x10 here and hit 0x00 last
    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 5 thru 15 with key offsets 0x10 through 0xa0, then 0x00 last
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }

    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm0;
    const XMMRegister xmm_result1 = xmm2;
    const XMMRegister xmm_result2 = xmm3;
    const XMMRegister xmm_result3 = xmm4;

    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // initialize with initial rvec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 44=128-bit, 52=192-bit, 60=256-bit)
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);


    // 128-bit code follows here, parallelized
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_multiBlock_loopTop_128);
    __ cmpptr(len_reg, 4*AESBlockSize);  // see if at least 4 blocks left
    __ jcc(Assembler::less, L_singleBlock_loopTop_128);

    __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize));  // get next 4 blocks into xmm_result registers
    __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize));
    __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize));
    __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize));

#define DoFour(opc, src_reg)       \
    __ opc(xmm_result0, src_reg);  \
    __ opc(xmm_result1, src_reg);  \
    __ opc(xmm_result2, src_reg);  \
    __ opc(xmm_result3, src_reg);

    DoFour(pxor, xmm_key_first);
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      DoFour(aesdec, as_XMMRegister(rnum));
    }
    DoFour(aesdeclast, xmm_key_last);
    // for each result, xor with the r vector of previous cipher block
    __ pxor(xmm_result0, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
    __ pxor(xmm_result1, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
    __ pxor(xmm_result2, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
    __ pxor(xmm_result3, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));  // this will carry over to next set of blocks

    __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);  // store 4 results into the next 64 bytes of output
    __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
    __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
    __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);

    __ addptr(pos, 4*AESBlockSize);
    __ subptr(len_reg, 4*AESBlockSize);
    __ jmp(L_multiBlock_loopTop_128);

    // registers used in the non-parallelized loops
    const XMMRegister xmm_prev_block_cipher_save = xmm2;
    const XMMRegister xmm_temp                   = xmm3;

    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_128);
    __ cmpptr(len_reg, 0);  // any blocks left?
    __ jcc(Assembler::equal, L_exit);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);               // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);                            // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    __ aesdeclast(xmm_result, xmm_key_last);
    __ pxor  (xmm_result, xmm_prev_block_cipher);                    // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);    // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);    // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jmp(L_singleBlock_loopTop_128);


    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);  // final value of r stored in rvec of CipherBlockChaining object
#ifdef _WIN64
    // restore regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movl(rax, 0);  // return 0 (why?)
    __ leave();       // required for proper stackwalking of RuntimeStub frame
    __ ret(0);


    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be optimized to use parallelism)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_192);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);               // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);                            // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);                    // 192-bit key goes up to c0
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
    __ aesdeclast(xmm_result, xmm_key_last);                         // xmm15 always came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);                    // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);    // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);    // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_singleBlock_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be optimized to use parallelism)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_256);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);               // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);                            // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);                    // 256-bit key goes up to e0
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
    aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
    aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
    __ aesdeclast(xmm_result, xmm_key_last);                         // xmm15 came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);                    // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);    // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);    // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_singleBlock_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
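  // --------------------------------------------------------------------------
  // Illustrative sketch, not part of this changeset: why the decrypt loop above
  // can run four blocks at once. In CBC decryption each block's AES inverse
  // depends only on its own ciphertext, and the chaining xor uses ciphertext
  // that is already in the input buffer, so independent aesdec chains can be
  // interleaved to hide latency. The names cbc_decrypt_ref, block_cipher_fn and
  // decrypt_block are ours, purely for illustration; decrypt_block stands in
  // for the aesdec/aesdeclast round chain.
  #include <cstddef>
  #include <cstdint>
  #include <cstring>

  typedef void (*block_cipher_fn)(const uint8_t in[16], uint8_t out[16]);

  static void cbc_decrypt_ref(const uint8_t* from, uint8_t* to, size_t len,
                              uint8_t rvec[16], block_cipher_fn decrypt_block) {
    uint8_t prev[16];
    std::memcpy(prev, rvec, 16);                   // xmm_prev_block_cipher
    size_t pos = 0;
    while (len - pos >= 4 * 16) {                  // multi-block path: 4 independent decrypts
      uint8_t cipher[4][16];
      std::memcpy(cipher, from + pos, 4 * 16);     // grab the 4 ciphertext blocks up front
      uint8_t plain[4][16];
      for (int b = 0; b < 4; b++)
        decrypt_block(cipher[b], plain[b]);        // no dependency between these four calls
      for (int b = 0; b < 4; b++) {
        const uint8_t* chain = (b == 0) ? prev : cipher[b - 1];
        for (int i = 0; i < 16; i++)
          to[pos + 16 * b + i] = plain[b][i] ^ chain[i];
      }
      std::memcpy(prev, cipher[3], 16);            // carries over to the next set of blocks
      pos += 4 * 16;
    }
    while (pos < len) {                            // single-block tail, as at L_singleBlock_loopTop_128
      uint8_t saved[16];
      std::memcpy(saved, from + pos, 16);          // xmm_prev_block_cipher_save
      uint8_t plain[16];
      decrypt_block(from + pos, plain);
      for (int i = 0; i < 16; i++)
        to[pos + i] = plain[i] ^ prev[i];
      std::memcpy(prev, saved, 16);
      pos += 16;
    }
    std::memcpy(rvec, prev, 16);                   // final r stored back, as at L_exit
  }
  // --------------------------------------------------------------------------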


#undef __
#define __ masm->

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception

  ...

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    generate_math_stubs();

    // don't bother generating these AES intrinsic stubs unless global flag is set
    if (UseAESIntrinsics) {
      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others

      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
    }
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {