comparison src/cpu/x86/vm/stubGenerator_x86_64.cpp @ 6948:e522a00b91aa

Merge with http://hg.openjdk.java.net/hsx/hsx25/hotspot/ after NPG - C++ build works
author Doug Simon <doug.simon@oracle.com>
date Mon, 12 Nov 2012 23:14:12 +0100
parents 957c266d8bc5 a3ecd773a7b9
children 291ffc492eb6
comparing 6711:ae13cc658b80 with 6948:e522a00b91aa
@@ -1,7 +1,7 @@
 /*
- * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.
@@ -26,11 +26,11 @@
 #include "asm/assembler.hpp"
 #include "assembler_x86.inline.hpp"
 #include "interpreter/interpreter.hpp"
 #include "nativeInst_x86.hpp"
 #include "oops/instanceOop.hpp"
-#include "oops/methodOop.hpp"
+#include "oops/method.hpp"
 #include "oops/objArrayKlass.hpp"
 #include "oops/oop.inline.hpp"
 #include "prims/methodHandles.hpp"
 #include "runtime/frame.inline.hpp"
 #include "runtime/handles.inline.hpp"
@@ -107,11 +107,11 @@
 //
 // Linux Arguments:
 //    c_rarg0:   call wrapper address                   address
 //    c_rarg1:   result                                 address
 //    c_rarg2:   result type                            BasicType
-//    c_rarg3:   method                                 methodOop
+//    c_rarg3:   method                                 Method*
 //    c_rarg4:   (interpreter) entry point              address
 //    c_rarg5:   parameters                             intptr_t*
 //    16(rbp):   parameter size (in words)              int
 //    24(rbp):   thread                                 Thread*
 //
@@ -137,11 +137,11 @@
 //
 // Windows Arguments:
 //    c_rarg0:   call wrapper address                   address
 //    c_rarg1:   result                                 address
 //    c_rarg2:   result type                            BasicType
-//    c_rarg3:   method                                 methodOop
+//    c_rarg3:   method                                 Method*
 //    48(rbp):   (interpreter) entry point              address
 //    56(rbp):   parameters                             intptr_t*
 //    64(rbp):   parameter size (in words)              int
 //    72(rbp):   thread                                 Thread*
 //
@@ -330,11 +330,11 @@
     __ push(rax);                            // pass parameter
     __ jcc(Assembler::notZero, loop);
 
     // call Java function
     __ BIND(parameters_done);
-    __ movptr(rbx, method);                  // get methodOop
+    __ movptr(rbx, method);                  // get Method*
     __ movptr(c_rarg1, entry_point);         // get entry_point
     __ mov(r13, rsp);                        // set sender sp
     BLOCK_COMMENT("call Java function");
     __ call(c_rarg1);
 
@@ -1025,32 +1025,15 @@
     __ jcc(Assembler::notZero, error);
 
     // set r12 to heapbase for load_klass()
     __ reinit_heapbase();
 
-    // make sure klass is 'reasonable'
+    // make sure klass is 'reasonable', which is not zero.
     __ load_klass(rax, rax);  // get klass
     __ testptr(rax, rax);
     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
-    // Check if the klass is in the right area of memory
-    __ mov(c_rarg2, rax);
-    __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_mask());
-    __ andptr(c_rarg2, c_rarg3);
-    __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_bits());
-    __ cmpptr(c_rarg2, c_rarg3);
-    __ jcc(Assembler::notZero, error);
-
-    // make sure klass' klass is 'reasonable'
-    __ load_klass(rax, rax);
-    __ testptr(rax, rax);
-    __ jcc(Assembler::zero, error); // if klass' klass is NULL it is broken
-    // Check if the klass' klass is in the right area of memory
-    __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_mask());
-    __ andptr(rax, c_rarg3);
-    __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_bits());
-    __ cmpptr(rax, c_rarg3);
-    __ jcc(Assembler::notZero, error);
+    // TODO: Future assert that klass is lower 4g memory for UseCompressedKlassPointers
 
     // return if everything seems ok
     __ bind(exit);
     __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
@@ -2619,11 +2602,11 @@
 #endif
 
     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                            r10, L_failed);
 
-    // typeArrayKlass
+    // TypeArrayKlass
     //
     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
     //
 
@@ -2685,11 +2668,11 @@
     __ lea(from, Address(src, src_pos, Address::times_8, 0));  // src_addr
     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));  // dst_addr
     __ movl2ptr(count, r11_length);                            // length
     __ jump(RuntimeAddress(long_copy_entry));
 
-    // objArrayKlass
+    // ObjArrayKlass
   __ BIND(L_objArray);
     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
 
     Label L_plain_copy, L_checkcast_copy;
     // test array classes for subtyping
@@ -2738,12 +2721,12 @@
       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
       assert_clean_int(sco_temp, rax);
       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
 
-      // Fetch destination element klass from the objArrayKlass header.
-      int ek_offset = in_bytes(objArrayKlass::element_klass_offset());
+      // Fetch destination element klass from the ObjArrayKlass header.
+      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
       assert_clean_int(sco_temp, rax);
 
       // the checkcast_copy loop needs two extra arguments:
@@ -2955,10 +2938,552 @@
       __ movdbl(xmm0, Address(rsp, 0));
       __ addq(rsp, 8);
       __ ret(0);
     }
   }
+
+  // AES intrinsic stubs
+  enum {AESBlockSize = 16};
+
+  address generate_key_shuffle_mask() {
+    __ align(16);
+    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
+    address start = __ pc();
+    __ emit_data64( 0x0405060700010203, relocInfo::none );
+    __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
+    return start;
+  }
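The two quadwords above encode a pshufb mask that reverses the byte order within each 32-bit lane, so the ints of the Java-expanded key end up in the byte order the AES instructions expect. As a rough illustration (not part of the stub; shuffle_key_word is a hypothetical name), the same operation with SSE intrinsics:

    #include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8

    static __m128i shuffle_key_word(__m128i w) {
      // Same constant the stub emits: byte-reverse every 32-bit lane.
      const __m128i mask = _mm_set_epi64x(0x0c0d0e0f08090a0bLL,
                                          0x0405060700010203LL);
      return _mm_shuffle_epi8(w, mask);
    }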
+
+  // Utility routine for loading a 128-bit key word in little-endian format;
+  // can optionally specify that the shuffle mask is already in an XMM register
+  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    __ movdqu(xmmdst, Address(key, offset));
+    if (xmm_shuf_mask != NULL) {
+      __ pshufb(xmmdst, xmm_shuf_mask);
+    } else {
+      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    }
+  }
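In intrinsic terms, load_key is just an unaligned 16-byte load followed by the lane shuffle sketched above (load_key_word is an illustrative assumption, reusing shuffle_key_word from the previous sketch; not HotSpot code):

    #include <emmintrin.h>   // SSE2: _mm_loadu_si128
    #include <stdint.h>

    static __m128i load_key_word(const uint8_t* key, int offset) {
      // Unaligned load, then swap each 32-bit lane to little-endian order.
      return shuffle_key_word(_mm_loadu_si128((const __m128i*)(key + offset)));
    }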
+
+  // aesenc using specified key+offset
+  // can optionally specify that the shuffle mask is already in an XMM register
+  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    load_key(xmmtmp, key, offset, xmm_shuf_mask);
+    __ aesenc(xmmdst, xmmtmp);
+  }
+
+  // aesdec using specified key+offset
+  // can optionally specify that the shuffle mask is already in an XMM register
+  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
+    load_key(xmmtmp, key, offset, xmm_shuf_mask);
+    __ aesdec(xmmdst, xmmtmp);
+  }
+
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_encryptBlock() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+    Label L_doLast;
+    address start = __ pc();
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = rax;
+
+    const XMMRegister xmm_result        = xmm0;
+    const XMMRegister xmm_temp          = xmm1;
+    const XMMRegister xmm_key_shuf_mask = xmm2;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    // keylen = # of 32-bit words, convert to 128-bit words
+    __ shrl(keylen, 2);
+    __ subl(keylen, 11);  // every key has at least 11 128-bit words, some have more
+
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
+
+    // For encryption, the Java expanded key ordering is just what we need;
+    // we don't know if the key is aligned, hence not using load-execute form
+
+    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+    __ pxor(xmm_result, xmm_temp);
+    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
+      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+    }
+    load_key(xmm_temp, key, 0xa0, xmm_key_shuf_mask);
+    __ cmpl(keylen, 0);
+    __ jcc(Assembler::equal, L_doLast);
+    __ aesenc(xmm_result, xmm_temp);  // only in 192 and 256 bit keys
+    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+    __ subl(keylen, 2);
+    __ jcc(Assembler::equal, L_doLast);
+    __ aesenc(xmm_result, xmm_temp);  // only in 256 bit keys
+    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+    __ BIND(L_doLast);
+    __ aesenclast(xmm_result, xmm_temp);
+    __ movdqu(Address(to, 0), xmm_result);  // store the result
+    __ xorptr(rax, rax);                    // return 0
+    __ leave();                             // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
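The round count falls out of the key-array length: the Java key schedule holds 44, 52, or 60 ints for AES-128/192/256, i.e. 11, 13, or 15 128-bit round keys, so after the shift and subtract keylen is 0, 2, or 4 and the two conditional jumps select 10, 12, or 14 rounds. The emitted pattern is the standard AES-NI sequence, sketched here with intrinsics (aes_encrypt_block is a hypothetical helper over an already-shuffled schedule rk):

    #include <wmmintrin.h>   // AES-NI: _mm_aesenc_si128 and friends

    static __m128i aes_encrypt_block(__m128i in, const __m128i* rk, int rounds) {
      __m128i s = _mm_xor_si128(in, rk[0]);          // whitening with round key 0
      for (int r = 1; r < rounds; r++)
        s = _mm_aesenc_si128(s, rk[r]);              // middle rounds
      return _mm_aesenclast_si128(s, rk[rounds]);    // final round, no MixColumns
    }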
+
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_decryptBlock() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+    Label L_doLast;
+    address start = __ pc();
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = rax;
+
+    const XMMRegister xmm_result        = xmm0;
+    const XMMRegister xmm_temp          = xmm1;
+    const XMMRegister xmm_key_shuf_mask = xmm2;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    // keylen = # of 32-bit words, convert to 128-bit words
+    __ shrl(keylen, 2);
+    __ subl(keylen, 11);  // every key has at least 11 128-bit words, some have more
+
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movdqu(xmm_result, Address(from, 0));
+
+    // for decryption the Java expanded key ordering is rotated one position from what we want,
+    // so we start from 0x10 here and hit 0x00 last;
+    // we don't know if the key is aligned, hence not using load-execute form
+    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
+    __ pxor  (xmm_result, xmm_temp);
+    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
+      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
+    }
+    __ cmpl(keylen, 0);
+    __ jcc(Assembler::equal, L_doLast);
+    // only in 192 and 256 bit keys
+    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
+    __ subl(keylen, 2);
+    __ jcc(Assembler::equal, L_doLast);
+    // only in 256 bit keys
+    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+
+    __ BIND(L_doLast);
+    // for decryption the aesdeclast operation is always on key+0x00
+    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
+    __ aesdeclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, 0), xmm_result);  // store the result
+
+    __ xorptr(rax, rax);  // return 0
+    __ leave();           // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
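Decryption walks an equivalent-inverse-cipher schedule that the Java side stores rotated one slot: the rounds consume offsets 0x10 upward and aesdeclast always takes the key at offset 0x00. The same flow in intrinsic form (an illustrative counterpart to the sketch above, same assumptions):

    // rk laid out as the stub sees it: rk[0] is reserved for aesdeclast.
    static __m128i aes_decrypt_block(__m128i in, const __m128i* rk, int rounds) {
      __m128i s = _mm_xor_si128(in, rk[1]);          // whitening, rotated slot
      for (int r = 2; r <= rounds; r++)
        s = _mm_aesdec_si128(s, rk[r]);
      return _mm_aesdeclast_si128(s, rk[0]);
    }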
+
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+  address generate_cipherBlockChaining_encryptAESCrypt() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+    address start = __ pc();
+
+    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector array address
+                                           // and left holding the last cipher block on exit
+#ifndef _WIN64
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+#else
+    const Address len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
+    const Register len_reg     = r10;      // pick the first volatile windows register
+#endif
+    const Register pos         = rax;
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
+    const XMMRegister xmm_temp   = xmm1;
+    // keys 0-10 preloaded into xmm2-xmm12
+    const int XMM_REG_NUM_KEY_FIRST = 2;
+    const int XMM_REG_NUM_KEY_LAST  = 12;
+    const XMMRegister xmm_key0  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+    const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WIN64
+    // on win64, fill len_reg from stack position
+    __ movl(len_reg, len_mem);
+    // save the xmm registers which must be preserved 6-12
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#endif
+
+    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    // load up xmm regs 2 thru 12 with keys 0x00 - 0xa0
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+      offset += 0x10;
+    }
+
+    __ movdqu(xmm_result, Address(rvec, 0x00));  // initialize xmm_result with r vec
+
+    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, 60=256))
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rax, 44);
+    __ jcc(Assembler::notEqual, L_key_192_256);
+
+    // 128 bit code follows here
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_128);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);  // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);  // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    __ aesenclast(xmm_result, xmm_key10);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_128);
+
+    __ BIND(L_exit);
+    __ movdqu(Address(rvec, 0), xmm_result);  // final value of r stored in rvec of CipherBlockChaining object
+
+#ifdef _WIN64
+    // restore xmm regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+#endif
+    __ movl(rax, 0);  // return 0 (why?)
+    __ leave();       // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    __ BIND(L_key_192_256);
+    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::notEqual, L_key_256);
+
+    // 192-bit code follows here (could be changed to use more xmm registers)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_192);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);  // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);  // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
+    load_key(xmm_temp, key, 0xc0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_192);
+    __ jmp(L_exit);
+
+    __ BIND(L_key_256);
+    // 256-bit code follows here (could be changed to use more xmm registers)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_256);
+    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
+    __ pxor  (xmm_result, xmm_temp);  // xor with the current r vector
+
+    __ pxor  (xmm_result, xmm_key0);  // do the aes rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      __ aesenc(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
+    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
+    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
+    load_key(xmm_temp, key, 0xe0);
+    __ aesenclast(xmm_result, xmm_temp);
+
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_loopTop_256);
+    __ jmp(L_exit);
+
+    return start;
+  }
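CBC encryption is a serial recurrence, C_i = E_K(P_i xor C_{i-1}) with C_0 seeded from rvec, so each iteration above must finish before the next can start; that is why only the decrypt side gets a parallelized variant below. The loop structure, restated with the aes_encrypt_block sketch from earlier (illustrative; assumes in/out hold whole 16-byte blocks):

    #include <stddef.h>

    static void cbc_encrypt(__m128i* out, const __m128i* in, size_t nblocks,
                            const __m128i* rk, int rounds, __m128i iv) {
      __m128i r = iv;                                // chaining value (rvec)
      for (size_t i = 0; i < nblocks; i++) {
        r = aes_encrypt_block(_mm_xor_si128(in[i], r), rk, rounds);
        out[i] = r;                                  // ciphertext feeds next block
      }
    }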
+
+
+
+  // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
+  // to hide instruction latency
+  //
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+
+  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
+    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+    address start = __ pc();
+
+    Label L_exit, L_key_192_256, L_key_256;
+    Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
+    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector array address
+                                           // and left holding the last cipher block on exit
+#ifndef _WIN64
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+#else
+    const Address len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
+    const Register len_reg     = r10;      // pick the first volatile windows register
+#endif
+    const Register pos         = rax;
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
+    // keys 0-10 preloaded into xmm5-xmm15
+    const int XMM_REG_NUM_KEY_FIRST = 5;
+    const int XMM_REG_NUM_KEY_LAST  = 15;
+    const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WIN64
+    // on win64, fill len_reg from stack position
+    __ movl(len_reg, len_mem);
+    // save the xmm registers which must be preserved 6-15
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#endif
+    // the Java expanded key ordering is rotated one position from what we want,
+    // so we start from 0x10 here and hit 0x00 last
+    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    // load up xmm regs 5 thru 15 with keys 0x10 - 0xa0, then 0x00
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+      if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;
+      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
+      offset += 0x10;
+    }
+
+    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
+    // registers holding the four results in the parallelized loop
+    const XMMRegister xmm_result0 = xmm0;
+    const XMMRegister xmm_result1 = xmm2;
+    const XMMRegister xmm_result2 = xmm3;
+    const XMMRegister xmm_result3 = xmm4;
+
+    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // initialize with initial rvec
+
+    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, 60=256))
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rax, 44);
+    __ jcc(Assembler::notEqual, L_key_192_256);
+
+
+    // 128-bit code follows here, parallelized
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_multiBlock_loopTop_128);
+    __ cmpptr(len_reg, 4*AESBlockSize);  // see if at least 4 blocks left
+    __ jcc(Assembler::less, L_singleBlock_loopTop_128);
+
+    __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize));  // get next 4 blocks into xmmresult registers
+    __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize));
+    __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize));
+    __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize));
+
+#define DoFour(opc, src_reg)        \
+    __ opc(xmm_result0, src_reg);   \
+    __ opc(xmm_result1, src_reg);   \
+    __ opc(xmm_result2, src_reg);   \
+    __ opc(xmm_result3, src_reg);
+
+    DoFour(pxor, xmm_key_first);
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      DoFour(aesdec, as_XMMRegister(rnum));
+    }
+    DoFour(aesdeclast, xmm_key_last);
+    // for each result, xor with the r vector of previous cipher block
+    __ pxor(xmm_result0, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
+    __ pxor(xmm_result1, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
+    __ pxor(xmm_result2, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
+    __ pxor(xmm_result3, xmm_prev_block_cipher);
+    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));  // this will carry over to next set of blocks
+
+    __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);  // store 4 results into the next 64 bytes of output
+    __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
+    __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
+    __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);
+
+    __ addptr(pos, 4*AESBlockSize);
+    __ subptr(len_reg, 4*AESBlockSize);
+    __ jmp(L_multiBlock_loopTop_128);
+
+    // registers used in the non-parallelized loops
+    const XMMRegister xmm_prev_block_cipher_save = xmm2;
+    const XMMRegister xmm_temp = xmm3;
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_128);
+    __ cmpptr(len_reg, 0);  // any blocks left?
+    __ jcc(Assembler::equal, L_exit);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of cipher input
+    __ movdqa(xmm_prev_block_cipher_save, xmm_result);  // save for next r vector
+    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    __ aesdeclast(xmm_result, xmm_key_last);
+    __ pxor  (xmm_result, xmm_prev_block_cipher);       // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
+
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jmp(L_singleBlock_loopTop_128);
+
+
+    __ BIND(L_exit);
+    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);  // final value of r stored in rvec of CipherBlockChaining object
+#ifdef _WIN64
+    // restore regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+#endif
+    __ movl(rax, 0);  // return 0 (why?)
+    __ leave();       // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+
+    __ BIND(L_key_192_256);
+    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::notEqual, L_key_256);
+
+    // 192-bit code follows here (could be optimized to use parallelism)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_192);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of cipher input
+    __ movdqa(xmm_prev_block_cipher_save, xmm_result);  // save for next r vector
+    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);       // 192-bit key goes up to c0
+    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
+    __ aesdeclast(xmm_result, xmm_key_last);            // xmm15 always came from key+0
+    __ pxor  (xmm_result, xmm_prev_block_cipher);       // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
+
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_singleBlock_loopTop_192);
+    __ jmp(L_exit);
+
+    __ BIND(L_key_256);
+    // 256-bit code follows here (could be optimized to use parallelism)
+    __ movptr(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_256);
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of cipher input
+    __ movdqa(xmm_prev_block_cipher_save, xmm_result);  // save for next r vector
+    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+      __ aesdec(xmm_result, as_XMMRegister(rnum));
+    }
+    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);       // 256-bit key goes up to e0
+    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
+    aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
+    __ aesdeclast(xmm_result, xmm_key_last);            // xmm15 came from key+0
+    __ pxor  (xmm_result, xmm_prev_block_cipher);       // xor with the current r vector
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
+    // no need to store r to memory until we exit
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
+
+    __ addptr(pos, AESBlockSize);
+    __ subptr(len_reg, AESBlockSize);
+    __ jcc(Assembler::notEqual, L_singleBlock_loopTop_256);
+    __ jmp(L_exit);
+
+    return start;
+  }
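Unlike encryption, CBC decryption has no loop-carried dependence through the cipher: P_i = D_K(C_i) xor C_{i-1}, and every D_K runs on ciphertext that is already in hand. The DoFour macro exploits this by keeping four aesdec chains in flight so their instruction latencies overlap. A four-way sketch (illustrative, reusing aes_decrypt_block and the headers from the earlier sketches; assumes in and out do not alias):

    #include <stddef.h>

    static void cbc_decrypt4(__m128i* out, const __m128i* in, size_t nblocks,
                             const __m128i* rk, int rounds, __m128i iv) {
      __m128i prev = iv;
      size_t i = 0;
      for (; i + 4 <= nblocks; i += 4) {             // 4 independent decrypts
        __m128i d0 = aes_decrypt_block(in[i+0], rk, rounds);
        __m128i d1 = aes_decrypt_block(in[i+1], rk, rounds);
        __m128i d2 = aes_decrypt_block(in[i+2], rk, rounds);
        __m128i d3 = aes_decrypt_block(in[i+3], rk, rounds);
        out[i+0] = _mm_xor_si128(d0, prev);          // xor with previous cipher block
        out[i+1] = _mm_xor_si128(d1, in[i+0]);
        out[i+2] = _mm_xor_si128(d2, in[i+1]);
        out[i+3] = _mm_xor_si128(d3, in[i+2]);
        prev = in[i+3];                              // carries into the next group
      }
      for (; i < nblocks; i++) {                     // single-block tail
        out[i] = _mm_xor_si128(aes_decrypt_block(in[i], rk, rounds), prev);
        prev = in[i];
      }
    }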
+
+
 
 #undef __
 #define __ masm->
 
   // Continuation point for throwing of implicit exceptions that are
@@ -3150,10 +3675,20 @@
 
     // arraycopy stubs used by compilers
     generate_arraycopy_stubs();
 
     generate_math_stubs();
+
+    // don't bother generating these AES intrinsic stubs unless global flag is set
+    if (UseAESIntrinsics) {
+      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
+
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
+    }
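These stubs are generated only when the VM is started with -XX:+UseAESIntrinsics, which presupposes -XX:+UseAES and AES-NI-capable hardware; otherwise the StubRoutines entries stay NULL and AESCrypt falls back to its pure-Java code path. A typical invocation on a build from this tree (the application name is a placeholder):

    java -XX:+UseAES -XX:+UseAESIntrinsics <application>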
   }
 
  public:
   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
     if (all) {