Mercurial > hg > truffle
comparison src/cpu/x86/vm/stubGenerator_x86_64.cpp @ 6948:e522a00b91aa
Merge with http://hg.openjdk.java.net/hsx/hsx25/hotspot/ after NPG - C++ build works
author | Doug Simon <doug.simon@oracle.com> |
---|---|
date | Mon, 12 Nov 2012 23:14:12 +0100 |
parents | 957c266d8bc5 a3ecd773a7b9 |
children | 291ffc492eb6 |
comparison
equal
deleted
inserted
replaced
6711:ae13cc658b80 | 6948:e522a00b91aa |
---|---|
1 /* | 1 /* |
2 * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved. | 2 * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved. |
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 * | 4 * |
5 * This code is free software; you can redistribute it and/or modify it | 5 * This code is free software; you can redistribute it and/or modify it |
6 * under the terms of the GNU General Public License version 2 only, as | 6 * under the terms of the GNU General Public License version 2 only, as |
7 * published by the Free Software Foundation. | 7 * published by the Free Software Foundation. |
26 #include "asm/assembler.hpp" | 26 #include "asm/assembler.hpp" |
27 #include "assembler_x86.inline.hpp" | 27 #include "assembler_x86.inline.hpp" |
28 #include "interpreter/interpreter.hpp" | 28 #include "interpreter/interpreter.hpp" |
29 #include "nativeInst_x86.hpp" | 29 #include "nativeInst_x86.hpp" |
30 #include "oops/instanceOop.hpp" | 30 #include "oops/instanceOop.hpp" |
31 #include "oops/methodOop.hpp" | 31 #include "oops/method.hpp" |
32 #include "oops/objArrayKlass.hpp" | 32 #include "oops/objArrayKlass.hpp" |
33 #include "oops/oop.inline.hpp" | 33 #include "oops/oop.inline.hpp" |
34 #include "prims/methodHandles.hpp" | 34 #include "prims/methodHandles.hpp" |
35 #include "runtime/frame.inline.hpp" | 35 #include "runtime/frame.inline.hpp" |
36 #include "runtime/handles.inline.hpp" | 36 #include "runtime/handles.inline.hpp" |
107 // | 107 // |
108 // Linux Arguments: | 108 // Linux Arguments: |
109 // c_rarg0: call wrapper address address | 109 // c_rarg0: call wrapper address address |
110 // c_rarg1: result address | 110 // c_rarg1: result address |
111 // c_rarg2: result type BasicType | 111 // c_rarg2: result type BasicType |
112 // c_rarg3: method methodOop | 112 // c_rarg3: method Method* |
113 // c_rarg4: (interpreter) entry point address | 113 // c_rarg4: (interpreter) entry point address |
114 // c_rarg5: parameters intptr_t* | 114 // c_rarg5: parameters intptr_t* |
115 // 16(rbp): parameter size (in words) int | 115 // 16(rbp): parameter size (in words) int |
116 // 24(rbp): thread Thread* | 116 // 24(rbp): thread Thread* |
117 // | 117 // |
137 // | 137 // |
138 // Windows Arguments: | 138 // Windows Arguments: |
139 // c_rarg0: call wrapper address address | 139 // c_rarg0: call wrapper address address |
140 // c_rarg1: result address | 140 // c_rarg1: result address |
141 // c_rarg2: result type BasicType | 141 // c_rarg2: result type BasicType |
142 // c_rarg3: method methodOop | 142 // c_rarg3: method Method* |
143 // 48(rbp): (interpreter) entry point address | 143 // 48(rbp): (interpreter) entry point address |
144 // 56(rbp): parameters intptr_t* | 144 // 56(rbp): parameters intptr_t* |
145 // 64(rbp): parameter size (in words) int | 145 // 64(rbp): parameter size (in words) int |
146 // 72(rbp): thread Thread* | 146 // 72(rbp): thread Thread* |
147 // | 147 // |
330 __ push(rax); // pass parameter | 330 __ push(rax); // pass parameter |
331 __ jcc(Assembler::notZero, loop); | 331 __ jcc(Assembler::notZero, loop); |
332 | 332 |
333 // call Java function | 333 // call Java function |
334 __ BIND(parameters_done); | 334 __ BIND(parameters_done); |
335 __ movptr(rbx, method); // get methodOop | 335 __ movptr(rbx, method); // get Method* |
336 __ movptr(c_rarg1, entry_point); // get entry_point | 336 __ movptr(c_rarg1, entry_point); // get entry_point |
337 __ mov(r13, rsp); // set sender sp | 337 __ mov(r13, rsp); // set sender sp |
338 BLOCK_COMMENT("call Java function"); | 338 BLOCK_COMMENT("call Java function"); |
339 __ call(c_rarg1); | 339 __ call(c_rarg1); |
340 | 340 |
1025 __ jcc(Assembler::notZero, error); | 1025 __ jcc(Assembler::notZero, error); |
1026 | 1026 |
1027 // set r12 to heapbase for load_klass() | 1027 // set r12 to heapbase for load_klass() |
1028 __ reinit_heapbase(); | 1028 __ reinit_heapbase(); |
1029 | 1029 |
1030 // make sure klass is 'reasonable' | 1030 // make sure klass is 'reasonable', which is not zero. |
1031 __ load_klass(rax, rax); // get klass | 1031 __ load_klass(rax, rax); // get klass |
1032 __ testptr(rax, rax); | 1032 __ testptr(rax, rax); |
1033 __ jcc(Assembler::zero, error); // if klass is NULL it is broken | 1033 __ jcc(Assembler::zero, error); // if klass is NULL it is broken |
1034 // Check if the klass is in the right area of memory | 1034 // TODO: Future assert that klass is lower 4g memory for UseCompressedKlassPointers |
1035 __ mov(c_rarg2, rax); | |
1036 __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_mask()); | |
1037 __ andptr(c_rarg2, c_rarg3); | |
1038 __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_bits()); | |
1039 __ cmpptr(c_rarg2, c_rarg3); | |
1040 __ jcc(Assembler::notZero, error); | |
1041 | |
1042 // make sure klass' klass is 'reasonable' | |
1043 __ load_klass(rax, rax); | |
1044 __ testptr(rax, rax); | |
1045 __ jcc(Assembler::zero, error); // if klass' klass is NULL it is broken | |
1046 // Check if the klass' klass is in the right area of memory | |
1047 __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_mask()); | |
1048 __ andptr(rax, c_rarg3); | |
1049 __ movptr(c_rarg3, (intptr_t) Universe::verify_klass_bits()); | |
1050 __ cmpptr(rax, c_rarg3); | |
1051 __ jcc(Assembler::notZero, error); | |
1052 | 1035 |
1053 // return if everything seems ok | 1036 // return if everything seems ok |
1054 __ bind(exit); | 1037 __ bind(exit); |
1055 __ movptr(rax, Address(rsp, saved_rax)); // get saved rax back | 1038 __ movptr(rax, Address(rsp, saved_rax)); // get saved rax back |
1056 __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back | 1039 __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back |
2619 #endif | 2602 #endif |
2620 | 2603 |
2621 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, | 2604 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, |
2622 r10, L_failed); | 2605 r10, L_failed); |
2623 | 2606 |
2624 // typeArrayKlass | 2607 // TypeArrayKlass |
2625 // | 2608 // |
2626 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); | 2609 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); |
2627 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); | 2610 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); |
2628 // | 2611 // |
2629 | 2612 |
2685 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr | 2668 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr |
2686 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr | 2669 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr |
2687 __ movl2ptr(count, r11_length); // length | 2670 __ movl2ptr(count, r11_length); // length |
2688 __ jump(RuntimeAddress(long_copy_entry)); | 2671 __ jump(RuntimeAddress(long_copy_entry)); |
2689 | 2672 |
2690 // objArrayKlass | 2673 // ObjArrayKlass |
2691 __ BIND(L_objArray); | 2674 __ BIND(L_objArray); |
2692 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos] | 2675 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos] |
2693 | 2676 |
2694 Label L_plain_copy, L_checkcast_copy; | 2677 Label L_plain_copy, L_checkcast_copy; |
2695 // test array classes for subtyping | 2678 // test array classes for subtyping |
2738 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); | 2721 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); |
2739 __ movl(sco_temp, Address(r11_dst_klass, sco_offset)); | 2722 __ movl(sco_temp, Address(r11_dst_klass, sco_offset)); |
2740 assert_clean_int(sco_temp, rax); | 2723 assert_clean_int(sco_temp, rax); |
2741 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy); | 2724 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy); |
2742 | 2725 |
2743 // Fetch destination element klass from the objArrayKlass header. | 2726 // Fetch destination element klass from the ObjArrayKlass header. |
2744 int ek_offset = in_bytes(objArrayKlass::element_klass_offset()); | 2727 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); |
2745 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset)); | 2728 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset)); |
2746 __ movl( sco_temp, Address(r11_dst_klass, sco_offset)); | 2729 __ movl( sco_temp, Address(r11_dst_klass, sco_offset)); |
2747 assert_clean_int(sco_temp, rax); | 2730 assert_clean_int(sco_temp, rax); |
2748 | 2731 |
2749 // the checkcast_copy loop needs two extra arguments: | 2732 // the checkcast_copy loop needs two extra arguments: |
2955 __ movdbl(xmm0, Address(rsp, 0)); | 2938 __ movdbl(xmm0, Address(rsp, 0)); |
2956 __ addq(rsp, 8); | 2939 __ addq(rsp, 8); |
2957 __ ret(0); | 2940 __ ret(0); |
2958 } | 2941 } |
2959 } | 2942 } |
2943 | |
2944 // AES intrinsic stubs | |
// AESBlockSize is the size in bytes of one AES block (16 bytes = 128 bits). The CBC stubs below use it | |
// as the step for both the running byte position and the remaining-length counter. | |
2945 enum {AESBlockSize = 16}; | |
2946 | |
// Emits a 16-byte data constant (after requesting 16-byte alignment) and returns the address of its | |
// first byte. The constant is intended as a pshufb control mask for AES key words. | |
// NOTE(review): load_key() fetches its mask through StubRoutines::x86::key_shuffle_mask_addr(); presumably | |
// that accessor is initialized with the address returned here - confirm in the stub initialization code, | |
// which is outside this view. | |
2947 address generate_key_shuffle_mask() { | |
2948 __ align(16); | |
2949 StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask"); | |
2950 address start = __ pc(); | |
// Stored little-endian, the 16 mask bytes are 03 02 01 00, 07 06 05 04, 0b 0a 09 08, 0f 0e 0d 0c, | |
// so a pshufb with this mask reverses the byte order inside each of the four 32-bit lanes. | |
2951 __ emit_data64( 0x0405060700010203, relocInfo::none ); | |
2952 __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none ); | |
2953 return start; | |
2954 } | |
2955 | |
2956 // Utility routine for loading a 128-bit key word in little endian format | |
2957 // can optionally specify that the shuffle mask is already in an xmmregister | |
// | |
// Emits an unaligned 16-byte load from key+offset into xmmdst, followed by a pshufb of xmmdst. | |
//   xmmdst        - receives the shuffled key word | |
//   key           - base register of the expanded key | |
//   offset        - byte displacement of the key word from key | |
//   xmm_shuf_mask - register already holding the shuffle mask; when left at the NULL default the mask | |
//                   is read from memory at StubRoutines::x86::key_shuffle_mask_addr() instead | |
// NOTE(review): the "no register supplied" test is a comparison against NULL. If XMMRegister is a | |
// pointer-like type in which xmm0 has the value 0, xmm0 can never be passed as the mask register - | |
// confirm against the XMMRegister definition. The callers visible here pass xmm1 or xmm2. | |
2958 void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { | |
2959 __ movdqu(xmmdst, Address(key, offset)); | |
2960 if (xmm_shuf_mask != NULL) { | |
2961 __ pshufb(xmmdst, xmm_shuf_mask); | |
2962 } else { | |
2963 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); | |
2964 } | |
2965 } | |
2966 | |
2967 // aesenc using specified key+offset | |
2968 // can optionally specify that the shuffle mask is already in an xmmregister | |
// | |
// Emits one AES encryption round on xmmdst: the round key at key+offset is loaded and byte-shuffled | |
// into xmmtmp via load_key() (so xmmtmp is overwritten), then aesenc(xmmdst, xmmtmp) is emitted. | |
// xmm_shuf_mask is forwarded unchanged to load_key(). | |
2969 void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { | |
2970 load_key(xmmtmp, key, offset, xmm_shuf_mask); | |
2971 __ aesenc(xmmdst, xmmtmp); | |
2972 } | |
2973 | |
2974 // aesdec using specified key+offset | |
2975 // can optionally specify that the shuffle mask is already in an xmmregister | |
// | |
// Emits one AES decryption round on xmmdst: the round key at key+offset is loaded and byte-shuffled | |
// into xmmtmp via load_key() (so xmmtmp is overwritten), then aesdec(xmmdst, xmmtmp) is emitted. | |
// xmm_shuf_mask is forwarded unchanged to load_key(). | |
2976 void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { | |
2977 load_key(xmmtmp, key, offset, xmm_shuf_mask); | |
2978 __ aesdec(xmmdst, xmmtmp); | |
2979 } | |
2980 | |
2981 | |
// Generates the stub that AES-encrypts a single 16-byte block with aesenc/aesenclast. | |
// The emitted code reads 16 bytes at c_rarg0, writes 16 bytes at c_rarg1 and leaves 0 in rax. | |
// Returns the entry address of the generated stub. | |
2982 // Arguments: | |
2983 // | |
2984 // Inputs: | |
2985 // c_rarg0 - source byte array address | |
2986 // c_rarg1 - destination byte array address | |
2987 // c_rarg2 - K (key) in little endian int array | |
2988 // | |
2989 address generate_aescrypt_encryptBlock() { | |
2990 assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); | |
2991 __ align(CodeEntryAlignment); | |
2992 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); | |
2993 Label L_doLast; | |
2994 address start = __ pc(); | |
2995 | |
2996 const Register from = c_rarg0; // source array address | |
2997 const Register to = c_rarg1; // destination array address | |
2998 const Register key = c_rarg2; // key array address | |
2999 const Register keylen = rax; | |
3000 | |
3001 const XMMRegister xmm_result = xmm0; | |
3002 const XMMRegister xmm_temp = xmm1; | |
3003 const XMMRegister xmm_key_shuf_mask = xmm2; | |
3004 | |
3005 __ enter(); // required for proper stackwalking of RuntimeStub frame | |
3006 | |
// The displacement is (length field offset - offset of the first int element), so key is expected | |
// to point at the first element of the int array rather than at the array object itself. | |
3007 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); | |
3008 // keylen = # of 32-bit words, convert to 128-bit words | |
3009 __ shrl(keylen, 2); | |
// after the subtraction keylen is 0, 2 or 4 for an expanded key of 44, 52 or 60 ints respectively | |
3010 __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more | |
3011 | |
3012 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); | |
3013 __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input | |
3014 | |
3015 // For encryption, the java expanded key ordering is just what we need | |
3016 // we don't know if the key is aligned, hence not using load-execute form | |
3017 | |
// initial whitening: xor the input block with the key word at offset 0x00 | |
3018 load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); | |
3019 __ pxor(xmm_result, xmm_temp); | |
// nine aesenc rounds with the key words at byte offsets 0x10 .. 0x90 (emitted unrolled) | |
3020 for (int offset = 0x10; offset <= 0x90; offset += 0x10) { | |
3021 aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); | |
3022 } | |
// 0xa0 is loaded before the branch: it is the last-round key for a 44-int key, a normal round key otherwise | |
3023 load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask); | |
3024 __ cmpl(keylen, 0); | |
3025 __ jcc(Assembler::equal, L_doLast); | |
3026 __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys | |
3027 aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); | |
3028 load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask); | |
// keylen was 2 for a 52-int key, so this subtraction yields zero exactly in that case | |
3029 __ subl(keylen, 2); | |
3030 __ jcc(Assembler::equal, L_doLast); | |
3031 __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys | |
3032 aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); | |
3033 load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask); | |
3034 | |
// on every path reaching here xmm_temp holds the last key word loaded (0xa0, 0xc0 or 0xe0) | |
3035 __ BIND(L_doLast); | |
3036 __ aesenclast(xmm_result, xmm_temp); | |
3037 __ movdqu(Address(to, 0), xmm_result); // store the result | |
3038 __ xorptr(rax, rax); // return 0 | |
3039 __ leave(); // required for proper stackwalking of RuntimeStub frame | |
3040 __ ret(0); | |
3041 | |
3042 return start; | |
3043 } | |
3044 | |
3045 | |
// Generates the stub that AES-decrypts a single 16-byte block with aesdec/aesdeclast. | |
// The emitted code reads 16 bytes at c_rarg0, writes 16 bytes at c_rarg1 and leaves 0 in rax. | |
// Returns the entry address of the generated stub. | |
3046 // Arguments: | |
3047 // | |
3048 // Inputs: | |
3049 // c_rarg0 - source byte array address | |
3050 // c_rarg1 - destination byte array address | |
3051 // c_rarg2 - K (key) in little endian int array | |
3052 // | |
3053 address generate_aescrypt_decryptBlock() { | |
3054 assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); | |
3055 __ align(CodeEntryAlignment); | |
3056 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); | |
3057 Label L_doLast; | |
3058 address start = __ pc(); | |
3059 | |
3060 const Register from = c_rarg0; // source array address | |
3061 const Register to = c_rarg1; // destination array address | |
3062 const Register key = c_rarg2; // key array address | |
3063 const Register keylen = rax; | |
3064 | |
3065 const XMMRegister xmm_result = xmm0; | |
3066 const XMMRegister xmm_temp = xmm1; | |
3067 const XMMRegister xmm_key_shuf_mask = xmm2; | |
3068 | |
3069 __ enter(); // required for proper stackwalking of RuntimeStub frame | |
3070 | |
// The displacement is (length field offset - offset of the first int element), so key is expected | |
// to point at the first element of the int array rather than at the array object itself. | |
3071 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); | |
3072 // keylen = # of 32-bit words, convert to 128-bit words | |
3073 __ shrl(keylen, 2); | |
// after the subtraction keylen is 0, 2 or 4 for an expanded key of 44, 52 or 60 ints respectively | |
3074 __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more | |
3075 | |
3076 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); | |
3077 __ movdqu(xmm_result, Address(from, 0)); | |
3078 | |
3079 // for decryption java expanded key ordering is rotated one position from what we want | |
3080 // so we start from 0x10 here and hit 0x00 last | |
3081 // we don't know if the key is aligned, hence not using load-execute form | |
3082 load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask); | |
3083 __ pxor (xmm_result, xmm_temp); | |
// nine aesdec rounds with the key words at byte offsets 0x20 .. 0xa0 (emitted unrolled) | |
3084 for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { | |
3085 aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); | |
3086 } | |
3087 __ cmpl(keylen, 0); | |
3088 __ jcc(Assembler::equal, L_doLast); | |
3089 // only in 192 and 256 bit keys | |
3090 aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); | |
3091 aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask); | |
// keylen was 2 for a 52-int key, so this subtraction yields zero exactly in that case | |
3092 __ subl(keylen, 2); | |
3093 __ jcc(Assembler::equal, L_doLast); | |
3094 // only in 256 bit keys | |
3095 aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); | |
3096 aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask); | |
3097 | |
3098 __ BIND(L_doLast); | |
3099 // for decryption the aesdeclast operation is always on key+0x00 | |
3100 load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); | |
3101 __ aesdeclast(xmm_result, xmm_temp); | |
3102 | |
3103 __ movdqu(Address(to, 0), xmm_result); // store the result | |
3104 | |
3105 __ xorptr(rax, rax); // return 0 | |
3106 __ leave(); // required for proper stackwalking of RuntimeStub frame | |
3107 __ ret(0); | |
3108 | |
3109 return start; | |
3110 } | |
3111 | |
3112 | |
// Generates the stub for AES encryption in CBC mode over a run of 16-byte blocks. | |
// Three separate loops are emitted, selected at run time by the length of the expanded key array | |
// (44 ints, 52 ints, anything else). Each iteration xors the next input block into the running | |
// r vector, encrypts it, and stores the result; the final r vector is written back to c_rarg3 on exit. | |
// The emitted code leaves 0 in rax. Returns the entry address of the generated stub. | |
// Each loop is bottom-tested on the remaining length reaching exactly zero and there is no check | |
// before the first iteration, so the input length must be a non-zero multiple of AESBlockSize. | |
3113 // Arguments: | |
3114 // | |
3115 // Inputs: | |
3116 // c_rarg0 - source byte array address | |
3117 // c_rarg1 - destination byte array address | |
3118 // c_rarg2 - K (key) in little endian int array | |
3119 // c_rarg3 - r vector byte array address | |
3120 // c_rarg4 - input length | |
3121 // | |
3122 address generate_cipherBlockChaining_encryptAESCrypt() { | |
3123 assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); | |
3124 __ align(CodeEntryAlignment); | |
3125 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); | |
3126 address start = __ pc(); | |
3127 | |
3128 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; | |
3129 const Register from = c_rarg0; // source array address | |
3130 const Register to = c_rarg1; // destination array address | |
3131 const Register key = c_rarg2; // key array address | |
3132 const Register rvec = c_rarg3; // r byte array initialized from initvector array address | |
3133 // and left with the results of the last encryption block | |
3134 #ifndef _WIN64 | |
3135 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) | |
3136 #else | |
3137 const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 | |
3138 const Register len_reg = r10; // pick the first volatile windows register | |
3139 #endif | |
// pos shares rax with the key-length value read further down; each loop zeroes pos before using it | |
3140 const Register pos = rax; | |
3141 | |
3142 // xmm register assignments for the loops below | |
3143 const XMMRegister xmm_result = xmm0; | |
3144 const XMMRegister xmm_temp = xmm1; | |
3145 // keys 0-10 preloaded into xmm2-xmm12 | |
3146 const int XMM_REG_NUM_KEY_FIRST = 2; | |
3147 const int XMM_REG_NUM_KEY_LAST = 12; | |
3148 const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); | |
3149 const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST); | |
3150 | |
3151 __ enter(); // required for proper stackwalking of RuntimeStub frame | |
3152 | |
3153 #ifdef _WIN64 | |
3154 // on win64, fill len_reg from stack position | |
3155 __ movl(len_reg, len_mem); | |
3156 // save the xmm registers which must be preserved 6-12 | |
// NOTE(review): rsp_after_call_off and xmm_save() are defined outside this view; presumably the | |
// offset is negative so this subtraction grows the frame enough to hold the save slots - confirm. | |
3157 __ subptr(rsp, -rsp_after_call_off * wordSize); | |
3158 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { | |
3159 __ movdqu(xmm_save(i), as_XMMRegister(i)); | |
3160 } | |
3161 #endif | |
3162 | |
// xmm_temp doubles as the mask register here; the loops overwrite it with input data, which is why | |
// the 192/256-bit loops below call load_key/aes_enc_key without a mask register (mask read from memory) | |
3163 const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front | |
3164 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); | |
3165 // load up xmm regs 2 thru 12 with key 0x00 - 0xa0 | |
3166 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { | |
3167 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); | |
3168 offset += 0x10; | |
3169 } | |
3170 | |
3171 __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec | |
3172 | |
3173 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) | |
3174 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); | |
3175 __ cmpl(rax, 44); | |
3176 __ jcc(Assembler::notEqual, L_key_192_256); | |
3177 | |
3178 // 128 bit code follows here | |
3179 __ movptr(pos, 0); | |
3180 __ align(OptoLoopAlignment); | |
3181 __ BIND(L_loopTop_128); | |
3182 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input | |
3183 __ pxor (xmm_result, xmm_temp); // xor with the current r vector | |
3184 | |
3185 __ pxor (xmm_result, xmm_key0); // do the aes rounds | |
// aesenc with the preloaded keys in xmm3..xmm11, then aesenclast with xmm12 | |
3186 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { | |
3187 __ aesenc(xmm_result, as_XMMRegister(rnum)); | |
3188 } | |
3189 __ aesenclast(xmm_result, xmm_key10); | |
3190 | |
3191 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output | |
3192 // no need to store r to memory until we exit | |
3193 __ addptr(pos, AESBlockSize); | |
3194 __ subptr(len_reg, AESBlockSize); | |
3195 __ jcc(Assembler::notEqual, L_loopTop_128); | |
3196 | |
// common exit for all three key sizes: the 128-bit loop falls through, the other two jump here | |
3197 __ BIND(L_exit); | |
3198 __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object | |
3199 | |
3200 #ifdef _WIN64 | |
3201 // restore xmm regs belonging to calling function | |
3202 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { | |
3203 __ movdqu(as_XMMRegister(i), xmm_save(i)); | |
3204 } | |
3205 #endif | |
3206 __ movl(rax, 0); // return 0 (why?) | |
3207 __ leave(); // required for proper stackwalking of RuntimeStub frame | |
3208 __ ret(0); | |
3209 | |
3210 __ BIND(L_key_192_256); | |
3211 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) | |
// any length other than 44 or 52 is routed to the 256-bit loop; 60 is not checked explicitly | |
3212 __ cmpl(rax, 52); | |
3213 __ jcc(Assembler::notEqual, L_key_256); | |
3214 | |
3215 // 192-bit code follows here (could be changed to use more xmm registers) | |
3216 __ movptr(pos, 0); | |
3217 __ align(OptoLoopAlignment); | |
3218 __ BIND(L_loopTop_192); | |
3219 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input | |
3220 __ pxor (xmm_result, xmm_temp); // xor with the current r vector | |
3221 | |
3222 __ pxor (xmm_result, xmm_key0); // do the aes rounds | |
// aesenc with all preloaded keys xmm3..xmm12; keys 0xb0 and 0xc0 are reloaded from memory each iteration | |
3223 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { | |
3224 __ aesenc(xmm_result, as_XMMRegister(rnum)); | |
3225 } | |
3226 aes_enc_key(xmm_result, xmm_temp, key, 0xb0); | |
3227 load_key(xmm_temp, key, 0xc0); | |
3228 __ aesenclast(xmm_result, xmm_temp); | |
3229 | |
3230 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output | |
3231 // no need to store r to memory until we exit | |
3232 __ addptr(pos, AESBlockSize); | |
3233 __ subptr(len_reg, AESBlockSize); | |
3234 __ jcc(Assembler::notEqual, L_loopTop_192); | |
3235 __ jmp(L_exit); | |
3236 | |
3237 __ BIND(L_key_256); | |
3238 // 256-bit code follows here (could be changed to use more xmm registers) | |
3239 __ movptr(pos, 0); | |
3240 __ align(OptoLoopAlignment); | |
3241 __ BIND(L_loopTop_256); | |
3242 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input | |
3243 __ pxor (xmm_result, xmm_temp); // xor with the current r vector | |
3244 | |
3245 __ pxor (xmm_result, xmm_key0); // do the aes rounds | |
// aesenc with all preloaded keys xmm3..xmm12; keys 0xb0 .. 0xe0 are reloaded from memory each iteration | |
3246 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { | |
3247 __ aesenc(xmm_result, as_XMMRegister(rnum)); | |
3248 } | |
3249 aes_enc_key(xmm_result, xmm_temp, key, 0xb0); | |
3250 aes_enc_key(xmm_result, xmm_temp, key, 0xc0); | |
3251 aes_enc_key(xmm_result, xmm_temp, key, 0xd0); | |
3252 load_key(xmm_temp, key, 0xe0); | |
3253 __ aesenclast(xmm_result, xmm_temp); | |
3254 | |
3255 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output | |
3256 // no need to store r to memory until we exit | |
3257 __ addptr(pos, AESBlockSize); | |
3258 __ subptr(len_reg, AESBlockSize); | |
3259 __ jcc(Assembler::notEqual, L_loopTop_256); | |
3260 __ jmp(L_exit); | |
3261 | |
3262 return start; | |
3263 } | |
3264 | |
3265 | |
3266 | |
3267 // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time | |
3268 // to hide instruction latency | |
3269 // | |
3270 // Arguments: | |
3271 // | |
3272 // Inputs: | |
3273 // c_rarg0 - source byte array address | |
3274 // c_rarg1 - destination byte array address | |
3275 // c_rarg2 - K (key) in little endian int array | |
3276 // c_rarg3 - r vector byte array address | |
3277 // c_rarg4 - input length | |
3278 // | |
3279 | |
3280 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { | |
3281 assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); | |
3282 __ align(CodeEntryAlignment); | |
3283 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); | |
3284 address start = __ pc(); | |
3285 | |
3286 Label L_exit, L_key_192_256, L_key_256; | |
3287 Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128; | |
3288 Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256; | |
3289 const Register from = c_rarg0; // source array address | |
3290 const Register to = c_rarg1; // destination array address | |
3291 const Register key = c_rarg2; // key array address | |
3292 const Register rvec = c_rarg3; // r byte array initialized from initvector array address | |
3293 // and left with the last cipher (input) block processed, i.e. the r vector for a following call | |
3294 #ifndef _WIN64 | |
3295 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) | |
3296 #else | |
3297 const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 | |
3298 const Register len_reg = r10; // pick the first volatile windows register | |
3299 #endif | |
3300 const Register pos = rax; | |
3301 | |
3302 // xmm register assignments for the loops below | |
3303 const XMMRegister xmm_result = xmm0; | |
3304 // key words 0x10-0xa0 preloaded into xmm5-xmm14, key word 0x00 into xmm15 | |
3305 const int XMM_REG_NUM_KEY_FIRST = 5; | |
3306 const int XMM_REG_NUM_KEY_LAST = 15; | |
3307 const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); | |
3308 const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST); | |
3309 | |
3310 __ enter(); // required for proper stackwalking of RuntimeStub frame | |
3311 | |
3312 #ifdef _WIN64 | |
3313 // on win64, fill len_reg from stack position | |
3314 __ movl(len_reg, len_mem); | |
3315 // save the xmm registers which must be preserved 6-15 | |
3316 __ subptr(rsp, -rsp_after_call_off * wordSize); | |
3317 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { | |
3318 __ movdqu(xmm_save(i), as_XMMRegister(i)); | |
3319 } | |
3320 #endif | |
3321 // the java expanded key ordering is rotated one position from what we want | |
3322 // so we start from 0x10 here and hit 0x00 last | |
3323 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front | |
3324 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); | |
3325 // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 | |
3326 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { | |
3327 if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00; | |
3328 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); | |
3329 offset += 0x10; | |
3330 } | |
3331 | |
3332 const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block | |
3333 // registers holding the four results in the parallelized loop | |
3334 const XMMRegister xmm_result0 = xmm0; | |
3335 const XMMRegister xmm_result1 = xmm2; | |
3336 const XMMRegister xmm_result2 = xmm3; | |
3337 const XMMRegister xmm_result3 = xmm4; | |
3338 | |
3339 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec | |
3340 | |
3341 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) | |
3342 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); | |
3343 __ cmpl(rax, 44); | |
3344 __ jcc(Assembler::notEqual, L_key_192_256); | |
3345 | |
3346 | |
3347 // 128-bit code follows here, parallelized | |
3348 __ movptr(pos, 0); | |
3349 __ align(OptoLoopAlignment); | |
3350 __ BIND(L_multiBlock_loopTop_128); | |
3351 __ cmpptr(len_reg, 4*AESBlockSize); // see if at least 4 blocks left | |
3352 __ jcc(Assembler::less, L_singleBlock_loopTop_128); | |
3353 | |
3354 __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize)); // get next 4 blocks into xmmresult registers | |
3355 __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize)); | |
3356 __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize)); | |
3357 __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize)); | |
3358 | |
3359 #define DoFour(opc, src_reg) \ | |
3360 __ opc(xmm_result0, src_reg); \ | |
3361 __ opc(xmm_result1, src_reg); \ | |
3362 __ opc(xmm_result2, src_reg); \ | |
3363 __ opc(xmm_result3, src_reg); | |
3364 | |
3365 DoFour(pxor, xmm_key_first); | |
3366 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { | |
3367 DoFour(aesdec, as_XMMRegister(rnum)); | |
3368 } | |
3369 DoFour(aesdeclast, xmm_key_last); | |
3370 // for each result, xor with the r vector of previous cipher block | |
3371 __ pxor(xmm_result0, xmm_prev_block_cipher); | |
3372 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize)); | |
3373 __ pxor(xmm_result1, xmm_prev_block_cipher); | |
3374 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize)); | |
3375 __ pxor(xmm_result2, xmm_prev_block_cipher); | |
3376 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize)); | |
3377 __ pxor(xmm_result3, xmm_prev_block_cipher); | |
3378 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize)); // this will carry over to next set of blocks | |
3379 | |
3380 __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output | |
3381 __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1); | |
3382 __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2); | |
3383 __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3); | |
3384 | |
3385 __ addptr(pos, 4*AESBlockSize); | |
3386 __ subptr(len_reg, 4*AESBlockSize); | |
3387 __ jmp(L_multiBlock_loopTop_128); | |
3388 | |
3389 // registers used in the non-parallelized loops | |
3390 const XMMRegister xmm_prev_block_cipher_save = xmm2; | |
3391 const XMMRegister xmm_temp = xmm3; | |
3392 | |
3393 __ align(OptoLoopAlignment); | |
3394 __ BIND(L_singleBlock_loopTop_128); | |
3395 __ cmpptr(len_reg, 0); // any blocks left?? | |
3396 __ jcc(Assembler::equal, L_exit); | |
3397 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input | |
3398 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector | |
3399 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds | |
3400 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { | |
3401 __ aesdec(xmm_result, as_XMMRegister(rnum)); | |
3402 } | |
3403 __ aesdeclast(xmm_result, xmm_key_last); | |
3404 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector | |
3405 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output | |
3406 // no need to store r to memory until we exit | |
3407 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block | |
3408 | |
3409 __ addptr(pos, AESBlockSize); | |
3410 __ subptr(len_reg, AESBlockSize); | |
3411 __ jmp(L_singleBlock_loopTop_128); | |
3412 | |
3413 | |
3414 __ BIND(L_exit); | |
3415 __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object | |
3416 #ifdef _WIN64 | |
3417 // restore regs belonging to calling function | |
3418 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { | |
3419 __ movdqu(as_XMMRegister(i), xmm_save(i)); | |
3420 } | |
3421 #endif | |
3422 __ movl(rax, 0); // return 0 (why?) | |
3423 __ leave(); // required for proper stackwalking of RuntimeStub frame | |
3424 __ ret(0); | |
3425 | |
3426 | |
3427 __ BIND(L_key_192_256); | |
3428 // here rax = length, in ints, of the AESCrypt.KLE array (52 for a 192-bit key, otherwise 60 for a 256-bit key) | |
3429 __ cmpl(rax, 52); | |
3430 __ jcc(Assembler::notEqual, L_key_256); | |
3431 | |
3432 // 192-bit code follows here (could be optimized to use parallelism) | |
3433 __ movptr(pos, 0); | |
3434 __ align(OptoLoopAlignment); | |
3435 __ BIND(L_singleBlock_loopTop_192); | |
3436 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input | |
3437 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector | |
3438 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds | |
3439 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { | |
3440 __ aesdec(xmm_result, as_XMMRegister(rnum)); | |
3441 } | |
3442 aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 192-bit key goes up to c0 | |
3443 aes_dec_key(xmm_result, xmm_temp, key, 0xc0); | |
3444 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 | |
3445 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector | |
3446 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output | |
3447 // no need to store r to memory until we exit | |
3448 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block | |
3449 | |
3450 __ addptr(pos, AESBlockSize); | |
3451 __ subptr(len_reg, AESBlockSize); | |
3452 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); | |
3453 __ jmp(L_exit); | |
3454 | |
3455 __ BIND(L_key_256); | |
3456 // 256-bit code follows here (could be optimized to use parallelism) | |
3457 __ movptr(pos, 0); | |
3458 __ align(OptoLoopAlignment); | |
3459 __ BIND(L_singleBlock_loopTop_256); | |
3460 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input | |
3461 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector | |
3462 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds | |
3463 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { | |
3464 __ aesdec(xmm_result, as_XMMRegister(rnum)); | |
3465 } | |
3466 aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 256-bit key goes up to e0 | |
3467 aes_dec_key(xmm_result, xmm_temp, key, 0xc0); | |
3468 aes_dec_key(xmm_result, xmm_temp, key, 0xd0); | |
3469 aes_dec_key(xmm_result, xmm_temp, key, 0xe0); | |
3470 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 | |
3471 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector | |
3472 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output | |
3473 // no need to store r to memory until we exit | |
3474 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block | |
3475 | |
3476 __ addptr(pos, AESBlockSize); | |
3477 __ subptr(len_reg, AESBlockSize); | |
3478 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); | |
3479 __ jmp(L_exit); | |
3480 | |
3481 return start; | |
3482 } | |
3483 | |
3484 | |
2960 | 3485 |
2961 #undef __ | 3486 #undef __ |
2962 #define __ masm-> | 3487 #define __ masm-> |
2963 | 3488 |
2964 // Continuation point for throwing of implicit exceptions that are | 3489 // Continuation point for throwing of implicit exceptions that are |
3150 | 3675 |
3151 // arraycopy stubs used by compilers | 3676 // arraycopy stubs used by compilers |
3152 generate_arraycopy_stubs(); | 3677 generate_arraycopy_stubs(); |
3153 | 3678 |
3154 generate_math_stubs(); | 3679 generate_math_stubs(); |
3680 | |
3681 // generate the AES intrinsic stubs only when the UseAESIntrinsics flag is set | |
3682 if (UseAESIntrinsics) { | |
3683 StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others | |
3684 | |
3685 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); | |
3686 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); | |
3687 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); | |
3688 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); | |
3689 } | |
3155 } | 3690 } |
3156 | 3691 |
3157 public: | 3692 public: |
3158 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { | 3693 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { |
3159 if (all) { | 3694 if (all) { |