Mercurial > hg > graal-jvmci-8
comparison src/cpu/x86/vm/sharedRuntime_x86_64.cpp @ 23824:ea7ac121a5d3
8130150: Implement BigInteger.montgomeryMultiply intrinsic
Reviewed-by: kvn, mdoerr
author | vkempik |
---|---|
date | Fri, 04 Mar 2016 16:15:48 +0300 |
parents | e8260b6328fb |
children | f13e777eb255 |
comparison
equal
deleted
inserted
replaced
23823:ebd6745380b9 | 23824:ea7ac121a5d3 |
---|---|
21 * questions. | 21 * questions. |
22 * | 22 * |
23 */ | 23 */ |
24 | 24 |
25 #include "precompiled.hpp" | 25 #include "precompiled.hpp" |
26 #ifndef _WINDOWS | |
27 #include "alloca.h" | |
28 #endif | |
26 #include "asm/macroAssembler.hpp" | 29 #include "asm/macroAssembler.hpp" |
27 #include "asm/macroAssembler.inline.hpp" | 30 #include "asm/macroAssembler.inline.hpp" |
28 #include "code/debugInfoRec.hpp" | 31 #include "code/debugInfoRec.hpp" |
29 #include "code/icBuffer.hpp" | 32 #include "code/icBuffer.hpp" |
30 #include "code/vtableStubs.hpp" | 33 #include "code/vtableStubs.hpp" |
3964 // frame_size_words or bytes?? | 3967 // frame_size_words or bytes?? |
3965 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); | 3968 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); |
3966 } | 3969 } |
3967 | 3970 |
3968 | 3971 |
3972 //------------------------------Montgomery multiplication------------------------ | |
3973 // | |
3974 | |
3975 #ifndef _WINDOWS | |
3976 | |
3977 #define ASM_SUBTRACT | |
3978 | |
3979 #ifdef ASM_SUBTRACT | |
3980 // Subtract 0:b from carry:a. Return carry. | |
3981 static unsigned long | |
3982 sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) { | |
3983 long i = 0, cnt = len; | |
3984 unsigned long tmp; | |
3985 asm volatile("clc; " | |
3986 "0: ; " | |
3987 "mov (%[b], %[i], 8), %[tmp]; " | |
3988 "sbb %[tmp], (%[a], %[i], 8); " | |
3989 "inc %[i]; dec %[cnt]; " | |
3990 "jne 0b; " | |
3991 "mov %[carry], %[tmp]; sbb $0, %[tmp]; " | |
3992 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) | |
3993 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) | |
3994 : "memory"); | |
3995 return tmp; | |
3996 } | |
3997 #else // ASM_SUBTRACT | |
3998 typedef int __attribute__((mode(TI))) int128; | |
3999 | |
4000 // Subtract 0:b from carry:a. Return carry. | |
4001 static unsigned long | |
4002 sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) { | |
4003 int128 tmp = 0; | |
4004 int i; | |
4005 for (i = 0; i < len; i++) { | |
4006 tmp += a[i]; | |
4007 tmp -= b[i]; | |
4008 a[i] = tmp; | |
4009 tmp >>= 64; | |
4010 assert(-1 <= tmp && tmp <= 0, "invariant"); | |
4011 } | |
4012 return tmp + carry; | |
4013 } | |
4014 #endif // ! ASM_SUBTRACT | |
4015 | |
4016 // Multiply (unsigned) Long A by Long B, accumulating the double- | |
4017 // length result into the accumulator formed of T0, T1, and T2. | |
4018 #define MACC(A, B, T0, T1, T2) \ | |
4019 do { \ | |
4020 unsigned long hi, lo; \ | |
4021 asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ | |
4022 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ | |
4023 : "r"(A), "a"(B) : "cc"); \ | |
4024 } while(0) | |
4025 | |
4026 // As above, but add twice the double-length result into the | |
4027 // accumulator. | |
4028 #define MACC2(A, B, T0, T1, T2) \ | |
4029 do { \ | |
4030 unsigned long hi, lo; \ | |
4031 asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4;" \ | |
4032 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ | |
4033 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ | |
4034 : "r"(A), "a"(B) : "cc"); \ | |
4035 } while(0) | |
4036 | |
4037 // Fast Montgomery multiplication. The derivation of the algorithm is | |
4038 // in A Cryptographic Library for the Motorola DSP56000, | |
4039 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. | |
4040 | |
4041 static void __attribute__((noinline)) | |
4042 montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[], | |
4043 unsigned long m[], unsigned long inv, int len) { | |
4044 unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator | |
4045 int i; | |
4046 | |
4047 assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); | |
4048 | |
4049 for (i = 0; i < len; i++) { | |
4050 int j; | |
4051 for (j = 0; j < i; j++) { | |
4052 MACC(a[j], b[i-j], t0, t1, t2); | |
4053 MACC(m[j], n[i-j], t0, t1, t2); | |
4054 } | |
4055 MACC(a[i], b[0], t0, t1, t2); | |
4056 m[i] = t0 * inv; | |
4057 MACC(m[i], n[0], t0, t1, t2); | |
4058 | |
4059 assert(t0 == 0, "broken Montgomery multiply"); | |
4060 | |
4061 t0 = t1; t1 = t2; t2 = 0; | |
4062 } | |
4063 | |
4064 for (i = len; i < 2*len; i++) { | |
4065 int j; | |
4066 for (j = i-len+1; j < len; j++) { | |
4067 MACC(a[j], b[i-j], t0, t1, t2); | |
4068 MACC(m[j], n[i-j], t0, t1, t2); | |
4069 } | |
4070 m[i-len] = t0; | |
4071 t0 = t1; t1 = t2; t2 = 0; | |
4072 } | |
4073 | |
4074 while (t0) | |
4075 t0 = sub(m, n, t0, len); | |
4076 } | |
4077 | |
4078 // Fast Montgomery squaring. This uses asymptotically 25% fewer | |
4079 // multiplies so it should be up to 25% faster than Montgomery | |
4080 // multiplication. However, its loop control is more complex and it | |
4081 // may actually run slower on some machines. | |
4082 | |
4083 static void __attribute__((noinline)) | |
4084 montgomery_square(unsigned long a[], unsigned long n[], | |
4085 unsigned long m[], unsigned long inv, int len) { | |
4086 unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator | |
4087 int i; | |
4088 | |
4089 assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); | |
4090 | |
4091 for (i = 0; i < len; i++) { | |
4092 int j; | |
4093 int end = (i+1)/2; | |
4094 for (j = 0; j < end; j++) { | |
4095 MACC2(a[j], a[i-j], t0, t1, t2); | |
4096 MACC(m[j], n[i-j], t0, t1, t2); | |
4097 } | |
4098 if ((i & 1) == 0) { | |
4099 MACC(a[j], a[j], t0, t1, t2); | |
4100 } | |
4101 for (; j < i; j++) { | |
4102 MACC(m[j], n[i-j], t0, t1, t2); | |
4103 } | |
4104 m[i] = t0 * inv; | |
4105 MACC(m[i], n[0], t0, t1, t2); | |
4106 | |
4107 assert(t0 == 0, "broken Montgomery square"); | |
4108 | |
4109 t0 = t1; t1 = t2; t2 = 0; | |
4110 } | |
4111 | |
4112 for (i = len; i < 2*len; i++) { | |
4113 int start = i-len+1; | |
4114 int end = start + (len - start)/2; | |
4115 int j; | |
4116 for (j = start; j < end; j++) { | |
4117 MACC2(a[j], a[i-j], t0, t1, t2); | |
4118 MACC(m[j], n[i-j], t0, t1, t2); | |
4119 } | |
4120 if ((i & 1) == 0) { | |
4121 MACC(a[j], a[j], t0, t1, t2); | |
4122 } | |
4123 for (; j < len; j++) { | |
4124 MACC(m[j], n[i-j], t0, t1, t2); | |
4125 } | |
4126 m[i-len] = t0; | |
4127 t0 = t1; t1 = t2; t2 = 0; | |
4128 } | |
4129 | |
4130 while (t0) | |
4131 t0 = sub(m, n, t0, len); | |
4132 } | |
4133 | |
4134 // Swap words in a longword. | |
4135 static unsigned long swap(unsigned long x) { | |
4136 return (x << 32) | (x >> 32); | |
4137 } | |
4138 | |
4139 // Copy len longwords from s to d, word-swapping as we go. The | |
4140 // destination array is reversed. | |
4141 static void reverse_words(unsigned long *s, unsigned long *d, int len) { | |
4142 d += len; | |
4143 while(len-- > 0) { | |
4144 d--; | |
4145 *d = swap(*s); | |
4146 s++; | |
4147 } | |
4148 } | |
4149 | |
4150 // The threshold at which squaring is advantageous was determined | |
4151 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. | |
4152 #define MONTGOMERY_SQUARING_THRESHOLD 64 | |
4153 | |
4154 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, | |
4155 jint len, jlong inv, | |
4156 jint *m_ints) { | |
4157 assert(len % 2 == 0, "array length in montgomery_multiply must be even"); | |
4158 int longwords = len/2; | |
4159 | |
4160 // Make very sure we don't use so much space that the stack might | |
4161 // overflow. 512 jints corresponds to an 16384-bit integer and | |
4162 // will use here a total of 8k bytes of stack space. | |
4163 int total_allocation = longwords * sizeof (unsigned long) * 4; | |
4164 guarantee(total_allocation <= 8192, "must be"); | |
4165 unsigned long *scratch = (unsigned long *)alloca(total_allocation); | |
4166 | |
4167 // Local scratch arrays | |
4168 unsigned long | |
4169 *a = scratch + 0 * longwords, | |
4170 *b = scratch + 1 * longwords, | |
4171 *n = scratch + 2 * longwords, | |
4172 *m = scratch + 3 * longwords; | |
4173 | |
4174 reverse_words((unsigned long *)a_ints, a, longwords); | |
4175 reverse_words((unsigned long *)b_ints, b, longwords); | |
4176 reverse_words((unsigned long *)n_ints, n, longwords); | |
4177 | |
4178 ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords); | |
4179 | |
4180 reverse_words(m, (unsigned long *)m_ints, longwords); | |
4181 } | |
4182 | |
4183 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, | |
4184 jint len, jlong inv, | |
4185 jint *m_ints) { | |
4186 assert(len % 2 == 0, "array length in montgomery_square must be even"); | |
4187 int longwords = len/2; | |
4188 | |
4189 // Make very sure we don't use so much space that the stack might | |
4190 // overflow. 512 jints corresponds to an 16384-bit integer and | |
4191 // will use here a total of 6k bytes of stack space. | |
4192 int total_allocation = longwords * sizeof (unsigned long) * 3; | |
4193 guarantee(total_allocation <= 8192, "must be"); | |
4194 unsigned long *scratch = (unsigned long *)alloca(total_allocation); | |
4195 | |
4196 // Local scratch arrays | |
4197 unsigned long | |
4198 *a = scratch + 0 * longwords, | |
4199 *n = scratch + 1 * longwords, | |
4200 *m = scratch + 2 * longwords; | |
4201 | |
4202 reverse_words((unsigned long *)a_ints, a, longwords); | |
4203 reverse_words((unsigned long *)n_ints, n, longwords); | |
4204 | |
4205 //montgomery_square fails to pass BigIntegerTest on solaris amd64 | |
4206 //on jdk7 and jdk8. | |
4207 #ifndef SOLARIS | |
4208 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { | |
4209 #else | |
4210 if (0) { | |
4211 #endif | |
4212 ::montgomery_square(a, n, m, (unsigned long)inv, longwords); | |
4213 } else { | |
4214 ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords); | |
4215 } | |
4216 | |
4217 reverse_words(m, (unsigned long *)m_ints, longwords); | |
4218 } | |
4219 | |
4220 #endif // WINDOWS | |
4221 | |
3969 #ifdef COMPILER2 | 4222 #ifdef COMPILER2 |
3970 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame | 4223 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame |
3971 // | 4224 // |
3972 //------------------------------generate_exception_blob--------------------------- | 4225 //------------------------------generate_exception_blob--------------------------- |
3973 // creates exception blob at the end | 4226 // creates exception blob at the end |