comparison src/cpu/x86/vm/sharedRuntime_x86_64.cpp @ 23824:ea7ac121a5d3

8130150: Implement BigInteger.montgomeryMultiply intrinsic Reviewed-by: kvn, mdoerr
author vkempik
date Fri, 04 Mar 2016 16:15:48 +0300
parents e8260b6328fb
children f13e777eb255
comparison
equal deleted inserted replaced
23823:ebd6745380b9 23824:ea7ac121a5d3
21 * questions. 21 * questions.
22 * 22 *
23 */ 23 */
24 24
25 #include "precompiled.hpp" 25 #include "precompiled.hpp"
26 #ifndef _WINDOWS
27 #include "alloca.h"
28 #endif
26 #include "asm/macroAssembler.hpp" 29 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp" 30 #include "asm/macroAssembler.inline.hpp"
28 #include "code/debugInfoRec.hpp" 31 #include "code/debugInfoRec.hpp"
29 #include "code/icBuffer.hpp" 32 #include "code/icBuffer.hpp"
30 #include "code/vtableStubs.hpp" 33 #include "code/vtableStubs.hpp"
3964 // frame_size_words or bytes?? 3967 // frame_size_words or bytes??
3965 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3968 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3966 } 3969 }
3967 3970
3968 3971
3972 //------------------------------Montgomery multiplication------------------------
3973 //
3974
3975 #ifndef _WINDOWS
3976
3977 #define ASM_SUBTRACT
3978
3979 #ifdef ASM_SUBTRACT
3980 // Subtract 0:b from carry:a. Return carry.
3981 static unsigned long
3982 sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
3983 long i = 0, cnt = len;
3984 unsigned long tmp;
3985 asm volatile("clc; "
3986 "0: ; "
3987 "mov (%[b], %[i], 8), %[tmp]; "
3988 "sbb %[tmp], (%[a], %[i], 8); "
3989 "inc %[i]; dec %[cnt]; "
3990 "jne 0b; "
3991 "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3992 : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3993 : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3994 : "memory");
3995 return tmp;
3996 }
3997 #else // ASM_SUBTRACT
3998 typedef int __attribute__((mode(TI))) int128;
3999
4000 // Subtract 0:b from carry:a. Return carry.
4001 static unsigned long
4002 sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
4003 int128 tmp = 0;
4004 int i;
4005 for (i = 0; i < len; i++) {
4006 tmp += a[i];
4007 tmp -= b[i];
4008 a[i] = tmp;
4009 tmp >>= 64;
4010 assert(-1 <= tmp && tmp <= 0, "invariant");
4011 }
4012 return tmp + carry;
4013 }
4014 #endif // ! ASM_SUBTRACT
4015
4016 // Multiply (unsigned) Long A by Long B, accumulating the double-
4017 // length result into the accumulator formed of T0, T1, and T2.
4018 #define MACC(A, B, T0, T1, T2) \
4019 do { \
4020 unsigned long hi, lo; \
4021 asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
4022 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
4023 : "r"(A), "a"(B) : "cc"); \
4024 } while(0)
4025
4026 // As above, but add twice the double-length result into the
4027 // accumulator.
4028 #define MACC2(A, B, T0, T1, T2) \
4029 do { \
4030 unsigned long hi, lo; \
4031 asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4;" \
4032 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
4033 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
4034 : "r"(A), "a"(B) : "cc"); \
4035 } while(0)
4036
4037 // Fast Montgomery multiplication. The derivation of the algorithm is
4038 // in A Cryptographic Library for the Motorola DSP56000,
4039 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4040
4041 static void __attribute__((noinline))
4042 montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
4043 unsigned long m[], unsigned long inv, int len) {
4044 unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4045 int i;
4046
4047 assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4048
4049 for (i = 0; i < len; i++) {
4050 int j;
4051 for (j = 0; j < i; j++) {
4052 MACC(a[j], b[i-j], t0, t1, t2);
4053 MACC(m[j], n[i-j], t0, t1, t2);
4054 }
4055 MACC(a[i], b[0], t0, t1, t2);
4056 m[i] = t0 * inv;
4057 MACC(m[i], n[0], t0, t1, t2);
4058
4059 assert(t0 == 0, "broken Montgomery multiply");
4060
4061 t0 = t1; t1 = t2; t2 = 0;
4062 }
4063
4064 for (i = len; i < 2*len; i++) {
4065 int j;
4066 for (j = i-len+1; j < len; j++) {
4067 MACC(a[j], b[i-j], t0, t1, t2);
4068 MACC(m[j], n[i-j], t0, t1, t2);
4069 }
4070 m[i-len] = t0;
4071 t0 = t1; t1 = t2; t2 = 0;
4072 }
4073
4074 while (t0)
4075 t0 = sub(m, n, t0, len);
4076 }
4077
4078 // Fast Montgomery squaring. This uses asymptotically 25% fewer
4079 // multiplies so it should be up to 25% faster than Montgomery
4080 // multiplication. However, its loop control is more complex and it
4081 // may actually run slower on some machines.
4082
4083 static void __attribute__((noinline))
4084 montgomery_square(unsigned long a[], unsigned long n[],
4085 unsigned long m[], unsigned long inv, int len) {
4086 unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4087 int i;
4088
4089 assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4090
4091 for (i = 0; i < len; i++) {
4092 int j;
4093 int end = (i+1)/2;
4094 for (j = 0; j < end; j++) {
4095 MACC2(a[j], a[i-j], t0, t1, t2);
4096 MACC(m[j], n[i-j], t0, t1, t2);
4097 }
4098 if ((i & 1) == 0) {
4099 MACC(a[j], a[j], t0, t1, t2);
4100 }
4101 for (; j < i; j++) {
4102 MACC(m[j], n[i-j], t0, t1, t2);
4103 }
4104 m[i] = t0 * inv;
4105 MACC(m[i], n[0], t0, t1, t2);
4106
4107 assert(t0 == 0, "broken Montgomery square");
4108
4109 t0 = t1; t1 = t2; t2 = 0;
4110 }
4111
4112 for (i = len; i < 2*len; i++) {
4113 int start = i-len+1;
4114 int end = start + (len - start)/2;
4115 int j;
4116 for (j = start; j < end; j++) {
4117 MACC2(a[j], a[i-j], t0, t1, t2);
4118 MACC(m[j], n[i-j], t0, t1, t2);
4119 }
4120 if ((i & 1) == 0) {
4121 MACC(a[j], a[j], t0, t1, t2);
4122 }
4123 for (; j < len; j++) {
4124 MACC(m[j], n[i-j], t0, t1, t2);
4125 }
4126 m[i-len] = t0;
4127 t0 = t1; t1 = t2; t2 = 0;
4128 }
4129
4130 while (t0)
4131 t0 = sub(m, n, t0, len);
4132 }
4133
4134 // Swap words in a longword.
4135 static unsigned long swap(unsigned long x) {
4136 return (x << 32) | (x >> 32);
4137 }
4138
4139 // Copy len longwords from s to d, word-swapping as we go. The
4140 // destination array is reversed.
4141 static void reverse_words(unsigned long *s, unsigned long *d, int len) {
4142 d += len;
4143 while(len-- > 0) {
4144 d--;
4145 *d = swap(*s);
4146 s++;
4147 }
4148 }
4149
4150 // The threshold at which squaring is advantageous was determined
4151 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
4152 #define MONTGOMERY_SQUARING_THRESHOLD 64
4153
4154 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
4155 jint len, jlong inv,
4156 jint *m_ints) {
4157 assert(len % 2 == 0, "array length in montgomery_multiply must be even");
4158 int longwords = len/2;
4159
4160 // Make very sure we don't use so much space that the stack might
4161 // overflow. 512 jints corresponds to an 16384-bit integer and
4162 // will use here a total of 8k bytes of stack space.
4163 int total_allocation = longwords * sizeof (unsigned long) * 4;
4164 guarantee(total_allocation <= 8192, "must be");
4165 unsigned long *scratch = (unsigned long *)alloca(total_allocation);
4166
4167 // Local scratch arrays
4168 unsigned long
4169 *a = scratch + 0 * longwords,
4170 *b = scratch + 1 * longwords,
4171 *n = scratch + 2 * longwords,
4172 *m = scratch + 3 * longwords;
4173
4174 reverse_words((unsigned long *)a_ints, a, longwords);
4175 reverse_words((unsigned long *)b_ints, b, longwords);
4176 reverse_words((unsigned long *)n_ints, n, longwords);
4177
4178 ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
4179
4180 reverse_words(m, (unsigned long *)m_ints, longwords);
4181 }
4182
4183 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
4184 jint len, jlong inv,
4185 jint *m_ints) {
4186 assert(len % 2 == 0, "array length in montgomery_square must be even");
4187 int longwords = len/2;
4188
4189 // Make very sure we don't use so much space that the stack might
4190 // overflow. 512 jints corresponds to an 16384-bit integer and
4191 // will use here a total of 6k bytes of stack space.
4192 int total_allocation = longwords * sizeof (unsigned long) * 3;
4193 guarantee(total_allocation <= 8192, "must be");
4194 unsigned long *scratch = (unsigned long *)alloca(total_allocation);
4195
4196 // Local scratch arrays
4197 unsigned long
4198 *a = scratch + 0 * longwords,
4199 *n = scratch + 1 * longwords,
4200 *m = scratch + 2 * longwords;
4201
4202 reverse_words((unsigned long *)a_ints, a, longwords);
4203 reverse_words((unsigned long *)n_ints, n, longwords);
4204
4205 //montgomery_square fails to pass BigIntegerTest on solaris amd64
4206 //on jdk7 and jdk8.
4207 #ifndef SOLARIS
4208 if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
4209 #else
4210 if (0) {
4211 #endif
4212 ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
4213 } else {
4214 ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
4215 }
4216
4217 reverse_words(m, (unsigned long *)m_ints, longwords);
4218 }
4219
4220 #endif // WINDOWS
4221
3969 #ifdef COMPILER2 4222 #ifdef COMPILER2
3970 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame 4223 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3971 // 4224 //
3972 //------------------------------generate_exception_blob--------------------------- 4225 //------------------------------generate_exception_blob---------------------------
3973 // creates exception blob at the end 4226 // creates exception blob at the end