Mercurial > hg > graal-compiler
comparison src/cpu/x86/vm/x86_32.ad @ 6084:6759698e3140
7133857: exp() and pow() should use the x87 ISA on x86
Summary: use x87 instructions to implement exp() and pow() in interpreter/c1/c2.
Reviewed-by: kvn, never, twisti
author | roland |
---|---|
date | Tue, 15 May 2012 10:10:23 +0200 |
parents | 61b82be3b1ff |
children | ccaa67adfe5b |
comparison
equal
deleted
inserted
replaced
6057:8f972594effc | 6084:6759698e3140 |
---|---|
2532 | 2532 |
2533 enc_class push_xmm_to_fpr1(regD src) %{ | 2533 enc_class push_xmm_to_fpr1(regD src) %{ |
2534 MacroAssembler _masm(&cbuf); | 2534 MacroAssembler _masm(&cbuf); |
2535 __ movdbl(Address(rsp, 0), $src$$XMMRegister); | 2535 __ movdbl(Address(rsp, 0), $src$$XMMRegister); |
2536 __ fld_d(Address(rsp, 0)); | 2536 __ fld_d(Address(rsp, 0)); |
2537 %} | |
2538 | |
2539 // Compute X^Y using Intel's fast hardware instructions, if possible. | |
2540 // Otherwise return a NaN. | |
2541 enc_class pow_exp_core_encoding %{ | |
2542 // FPR1 holds Q: Y*log2(X) for pow, X*log2(e) for exp. Compute FPR1 = 2^Q | |
2543 emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xC0); // fdup = fld st(0) Q Q | |
2544 emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xFC); // frndint int(Q) Q | |
2545 emit_opcode(cbuf,0xDC); emit_opcode(cbuf,0xE9); // fsub st(1) -= st(0); int(Q) frac(Q) | |
2546 emit_opcode(cbuf,0xDB); // FISTP [ESP] frac(Q) | |
2547 emit_opcode(cbuf,0x1C); | |
2548 emit_d8(cbuf,0x24); | |
2549 emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xF0); // f2xm1 2^frac(Q)-1 | |
2550 emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xE8); // fld1 1 2^frac(Q)-1 | |
2551 emit_opcode(cbuf,0xDE); emit_opcode(cbuf,0xC1); // faddp 2^frac(Q) | |
2552 emit_opcode(cbuf,0x8B); // mov rax,[esp+0]=int(Q) | |
2553 encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 0, false); | |
2554 emit_opcode(cbuf,0xC7); // mov rcx,0xFFFFF800 - overflow mask | |
2555 emit_rm(cbuf, 0x3, 0x0, ECX_enc); | |
2556 emit_d32(cbuf,0xFFFFF800); | |
2557 emit_opcode(cbuf,0x81); // add rax,1023 - the double exponent bias | |
2558 emit_rm(cbuf, 0x3, 0x0, EAX_enc); | |
2559 emit_d32(cbuf,1023); | |
2560 emit_opcode(cbuf,0x8B); // mov rbx,eax | |
2561 emit_rm(cbuf, 0x3, EBX_enc, EAX_enc); | |
2562 emit_opcode(cbuf,0xC1); // shl rax,20 - Slide to exponent position | |
2563 emit_rm(cbuf,0x3,0x4,EAX_enc); | |
2564 emit_d8(cbuf,20); | |
2565 emit_opcode(cbuf,0x85); // test rbx,ecx - check for overflow | |
2566 emit_rm(cbuf, 0x3, EBX_enc, ECX_enc); | |
2567 emit_opcode(cbuf,0x0F); emit_opcode(cbuf,0x45); // CMOVne rax,ecx - overflow; stuff NAN into EAX | |
2568 emit_rm(cbuf, 0x3, EAX_enc, ECX_enc); | |
2569 emit_opcode(cbuf,0x89); // mov [esp+4],eax - Store as part of double word | |
2570 encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 4, false); | |
2571 emit_opcode(cbuf,0xC7); // mov [esp+0],0 - [ESP] = (double)(1<<int(Q)) = 2^int(Q) | |
2572 encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false); | |
2573 emit_d32(cbuf,0); | |
2574 emit_opcode(cbuf,0xDC); // fmul st(0),qword [esp+0]; FPR1 = 2^int(Q)*2^frac(Q) = 2^Q | |
2575 encode_RegMem(cbuf, 0x1, ESP_enc, 0x4, 0, 0, false); | |
2576 %} | 2537 %} |
2577 | 2538 |
2578 enc_class Push_Result_Mod_DPR( regDPR src) %{ | 2539 enc_class Push_Result_Mod_DPR( regDPR src) %{ |
2579 if ($src$$reg != FPR1L_enc) { | 2540 if ($src$$reg != FPR1L_enc) { |
2580 // fincstp | 2541 // fincstp |
10098 ins_encode( Push_Reg_DPR(src), | 10059 ins_encode( Push_Reg_DPR(src), |
10099 OpcS, OpcP, Pop_Reg_DPR(dst) ); | 10060 OpcS, OpcP, Pop_Reg_DPR(dst) ); |
10100 ins_pipe( pipe_slow ); | 10061 ins_pipe( pipe_slow ); |
10101 %} | 10062 %} |
10102 | 10063 |
10103 instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{ | 10064 instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{ |
10104 predicate (UseSSE<=1); | 10065 predicate (UseSSE<=1); |
10105 match(Set Y (PowD X Y)); // Raise X to the Yth power | 10066 match(Set Y (PowD X Y)); // Raise X to the Yth power |
10106 effect(KILL rax, KILL rbx, KILL rcx); | 10067 effect(KILL rax, KILL rdx, KILL rcx, KILL cr); |
10107 format %{ "SUB ESP,8\t\t# Fast-path POW encoding\n\t" | 10068 format %{ "fast_pow $X $Y -> $Y // KILL $rax, $rcx, $rdx" %} |
10108 "FLD_D $X\n\t" | 10069 ins_encode %{ |
10109 "FYL2X \t\t\t# Q=Y*ln2(X)\n\t" | 10070 __ subptr(rsp, 8); |
10110 | 10071 __ fld_s($X$$reg - 1); |
10111 "FDUP \t\t\t# Q Q\n\t" | 10072 __ fast_pow(); |
10112 "FRNDINT\t\t\t# int(Q) Q\n\t" | 10073 __ addptr(rsp, 8); |
10113 "FSUB ST(1),ST(0)\t# int(Q) frac(Q)\n\t" | 10074 %} |
10114 "FISTP dword [ESP]\n\t" | 10075 ins_pipe( pipe_slow ); |
10115 "F2XM1 \t\t\t# 2^frac(Q)-1 int(Q)\n\t" | 10076 %} |
10116 "FLD1 \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t" | 10077 |
10117 "FADDP \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead | 10078 instruct powD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{ |
10118 "MOV EAX,[ESP]\t# Pick up int(Q)\n\t" | |
10119 "MOV ECX,0xFFFFF800\t# Overflow mask\n\t" | |
10120 "ADD EAX,1023\t\t# Double exponent bias\n\t" | |
10121 "MOV EBX,EAX\t\t# Preshifted biased expo\n\t" | |
10122 "SHL EAX,20\t\t# Shift exponent into place\n\t" | |
10123 "TEST EBX,ECX\t\t# Check for overflow\n\t" | |
10124 "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t" | |
10125 "MOV [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t" | |
10126 "MOV [ESP+0],0\n\t" | |
10127 "FMUL ST(0),[ESP+0]\t# Scale\n\t" | |
10128 | |
10129 "ADD ESP,8" | |
10130 %} | |
10131 ins_encode( push_stack_temp_qword, | |
10132 Push_Reg_DPR(X), | |
10133 Opcode(0xD9), Opcode(0xF1), // fyl2x | |
10134 pow_exp_core_encoding, | |
10135 pop_stack_temp_qword); | |
10136 ins_pipe( pipe_slow ); | |
10137 %} | |
10138 | |
10139 instruct powD_reg(regD dst, regD src0, regD src1, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx ) %{ | |
10140 predicate (UseSSE>=2); | 10079 predicate (UseSSE>=2); |
10141 match(Set dst (PowD src0 src1)); // Raise src0 to the src1'th power | 10080 match(Set dst (PowD src0 src1)); // Raise src0 to the src1'th power |
10142 effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx ); | 10081 effect(KILL rax, KILL rdx, KILL rcx, KILL cr); |
10143 format %{ "SUB ESP,8\t\t# Fast-path POW encoding\n\t" | 10082 format %{ "fast_pow $src0 $src1 -> $dst // KILL $rax, $rcx, $rdx" %} |
10144 "MOVSD [ESP],$src1\n\t" | 10083 ins_encode %{ |
10145 "FLD FPR1,$src1\n\t" | 10084 __ subptr(rsp, 8); |
10146 "MOVSD [ESP],$src0\n\t" | 10085 __ movdbl(Address(rsp, 0), $src1$$XMMRegister); |
10147 "FLD FPR1,$src0\n\t" | 10086 __ fld_d(Address(rsp, 0)); |
10148 "FYL2X \t\t\t# Q=Y*ln2(X)\n\t" | 10087 __ movdbl(Address(rsp, 0), $src0$$XMMRegister); |
10149 | 10088 __ fld_d(Address(rsp, 0)); |
10150 "FDUP \t\t\t# Q Q\n\t" | 10089 __ fast_pow(); |
10151 "FRNDINT\t\t\t# int(Q) Q\n\t" | 10090 __ fstp_d(Address(rsp, 0)); |
10152 "FSUB ST(1),ST(0)\t# int(Q) frac(Q)\n\t" | 10091 __ movdbl($dst$$XMMRegister, Address(rsp, 0)); |
10153 "FISTP dword [ESP]\n\t" | 10092 __ addptr(rsp, 8); |
10154 "F2XM1 \t\t\t# 2^frac(Q)-1 int(Q)\n\t" | 10093 %} |
10155 "FLD1 \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t" | 10094 ins_pipe( pipe_slow ); |
10156 "FADDP \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead | 10095 %} |
10157 "MOV EAX,[ESP]\t# Pick up int(Q)\n\t" | 10096 |
10158 "MOV ECX,0xFFFFF800\t# Overflow mask\n\t" | 10097 |
10159 "ADD EAX,1023\t\t# Double exponent bias\n\t" | 10098 instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{ |
10160 "MOV EBX,EAX\t\t# Preshifted biased expo\n\t" | |
10161 "SHL EAX,20\t\t# Shift exponent into place\n\t" | |
10162 "TEST EBX,ECX\t\t# Check for overflow\n\t" | |
10163 "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t" | |
10164 "MOV [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t" | |
10165 "MOV [ESP+0],0\n\t" | |
10166 "FMUL ST(0),[ESP+0]\t# Scale\n\t" | |
10167 | |
10168 "FST_D [ESP]\n\t" | |
10169 "MOVSD $dst,[ESP]\n\t" | |
10170 "ADD ESP,8" | |
10171 %} | |
10172 ins_encode( push_stack_temp_qword, | |
10173 push_xmm_to_fpr1(src1), | |
10174 push_xmm_to_fpr1(src0), | |
10175 Opcode(0xD9), Opcode(0xF1), // fyl2x | |
10176 pow_exp_core_encoding, | |
10177 Push_ResultD(dst) ); | |
10178 ins_pipe( pipe_slow ); | |
10179 %} | |
10180 | |
10181 | |
10182 instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{ | |
10183 predicate (UseSSE<=1); | 10099 predicate (UseSSE<=1); |
10184 match(Set dpr1 (ExpD dpr1)); | 10100 match(Set dpr1 (ExpD dpr1)); |
10185 effect(KILL rax, KILL rbx, KILL rcx); | 10101 effect(KILL rax, KILL rcx, KILL rdx, KILL cr); |
10186 format %{ "SUB ESP,8\t\t# Fast-path EXP encoding\n\t" | 10102 format %{ "fast_exp $dpr1 -> $dpr1 // KILL $rax, $rcx, $rdx" %} |
10187 "FLDL2E \t\t\t# Ld log2(e) X\n\t" | 10103 ins_encode %{ |
10188 "FMULP \t\t\t# Q=X*log2(e)\n\t" | 10104 __ fast_exp(); |
10189 | 10105 %} |
10190 "FDUP \t\t\t# Q Q\n\t" | 10106 ins_pipe( pipe_slow ); |
10191 "FRNDINT\t\t\t# int(Q) Q\n\t" | 10107 %} |
10192 "FSUB ST(1),ST(0)\t# int(Q) frac(Q)\n\t" | 10108 |
10193 "FISTP dword [ESP]\n\t" | 10109 instruct expD_reg(regD dst, regD src, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{ |
10194 "F2XM1 \t\t\t# 2^frac(Q)-1 int(Q)\n\t" | |
10195 "FLD1 \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t" | |
10196 "FADDP \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead | |
10197 "MOV EAX,[ESP]\t# Pick up int(Q)\n\t" | |
10198 "MOV ECX,0xFFFFF800\t# Overflow mask\n\t" | |
10199 "ADD EAX,1023\t\t# Double exponent bias\n\t" | |
10200 "MOV EBX,EAX\t\t# Preshifted biased expo\n\t" | |
10201 "SHL EAX,20\t\t# Shift exponent into place\n\t" | |
10202 "TEST EBX,ECX\t\t# Check for overflow\n\t" | |
10203 "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t" | |
10204 "MOV [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t" | |
10205 "MOV [ESP+0],0\n\t" | |
10206 "FMUL ST(0),[ESP+0]\t# Scale\n\t" | |
10207 | |
10208 "ADD ESP,8" | |
10209 %} | |
10210 ins_encode( push_stack_temp_qword, | |
10211 Opcode(0xD9), Opcode(0xEA), // fldl2e | |
10212 Opcode(0xDE), Opcode(0xC9), // fmulp | |
10213 pow_exp_core_encoding, | |
10214 pop_stack_temp_qword); | |
10215 ins_pipe( pipe_slow ); | |
10216 %} | |
10217 | |
10218 instruct expD_reg(regD dst, regD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{ | |
10219 predicate (UseSSE>=2); | 10110 predicate (UseSSE>=2); |
10220 match(Set dst (ExpD src)); | 10111 match(Set dst (ExpD src)); |
10221 effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx); | 10112 effect(KILL rax, KILL rcx, KILL rdx, KILL cr); |
10222 format %{ "SUB ESP,8\t\t# Fast-path EXP encoding\n\t" | 10113 format %{ "fast_exp $src -> $dst // KILL $rax, $rcx, $rdx" %} |
10223 "MOVSD [ESP],$src\n\t" | 10114 ins_encode %{ |
10224 "FLDL2E \t\t\t# Ld log2(e) X\n\t" | 10115 __ subptr(rsp, 8); |
10225 "FMULP \t\t\t# Q=X*log2(e)\n\t" | 10116 __ movdbl(Address(rsp, 0), $src$$XMMRegister); |
10226 | 10117 __ fld_d(Address(rsp, 0)); |
10227 "FDUP \t\t\t# Q Q\n\t" | 10118 __ fast_exp(); |
10228 "FRNDINT\t\t\t# int(Q) Q\n\t" | 10119 __ fstp_d(Address(rsp, 0)); |
10229 "FSUB ST(1),ST(0)\t# int(Q) frac(Q)\n\t" | 10120 __ movdbl($dst$$XMMRegister, Address(rsp, 0)); |
10230 "FISTP dword [ESP]\n\t" | 10121 __ addptr(rsp, 8); |
10231 "F2XM1 \t\t\t# 2^frac(Q)-1 int(Q)\n\t" | 10122 %} |
10232 "FLD1 \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t" | 10123 ins_pipe( pipe_slow ); |
10233 "FADDP \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead | 10124 %} |
10234 "MOV EAX,[ESP]\t# Pick up int(Q)\n\t" | |
10235 "MOV ECX,0xFFFFF800\t# Overflow mask\n\t" | |
10236 "ADD EAX,1023\t\t# Double exponent bias\n\t" | |
10237 "MOV EBX,EAX\t\t# Preshifted biased expo\n\t" | |
10238 "SHL EAX,20\t\t# Shift exponent into place\n\t" | |
10239 "TEST EBX,ECX\t\t# Check for overflow\n\t" | |
10240 "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t" | |
10241 "MOV [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t" | |
10242 "MOV [ESP+0],0\n\t" | |
10243 "FMUL ST(0),[ESP+0]\t# Scale\n\t" | |
10244 | |
10245 "FST_D [ESP]\n\t" | |
10246 "MOVSD $dst,[ESP]\n\t" | |
10247 "ADD ESP,8" | |
10248 %} | |
10249 ins_encode( Push_SrcD(src), | |
10250 Opcode(0xD9), Opcode(0xEA), // fldl2e | |
10251 Opcode(0xDE), Opcode(0xC9), // fmulp | |
10252 pow_exp_core_encoding, | |
10253 Push_ResultD(dst) ); | |
10254 ins_pipe( pipe_slow ); | |
10255 %} | |
10256 | |
10257 | |
10258 | 10125 |
10259 instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{ | 10126 instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{ |
10260 predicate (UseSSE<=1); | 10127 predicate (UseSSE<=1); |
10261 // The source Double operand on FPU stack | 10128 // The source Double operand on FPU stack |
10262 match(Set dst (Log10D src)); | 10129 match(Set dst (Log10D src)); |