Mercurial > hg > graal-compiler
comparison src/cpu/x86/vm/macroAssembler_x86.cpp @ 11080:b800986664f4
7088419: Use x86 Hardware CRC32 Instruction with java.util.zip.CRC32
Summary: add intrinsics using new instruction to interpreter, C1, C2, for suitable x86; add test
Reviewed-by: kvn, twisti
author | drchase |
---|---|
date | Tue, 02 Jul 2013 20:42:12 -0400 |
parents | e961c11b85fe |
children | 6b0fd0964b87 740e263c80c6 |
comparison
equal
deleted
inserted
replaced
11079:738e04fb1232 | 11080:b800986664f4 |
---|---|
1 /* | 1 /* |
2 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved. | 2 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. |
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 * | 4 * |
5 * This code is free software; you can redistribute it and/or modify it | 5 * This code is free software; you can redistribute it and/or modify it |
6 * under the terms of the GNU General Public License version 2 only, as | 6 * under the terms of the GNU General Public License version 2 only, as |
7 * published by the Free Software Foundation. | 7 * published by the Free Software Foundation. |
2789 if (reachable(src)) { | 2789 if (reachable(src)) { |
2790 Assembler::movdqu(dst, as_Address(src)); | 2790 Assembler::movdqu(dst, as_Address(src)); |
2791 } else { | 2791 } else { |
2792 lea(rscratch1, src); | 2792 lea(rscratch1, src); |
2793 Assembler::movdqu(dst, Address(rscratch1, 0)); | 2793 Assembler::movdqu(dst, Address(rscratch1, 0)); |
2794 } | |
2795 } | |
2796 | |
2797 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) { /* emits a movdqa load of the literal at src into dst */ | |
2798 if (reachable(src)) { /* src can be used directly as an Address operand */ | |
2799 Assembler::movdqa(dst, as_Address(src)); /* NOTE(review): presumably src must be 16-byte aligned for movdqa - confirm */ | |
2800 } else { /* otherwise the address is first materialized in rscratch1, which is overwritten */ | |
2801 lea(rscratch1, src); | |
2802 Assembler::movdqa(dst, Address(rscratch1, 0)); | |
2794 } | 2803 } |
2795 } | 2804 } |
2796 | 2805 |
2797 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) { | 2806 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) { |
2798 if (reachable(src)) { | 2807 if (reachable(src)) { |
6386 bind(L_copy_1_char_exit); | 6395 bind(L_copy_1_char_exit); |
6387 addptr(result, len); // len is negative count of not processed elements | 6396 addptr(result, len); // len is negative count of not processed elements |
6388 bind(L_done); | 6397 bind(L_done); |
6389 } | 6398 } |
6390 | 6399 |
6400 /** | |
6401 * Emits code to update CRC-32 with a byte value according to constants in table. | |
6402 * | |
6403 * @param [in,out]crc Register containing the crc; overwritten with the updated crc. | |
6404 * @param [in]val Register containing the byte to fold into the CRC; clobbered (left holding the table index). | |
6405 * @param [in]table Register containing the address of the table of crc constants (indexed as 4-byte entries). | |
6406 * | |
6407 * uint32_t crc; | |
6408 * val = (val ^ crc) & 0xFF; | |
6409 * crc = crc_table[val] ^ (crc >> 8); | |
6410 * | |
6411 */ | |
6412 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { | |
6413 xorl(val, crc); /* val ^= crc */ | |
6414 andl(val, 0xFF); /* keep only the low byte: this is the table index, so upper bits of val on entry do not matter */ | |
6415 shrl(crc, 8); // unsigned shift: crc >>= 8 | |
6416 xorl(crc, Address(table, val, Address::times_4, 0)); /* crc ^= 32-bit entry at table + val * 4 */ | |
6417 } | |
6418 | |
6419 /** | |
6420 * Fold 128-bit data chunk: emits code that multiplies both halves of xcrc by the constants in xK and XORs the products with the 16 bytes at buf + offset. Result is left in xcrc; xtmp is clobbered; xK and buf are not modified. | |
6421 */ | |
6422 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { | |
6423 vpclmulhdq(xtmp, xK, xcrc); // [127:64] -- presumably the carry-less product of the high quadwords; confirm against the vpclmulhdq helper | |
6424 vpclmulldq(xcrc, xK, xcrc); // [63:0] -- presumably the carry-less product of the low quadwords; overwrites xcrc | |
6425 vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */); /* XOR in the next chunk of input straight from memory */ | |
6426 pxor(xcrc, xtmp); /* combine with the high-half product */ | |
6427 } | |
6428 | |
6429 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) { /* same fold as the (buf, offset) overload, but the 128-bit chunk to fold in is already in register xbuf */ | |
6430 vpclmulhdq(xtmp, xK, xcrc); /* high-quadword product goes to xtmp (clobbered) */ | |
6431 vpclmulldq(xcrc, xK, xcrc); /* low-quadword product overwrites xcrc */ | |
6432 pxor(xcrc, xbuf); /* XOR in the chunk held in xbuf; xbuf itself is not modified */ | |
6433 pxor(xcrc, xtmp); /* combine with the high-half product; result is left in xcrc */ | |
6434 } | |
6435 | |
6436 /** | |
6437 * Emits one 8-bit fold step on a value held in an XMM register; repeated 8-bit folds are used to compute the 32-bit CRC. | |
6438 * xcrc is updated in place; table is the address of a table of 4-byte entries (timesXtoThe32 below); xtmp and tmp are clobbered. | |
6439 * uint64_t xcrc; | |
6440 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8); | |
6441 */ | |
6442 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) { | |
6443 movdl(tmp, xcrc); /* copy the low 32 bits of xcrc to the general register */ | |
6444 andl(tmp, 0xFF); /* low byte is the table index */ | |
6445 movdl(xtmp, Address(table, tmp, Address::times_4, 0)); /* xtmp = 32-bit entry at table + tmp * 4 */ | |
6446 psrldq(xcrc, 1); // unsigned shift one byte (right shift of the whole register) | |
6447 pxor(xcrc, xtmp); | |
6448 } | |
6449 | |
6450 /** Emits one 8-bit fold step on a value held in a general register: crc is updated in place, table is the address of a table of 4-byte entries (timesXtoThe32 below), tmp is clobbered. | |
6451 * uint32_t crc; | |
6452 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); | |
6453 */ | |
6454 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { | |
6455 movl(tmp, crc); /* work on a copy so crc can be shifted independently */ | |
6456 andl(tmp, 0xFF); /* low byte is the table index */ | |
6457 shrl(crc, 8); /* unsigned shift: crc >>= 8 */ | |
6458 xorl(crc, Address(table, tmp, Address::times_4, 0)); /* crc ^= 32-bit entry at table + tmp * 4 */ | |
6459 } | |
6460 | |
6461 /** Emits code that folds the bytes of a buffer into a CRC-32 value: byte-wise table lookups for the unaligned head and the tail, 128-bit folds for the 16-byte-aligned middle. | |
6462 * @param crc register containing existing CRC (32-bit); complemented on entry and again on exit, and holds the result | |
6463 * @param buf register pointing to input byte buffer (byte*); advanced as bytes are consumed | |
6464 * @param len register containing number of bytes; clobbered | |
6465 * @param table register that will contain address of CRC table (loaded here from StubRoutines::crc_table_addr()) | |
6466 * @param tmp scratch register; rax and xmm0-xmm5 are used as scratch as well | |
6467 */ | |
6468 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) { | |
6469 assert_different_registers(crc, buf, len, table, tmp, rax); /* rax is used below as an implicit scratch register */ | |
6470 | |
6471 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; | |
6472 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; | |
6473 | |
6474 lea(table, ExternalAddress(StubRoutines::crc_table_addr())); | |
6475 notl(crc); // ~crc | |
6476 cmpl(len, 16); | |
6477 jcc(Assembler::less, L_tail); /* fewer than 16 bytes: only the byte-at-a-time tail loop runs */ | |
6478 | |
6479 // Align buffer to 16 bytes (the vector loads below use movdqa on buf) | |
6480 movl(tmp, buf); /* only the low 4 bits of the address are needed */ | |
6481 andl(tmp, 0xF); | |
6482 jccb(Assembler::zero, L_aligned); | |
6483 subl(tmp, 16); /* tmp = -(number of bytes up to the next 16-byte boundary) */ | |
6484 addl(len, tmp); /* those bytes are consumed by the alignment loop, so take them off len now */ | |
6485 | |
6486 align(4); | |
6487 BIND(L_align_loop); | |
6488 movsbl(rax, Address(buf, 0)); // load byte with sign extension (update_byte_crc32 masks the index to 8 bits) | |
6489 update_byte_crc32(crc, rax, table); | |
6490 increment(buf); | |
6491 incrementl(tmp); /* counts up towards zero */ | |
6492 jccb(Assembler::less, L_align_loop); | |
6493 | |
6494 BIND(L_aligned); | |
6495 movl(tmp, len); // save byte count; restored at L_tail_restore to compute the tail length | |
6496 shrl(len, 4); /* len = number of whole 16-byte chunks */ | |
6497 jcc(Assembler::zero, L_tail_restore); | |
6498 | |
6499 // Fold crc into first bytes of vector: xmm1 = first chunk with its low 32 bits XORed with crc | |
6500 movdqa(xmm1, Address(buf, 0)); | |
6501 movdl(rax, xmm1); | |
6502 xorl(crc, rax); | |
6503 pinsrd(xmm1, crc, 0); | |
6504 addptr(buf, 16); | |
6505 subl(len, 4); // len > 0 before this; goes negative when fewer than 4 chunks are available | |
6506 jcc(Assembler::less, L_fold_tail); /* not enough chunks for 4 parallel streams */ | |
6507 | |
6508 movdqa(xmm2, Address(buf, 0)); /* xmm1..xmm4 now hold the first four chunks */ | |
6509 movdqa(xmm3, Address(buf, 16)); | |
6510 movdqa(xmm4, Address(buf, 32)); | |
6511 addptr(buf, 48); | |
6512 subl(len, 3); /* bias by 3 so the loop below only runs while 4 more chunks remain; added back at L_fold_tail */ | |
6513 jcc(Assembler::lessEqual, L_fold_512b); | |
6514 | |
6515 // Fold total 512 bits of polynomial on each iteration, | |
6516 // 128 bits per each of 4 parallel streams. | |
6517 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32)); /* fold constants for the 4-stream loop */ | |
6518 | |
6519 align(32); | |
6520 BIND(L_fold_512b_loop); | |
6521 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); | |
6522 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16); | |
6523 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32); | |
6524 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48); | |
6525 addptr(buf, 64); | |
6526 subl(len, 4); /* four chunks consumed per iteration */ | |
6527 jcc(Assembler::greater, L_fold_512b_loop); | |
6528 | |
6529 // Fold 512 bits to 128 bits (xmm2, xmm3 and xmm4 are folded into xmm1). | |
6530 BIND(L_fold_512b); | |
6531 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); | |
6532 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2); | |
6533 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3); | |
6534 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4); | |
6535 | |
6536 // Fold the rest of 128 bits data chunks, one chunk per iteration, into xmm1 | |
6537 BIND(L_fold_tail); | |
6538 addl(len, 3); /* len = number of 16-byte chunks still to fold */ | |
6539 jccb(Assembler::lessEqual, L_fold_128b); | |
6540 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); | |
6541 | |
6542 BIND(L_fold_tail_loop); | |
6543 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); | |
6544 addptr(buf, 16); | |
6545 decrementl(len); | |
6546 jccb(Assembler::greater, L_fold_tail_loop); | |
6547 | |
6548 // Fold 128 bits in xmm1 down into 32 bits in crc register. | |
6549 BIND(L_fold_128b); | |
6550 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr())); | |
6551 vpclmulqdq(xmm2, xmm0, xmm1, 0x1); | |
6552 vpand(xmm3, xmm0, xmm2, false /* vector256 */); | |
6553 vpclmulqdq(xmm0, xmm0, xmm3, 0x1); | |
6554 psrldq(xmm1, 8); | |
6555 psrldq(xmm2, 4); | |
6556 pxor(xmm0, xmm1); | |
6557 pxor(xmm0, xmm2); | |
6558 | |
6559 // 8 8-bit folds to compute 32-bit CRC: 4 on xmm0, then 4 on the general register. | |
6560 for (int j = 0; j < 4; j++) { | |
6561 fold_8bit_crc32(xmm0, table, xmm1, rax); | |
6562 } | |
6563 movdl(crc, xmm0); // mov 32 bits to general register | |
6564 for (int j = 0; j < 4; j++) { | |
6565 fold_8bit_crc32(crc, table, rax); | |
6566 } | |
6567 | |
6568 BIND(L_tail_restore); | |
6569 movl(len, tmp); // restore the byte count saved at L_aligned | |
6570 BIND(L_tail); | |
6571 andl(len, 0xf); /* bytes left over after the 16-byte chunks */ | |
6572 jccb(Assembler::zero, L_exit); | |
6573 | |
6574 // Fold the rest of bytes, one table lookup per byte | |
6575 align(4); | |
6576 BIND(L_tail_loop); | |
6577 movsbl(rax, Address(buf, 0)); // load byte with sign extension (update_byte_crc32 masks the index to 8 bits) | |
6578 update_byte_crc32(crc, rax, table); | |
6579 increment(buf); | |
6580 decrementl(len); | |
6581 jccb(Assembler::greater, L_tail_loop); | |
6582 | |
6583 BIND(L_exit); | |
6584 notl(crc); // ~crc | |
6585 } | |
6586 | |
6391 #undef BIND | 6587 #undef BIND |
6392 #undef BLOCK_COMMENT | 6588 #undef BLOCK_COMMENT |
6393 | 6589 |
6394 | 6590 |
6395 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { | 6591 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { |