comparison src/cpu/x86/vm/macroAssembler_x86.cpp @ 11080:b800986664f4

7088419: Use x86 Hardware CRC32 Instruction with java.util.zip.CRC32 Summary: add intrinsics using new instruction to interpreter, C1, C2, for suitable x86; add test Reviewed-by: kvn, twisti
author drchase
date Tue, 02 Jul 2013 20:42:12 -0400
parents e961c11b85fe
children 6b0fd0964b87 740e263c80c6
comparison
equal deleted inserted replaced
11079:738e04fb1232 11080:b800986664f4
1 /* 1 /*
2 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved. 2 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 * 4 *
5 * This code is free software; you can redistribute it and/or modify it 5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as 6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
2789 if (reachable(src)) { 2789 if (reachable(src)) {
2790 Assembler::movdqu(dst, as_Address(src)); 2790 Assembler::movdqu(dst, as_Address(src));
2791 } else { 2791 } else {
2792 lea(rscratch1, src); 2792 lea(rscratch1, src);
2793 Assembler::movdqu(dst, Address(rscratch1, 0)); 2793 Assembler::movdqu(dst, Address(rscratch1, 0));
2794 }
2795 }
2796
// Emits Assembler::movdqa loading dst from the literal address src.
// If reachable(src) holds, src is used directly as the memory operand;
// otherwise its address is first materialized in rscratch1 (which is
// therefore overwritten) and the load goes through that register.
// NOTE(review): movdqa is presumably the 16-byte-aligned 128-bit move, so src
// would need to be 16-byte aligned — confirm against Assembler::movdqa.
2797 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2798 if (reachable(src)) {
2799 Assembler::movdqa(dst, as_Address(src)); // direct form
2800 } else {
2801 lea(rscratch1, src); // rscratch1 = address of src
2802 Assembler::movdqa(dst, Address(rscratch1, 0)); // indirect form through rscratch1
2794 } 2803 }
2795 } 2804 }
2796 2805
2797 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) { 2806 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2798 if (reachable(src)) { 2807 if (reachable(src)) {
6386 bind(L_copy_1_char_exit); 6395 bind(L_copy_1_char_exit);
6387 addptr(result, len); // len is negative count of not processed elements 6396 addptr(result, len); // len is negative count of not processed elements
6388 bind(L_done); 6397 bind(L_done);
6389 } 6398 }
6390 6399
6400 /**
6401 * Emits code to update CRC-32 with a byte value according to constants in table
6402 *
6403 * @param [in,out]crc Register containing the crc.
6404 * @param [in]val Register containing the byte to fold into the CRC.
*                 Only its low 8 bits matter (the rest is masked off).
*                 Overwritten: afterwards it holds the table index
*                 (val ^ crc) & 0xFF, not the table entry.
6405 * @param [in]table Register containing the table of crc constants.
*                 Indexed with a scale of 4, i.e. as 4-byte entries.
6406 *
* Register-level effect of the emitted sequence:
6407 * uint32_t crc;
6408 * val = (val ^ crc) & 0xFF;
6409 * crc = crc_table[val] ^ (crc >> 8);
6410 *
6411 */
6412 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
6413 xorl(val, crc); // val ^= crc
6414 andl(val, 0xFF); // keep the low byte only: this is the table index
6415 shrl(crc, 8); // unsigned shift
6416 xorl(crc, Address(table, val, Address::times_4, 0)); // crc ^= table[val] (4-byte entries)
6417 }
6418
6419 /**
6420 * Fold 128-bit data chunk
*
* Emits code that advances the 128-bit accumulator xcrc over the next
* 16 bytes of input, read from memory at buf + offset:
*
*   xtmp = vpclmulhdq(xK, xcrc)
*   xcrc = vpclmulldq(xK, xcrc) ^ mem128[buf + offset] ^ xtmp
*
* NOTE(review): vpclmulhdq / vpclmulldq are presumably the carry-less
* multiplies of the high and low 64-bit halves respectively (as the inline
* bit-range notes suggest) — confirm against their definitions.
*
* @param xcrc   [in,out] running 128-bit accumulator
* @param xK     folding constants; only read by this sequence
* @param xtmp   scratch XMM register; overwritten
* @param buf    register holding the input base address
* @param offset byte displacement added to buf
6421 */
6422 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
6423 vpclmulhdq(xtmp, xK, xcrc); // [123:64] (NOTE(review): probably meant [127:64] — confirm)
6424 vpclmulldq(xcrc, xK, xcrc); // [63:0]
6425 vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */); // xcrc ^= 16 input bytes at buf + offset
6426 pxor(xcrc, xtmp); // merge in the other partial product
6427 }
6428
// Register-operand variant of the 128-bit fold: same sequence as the
// (buf, offset) overload, except that the 128 bits being merged in come from
// the XMM register xbuf instead of memory:
//
//   xtmp = vpclmulhdq(xK, xcrc)
//   xcrc = vpclmulldq(xK, xcrc) ^ xbuf ^ xtmp
//
// xcrc is updated in place, xtmp is overwritten, xK and xbuf are only read.
6429 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
6430 vpclmulhdq(xtmp, xK, xcrc); // first partial product, kept in xtmp
6431 vpclmulldq(xcrc, xK, xcrc); // second partial product, replaces xcrc
6432 pxor(xcrc, xbuf); // xcrc ^= xbuf
6433 pxor(xcrc, xtmp); // xcrc ^= first partial product
6434 }
6435
6436 /**
6437 * 8-bit folds to compute 32-bit CRC
6438 *
* Emits one table-driven fold step on a value held in an XMM register:
* the low byte of xcrc selects a 4-byte table entry, xcrc is shifted right
* by one byte, and the entry is XORed into the shifted value.
*
6439 * uint64_t xcrc;
6440 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
*
* @param xcrc  [in,out] value being folded
* @param table register holding the base address of the lookup table
*              (indexed with a scale of 4)
* @param xtmp  scratch XMM register; overwritten with the table entry
* @param tmp   scratch general register; overwritten with the table index
6441 */
6442 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
6443 movdl(tmp, xcrc); // copy the low 32 bits of xcrc into tmp
6444 andl(tmp, 0xFF); // tmp = low byte = table index
6445 movdl(xtmp, Address(table, tmp, Address::times_4, 0)); // xtmp = table[tmp]
6446 psrldq(xcrc, 1); // unsigned shift one byte
6447 pxor(xcrc, xtmp); // xcrc = (xcrc >> 8) ^ table entry
6448 }
6449
6450 /**
* General-register variant of the 8-bit fold step: the low byte of crc
* selects a 4-byte table entry, which is XORed into crc shifted right by 8.
*
6451 * uint32_t crc;
6452 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
*
* @param crc   [in,out] 32-bit value being folded
* @param table register holding the base address of the lookup table
*              (indexed with a scale of 4)
* @param tmp   scratch register; overwritten with the table index
6453 */
6454 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
6455 movl(tmp, crc); // tmp = crc
6456 andl(tmp, 0xFF); // tmp = low byte = table index
6457 shrl(crc, 8); // crc >>= 8
6458 xorl(crc, Address(table, tmp, Address::times_4, 0)); // crc ^= table[tmp]
6459 }
6460
6461 /**
* Emits the CRC-32 kernel. The generated code inverts crc, then processes
* the buffer in up to three phases, then inverts crc again:
*   1. byte-at-a-time (table lookup) until buf is 16-byte aligned;
*   2. 16-byte chunks folded in XMM registers, four chunks per iteration
*      while enough remain, then one chunk per iteration, then reduced to
*      32 bits in crc;
*   3. byte-at-a-time for the remaining (< 16) bytes.
* Inputs shorter than 16 bytes skip straight to phase 3.
*
* On exit crc holds the result; buf, len, table, tmp, rax and xmm0-xmm5
* have been overwritten by the emitted code.
*
6462 * @param crc register containing existing CRC (32-bit)
6463 * @param buf register pointing to input byte buffer (byte*)
6464 * @param len register containing number of bytes
6465 * @param table register that will contain address of CRC table
6466 * @param tmp scratch register
6467 */
6468 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
6469 assert_different_registers(crc, buf, len, table, tmp, rax); // rax is used as an extra scratch register below
6470
6471 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
6472 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
6473
6474 lea(table, ExternalAddress(StubRoutines::crc_table_addr())); // table = address of the byte-wise lookup table
6475 notl(crc); // ~crc
6476 cmpl(len, 16);
6477 jcc(Assembler::less, L_tail); // fewer than 16 bytes: byte loop only
6478
6479 // Align buffer to 16 bytes
6480 movl(tmp, buf); // only the low 4 address bits are used
6481 andl(tmp, 0xF);
6482 jccb(Assembler::zero, L_aligned); // already aligned
6483 subl(tmp, 16); // tmp = -(number of bytes up to the next 16-byte boundary)
6484 addl(len, tmp); // those bytes are consumed by the loop below
6485
6486 align(4);
6487 BIND(L_align_loop);
6488 movsbl(rax, Address(buf, 0)); // load byte with sign extension
6489 update_byte_crc32(crc, rax, table); // upper bits of rax are masked off inside
6490 increment(buf);
6491 incrementl(tmp); // counts up from negative towards zero
6492 jccb(Assembler::less, L_align_loop);
6493
6494 BIND(L_aligned);
6495 movl(tmp, len); // save
6496 shrl(len, 4); // len = number of whole 16-byte chunks
6497 jcc(Assembler::zero, L_tail_restore); // no whole chunk left
6498
6499 // Fold crc into first bytes of vector
6500 movdqa(xmm1, Address(buf, 0)); // first chunk
6501 movdl(rax, xmm1);
6502 xorl(crc, rax); // crc ^= first 4 bytes of the chunk
6503 pinsrd(xmm1, crc, 0); // write the result back as dword 0 of xmm1
6504 addptr(buf, 16);
6505 subl(len, 4); // len > 0 before this; afterwards len = chunks - 4
6506 jcc(Assembler::less, L_fold_tail); // fewer than 4 chunks in total: single-stream folding only
6507
6508 movdqa(xmm2, Address(buf, 0)); // chunks 2..4 seed the other three streams
6509 movdqa(xmm3, Address(buf, 16));
6510 movdqa(xmm4, Address(buf, 32));
6511 addptr(buf, 48);
6512 subl(len, 3); // len = (chunks not yet loaded) - 3
6513 jcc(Assembler::lessEqual, L_fold_512b); // fewer than 4 chunks left: skip the 4-wide loop
6514
6515 // Fold total 512 bits of polynomial on each iteration,
6516 // 128 bits per each of 4 parallel streams.
6517 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32)); // constants at byte offset 32 (presumably the 512-bit-distance fold constants — confirm)
6518
6519 align(32);
6520 BIND(L_fold_512b_loop);
6521 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); // xmm5 is the shared scratch register
6522 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
6523 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
6524 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
6525 addptr(buf, 64);
6526 subl(len, 4); // four chunks consumed per iteration
6527 jcc(Assembler::greater, L_fold_512b_loop);
6528
6529 // Fold 512 bits to 128 bits.
6530 BIND(L_fold_512b);
6531 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); // constants at byte offset 16 (presumably the 128-bit-distance fold constants — confirm)
6532 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2); // merge streams 2..4 into xmm1, one at a time
6533 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
6534 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
6535
6536 // Fold the rest of 128 bits data chunks
6537 BIND(L_fold_tail);
6538 addl(len, 3); // undo the bias: len = chunks still unread (0..3)
6539 jccb(Assembler::lessEqual, L_fold_128b);
6540 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); // reloaded because this label is also reached without passing L_fold_512b
6541
6542 BIND(L_fold_tail_loop);
6543 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
6544 addptr(buf, 16);
6545 decrementl(len); // one chunk per iteration
6546 jccb(Assembler::greater, L_fold_tail_loop);
6547
6548 // Fold 128 bits in xmm1 down into 32 bits in crc register.
6549 BIND(L_fold_128b);
6550 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr())); // constants at byte offset 0; used both as multiplier and as AND mask below
6551 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
6552 vpand(xmm3, xmm0, xmm2, false /* vector256 */);
6553 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
6554 psrldq(xmm1, 8); // byte shifts: xmm1 >>= 8 bytes
6555 psrldq(xmm2, 4); // xmm2 >>= 4 bytes
6556 pxor(xmm0, xmm1);
6557 pxor(xmm0, xmm2); // xmm0 = combined value to be reduced by the table folds below
6558
6559 // 8 8-bit folds to compute 32-bit CRC (4 on the XMM value, then 4 on the general register).
6560 for (int j = 0; j < 4; j++) { // unrolled at code-generation time: emits 4 copies
6561 fold_8bit_crc32(xmm0, table, xmm1, rax);
6562 }
6563 movdl(crc, xmm0); // mov 32 bits to general register
6564 for (int j = 0; j < 4; j++) { // emits 4 more copies, now on crc
6565 fold_8bit_crc32(crc, table, rax);
6566 }
6567
6568 BIND(L_tail_restore);
6569 movl(len, tmp); // restore the byte count saved at L_aligned
6570 BIND(L_tail);
6571 andl(len, 0xf); // bytes left over after the 16-byte chunks
6572 jccb(Assembler::zero, L_exit);
6573
6574 // Fold the rest of bytes
6575 align(4);
6576 BIND(L_tail_loop);
6577 movsbl(rax, Address(buf, 0)); // load byte with sign extension
6578 update_byte_crc32(crc, rax, table);
6579 increment(buf);
6580 decrementl(len);
6581 jccb(Assembler::greater, L_tail_loop);
6582
6583 BIND(L_exit);
6584 notl(crc); // ~crc: final inversion, matching the one on entry
6585 }
6586
6391 #undef BIND 6587 #undef BIND
6392 #undef BLOCK_COMMENT 6588 #undef BLOCK_COMMENT
6393 6589
6394 6590
6395 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { 6591 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {