comparison src/cpu/x86/vm/macroAssembler_x86.cpp @ 20804:7848fc12602b

Merge with jdk8u40-b25
author Gilles Duboscq <gilles.m.duboscq@oracle.com>
date Tue, 07 Apr 2015 14:58:49 +0200
parents 52b4284cb496 ab72a2f48aef
children be896a1983c0
comparing 20184:84105dcdb05b (old, -) with 20804:7848fc12602b (new, +)
1767 // order to reduce the number of conditional branches in the most common cases.
1768 // Beware -- there's a subtle invariant that fetch of the markword
1769 // at [FETCH], below, will never observe a biased encoding (*101b).
1770 // If this invariant is not held we risk exclusion (safety) failure.
1771 if (UseBiasedLocking && !UseOptoBiasInlining) {
1772 -   biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
1772 +   biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1773 }
1774
1775 #if INCLUDE_RTM_OPT
1776 if (UseRTMForStackLocks && use_rtm) {
1777   rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
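For readers unfamiliar with the encoding the comment refers to, here is a minimal C++ sketch of the low mark-word bits that mark a biased header, i.e. the *101b pattern the fast path must never see at [FETCH]. The values match HotSpot's conventional layout (markOop.hpp); the names and helper below are illustrative only, not HotSpot code.

#include <cstdint>

// Sketch only: low three bits of the object header (mark word) on x86_64.
constexpr uintptr_t lock_mask_in_place  = 0x7; // low three bits
constexpr uintptr_t unlocked_value      = 0x1; // neutral, unlocked (*001b)
constexpr uintptr_t biased_lock_pattern = 0x5; // biased            (*101b)

inline bool mark_is_biased(uintptr_t mark) {
  // biased_locking_enter() (or disabling biased locking) guarantees the
  // fetch at [FETCH] never observes this pattern.
  return (mark & lock_mask_in_place) == biased_lock_pattern;
}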
7291   bind(L_copy_1_char_exit);
7292   addptr(result, len); // len is negative count of not processed elements
7293   bind(L_done);
7294 }
7295
7296 #ifdef _LP64
7297 /**
7298 * Helper for multiply_to_len().
7299 */
7300 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
7301 addq(dest_lo, src1);
7302 adcq(dest_hi, 0);
7303 addq(dest_lo, src2);
7304 adcq(dest_hi, 0);
7305 }
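As a plain C++ sketch of what add2_with_carry() computes (the helper name below is hypothetical, and a GCC/Clang-style unsigned __int128 is assumed): the 128-bit pair dest_hi:dest_lo accumulates both 64-bit addends, each addq/adcq pair propagating the carry into the high word.

#include <cstdint>

// Sketch, not HotSpot code: 128-bit accumulation of two 64-bit addends.
static inline void add2_with_carry_ref(uint64_t& dest_hi, uint64_t& dest_lo,
                                       uint64_t src1, uint64_t src2) {
  unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
  acc += src1;                      // addq(dest_lo, src1); adcq(dest_hi, 0);
  acc += src2;                      // addq(dest_lo, src2); adcq(dest_hi, 0);
  dest_lo = (uint64_t)acc;
  dest_hi = (uint64_t)(acc >> 64);
}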
7306
7307 /**
7308 * Multiply 64 bit by 64 bit first loop.
7309 */
7310 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
7311 Register y, Register y_idx, Register z,
7312 Register carry, Register product,
7313 Register idx, Register kdx) {
7314 //
7315 // jlong carry, x[], y[], z[];
7316 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7317 // huge_128 product = y[idx] * x[xstart] + carry;
7318 // z[kdx] = (jlong)product;
7319 // carry = (jlong)(product >>> 64);
7320 // }
7321 // z[xstart] = carry;
7322 //
7323
7324 Label L_first_loop, L_first_loop_exit;
7325 Label L_one_x, L_one_y, L_multiply;
7326
7327 decrementl(xstart);
7328 jcc(Assembler::negative, L_one_x);
7329
7330 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
7331 rorq(x_xstart, 32); // convert big-endian to little-endian
7332
7333 bind(L_first_loop);
7334 decrementl(idx);
7335 jcc(Assembler::negative, L_first_loop_exit);
7336 decrementl(idx);
7337 jcc(Assembler::negative, L_one_y);
7338 movq(y_idx, Address(y, idx, Address::times_4, 0));
7339 rorq(y_idx, 32); // convert big-endian to little-endian
7340 bind(L_multiply);
7341 movq(product, x_xstart);
7342 mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7343 addq(product, carry);
7344 adcq(rdx, 0);
7345 subl(kdx, 2);
7346 movl(Address(z, kdx, Address::times_4, 4), product);
7347 shrq(product, 32);
7348 movl(Address(z, kdx, Address::times_4, 0), product);
7349 movq(carry, rdx);
7350 jmp(L_first_loop);
7351
7352 bind(L_one_y);
7353 movl(y_idx, Address(y, 0));
7354 jmp(L_multiply);
7355
7356 bind(L_one_x);
7357 movl(x_xstart, Address(x, 0));
7358 jmp(L_first_loop);
7359
7360 bind(L_first_loop_exit);
7361 }
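The loop body above is easier to follow against a scalar rendering of the commented pseudocode. A sketch over little-endian 64-bit words (hypothetical helper, unsigned __int128 assumed; the real arrays are big-endian int[] halves, which is why every 8-byte load above is rotated by 32 bits):

#include <cstdint>

// Sketch of the first loop: seed z[xstart .. ystart+1+xstart] with x[xstart] * y.
static uint64_t multiply_64_x_64_loop_ref(const uint64_t* x, int xstart,
                                          const uint64_t* y, int ystart,
                                          uint64_t* z) {
  uint64_t carry = 0;
  for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
    unsigned __int128 product = (unsigned __int128)y[idx] * x[xstart] + carry;
    z[kdx] = (uint64_t)product;           // z[kdx] = (jlong)product
    carry  = (uint64_t)(product >> 64);   // carry = (jlong)(product >>> 64)
  }
  return carry;                           // caller stores this into z[xstart]
}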
7362
7363 /**
7364 * Multiply 64 bit by 64 bit and add 128 bit.
7365 */
7366 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7367 Register yz_idx, Register idx,
7368 Register carry, Register product, int offset) {
7369 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7370 // z[kdx] = (jlong)product;
7371
7372 movq(yz_idx, Address(y, idx, Address::times_4, offset));
7373 rorq(yz_idx, 32); // convert big-endian to little-endian
7374 movq(product, x_xstart);
7375 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7376 movq(yz_idx, Address(z, idx, Address::times_4, offset));
7377 rorq(yz_idx, 32); // convert big-endian to little-endian
7378
7379 add2_with_carry(rdx, product, carry, yz_idx);
7380
7381 movl(Address(z, idx, Address::times_4, offset+4), product);
7382 shrq(product, 32);
7383 movl(Address(z, idx, Address::times_4, offset), product);
7384
7385 }
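Semantically, multiply_add_128_x_128() performs one multiply-accumulate step of the inner loop. A scalar sketch (hypothetical helper name, unsigned __int128 assumed):

#include <cstdint>

// Sketch: product = y_word * x_word + z_word + carry; the low 64 bits go
// back into z, the high 64 bits (rdx after add2_with_carry) are the new carry.
static uint64_t multiply_add_128_x_128_ref(uint64_t x_word, uint64_t y_word,
                                           uint64_t* z_word, uint64_t carry) {
  unsigned __int128 product =
      (unsigned __int128)y_word * x_word + *z_word + carry;
  *z_word = (uint64_t)product;
  return (uint64_t)(product >> 64);
}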
7386
7387 /**
7388 * Multiply 128 bit by 128 bit. Unrolled inner loop.
7389 */
7390 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7391 Register yz_idx, Register idx, Register jdx,
7392 Register carry, Register product,
7393 Register carry2) {
7394 // jlong carry, x[], y[], z[];
7395 // int kdx = ystart+1;
7396 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7397 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7398 // z[kdx+idx+1] = (jlong)product;
7399 // jlong carry2 = (jlong)(product >>> 64);
7400 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7401 // z[kdx+idx] = (jlong)product;
7402 // carry = (jlong)(product >>> 64);
7403 // }
7404 // idx += 2;
7405 // if (idx > 0) {
7406 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7407 // z[kdx+idx] = (jlong)product;
7408 // carry = (jlong)(product >>> 64);
7409 // }
7410 //
7411
7412 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7413
7414 movl(jdx, idx);
7415 andl(jdx, 0xFFFFFFFC);
7416 shrl(jdx, 2);
7417
7418 bind(L_third_loop);
7419 subl(jdx, 1);
7420 jcc(Assembler::negative, L_third_loop_exit);
7421 subl(idx, 4);
7422
7423 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
7424 movq(carry2, rdx);
7425
7426 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
7427 movq(carry, rdx);
7428 jmp(L_third_loop);
7429
7430 bind (L_third_loop_exit);
7431
7432 andl (idx, 0x3);
7433 jcc(Assembler::zero, L_post_third_loop_done);
7434
7435 Label L_check_1;
7436 subl(idx, 2);
7437 jcc(Assembler::negative, L_check_1);
7438
7439 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7440 movq(carry, rdx);
7441
7442 bind (L_check_1);
7443 addl (idx, 0x2);
7444 andl (idx, 0x1);
7445 subl(idx, 1);
7446 jcc(Assembler::negative, L_post_third_loop_done);
7447
7448 movl(yz_idx, Address(y, idx, Address::times_4, 0));
7449 movq(product, x_xstart);
7450 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7451 movl(yz_idx, Address(z, idx, Address::times_4, 0));
7452
7453 add2_with_carry(rdx, product, yz_idx, carry);
7454
7455 movl(Address(z, idx, Address::times_4, 0), product);
7456 shrq(product, 32);
7457
7458 shlq(rdx, 32);
7459 orq(product, rdx);
7460 movq(carry, product);
7461
7462 bind(L_post_third_loop_done);
7463 }
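One unrolled iteration of the loop above therefore chains two of these steps through carry2, as in this sketch (hypothetical helper, unsigned __int128 assumed; indices follow the pseudocode comment, while the generated code additionally handles the odd 32-bit tail word):

#include <cstdint>

// Sketch of one unrolled iteration: two multiply-add steps, the intermediate
// carry2 feeding the second step, exactly as the pseudocode describes.
static uint64_t unrolled_pair_ref(uint64_t x_xstart, const uint64_t* y,
                                  uint64_t* z, int idx, int kdx, uint64_t carry) {
  unsigned __int128 p =
      (unsigned __int128)y[idx + 1] * x_xstart + z[kdx + idx + 1] + carry;
  z[kdx + idx + 1] = (uint64_t)p;
  uint64_t carry2 = (uint64_t)(p >> 64);

  p = (unsigned __int128)y[idx] * x_xstart + z[kdx + idx] + carry2;
  z[kdx + idx] = (uint64_t)p;
  return (uint64_t)(p >> 64);          // carry for the next iteration
}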
7464
7465 /**
7466 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7467 *
7468 */
7469 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7470 Register carry, Register carry2,
7471 Register idx, Register jdx,
7472 Register yz_idx1, Register yz_idx2,
7473 Register tmp, Register tmp3, Register tmp4) {
7474 assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7475
7476 // jlong carry, x[], y[], z[];
7477 // int kdx = ystart+1;
7478 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7479 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7480 // jlong carry2 = (jlong)(tmp3 >>> 64);
7481 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
7482 // carry = (jlong)(tmp4 >>> 64);
7483 // z[kdx+idx+1] = (jlong)tmp3;
7484 // z[kdx+idx] = (jlong)tmp4;
7485 // }
7486 // idx += 2;
7487 // if (idx > 0) {
7488 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7489 // z[kdx+idx] = (jlong)yz_idx1;
7490 // carry = (jlong)(yz_idx1 >>> 64);
7491 // }
7492 //
7493
7494 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7495
7496 movl(jdx, idx);
7497 andl(jdx, 0xFFFFFFFC);
7498 shrl(jdx, 2);
7499
7500 bind(L_third_loop);
7501 subl(jdx, 1);
7502 jcc(Assembler::negative, L_third_loop_exit);
7503 subl(idx, 4);
7504
7505 movq(yz_idx1, Address(y, idx, Address::times_4, 8));
7506 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7507 movq(yz_idx2, Address(y, idx, Address::times_4, 0));
7508 rorxq(yz_idx2, yz_idx2, 32);
7509
7510 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
7511 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
7512
7513 movq(yz_idx1, Address(z, idx, Address::times_4, 8));
7514 rorxq(yz_idx1, yz_idx1, 32);
7515 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
7516 rorxq(yz_idx2, yz_idx2, 32);
7517
7518 if (VM_Version::supports_adx()) {
7519 adcxq(tmp3, carry);
7520 adoxq(tmp3, yz_idx1);
7521
7522 adcxq(tmp4, tmp);
7523 adoxq(tmp4, yz_idx2);
7524
7525 movl(carry, 0); // does not affect flags
7526 adcxq(carry2, carry);
7527 adoxq(carry2, carry);
7528 } else {
7529 add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7530 add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7531 }
7532 movq(carry, carry2);
7533
7534 movl(Address(z, idx, Address::times_4, 12), tmp3);
7535 shrq(tmp3, 32);
7536 movl(Address(z, idx, Address::times_4, 8), tmp3);
7537
7538 movl(Address(z, idx, Address::times_4, 4), tmp4);
7539 shrq(tmp4, 32);
7540 movl(Address(z, idx, Address::times_4, 0), tmp4);
7541
7542 jmp(L_third_loop);
7543
7544 bind (L_third_loop_exit);
7545
7546 andl (idx, 0x3);
7547 jcc(Assembler::zero, L_post_third_loop_done);
7548
7549 Label L_check_1;
7550 subl(idx, 2);
7551 jcc(Assembler::negative, L_check_1);
7552
7553 movq(yz_idx1, Address(y, idx, Address::times_4, 0));
7554 rorxq(yz_idx1, yz_idx1, 32);
7555 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
7556 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
7557 rorxq(yz_idx2, yz_idx2, 32);
7558
7559 add2_with_carry(tmp4, tmp3, carry, yz_idx2);
7560
7561 movl(Address(z, idx, Address::times_4, 4), tmp3);
7562 shrq(tmp3, 32);
7563 movl(Address(z, idx, Address::times_4, 0), tmp3);
7564 movq(carry, tmp4);
7565
7566 bind (L_check_1);
7567 addl (idx, 0x2);
7568 andl (idx, 0x1);
7569 subl(idx, 1);
7570 jcc(Assembler::negative, L_post_third_loop_done);
7571 movl(tmp4, Address(y, idx, Address::times_4, 0));
7572 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
7573 movl(tmp4, Address(z, idx, Address::times_4, 0));
7574
7575 add2_with_carry(carry2, tmp3, tmp4, carry);
7576
7577 movl(Address(z, idx, Address::times_4, 0), tmp3);
7578 shrq(tmp3, 32);
7579
7580 shlq(carry2, 32);
7581 orq(tmp3, carry2);
7582 movq(carry, tmp3);
7583
7584 bind(L_post_third_loop_done);
7585 }
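The BMI2 variant computes the same thing; the differences are mulxq, a 64x64 to 128-bit multiply against the implicit rdx operand that leaves the flags alone, and, on ADX-capable parts, the adcx/adox pair, which run two independent carry chains (CF and OF) so the interleaved additions do not disturb each other. A rough C++ sketch of the instruction semantics (hypothetical helpers, unsigned __int128 assumed; this illustrates the arithmetic, not the exact register allocation above):

#include <cstdint>

// mulx semantics: full 64x64 -> 128-bit multiply, flags untouched.
static inline void mulx64(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
  unsigned __int128 p = (unsigned __int128)a * b;
  *lo = (uint64_t)p;
  *hi = (uint64_t)(p >> 64);
}

// One multiply-accumulate step as the BMI2 loop performs it.
static uint64_t bmi2_step_ref(uint64_t rdx_val /* x[xstart] */, uint64_t y_word,
                              uint64_t z_word, uint64_t carry, uint64_t* z_out) {
  uint64_t hi, lo;
  mulx64(rdx_val, y_word, &hi, &lo);                 // mulxq(hi, lo, y_word)
  unsigned __int128 sum = (unsigned __int128)lo + carry + z_word;
  *z_out = (uint64_t)sum;                            // low 64 bits back to z[]
  return hi + (uint64_t)(sum >> 64);                 // next carry
}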
7586
7587 /**
7588 * Code for BigInteger::multiplyToLen() intrinsic.
7589 *
7590 * rdi: x
7591 * rax: xlen
7592 * rsi: y
7593 * rcx: ylen
7594 * r8: z
7595 * r11: zlen
7596 * r12: tmp1
7597 * r13: tmp2
7598 * r14: tmp3
7599 * r15: tmp4
7600 * rbx: tmp5
7601 *
7602 */
7603 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
7604 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7605 ShortBranchVerifier sbv(this);
7606 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7607
7608 push(tmp1);
7609 push(tmp2);
7610 push(tmp3);
7611 push(tmp4);
7612 push(tmp5);
7613
7614 push(xlen);
7615 push(zlen);
7616
7617 const Register idx = tmp1;
7618 const Register kdx = tmp2;
7619 const Register xstart = tmp3;
7620
7621 const Register y_idx = tmp4;
7622 const Register carry = tmp5;
7623 const Register product = xlen;
7624 const Register x_xstart = zlen; // reuse register
7625
7626 // First Loop.
7627 //
7628 // final static long LONG_MASK = 0xffffffffL;
7629 // int xstart = xlen - 1;
7630 // int ystart = ylen - 1;
7631 // long carry = 0;
7632 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7633 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7634 // z[kdx] = (int)product;
7635 // carry = product >>> 32;
7636 // }
7637 // z[xstart] = (int)carry;
7638 //
7639
7640 movl(idx, ylen); // idx = ylen;
7641 movl(kdx, zlen); // kdx = xlen+ylen;
7642 xorq(carry, carry); // carry = 0;
7643
7644 Label L_done;
7645
7646 movl(xstart, xlen);
7647 decrementl(xstart);
7648 jcc(Assembler::negative, L_done);
7649
7650 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7651
7652 Label L_second_loop;
7653 testl(kdx, kdx);
7654 jcc(Assembler::zero, L_second_loop);
7655
7656 Label L_carry;
7657 subl(kdx, 1);
7658 jcc(Assembler::zero, L_carry);
7659
7660 movl(Address(z, kdx, Address::times_4, 0), carry);
7661 shrq(carry, 32);
7662 subl(kdx, 1);
7663
7664 bind(L_carry);
7665 movl(Address(z, kdx, Address::times_4, 0), carry);
7666
7667 // Second and third (nested) loops.
7668 //
7669 // for (int i = xstart-1; i >= 0; i--) { // Second loop
7670 // carry = 0;
7671 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
7672 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
7673 // (z[k] & LONG_MASK) + carry;
7674 // z[k] = (int)product;
7675 // carry = product >>> 32;
7676 // }
7677 // z[i] = (int)carry;
7678 // }
7679 //
7680 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
7681
7682 const Register jdx = tmp1;
7683
7684 bind(L_second_loop);
7685 xorl(carry, carry); // carry = 0;
7686 movl(jdx, ylen); // j = ystart+1
7687
7688 subl(xstart, 1); // i = xstart-1;
7689 jcc(Assembler::negative, L_done);
7690
7691 push (z);
7692
7693 Label L_last_x;
7694 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
7695 subl(xstart, 1); // i = xstart-1;
7696 jcc(Assembler::negative, L_last_x);
7697
7698 if (UseBMI2Instructions) {
7699 movq(rdx, Address(x, xstart, Address::times_4, 0));
7700 rorxq(rdx, rdx, 32); // convert big-endian to little-endian
7701 } else {
7702 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
7703 rorq(x_xstart, 32); // convert big-endian to little-endian
7704 }
7705
7706 Label L_third_loop_prologue;
7707 bind(L_third_loop_prologue);
7708
7709 push (x);
7710 push (xstart);
7711 push (ylen);
7712
7713
7714 if (UseBMI2Instructions) {
7715 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
7716 } else { // !UseBMI2Instructions
7717 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
7718 }
7719
7720 pop(ylen);
7721 pop(xlen);
7722 pop(x);
7723 pop(z);
7724
7725 movl(tmp3, xlen);
7726 addl(tmp3, 1);
7727 movl(Address(z, tmp3, Address::times_4, 0), carry);
7728 subl(tmp3, 1);
7729 jccb(Assembler::negative, L_done);
7730
7731 shrq(carry, 32);
7732 movl(Address(z, tmp3, Address::times_4, 0), carry);
7733 jmp(L_second_loop);
7734
7735 // Next infrequent code is moved outside loops.
7736 bind(L_last_x);
7737 if (UseBMI2Instructions) {
7738 movl(rdx, Address(x, 0));
7739 } else {
7740 movl(x_xstart, Address(x, 0));
7741 }
7742 jmp(L_third_loop_prologue);
7743
7744 bind(L_done);
7745
7746 pop(zlen);
7747 pop(xlen);
7748
7749 pop(tmp5);
7750 pop(tmp4);
7751 pop(tmp3);
7752 pop(tmp2);
7753 pop(tmp1);
7754 }
7755 #endif
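Putting the loops together: the scalar algorithm the intrinsic implements is BigInteger.multiplyToLen(), reproduced below as a C++ sketch over big-endian 32-bit word arrays (most significant word first), translated directly from the First/Second/Third loop comments above. Sketch only; the function name is illustrative.

#include <cstdint>

// Sketch: z = x * y, schoolbook multiplication over 32-bit words.
// z must have room for xlen + ylen words.
static void multiply_to_len_ref(const uint32_t* x, int xlen,
                                const uint32_t* y, int ylen, uint32_t* z) {
  int xstart = xlen - 1;
  int ystart = ylen - 1;

  // First loop: seed z with x[xstart] * y.
  uint64_t carry = 0;
  for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
    uint64_t product = (uint64_t)y[idx] * x[xstart] + carry;
    z[kdx] = (uint32_t)product;
    carry  = product >> 32;
  }
  z[xstart] = (uint32_t)carry;

  // Second and third (nested) loops: accumulate the remaining rows of x.
  for (int i = xstart - 1; i >= 0; i--) {
    carry = 0;
    for (int jdx = ystart, k = ystart + 1 + i; jdx >= 0; jdx--, k--) {
      uint64_t product = (uint64_t)y[jdx] * x[i] + z[k] + carry;
      z[k]  = (uint32_t)product;
      carry = product >> 32;
    }
    z[i] = (uint32_t)carry;
  }
}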
7756
7296 7757 /**
7297 7758  * Emits code to update CRC-32 with a byte value according to constants in table
7298 7759  *
7299 7760  * @param [in,out]crc Register containing the crc.
7300 7761  * @param [in]val Register containing the byte to fold into the CRC.
7314 7775
7315 7776 /**
7316 7777  * Fold 128-bit data chunk
7317 7778  */
7318 7779 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7319 -   vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7320 -   vpclmulldq(xcrc, xK, xcrc); // [63:0]
7321 -   vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
7322 -   pxor(xcrc, xtmp);
7780 +   if (UseAVX > 0) {
7781 +     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7782 +     vpclmulldq(xcrc, xK, xcrc); // [63:0]
7783 +     vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
7784 +     pxor(xcrc, xtmp);
7785 +   } else {
7786 +     movdqa(xtmp, xcrc);
7787 +     pclmulhdq(xtmp, xK); // [123:64]
7788 +     pclmulldq(xcrc, xK); // [63:0]
7789 +     pxor(xcrc, xtmp);
7790 +     movdqu(xtmp, Address(buf, offset));
7791 +     pxor(xcrc, xtmp);
7792 +   }
7323 7793 }
7324 7794
7325 7795 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7326 -   vpclmulhdq(xtmp, xK, xcrc);
7327 -   vpclmulldq(xcrc, xK, xcrc);
7328 -   pxor(xcrc, xbuf);
7329 -   pxor(xcrc, xtmp);
7796 +   if (UseAVX > 0) {
7797 +     vpclmulhdq(xtmp, xK, xcrc);
7798 +     vpclmulldq(xcrc, xK, xcrc);
7799 +     pxor(xcrc, xbuf);
7800 +     pxor(xcrc, xtmp);
7801 +   } else {
7802 +     movdqa(xtmp, xcrc);
7803 +     pclmulhdq(xtmp, xK);
7804 +     pclmulldq(xcrc, xK);
7805 +     pxor(xcrc, xbuf);
7806 +     pxor(xcrc, xtmp);
7807 +   }
7330 7808 }
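Both fold variants compute the same two carry-less products of the running CRC state with the folding constants in xK; the non-AVX path merely uses the destructive two-operand SSE forms, hence the extra movdqa/movdqu copies. For reference, a scalar C++ sketch of the carry-less (GF(2) polynomial) 64x64 multiply that pclmulqdq/vpclmulqdq perform on one 64-bit lane (hypothetical helper, unsigned __int128 assumed):

#include <cstdint>

// Carry-less multiply: like an integer multiply, but partial products are
// combined with XOR instead of ADD (polynomial multiplication over GF(2)).
static unsigned __int128 clmul64_ref(uint64_t a, uint64_t b) {
  unsigned __int128 r = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) {
      r ^= (unsigned __int128)a << i;
    }
  }
  return r;
}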
7331 7809
7332 7810 /**
7333 7811  * 8-bit folds to compute 32-bit CRC
7334 7812  *
7442 7920   jccb(Assembler::greater, L_fold_tail_loop);
7443 7921
7444 7922   // Fold 128 bits in xmm1 down into 32 bits in crc register.
7445 7923   BIND(L_fold_128b);
7446 7924   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7447 -   vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7448 -   vpand(xmm3, xmm0, xmm2, false /* vector256 */);
7449 -   vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7925 +   if (UseAVX > 0) {
7926 +     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7927 +     vpand(xmm3, xmm0, xmm2, false /* vector256 */);
7928 +     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7929 +   } else {
7930 +     movdqa(xmm2, xmm0);
7931 +     pclmulqdq(xmm2, xmm1, 0x1);
7932 +     movdqa(xmm3, xmm0);
7933 +     pand(xmm3, xmm2);
7934 +     pclmulqdq(xmm0, xmm3, 0x1);
7935 +   }
7450 7936   psrldq(xmm1, 8);
7451 7937   psrldq(xmm2, 4);
7452 7938   pxor(xmm0, xmm1);
7453 7939   pxor(xmm0, xmm2);
7454 7940