Mercurial > hg > truffle
diff src/cpu/sparc/vm/stubGenerator_sparc.cpp @ 20313:b20a35eae442
8035968: Leverage CPU Instructions to Improve SHA Performance on SPARC
Summary: Add C2 SHA intrinsics on SPARC
Reviewed-by: kvn, roland
Contributed-by: james.cheng@oracle.com
author | kvn |
---|---|
date | Wed, 11 Jun 2014 11:05:10 -0700 |
parents | 0342d80559e0 |
children | 7848fc12602b |
line wrap: on
line diff
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Tue Jun 10 12:28:06 2014 -0700 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Wed Jun 11 11:05:10 2014 -0700 @@ -4575,6 +4575,219 @@ return start; } + address generate_sha1_implCompress(bool multi_block, const char *name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + Label L_sha1_loop, L_sha1_unaligned_input, L_sha1_unaligned_input_loop; + int i; + + Register buf = O0; // byte[] source+offset + Register state = O1; // int[] SHA.state + Register ofs = O2; // int offset + Register limit = O3; // int limit + + // load state into F0-F4 + for (i = 0; i < 5; i++) { + __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i)); + } + + __ andcc(buf, 7, G0); + __ br(Assembler::notZero, false, Assembler::pn, L_sha1_unaligned_input); + __ delayed()->nop(); + + __ BIND(L_sha1_loop); + // load buf into F8-F22 + for (i = 0; i < 8; i++) { + __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); + } + __ sha1(); + if (multi_block) { + __ add(ofs, 64, ofs); + __ add(buf, 64, buf); + __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_loop); + __ mov(ofs, O0); // to be returned + } + + // store F0-F4 into state and return + for (i = 0; i < 4; i++) { + __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); + } + __ retl(); + __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10); + + __ BIND(L_sha1_unaligned_input); + __ alignaddr(buf, G0, buf); + + __ BIND(L_sha1_unaligned_input_loop); + // load buf into F8-F22 + for (i = 0; i < 9; i++) { + __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); + } + for (i = 0; i < 8; i++) { + __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8)); + } + __ sha1(); + if (multi_block) { + __ add(ofs, 64, ofs); + __ add(buf, 64, buf); + __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_unaligned_input_loop); + __ mov(ofs, O0); // to be returned + } + + // store F0-F4 into state and return + for (i = 0; i < 4; i++) { + __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); + } + __ retl(); + __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10); + + return start; + } + + address generate_sha256_implCompress(bool multi_block, const char *name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + Label L_sha256_loop, L_sha256_unaligned_input, L_sha256_unaligned_input_loop; + int i; + + Register buf = O0; // byte[] source+offset + Register state = O1; // int[] SHA2.state + Register ofs = O2; // int offset + Register limit = O3; // int limit + + // load state into F0-F7 + for (i = 0; i < 8; i++) { + __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i)); + } + + __ andcc(buf, 7, G0); + __ br(Assembler::notZero, false, Assembler::pn, L_sha256_unaligned_input); + __ delayed()->nop(); + + __ BIND(L_sha256_loop); + // load buf into F8-F22 + for (i = 0; i < 8; i++) { + __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); + } + __ sha256(); + if (multi_block) { + __ add(ofs, 64, ofs); + __ add(buf, 64, buf); + __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_loop); + __ mov(ofs, O0); // to be returned + } + + // store F0-F7 into state and return + for (i = 0; i < 7; i++) { + __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); + } + __ retl(); + __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c); + + __ BIND(L_sha256_unaligned_input); + __ alignaddr(buf, G0, buf); + + __ BIND(L_sha256_unaligned_input_loop); + // load buf into F8-F22 + for (i = 0; i < 9; i++) { + __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); + } + for (i = 0; i < 8; i++) { + __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8)); + } + __ sha256(); + if (multi_block) { + __ add(ofs, 64, ofs); + __ add(buf, 64, buf); + __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_unaligned_input_loop); + __ mov(ofs, O0); // to be returned + } + + // store F0-F7 into state and return + for (i = 0; i < 7; i++) { + __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); + } + __ retl(); + __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c); + + return start; + } + + address generate_sha512_implCompress(bool multi_block, const char *name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + Label L_sha512_loop, L_sha512_unaligned_input, L_sha512_unaligned_input_loop; + int i; + + Register buf = O0; // byte[] source+offset + Register state = O1; // long[] SHA5.state + Register ofs = O2; // int offset + Register limit = O3; // int limit + + // load state into F0-F14 + for (i = 0; i < 8; i++) { + __ ldf(FloatRegisterImpl::D, state, i*8, as_FloatRegister(i*2)); + } + + __ andcc(buf, 7, G0); + __ br(Assembler::notZero, false, Assembler::pn, L_sha512_unaligned_input); + __ delayed()->nop(); + + __ BIND(L_sha512_loop); + // load buf into F16-F46 + for (i = 0; i < 16; i++) { + __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16)); + } + __ sha512(); + if (multi_block) { + __ add(ofs, 128, ofs); + __ add(buf, 128, buf); + __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_loop); + __ mov(ofs, O0); // to be returned + } + + // store F0-F14 into state and return + for (i = 0; i < 7; i++) { + __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8); + } + __ retl(); + __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38); + + __ BIND(L_sha512_unaligned_input); + __ alignaddr(buf, G0, buf); + + __ BIND(L_sha512_unaligned_input_loop); + // load buf into F16-F46 + for (i = 0; i < 17; i++) { + __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16)); + } + for (i = 0; i < 16; i++) { + __ faligndata(as_FloatRegister(i*2 + 16), as_FloatRegister(i*2 + 18), as_FloatRegister(i*2 + 16)); + } + __ sha512(); + if (multi_block) { + __ add(ofs, 128, ofs); + __ add(buf, 128, buf); + __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_unaligned_input_loop); + __ mov(ofs, O0); // to be returned + } + + // store F0-F14 into state and return + for (i = 0; i < 7; i++) { + __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8); + } + __ retl(); + __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38); + + return start; + } + void generate_initial() { // Generates all stubs and initializes the entry points @@ -4647,6 +4860,20 @@ StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); } + + // generate SHA1/SHA256/SHA512 intrinsics code + if (UseSHA1Intrinsics) { + StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); + StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); + } + if (UseSHA256Intrinsics) { + StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); + StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); + } + if (UseSHA512Intrinsics) { + StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); + StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); + } }