comparison src/cpu/x86/vm/macroAssembler_x86.cpp @ 17714:4d4ea046d32a
8033805: Move Fast_Lock/Fast_Unlock code from .ad files to macroassembler
Summary: Consolidated C2 x86 locking code in one place in macroAssembler_x86.cpp.
Reviewed-by: roland
author | kvn |
date | Mon, 24 Feb 2014 15:12:26 -0800 |
parents | 61746b5f0ed3 |
children | d8041d695d19 606acabe7b5c |
17713:e7cf34c87433 | 17714:4d4ea046d32a |
96 | 96 |
97 Address MacroAssembler::as_Address(ArrayAddress adr) { | 97 Address MacroAssembler::as_Address(ArrayAddress adr) { |
98 return Address::make_array(adr); | 98 return Address::make_array(adr); |
99 } | 99 } |
100 | 100 |
101 int MacroAssembler::biased_locking_enter(Register lock_reg, | |
102 Register obj_reg, | |
103 Register swap_reg, | |
104 Register tmp_reg, | |
105 bool swap_reg_contains_mark, | |
106 Label& done, | |
107 Label* slow_case, | |
108 BiasedLockingCounters* counters) { | |
109 assert(UseBiasedLocking, "why call this otherwise?"); | |
110 assert(swap_reg == rax, "swap_reg must be rax, for cmpxchg"); | |
111 assert_different_registers(lock_reg, obj_reg, swap_reg); | |
112 | |
113 if (PrintBiasedLockingStatistics && counters == NULL) | |
114 counters = BiasedLocking::counters(); | |
115 | |
116 bool need_tmp_reg = false; | |
117 if (tmp_reg == noreg) { | |
118 need_tmp_reg = true; | |
119 tmp_reg = lock_reg; | |
120 } else { | |
121 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg); | |
122 } | |
123 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); | |
124 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); | |
125 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); | |
126 Address saved_mark_addr(lock_reg, 0); | |
127 | |
128 // Biased locking | |
129 // See whether the lock is currently biased toward our thread and | |
130 // whether the epoch is still valid | |
131 // Note that the runtime guarantees sufficient alignment of JavaThread | |
132 // pointers to allow age to be placed into low bits | |
133 // First check to see whether biasing is even enabled for this object | |
134 Label cas_label; | |
135 int null_check_offset = -1; | |
136 if (!swap_reg_contains_mark) { | |
137 null_check_offset = offset(); | |
138 movl(swap_reg, mark_addr); | |
139 } | |
140 if (need_tmp_reg) { | |
141 push(tmp_reg); | |
142 } | |
143 movl(tmp_reg, swap_reg); | |
144 andl(tmp_reg, markOopDesc::biased_lock_mask_in_place); | |
145 cmpl(tmp_reg, markOopDesc::biased_lock_pattern); | |
146 if (need_tmp_reg) { | |
147 pop(tmp_reg); | |
148 } | |
149 jcc(Assembler::notEqual, cas_label); | |
150 // The bias pattern is present in the object's header. Need to check | |
151 // whether the bias owner and the epoch are both still current. | |
152 // Note that because there is no current thread register on x86 we | |
153 // need to store off the mark word we read out of the object to | |
154 // avoid reloading it and needing to recheck invariants below. This | |
155 // store is unfortunate but it makes the overall code shorter and | |
156 // simpler. | |
157 movl(saved_mark_addr, swap_reg); | |
158 if (need_tmp_reg) { | |
159 push(tmp_reg); | |
160 } | |
161 get_thread(tmp_reg); | |
162 xorl(swap_reg, tmp_reg); | |
163 if (swap_reg_contains_mark) { | |
164 null_check_offset = offset(); | |
165 } | |
166 movl(tmp_reg, klass_addr); | |
167 xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset())); | |
168 andl(swap_reg, ~((int) markOopDesc::age_mask_in_place)); | |
169 if (need_tmp_reg) { | |
170 pop(tmp_reg); | |
171 } | |
172 if (counters != NULL) { | |
173 cond_inc32(Assembler::zero, | |
174 ExternalAddress((address)counters->biased_lock_entry_count_addr())); | |
175 } | |
176 jcc(Assembler::equal, done); | |
177 | |
178 Label try_revoke_bias; | |
179 Label try_rebias; | |
180 | |
181 // At this point we know that the header has the bias pattern and | |
182 // that we are not the bias owner in the current epoch. We need to | |
183 // figure out more details about the state of the header in order to | |
184 // know what operations can be legally performed on the object's | |
185 // header. | |
186 | |
187 // If the low three bits in the xor result aren't clear, that means | |
188 // the prototype header is no longer biased and we have to revoke | |
189 // the bias on this object. | |
190 testl(swap_reg, markOopDesc::biased_lock_mask_in_place); | |
191 jcc(Assembler::notZero, try_revoke_bias); | |
192 | |
193 // Biasing is still enabled for this data type. See whether the | |
194 // epoch of the current bias is still valid, meaning that the epoch | |
195 // bits of the mark word are equal to the epoch bits of the | |
196 // prototype header. (Note that the prototype header's epoch bits | |
197 // only change at a safepoint.) If not, attempt to rebias the object | |
198 // toward the current thread. Note that we must be absolutely sure | |
199 // that the current epoch is invalid in order to do this because | |
200 // otherwise the manipulations it performs on the mark word are | |
201 // illegal. | |
202 testl(swap_reg, markOopDesc::epoch_mask_in_place); | |
203 jcc(Assembler::notZero, try_rebias); | |
204 | |
205 // The epoch of the current bias is still valid but we know nothing | |
206 // about the owner; it might be set or it might be clear. Try to | |
207 // acquire the bias of the object using an atomic operation. If this | |
208 // fails we will go in to the runtime to revoke the object's bias. | |
209 // Note that we first construct the presumed unbiased header so we | |
210 // don't accidentally blow away another thread's valid bias. | |
211 movl(swap_reg, saved_mark_addr); | |
212 andl(swap_reg, | |
213 markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); | |
214 if (need_tmp_reg) { | |
215 push(tmp_reg); | |
216 } | |
217 get_thread(tmp_reg); | |
218 orl(tmp_reg, swap_reg); | |
219 if (os::is_MP()) { | |
220 lock(); | |
221 } | |
222 cmpxchgptr(tmp_reg, Address(obj_reg, 0)); | |
223 if (need_tmp_reg) { | |
224 pop(tmp_reg); | |
225 } | |
226 // If the biasing toward our thread failed, this means that | |
227 // another thread succeeded in biasing it toward itself and we | |
228 // need to revoke that bias. The revocation will occur in the | |
229 // interpreter runtime in the slow case. | |
230 if (counters != NULL) { | |
231 cond_inc32(Assembler::zero, | |
232 ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr())); | |
233 } | |
234 if (slow_case != NULL) { | |
235 jcc(Assembler::notZero, *slow_case); | |
236 } | |
237 jmp(done); | |
238 | |
239 bind(try_rebias); | |
240 // At this point we know the epoch has expired, meaning that the | |
241 // current "bias owner", if any, is actually invalid. Under these | |
242 // circumstances _only_, we are allowed to use the current header's | |
243 // value as the comparison value when doing the cas to acquire the | |
244 // bias in the current epoch. In other words, we allow transfer of | |
245 // the bias from one thread to another directly in this situation. | |
246 // | |
247 // FIXME: due to a lack of registers we currently blow away the age | |
248 // bits in this situation. Should attempt to preserve them. | |
249 if (need_tmp_reg) { | |
250 push(tmp_reg); | |
251 } | |
252 get_thread(tmp_reg); | |
253 movl(swap_reg, klass_addr); | |
254 orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset())); | |
255 movl(swap_reg, saved_mark_addr); | |
256 if (os::is_MP()) { | |
257 lock(); | |
258 } | |
259 cmpxchgptr(tmp_reg, Address(obj_reg, 0)); | |
260 if (need_tmp_reg) { | |
261 pop(tmp_reg); | |
262 } | |
263 // If the biasing toward our thread failed, then another thread | |
264 // succeeded in biasing it toward itself and we need to revoke that | |
265 // bias. The revocation will occur in the runtime in the slow case. | |
266 if (counters != NULL) { | |
267 cond_inc32(Assembler::zero, | |
268 ExternalAddress((address)counters->rebiased_lock_entry_count_addr())); | |
269 } | |
270 if (slow_case != NULL) { | |
271 jcc(Assembler::notZero, *slow_case); | |
272 } | |
273 jmp(done); | |
274 | |
275 bind(try_revoke_bias); | |
276 // The prototype mark in the klass doesn't have the bias bit set any | |
277 // more, indicating that objects of this data type are not supposed | |
278 // to be biased any more. We are going to try to reset the mark of | |
279 // this object to the prototype value and fall through to the | |
280 // CAS-based locking scheme. Note that if our CAS fails, it means | |
281 // that another thread raced us for the privilege of revoking the | |
282 // bias of this particular object, so it's okay to continue in the | |
283 // normal locking code. | |
284 // | |
285 // FIXME: due to a lack of registers we currently blow away the age | |
286 // bits in this situation. Should attempt to preserve them. | |
287 movl(swap_reg, saved_mark_addr); | |
288 if (need_tmp_reg) { | |
289 push(tmp_reg); | |
290 } | |
291 movl(tmp_reg, klass_addr); | |
292 movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset())); | |
293 if (os::is_MP()) { | |
294 lock(); | |
295 } | |
296 cmpxchgptr(tmp_reg, Address(obj_reg, 0)); | |
297 if (need_tmp_reg) { | |
298 pop(tmp_reg); | |
299 } | |
300 // Fall through to the normal CAS-based lock, because no matter what | |
301 // the result of the above CAS, some thread must have succeeded in | |
302 // removing the bias bit from the object's header. | |
303 if (counters != NULL) { | |
304 cond_inc32(Assembler::zero, | |
305 ExternalAddress((address)counters->revoked_lock_entry_count_addr())); | |
306 } | |
307 | |
308 bind(cas_label); | |
309 | |
310 return null_check_offset; | |
311 } | |
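
To make the triage above concrete: a minimal standalone C++ sketch of the xor-based "is the bias still ours and current" test the deleted 32-bit path performs. The mask constants mirror markOopDesc's layout (bias pattern 101b, 4 age bits, 2 epoch bits); the thread pointer value is made up and a 64-bit build is assumed.

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uintptr_t age_mask       = 0xfu << 3;        // age_mask_in_place
        const uintptr_t biased_pattern = 0x5;              // biased_lock_pattern (101b)
        const uintptr_t epoch_bit      = 0x1u << 7;        // one epoch bit set

        uintptr_t thread    = 0x7f0000400;                 // hypothetical aligned JavaThread*
        uintptr_t prototype = biased_pattern | epoch_bit;  // klass prototype header
        uintptr_t mark      = thread | prototype;          // object currently biased to us

        // The assembly computes swap_reg = mark ^ thread ^ prototype_header,
        // then clears the age bits; zero means owner, pattern and epoch all match.
        uintptr_t diff = (mark ^ thread ^ prototype) & ~age_mask;
        printf("bias still valid for us: %s\n", diff == 0 ? "yes" : "no");
        return 0;
    }
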
312 void MacroAssembler::call_VM_leaf_base(address entry_point, | 101 void MacroAssembler::call_VM_leaf_base(address entry_point, |
313 int number_of_arguments) { | 102 int number_of_arguments) { |
314 call(RuntimeAddress(entry_point)); | 103 call(RuntimeAddress(entry_point)); |
315 increment(rsp, number_of_arguments * wordSize); | 104 increment(rsp, number_of_arguments * wordSize); |
316 } | 105 } |
724 assert(index._disp == 0, "must not have disp"); // maybe it can? | 513 assert(index._disp == 0, "must not have disp"); // maybe it can? |
725 Address array(rscratch1, index._index, index._scale, index._disp); | 514 Address array(rscratch1, index._index, index._scale, index._disp); |
726 return array; | 515 return array; |
727 } | 516 } |
728 | 517 |
729 int MacroAssembler::biased_locking_enter(Register lock_reg, | |
730 Register obj_reg, | |
731 Register swap_reg, | |
732 Register tmp_reg, | |
733 bool swap_reg_contains_mark, | |
734 Label& done, | |
735 Label* slow_case, | |
736 BiasedLockingCounters* counters) { | |
737 assert(UseBiasedLocking, "why call this otherwise?"); | |
738 assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq"); | |
739 assert(tmp_reg != noreg, "tmp_reg must be supplied"); | |
740 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg); | |
741 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); | |
742 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); | |
743 Address saved_mark_addr(lock_reg, 0); | |
744 | |
745 if (PrintBiasedLockingStatistics && counters == NULL) | |
746 counters = BiasedLocking::counters(); | |
747 | |
748 // Biased locking | |
749 // See whether the lock is currently biased toward our thread and | |
750 // whether the epoch is still valid | |
751 // Note that the runtime guarantees sufficient alignment of JavaThread | |
752 // pointers to allow age to be placed into low bits | |
753 // First check to see whether biasing is even enabled for this object | |
754 Label cas_label; | |
755 int null_check_offset = -1; | |
756 if (!swap_reg_contains_mark) { | |
757 null_check_offset = offset(); | |
758 movq(swap_reg, mark_addr); | |
759 } | |
760 movq(tmp_reg, swap_reg); | |
761 andq(tmp_reg, markOopDesc::biased_lock_mask_in_place); | |
762 cmpq(tmp_reg, markOopDesc::biased_lock_pattern); | |
763 jcc(Assembler::notEqual, cas_label); | |
764 // The bias pattern is present in the object's header. Need to check | |
765 // whether the bias owner and the epoch are both still current. | |
766 load_prototype_header(tmp_reg, obj_reg); | |
767 orq(tmp_reg, r15_thread); | |
768 xorq(tmp_reg, swap_reg); | |
769 andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place)); | |
770 if (counters != NULL) { | |
771 cond_inc32(Assembler::zero, | |
772 ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr())); | |
773 } | |
774 jcc(Assembler::equal, done); | |
775 | |
776 Label try_revoke_bias; | |
777 Label try_rebias; | |
778 | |
779 // At this point we know that the header has the bias pattern and | |
780 // that we are not the bias owner in the current epoch. We need to | |
781 // figure out more details about the state of the header in order to | |
782 // know what operations can be legally performed on the object's | |
783 // header. | |
784 | |
785 // If the low three bits in the xor result aren't clear, that means | |
786 // the prototype header is no longer biased and we have to revoke | |
787 // the bias on this object. | |
788 testq(tmp_reg, markOopDesc::biased_lock_mask_in_place); | |
789 jcc(Assembler::notZero, try_revoke_bias); | |
790 | |
791 // Biasing is still enabled for this data type. See whether the | |
792 // epoch of the current bias is still valid, meaning that the epoch | |
793 // bits of the mark word are equal to the epoch bits of the | |
794 // prototype header. (Note that the prototype header's epoch bits | |
795 // only change at a safepoint.) If not, attempt to rebias the object | |
796 // toward the current thread. Note that we must be absolutely sure | |
797 // that the current epoch is invalid in order to do this because | |
798 // otherwise the manipulations it performs on the mark word are | |
799 // illegal. | |
800 testq(tmp_reg, markOopDesc::epoch_mask_in_place); | |
801 jcc(Assembler::notZero, try_rebias); | |
802 | |
803 // The epoch of the current bias is still valid but we know nothing | |
804 // about the owner; it might be set or it might be clear. Try to | |
805 // acquire the bias of the object using an atomic operation. If this | |
806 // fails we will go in to the runtime to revoke the object's bias. | |
807 // Note that we first construct the presumed unbiased header so we | |
808 // don't accidentally blow away another thread's valid bias. | |
809 andq(swap_reg, | |
810 markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); | |
811 movq(tmp_reg, swap_reg); | |
812 orq(tmp_reg, r15_thread); | |
813 if (os::is_MP()) { | |
814 lock(); | |
815 } | |
816 cmpxchgq(tmp_reg, Address(obj_reg, 0)); | |
817 // If the biasing toward our thread failed, this means that | |
818 // another thread succeeded in biasing it toward itself and we | |
819 // need to revoke that bias. The revocation will occur in the | |
820 // interpreter runtime in the slow case. | |
821 if (counters != NULL) { | |
822 cond_inc32(Assembler::zero, | |
823 ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr())); | |
824 } | |
825 if (slow_case != NULL) { | |
826 jcc(Assembler::notZero, *slow_case); | |
827 } | |
828 jmp(done); | |
829 | |
830 bind(try_rebias); | |
831 // At this point we know the epoch has expired, meaning that the | |
832 // current "bias owner", if any, is actually invalid. Under these | |
833 // circumstances _only_, we are allowed to use the current header's | |
834 // value as the comparison value when doing the cas to acquire the | |
835 // bias in the current epoch. In other words, we allow transfer of | |
836 // the bias from one thread to another directly in this situation. | |
837 // | |
838 // FIXME: due to a lack of registers we currently blow away the age | |
839 // bits in this situation. Should attempt to preserve them. | |
840 load_prototype_header(tmp_reg, obj_reg); | |
841 orq(tmp_reg, r15_thread); | |
842 if (os::is_MP()) { | |
843 lock(); | |
844 } | |
845 cmpxchgq(tmp_reg, Address(obj_reg, 0)); | |
846 // If the biasing toward our thread failed, then another thread | |
847 // succeeded in biasing it toward itself and we need to revoke that | |
848 // bias. The revocation will occur in the runtime in the slow case. | |
849 if (counters != NULL) { | |
850 cond_inc32(Assembler::zero, | |
851 ExternalAddress((address) counters->rebiased_lock_entry_count_addr())); | |
852 } | |
853 if (slow_case != NULL) { | |
854 jcc(Assembler::notZero, *slow_case); | |
855 } | |
856 jmp(done); | |
857 | |
858 bind(try_revoke_bias); | |
859 // The prototype mark in the klass doesn't have the bias bit set any | |
860 // more, indicating that objects of this data type are not supposed | |
861 // to be biased any more. We are going to try to reset the mark of | |
862 // this object to the prototype value and fall through to the | |
863 // CAS-based locking scheme. Note that if our CAS fails, it means | |
864 // that another thread raced us for the privilege of revoking the | |
865 // bias of this particular object, so it's okay to continue in the | |
866 // normal locking code. | |
867 // | |
868 // FIXME: due to a lack of registers we currently blow away the age | |
869 // bits in this situation. Should attempt to preserve them. | |
870 load_prototype_header(tmp_reg, obj_reg); | |
871 if (os::is_MP()) { | |
872 lock(); | |
873 } | |
874 cmpxchgq(tmp_reg, Address(obj_reg, 0)); | |
875 // Fall through to the normal CAS-based lock, because no matter what | |
876 // the result of the above CAS, some thread must have succeeded in | |
877 // removing the bias bit from the object's header. | |
878 if (counters != NULL) { | |
879 cond_inc32(Assembler::zero, | |
880 ExternalAddress((address) counters->revoked_lock_entry_count_addr())); | |
881 } | |
882 | |
883 bind(cas_label); | |
884 | |
885 return null_check_offset; | |
886 } | |
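
The anonymous-bias acquisition above is a single cmpxchgq against the mark word. A conceptual C++11-atomics rendering of the same step (try_acquire_bias is a hypothetical helper; keep_mask mirrors the lock/age/epoch masks used in the code):

    #include <atomic>
    #include <cstdint>

    // keep only the lock, age and epoch bits of the current mark word
    static const uintptr_t keep_mask = 0x7 | (0xfu << 3) | (0x3u << 7);

    bool try_acquire_bias(std::atomic<uintptr_t>& mark_word, uintptr_t self) {
        // Build the presumed unbiased header first, so a valid bias already
        // held by another thread is never blown away by the exchange.
        uintptr_t unbiased = mark_word.load(std::memory_order_relaxed) & keep_mask;
        uintptr_t biased_to_self = unbiased | self;
        // On failure the caller falls into the runtime to revoke the other bias.
        return mark_word.compare_exchange_strong(unbiased, biased_to_self);
    }
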
887 | |
888 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) { | 518 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) { |
889 Label L, E; | 519 Label L, E; |
890 | 520 |
891 #ifdef _WIN64 | 521 #ifdef _WIN64 |
892 // Windows always allocates space for its register args | 522 // Windows always allocates space for its register args |
1358 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32)); | 988 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32)); |
1359 } | 989 } |
1360 | 990 |
1361 void MacroAssembler::atomic_incl(AddressLiteral counter_addr) { | 991 void MacroAssembler::atomic_incl(AddressLiteral counter_addr) { |
1362 pushf(); | 992 pushf(); |
1363 if (os::is_MP()) | 993 if (reachable(counter_addr)) { |
1364 lock(); | 994 if (os::is_MP()) |
1365 incrementl(counter_addr); | 995 lock(); |
996 incrementl(as_Address(counter_addr)); | |
997 } else { | |
998 lea(rscratch1, counter_addr); | |
999 if (os::is_MP()) | |
1000 lock(); | |
1001 incrementl(Address(rscratch1, 0)); | |
1002 } | |
1366 popf(); | 1003 popf(); |
1367 } | 1004 } |
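
The rewritten atomic_incl now branches on reachable(counter_addr): an x86_64 RIP-relative operand can only encode a signed 32-bit displacement, so a far-away counter must first be materialized into rscratch1 with lea. A rough standalone sketch of the reachability idea (fits_rip_relative is a hypothetical helper, not HotSpot's reachable()):

    #include <cstdint>

    bool fits_rip_relative(uintptr_t target, uintptr_t next_insn) {
        intptr_t disp = (intptr_t)(target - next_insn);
        return disp == (intptr_t)(int32_t)disp;  // else: lea rscratch1, target
    }
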
1368 | 1005 |
1369 // Writes to stack successive pages until offset reached to check for | 1006 // Writes to stack successive pages until offset reached to check for |
1370 // stack overflow + shadow pages. This clobbers tmp. | 1007 // stack overflow + shadow pages. This clobbers tmp. |
1391 // so the bigger the better. | 1028 // so the bigger the better. |
1392 movptr(Address(tmp, (-i*os::vm_page_size())), size ); | 1029 movptr(Address(tmp, (-i*os::vm_page_size())), size ); |
1393 } | 1030 } |
1394 } | 1031 } |
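
A simplified standalone rendering of the banging loop above, for orientation (bang_stack_shadow and its parameters are illustrative, not HotSpot's signature):

    #include <cstdint>

    void bang_stack_shadow(char* sp, int pages, int page_size) {
        for (int i = 1; i <= pages; i++) {
            // One write per page below sp: faults early if the guard is hit.
            *(volatile intptr_t*)(sp - (intptr_t)i * page_size) = 0;
        }
    }
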
1395 | 1032 |
1033 int MacroAssembler::biased_locking_enter(Register lock_reg, | |
1034 Register obj_reg, | |
1035 Register swap_reg, | |
1036 Register tmp_reg, | |
1037 bool swap_reg_contains_mark, | |
1038 Label& done, | |
1039 Label* slow_case, | |
1040 BiasedLockingCounters* counters) { | |
1041 assert(UseBiasedLocking, "why call this otherwise?"); | |
1042 assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq"); | |
1043 LP64_ONLY( assert(tmp_reg != noreg, "tmp_reg must be supplied"); ) | |
1044 bool need_tmp_reg = false; | |
1045 if (tmp_reg == noreg) { | |
1046 need_tmp_reg = true; | |
1047 tmp_reg = lock_reg; | |
1048 assert_different_registers(lock_reg, obj_reg, swap_reg); | |
1049 } else { | |
1050 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg); | |
1051 } | |
1052 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); | |
1053 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); | |
1054 Address saved_mark_addr(lock_reg, 0); | |
1055 | |
1056 if (PrintBiasedLockingStatistics && counters == NULL) { | |
1057 counters = BiasedLocking::counters(); | |
1058 } | |
1059 // Biased locking | |
1060 // See whether the lock is currently biased toward our thread and | |
1061 // whether the epoch is still valid | |
1062 // Note that the runtime guarantees sufficient alignment of JavaThread | |
1063 // pointers to allow age to be placed into low bits | |
1064 // First check to see whether biasing is even enabled for this object | |
1065 Label cas_label; | |
1066 int null_check_offset = -1; | |
1067 if (!swap_reg_contains_mark) { | |
1068 null_check_offset = offset(); | |
1069 movptr(swap_reg, mark_addr); | |
1070 } | |
1071 if (need_tmp_reg) { | |
1072 push(tmp_reg); | |
1073 } | |
1074 movptr(tmp_reg, swap_reg); | |
1075 andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place); | |
1076 cmpptr(tmp_reg, markOopDesc::biased_lock_pattern); | |
1077 if (need_tmp_reg) { | |
1078 pop(tmp_reg); | |
1079 } | |
1080 jcc(Assembler::notEqual, cas_label); | |
1081 // The bias pattern is present in the object's header. Need to check | |
1082 // whether the bias owner and the epoch are both still current. | |
1083 #ifndef _LP64 | |
1084 // Note that because there is no current thread register on x86_32 we | |
1085 // need to store off the mark word we read out of the object to | |
1086 // avoid reloading it and needing to recheck invariants below. This | |
1087 // store is unfortunate but it makes the overall code shorter and | |
1088 // simpler. | |
1089 movptr(saved_mark_addr, swap_reg); | |
1090 #endif | |
1091 if (need_tmp_reg) { | |
1092 push(tmp_reg); | |
1093 } | |
1094 if (swap_reg_contains_mark) { | |
1095 null_check_offset = offset(); | |
1096 } | |
1097 load_prototype_header(tmp_reg, obj_reg); | |
1098 #ifdef _LP64 | |
1099 orptr(tmp_reg, r15_thread); | |
1100 xorptr(tmp_reg, swap_reg); | |
1101 Register header_reg = tmp_reg; | |
1102 #else | |
1103 xorptr(tmp_reg, swap_reg); | |
1104 get_thread(swap_reg); | |
1105 xorptr(swap_reg, tmp_reg); | |
1106 Register header_reg = swap_reg; | |
1107 #endif | |
1108 andptr(header_reg, ~((int) markOopDesc::age_mask_in_place)); | |
1109 if (need_tmp_reg) { | |
1110 pop(tmp_reg); | |
1111 } | |
1112 if (counters != NULL) { | |
1113 cond_inc32(Assembler::zero, | |
1114 ExternalAddress((address) counters->biased_lock_entry_count_addr())); | |
1115 } | |
1116 jcc(Assembler::equal, done); | |
1117 | |
1118 Label try_revoke_bias; | |
1119 Label try_rebias; | |
1120 | |
1121 // At this point we know that the header has the bias pattern and | |
1122 // that we are not the bias owner in the current epoch. We need to | |
1123 // figure out more details about the state of the header in order to | |
1124 // know what operations can be legally performed on the object's | |
1125 // header. | |
1126 | |
1127 // If the low three bits in the xor result aren't clear, that means | |
1128 // the prototype header is no longer biased and we have to revoke | |
1129 // the bias on this object. | |
1130 testptr(header_reg, markOopDesc::biased_lock_mask_in_place); | |
1131 jccb(Assembler::notZero, try_revoke_bias); | |
1132 | |
1133 // Biasing is still enabled for this data type. See whether the | |
1134 // epoch of the current bias is still valid, meaning that the epoch | |
1135 // bits of the mark word are equal to the epoch bits of the | |
1136 // prototype header. (Note that the prototype header's epoch bits | |
1137 // only change at a safepoint.) If not, attempt to rebias the object | |
1138 // toward the current thread. Note that we must be absolutely sure | |
1139 // that the current epoch is invalid in order to do this because | |
1140 // otherwise the manipulations it performs on the mark word are | |
1141 // illegal. | |
1142 testptr(header_reg, markOopDesc::epoch_mask_in_place); | |
1143 jccb(Assembler::notZero, try_rebias); | |
1144 | |
1145 // The epoch of the current bias is still valid but we know nothing | |
1146 // about the owner; it might be set or it might be clear. Try to | |
1147 // acquire the bias of the object using an atomic operation. If this | |
1148 // fails we will go in to the runtime to revoke the object's bias. | |
1149 // Note that we first construct the presumed unbiased header so we | |
1150 // don't accidentally blow away another thread's valid bias. | |
1151 NOT_LP64( movptr(swap_reg, saved_mark_addr); ) | |
1152 andptr(swap_reg, | |
1153 markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); | |
1154 if (need_tmp_reg) { | |
1155 push(tmp_reg); | |
1156 } | |
1157 #ifdef _LP64 | |
1158 movptr(tmp_reg, swap_reg); | |
1159 orptr(tmp_reg, r15_thread); | |
1160 #else | |
1161 get_thread(tmp_reg); | |
1162 orptr(tmp_reg, swap_reg); | |
1163 #endif | |
1164 if (os::is_MP()) { | |
1165 lock(); | |
1166 } | |
1167 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg | |
1168 if (need_tmp_reg) { | |
1169 pop(tmp_reg); | |
1170 } | |
1171 // If the biasing toward our thread failed, this means that | |
1172 // another thread succeeded in biasing it toward itself and we | |
1173 // need to revoke that bias. The revocation will occur in the | |
1174 // interpreter runtime in the slow case. | |
1175 if (counters != NULL) { | |
1176 cond_inc32(Assembler::zero, | |
1177 ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr())); | |
1178 } | |
1179 if (slow_case != NULL) { | |
1180 jcc(Assembler::notZero, *slow_case); | |
1181 } | |
1182 jmp(done); | |
1183 | |
1184 bind(try_rebias); | |
1185 // At this point we know the epoch has expired, meaning that the | |
1186 // current "bias owner", if any, is actually invalid. Under these | |
1187 // circumstances _only_, we are allowed to use the current header's | |
1188 // value as the comparison value when doing the cas to acquire the | |
1189 // bias in the current epoch. In other words, we allow transfer of | |
1190 // the bias from one thread to another directly in this situation. | |
1191 // | |
1192 // FIXME: due to a lack of registers we currently blow away the age | |
1193 // bits in this situation. Should attempt to preserve them. | |
1194 if (need_tmp_reg) { | |
1195 push(tmp_reg); | |
1196 } | |
1197 load_prototype_header(tmp_reg, obj_reg); | |
1198 #ifdef _LP64 | |
1199 orptr(tmp_reg, r15_thread); | |
1200 #else | |
1201 get_thread(swap_reg); | |
1202 orptr(tmp_reg, swap_reg); | |
1203 movptr(swap_reg, saved_mark_addr); | |
1204 #endif | |
1205 if (os::is_MP()) { | |
1206 lock(); | |
1207 } | |
1208 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg | |
1209 if (need_tmp_reg) { | |
1210 pop(tmp_reg); | |
1211 } | |
1212 // If the biasing toward our thread failed, then another thread | |
1213 // succeeded in biasing it toward itself and we need to revoke that | |
1214 // bias. The revocation will occur in the runtime in the slow case. | |
1215 if (counters != NULL) { | |
1216 cond_inc32(Assembler::zero, | |
1217 ExternalAddress((address) counters->rebiased_lock_entry_count_addr())); | |
1218 } | |
1219 if (slow_case != NULL) { | |
1220 jcc(Assembler::notZero, *slow_case); | |
1221 } | |
1222 jmp(done); | |
1223 | |
1224 bind(try_revoke_bias); | |
1225 // The prototype mark in the klass doesn't have the bias bit set any | |
1226 // more, indicating that objects of this data type are not supposed | |
1227 // to be biased any more. We are going to try to reset the mark of | |
1228 // this object to the prototype value and fall through to the | |
1229 // CAS-based locking scheme. Note that if our CAS fails, it means | |
1230 // that another thread raced us for the privilege of revoking the | |
1231 // bias of this particular object, so it's okay to continue in the | |
1232 // normal locking code. | |
1233 // | |
1234 // FIXME: due to a lack of registers we currently blow away the age | |
1235 // bits in this situation. Should attempt to preserve them. | |
1236 NOT_LP64( movptr(swap_reg, saved_mark_addr); ) | |
1237 if (need_tmp_reg) { | |
1238 push(tmp_reg); | |
1239 } | |
1240 load_prototype_header(tmp_reg, obj_reg); | |
1241 if (os::is_MP()) { | |
1242 lock(); | |
1243 } | |
1244 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg | |
1245 if (need_tmp_reg) { | |
1246 pop(tmp_reg); | |
1247 } | |
1248 // Fall through to the normal CAS-based lock, because no matter what | |
1249 // the result of the above CAS, some thread must have succeeded in | |
1250 // removing the bias bit from the object's header. | |
1251 if (counters != NULL) { | |
1252 cond_inc32(Assembler::zero, | |
1253 ExternalAddress((address) counters->revoked_lock_entry_count_addr())); | |
1254 } | |
1255 | |
1256 bind(cas_label); | |
1257 | |
1258 return null_check_offset; | |
1259 } | |
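
The merged biased_locking_enter above folds the two deleted copies together using the LP64_ONLY / NOT_LP64 selection macros. For readers coming from the .ad files, this is how those macros expand (simplified from HotSpot's globalDefinitions.hpp):

    #ifdef _LP64
    #define LP64_ONLY(code) code
    #define NOT_LP64(code)
    #else
    #define LP64_ONLY(code)
    #define NOT_LP64(code) code
    #endif
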
1260 | |
1396 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { | 1261 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { |
1397 assert(UseBiasedLocking, "why call this otherwise?"); | 1262 assert(UseBiasedLocking, "why call this otherwise?"); |
1398 | 1263 |
1399 // Check for biased locking unlock case, which is a no-op | 1264 // Check for biased locking unlock case, which is a no-op |
1400 // Note: we do not have to check the thread ID for two reasons. | 1265 // Note: we do not have to check the thread ID for two reasons. |
1405 movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); | 1270 movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); |
1406 andptr(temp_reg, markOopDesc::biased_lock_mask_in_place); | 1271 andptr(temp_reg, markOopDesc::biased_lock_mask_in_place); |
1407 cmpptr(temp_reg, markOopDesc::biased_lock_pattern); | 1272 cmpptr(temp_reg, markOopDesc::biased_lock_pattern); |
1408 jcc(Assembler::equal, done); | 1273 jcc(Assembler::equal, done); |
1409 } | 1274 } |
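
biased_locking_exit reduces to a three-instruction no-op test: if the mark word still carries the bias pattern, there is nothing to undo on unlock. As a one-line predicate (mask and pattern values mirror markOopDesc):

    #include <cstdint>

    bool biased_unlock_is_noop(uintptr_t mark) {
        return (mark & 0x7) == 0x5;  // biased_lock_mask / biased_lock_pattern (101b)
    }
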
1275 | |
1276 #ifdef COMPILER2 | |
1277 // Fast_Lock and Fast_Unlock used by C2 | |
1278 | |
1279 // Because the transitions from emitted code to the runtime | |
1280 // monitorenter/exit helper stubs are so slow it's critical that | |
1281 // we inline both the stack-locking fast-path and the inflated fast path. | |
1282 // | |
1283 // See also: cmpFastLock and cmpFastUnlock. | |
1284 // | |
1285 // What follows is a specialized inline transliteration of the code | |
1286 // in slow_enter() and slow_exit(). If we're concerned about I$ bloat | |
1287 // another option would be to emit TrySlowEnter and TrySlowExit methods | |
1288 // at startup-time. These methods would accept arguments as | |
1289 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure |
1290 // indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply | |
1291 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. | |
1292 // In practice, however, the # of lock sites is bounded and is usually small. | |
1293 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer | |
1294 // if the processor uses simple bimodal branch predictors keyed by EIP, |
1295 // since the helper routines would be called from multiple synchronization |
1296 // sites. |
1297 // | |
1298 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()" |
1299 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites | |
1300 // to those specialized methods. That'd give us a mostly platform-independent | |
1301 // implementation that the JITs could optimize and inline at their pleasure. | |
1302 // Done correctly, the only time we'd need to cross to native code would be |
1303 // to park() or unpark() threads. We'd also need a few more unsafe operators | |
1304 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and | |
1305 // (b) issue explicit barriers or fence operations. |
1306 // | |
1307 // TODO: | |
1308 // | |
1309 // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr). | |
1310 // This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals. | |
1311 // Given TLAB allocation, Self is usually manifested in a register, so passing it into | |
1312 // the lock operators would typically be faster than reifying Self. | |
1313 // | |
1314 // * Ideally I'd define the primitives as: | |
1315 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED. | |
1316 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED | |
1317 // Unfortunately ADLC bugs prevent us from expressing the ideal form. | |
1318 // Instead, we're stuck with the rather awkward and brittle register assignments below. |
1319 // Furthermore the register assignments are overconstrained, possibly resulting in | |
1320 // sub-optimal code near the synchronization site. | |
1321 // | |
1322 // * Eliminate the sp-proximity tests and just use "== Self" tests instead. | |
1323 // Alternately, use a better sp-proximity test. | |
1324 // | |
1325 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value. | |
1326 // Either one is sufficient to uniquely identify a thread. | |
1327 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. | |
1328 // | |
1329 // * Intrinsify notify() and notifyAll() for the common cases where the | |
1330 // object is locked by the calling thread but the waitlist is empty. | |
1331 // This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll(). |
1332 // | |
1333 // * use jccb and jmpb instead of jcc and jmp to improve code density. | |
1334 // But beware of excessive branch density on AMD Opterons. | |
1335 // | |
1336 // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success | |
1337 // or failure of the fast-path. If the fast-path fails then we pass | |
1338 // control to the slow-path, typically in C. In Fast_Lock and | |
1339 // Fast_Unlock we often branch to DONE_LABEL, just to find that C2 | |
1340 // will emit a conditional branch immediately after the node. | |
1341 // So we have branches to branches and lots of ICC.ZF games. | |
1342 // Instead, it might be better to have C2 pass a "FailureLabel" | |
1343 // into Fast_Lock and Fast_Unlock. In the case of success, control | |
1344 // will drop through the node. ICC.ZF is undefined at exit. | |
1345 // In the case of failure, the node will branch directly to the | |
1346 // FailureLabel | |
1347 | |
1348 | |
1349 // obj: object to lock | |
1350 // box: on-stack box address (displaced header location) - KILLED | |
1351 // rax: tmp -- KILLED |
1352 // scr: tmp -- KILLED | |
1353 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, Register scrReg, BiasedLockingCounters* counters) { | |
1354 // Ensure the register assignments are disjoint |
1355 guarantee (objReg != boxReg, ""); | |
1356 guarantee (objReg != tmpReg, ""); | |
1357 guarantee (objReg != scrReg, ""); | |
1358 guarantee (boxReg != tmpReg, ""); | |
1359 guarantee (boxReg != scrReg, ""); | |
1360 guarantee (tmpReg == rax, ""); | |
1361 | |
1362 if (counters != NULL) { | |
1363 atomic_incl(ExternalAddress((address)counters->total_entry_count_addr())); | |
1364 } | |
1365 if (EmitSync & 1) { | |
1366 // set box->dhw = unused_mark (3) | |
1367 // Force all sync thru slow-path: slow_enter() and slow_exit() | |
1368 movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())); | |
1369 cmpptr (rsp, (int32_t)NULL_WORD); | |
1370 } else | |
1371 if (EmitSync & 2) { | |
1372 Label DONE_LABEL ; | |
1373 if (UseBiasedLocking) { | |
1374 // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument. | |
1375 biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters); | |
1376 } | |
1377 | |
1378 movptr(tmpReg, Address(objReg, 0)); // fetch markword | |
1379 orptr (tmpReg, 0x1); | |
1380 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS | |
1381 if (os::is_MP()) { | |
1382 lock(); | |
1383 } | |
1384 cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg | |
1385 jccb(Assembler::equal, DONE_LABEL); | |
1386 // Recursive locking | |
1387 subptr(tmpReg, rsp); | |
1388 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) ); | |
1389 movptr(Address(boxReg, 0), tmpReg); | |
1390 bind(DONE_LABEL); | |
1391 } else { | |
1392 // Possible cases that we'll encounter in fast_lock | |
1393 // ------------------------------------------------ | |
1394 // * Inflated | |
1395 // -- unlocked | |
1396 // -- Locked | |
1397 // = by self | |
1398 // = by other | |
1399 // * biased | |
1400 // -- by Self | |
1401 // -- by other | |
1402 // * neutral | |
1403 // * stack-locked | |
1404 // -- by self | |
1405 // = sp-proximity test hits | |
1406 // = sp-proximity test generates false-negative | |
1407 // -- by other | |
1408 // | |
1409 | |
1410 Label IsInflated, DONE_LABEL; | |
1411 | |
1412 // it's stack-locked, biased or neutral | |
1413 // TODO: optimize away redundant LDs of obj->mark and improve the markword triage | |
1414 // order to reduce the number of conditional branches in the most common cases. | |
1415 // Beware -- there's a subtle invariant that fetch of the markword | |
1416 // at [FETCH], below, will never observe a biased encoding (*101b). | |
1417 // If this invariant is not held we risk exclusion (safety) failure. | |
1418 if (UseBiasedLocking && !UseOptoBiasInlining) { | |
1419 biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters); | |
1420 } | |
1421 | |
1422 movptr(tmpReg, Address(objReg, 0)); // [FETCH] | |
1423 testl (tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased | |
1424 jccb (Assembler::notZero, IsInflated); | |
1425 | |
1426 // Attempt stack-locking ... | |
1427 orptr (tmpReg, 0x1); | |
1428 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS | |
1429 if (os::is_MP()) { | |
1430 lock(); | |
1431 } | |
1432 cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg | |
1433 if (counters != NULL) { | |
1434 cond_inc32(Assembler::equal, | |
1435 ExternalAddress((address)counters->fast_path_entry_count_addr())); | |
1436 } | |
1437 jccb(Assembler::equal, DONE_LABEL); | |
1438 | |
1439 // Recursive locking | |
1440 subptr(tmpReg, rsp); | |
1441 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) ); | |
1442 movptr(Address(boxReg, 0), tmpReg); | |
1443 if (counters != NULL) { | |
1444 cond_inc32(Assembler::equal, | |
1445 ExternalAddress((address)counters->fast_path_entry_count_addr())); | |
1446 } | |
1447 jmpb(DONE_LABEL); | |
1448 | |
1449 bind(IsInflated); | |
1450 #ifndef _LP64 | |
1451 // The object is inflated. | |
1452 // | |
1453 // TODO-FIXME: eliminate the ugly use of manifest constants: | |
1454 // Use markOopDesc::monitor_value instead of "2". | |
1455 // Use markOopDesc::unused_mark() instead of "3". |
1456 // The tmpReg value is an objectMonitor reference ORed with | |
1457 // markOopDesc::monitor_value (2). We can either convert tmpReg to an | |
1458 // objectmonitor pointer by masking off the "2" bit or we can just | |
1459 // use tmpReg as an objectmonitor pointer but bias the objectmonitor | |
1460 // field offsets with "-2" to compensate for and annul the low-order tag bit. | |
1461 // | |
1462 // I use the latter as it avoids AGI stalls. | |
1463 // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]" | |
1464 // instead of "mov r, [tmpReg+OFFSETOF(Owner)]". | |
1465 // | |
1466 #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2) | |
1467 | |
1468 // boxReg refers to the on-stack BasicLock in the current frame. | |
1469 // We'd like to write: | |
1470 // set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices. | |
1471 // This is convenient but results in an ST-before-CAS penalty. The following CAS suffers |
1472 // additional latency as we have another ST in the store buffer that must drain. | |
1473 | |
1474 if (EmitSync & 8192) { | |
1475 movptr(Address(boxReg, 0), 3); // results in ST-before-CAS penalty | |
1476 get_thread (scrReg); | |
1477 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2] | |
1478 movptr(tmpReg, NULL_WORD); // consider: xor vs mov | |
1479 if (os::is_MP()) { | |
1480 lock(); | |
1481 } | |
1482 cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1483 } else | |
1484 if ((EmitSync & 128) == 0) { // avoid ST-before-CAS | |
1485 movptr(scrReg, boxReg); | |
1486 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2] | |
1487 | |
1488 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes | |
1489 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) { | |
1490 // prefetchw [eax + Offset(_owner)-2] | |
1491 prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1492 } | |
1493 | |
1494 if ((EmitSync & 64) == 0) { | |
1495 // Optimistic form: consider XORL tmpReg,tmpReg | |
1496 movptr(tmpReg, NULL_WORD); | |
1497 } else { | |
1498 // Can suffer RTS->RTO upgrades on shared or cold $ lines | |
1499 // Test-And-CAS instead of CAS | |
1500 movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); // rax = m->_owner |
1501 testptr(tmpReg, tmpReg); // Locked ? | |
1502 jccb (Assembler::notZero, DONE_LABEL); | |
1503 } | |
1504 | |
1505 // Appears unlocked - try to swing _owner from null to non-null. | |
1506 // Ideally, I'd manifest "Self" with get_thread and then attempt | |
1507 // to CAS the register containing Self into m->Owner. | |
1508 // But we don't have enough registers, so instead we can either try to CAS | |
1509 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds | |
1510 // we later store "Self" into m->Owner. Transiently storing a stack address | |
1511 // (rsp or the address of the box) into m->owner is harmless. | |
1512 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. | |
1513 if (os::is_MP()) { | |
1514 lock(); | |
1515 } | |
1516 cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1517 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3 | |
1518 jccb (Assembler::notZero, DONE_LABEL); | |
1519 get_thread (scrReg); // beware: clobbers ICCs | |
1520 movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg); | |
1521 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success | |
1522 | |
1523 // If the CAS fails we can either retry or pass control to the slow-path. | |
1524 // We use the latter tactic. | |
1525 // Pass the CAS result in the icc.ZFlag into DONE_LABEL | |
1526 // If the CAS was successful ... | |
1527 // Self has acquired the lock | |
1528 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. | |
1529 // Intentional fall-through into DONE_LABEL ... | |
1530 } else { | |
1531 movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark())); // results in ST-before-CAS penalty | |
1532 movptr(boxReg, tmpReg); | |
1533 | |
1534 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes | |
1535 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) { | |
1536 // prefetchw [eax + Offset(_owner)-2] | |
1537 prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1538 } | |
1539 | |
1540 if ((EmitSync & 64) == 0) { | |
1541 // Optimistic form | |
1542 xorptr (tmpReg, tmpReg); | |
1543 } else { | |
1544 // Can suffer RTS->RTO upgrades on shared or cold $ lines | |
1545 movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); // rax = m->_owner |
1546 testptr(tmpReg, tmpReg); // Locked ? | |
1547 jccb (Assembler::notZero, DONE_LABEL); | |
1548 } | |
1549 | |
1550 // Appears unlocked - try to swing _owner from null to non-null. | |
1551 // Use either "Self" (in scr) or rsp as thread identity in _owner. | |
1552 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. | |
1553 get_thread (scrReg); | |
1554 if (os::is_MP()) { | |
1555 lock(); | |
1556 } | |
1557 cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1558 | |
1559 // If the CAS fails we can either retry or pass control to the slow-path. | |
1560 // We use the latter tactic. | |
1561 // Pass the CAS result in the icc.ZFlag into DONE_LABEL | |
1562 // If the CAS was successful ... | |
1563 // Self has acquired the lock | |
1564 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. | |
1565 // Intentional fall-through into DONE_LABEL ... | |
1566 } | |
1567 #else // _LP64 | |
1568 // It's inflated | |
1569 | |
1570 // TODO: someday avoid the ST-before-CAS penalty by | |
1571 // relocating (deferring) the following ST. | |
1572 // We should also think about trying a CAS without having | |
1573 // fetched _owner. If the CAS is successful we may | |
1574 // avoid an RTO->RTS upgrade on the $line. | |
1575 | |
1576 // Without cast to int32_t a movptr will destroy r10 which is typically obj | |
1577 movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())); | |
1578 | |
1579 mov (boxReg, tmpReg); | |
1580 movptr (tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1581 testptr(tmpReg, tmpReg); | |
1582 jccb (Assembler::notZero, DONE_LABEL); | |
1583 | |
1584 // It's inflated and appears unlocked | |
1585 if (os::is_MP()) { | |
1586 lock(); | |
1587 } | |
1588 cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1589 // Intentional fall-through into DONE_LABEL ... | |
1590 | |
1591 #endif | |
1592 | |
1593 // DONE_LABEL is a hot target - we'd really like to place it at the | |
1594 // start of cache line by padding with NOPs. | |
1595 // See the AMD and Intel software optimization manuals for the | |
1596 // most efficient "long" NOP encodings. | |
1597 // Unfortunately none of our alignment mechanisms suffice. | |
1598 bind(DONE_LABEL); | |
1599 | |
1600 // At DONE_LABEL the icc ZFlag is set as follows ... | |
1601 // Fast_Unlock uses the same protocol. | |
1602 // ZFlag == 1 -> Success | |
1603 // ZFlag == 0 -> Failure - force control through the slow-path | |
1604 } | |
1605 } | |
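
The inflated paths in fast_lock lean on the "-2" offset skew described in the 32-bit branch above: tmpReg still carries the markOopDesc::monitor_value tag (2), and each ObjectMonitor field access folds the tag into the displacement instead of masking it off. A standalone arithmetic check (pointer and offset values made up):

    #include <cassert>
    #include <cstdint>

    int main() {
        uintptr_t monitor = 0x1000;       // hypothetical aligned ObjectMonitor*
        uintptr_t tagged  = monitor | 2;  // value in tmpReg: ptr | monitor_value
        int owner_offset  = 16;           // hypothetical _owner field offset
        // [tagged + offset - 2] addresses the same word as [monitor + offset]
        assert(tagged + owner_offset - 2 == monitor + owner_offset);
        return 0;
    }
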
1606 | |
1607 // obj: object to unlock | |
1608 // box: box address (displaced header location), killed. Must be EAX. | |
1609 // tmp: killed, cannot be obj nor box. | |
1610 // | |
1611 // Some commentary on balanced locking: | |
1612 // | |
1613 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites. | |
1614 // Methods that don't have provably balanced locking are forced to run in the | |
1615 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. | |
1616 // The interpreter provides two properties: | |
1617 // I1: At return-time the interpreter automatically and quietly unlocks any | |
1618 // objects acquired in the current activation (frame). Recall that the |
1619 // interpreter maintains an on-stack list of locks currently held by | |
1620 // a frame. | |
1621 // I2: If a method attempts to unlock an object that is not held by |
1622 // the frame, the interpreter throws IMSX. |
1623 // | |
1624 // Let's say A(), which has provably balanced locking, acquires O and then calls B(). |
1625 // B() doesn't have provably balanced locking so it runs in the interpreter. | |
1626 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O | |
1627 // is still locked by A(). | |
1628 // | |
1629 // The only other source of unbalanced locking would be JNI. The "Java Native Interface: | |
1630 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter | |
1631 // should not be unlocked by "normal" java-level locking and vice-versa. The specification | |
1632 // doesn't specify what will occur if a program engages in such mixed-mode locking, however. | |
1633 | |
1634 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { | |
1635 guarantee (objReg != boxReg, ""); | |
1636 guarantee (objReg != tmpReg, ""); | |
1637 guarantee (boxReg != tmpReg, ""); | |
1638 guarantee (boxReg == rax, ""); | |
1639 | |
1640 if (EmitSync & 4) { | |
1641 // Disable - inhibit all inlining. Force control through the slow-path | |
1642 cmpptr (rsp, 0); | |
1643 } else | |
1644 if (EmitSync & 8) { | |
1645 Label DONE_LABEL; | |
1646 if (UseBiasedLocking) { | |
1647 biased_locking_exit(objReg, tmpReg, DONE_LABEL); | |
1648 } | |
1649 // Classic stack-locking code ... | |
1650 // Check whether the displaced header is 0 | |
1651 // (=> recursive unlock) |
1652 movptr(tmpReg, Address(boxReg, 0)); | |
1653 testptr(tmpReg, tmpReg); | |
1654 jccb(Assembler::zero, DONE_LABEL); | |
1655 // If not recursive lock, reset the header to displaced header | |
1656 if (os::is_MP()) { | |
1657 lock(); | |
1658 } | |
1659 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box | |
1660 bind(DONE_LABEL); | |
1661 } else { | |
1662 Label DONE_LABEL, Stacked, CheckSucc; | |
1663 | |
1664 // Critically, the biased locking test must have precedence over | |
1665 // and appear before the (box->dhw == 0) recursive stack-lock test. | |
1666 if (UseBiasedLocking && !UseOptoBiasInlining) { | |
1667 biased_locking_exit(objReg, tmpReg, DONE_LABEL); | |
1668 } | |
1669 | |
1670 cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header | |
1671 movptr(tmpReg, Address(objReg, 0)); // Examine the object's markword | |
1672 jccb (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock | |
1673 | |
1674 testptr(tmpReg, 0x02); // Inflated? | |
1675 jccb (Assembler::zero, Stacked); | |
1676 | |
1677 // It's inflated. | |
1678 // Despite our balanced locking property we still check that m->_owner == Self | |
1679 // as java routines or native JNI code called by this thread might | |
1680 // have released the lock. | |
1681 // Refer to the comments in synchronizer.cpp for how we might encode extra | |
1682 // state in _succ so we can avoid fetching EntryList|cxq. | |
1683 // | |
1684 // I'd like to add more cases in fast_lock() and fast_unlock() -- | |
1685 // such as recursive enter and exit -- but we have to be wary of | |
1686 // I$ bloat, T$ effects and BP$ effects. | |
1687 // | |
1688 // If there's no contention try a 1-0 exit. That is, exit without | |
1689 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how | |
1690 // we detect and recover from the race that the 1-0 exit admits. | |
1691 // | |
1692 // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier | |
1693 // before it STs null into _owner, releasing the lock. Updates | |
1694 // to data protected by the critical section must be visible before | |
1695 // we drop the lock (and thus before any other thread could acquire | |
1696 // the lock and observe the fields protected by the lock). | |
1697 // IA32's memory-model is SPO, so STs are ordered with respect to | |
1698 // each other and there's no need for an explicit barrier (fence). | |
1699 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. | |
1700 #ifndef _LP64 | |
1701 get_thread (boxReg); | |
1702 if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) { | |
1703 // prefetchw [ebx + Offset(_owner)-2] | |
1704 prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1705 } | |
1706 | |
1707 // Note that we could employ various encoding schemes to reduce | |
1708 // the number of loads below (currently 4) to just 2 or 3. | |
1709 // Refer to the comments in synchronizer.cpp. | |
1710 // In practice the chain of fetches doesn't seem to impact performance, however. | |
1711 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) { | |
1712 // Attempt to reduce branch density - AMD's branch predictor. | |
1713 xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1714 orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)); | |
1715 orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)); | |
1716 orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)); | |
1717 jccb (Assembler::notZero, DONE_LABEL); | |
1718 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD); | |
1719 jmpb (DONE_LABEL); | |
1720 } else { | |
1721 xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1722 orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)); | |
1723 jccb (Assembler::notZero, DONE_LABEL); | |
1724 movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)); | |
1725 orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)); | |
1726 jccb (Assembler::notZero, CheckSucc); | |
1727 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD); | |
1728 jmpb (DONE_LABEL); | |
1729 } | |
1730 | |
1731 // The following code fragment (EmitSync & 65536) improves the performance of |
1732 // contended applications and contended synchronization microbenchmarks. | |
1733 // Unfortunately the emission of the code - even though not executed - causes regressions | |
1734 // in scimark and jetstream, evidently because of $ effects. Replacing the code | |
1735 // with an equal number of never-executed NOPs results in the same regression. | |
1736 // We leave it off by default. | |
1737 | |
1738 if ((EmitSync & 65536) != 0) { | |
1739 Label LSuccess, LGoSlowPath ; | |
1740 | |
1741 bind (CheckSucc); | |
1742 | |
1743 // Optional pre-test ... it's safe to elide this | |
1744 if ((EmitSync & 16) == 0) { | |
1745 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD); | |
1746 jccb (Assembler::zero, LGoSlowPath); | |
1747 } | |
1748 | |
1749 // We have a classic Dekker-style idiom: | |
1750 // ST m->_owner = 0 ; MEMBAR; LD m->_succ | |
1751 // There are a number of ways to implement the barrier: | |
1752 // (1) lock:andl &m->_owner, 0 | |
1753 // is fast, but masm doesn't currently support the "ANDL M,IMM32" form. |
1754 // LOCK: ANDL [ebx+Offset(_Owner)-2], 0 | |
1755 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8 | |
1756 // (2) If supported, an explicit MFENCE is appealing. | |
1757 // In older IA32 processors MFENCE is slower than lock:add or xchg | |
1758 // particularly if the write-buffer is full, as might be the case |
1759 // if stores closely precede the fence or fence-equivalent instruction. |
1760 // In more modern implementations MFENCE appears faster, however. | |
1761 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack | |
1762 // The $lines underlying the top-of-stack should be in M-state. | |
1763 // The locked add instruction is serializing, of course. | |
1764 // (4) Use xchg, which is serializing | |
1765 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works | |
1766 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0. | |
1767 // The integer condition codes will tell us if succ was 0. | |
1768 // Since _succ and _owner should reside in the same $line and | |
1769 // we just stored into _owner, it's likely that the $line | |
1770 // remains in M-state for the lock:orl. | |
1771 // | |
1772 // We currently use (3), although it's likely that switching to (2) | |
1773 // is correct for the future. | |
1774 | |
1775 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD); | |
1776 if (os::is_MP()) { | |
1777 if (VM_Version::supports_sse2() && 1 == FenceInstruction) { | |
1778 mfence(); | |
1779 } else { | |
1780 lock (); addptr(Address(rsp, 0), 0); | |
1781 } | |
1782 } | |
1783 // Ratify _succ remains non-null | |
1784 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0); | |
1785 jccb (Assembler::notZero, LSuccess); | |
1786 | |
1787 xorptr(boxReg, boxReg); // box is really EAX | |
1788 if (os::is_MP()) { lock(); } | |
1789 cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1790 jccb (Assembler::notEqual, LSuccess); | |
1791 // Since we're low on registers we installed rsp as a placeholder in _owner. |
1792 // Now install Self over rsp. This is safe as we're transitioning from |
1793 // non-null to non-null |
1794 get_thread (boxReg); | |
1795 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg); | |
1796 // Intentional fall-through into LGoSlowPath ... | |
1797 | |
1798 bind (LGoSlowPath); | |
1799 orptr(boxReg, 1); // set ICC.ZF=0 to indicate failure | |
1800 jmpb (DONE_LABEL); | |
1801 | |
1802 bind (LSuccess); | |
1803 xorptr(boxReg, boxReg); // set ICC.ZF=1 to indicate success | |
1804 jmpb (DONE_LABEL); | |
1805 } | |
1806 | |
1807 bind (Stacked); | |
1808 // It's not inflated and it's not recursively stack-locked and it's not biased. | |
1809 // It must be stack-locked. | |
1810 // Try to reset the header to displaced header. | |
1811 // The "box" value on the stack is stable, so we can reload | |
1812 // and be assured we observe the same value as above. | |
1813 movptr(tmpReg, Address(boxReg, 0)); | |
1814 if (os::is_MP()) { | |
1815 lock(); | |
1816 } | |
1817 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box | |
1818 // Intentional fall-through into DONE_LABEL |
1819 | |
1820 // DONE_LABEL is a hot target - we'd really like to place it at the | |
1821 // start of cache line by padding with NOPs. | |
1822 // See the AMD and Intel software optimization manuals for the | |
1823 // most efficient "long" NOP encodings. | |
1824 // Unfortunately none of our alignment mechanisms suffice. | |
1825 if ((EmitSync & 65536) == 0) { | |
1826 bind (CheckSucc); | |
1827 } | |
1828 #else // _LP64 | |
1829 // It's inflated | |
1830 movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1831 xorptr(boxReg, r15_thread); | |
1832 orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)); | |
1833 jccb (Assembler::notZero, DONE_LABEL); | |
1834 movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)); | |
1835 orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)); | |
1836 jccb (Assembler::notZero, CheckSucc); | |
1837 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD); | |
1838 jmpb (DONE_LABEL); | |
1839 | |
1840 if ((EmitSync & 65536) == 0) { | |
1841 Label LSuccess, LGoSlowPath ; | |
1842 bind (CheckSucc); | |
1843 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD); | |
1844 jccb (Assembler::zero, LGoSlowPath); | |
1845 | |
1846 // I'd much rather use lock:andl m->_owner, 0 as it's faster than |
1847 // the explicit ST;MEMBAR combination, but masm doesn't currently support |
1848 // "ANDQ M,IMM". Don't use MFENCE here. lock:add to TOS, xchg, etc. |
1849 // are all faster when the write buffer is populated. | |
1850 movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD); | |
1851 if (os::is_MP()) { | |
1852 lock (); addl (Address(rsp, 0), 0); | |
1853 } | |
1854 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD); | |
1855 jccb (Assembler::notZero, LSuccess); | |
1856 | |
1857 movptr (boxReg, (int32_t)NULL_WORD); // box is really EAX | |
1858 if (os::is_MP()) { lock(); } | |
1859 cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); | |
1860 jccb (Assembler::notEqual, LSuccess); | |
1861 // Intentional fall-through into slow-path | |
1862 | |
1863 bind (LGoSlowPath); | |
1864 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure | |
1865 jmpb (DONE_LABEL); | |
1866 | |
1867 bind (LSuccess); | |
1868 testl (boxReg, 0); // set ICC.ZF=1 to indicate success | |
1869 jmpb (DONE_LABEL); | |
1870 } | |
1871 | |
1872 bind (Stacked); | |
1873 movptr(tmpReg, Address (boxReg, 0)); // re-fetch | |
1874 if (os::is_MP()) { lock(); } | |
1875 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box | |
1876 | |
1877 if (EmitSync & 65536) { | |
1878 bind (CheckSucc); | |
1879 } | |
1880 #endif | |
1881 bind(DONE_LABEL); | |
1882 // Avoid branch to branch on AMD processors | |
1883 if (EmitSync & 32768) { | |
1884 nop(); | |
1885 } | |
1886 } | |
1887 } | |
1888 #endif // COMPILER2 | |
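
For reference, the 1-0 exit that fast_unlock performs on the inflated path, rendered as a conceptual C++11-atomics sketch. Monitor, one_zero_exit and the two-field layout are simplified stand-ins for ObjectMonitor, not the real structure:

    #include <atomic>

    struct Monitor {
        std::atomic<void*> owner;
        std::atomic<void*> succ;   // heir presumptive, per synchronizer.cpp
    };

    // Returns true if the unlock completed on the fast path.
    bool one_zero_exit(Monitor& m, void* self) {
        m.owner.store(nullptr, std::memory_order_release);   // drop the lock
        std::atomic_thread_fence(std::memory_order_seq_cst); // ST; MEMBAR; LD
        if (m.succ.load(std::memory_order_relaxed) != nullptr)
            return true;             // a successor exists and will take over
        void* expected = nullptr;
        if (!m.owner.compare_exchange_strong(expected, self))
            return true;             // another thread already acquired it
        return false;                // we reacquired: slow path must wake a thread
    }
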
1410 | 1889 |
1411 void MacroAssembler::c2bool(Register x) { | 1890 void MacroAssembler::c2bool(Register x) { |
1412 // implements x == 0 ? 0 : 1 | 1891 // implements x == 0 ? 0 : 1 |
1413 // note: must only look at least-significant byte of x | 1892 // note: must only look at least-significant byte of x |
1414 // since C-style booleans are stored in one byte | 1893 // since C-style booleans are stored in one byte |