Mercurial > hg > truffle
annotate src/share/vm/utilities/utf8.cpp @ 6862:8a5ea0a9ccc4
7127708: G1: change task num types from int to uint in concurrent mark
Summary: Change the type of various task num fields, parameters etc to unsigned and rename them to be more consistent with the other collectors. Code changes were also reviewed by Vitaly Davidovich.
Reviewed-by: johnc
Contributed-by: Kaushik Srenevasan <kaushik@twitter.com>
author | johnc |
---|---|
date | Sat, 06 Oct 2012 01:17:44 -0700 |
parents | 1d1603768966 |
children | bd7a7ce2e264 |
rev | line source |
---|---|
0 | 1 /* |
2426
1d1603768966
7010070: Update all 2010 Oracle-changed OpenJDK files to have the proper copyright dates - second pass
trims
parents:
2177
diff
changeset
|
2 * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved. |
0 | 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 * | |
5 * This code is free software; you can redistribute it and/or modify it | |
6 * under the terms of the GNU General Public License version 2 only, as | |
7 * published by the Free Software Foundation. | |
8 * | |
9 * This code is distributed in the hope that it will be useful, but WITHOUT | |
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
12 * version 2 for more details (a copy is included in the LICENSE file that | |
13 * accompanied this code). | |
14 * | |
15 * You should have received a copy of the GNU General Public License version | |
16 * 2 along with this work; if not, write to the Free Software Foundation, | |
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. | |
18 * | |
1552
c18cbe5936b8
6941466: Oracle rebranding changes for Hotspot repositories
trims
parents:
0
diff
changeset
|
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
c18cbe5936b8
6941466: Oracle rebranding changes for Hotspot repositories
trims
parents:
0
diff
changeset
|
20 * or visit www.oracle.com if you need additional information or have any |
c18cbe5936b8
6941466: Oracle rebranding changes for Hotspot repositories
trims
parents:
0
diff
changeset
|
21 * questions. |
0 | 22 * |
23 */ | |
24 | |
1972 | 25 #include "precompiled.hpp" |
26 #include "utilities/utf8.hpp" | |
0 | 27 |
28 // Assume the utf8 string is in legal form and has been | |
29 // checked in the class file parser/format checker. | |
30 char* UTF8::next(const char* str, jchar* value) { | |
31 unsigned const char *ptr = (const unsigned char *)str; | |
32 unsigned char ch, ch2, ch3; | |
33 int length = -1; /* bad length */ | |
34 jchar result; | |
35 switch ((ch = ptr[0]) >> 4) { | |
36 default: | |
37 result = ch; | |
38 length = 1; | |
39 break; | |
40 | |
41 case 0x8: case 0x9: case 0xA: case 0xB: case 0xF: | |
42 /* Shouldn't happen. */ | |
43 break; | |
44 | |
45 case 0xC: case 0xD: | |
46 /* 110xxxxx 10xxxxxx */ | |
47 if (((ch2 = ptr[1]) & 0xC0) == 0x80) { | |
48 unsigned char high_five = ch & 0x1F; | |
49 unsigned char low_six = ch2 & 0x3F; | |
50 result = (high_five << 6) + low_six; | |
51 length = 2; | |
52 break; | |
53 } | |
54 break; | |
55 | |
56 case 0xE: | |
57 /* 1110xxxx 10xxxxxx 10xxxxxx */ | |
58 if (((ch2 = ptr[1]) & 0xC0) == 0x80) { | |
59 if (((ch3 = ptr[2]) & 0xC0) == 0x80) { | |
60 unsigned char high_four = ch & 0x0f; | |
61 unsigned char mid_six = ch2 & 0x3f; | |
62 unsigned char low_six = ch3 & 0x3f; | |
63 result = (((high_four << 6) + mid_six) << 6) + low_six; | |
64 length = 3; | |
65 } | |
66 } | |
67 break; | |
68 } /* end of switch */ | |
69 | |
70 if (length <= 0) { | |
71 *value = ptr[0]; /* default bad result; */ | |
72 return (char*)(ptr + 1); // make progress somehow | |
73 } | |
74 | |
75 *value = result; | |
76 | |
77 // The assert is correct but the .class file is wrong | |
78 // assert(UNICODE::utf8_size(result) == length, "checking reverse computation"); | |
79 return (char *)(ptr + length); | |
80 } | |
81 | |
82 char* UTF8::next_character(const char* str, jint* value) { | |
83 unsigned const char *ptr = (const unsigned char *)str; | |
84 /* See if it's legal supplementary character: | |
85 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx */ | |
86 if (is_supplementary_character(ptr)) { | |
87 *value = get_supplementary_character(ptr); | |
88 return (char *)(ptr + 6); | |
89 } | |
90 jchar result; | |
91 char* next_ch = next(str, &result); | |
92 *value = result; | |
93 return next_ch; | |
94 } | |
95 | |
96 // Count bytes of the form 10xxxxxx and deduct this count | |
97 // from the total byte count. The utf8 string must be in | |
98 // legal form which has been verified in the format checker. | |
99 int UTF8::unicode_length(const char* str, int len) { | |
100 int num_chars = len; | |
101 for (int i = 0; i < len; i++) { | |
102 if ((str[i] & 0xC0) == 0x80) { | |
103 --num_chars; | |
104 } | |
105 } | |
106 return num_chars; | |
107 } | |
108 | |
109 // Count bytes of the utf8 string except those in form | |
110 // 10xxxxxx which only appear in multibyte characters. | |
111 // The utf8 string must be in legal form and has been | |
112 // verified in the format checker. | |
113 int UTF8::unicode_length(const char* str) { | |
114 int num_chars = 0; | |
115 for (const char* p = str; *p; p++) { | |
116 if (((*p) & 0xC0) != 0x80) { | |
117 num_chars++; | |
118 } | |
119 } | |
120 return num_chars; | |
121 } | |
122 | |
123 // Writes a jchar a utf8 and returns the end | |
124 static u_char* utf8_write(u_char* base, jchar ch) { | |
125 if ((ch != 0) && (ch <=0x7f)) { | |
126 base[0] = (u_char) ch; | |
127 return base + 1; | |
128 } | |
129 | |
130 if (ch <= 0x7FF) { | |
131 /* 11 bits or less. */ | |
132 unsigned char high_five = ch >> 6; | |
133 unsigned char low_six = ch & 0x3F; | |
134 base[0] = high_five | 0xC0; /* 110xxxxx */ | |
135 base[1] = low_six | 0x80; /* 10xxxxxx */ | |
136 return base + 2; | |
137 } | |
138 /* possibly full 16 bits. */ | |
139 char high_four = ch >> 12; | |
140 char mid_six = (ch >> 6) & 0x3F; | |
141 char low_six = ch & 0x3f; | |
142 base[0] = high_four | 0xE0; /* 1110xxxx */ | |
143 base[1] = mid_six | 0x80; /* 10xxxxxx */ | |
144 base[2] = low_six | 0x80; /* 10xxxxxx */ | |
145 return base + 3; | |
146 } | |
147 | |
148 void UTF8::convert_to_unicode(const char* utf8_str, jchar* unicode_str, int unicode_length) { | |
149 unsigned char ch; | |
150 const char *ptr = (const char *)utf8_str; | |
151 int index = 0; | |
152 | |
153 /* ASCII case loop optimization */ | |
154 for (; index < unicode_length; index++) { | |
155 if((ch = ptr[0]) > 0x7F) { break; } | |
156 unicode_str[index] = ch; | |
157 ptr = (const char *)(ptr + 1); | |
158 } | |
159 | |
160 for (; index < unicode_length; index++) { | |
161 ptr = UTF8::next(ptr, &unicode_str[index]); | |
162 } | |
163 } | |
164 | |
165 // Returns NULL if 'c' is not found. This only works as long | |
166 // as 'c' is an ASCII character | |
2177
3582bf76420e
6990754: Use native memory and reference counting to implement SymbolTable
coleenp
parents:
1972
diff
changeset
|
167 const jbyte* UTF8::strrchr(const jbyte* base, int length, jbyte c) { |
0 | 168 assert(length >= 0, "sanity check"); |
169 assert(c >= 0, "does not work for non-ASCII characters"); | |
170 // Skip backwards in string until 'c' is found or end is reached | |
171 while(--length >= 0 && base[length] != c); | |
172 return (length < 0) ? NULL : &base[length]; | |
173 } | |
174 | |
2177
3582bf76420e
6990754: Use native memory and reference counting to implement SymbolTable
coleenp
parents:
1972
diff
changeset
|
175 bool UTF8::equal(const jbyte* base1, int length1, const jbyte* base2, int length2) { |
0 | 176 // Length must be the same |
177 if (length1 != length2) return false; | |
178 for (int i = 0; i < length1; i++) { | |
179 if (base1[i] != base2[i]) return false; | |
180 } | |
181 return true; | |
182 } | |
183 | |
184 bool UTF8::is_supplementary_character(const unsigned char* str) { | |
185 return ((str[0] & 0xFF) == 0xED) && ((str[1] & 0xF0) == 0xA0) && ((str[2] & 0xC0) == 0x80) | |
186 && ((str[3] & 0xFF) == 0xED) && ((str[4] & 0xF0) == 0xB0) && ((str[5] & 0xC0) == 0x80); | |
187 } | |
188 | |
189 jint UTF8::get_supplementary_character(const unsigned char* str) { | |
190 return 0x10000 + ((str[1] & 0x0f) << 16) + ((str[2] & 0x3f) << 10) | |
191 + ((str[4] & 0x0f) << 6) + (str[5] & 0x3f); | |
192 } | |
193 | |
194 | |
195 //------------------------------------------------------------------------------------- | |
196 | |
197 | |
198 int UNICODE::utf8_size(jchar c) { | |
199 if ((0x0001 <= c) && (c <= 0x007F)) return 1; | |
200 if (c <= 0x07FF) return 2; | |
201 return 3; | |
202 } | |
203 | |
204 int UNICODE::utf8_length(jchar* base, int length) { | |
205 int result = 0; | |
206 for (int index = 0; index < length; index++) { | |
207 jchar c = base[index]; | |
208 if ((0x0001 <= c) && (c <= 0x007F)) result += 1; | |
209 else if (c <= 0x07FF) result += 2; | |
210 else result += 3; | |
211 } | |
212 return result; | |
213 } | |
214 | |
215 char* UNICODE::as_utf8(jchar* base, int length) { | |
216 int utf8_len = utf8_length(base, length); | |
217 u_char* result = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1); | |
218 u_char* p = result; | |
219 for (int index = 0; index < length; index++) { | |
220 p = utf8_write(p, base[index]); | |
221 } | |
222 *p = '\0'; | |
223 assert(p == &result[utf8_len], "length prediction must be correct"); | |
224 return (char*) result; | |
225 } | |
226 | |
227 char* UNICODE::as_utf8(jchar* base, int length, char* buf, int buflen) { | |
228 u_char* p = (u_char*)buf; | |
229 u_char* end = (u_char*)buf + buflen; | |
230 for (int index = 0; index < length; index++) { | |
231 jchar c = base[index]; | |
232 if (p + utf8_size(c) >= end) break; // string is truncated | |
233 p = utf8_write(p, base[index]); | |
234 } | |
235 *p = '\0'; | |
236 return buf; | |
237 } | |
238 | |
239 void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) { | |
240 for(int index = 0; index < length; index++) { | |
241 utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]); | |
242 } | |
243 *utf8_buffer = '\0'; | |
244 } |