truffle: src/share/vm/utilities/utf8.cpp annotate

annotate src/share/vm/utilities/utf8.cpp @ 6862:8a5ea0a9ccc4

7127708: G1: change task num types from int to uint in concurrent mark Summary: Change the type of various task num fields, parameters etc to unsigned and rename them to be more consistent with the other collectors. Code changes were also reviewed by Vitaly Davidovich. Reviewed-by: johnc Contributed-by: Kaushik Srenevasan <kaushik@twitter.com>

author	johnc
date	Sat, 06 Oct 2012 01:17:44 -0700
parents	1d1603768966
children	bd7a7ce2e264

rev	line source
0 a61af66fc99e Initial load duke parents: diff changeset	1 /*
2426 1d1603768966 7010070: Update all 2010 Oracle-changed OpenJDK files to have the proper copyright dates - second pass trims parents: 2177 diff changeset	2 * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
0 a61af66fc99e Initial load duke parents: diff changeset	3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
a61af66fc99e Initial load duke parents: diff changeset	4 *
a61af66fc99e Initial load duke parents: diff changeset	5 * This code is free software; you can redistribute it and/or modify it
a61af66fc99e Initial load duke parents: diff changeset	6 * under the terms of the GNU General Public License version 2 only, as
a61af66fc99e Initial load duke parents: diff changeset	7 * published by the Free Software Foundation.
a61af66fc99e Initial load duke parents: diff changeset	8 *
a61af66fc99e Initial load duke parents: diff changeset	9 * This code is distributed in the hope that it will be useful, but WITHOUT
a61af66fc99e Initial load duke parents: diff changeset	10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
a61af66fc99e Initial load duke parents: diff changeset	11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
a61af66fc99e Initial load duke parents: diff changeset	12 * version 2 for more details (a copy is included in the LICENSE file that
a61af66fc99e Initial load duke parents: diff changeset	13 * accompanied this code).
a61af66fc99e Initial load duke parents: diff changeset	14 *
a61af66fc99e Initial load duke parents: diff changeset	15 * You should have received a copy of the GNU General Public License version
a61af66fc99e Initial load duke parents: diff changeset	16 * 2 along with this work; if not, write to the Free Software Foundation,
a61af66fc99e Initial load duke parents: diff changeset	17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
a61af66fc99e Initial load duke parents: diff changeset	18 *
1552 c18cbe5936b8 6941466: Oracle rebranding changes for Hotspot repositories trims parents: 0 diff changeset	19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
c18cbe5936b8 6941466: Oracle rebranding changes for Hotspot repositories trims parents: 0 diff changeset	20 * or visit www.oracle.com if you need additional information or have any
c18cbe5936b8 6941466: Oracle rebranding changes for Hotspot repositories trims parents: 0 diff changeset	21 * questions.
0 a61af66fc99e Initial load duke parents: diff changeset	22 *
a61af66fc99e Initial load duke parents: diff changeset	23 */
a61af66fc99e Initial load duke parents: diff changeset	24
1972 f95d63e2154a 6989984: Use standard include model for Hospot stefank parents: 1552 diff changeset	25 #include "precompiled.hpp"
f95d63e2154a 6989984: Use standard include model for Hospot stefank parents: 1552 diff changeset	26 #include "utilities/utf8.hpp"
0 a61af66fc99e Initial load duke parents: diff changeset	27
a61af66fc99e Initial load duke parents: diff changeset	28 // Assume the utf8 string is in legal form and has been
a61af66fc99e Initial load duke parents: diff changeset	29 // checked in the class file parser/format checker.
a61af66fc99e Initial load duke parents: diff changeset	30 char* UTF8::next(const char* str, jchar* value) {
a61af66fc99e Initial load duke parents: diff changeset	31 unsigned const char ptr = (const unsigned char )str;
a61af66fc99e Initial load duke parents: diff changeset	32 unsigned char ch, ch2, ch3;
a61af66fc99e Initial load duke parents: diff changeset	33 int length = -1; /* bad length */
a61af66fc99e Initial load duke parents: diff changeset	34 jchar result;
a61af66fc99e Initial load duke parents: diff changeset	35 switch ((ch = ptr[0]) >> 4) {
a61af66fc99e Initial load duke parents: diff changeset	36 default:
a61af66fc99e Initial load duke parents: diff changeset	37 result = ch;
a61af66fc99e Initial load duke parents: diff changeset	38 length = 1;
a61af66fc99e Initial load duke parents: diff changeset	39 break;
a61af66fc99e Initial load duke parents: diff changeset	40
a61af66fc99e Initial load duke parents: diff changeset	41 case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:
a61af66fc99e Initial load duke parents: diff changeset	42 /* Shouldn't happen. */
a61af66fc99e Initial load duke parents: diff changeset	43 break;
a61af66fc99e Initial load duke parents: diff changeset	44
a61af66fc99e Initial load duke parents: diff changeset	45 case 0xC: case 0xD:
a61af66fc99e Initial load duke parents: diff changeset	46 /* 110xxxxx 10xxxxxx */
a61af66fc99e Initial load duke parents: diff changeset	47 if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
a61af66fc99e Initial load duke parents: diff changeset	48 unsigned char high_five = ch & 0x1F;
a61af66fc99e Initial load duke parents: diff changeset	49 unsigned char low_six = ch2 & 0x3F;
a61af66fc99e Initial load duke parents: diff changeset	50 result = (high_five << 6) + low_six;
a61af66fc99e Initial load duke parents: diff changeset	51 length = 2;
a61af66fc99e Initial load duke parents: diff changeset	52 break;
a61af66fc99e Initial load duke parents: diff changeset	53 }
a61af66fc99e Initial load duke parents: diff changeset	54 break;
a61af66fc99e Initial load duke parents: diff changeset	55
a61af66fc99e Initial load duke parents: diff changeset	56 case 0xE:
a61af66fc99e Initial load duke parents: diff changeset	57 /* 1110xxxx 10xxxxxx 10xxxxxx */
a61af66fc99e Initial load duke parents: diff changeset	58 if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
a61af66fc99e Initial load duke parents: diff changeset	59 if (((ch3 = ptr[2]) & 0xC0) == 0x80) {
a61af66fc99e Initial load duke parents: diff changeset	60 unsigned char high_four = ch & 0x0f;
a61af66fc99e Initial load duke parents: diff changeset	61 unsigned char mid_six = ch2 & 0x3f;
a61af66fc99e Initial load duke parents: diff changeset	62 unsigned char low_six = ch3 & 0x3f;
a61af66fc99e Initial load duke parents: diff changeset	63 result = (((high_four << 6) + mid_six) << 6) + low_six;
a61af66fc99e Initial load duke parents: diff changeset	64 length = 3;
a61af66fc99e Initial load duke parents: diff changeset	65 }
a61af66fc99e Initial load duke parents: diff changeset	66 }
a61af66fc99e Initial load duke parents: diff changeset	67 break;
a61af66fc99e Initial load duke parents: diff changeset	68 } /* end of switch */
a61af66fc99e Initial load duke parents: diff changeset	69
a61af66fc99e Initial load duke parents: diff changeset	70 if (length <= 0) {
a61af66fc99e Initial load duke parents: diff changeset	71 value = ptr[0]; / default bad result; */
a61af66fc99e Initial load duke parents: diff changeset	72 return (char*)(ptr + 1); // make progress somehow
a61af66fc99e Initial load duke parents: diff changeset	73 }
a61af66fc99e Initial load duke parents: diff changeset	74
a61af66fc99e Initial load duke parents: diff changeset	75 *value = result;
a61af66fc99e Initial load duke parents: diff changeset	76
a61af66fc99e Initial load duke parents: diff changeset	77 // The assert is correct but the .class file is wrong
a61af66fc99e Initial load duke parents: diff changeset	78 // assert(UNICODE::utf8_size(result) == length, "checking reverse computation");
a61af66fc99e Initial load duke parents: diff changeset	79 return (char *)(ptr + length);
a61af66fc99e Initial load duke parents: diff changeset	80 }
a61af66fc99e Initial load duke parents: diff changeset	81
a61af66fc99e Initial load duke parents: diff changeset	82 char* UTF8::next_character(const char* str, jint* value) {
a61af66fc99e Initial load duke parents: diff changeset	83 unsigned const char ptr = (const unsigned char )str;
a61af66fc99e Initial load duke parents: diff changeset	84 /* See if it's legal supplementary character:
a61af66fc99e Initial load duke parents: diff changeset	85 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx */
a61af66fc99e Initial load duke parents: diff changeset	86 if (is_supplementary_character(ptr)) {
a61af66fc99e Initial load duke parents: diff changeset	87 *value = get_supplementary_character(ptr);
a61af66fc99e Initial load duke parents: diff changeset	88 return (char *)(ptr + 6);
a61af66fc99e Initial load duke parents: diff changeset	89 }
a61af66fc99e Initial load duke parents: diff changeset	90 jchar result;
a61af66fc99e Initial load duke parents: diff changeset	91 char* next_ch = next(str, &result);
a61af66fc99e Initial load duke parents: diff changeset	92 *value = result;
a61af66fc99e Initial load duke parents: diff changeset	93 return next_ch;
a61af66fc99e Initial load duke parents: diff changeset	94 }
a61af66fc99e Initial load duke parents: diff changeset	95
a61af66fc99e Initial load duke parents: diff changeset	96 // Count bytes of the form 10xxxxxx and deduct this count
a61af66fc99e Initial load duke parents: diff changeset	97 // from the total byte count. The utf8 string must be in
a61af66fc99e Initial load duke parents: diff changeset	98 // legal form which has been verified in the format checker.
a61af66fc99e Initial load duke parents: diff changeset	99 int UTF8::unicode_length(const char* str, int len) {
a61af66fc99e Initial load duke parents: diff changeset	100 int num_chars = len;
a61af66fc99e Initial load duke parents: diff changeset	101 for (int i = 0; i < len; i++) {
a61af66fc99e Initial load duke parents: diff changeset	102 if ((str[i] & 0xC0) == 0x80) {
a61af66fc99e Initial load duke parents: diff changeset	103 --num_chars;
a61af66fc99e Initial load duke parents: diff changeset	104 }
a61af66fc99e Initial load duke parents: diff changeset	105 }
a61af66fc99e Initial load duke parents: diff changeset	106 return num_chars;
a61af66fc99e Initial load duke parents: diff changeset	107 }
a61af66fc99e Initial load duke parents: diff changeset	108
a61af66fc99e Initial load duke parents: diff changeset	109 // Count bytes of the utf8 string except those in form
a61af66fc99e Initial load duke parents: diff changeset	110 // 10xxxxxx which only appear in multibyte characters.
a61af66fc99e Initial load duke parents: diff changeset	111 // The utf8 string must be in legal form and has been
a61af66fc99e Initial load duke parents: diff changeset	112 // verified in the format checker.
a61af66fc99e Initial load duke parents: diff changeset	113 int UTF8::unicode_length(const char* str) {
a61af66fc99e Initial load duke parents: diff changeset	114 int num_chars = 0;
a61af66fc99e Initial load duke parents: diff changeset	115 for (const char* p = str; *p; p++) {
a61af66fc99e Initial load duke parents: diff changeset	116 if (((*p) & 0xC0) != 0x80) {
a61af66fc99e Initial load duke parents: diff changeset	117 num_chars++;
a61af66fc99e Initial load duke parents: diff changeset	118 }
a61af66fc99e Initial load duke parents: diff changeset	119 }
a61af66fc99e Initial load duke parents: diff changeset	120 return num_chars;
a61af66fc99e Initial load duke parents: diff changeset	121 }
a61af66fc99e Initial load duke parents: diff changeset	122
a61af66fc99e Initial load duke parents: diff changeset	123 // Writes a jchar a utf8 and returns the end
a61af66fc99e Initial load duke parents: diff changeset	124 static u_char* utf8_write(u_char* base, jchar ch) {
a61af66fc99e Initial load duke parents: diff changeset	125 if ((ch != 0) && (ch <=0x7f)) {
a61af66fc99e Initial load duke parents: diff changeset	126 base[0] = (u_char) ch;
a61af66fc99e Initial load duke parents: diff changeset	127 return base + 1;
a61af66fc99e Initial load duke parents: diff changeset	128 }
a61af66fc99e Initial load duke parents: diff changeset	129
a61af66fc99e Initial load duke parents: diff changeset	130 if (ch <= 0x7FF) {
a61af66fc99e Initial load duke parents: diff changeset	131 /* 11 bits or less. */
a61af66fc99e Initial load duke parents: diff changeset	132 unsigned char high_five = ch >> 6;
a61af66fc99e Initial load duke parents: diff changeset	133 unsigned char low_six = ch & 0x3F;
a61af66fc99e Initial load duke parents: diff changeset	134 base[0] = high_five \| 0xC0; /* 110xxxxx */
a61af66fc99e Initial load duke parents: diff changeset	135 base[1] = low_six \| 0x80; /* 10xxxxxx */
a61af66fc99e Initial load duke parents: diff changeset	136 return base + 2;
a61af66fc99e Initial load duke parents: diff changeset	137 }
a61af66fc99e Initial load duke parents: diff changeset	138 /* possibly full 16 bits. */
a61af66fc99e Initial load duke parents: diff changeset	139 char high_four = ch >> 12;
a61af66fc99e Initial load duke parents: diff changeset	140 char mid_six = (ch >> 6) & 0x3F;
a61af66fc99e Initial load duke parents: diff changeset	141 char low_six = ch & 0x3f;
a61af66fc99e Initial load duke parents: diff changeset	142 base[0] = high_four \| 0xE0; /* 1110xxxx */
a61af66fc99e Initial load duke parents: diff changeset	143 base[1] = mid_six \| 0x80; /* 10xxxxxx */
a61af66fc99e Initial load duke parents: diff changeset	144 base[2] = low_six \| 0x80; /* 10xxxxxx */
a61af66fc99e Initial load duke parents: diff changeset	145 return base + 3;
a61af66fc99e Initial load duke parents: diff changeset	146 }
a61af66fc99e Initial load duke parents: diff changeset	147
a61af66fc99e Initial load duke parents: diff changeset	148 void UTF8::convert_to_unicode(const char* utf8_str, jchar* unicode_str, int unicode_length) {
a61af66fc99e Initial load duke parents: diff changeset	149 unsigned char ch;
a61af66fc99e Initial load duke parents: diff changeset	150 const char ptr = (const char )utf8_str;
a61af66fc99e Initial load duke parents: diff changeset	151 int index = 0;
a61af66fc99e Initial load duke parents: diff changeset	152
a61af66fc99e Initial load duke parents: diff changeset	153 /* ASCII case loop optimization */
a61af66fc99e Initial load duke parents: diff changeset	154 for (; index < unicode_length; index++) {
a61af66fc99e Initial load duke parents: diff changeset	155 if((ch = ptr[0]) > 0x7F) { break; }
a61af66fc99e Initial load duke parents: diff changeset	156 unicode_str[index] = ch;
a61af66fc99e Initial load duke parents: diff changeset	157 ptr = (const char *)(ptr + 1);
a61af66fc99e Initial load duke parents: diff changeset	158 }
a61af66fc99e Initial load duke parents: diff changeset	159
a61af66fc99e Initial load duke parents: diff changeset	160 for (; index < unicode_length; index++) {
a61af66fc99e Initial load duke parents: diff changeset	161 ptr = UTF8::next(ptr, &unicode_str[index]);
a61af66fc99e Initial load duke parents: diff changeset	162 }
a61af66fc99e Initial load duke parents: diff changeset	163 }
a61af66fc99e Initial load duke parents: diff changeset	164
a61af66fc99e Initial load duke parents: diff changeset	165 // Returns NULL if 'c' it not found. This only works as long
a61af66fc99e Initial load duke parents: diff changeset	166 // as 'c' is an ASCII character
2177 3582bf76420e 6990754: Use native memory and reference counting to implement SymbolTable coleenp parents: 1972 diff changeset	167 const jbyte* UTF8::strrchr(const jbyte* base, int length, jbyte c) {
0 a61af66fc99e Initial load duke parents: diff changeset	168 assert(length >= 0, "sanity check");
a61af66fc99e Initial load duke parents: diff changeset	169 assert(c >= 0, "does not work for non-ASCII characters");
a61af66fc99e Initial load duke parents: diff changeset	170 // Skip backwards in string until 'c' is found or end is reached
a61af66fc99e Initial load duke parents: diff changeset	171 while(--length >= 0 && base[length] != c);
a61af66fc99e Initial load duke parents: diff changeset	172 return (length < 0) ? NULL : &base[length];
a61af66fc99e Initial load duke parents: diff changeset	173 }
a61af66fc99e Initial load duke parents: diff changeset	174
2177 3582bf76420e 6990754: Use native memory and reference counting to implement SymbolTable coleenp parents: 1972 diff changeset	175 bool UTF8::equal(const jbyte* base1, int length1, const jbyte* base2, int length2) {
0 a61af66fc99e Initial load duke parents: diff changeset	176 // Length must be the same
a61af66fc99e Initial load duke parents: diff changeset	177 if (length1 != length2) return false;
a61af66fc99e Initial load duke parents: diff changeset	178 for (int i = 0; i < length1; i++) {
a61af66fc99e Initial load duke parents: diff changeset	179 if (base1[i] != base2[i]) return false;
a61af66fc99e Initial load duke parents: diff changeset	180 }
a61af66fc99e Initial load duke parents: diff changeset	181 return true;
a61af66fc99e Initial load duke parents: diff changeset	182 }
a61af66fc99e Initial load duke parents: diff changeset	183
a61af66fc99e Initial load duke parents: diff changeset	184 bool UTF8::is_supplementary_character(const unsigned char* str) {
a61af66fc99e Initial load duke parents: diff changeset	185 return ((str[0] & 0xFF) == 0xED) && ((str[1] & 0xF0) == 0xA0) && ((str[2] & 0xC0) == 0x80)
a61af66fc99e Initial load duke parents: diff changeset	186 && ((str[3] & 0xFF) == 0xED) && ((str[4] & 0xF0) == 0xB0) && ((str[5] & 0xC0) == 0x80);
a61af66fc99e Initial load duke parents: diff changeset	187 }
a61af66fc99e Initial load duke parents: diff changeset	188
a61af66fc99e Initial load duke parents: diff changeset	189 jint UTF8::get_supplementary_character(const unsigned char* str) {
a61af66fc99e Initial load duke parents: diff changeset	190 return 0x10000 + ((str[1] & 0x0f) << 16) + ((str[2] & 0x3f) << 10)
a61af66fc99e Initial load duke parents: diff changeset	191 + ((str[4] & 0x0f) << 6) + (str[5] & 0x3f);
a61af66fc99e Initial load duke parents: diff changeset	192 }
a61af66fc99e Initial load duke parents: diff changeset	193
a61af66fc99e Initial load duke parents: diff changeset	194
a61af66fc99e Initial load duke parents: diff changeset	195 //-------------------------------------------------------------------------------------
a61af66fc99e Initial load duke parents: diff changeset	196
a61af66fc99e Initial load duke parents: diff changeset	197
a61af66fc99e Initial load duke parents: diff changeset	198 int UNICODE::utf8_size(jchar c) {
a61af66fc99e Initial load duke parents: diff changeset	199 if ((0x0001 <= c) && (c <= 0x007F)) return 1;
a61af66fc99e Initial load duke parents: diff changeset	200 if (c <= 0x07FF) return 2;
a61af66fc99e Initial load duke parents: diff changeset	201 return 3;
a61af66fc99e Initial load duke parents: diff changeset	202 }
a61af66fc99e Initial load duke parents: diff changeset	203
a61af66fc99e Initial load duke parents: diff changeset	204 int UNICODE::utf8_length(jchar* base, int length) {
a61af66fc99e Initial load duke parents: diff changeset	205 int result = 0;
a61af66fc99e Initial load duke parents: diff changeset	206 for (int index = 0; index < length; index++) {
a61af66fc99e Initial load duke parents: diff changeset	207 jchar c = base[index];
a61af66fc99e Initial load duke parents: diff changeset	208 if ((0x0001 <= c) && (c <= 0x007F)) result += 1;
a61af66fc99e Initial load duke parents: diff changeset	209 else if (c <= 0x07FF) result += 2;
a61af66fc99e Initial load duke parents: diff changeset	210 else result += 3;
a61af66fc99e Initial load duke parents: diff changeset	211 }
a61af66fc99e Initial load duke parents: diff changeset	212 return result;
a61af66fc99e Initial load duke parents: diff changeset	213 }
a61af66fc99e Initial load duke parents: diff changeset	214
a61af66fc99e Initial load duke parents: diff changeset	215 char* UNICODE::as_utf8(jchar* base, int length) {
a61af66fc99e Initial load duke parents: diff changeset	216 int utf8_len = utf8_length(base, length);
a61af66fc99e Initial load duke parents: diff changeset	217 u_char* result = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1);
a61af66fc99e Initial load duke parents: diff changeset	218 u_char* p = result;
a61af66fc99e Initial load duke parents: diff changeset	219 for (int index = 0; index < length; index++) {
a61af66fc99e Initial load duke parents: diff changeset	220 p = utf8_write(p, base[index]);
a61af66fc99e Initial load duke parents: diff changeset	221 }
a61af66fc99e Initial load duke parents: diff changeset	222 *p = '\0';
a61af66fc99e Initial load duke parents: diff changeset	223 assert(p == &result[utf8_len], "length prediction must be correct");
a61af66fc99e Initial load duke parents: diff changeset	224 return (char*) result;
a61af66fc99e Initial load duke parents: diff changeset	225 }
a61af66fc99e Initial load duke parents: diff changeset	226
a61af66fc99e Initial load duke parents: diff changeset	227 char* UNICODE::as_utf8(jchar* base, int length, char* buf, int buflen) {
a61af66fc99e Initial load duke parents: diff changeset	228 u_char* p = (u_char*)buf;
a61af66fc99e Initial load duke parents: diff changeset	229 u_char* end = (u_char*)buf + buflen;
a61af66fc99e Initial load duke parents: diff changeset	230 for (int index = 0; index < length; index++) {
a61af66fc99e Initial load duke parents: diff changeset	231 jchar c = base[index];
a61af66fc99e Initial load duke parents: diff changeset	232 if (p + utf8_size(c) >= end) break; // string is truncated
a61af66fc99e Initial load duke parents: diff changeset	233 p = utf8_write(p, base[index]);
a61af66fc99e Initial load duke parents: diff changeset	234 }
a61af66fc99e Initial load duke parents: diff changeset	235 *p = '\0';
a61af66fc99e Initial load duke parents: diff changeset	236 return buf;
a61af66fc99e Initial load duke parents: diff changeset	237 }
a61af66fc99e Initial load duke parents: diff changeset	238
a61af66fc99e Initial load duke parents: diff changeset	239 void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) {
a61af66fc99e Initial load duke parents: diff changeset	240 for(int index = 0; index < length; index++) {
a61af66fc99e Initial load duke parents: diff changeset	241 utf8_buffer = (char)utf8_write((u_char)utf8_buffer, base[index]);
a61af66fc99e Initial load duke parents: diff changeset	242 }
a61af66fc99e Initial load duke parents: diff changeset	243 *utf8_buffer = '\0';
a61af66fc99e Initial load duke parents: diff changeset	244 }

Mercurial > hg > truffle

annotate src/share/vm/utilities/utf8.cpp @ 6862:8a5ea0a9ccc4