Mercurial > hg > truffle
comparison test/compiler/7070134/Stemmer.java @ 3840:4e761e7e6e12
7070134: Hotspot crashes with sigsegv from PorterStemmer
Summary: Do not move data nodes which are attached to a predicate test to a dominating test.
Reviewed-by: never
author | kvn |
---|---|
date | Tue, 26 Jul 2011 19:35:23 -0700 |
parents | |
children | 3a97daec1b34 |
comparison
equal
deleted
inserted
replaced
3839:3d42f82cd811 | 3840:4e761e7e6e12 |
---|---|
1 /** | |
2 * @test | |
3 * @bug 7070134 | |
4 * @summary Hotspot crashes with sigsegv from PorterStemmer | |
5 * | |
6 * @run shell Test7070134.sh | |
7 */ | |
8 | |
9 /* | |
10 | |
11 Porter stemmer in Java. The original paper is in | |
12 | |
13 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, | |
14 no. 3, pp 130-137, | |
15 | |
16 See also http://www.tartarus.org/~martin/PorterStemmer | |
17 | |
18 History: | |
19 | |
20 Release 1 | |
21 | |
22 Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. | |
23 The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] | |
24 is then out outside the bounds of b. | |
25 | |
26 Release 2 | |
27 | |
28 Similarly, | |
29 | |
30 Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. | |
31 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and | |
32 b[j] is then outside the bounds of b. | |
33 | |
34 Release 3 | |
35 | |
36 Considerably revised 4/9/00 in the light of many helpful suggestions | |
37 from Brian Goetz of Quiotix Corporation (brian@quiotix.com). | |
38 | |
39 Release 4 | |
40 | |
41 */ | |
42 | |
43 import java.io.*; | |
44 | |
45 /** | |
46 * Stemmer, implementing the Porter Stemming Algorithm | |
47 * | |
48 * The Stemmer class transforms a word into its root form. The input | |
49 * word can be provided a character at time (by calling add()), or at once | |
50 * by calling one of the various stem(something) methods. | |
51 */ | |
52 | |
53 class Stemmer | |
54 { private char[] b; | |
55 private int i, /* offset into b */ | |
56 i_end, /* offset to end of stemmed word */ | |
57 j, k; | |
58 private static final int INC = 50; | |
59 /* unit of size whereby b is increased */ | |
60 public Stemmer() | |
61 { b = new char[INC]; | |
62 i = 0; | |
63 i_end = 0; | |
64 } | |
65 | |
66 /** | |
67 * Add a character to the word being stemmed. When you are finished | |
68 * adding characters, you can call stem(void) to stem the word. | |
69 */ | |
70 | |
71 public void add(char ch) | |
72 { if (i == b.length) | |
73 { char[] new_b = new char[i+INC]; | |
74 for (int c = 0; c < i; c++) new_b[c] = b[c]; | |
75 b = new_b; | |
76 } | |
77 b[i++] = ch; | |
78 } | |
79 | |
80 | |
81 /** Adds wLen characters to the word being stemmed contained in a portion | |
82 * of a char[] array. This is like repeated calls of add(char ch), but | |
83 * faster. | |
84 */ | |
85 | |
86 public void add(char[] w, int wLen) | |
87 { if (i+wLen >= b.length) | |
88 { char[] new_b = new char[i+wLen+INC]; | |
89 for (int c = 0; c < i; c++) new_b[c] = b[c]; | |
90 b = new_b; | |
91 } | |
92 for (int c = 0; c < wLen; c++) b[i++] = w[c]; | |
93 } | |
94 | |
95 /** | |
96 * After a word has been stemmed, it can be retrieved by toString(), | |
97 * or a reference to the internal buffer can be retrieved by getResultBuffer | |
98 * and getResultLength (which is generally more efficient.) | |
99 */ | |
100 public String toString() { return new String(b,0,i_end); } | |
101 | |
102 /** | |
103 * Returns the length of the word resulting from the stemming process. | |
104 */ | |
105 public int getResultLength() { return i_end; } | |
106 | |
107 /** | |
108 * Returns a reference to a character buffer containing the results of | |
109 * the stemming process. You also need to consult getResultLength() | |
110 * to determine the length of the result. | |
111 */ | |
112 public char[] getResultBuffer() { return b; } | |
113 | |
114 /* cons(i) is true <=> b[i] is a consonant. */ | |
115 | |
116 private final boolean cons(int i) | |
117 { switch (b[i]) | |
118 { case 'a': case 'e': case 'i': case 'o': case 'u': return false; | |
119 case 'y': return (i==0) ? true : !cons(i-1); | |
120 default: return true; | |
121 } | |
122 } | |
123 | |
124 /* m() measures the number of consonant sequences between 0 and j. if c is | |
125 a consonant sequence and v a vowel sequence, and <..> indicates arbitrary | |
126 presence, | |
127 | |
128 <c><v> gives 0 | |
129 <c>vc<v> gives 1 | |
130 <c>vcvc<v> gives 2 | |
131 <c>vcvcvc<v> gives 3 | |
132 .... | |
133 */ | |
134 | |
135 private final int m() | |
136 { int n = 0; | |
137 int i = 0; | |
138 while(true) | |
139 { if (i > j) return n; | |
140 if (! cons(i)) break; i++; | |
141 } | |
142 i++; | |
143 while(true) | |
144 { while(true) | |
145 { if (i > j) return n; | |
146 if (cons(i)) break; | |
147 i++; | |
148 } | |
149 i++; | |
150 n++; | |
151 while(true) | |
152 { if (i > j) return n; | |
153 if (! cons(i)) break; | |
154 i++; | |
155 } | |
156 i++; | |
157 } | |
158 } | |
159 | |
160 /* vowelinstem() is true <=> 0,...j contains a vowel */ | |
161 | |
162 private final boolean vowelinstem() | |
163 { int i; for (i = 0; i <= j; i++) if (! cons(i)) return true; | |
164 return false; | |
165 } | |
166 | |
167 /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ | |
168 | |
169 private final boolean doublec(int j) | |
170 { if (j < 1) return false; | |
171 if (b[j] != b[j-1]) return false; | |
172 return cons(j); | |
173 } | |
174 | |
175 /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant | |
176 and also if the second c is not w,x or y. this is used when trying to | |
177 restore an e at the end of a short word. e.g. | |
178 | |
179 cav(e), lov(e), hop(e), crim(e), but | |
180 snow, box, tray. | |
181 | |
182 */ | |
183 | |
184 private final boolean cvc(int i) | |
185 { if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false; | |
186 { int ch = b[i]; | |
187 if (ch == 'w' || ch == 'x' || ch == 'y') return false; | |
188 } | |
189 return true; | |
190 } | |
191 | |
192 private final boolean ends(String s) | |
193 { int l = s.length(); | |
194 int o = k-l+1; | |
195 if (o < 0) return false; | |
196 for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false; | |
197 j = k-l; | |
198 return true; | |
199 } | |
200 | |
201 /* setto(s) sets (j+1),...k to the characters in the string s, readjusting | |
202 k. */ | |
203 | |
204 private final void setto(String s) | |
205 { int l = s.length(); | |
206 int o = j+1; | |
207 for (int i = 0; i < l; i++) b[o+i] = s.charAt(i); | |
208 k = j+l; | |
209 } | |
210 | |
211 /* r(s) is used further down. */ | |
212 | |
213 private final void r(String s) { if (m() > 0) setto(s); } | |
214 | |
215 /* step1() gets rid of plurals and -ed or -ing. e.g. | |
216 | |
217 caresses -> caress | |
218 ponies -> poni | |
219 ties -> ti | |
220 caress -> caress | |
221 cats -> cat | |
222 | |
223 feed -> feed | |
224 agreed -> agree | |
225 disabled -> disable | |
226 | |
227 matting -> mat | |
228 mating -> mate | |
229 meeting -> meet | |
230 milling -> mill | |
231 messing -> mess | |
232 | |
233 meetings -> meet | |
234 | |
235 */ | |
236 | |
237 private final void step1() | |
238 { if (b[k] == 's') | |
239 { if (ends("sses")) k -= 2; else | |
240 if (ends("ies")) setto("i"); else | |
241 if (b[k-1] != 's') k--; | |
242 } | |
243 if (ends("eed")) { if (m() > 0) k--; } else | |
244 if ((ends("ed") || ends("ing")) && vowelinstem()) | |
245 { k = j; | |
246 if (ends("at")) setto("ate"); else | |
247 if (ends("bl")) setto("ble"); else | |
248 if (ends("iz")) setto("ize"); else | |
249 if (doublec(k)) | |
250 { k--; | |
251 { int ch = b[k]; | |
252 if (ch == 'l' || ch == 's' || ch == 'z') k++; | |
253 } | |
254 } | |
255 else if (m() == 1 && cvc(k)) setto("e"); | |
256 } | |
257 } | |
258 | |
259 /* step2() turns terminal y to i when there is another vowel in the stem. */ | |
260 | |
261 private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; } | |
262 | |
263 /* step3() maps double suffices to single ones. so -ization ( = -ize plus | |
264 -ation) maps to -ize etc. note that the string before the suffix must give | |
265 m() > 0. */ | |
266 | |
267 private final void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1]) | |
268 { | |
269 case 'a': if (ends("ational")) { r("ate"); break; } | |
270 if (ends("tional")) { r("tion"); break; } | |
271 break; | |
272 case 'c': if (ends("enci")) { r("ence"); break; } | |
273 if (ends("anci")) { r("ance"); break; } | |
274 break; | |
275 case 'e': if (ends("izer")) { r("ize"); break; } | |
276 break; | |
277 case 'l': if (ends("bli")) { r("ble"); break; } | |
278 if (ends("alli")) { r("al"); break; } | |
279 if (ends("entli")) { r("ent"); break; } | |
280 if (ends("eli")) { r("e"); break; } | |
281 if (ends("ousli")) { r("ous"); break; } | |
282 break; | |
283 case 'o': if (ends("ization")) { r("ize"); break; } | |
284 if (ends("ation")) { r("ate"); break; } | |
285 if (ends("ator")) { r("ate"); break; } | |
286 break; | |
287 case 's': if (ends("alism")) { r("al"); break; } | |
288 if (ends("iveness")) { r("ive"); break; } | |
289 if (ends("fulness")) { r("ful"); break; } | |
290 if (ends("ousness")) { r("ous"); break; } | |
291 break; | |
292 case 't': if (ends("aliti")) { r("al"); break; } | |
293 if (ends("iviti")) { r("ive"); break; } | |
294 if (ends("biliti")) { r("ble"); break; } | |
295 break; | |
296 case 'g': if (ends("logi")) { r("log"); break; } | |
297 } } | |
298 | |
299 /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ | |
300 | |
301 private final void step4() { switch (b[k]) | |
302 { | |
303 case 'e': if (ends("icate")) { r("ic"); break; } | |
304 if (ends("ative")) { r(""); break; } | |
305 if (ends("alize")) { r("al"); break; } | |
306 break; | |
307 case 'i': if (ends("iciti")) { r("ic"); break; } | |
308 break; | |
309 case 'l': if (ends("ical")) { r("ic"); break; } | |
310 if (ends("ful")) { r(""); break; } | |
311 break; | |
312 case 's': if (ends("ness")) { r(""); break; } | |
313 break; | |
314 } } | |
315 | |
316 /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */ | |
317 | |
318 private final void step5() | |
319 { if (k == 0) return; /* for Bug 1 */ switch (b[k-1]) | |
320 { case 'a': if (ends("al")) break; return; | |
321 case 'c': if (ends("ance")) break; | |
322 if (ends("ence")) break; return; | |
323 case 'e': if (ends("er")) break; return; | |
324 case 'i': if (ends("ic")) break; return; | |
325 case 'l': if (ends("able")) break; | |
326 if (ends("ible")) break; return; | |
327 case 'n': if (ends("ant")) break; | |
328 if (ends("ement")) break; | |
329 if (ends("ment")) break; | |
330 /* element etc. not stripped before the m */ | |
331 if (ends("ent")) break; return; | |
332 case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; | |
333 /* j >= 0 fixes Bug 2 */ | |
334 if (ends("ou")) break; return; | |
335 /* takes care of -ous */ | |
336 case 's': if (ends("ism")) break; return; | |
337 case 't': if (ends("ate")) break; | |
338 if (ends("iti")) break; return; | |
339 case 'u': if (ends("ous")) break; return; | |
340 case 'v': if (ends("ive")) break; return; | |
341 case 'z': if (ends("ize")) break; return; | |
342 default: return; | |
343 } | |
344 if (m() > 1) k = j; | |
345 } | |
346 | |
347 /* step6() removes a final -e if m() > 1. */ | |
348 | |
349 private final void step6() | |
350 { j = k; | |
351 if (b[k] == 'e') | |
352 { int a = m(); | |
353 if (a > 1 || a == 1 && !cvc(k-1)) k--; | |
354 } | |
355 if (b[k] == 'l' && doublec(k) && m() > 1) k--; | |
356 } | |
357 | |
358 /** Stem the word placed into the Stemmer buffer through calls to add(). | |
359 * Returns true if the stemming process resulted in a word different | |
360 * from the input. You can retrieve the result with | |
361 * getResultLength()/getResultBuffer() or toString(). | |
362 */ | |
363 public void stem() | |
364 { k = i - 1; | |
365 if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); } | |
366 i_end = k+1; i = 0; | |
367 } | |
368 | |
369 /** Test program for demonstrating the Stemmer. It reads text from a | |
370 * a list of files, stems each word, and writes the result to standard | |
371 * output. Note that the word stemmed is expected to be in lower case: | |
372 * forcing lower case must be done outside the Stemmer class. | |
373 * Usage: Stemmer file-name file-name ... | |
374 */ | |
375 public static void main(String[] args) | |
376 { | |
377 char[] w = new char[501]; | |
378 Stemmer s = new Stemmer(); | |
379 for (int i = 0; i < args.length; i++) | |
380 try | |
381 { | |
382 FileInputStream in = new FileInputStream(args[i]); | |
383 | |
384 try | |
385 { while(true) | |
386 | |
387 { int ch = in.read(); | |
388 if (Character.isLetter((char) ch)) | |
389 { | |
390 int j = 0; | |
391 while(true) | |
392 { ch = Character.toLowerCase((char) ch); | |
393 w[j] = (char) ch; | |
394 if (j < 500) j++; | |
395 ch = in.read(); | |
396 if (!Character.isLetter((char) ch)) | |
397 { | |
398 /* to test add(char ch) */ | |
399 for (int c = 0; c < j; c++) s.add(w[c]); | |
400 | |
401 /* or, to test add(char[] w, int j) */ | |
402 /* s.add(w, j); */ | |
403 | |
404 s.stem(); | |
405 { String u; | |
406 | |
407 /* and now, to test toString() : */ | |
408 u = s.toString(); | |
409 | |
410 /* to test getResultBuffer(), getResultLength() : */ | |
411 /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */ | |
412 | |
413 System.out.print(u); | |
414 } | |
415 break; | |
416 } | |
417 } | |
418 } | |
419 if (ch < 0) break; | |
420 System.out.print((char)ch); | |
421 } | |
422 } | |
423 catch (IOException e) | |
424 { System.out.println("error reading " + args[i]); | |
425 break; | |
426 } | |
427 } | |
428 catch (FileNotFoundException e) | |
429 { System.out.println("file " + args[i] + " not found"); | |
430 break; | |
431 } | |
432 } | |
433 } |