annotate test/compiler/7070134/Stemmer.java @ 4155:394404b2d9bd

Removed strict requirement for GRAAL environment variable. It only needs to be set now if the graal directory is not in the directory hierarchy of GraalVM JDK.
author Doug Simon <doug.simon@oracle.com>
date Wed, 21 Dec 2011 11:25:27 +0100
parents 4e761e7e6e12
children 3a97daec1b34
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3840
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
1 /**
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
2 * @test
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
3 * @bug 7070134
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
4 * @summary Hotspot crashes with sigsegv from PorterStemmer
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
5 *
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
6 * @run shell Test7070134.sh
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
7 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
8
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
9 /*
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
10
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
11 Porter stemmer in Java. The original paper is in
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
12
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
13 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
14 no. 3, pp 130-137,
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
15
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
16 See also http://www.tartarus.org/~martin/PorterStemmer
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
17
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
18 History:
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
19
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
20 Release 1
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
21
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
22 Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
23 The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
24 is then outside the bounds of b.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
25
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
26 Release 2
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
27
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
28 Similarly,
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
29
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
30 Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
31 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
32 b[j] is then outside the bounds of b.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
33
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
34 Release 3
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
35
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
36 Considerably revised 4/9/00 in the light of many helpful suggestions
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
37 from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
38
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
39 Release 4
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
40
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
41 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
42
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
43 import java.io.*;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
44
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
45 /**
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
46 * Stemmer, implementing the Porter Stemming Algorithm
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
47 *
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
48 * The Stemmer class transforms a word into its root form. The input
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
49 * word can be provided a character at a time (by calling add()), or at once
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
50 * by calling one of the various stem(something) methods.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
51 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
52
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
53 class Stemmer
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
54 { private char[] b;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
55 private int i, /* offset into b */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
56 i_end, /* offset to end of stemmed word */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
57 j, k;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
58 private static final int INC = 50;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
59 /* unit of size whereby b is increased */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
60 public Stemmer()
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
61 { b = new char[INC];
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
62 i = 0;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
63 i_end = 0;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
64 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
65
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
66 /**
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
67 * Add a character to the word being stemmed. When you are finished
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
68 * adding characters, you can call stem() to stem the word.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
69 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
70
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
71 public void add(char ch)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
72 { if (i == b.length)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
73 { char[] new_b = new char[i+INC];
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
74 for (int c = 0; c < i; c++) new_b[c] = b[c];
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
75 b = new_b;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
76 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
77 b[i++] = ch;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
78 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
79
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
80
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
81 /** Adds wLen characters to the word being stemmed contained in a portion
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
82 * of a char[] array. This is like repeated calls of add(char ch), but
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
83 * faster.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
84 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
85
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
86 public void add(char[] w, int wLen)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
87 { if (i+wLen >= b.length)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
88 { char[] new_b = new char[i+wLen+INC];
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
89 for (int c = 0; c < i; c++) new_b[c] = b[c];
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
90 b = new_b;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
91 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
92 for (int c = 0; c < wLen; c++) b[i++] = w[c];
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
93 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
94
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
95 /**
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
96 * After a word has been stemmed, it can be retrieved by toString(),
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
97 * or a reference to the internal buffer can be retrieved by getResultBuffer
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
98 * and getResultLength (which is generally more efficient.)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
99 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
100 public String toString() { return new String(b,0,i_end); }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
101
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
102 /**
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
103 * Returns the length of the word resulting from the stemming process.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
104 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
105 public int getResultLength() { return i_end; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
106
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
107 /**
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
108 * Returns a reference to a character buffer containing the results of
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
109 * the stemming process. You also need to consult getResultLength()
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
110 * to determine the length of the result.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
111 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
112 public char[] getResultBuffer() { return b; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
113
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
114 /* cons(i) is true <=> b[i] is a consonant. */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
115
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
116 private final boolean cons(int i)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
117 { switch (b[i])
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
118 { case 'a': case 'e': case 'i': case 'o': case 'u': return false;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
119 case 'y': return (i==0) ? true : !cons(i-1);
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
120 default: return true;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
121 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
122 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
123
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
124 /* m() measures the number of consonant sequences between 0 and j. if c is
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
125 a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
126 presence,
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
127
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
128 <c><v> gives 0
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
129 <c>vc<v> gives 1
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
130 <c>vcvc<v> gives 2
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
131 <c>vcvcvc<v> gives 3
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
132 ....
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
133 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
134
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
135 private final int m()
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
136 { int n = 0;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
137 int i = 0;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
138 while(true)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
139 { if (i > j) return n;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
140 if (! cons(i)) break; i++;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
141 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
142 i++;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
143 while(true)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
144 { while(true)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
145 { if (i > j) return n;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
146 if (cons(i)) break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
147 i++;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
148 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
149 i++;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
150 n++;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
151 while(true)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
152 { if (i > j) return n;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
153 if (! cons(i)) break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
154 i++;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
155 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
156 i++;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
157 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
158 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
159
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
160 /* vowelinstem() is true <=> 0,...j contains a vowel */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
161
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
162 private final boolean vowelinstem()
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
163 { int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
164 return false;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
165 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
166
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
167 /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
168
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
169 private final boolean doublec(int j)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
170 { if (j < 1) return false;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
171 if (b[j] != b[j-1]) return false;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
172 return cons(j);
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
173 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
174
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
175 /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
176 and also if the second c is not w,x or y. this is used when trying to
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
177 restore an e at the end of a short word. e.g.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
178
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
179 cav(e), lov(e), hop(e), crim(e), but
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
180 snow, box, tray.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
181
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
182 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
183
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
184 private final boolean cvc(int i)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
185 { if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
186 { int ch = b[i];
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
187 if (ch == 'w' || ch == 'x' || ch == 'y') return false;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
188 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
189 return true;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
190 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
191
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
192 private final boolean ends(String s)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
193 { int l = s.length();
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
194 int o = k-l+1;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
195 if (o < 0) return false;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
196 for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
197 j = k-l;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
198 return true;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
199 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
200
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
201 /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
202 k. */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
203
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
204 private final void setto(String s)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
205 { int l = s.length();
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
206 int o = j+1;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
207 for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
208 k = j+l;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
209 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
210
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
211 /* r(s) calls setto(s) only when m() > 0; it is used further down. */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
212
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
213 private final void r(String s) { if (m() > 0) setto(s); }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
214
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
215 /* step1() gets rid of plurals and -ed or -ing. e.g.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
216
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
217 caresses -> caress
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
218 ponies -> poni
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
219 ties -> ti
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
220 caress -> caress
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
221 cats -> cat
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
222
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
223 feed -> feed
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
224 agreed -> agree
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
225 disabled -> disable
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
226
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
227 matting -> mat
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
228 mating -> mate
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
229 meeting -> meet
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
230 milling -> mill
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
231 messing -> mess
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
232
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
233 meetings -> meet
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
234
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
235 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
236
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
237 private final void step1()
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
238 { if (b[k] == 's')
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
239 { if (ends("sses")) k -= 2; else
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
240 if (ends("ies")) setto("i"); else
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
241 if (b[k-1] != 's') k--;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
242 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
243 if (ends("eed")) { if (m() > 0) k--; } else
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
244 if ((ends("ed") || ends("ing")) && vowelinstem())
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
245 { k = j;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
246 if (ends("at")) setto("ate"); else
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
247 if (ends("bl")) setto("ble"); else
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
248 if (ends("iz")) setto("ize"); else
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
249 if (doublec(k))
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
250 { k--;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
251 { int ch = b[k];
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
252 if (ch == 'l' || ch == 's' || ch == 'z') k++;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
253 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
254 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
255 else if (m() == 1 && cvc(k)) setto("e");
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
256 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
257 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
258
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
259 /* step2() turns terminal y to i when there is another vowel in the stem. */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
260
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
261 private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
262
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
263 /* step3() maps double suffices to single ones. so -ization ( = -ize plus
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
264 -ation) maps to -ize etc. note that the string before the suffix must give
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
265 m() > 0. */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
266
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
267 private final void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1])
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
268 {
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
269 case 'a': if (ends("ational")) { r("ate"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
270 if (ends("tional")) { r("tion"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
271 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
272 case 'c': if (ends("enci")) { r("ence"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
273 if (ends("anci")) { r("ance"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
274 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
275 case 'e': if (ends("izer")) { r("ize"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
276 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
277 case 'l': if (ends("bli")) { r("ble"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
278 if (ends("alli")) { r("al"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
279 if (ends("entli")) { r("ent"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
280 if (ends("eli")) { r("e"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
281 if (ends("ousli")) { r("ous"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
282 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
283 case 'o': if (ends("ization")) { r("ize"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
284 if (ends("ation")) { r("ate"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
285 if (ends("ator")) { r("ate"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
286 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
287 case 's': if (ends("alism")) { r("al"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
288 if (ends("iveness")) { r("ive"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
289 if (ends("fulness")) { r("ful"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
290 if (ends("ousness")) { r("ous"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
291 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
292 case 't': if (ends("aliti")) { r("al"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
293 if (ends("iviti")) { r("ive"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
294 if (ends("biliti")) { r("ble"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
295 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
296 case 'g': if (ends("logi")) { r("log"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
297 } }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
298
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
299 /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
300
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
301 private final void step4() { switch (b[k])
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
302 {
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
303 case 'e': if (ends("icate")) { r("ic"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
304 if (ends("ative")) { r(""); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
305 if (ends("alize")) { r("al"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
306 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
307 case 'i': if (ends("iciti")) { r("ic"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
308 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
309 case 'l': if (ends("ical")) { r("ic"); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
310 if (ends("ful")) { r(""); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
311 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
312 case 's': if (ends("ness")) { r(""); break; }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
313 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
314 } }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
315
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
316 /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
317
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
318 private final void step5()
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
319 { if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
320 { case 'a': if (ends("al")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
321 case 'c': if (ends("ance")) break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
322 if (ends("ence")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
323 case 'e': if (ends("er")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
324 case 'i': if (ends("ic")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
325 case 'l': if (ends("able")) break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
326 if (ends("ible")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
327 case 'n': if (ends("ant")) break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
328 if (ends("ement")) break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
329 if (ends("ment")) break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
330 /* element etc. not stripped before the m */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
331 if (ends("ent")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
332 case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
333 /* j >= 0 fixes Bug 2 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
334 if (ends("ou")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
335 /* takes care of -ous */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
336 case 's': if (ends("ism")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
337 case 't': if (ends("ate")) break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
338 if (ends("iti")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
339 case 'u': if (ends("ous")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
340 case 'v': if (ends("ive")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
341 case 'z': if (ends("ize")) break; return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
342 default: return;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
343 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
344 if (m() > 1) k = j;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
345 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
346
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
347 /* step6() removes a final -e if m() > 1. */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
348
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
349 private final void step6()
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
350 { j = k;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
351 if (b[k] == 'e')
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
352 { int a = m();
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
353 if (a > 1 || a == 1 && !cvc(k-1)) k--;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
354 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
355 if (b[k] == 'l' && doublec(k) && m() > 1) k--;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
356 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
357
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
358 /** Stem the word placed into the Stemmer buffer through calls to add().
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
359 * Returns true if the stemming process resulted in a word different
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
360 * from the input. You can retrieve the result with
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
361 * getResultLength()/getResultBuffer() or toString().
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
362 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
363 public void stem()
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
364 { k = i - 1;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
365 if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
366 i_end = k+1; i = 0;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
367 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
368
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
369 /** Test program for demonstrating the Stemmer. It reads text from a
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
370 * a list of files, stems each word, and writes the result to standard
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
371 * output. Note that the word stemmed is expected to be in lower case:
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
372 * forcing lower case must be done outside the Stemmer class.
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
373 * Usage: Stemmer file-name file-name ...
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
374 */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
375 public static void main(String[] args)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
376 {
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
377 char[] w = new char[501];
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
378 Stemmer s = new Stemmer();
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
379 for (int i = 0; i < args.length; i++)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
380 try
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
381 {
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
382 FileInputStream in = new FileInputStream(args[i]);
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
383
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
384 try
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
385 { while(true)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
386
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
387 { int ch = in.read();
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
388 if (Character.isLetter((char) ch))
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
389 {
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
390 int j = 0;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
391 while(true)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
392 { ch = Character.toLowerCase((char) ch);
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
393 w[j] = (char) ch;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
394 if (j < 500) j++;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
395 ch = in.read();
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
396 if (!Character.isLetter((char) ch))
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
397 {
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
398 /* to test add(char ch) */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
399 for (int c = 0; c < j; c++) s.add(w[c]);
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
400
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
401 /* or, to test add(char[] w, int j) */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
402 /* s.add(w, j); */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
403
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
404 s.stem();
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
405 { String u;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
406
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
407 /* and now, to test toString() : */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
408 u = s.toString();
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
409
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
410 /* to test getResultBuffer(), getResultLength() : */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
411 /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
412
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
413 System.out.print(u);
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
414 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
415 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
416 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
417 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
418 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
419 if (ch < 0) break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
420 System.out.print((char)ch);
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
421 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
422 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
423 catch (IOException e)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
424 { System.out.println("error reading " + args[i]);
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
425 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
426 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
427 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
428 catch (FileNotFoundException e)
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
429 { System.out.println("file " + args[i] + " not found");
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
430 break;
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
431 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
432 }
4e761e7e6e12 7070134: Hotspot crashes with sigsegv from PorterStemmer
kvn
parents:
diff changeset
433 }