2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.opendaylight.yangtools.xsd.regex;
20 import java.text.CharacterIterator;
21 import java.util.ArrayDeque;
22 import java.util.Deque;
23 import java.util.Locale;
26 * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
27 * This engine does not conform to the POSIX regular expression.
36 * RegularExpression re = new RegularExpression(<var>regex</var>);
37 * if (re.matches(text)) { ... }
40 * <dt>B. Capturing groups
43 * RegularExpression re = new RegularExpression(<var>regex</var>);
44 * Match match = new Match();
45 * if (re.matches(text, match)) {
46 * ... // You can refer captured texts with methods of the <code>Match</code> class.
52 * <h4>Case-insensitive matching</h4>
54 * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
55 * if (re.matches(text)) { ... }
59 * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
60 * or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
61 * This <var>options</var> parameter consists of the following characters.
64 * <dt><a name="I_OPTION"><code>"i"</code></a>
65 * <dd>This option indicates case-insensitive matching.
66 * <dt><a name="M_OPTION"><code>"m"</code></a>
67 * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
68 * <dt><a name="S_OPTION"><code>"s"</code></a>
69 * <dd class="REGEX"><kbd>.</kbd> matches any one character.
70 * <dt><a name="U_OPTION"><code>"u"</code></a>
71 * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> to conform to Unicode.
72 * <dt><a name="W_OPTION"><code>"w"</code></a>
73 * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of
74 * 'Unicode Regular Expression Guidelines' Revision 4.
75 * When "w" and "u" are specified at the same time,
76 * <kbd>\b \B \< \></kbd> are processed for the "w" option.
77 * <dt><a name="COMMA_OPTION"><code>","</code></a>
78 * <dd>The parser treats a comma in a character class as a range separator.
79 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
80 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
82 * <dt><a name="X_OPTION"><code>"X"</code></a>
84 * By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
85 * The <code>matches()</code> method does not do substring matching
86 * but entire string matching.
92 * <table border="1" bgcolor="#ddeeff">
95 * <h4>Differences from the Perl 5 regular expression</h4>
97 * <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
98 * <li>Supports subtraction, union, and intersection operations for character classes.
99 * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
100 * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
101 * <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
102 * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
103 * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
109 * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
113 * <dt class="REGEX"><kbd>.</kbd> (A period)
114 * <dd>Matches any one character except the following characters.
115 * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
116 * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
117 * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
118 * <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
119 * it matches any character including the above four characters.
121 * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
122 * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
123 * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
125 * <dt class="REGEX"><kbd>\c</kbd><var>C</var>
126 * <dd>Matches a control character.
127 * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
128 * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
129 * It matches a control character of which the character code is less than
130 * the character code of the <var>C</var> by 0x0040.
131 * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
132 * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
134 * <dt class="REGEX">a non-meta character
135 * <dd>Matches the character.
137 * <dt class="REGEX"><KBD>\</KBD> + a meta character
138 * <dd>Matches the meta character.
140 * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
141 * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
142 * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
143 * variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
146 * <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
147 * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
150 * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
151 * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
153 * <dt class="REGEX"><kbd>\g</kbd>
154 * <dd>Matches a grapheme.
155 * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
157 * <dt class="REGEX"><kbd>\X</kbd>
158 * <dd class="REGEX">Matches a combining character sequence.
159 * It is equivalent to <kbd>(?:\PM\pM*)</kbd>
163 * <li>Character class
165 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
166 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
167 * <dd>Positive character class. It matches a character in ranges.
168 * <dd><var>R<sub>n</sub></var>:
170 * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
171 * <p>This range matches the character.
172 * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
173 * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point.
174 + * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
175 + * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
177 * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
178 * <p>These expressions specify the same ranges as the following expressions.
180 * <p class="REGEX">Enumerated ranges are merged (union operation).
181 * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
183 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
184 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
185 * <dd>Negative character class. It matches a character not in ranges.
187 * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
188 * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
189 * <dd>Subtraction or union or intersection for character classes.
190 * <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
191 * <dd>The result of these operations is a <u>positive character class</u>
192 * even if an expression includes any negative character classes.
193 * You have to take care on this in case-insensitive matching.
194 * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
195 * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
196 * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
197 * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
198 * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
200 * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
201 * <dd>Character class subtraction for the XML Schema.
202 * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
204 * <dt class="REGEX"><kbd>\d</kbd>
205 * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
206 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
207 * <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
209 * <dt class="REGEX"><kbd>\D</kbd>
210 * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
211 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
212 * <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
214 * <dt class="REGEX"><kbd>\s</kbd>
215 * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
216 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
217 * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
219 * <dt class="REGEX"><kbd>\S</kbd>
220 * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
221 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
222 * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
224 * <dt class="REGEX"><kbd>\w</kbd>
225 * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
226 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
227 * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
229 * <dt class="REGEX"><kbd>\W</kbd>
230 * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
231 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
232 * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
234 * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
235 * <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
236 * The following names are available:
238 * <dt>Unicode General Categories:
240 * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
241 * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
243 * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
244 * <dt>Unicode Blocks:
246 * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
247 * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
248 * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
249 * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
250 * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
251 * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
252 * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
253 * Miscellaneous Technical, Control Pictures, Optical Character Recognition,
254 * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
255 * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
256 * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
257 * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
258 * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
259 * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
260 * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
261 * Small Form Variants, Arabic Presentation Forms-B, Specials,
262 * Halfwidth and Fullwidth Forms
265 * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
266 * <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
267 * <dd><kbd>UNASSIGNED</kbd>
268 * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
271 * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
272 * <dd>Matches one character not in the specified General Category or the specified Block.
276 * <li>Selection and Quantifier
278 * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
281 * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
282 * <dd>Matches 0 or more <var>X</var>.
284 * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
285 * <dd>Matches 1 or more <var>X</var>.
287 * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
288 * <dd>Matches 0 or 1 <var>X</var>.
290 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
291 * <dd>Matches <var>number</var> times.
293 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
296 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
299 * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
300 * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
301 * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
302 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
303 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
304 * <dd>Non-greedy matching.
308 * <li>Grouping, Capturing, and Back-reference
310 * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
311 * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
312 * If you want it to match "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
313 * you have to write "<KBD>(?:foo)+</KBD>".
315 * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
316 * <dd>Grouping with capturing.
317 * It makes a group and applications can know
318 * where in target text a group matched with methods of a <code>Match</code> instance
319 * after <code><a href="#matches(java.lang.String, org.opendaylight.yangtools.xsd.regex.Match)">matches(String,Match)</a></code>.
320 * The 0th group means whole of this regular expression.
321 * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.
323 * <p>For instance, a regular expression is
324 * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>"
326 * "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>":
328 * <li><code>Match.getCapturedText(0)</code>:
329 * "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>"
330 * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
331 * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
334 * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
337 * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
338 * <dd>Independent expression group. ................
340 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
341 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
342 * <dd>............................
343 * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
344 * Note that it can not contain 'u'.
346 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
347 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
349 * <dd>These expressions must be at the beginning of a group.
355 * <dt class="REGEX"><kbd>\A</kbd>
356 * <dd>Matches the beginning of the text.
358 * <dt class="REGEX"><kbd>\Z</kbd>
359 * <dd>Matches the end of the text, or before an EOL character at the end of the text,
360 * or CARRIAGE RETURN + LINE FEED at the end of the text.
362 * <dt class="REGEX"><kbd>\z</kbd>
363 * <dd>Matches the end of the text.
365 * <dt class="REGEX"><kbd>^</kbd>
366 * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
367 * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
368 * it matches the beginning of the text, or after one of EOL characters (
369 * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
370 * PARAGRAPH SEPARATOR (U+2029).)
372 * <dt class="REGEX"><kbd>$</kbd>
373 * <dd>Matches the end of the text, or before an EOL character at the end of the text,
374 * or CARRIAGE RETURN + LINE FEED at the end of the text.
375 * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
376 * it matches the end of the text, or before an EOL character.
378 * <dt class="REGEX"><kbd>\b</kbd>
379 * <dd>Matches word boundary.
380 * (See <a href="#W_OPTION">a "w" option</a>)
382 * <dt class="REGEX"><kbd>\B</kbd>
383 * <dd>Matches non word boundary.
384 * (See <a href="#W_OPTION">a "w" option</a>)
386 * <dt class="REGEX"><kbd>\<</kbd>
387 * <dd>Matches the beginning of a word.
388 * (See <a href="#W_OPTION">a "w" option</a>)
390 * <dt class="REGEX"><kbd>\></kbd>
391 * <dd>Matches the end of a word.
392 * (See <a href="#W_OPTION">a "w" option</a>)
395 * <li>Lookahead and lookbehind
397 * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
400 * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
401 * <dd>Negative lookahead.
403 * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd>
405 * <dd>(Note for text capturing......)
407 * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd>
408 * <dd>Negative lookbehind.
414 * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
415 * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
417 * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
418 * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'.
419 * You can not write comments in character classes and before quantifiers.
426 * <h3>BNF for the regular expression</h3>
428 * regex ::= ('(?' options ')')? term ('|' term)*
430 * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
432 * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
433 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
434 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
435 * | '(?>' regex ')' | '(?' options ':' regex ')'
436 * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
437 * options ::= [imsw]* ('-' [imsw]+)?
438 * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
439 * looks ::= '(?=' regex ')' | '(?!' regex ')'
440 * | '(?<=' regex ')' | '(?<!' regex ')'
441 * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
442 * category-block ::= '\' [pP] category-symbol-1
443 * | ('\p{' | '\P{') (category-symbol | block-name
444 * | other-properties) '}'
445 * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
446 * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo'
447 * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
448 * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
449 * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
450 * | 'Sm' | 'Sc' | 'Sk' | 'So'
451 * block-name ::= (See above)
452 * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
453 * character-1 ::= (any character except meta-characters)
455 * char-class ::= '[' ranges ']'
456 * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
457 * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
458 * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
459 * | range-char | range-char '-' range-char
460 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
461 * code-point ::= '\x' hex-char hex-char
462 * | '\x{' hex-char+ '}'
463 * <!-- | '\u005c u' hex-char hex-char hex-char hex-char
464 * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
465 * hex-char ::= [0-9a-fA-F]
466 * character-2 ::= (any character except \[]-,)
472 * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
474 * <li>2.4 Canonical Equivalents
477 * <li>Parsing performance
484 * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
485 * @version $Id: RegularExpression.java 961928 2010-07-08 20:43:46Z knoaman $
487 public class RegularExpression implements java.io.Serializable {
489 private static final long serialVersionUID = 6242499334195006401L;
491 static final boolean DEBUG = false;
494 * Compiles a token tree into an operation flow.
496 private synchronized void compile(Token tok) {
497 if (this.operations != null) {
500 this.numberOfClosures = 0;
501 this.operations = this.compile(tok, null, false);
505 * Converts a token to an operation.
507 private Op compile(Token tok, Op next, boolean reverse) {
511 ret = Op.createDot();
516 ret = Op.createChar(tok.getChar());
521 ret = Op.createAnchor(tok.getChar());
527 ret = Op.createRange(tok);
534 for (int i = tok.size()-1; i >= 0; i --) {
535 ret = compile(tok.getChild(i), ret, false);
538 for (int i = 0; i < tok.size(); i ++) {
539 ret = compile(tok.getChild(i), ret, true);
545 Op.UnionOp uni = Op.createUnion(tok.size());
546 for (int i = 0; i < tok.size(); i ++) {
547 uni.addElement(compile(tok.getChild(i), next, reverse));
549 ret = uni; // ret.next is null.
553 case Token.NONGREEDYCLOSURE:
554 Token child = tok.getChild(0);
555 int min = tok.getMin();
556 int max = tok.getMax();
557 if (min >= 0 && min == max) { // {n}
559 for (int i = 0; i < min; i ++) {
560 ret = compile(child, ret, reverse);
564 if (min > 0 && max > 0) {
568 // X{2,6} -> XX(X(X(XX?)?)?)?
570 for (int i = 0; i < max; i ++) {
571 Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
573 q.setChild(compile(child, ret, reverse));
578 if (tok.type == Token.NONGREEDYCLOSURE) {
579 op = Op.createNonGreedyClosure();
580 } else { // Token.CLOSURE
581 op = Op.createClosure(this.numberOfClosures++);
584 op.setChild(compile(child, op, reverse));
588 for (int i = 0; i < min; i ++) {
589 ret = compile(child, ret, reverse);
599 ret = Op.createString(tok.getString());
603 case Token.BACKREFERENCE:
604 ret = Op.createBackReference(tok.getReferenceNumber());
609 if (tok.getParenNumber() == 0) {
610 ret = compile(tok.getChild(0), next, reverse);
611 } else if (reverse) {
612 next = Op.createCapture(tok.getParenNumber(), next);
613 next = compile(tok.getChild(0), next, reverse);
614 ret = Op.createCapture(-tok.getParenNumber(), next);
616 next = Op.createCapture(-tok.getParenNumber(), next);
617 next = compile(tok.getChild(0), next, reverse);
618 ret = Op.createCapture(tok.getParenNumber(), next);
622 case Token.LOOKAHEAD:
623 ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
625 case Token.NEGATIVELOOKAHEAD:
626 ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
628 case Token.LOOKBEHIND:
629 ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
631 case Token.NEGATIVELOOKBEHIND:
632 ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
635 case Token.INDEPENDENT:
636 ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
639 case Token.MODIFIERGROUP:
640 ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
641 ((Token.ModifierToken)tok).getOptions(),
642 ((Token.ModifierToken)tok).getOptionsMask());
645 case Token.CONDITION:
646 Token.ConditionToken ctok = (Token.ConditionToken)tok;
647 int ref = ctok.refNumber;
648 Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
649 Op yes = compile(ctok.yes, next, reverse);
650 Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
651 ret = Op.createCondition(next, ref, condition, yes, no);
655 throw new RuntimeException("Unknown token type: "+tok.type);
656 } // switch (tok.type)
664 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
666 * @return true if the target is matched to this regular expression.
668 public boolean matches(char[] target) {
669 return this.matches(target, 0, target .length , null);
673 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
674 * in specified range or not.
676 * @param start Start offset of the range.
677 * @param end End offset +1 of the range.
678 * @return true if the target is matched to this regular expression.
680 public boolean matches(char[] target, int start, int end) {
681 return this.matches(target, start, end, null);
685 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
687 * @param match A Match instance for storing matching result.
688 * @return true if the target is matched to this regular expression.
690 public boolean matches(char[] target, Match match) {
691 return this.matches(target, 0, target .length , match);
696 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
697 * in specified range or not.
699 * @param start Start offset of the range.
700 * @param end End offset +1 of the range.
701 * @param match A Match instance for storing matching result.
702 * @return true if the target is matched to this regular expression.
704 public boolean matches(char[] target, int start, int end, Match match) {
706 synchronized (this) {
707 if (this.operations == null) {
710 if (this.context == null) {
711 this.context = new Context();
715 synchronized (this.context) {
716 con = this.context.inuse ? new Context() : this.context;
717 con.reset(target, start, end, this.numberOfClosures);
720 match.setNumberOfGroups(this.nofparen);
721 match.setSource(target);
722 } else if (this.hasBackReferences) {
724 match.setNumberOfGroups(this.nofparen);
725 // No need to call setSource() because
726 // the caller cannot access this match instance.
730 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
731 int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
732 //System.err.println("DEBUG: matchEnd="+matchEnd);
733 if (matchEnd == con.limit) {
734 if (con.match != null) {
735 con.match.setBeginning(0, con.start);
736 con.match.setEnd(0, matchEnd);
745 * The pattern has only fixed string.
746 * The engine uses Boyer-Moore.
748 if (this.fixedStringOnly) {
749 //System.err.println("DEBUG: fixed-only: "+this.fixedString);
750 int o = this.fixedStringTable.matches(target, con.start, con.limit);
752 if (con.match != null) {
753 con.match.setBeginning(0, o);
754 con.match.setEnd(0, o+this.fixedString.length());
764 * The pattern contains a fixed string.
765 * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
766 * If not, it returns false.
768 if (this.fixedString != null) {
769 int o = this.fixedStringTable.matches(target, con.start, con.limit);
771 //System.err.println("Non-match in fixed-string search.");
777 int limit = con.limit-this.minlength;
782 * Checks whether the expression starts with ".*".
784 if (this.operations != null
785 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
786 if (isSet(this.options, SINGLE_LINE)) {
787 matchStart = con.start;
788 matchEnd = this. match(con, this.operations, con.start, 1, this.options);
790 boolean previousIsEOL = true;
791 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
792 int ch = target [ matchStart ] ;
794 previousIsEOL = true;
797 if (0 <= (matchEnd = this. match(con, this.operations,
798 matchStart, 1, this.options))) {
802 previousIsEOL = false;
809 * Optimization against the first character.
811 else if (this.firstChar != null) {
812 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
813 RangeToken range = this.firstChar;
814 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
815 int ch = target [matchStart] ;
816 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
817 ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]);
819 if (!range.match(ch)) {
822 if (0 <= (matchEnd = this. match(con, this.operations,
823 matchStart, 1, this.options))) {
830 * Straightforward matching.
833 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
834 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) {
841 if (con.match != null) {
842 con.match.setBeginning(0, matchStart);
843 con.match.setEnd(0, matchEnd);
854 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
856 * @return true if the target is matched to this regular expression.
858 public boolean matches(String target) {
859 return this.matches(target, 0, target .length() , null);
863 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
864 * in specified range or not.
866 * @param start Start offset of the range.
867 * @param end End offset +1 of the range.
868 * @return true if the target is matched to this regular expression.
870 public boolean matches(String target, int start, int end) {
871 return this.matches(target, start, end, null);
875 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
877 * @param match A Match instance for storing matching result.
878 * @return true if the target is matched to this regular expression.
880 public boolean matches(String target, Match match) {
881 return this.matches(target, 0, target .length() , match);
885 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
886 * in specified range or not.
888 * @param start Start offset of the range.
889 * @param end End offset +1 of the range.
890 * @param match A Match instance for storing matching result.
891 * @return true if the target is matched to this regular expression.
893 public boolean matches(String target, int start, int end, Match match) {
895 synchronized (this) {
896 if (this.operations == null) {
899 if (this.context == null) {
900 this.context = new Context();
904 synchronized (this.context) {
905 con = this.context.inuse ? new Context() : this.context;
906 con.reset(target, start, end, this.numberOfClosures);
909 match.setNumberOfGroups(this.nofparen);
910 match.setSource(target);
911 } else if (this.hasBackReferences) {
913 match.setNumberOfGroups(this.nofparen);
914 // Need not to call setSource() because
915 // a caller can not access this match instance.
919 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
921 System.err.println("target string="+target);
923 int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
925 System.err.println("matchEnd="+matchEnd);
926 System.err.println("con.limit="+con.limit);
928 if (matchEnd == con.limit) {
929 if (con.match != null) {
930 con.match.setBeginning(0, con.start);
931 con.match.setEnd(0, matchEnd);
940 * The pattern has only fixed string.
941 * The engine uses Boyer-Moore.
943 if (this.fixedStringOnly) {
944 //System.err.println("DEBUG: fixed-only: "+this.fixedString);
945 int o = this.fixedStringTable.matches(target, con.start, con.limit);
947 if (con.match != null) {
948 con.match.setBeginning(0, o);
949 con.match.setEnd(0, o+this.fixedString.length());
959 * The pattern contains a fixed string.
960 * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
961 * If not, it return with false.
963 if (this.fixedString != null) {
964 int o = this.fixedStringTable.matches(target, con.start, con.limit);
966 //System.err.println("Non-match in fixed-string search.");
972 int limit = con.limit-this.minlength;
977 * Checks whether the expression starts with ".*".
979 if (this.operations != null
980 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
981 if (isSet(this.options, SINGLE_LINE)) {
982 matchStart = con.start;
983 matchEnd = this.match(con, this.operations, con.start, 1, this.options);
985 boolean previousIsEOL = true;
986 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
987 int ch = target .charAt( matchStart ) ;
989 previousIsEOL = true;
992 if (0 <= (matchEnd = this.match(con, this.operations,
993 matchStart, 1, this.options))) {
997 previousIsEOL = false;
1004 * Optimization against the first character.
1006 else if (this.firstChar != null) {
1007 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1008 RangeToken range = this.firstChar;
1009 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1010 int ch = target .charAt( matchStart ) ;
1011 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1012 ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1));
1014 if (!range.match(ch)) {
1017 if (0 <= (matchEnd = this.match(con, this.operations,
1018 matchStart, 1, this.options))) {
1025 * Straightforward matching.
1028 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1029 if (0 <= (matchEnd = this.match(con, this.operations, matchStart, 1, this.options))) {
1035 if (matchEnd >= 0) {
1036 if (con.match != null) {
1037 con.match.setBeginning(0, matchStart);
1038 con.match.setEnd(0, matchEnd);
1040 con.setInUse(false);
1043 con.setInUse(false);
1049 * @return -1 when not match; offset of the end of matched string when match.
1051 private int match(Context con, Op op, int offset, int dx, int opts) {
1052 final ExpressionTarget target = con.target;
1053 final Deque<Op> opStack = new ArrayDeque<>();
1054 final IntStack dataStack = new IntStack();
1055 final boolean isSetIgnoreCase = isSet(opts, IGNORE_CASE);
1057 boolean returned = false;
1060 if (op == null || offset > con.limit || offset < con.start) {
1062 retValue = isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
1071 // dx value is either 1 or -1
1075 final int o1 = (dx > 0) ? offset : offset -1;
1076 if (o1 >= con.limit || o1 < 0 || !matchChar(op.getData(), target.charAt(o1), isSetIgnoreCase)) {
1087 int o1 = (dx > 0) ? offset : offset - 1;
1088 if (o1 >= con.limit || o1 < 0) {
1092 if (isSet(opts, SINGLE_LINE)) {
1093 if (REUtil.isHighSurrogate(target.charAt(o1)) && o1+dx >= 0 && o1+dx < con.limit) {
1098 int ch = target.charAt(o1);
1099 if (REUtil.isHighSurrogate(ch) && o1+dx >= 0 && o1+dx < con.limit) {
1101 ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1103 if (isEOLChar(ch)) {
1108 offset = (dx > 0) ? o1 + 1 : o1;
1116 int o1 = (dx > 0) ? offset : offset -1;
1117 if (o1 >= con.limit || o1 < 0) {
1121 int ch = target.charAt(offset);
1122 if (REUtil.isHighSurrogate(ch) && o1+dx < con.limit && o1+dx >=0) {
1124 ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1126 final RangeToken tok = op.getToken();
1127 if (!tok.match(ch)) {
1131 offset = (dx > 0) ? o1+1 : o1;
1138 if (!matchAnchor(target, op, con, offset, opts)) {
1146 case Op.BACKREFERENCE:
1148 int refno = op.getData();
1149 if (refno <= 0 || refno >= this.nofparen) {
1150 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
1152 if (con.match.getBeginning(refno) < 0 || con.match.getEnd(refno) < 0) {
1156 int o2 = con.match.getBeginning(refno);
1157 int literallen = con.match.getEnd(refno)-o2;
1159 if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, o2, literallen)) {
1163 offset += literallen;
1166 if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, o2, literallen)) {
1170 offset -= literallen;
1178 String literal = op.getString();
1179 int literallen = literal.length();
1181 if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, literal, literallen)) {
1185 offset += literallen;
1188 if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, literal, literallen)) {
1192 offset -= literallen;
1200 // Saves current position to avoid zero-width repeats.
1201 final int id = op.getData();
1202 if (con.closureContexts[id].contains(offset)) {
1207 con.closureContexts[id].addOffset(offset);
1214 dataStack.push(offset);
1219 case Op.NONGREEDYCLOSURE:
1220 case Op.NONGREEDYQUESTION:
1223 dataStack.push(offset);
1229 if (op.size() == 0) {
1235 dataStack.push(offset);
1236 op = op.elementAt(0);
1242 final int refno = op.getData();
1243 if (con.match != null) {
1245 dataStack.push(con.match.getBeginning(refno));
1246 con.match.setBeginning(refno, offset);
1249 final int index = -refno;
1250 dataStack.push(con.match.getEnd(index));
1251 con.match.setEnd(index, offset);
1254 dataStack.push(offset);
1261 case Op.NEGATIVELOOKAHEAD:
1263 case Op.NEGATIVELOOKBEHIND:
1267 dataStack.push(offset);
1268 dx = (op.type == Op.LOOKAHEAD || op.type == Op.NEGATIVELOOKAHEAD) ? 1 : -1;
1273 case Op.INDEPENDENT:
1276 dataStack.push(offset);
1283 int localopts = opts;
1284 localopts |= op.getData();
1285 localopts &= ~op.getData2();
1287 dataStack.push(opts);
1288 dataStack.push(offset);
1296 Op.ConditionOp cop = (Op.ConditionOp)op;
1297 if (cop.refNumber > 0) {
1298 if (cop.refNumber >= this.nofparen) {
1299 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
1301 if (con.match.getBeginning(cop.refNumber) >= 0
1302 && con.match.getEnd(cop.refNumber) >= 0) {
1305 else if (cop.no != null) {
1314 dataStack.push(offset);
1321 throw new RuntimeException("Unknown operation type: " + op.type);
1325 // handle recursive operations
1327 // exhausted all the operations
1328 if (opStack.isEmpty()) {
1333 offset = dataStack.pop();
1344 case Op.NONGREEDYCLOSURE:
1345 case Op.NONGREEDYQUESTION:
1354 int unionIndex = dataStack.pop();
1356 System.err.println("UNION: "+unionIndex+", ret="+retValue);
1360 if (++unionIndex < op.size()) {
1362 dataStack.push(unionIndex);
1363 dataStack.push(offset);
1364 op = op.elementAt(unionIndex);
1375 final int refno = op.getData();
1376 final int saved = dataStack.pop();
1379 con.match.setBeginning(refno, saved);
1382 con.match.setEnd(-refno, saved);
1390 dx = dataStack.pop();
1391 if (0 <= retValue) {
1399 case Op.NEGATIVELOOKAHEAD:
1400 case Op.NEGATIVELOOKBEHIND:
1402 dx = dataStack.pop();
1412 opts = dataStack.pop();
1415 case Op.INDEPENDENT:
1416 if (retValue >= 0) {
1425 final Op.ConditionOp cop = (Op.ConditionOp)op;
1426 if (0 <= retValue) {
1429 else if (cop.no != null) {
1446 private static boolean matchChar(int ch, int other, boolean ignoreCase) {
1447 return (ignoreCase) ? matchIgnoreCase(ch, other) : ch == other;
1450 boolean matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts) {
1452 switch (op.getData()) {
1454 if (isSet(opts, MULTIPLE_LINES)) {
1455 if (!(offset == con.start
1456 || offset > con.start && offset < con.limit && isEOLChar(target.charAt(offset-1)))) {
1460 if (offset != con.start) {
1466 case '@': // Internal use only.
1467 // The @ always matches line beginnings.
1468 if (!(offset == con.start
1469 || offset > con.start && isEOLChar(target.charAt(offset-1)))) {
1475 if (isSet(opts, MULTIPLE_LINES)) {
1476 if (!(offset == con.limit
1477 || offset < con.limit && isEOLChar(target.charAt(offset)))) {
1481 if (!(offset == con.limit
1482 || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1483 || offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN
1484 && target.charAt(offset+1) == LINE_FEED)) {
1491 if (offset != con.start) {
1497 if (!(offset == con.limit
1498 || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1499 || offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN
1500 && target.charAt(offset+1) == LINE_FEED)) {
1506 if (offset != con.limit) {
1512 if (con.length == 0) {
1516 int after = getWordType(target, con.start, con.limit, offset, opts);
1517 if (after == WT_IGNORE) {
1520 int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1521 if (after == before) {
1528 if (con.length == 0) {
1531 int after = getWordType(target, con.start, con.limit, offset, opts);
1532 go = after == WT_IGNORE
1533 || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1541 if (con.length == 0 || offset == con.limit) {
1544 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1545 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) {
1551 if (con.length == 0 || offset == con.start) {
1554 if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1555 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) {
1559 } // switch anchor type
1564 private static final int getPreviousWordType(ExpressionTarget target, int begin, int end,
1565 int offset, int opts) {
1566 int ret = getWordType(target, begin, end, --offset, opts);
1567 while (ret == WT_IGNORE) {
1568 ret = getWordType(target, begin, end, --offset, opts);
1573 private static final int getWordType(ExpressionTarget target, int begin, int end,
1574 int offset, int opts) {
1575 if (offset < begin || offset >= end) {
1578 return getWordType0(target.charAt(offset) , opts);
1583 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1585 * @return true if the target is matched to this regular expression.
1587 public boolean matches(CharacterIterator target) {
1588 return this.matches(target, null);
1593 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1595 * @param match A Match instance for storing matching result.
1596 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1598 public boolean matches(CharacterIterator target, Match match) {
1599 int start = target.getBeginIndex();
1600 int end = target.getEndIndex();
1604 synchronized (this) {
1605 if (this.operations == null) {
1608 if (this.context == null) {
1609 this.context = new Context();
1613 synchronized (this.context) {
1614 con = this.context.inuse ? new Context() : this.context;
1615 con.reset(target, start, end, this.numberOfClosures);
1617 if (match != null) {
1618 match.setNumberOfGroups(this.nofparen);
1619 match.setSource(target);
1620 } else if (this.hasBackReferences) {
1621 match = new Match();
1622 match.setNumberOfGroups(this.nofparen);
1623 // Need not to call setSource() because
1624 // a caller can not access this match instance.
1628 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
1629 int matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1630 //System.err.println("DEBUG: matchEnd="+matchEnd);
1631 if (matchEnd == con.limit) {
1632 if (con.match != null) {
1633 con.match.setBeginning(0, con.start);
1634 con.match.setEnd(0, matchEnd);
1636 con.setInUse(false);
1643 * The pattern has only fixed string.
1644 * The engine uses Boyer-Moore.
1646 if (this.fixedStringOnly) {
1647 //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1648 int o = this.fixedStringTable.matches(target, con.start, con.limit);
1650 if (con.match != null) {
1651 con.match.setBeginning(0, o);
1652 con.match.setEnd(0, o+this.fixedString.length());
1654 con.setInUse(false);
1657 con.setInUse(false);
1662 * The pattern contains a fixed string.
1663 * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1664 * If not, it return with false.
1666 if (this.fixedString != null) {
1667 int o = this.fixedStringTable.matches(target, con.start, con.limit);
1669 //System.err.println("Non-match in fixed-string search.");
1670 con.setInUse(false);
1675 int limit = con.limit-this.minlength;
1680 * Checks whether the expression starts with ".*".
1682 if (this.operations != null
1683 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
1684 if (isSet(this.options, SINGLE_LINE)) {
1685 matchStart = con.start;
1686 matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1688 boolean previousIsEOL = true;
1689 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1690 int ch = target .setIndex( matchStart ) ;
1691 if (isEOLChar(ch)) {
1692 previousIsEOL = true;
1694 if (previousIsEOL) {
1695 if (0 <= (matchEnd = this.match(con, this.operations,
1696 matchStart, 1, this.options))) {
1700 previousIsEOL = false;
1707 * Optimization against the first character.
1709 else if (this.firstChar != null) {
1710 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1711 RangeToken range = this.firstChar;
1712 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1713 int ch = target .setIndex( matchStart ) ;
1714 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1715 ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1));
1717 if (!range.match(ch)) {
1720 if (0 <= (matchEnd = this.match(con, this.operations,
1721 matchStart, 1, this.options))) {
1728 * Straightforward matching.
1731 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1732 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) {
1738 if (matchEnd >= 0) {
1739 if (con.match != null) {
1740 con.match.setBeginning(0, matchStart);
1741 con.match.setEnd(0, matchEnd);
1743 con.setInUse(false);
1746 con.setInUse(false);
1751 // ================================================================
1754 * A regular expression.
1764 * The number of parenthesis in the regular expression.
1769 * Internal representation of the regular expression.
1774 boolean hasBackReferences = false;
1776 transient int minlength;
1777 transient Op operations = null;
1778 transient int numberOfClosures;
1779 transient Context context = null;
1780 transient RangeToken firstChar = null;
1782 transient String fixedString = null;
1783 transient int fixedStringOptions;
1784 transient BMPattern fixedStringTable = null;
1785 transient boolean fixedStringOnly = false;
1787 static abstract class ExpressionTarget {
1788 abstract char charAt(int index);
1789 abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen);
1790 abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen);
1793 static final class StringTarget extends ExpressionTarget {
1795 private String target;
1797 StringTarget(String target) {
1798 this.target = target;
1801 final void resetTarget(String target) {
1802 this.target = target;
1806 final char charAt(int index) {
1807 return target.charAt(index);
1811 final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1812 String part, int partlen) {
1813 if (limit-offset < partlen) {
1816 return (ignoreCase) ? target.regionMatches(true, offset, part, 0, partlen) : target.regionMatches(offset, part, 0, partlen);
1820 final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1821 int offset2, int partlen) {
1822 if (limit-offset < partlen) {
1825 return (ignoreCase) ? target.regionMatches(true, offset, target, offset2, partlen)
1826 : target.regionMatches(offset, target, offset2, partlen);
1830 static final class CharArrayTarget extends ExpressionTarget {
1834 CharArrayTarget(char[] target) {
1835 this.target = target;
1838 final void resetTarget(char[] target) {
1839 this.target = target;
1843 char charAt(int index) {
1844 return target[index];
1848 final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1849 String part, int partlen) {
1850 if (offset < 0 || limit-offset < partlen) {
1853 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1854 : regionMatches(offset, limit, part, partlen);
1857 private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1859 while (partlen-- > 0) {
1860 if (target[offset++] != part.charAt(i++)) {
1867 private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1869 while (partlen-- > 0) {
1870 final char ch1 = target[offset++] ;
1871 final char ch2 = part.charAt(i++);
1875 final char uch1 = Character.toUpperCase(ch1);
1876 final char uch2 = Character.toUpperCase(ch2);
1880 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1888 final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1889 if (offset < 0 || limit-offset < partlen) {
1892 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1893 : regionMatches(offset, limit, offset2, partlen);
1896 private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1898 while (partlen-- > 0) {
1899 if ( target [ offset++ ] != target [ i++ ] ) {
1906 private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
1908 while (partlen-- > 0) {
1909 final char ch1 = target[offset++] ;
1910 final char ch2 = target[i++] ;
1914 final char uch1 = Character.toUpperCase(ch1);
1915 final char uch2 = Character.toUpperCase(ch2);
1919 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1927 static final class CharacterIteratorTarget extends ExpressionTarget {
1928 CharacterIterator target;
1930 CharacterIteratorTarget(CharacterIterator target) {
1931 this.target = target;
1934 final void resetTarget(CharacterIterator target) {
1935 this.target = target;
1939 final char charAt(int index) {
1940 return target.setIndex(index);
1944 final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1945 String part, int partlen) {
1946 if (offset < 0 || limit-offset < partlen) {
1949 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1950 : regionMatches(offset, limit, part, partlen);
1953 private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1955 while (partlen-- > 0) {
1956 if (target.setIndex(offset++) != part.charAt(i++)) {
1963 private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1965 while (partlen-- > 0) {
1966 final char ch1 = target.setIndex(offset++) ;
1967 final char ch2 = part.charAt(i++);
1971 final char uch1 = Character.toUpperCase(ch1);
1972 final char uch2 = Character.toUpperCase(ch2);
1976 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1984 final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1985 if (offset < 0 || limit-offset < partlen) {
1988 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1989 : regionMatches(offset, limit, offset2, partlen);
1992 private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1994 while (partlen-- > 0) {
1995 if (target.setIndex(offset++) != target.setIndex(i++)) {
2002 private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
2004 while (partlen-- > 0) {
2005 final char ch1 = target.setIndex(offset++) ;
2006 final char ch2 = target.setIndex(i++) ;
2010 final char uch1 = Character.toUpperCase(ch1);
2011 final char uch2 = Character.toUpperCase(ch2);
2015 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
2023 static final class ClosureContext {
2025 int[] offsets = new int[4];
2026 int currentIndex = 0;
2028 boolean contains(int offset) {
2029 for (int i=0; i<currentIndex;++i) {
2030 if (offsets[i] == offset) {
2041 void addOffset(int offset) {
2042 // We do not check for duplicates, caller is responsible for that
2043 if (currentIndex == offsets.length) {
2044 offsets = expandOffsets();
2046 offsets[currentIndex++] = offset;
2049 private int[] expandOffsets() {
2050 final int len = offsets.length;
2051 final int newLen = len << 1;
2052 int[] newOffsets = new int[newLen];
2054 System.arraycopy(offsets, 0, newOffsets, 0, currentIndex);
2059 static final class Context {
2064 boolean inuse = false;
2065 ClosureContext[] closureContexts;
2067 private StringTarget stringTarget;
2068 private CharArrayTarget charArrayTarget;
2069 private CharacterIteratorTarget characterIteratorTarget;
2071 ExpressionTarget target;
2076 private void resetCommon(int nofclosures) {
2077 this.length = this.limit-this.start;
2080 if (this.closureContexts == null || this.closureContexts.length != nofclosures) {
2081 this.closureContexts = new ClosureContext[nofclosures];
2083 for (int i = 0; i < nofclosures; i ++) {
2084 if (this.closureContexts[i] == null) {
2085 this.closureContexts[i] = new ClosureContext();
2088 this.closureContexts[i].reset();
2093 void reset(CharacterIterator target, int start, int limit, int nofclosures) {
2094 if (characterIteratorTarget == null) {
2095 characterIteratorTarget = new CharacterIteratorTarget(target);
2098 characterIteratorTarget.resetTarget(target);
2100 this.target = characterIteratorTarget;
2103 this.resetCommon(nofclosures);
2106 void reset(String target, int start, int limit, int nofclosures) {
2107 if (stringTarget == null) {
2108 stringTarget = new StringTarget(target);
2111 stringTarget.resetTarget(target);
2113 this.target = stringTarget;
2116 this.resetCommon(nofclosures);
2119 void reset(char[] target, int start, int limit, int nofclosures) {
2120 if (charArrayTarget == null) {
2121 charArrayTarget = new CharArrayTarget(target);
2124 charArrayTarget.resetTarget(target);
2126 this.target = charArrayTarget;
2129 this.resetCommon(nofclosures);
2131 synchronized void setInUse(boolean inUse) {
2137 * Prepares for matching. This method is called just before starting matching.
2141 Op.nofinstances = 0;
2143 this.compile(this.tokentree);
2145 if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
2146 Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
2147 anchor.next = this.operations;
2148 this.operations = anchor;
2152 System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
2155 this.minlength = this.tokentree.getMinLength();
2157 this.firstChar = null;
2158 if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
2159 && !isSet(this.options, XMLSCHEMA_MODE)) {
2160 RangeToken firstChar = Token.createRange();
2161 int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
2162 if (fresult == Token.FC_TERMINAL) {
2163 firstChar.compactRanges();
2164 this.firstChar = firstChar;
2166 System.err.println("DEBUG: Use the first character optimization: "+firstChar);
2171 if (this.operations != null
2172 && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
2173 && this.operations.next == null) {
2175 System.err.print(" *** Only fixed string! *** ");
2177 this.fixedStringOnly = true;
2178 if (this.operations.type == Op.STRING) {
2179 this.fixedString = this.operations.getString();
2180 } else if (this.operations.getData() >= 0x10000) { // Op.CHAR
2181 this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
2183 char[] ac = new char[1];
2184 ac[0] = (char)this.operations.getData();
2185 this.fixedString = new String(ac);
2187 this.fixedStringOptions = this.options;
2188 this.fixedStringTable = new BMPattern(this.fixedString, 256,
2189 isSet(this.fixedStringOptions, IGNORE_CASE));
2190 } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
2191 && !isSet(this.options, XMLSCHEMA_MODE)) {
2192 Token.FixedStringContainer container = new Token.FixedStringContainer();
2193 this.tokentree.findFixedString(container, this.options);
2194 this.fixedString = container.token == null ? null : container.token.getString();
2195 this.fixedStringOptions = container.options;
2196 if (this.fixedString != null && this.fixedString.length() < 2) {
2197 this.fixedString = null;
2199 // This pattern has a fixed string of which length is more than one.
2200 if (this.fixedString != null) {
2201 this.fixedStringTable = new BMPattern(this.fixedString, 256,
2202 isSet(this.fixedStringOptions, IGNORE_CASE));
2204 System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
2205 +"/" //+this.fixedString
2206 +"/"+REUtil.createOptionString(this.fixedStringOptions));
2207 System.err.print("String: ");
2208 REUtil.dumpString(this.fixedString);
2216 * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
2217 * captures matched text, and <span class="REGEX"><kbd>(?:</kbd><var>X</var><kbd>)</kbd></span>
2220 * @see #RegularExpression(java.lang.String,int)
2221 * @see #setPattern(java.lang.String,int)
2222 static final int MARK_PARENS = 1<<0;
2228 static final int IGNORE_CASE = 1<<1;
2233 static final int SINGLE_LINE = 1<<2;
2238 static final int MULTIPLE_LINES = 1<<3;
2243 static final int EXTENDED_COMMENT = 1<<4;
2246 * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
2248 * @see #RegularExpression(java.lang.String,int)
2249 * @see #setPattern(java.lang.String,int)
2250 * @see #UNICODE_WORD_BOUNDARY
2252 static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
2256 * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \< \></kbd></span>.
2257 * <p>By default, the engine considers a position between a word character
2258 * (<span class="REGEX"><kbd>\w</kbd></span>) and a non-word character
2259 * to be a word boundary.
2260 * <p>By this option, the engine checks word boundaries with the method of
2261 * 'Unicode Regular Expression Guidelines' Revision 4.
2263 * @see #RegularExpression(java.lang.String,int)
2264 * @see #setPattern(java.lang.String,int)
2266 static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
2271 static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
2275 static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
2277 * "X". XML Schema mode.
2279 static final int XMLSCHEMA_MODE = 1<<9;
2283 static final int SPECIAL_COMMA = 1<<10;
2286 private static final boolean isSet(int options, int flag) {
2287 return (options & flag) == flag;
2291 * Creates a new RegularExpression instance.
2293 * @param regex A regular expression
2294 * @exception ParseException <VAR>regex</VAR> is not conforming to the syntax.
2296 public RegularExpression(String regex) throws ParseException {
/**
 * Creates a new RegularExpression instance with options.
 *
 * @param regex A regular expression
 * @param options A String consisting of "i" "m" "s" "u" "w" "," "X"
 * @exception ParseException <VAR>regex</VAR> is not conforming to the syntax.
 */
public RegularExpression(String regex, String options) throws ParseException {
    setPattern(regex, options);
}
/**
 * Creates a new RegularExpression instance with options and an explicit locale.
 *
 * @param regex A regular expression
 * @param options A String consisting of "i" "m" "s" "u" "w" "," "X"
 * @param locale locale handed to the pattern parser
 * @exception ParseException <VAR>regex</VAR> is not conforming to the syntax.
 */
public RegularExpression(String regex, String options, Locale locale) throws ParseException {
    setPattern(regex, options, locale);
}
/**
 * Creates an instance from an already parsed token tree; nothing is parsed here.
 *
 * @param regex source text of the pattern
 * @param tok parsed token tree
 * @param parens number of groups
 * @param hasBackReferences whether the pattern contains back references
 * @param options option bit flags
 */
RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
    this.regex = regex;
    this.options = options;
    this.tokentree = tok;
    this.nofparen = parens;
    this.hasBackReferences = hasBackReferences;
}
2333 public void setPattern(String newPattern) throws ParseException {
2334 this.setPattern(newPattern, Locale.getDefault());
2337 public void setPattern(String newPattern, Locale locale) throws ParseException {
2338 this.setPattern(newPattern, this.options, locale);
2341 private void setPattern(String newPattern, int options, Locale locale) throws ParseException {
2342 this.regex = newPattern;
2343 this.options = options;
2344 RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
2345 ? new ParserForXMLSchema(locale) : new RegexParser(locale);
2346 this.tokentree = rp.parse(this.regex, this.options);
2347 this.nofparen = rp.parennumber;
2348 this.hasBackReferences = rp.hasBackReferences;
2350 this.operations = null;
2351 this.context = null;
2356 public void setPattern(String newPattern, String options) throws ParseException {
2357 this.setPattern(newPattern, options, Locale.getDefault());
2360 public void setPattern(String newPattern, String options, Locale locale) throws ParseException {
2361 this.setPattern(newPattern, REUtil.parseOptions(options), locale);
2367 public String getPattern() {
2372 * Represents this instance in String.
2375 public String toString() {
2376 return this.tokentree.toString(this.options & ~XMLSCHEMA_MODE);
2380 * Returns a {@link java.util.regex.Pattern}-compatible string representation of this expression.
2382 * @return A Pattern-compatible String representation
2384 public String toPatternString() {
2385 final String str = this.tokentree.toString(this.options);
2386 return isSet(options, XMLSCHEMA_MODE) ? "^" + str + "$" : str;
2390 * Returns a option string.
2391 * The order of letters in it may be different from a string specified
2392 * in a constructor or <code>setPattern()</code>.
2394 * @see #RegularExpression(java.lang.String,java.lang.String)
2395 * @see #setPattern(java.lang.String,java.lang.String)
2397 public String getOptions() {
2398 return REUtil.createOptionString(this.options);
2402 * Return true if patterns are the same and the options are equivalent.
2405 public boolean equals(Object obj) {
2409 if (!(obj instanceof RegularExpression)) {
2412 RegularExpression r = (RegularExpression)obj;
2413 return this.regex.equals(r.regex) && this.options == r.options;
2416 boolean equals(String pattern, int options) {
2417 return this.regex.equals(pattern) && this.options == options;
2424 public int hashCode() {
2425 return (this.regex+"/"+this.getOptions()).hashCode();
2429 * Return the number of regular expression groups.
2430 * This method returns 1 when the regular expression has no capturing-parenthesis.
2433 public int getNumberOfGroups() {
2434 return this.nofparen;
2437 // ================================================================
2439 private static final int WT_IGNORE = 0;
2440 private static final int WT_LETTER = 1;
2441 private static final int WT_OTHER = 2;
2442 private static final int getWordType0(char ch, int opts) {
2443 if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
2444 if (isSet(opts, USE_UNICODE_CATEGORY)) {
2445 return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
2447 return isWordChar(ch) ? WT_LETTER : WT_OTHER;
2450 switch (Character.getType(ch)) {
2451 case Character.UPPERCASE_LETTER: // L
2452 case Character.LOWERCASE_LETTER: // L
2453 case Character.TITLECASE_LETTER: // L
2454 case Character.MODIFIER_LETTER: // L
2455 case Character.OTHER_LETTER: // L
2456 case Character.LETTER_NUMBER: // N
2457 case Character.DECIMAL_DIGIT_NUMBER: // N
2458 case Character.OTHER_NUMBER: // N
2459 case Character.COMBINING_SPACING_MARK: // Mc
2462 case Character.FORMAT: // Cf
2463 case Character.NON_SPACING_MARK: // Mn
2464 case Character.ENCLOSING_MARK: // Mc
2467 case Character.CONTROL: // Cc
2484 // ================================================================
2486 static final int LINE_FEED = 0x000A;
2487 static final int CARRIAGE_RETURN = 0x000D;
2488 static final int LINE_SEPARATOR = 0x2028;
2489 static final int PARAGRAPH_SEPARATOR = 0x2029;
2491 private static final boolean isEOLChar(int ch) {
2492 return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
2493 || ch == PARAGRAPH_SEPARATOR;
2496 private static final boolean isWordChar(int ch) { // Legacy word characters
2521 private static final boolean matchIgnoreCase(int chardata, int ch) {
2522 if (chardata == ch) {
2525 if (chardata > 0xffff || ch > 0xffff) {
2528 char uch1 = Character.toUpperCase((char)chardata);
2529 char uch2 = Character.toUpperCase((char)ch);
2533 return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);