third-party/xsd-regex/src/main/java/org/opendaylight/yangtools/xsd/regex/RegularExpression.java

   1 /*
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.opendaylight.yangtools.xsd.regex;
  19
  20 import java.text.CharacterIterator;
  21 import java.util.ArrayDeque;
  22 import java.util.Deque;
  23 import java.util.Locale;
  24
  25 /**
  26  * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
  27  * This engine does not conform to the POSIX regular expression.
  28  *
  29  * <hr width="50%">
  30  * <h3>How to use</h3>
  31  *
  32  * <dl>
  33  *   <dt>A. Standard way
  34  *   <dd>
  35  * <pre>
  36  * RegularExpression re = new RegularExpression(<var>regex</var>);
  37  * if (re.matches(text)) { ... }
  38  * </pre>
  39  *
  40  *   <dt>B. Capturing groups
  41  *   <dd>
  42  * <pre>
  43  * RegularExpression re = new RegularExpression(<var>regex</var>);
  44  * Match match = new Match();
  45  * if (re.matches(text, match)) {
  46  *     ... // You can refer captured texts with methods of the <code>Match</code> class.
  47  * }
  48  * </pre>
  49  *
  50  * </dl>
  51  *
  52  * <h4>Case-insensitive matching</h4>
  53  * <pre>
  54  * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
   55  * if (re.matches(text)) { ... }
  56  * </pre>
  57  *
  58  * <h4>Options</h4>
  59  * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
  60  *    or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
  61  *    This <var>options</var> parameter consists of the following characters.
  62  * </p>
  63  * <dl>
  64  *   <dt><a name="I_OPTION"><code>"i"</code></a>
  65  *   <dd>This option indicates case-insensitive matching.
  66  *   <dt><a name="M_OPTION"><code>"m"</code></a>
  67  *   <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
  68  *   <dt><a name="S_OPTION"><code>"s"</code></a>
  69  *   <dd class="REGEX"><kbd>.</kbd> matches any one character.
  70  *   <dt><a name="U_OPTION"><code>"u"</code></a>
  71  *   <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> as becoming to Unicode.
  72  *   <dt><a name="W_OPTION"><code>"w"</code></a>
  73  *   <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
  74  *      'Unicode Regular Expression Guidelines' Revision 4.
  75  *      When "w" and "u" are specified at the same time,
  76  *      <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
  77  *   <dt><a name="COMMA_OPTION"><code>","</code></a>
  78  *   <dd>The parser treats a comma in a character class as a range separator.
  79  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
  80  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
  81  *
  82  *   <dt><a name="X_OPTION"><code>"X"</code></a>
  83  *   <dd class="REGEX">
   84  *       By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
   85  *       The <code>match()</code> method does not do substring matching
  86  *       but entire string matching.
  87  *
  88  * </dl>
  89  *
  90  * <hr width="50%">
  91  * <h3>Syntax</h3>
  92  * <table border="1" bgcolor="#ddeeff">
  93  *   <tr>
  94  *    <td>
  95  *     <h4>Differences from the Perl 5 regular expression</h4>
  96  *     <ul>
  97  *      <li>There is 6-digit hexadecimal character representation  (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
  98  *      <li>Supports subtraction, union, and intersection operations for character classes.
  99  *      <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
 100  *          <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
 101  *          <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
 102  *          <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
 103  *          <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
 104  *     </ul>
 105  *    </td>
 106  *   </tr>
 107  * </table>
 108  *
 109  * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
 110  * <ul>
 111  *   <li>Character
 112  *     <dl>
 113  *       <dt class="REGEX"><kbd>.</kbd> (A period)
 114  *       <dd>Matches any one character except the following characters.
 115  *       <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
 116  *           PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
 117  *       <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
 118  *       <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
 119  *           it matches any character including the above four characters.
 120  *
 121  *       <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
 122  *       <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
 123  *           CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
 124  *
 125  *       <dt class="REGEX"><kbd>\c</kbd><var>C</var>
 126  *       <dd>Matches a control character.
 127  *           The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
 128  *           '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
 129  *           It matches a control character of which the character code is less than
 130  *           the character code of the <var>C</var> by 0x0040.
 131  *       <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
 132  *           and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
 133  *
 134  *       <dt class="REGEX">a non-meta character
 135  *       <dd>Matches the character.
 136  *
 137  *       <dt class="REGEX"><KBD>\</KBD> + a meta character
 138  *       <dd>Matches the meta character.
 139  *
 140  *       <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
 141  *       <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
 142  *           You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
 143  *           variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
 144  *
 145  *       <!--
 146  *       <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
 147  *       <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
 148  *       -->
 149  *
 150  *       <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
 151  *       <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
 152  *
 153  *       <dt class="REGEX"><kbd>\g</kbd>
 154  *       <dd>Matches a grapheme.
 155  *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
 156  *
 157  *       <dt class="REGEX"><kbd>\X</kbd>
 158  *       <dd class="REGEX">Matches a combining character sequence.
 159  *       It is equivalent to <kbd>(?:\PM\pM*)</kbd>
 160  *     </dl>
 161  *   </li>
 162  *
 163  *   <li>Character class
 164  *     <dl>
 165 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
 166 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
 167  *       <dd>Positive character class.  It matches a character in ranges.
 168  *       <dd><var>R<sub>n</sub></var>:
 169  *       <ul>
 170  *         <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
 171  *             <p>This range matches the character.
 172  *         <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
 173  *             <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
 174 + *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
 175 + *             and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
 176  *             <p>...
 177  *         <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
 178  *             <p>These expressions specifies the same ranges as the following expressions.
 179  *       </ul>
 180  *       <p class="REGEX">Enumerated ranges are merged (union operation).
 181  *          <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
 182  *
 183  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
 184  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
 185  *       <dd>Negative character class.  It matches a character not in ranges.
 186  *
 187  *       <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
 188  *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
 189  *       <dd>Subtraction or union or intersection for character classes.
  190  *       <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
 191  *       <dd>The result of this operations is a <u>positive character class</u>
 192  *           even if an expression includes any negative character classes.
 193  *           You have to take care on this in case-insensitive matching.
 194  *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
 195  *           which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
 196  *           But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
 197  *           it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
 198  *           though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
 199  *
 200  *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
 201  *       <dd>Character class subtraction for the XML Schema.
 202  *           You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
 203  *
 204  *       <dt class="REGEX"><kbd>\d</kbd>
 205  *       <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
 206  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 207  *           <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
 208  *
 209  *       <dt class="REGEX"><kbd>\D</kbd>
 210  *       <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
 211  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 212  *           <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
 213  *
 214  *       <dt class="REGEX"><kbd>\s</kbd>
 215  *       <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
 216  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 217  *           <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
 218  *
 219  *       <dt class="REGEX"><kbd>\S</kbd>
 220  *       <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
 221  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 222  *           <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
 223  *
 224  *       <dt class="REGEX"><kbd>\w</kbd>
 225  *       <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
 226  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 227  *           <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
 228  *
 229  *       <dt class="REGEX"><kbd>\W</kbd>
 230  *       <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
 231  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 232  *           <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
 233  *
 234  *       <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
 235  *       <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
 236  *       The following names are available:
 237  *       <dl>
 238  *         <dt>Unicode General Categories:
 239  *         <dd><kbd>
 240  *       L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
 241  *       Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
 242  *         </kbd>
 243  *         <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
 244  *         <dt>Unicode Blocks:
 245  *         <dd><kbd>
 246  *       Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
 247  *       IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
 248  *       Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
 249  *       Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
 250  *       Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
 251  *       Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
 252  *       Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
 253  *       Miscellaneous Technical, Control Pictures, Optical Character Recognition,
 254  *       Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
 255  *       Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
 256  *       Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
 257  *       Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
 258  *       Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
 259  *       Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
 260  *       Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
 261  *       Small Form Variants, Arabic Presentation Forms-B, Specials,
 262  *       Halfwidth and Fullwidth Forms
 263  *         </kbd>
 264  *         <dt>Others:
 265  *         <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
  266  *         <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
  267  *         <dd><kbd>UNASSIGNED</kbd>
 268  *             (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
 269  *       </dl>
 270  *
 271  *       <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
 272  *       <dd>Matches one character not in the specified General Category or the specified Block.
 273  *     </dl>
 274  *   </li>
 275  *
 276  *   <li>Selection and Quantifier
 277  *     <dl>
 278  *       <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
 279  *       <dd>...
 280  *
 281  *       <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
 282  *       <dd>Matches 0 or more <var>X</var>.
 283  *
 284  *       <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
 285  *       <dd>Matches 1 or more <var>X</var>.
 286  *
 287  *       <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
 288  *       <dd>Matches 0 or 1 <var>X</var>.
 289  *
 290  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
 291  *       <dd>Matches <var>number</var> times.
 292  *
 293  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
 294  *       <dd>...
 295  *
 296  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
 297  *       <dd>...
 298  *
 299  *       <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
 300  *       <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
 301  *       <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
 302  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
 303  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
 304  *       <dd>Non-greedy matching.
 305  *     </dl>
 306  *   </li>
 307  *
 308  *   <li>Grouping, Capturing, and Back-reference
 309  *     <dl>
 310  *       <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
 311  *       <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
 312  *       If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
 313  *       you have to write "<KBD>(?:foo)+</KBD>".
 314  *
 315  *       <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
 316  *       <dd>Grouping with capturing.
  317  * It makes a group, and applications can find out
  318  * where in the target text a group matched by using the methods of a <code>Match</code> instance
  319  * after <code><a href="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
  320  * The 0th group means the whole of this regular expression.
  321  * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.
 322  *
 323  *   <p>For instance, a regular expression is
 324  *   "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
 325  *   and target text is
 326  *   "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
 327  *   <ul>
 328  *     <li><code>Match.getCapturedText(0)</code>:
 329  *     "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
 330  *     <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
 331  *     <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
 332  *   </ul>
 333  *
 334  *       <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
 335  *       <dd>
 336  *
 337  *       <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
 338  *       <dd>Independent expression group. ................
 339  *
 340  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
 341  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
 342  *       <dd>............................
 343  *       <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
 344  *           Note that it can not contain 'u'.
 345  *
 346  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
 347  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
 348  *       <dd>......
 349  *       <dd>These expressions must be at the beginning of a group.
 350  *     </dl>
 351  *   </li>
 352  *
 353  *   <li>Anchor
 354  *     <dl>
 355  *       <dt class="REGEX"><kbd>\A</kbd>
  356  *       <dd>Matches the beginning of the text.
 357  *
 358  *       <dt class="REGEX"><kbd>\Z</kbd>
 359  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
 360  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
 361  *
 362  *       <dt class="REGEX"><kbd>\z</kbd>
 363  *       <dd>Matches the end of the text.
 364  *
 365  *       <dt class="REGEX"><kbd>^</kbd>
 366  *       <dd>Matches the beginning of the text.  It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
 367  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
 368  *           it matches the beginning of the text, or after one of EOL characters (
 369  *           LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
 370  *           PARAGRAPH SEPARATOR (U+2029).)
 371  *
 372  *       <dt class="REGEX"><kbd>$</kbd>
 373  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
 374  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
 375  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
 376  *           it matches the end of the text, or before an EOL character.
 377  *
 378  *       <dt class="REGEX"><kbd>\b</kbd>
 379  *       <dd>Matches word boundary.
 380  *           (See <a href="#W_OPTION">a "w" option</a>)
 381  *
 382  *       <dt class="REGEX"><kbd>\B</kbd>
 383  *       <dd>Matches non word boundary.
 384  *           (See <a href="#W_OPTION">a "w" option</a>)
 385  *
 386  *       <dt class="REGEX"><kbd>\&lt;</kbd>
 387  *       <dd>Matches the beginning of a word.
 388  *           (See <a href="#W_OPTION">a "w" option</a>)
 389  *
 390  *       <dt class="REGEX"><kbd>\&gt;</kbd>
 391  *       <dd>Matches the end of a word.
 392  *           (See <a href="#W_OPTION">a "w" option</a>)
 393  *     </dl>
 394  *   </li>
 395  *   <li>Lookahead and lookbehind
 396  *     <dl>
 397  *       <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
 398  *       <dd>Lookahead.
 399  *
 400  *       <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
 401  *       <dd>Negative lookahead.
 402  *
 403  *       <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
 404  *       <dd>Lookbehind.
 405  *       <dd>(Note for text capturing......)
 406  *
 407  *       <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
 408  *       <dd>Negative lookbehind.
 409  *     </dl>
 410  *   </li>
 411  *
 412  *   <li>Misc.
 413  *     <dl>
 414  *       <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
 415  *       <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
 416  *       <dd>......
 417  *       <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
 418  *       <dd>Comment.  A comment string consists of characters except '<kbd>)</kbd>'.
 419  *           You can not write comments in character classes and before quantifiers.
 420  *     </dl>
 421  *   </li>
 422  * </ul>
 423  *
 424  *
 425  * <hr width="50%">
 426  * <h3>BNF for the regular expression</h3>
 427  * <pre>
 428  * regex ::= ('(?' options ')')? term ('|' term)*
 429  * term ::= factor+
 430  * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
 431  *            | '(?#' [^)]* ')'
 432  * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
 433  * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
 434  *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
 435  *          | '(?>' regex ')' | '(?' options ':' regex ')'
 436  *          | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
 437  * options ::= [imsw]* ('-' [imsw]+)?
 438  * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
 439  * looks ::= '(?=' regex ')'  | '(?!' regex ')'
 440  *           | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
 441  * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
 442  * category-block ::= '\' [pP] category-symbol-1
 443  *                    | ('\p{' | '\P{') (category-symbol | block-name
 444  *                                       | other-properties) '}'
 445  * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
  446  * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo'
 447  *                     | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
 448  *                     | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
 449  *                     | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
 450  *                     | 'Sm' | 'Sc' | 'Sk' | 'So'
 451  * block-name ::= (See above)
 452  * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
 453  * character-1 ::= (any character except meta-characters)
 454  *
 455  * char-class ::= '[' ranges ']'
 456  *                | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
 457  * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
 458  * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
 459  *           | range-char | range-char '-' range-char
 460  * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
 461  * code-point ::= '\x' hex-char hex-char
 462  *                | '\x{' hex-char+ '}'
 463  * <!--               | '\u005c u' hex-char hex-char hex-char hex-char
 464  * -->               | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
 465  * hex-char ::= [0-9a-fA-F]
 466  * character-2 ::= (any character except \[]-,)
 467  * </pre>
 468  *
 469  * <hr width="50%">
 470  * <h3>TODO</h3>
 471  * <ul>
 472  *   <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
 473  *     <ul>
 474  *       <li>2.4 Canonical Equivalents
 475  *       <li>Level 3
 476  *     </ul>
 477  *   <li>Parsing performance
 478  * </ul>
 479  *
 480  * <hr width="50%">
 481  *
 482  * @xerces.internal
 483  *
 484  * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
 485  * @version $Id: RegularExpression.java 961928 2010-07-08 20:43:46Z knoaman $
 486  */
 487 public class RegularExpression implements java.io.Serializable {
 488
 489     private static final long serialVersionUID = 6242499334195006401L;
 490
 491     static final boolean DEBUG = false;
 492
 493     /**
 494      * Compiles a token tree into an operation flow.
 495      */
 496     private synchronized void compile(Token tok) {
 497         if (this.operations != null) {
 498             return;
 499         }
 500         this.numberOfClosures = 0;
 501         this.operations = this.compile(tok, null, false);
 502     }
 503
 504     /**
 505      * Converts a token to an operation.
 506      */
 507     private Op compile(Token tok, Op next, boolean reverse) {
 508         Op ret;
 509         switch (tok.type) {
 510         case Token.DOT:
 511             ret = Op.createDot();
 512             ret.next = next;
 513             break;
 514
 515         case Token.CHAR:
 516             ret = Op.createChar(tok.getChar());
 517             ret.next = next;
 518             break;
 519
 520         case Token.ANCHOR:
 521             ret = Op.createAnchor(tok.getChar());
 522             ret.next = next;
 523             break;
 524
 525         case Token.RANGE:
 526         case Token.NRANGE:
 527             ret = Op.createRange(tok);
 528             ret.next = next;
 529             break;
 530
 531         case Token.CONCAT:
 532             ret = next;
 533             if (!reverse) {
 534                 for (int i = tok.size()-1;  i >= 0;  i --) {
 535                     ret = compile(tok.getChild(i), ret, false);
 536                 }
 537             } else {
 538                 for (int i = 0;  i < tok.size();  i ++) {
 539                     ret = compile(tok.getChild(i), ret, true);
 540                 }
 541             }
 542             break;
 543
 544         case Token.UNION:
 545             Op.UnionOp uni = Op.createUnion(tok.size());
 546             for (int i = 0;  i < tok.size();  i ++) {
 547                 uni.addElement(compile(tok.getChild(i), next, reverse));
 548             }
 549             ret = uni;                          // ret.next is null.
 550             break;
 551
 552         case Token.CLOSURE:
 553         case Token.NONGREEDYCLOSURE:
 554             Token child = tok.getChild(0);
 555             int min = tok.getMin();
 556             int max = tok.getMax();
 557             if (min >= 0 && min == max) { // {n}
 558                 ret = next;
 559                 for (int i = 0; i < min;  i ++) {
 560                     ret = compile(child, ret, reverse);
 561                 }
 562                 break;
 563             }
 564             if (min > 0 && max > 0) {
 565                 max -= min;
 566             }
 567             if (max > 0) {
 568                 // X{2,6} -> XX(X(X(XX?)?)?)?
 569                 ret = next;
 570                 for (int i = 0;  i < max;  i ++) {
 571                     Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
 572                     q.next = next;
 573                     q.setChild(compile(child, ret, reverse));
 574                     ret = q;
 575                 }
 576             } else {
 577                 Op.ChildOp op;
 578                 if (tok.type == Token.NONGREEDYCLOSURE) {
 579                     op = Op.createNonGreedyClosure();
 580                 } else {                        // Token.CLOSURE
 581                     op = Op.createClosure(this.numberOfClosures++);
 582                 }
 583                 op.next = next;
 584                 op.setChild(compile(child, op, reverse));
 585                 ret = op;
 586             }
 587             if (min > 0) {
 588                 for (int i = 0;  i < min;  i ++) {
 589                     ret = compile(child, ret, reverse);
 590                 }
 591             }
 592             break;
 593
 594         case Token.EMPTY:
 595             ret = next;
 596             break;
 597
 598         case Token.STRING:
 599             ret = Op.createString(tok.getString());
 600             ret.next = next;
 601             break;
 602
 603         case Token.BACKREFERENCE:
 604             ret = Op.createBackReference(tok.getReferenceNumber());
 605             ret.next = next;
 606             break;
 607
 608         case Token.PAREN:
 609             if (tok.getParenNumber() == 0) {
 610                 ret = compile(tok.getChild(0), next, reverse);
 611             } else if (reverse) {
 612                 next = Op.createCapture(tok.getParenNumber(), next);
 613                 next = compile(tok.getChild(0), next, reverse);
 614                 ret = Op.createCapture(-tok.getParenNumber(), next);
 615             } else {
 616                 next = Op.createCapture(-tok.getParenNumber(), next);
 617                 next = compile(tok.getChild(0), next, reverse);
 618                 ret = Op.createCapture(tok.getParenNumber(), next);
 619             }
 620             break;
 621
 622         case Token.LOOKAHEAD:
 623             ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
 624             break;
 625         case Token.NEGATIVELOOKAHEAD:
 626             ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
 627             break;
 628         case Token.LOOKBEHIND:
 629             ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
 630             break;
 631         case Token.NEGATIVELOOKBEHIND:
 632             ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
 633             break;
 634
 635         case Token.INDEPENDENT:
 636             ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
 637             break;
 638
 639         case Token.MODIFIERGROUP:
 640             ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
 641                                     ((Token.ModifierToken)tok).getOptions(),
 642                                     ((Token.ModifierToken)tok).getOptionsMask());
 643             break;
 644
 645         case Token.CONDITION:
 646             Token.ConditionToken ctok = (Token.ConditionToken)tok;
 647             int ref = ctok.refNumber;
 648             Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
 649             Op yes = compile(ctok.yes, next, reverse);
 650             Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
 651             ret = Op.createCondition(next, ref, condition, yes, no);
 652             break;
 653
 654         default:
 655             throw new RuntimeException("Unknown token type: "+tok.type);
 656         } // switch (tok.type)
 657         return ret;
 658     }
 659
 660
 661 //Public
 662
 663     /**
 664      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 665      *
 666      * @return true if the target is matched to this regular expression.
 667      */
 668     public boolean matches(char[]  target) {
 669         return this.matches(target, 0,  target .length , (Match)null);
 670     }
 671
 672     /**
 673      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 674      * in specified range or not.
 675      *
 676      * @param start Start offset of the range.
 677      * @param end  End offset +1 of the range.
 678      * @return true if the target is matched to this regular expression.
 679      */
 680     public boolean matches(char[]  target, int start, int end) {
 681         return this.matches(target, start, end, (Match)null);
 682     }
 683
 684     /**
 685      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 686      *
 687      * @param match A Match instance for storing matching result.
 688      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 689      */
 690     public boolean matches(char[]  target, Match match) {
 691         return this.matches(target, 0,  target .length , match);
 692     }
 693
 694
 695     /**
 696      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 697      * in specified range or not.
 698      *
 699      * @param start Start offset of the range.
 700      * @param end  End offset +1 of the range.
 701      * @param match A Match instance for storing matching result.
 702      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 703      */
 704     public boolean matches(char[] target, int start, int end, Match match) {
 705
 706         synchronized (this) {
 707             if (this.operations == null) {
 708                 this.prepare();
 709             }
 710             if (this.context == null) {
 711                 this.context = new Context();
 712             }
 713         }
 714         Context con = null;
 715         synchronized (this.context) {
 716             con = this.context.inuse ? new Context() : this.context;
 717             con.reset(target, start, end, this.numberOfClosures);
 718         }
 719         if (match != null) {
 720             match.setNumberOfGroups(this.nofparen);
 721             match.setSource(target);
 722         } else if (this.hasBackReferences) {
 723             match = new Match();
 724             match.setNumberOfGroups(this.nofparen);
 725             // Need not to call setSource() because
 726             // a caller can not access this match instance.
 727         }
 728         con.match = match;
 729
 730         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
 731             int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
 732             //System.err.println("DEBUG: matchEnd="+matchEnd);
 733             if (matchEnd == con.limit) {
 734                 if (con.match != null) {
 735                     con.match.setBeginning(0, con.start);
 736                     con.match.setEnd(0, matchEnd);
 737                 }
 738                 con.setInUse(false);
 739                 return true;
 740             }
 741             return false;
 742         }
 743
 744         /*
 745          * The pattern has only fixed string.
 746          * The engine uses Boyer-Moore.
 747          */
 748         if (this.fixedStringOnly) {
 749             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
 750             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 751             if (o >= 0) {
 752                 if (con.match != null) {
 753                     con.match.setBeginning(0, o);
 754                     con.match.setEnd(0, o+this.fixedString.length());
 755                 }
 756                 con.setInUse(false);
 757                 return true;
 758             }
 759             con.setInUse(false);
 760             return false;
 761         }
 762
 763         /*
 764          * The pattern contains a fixed string.
 765          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
 766          * If not, it return with false.
 767          */
 768         if (this.fixedString != null) {
 769             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 770             if (o < 0) {
 771                 //System.err.println("Non-match in fixed-string search.");
 772                 con.setInUse(false);
 773                 return false;
 774             }
 775         }
 776
 777         int limit = con.limit-this.minlength;
 778         int matchStart;
 779         int matchEnd = -1;
 780
 781         /*
 782          * Checks whether the expression starts with ".*".
 783          */
 784         if (this.operations != null
 785             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
 786             if (isSet(this.options, SINGLE_LINE)) {
 787                 matchStart = con.start;
 788                 matchEnd = this. match(con, this.operations, con.start, 1, this.options);
 789             } else {
 790                 boolean previousIsEOL = true;
 791                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 792                     int ch =  target [  matchStart ] ;
 793                     if (isEOLChar(ch)) {
 794                         previousIsEOL = true;
 795                     } else {
 796                         if (previousIsEOL) {
 797                             if (0 <= (matchEnd = this. match(con, this.operations,
 798                                                              matchStart, 1, this.options))) {
 799                                 break;
 800                             }
 801                         }
 802                         previousIsEOL = false;
 803                     }
 804                 }
 805             }
 806         }
 807
 808         /*
 809          * Optimization against the first character.
 810          */
 811         else if (this.firstChar != null) {
 812             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
 813             RangeToken range = this.firstChar;
 814             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 815                 int ch =  target [matchStart] ;
 816                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
 817                     ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]);
 818                 }
 819                 if (!range.match(ch))  {
 820                     continue;
 821                 }
 822                 if (0 <= (matchEnd = this. match(con, this.operations,
 823                                                  matchStart, 1, this.options))) {
 824                         break;
 825                 }
 826             }
 827         }
 828
 829         /*
 830          * Straightforward matching.
 831          */
 832         else {
 833             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 834                 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) {
 835                     break;
 836                 }
 837             }
 838         }
 839
 840         if (matchEnd >= 0) {
 841             if (con.match != null) {
 842                 con.match.setBeginning(0, matchStart);
 843                 con.match.setEnd(0, matchEnd);
 844             }
 845             con.setInUse(false);
 846             return true;
 847         } else {
 848             con.setInUse(false);
 849             return false;
 850         }
 851     }
 852
 853     /**
 854      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 855      *
 856      * @return true if the target is matched to this regular expression.
 857      */
 858     public boolean matches(String  target) {
 859         return this.matches(target, 0,  target .length() , (Match)null);
 860     }
 861
 862     /**
 863      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 864      * in specified range or not.
 865      *
 866      * @param start Start offset of the range.
 867      * @param end  End offset +1 of the range.
 868      * @return true if the target is matched to this regular expression.
 869      */
 870     public boolean matches(String  target, int start, int end) {
 871         return this.matches(target, start, end, (Match)null);
 872     }
 873
 874     /**
 875      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 876      *
 877      * @param match A Match instance for storing matching result.
 878      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 879      */
 880     public boolean matches(String  target, Match match) {
 881         return this.matches(target, 0,  target .length() , match);
 882     }
 883
 884     /**
 885      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 886      * in specified range or not.
 887      *
 888      * @param start Start offset of the range.
 889      * @param end  End offset +1 of the range.
 890      * @param match A Match instance for storing matching result.
 891      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 892      */
 893     public boolean matches(String  target, int start, int end, Match match) {
 894
 895         synchronized (this) {
 896             if (this.operations == null) {
 897                 this.prepare();
 898             }
 899             if (this.context == null) {
 900                 this.context = new Context();
 901             }
 902         }
 903         Context con = null;
 904         synchronized (this.context) {
 905             con = this.context.inuse ? new Context() : this.context;
 906             con.reset(target, start, end, this.numberOfClosures);
 907         }
 908         if (match != null) {
 909             match.setNumberOfGroups(this.nofparen);
 910             match.setSource(target);
 911         } else if (this.hasBackReferences) {
 912             match = new Match();
 913             match.setNumberOfGroups(this.nofparen);
 914             // Need not to call setSource() because
 915             // a caller can not access this match instance.
 916         }
 917         con.match = match;
 918
 919         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
 920             if (DEBUG) {
 921                 System.err.println("target string="+target);
 922             }
 923             int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
 924             if (DEBUG) {
 925                 System.err.println("matchEnd="+matchEnd);
 926                 System.err.println("con.limit="+con.limit);
 927             }
 928             if (matchEnd == con.limit) {
 929                 if (con.match != null) {
 930                     con.match.setBeginning(0, con.start);
 931                     con.match.setEnd(0, matchEnd);
 932                 }
 933                 con.setInUse(false);
 934                 return true;
 935             }
 936             return false;
 937         }
 938
 939         /*
 940          * The pattern has only fixed string.
 941          * The engine uses Boyer-Moore.
 942          */
 943         if (this.fixedStringOnly) {
 944             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
 945             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 946             if (o >= 0) {
 947                 if (con.match != null) {
 948                     con.match.setBeginning(0, o);
 949                     con.match.setEnd(0, o+this.fixedString.length());
 950                 }
 951                 con.setInUse(false);
 952                 return true;
 953             }
 954             con.setInUse(false);
 955             return false;
 956         }
 957
 958         /*
 959          * The pattern contains a fixed string.
 960          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
 961          * If not, it return with false.
 962          */
 963         if (this.fixedString != null) {
 964             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 965             if (o < 0) {
 966                 //System.err.println("Non-match in fixed-string search.");
 967                 con.setInUse(false);
 968                 return false;
 969             }
 970         }
 971
 972         int limit = con.limit-this.minlength;
 973         int matchStart;
 974         int matchEnd = -1;
 975
 976         /*
 977          * Checks whether the expression starts with ".*".
 978          */
 979         if (this.operations != null
 980             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
 981             if (isSet(this.options, SINGLE_LINE)) {
 982                 matchStart = con.start;
 983                 matchEnd = this.match(con, this.operations, con.start, 1, this.options);
 984             } else {
 985                 boolean previousIsEOL = true;
 986                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 987                     int ch =  target .charAt(  matchStart ) ;
 988                     if (isEOLChar(ch)) {
 989                         previousIsEOL = true;
 990                     } else {
 991                         if (previousIsEOL) {
 992                             if (0 <= (matchEnd = this.match(con, this.operations,
 993                                                             matchStart, 1, this.options))) {
 994                                 break;
 995                             }
 996                         }
 997                         previousIsEOL = false;
 998                     }
 999                 }
1000             }
1001         }
1002
1003         /*
1004          * Optimization against the first character.
1005          */
1006         else if (this.firstChar != null) {
1007             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1008             RangeToken range = this.firstChar;
1009             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1010                 int ch =  target .charAt(  matchStart ) ;
1011                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1012                     ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1));
1013                 }
1014                 if (!range.match(ch)) {
1015                     continue;
1016                 }
1017                 if (0 <= (matchEnd = this.match(con, this.operations,
1018                                                 matchStart, 1, this.options))) {
1019                         break;
1020                 }
1021             }
1022         }
1023
1024         /*
1025          * Straightforward matching.
1026          */
1027         else {
1028             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1029                 if (0 <= (matchEnd = this.match(con, this.operations, matchStart, 1, this.options))) {
1030                     break;
1031                 }
1032             }
1033         }
1034
1035         if (matchEnd >= 0) {
1036             if (con.match != null) {
1037                 con.match.setBeginning(0, matchStart);
1038                 con.match.setEnd(0, matchEnd);
1039             }
1040             con.setInUse(false);
1041             return true;
1042         } else {
1043             con.setInUse(false);
1044             return false;
1045         }
1046     }
1047
1048     /**
1049      * @return -1 when not match; offset of the end of matched string when match.
1050      */
1051     private int match(Context con, Op op, int offset, int dx, int opts) {
1052         final ExpressionTarget target = con.target;
1053         final Deque<Op> opStack = new ArrayDeque<>();
1054         final IntStack dataStack = new IntStack();
1055         final boolean isSetIgnoreCase = isSet(opts, IGNORE_CASE);
1056         int retValue = -1;
1057         boolean returned = false;
1058
1059         for (;;) {
1060             if (op == null || offset > con.limit || offset < con.start) {
1061                 if (op == null) {
1062                     retValue = isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
1063                 }
1064                 else {
1065                    retValue = -1;
1066                 }
1067                 returned = true;
1068             }
1069             else  {
1070                 retValue = -1;
1071                 // dx value is either 1 or -1
1072                 switch (op.type) {
1073                 case Op.CHAR:
1074                     {
1075                         final int o1 = (dx > 0) ? offset : offset -1;
1076                         if (o1 >= con.limit || o1 < 0 || !matchChar(op.getData(), target.charAt(o1), isSetIgnoreCase)) {
1077                             returned = true;
1078                             break;
1079                         }
1080                         offset += dx;
1081                         op = op.next;
1082                     }
1083                     break;
1084
1085                 case Op.DOT:
1086                     {
1087                         int o1 = (dx > 0) ? offset : offset - 1;
1088                         if (o1 >= con.limit || o1 < 0) {
1089                             returned = true;
1090                             break;
1091                         }
1092                         if (isSet(opts, SINGLE_LINE)) {
1093                             if (REUtil.isHighSurrogate(target.charAt(o1)) && o1+dx >= 0 && o1+dx < con.limit) {
1094                                 o1 += dx;
1095                             }
1096                         }
1097                         else {
1098                             int ch = target.charAt(o1);
1099                             if (REUtil.isHighSurrogate(ch) && o1+dx >= 0 && o1+dx < con.limit) {
1100                                 o1 += dx;
1101                                 ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1102                             }
1103                             if (isEOLChar(ch)) {
1104                                 returned = true;
1105                                 break;
1106                             }
1107                         }
1108                         offset = (dx > 0) ? o1 + 1 : o1;
1109                         op = op.next;
1110                     }
1111                     break;
1112
1113                 case Op.RANGE:
1114                 case Op.NRANGE:
1115                     {
1116                         int o1 = (dx > 0) ? offset : offset -1;
1117                         if (o1 >= con.limit || o1 < 0) {
1118                             returned = true;
1119                             break;
1120                         }
1121                         int ch = target.charAt(offset);
1122                         if (REUtil.isHighSurrogate(ch) && o1+dx < con.limit && o1+dx >=0) {
1123                             o1 += dx;
1124                             ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1125                         }
1126                         final RangeToken tok = op.getToken();
1127                         if (!tok.match(ch)) {
1128                             returned = true;
1129                             break;
1130                         }
1131                         offset = (dx > 0) ? o1+1 : o1;
1132                         op = op.next;
1133                     }
1134                     break;
1135
1136                 case Op.ANCHOR:
1137                     {
1138                         if (!matchAnchor(target, op, con, offset, opts)) {
1139                             returned = true;
1140                             break;
1141                         }
1142                         op = op.next;
1143                     }
1144                     break;
1145
1146                 case Op.BACKREFERENCE:
1147                     {
1148                         int refno = op.getData();
1149                         if (refno <= 0 || refno >= this.nofparen) {
1150                             throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
1151                         }
1152                         if (con.match.getBeginning(refno) < 0 || con.match.getEnd(refno) < 0) {
1153                             returned = true;
1154                             break;
1155                         }
1156                         int o2 = con.match.getBeginning(refno);
1157                         int literallen = con.match.getEnd(refno)-o2;
1158                         if (dx > 0) {
1159                             if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, o2, literallen)) {
1160                                 returned = true;
1161                                 break;
1162                             }
1163                             offset += literallen;
1164                         }
1165                         else {
1166                             if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, o2, literallen)) {
1167                                 returned = true;
1168                                 break;
1169                             }
1170                             offset -= literallen;
1171                         }
1172                         op = op.next;
1173                     }
1174                     break;
1175
1176                 case Op.STRING:
1177                     {
1178                         String literal = op.getString();
1179                         int literallen = literal.length();
1180                         if (dx > 0) {
1181                             if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, literal, literallen)) {
1182                                 returned = true;
1183                                 break;
1184                             }
1185                             offset += literallen;
1186                         }
1187                         else {
1188                             if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, literal, literallen)) {
1189                                 returned = true;
1190                                 break;
1191                             }
1192                             offset -= literallen;
1193                         }
1194                         op = op.next;
1195                     }
1196                     break;
1197
1198                 case Op.CLOSURE:
1199                     {
1200                         // Saves current position to avoid zero-width repeats.
1201                         final int id = op.getData();
1202                         if (con.closureContexts[id].contains(offset)) {
1203                             returned = true;
1204                             break;
1205                         }
1206
1207                         con.closureContexts[id].addOffset(offset);
1208                     }
1209                     // fall through
1210
1211                 case Op.QUESTION:
1212                     {
1213                         opStack.push(op);
1214                         dataStack.push(offset);
1215                         op = op.getChild();
1216                     }
1217                     break;
1218
1219                 case Op.NONGREEDYCLOSURE:
1220                 case Op.NONGREEDYQUESTION:
1221                     {
1222                         opStack.push(op);
1223                         dataStack.push(offset);
1224                         op = op.next;
1225                     }
1226                     break;
1227
1228                 case Op.UNION:
1229                     if (op.size() == 0) {
1230                         returned = true;
1231                     }
1232                     else {
1233                         opStack.push(op);
1234                         dataStack.push(0);
1235                         dataStack.push(offset);
1236                         op = op.elementAt(0);
1237                     }
1238                     break;
1239
1240                 case Op.CAPTURE:
1241                     {
1242                         final int refno = op.getData();
1243                         if (con.match != null) {
1244                             if (refno > 0) {
1245                                 dataStack.push(con.match.getBeginning(refno));
1246                                 con.match.setBeginning(refno, offset);
1247                             }
1248                             else {
1249                                 final int index = -refno;
1250                                 dataStack.push(con.match.getEnd(index));
1251                                 con.match.setEnd(index, offset);
1252                             }
1253                             opStack.push(op);
1254                             dataStack.push(offset);
1255                         }
1256                         op = op.next;
1257                     }
1258                     break;
1259
1260                 case Op.LOOKAHEAD:
1261                 case Op.NEGATIVELOOKAHEAD:
1262                 case Op.LOOKBEHIND:
1263                 case Op.NEGATIVELOOKBEHIND:
1264                     {
1265                         opStack.push(op);
1266                         dataStack.push(dx);
1267                         dataStack.push(offset);
1268                         dx = (op.type == Op.LOOKAHEAD || op.type == Op.NEGATIVELOOKAHEAD) ? 1 : -1;
1269                         op = op.getChild();
1270                     }
1271                     break;
1272
1273                 case Op.INDEPENDENT:
1274                     {
1275                         opStack.push(op);
1276                         dataStack.push(offset);
1277                         op = op.getChild();
1278                     }
1279                     break;
1280
1281                 case Op.MODIFIER:
1282                     {
1283                         int localopts = opts;
1284                         localopts |= op.getData();
1285                         localopts &= ~op.getData2();
1286                         opStack.push(op);
1287                         dataStack.push(opts);
1288                         dataStack.push(offset);
1289                         opts = localopts;
1290                         op = op.getChild();
1291                     }
1292                     break;
1293
1294                 case Op.CONDITION:
1295                     {
1296                         Op.ConditionOp cop = (Op.ConditionOp)op;
1297                         if (cop.refNumber > 0) {
1298                             if (cop.refNumber >= this.nofparen) {
1299                                 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
1300                             }
1301                             if (con.match.getBeginning(cop.refNumber) >= 0
1302                                     && con.match.getEnd(cop.refNumber) >= 0) {
1303                                 op = cop.yes;
1304                             }
1305                             else if (cop.no != null) {
1306                                 op = cop.no;
1307                             }
1308                             else {
1309                                 op = cop.next;
1310                             }
1311                         }
1312                         else {
1313                             opStack.push(op);
1314                             dataStack.push(offset);
1315                             op = cop.condition;
1316                         }
1317                     }
1318                     break;
1319
1320                 default:
1321                     throw new RuntimeException("Unknown operation type: " + op.type);
1322                 }
1323             }
1324
1325             // handle recursive operations
1326             while (returned) {
1327                 // exhausted all the operations
1328                 if (opStack.isEmpty()) {
1329                     return retValue;
1330                 }
1331
1332                 op = opStack.pop();
1333                 offset = dataStack.pop();
1334
1335                 switch (op.type) {
1336                 case Op.CLOSURE:
1337                 case Op.QUESTION:
1338                     if (retValue < 0) {
1339                         op = op.next;
1340                         returned = false;
1341                     }
1342                     break;
1343
1344                 case Op.NONGREEDYCLOSURE:
1345                 case Op.NONGREEDYQUESTION:
1346                     if (retValue < 0) {
1347                         op = op.getChild();
1348                         returned = false;
1349                     }
1350                     break;
1351
1352                 case Op.UNION:
1353                     {
1354                         int unionIndex = dataStack.pop();
1355                         if (DEBUG) {
1356                             System.err.println("UNION: "+unionIndex+", ret="+retValue);
1357                         }
1358
1359                         if (retValue < 0) {
1360                             if (++unionIndex < op.size()) {
1361                                 opStack.push(op);
1362                                 dataStack.push(unionIndex);
1363                                 dataStack.push(offset);
1364                                 op = op.elementAt(unionIndex);
1365                                 returned = false;
1366                             }
1367                             else {
1368                                 retValue = -1;
1369                             }
1370                         }
1371                     }
1372                     break;
1373
1374                 case Op.CAPTURE:
1375                     final int refno = op.getData();
1376                     final int saved = dataStack.pop();
1377                     if (retValue < 0) {
1378                         if (refno > 0) {
1379                             con.match.setBeginning(refno, saved);
1380                         }
1381                         else {
1382                             con.match.setEnd(-refno, saved);
1383                         }
1384                     }
1385                     break;
1386
1387                 case Op.LOOKAHEAD:
1388                 case Op.LOOKBEHIND:
1389                     {
1390                         dx = dataStack.pop();
1391                         if (0 <= retValue) {
1392                             op = op.next;
1393                             returned = false;
1394                         }
1395                         retValue = -1;
1396                     }
1397                     break;
1398
1399                 case Op.NEGATIVELOOKAHEAD:
1400                 case Op.NEGATIVELOOKBEHIND:
1401                     {
1402                         dx = dataStack.pop();
1403                         if (0 > retValue)  {
1404                             op = op.next;
1405                             returned = false;
1406                         }
1407                         retValue = -1;
1408                     }
1409                     break;
1410
1411                 case Op.MODIFIER:
1412                     opts = dataStack.pop();
1413                     // fall through
1414
1415                 case Op.INDEPENDENT:
1416                     if (retValue >= 0)  {
1417                         offset = retValue;
1418                         op = op.next;
1419                         returned = false;
1420                     }
1421                     break;
1422
1423                 case Op.CONDITION:
1424                     {
1425                         final Op.ConditionOp cop = (Op.ConditionOp)op;
1426                         if (0 <= retValue) {
1427                             op = cop.yes;
1428                         }
1429                         else if (cop.no != null) {
1430                             op = cop.no;
1431                         }
1432                         else {
1433                             op = cop.next;
1434                         }
1435                     }
1436                     returned = false;
1437                     break;
1438
1439                 default:
1440                     break;
1441                 }
1442             }
1443         }
1444     }
1445
1446     private static boolean matchChar(int ch, int other, boolean ignoreCase) {
1447         return (ignoreCase) ? matchIgnoreCase(ch, other) : ch == other;
1448     }
1449
1450     boolean matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts) {
1451         boolean go = false;
1452         switch (op.getData()) {
1453         case '^':
1454             if (isSet(opts, MULTIPLE_LINES)) {
1455                 if (!(offset == con.start
1456                       || offset > con.start && offset < con.limit && isEOLChar(target.charAt(offset-1)))) {
1457                     return false;
1458                 }
1459             } else {
1460                 if (offset != con.start) {
1461                     return false;
1462                 }
1463             }
1464             break;
1465
1466         case '@':                         // Internal use only.
1467             // The @ always matches line beginnings.
1468             if (!(offset == con.start
1469                   || offset > con.start && isEOLChar(target.charAt(offset-1)))) {
1470                 return false;
1471             }
1472             break;
1473
1474         case '$':
1475             if (isSet(opts, MULTIPLE_LINES)) {
1476                 if (!(offset == con.limit
1477                       || offset < con.limit && isEOLChar(target.charAt(offset)))) {
1478                     return false;
1479                 }
1480             } else {
1481                 if (!(offset == con.limit
1482                       || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1483                       || offset+2 == con.limit &&  target.charAt(offset) == CARRIAGE_RETURN
1484                       &&  target.charAt(offset+1) == LINE_FEED)) {
1485                     return false;
1486                 }
1487             }
1488             break;
1489
1490         case 'A':
1491             if (offset != con.start) {
1492                 return false;
1493             }
1494             break;
1495
1496         case 'Z':
1497             if (!(offset == con.limit
1498                   || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1499                   || offset+2 == con.limit &&  target.charAt(offset) == CARRIAGE_RETURN
1500                   &&  target.charAt(offset+1) == LINE_FEED)) {
1501                 return false;
1502             }
1503             break;
1504
1505         case 'z':
1506             if (offset != con.limit) {
1507                 return false;
1508             }
1509             break;
1510
1511         case 'b':
1512             if (con.length == 0) {
1513                 return false;
1514             }
1515             {
1516                 int after = getWordType(target, con.start, con.limit, offset, opts);
1517                 if (after == WT_IGNORE) {
1518                     return false;
1519                 }
1520                 int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1521                 if (after == before) {
1522                     return false;
1523                 }
1524             }
1525             break;
1526
1527         case 'B':
1528             if (con.length == 0) {
1529                 go = true;
1530             } else {
1531                 int after = getWordType(target, con.start, con.limit, offset, opts);
1532                 go = after == WT_IGNORE
1533                      || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1534             }
1535             if (!go) {
1536                 return false;
1537             }
1538             break;
1539
1540         case '<':
1541             if (con.length == 0 || offset == con.limit) {
1542                 return false;
1543             }
1544             if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1545                 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) {
1546                 return false;
1547             }
1548             break;
1549
1550         case '>':
1551             if (con.length == 0 || offset == con.start) {
1552                 return false;
1553             }
1554             if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1555                 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) {
1556                 return false;
1557             }
1558             break;
1559         } // switch anchor type
1560
1561         return true;
1562     }
1563
1564     private static final int getPreviousWordType(ExpressionTarget target, int begin, int end,
1565                                                  int offset, int opts) {
1566         int ret = getWordType(target, begin, end, --offset, opts);
1567         while (ret == WT_IGNORE) {
1568             ret = getWordType(target, begin, end, --offset, opts);
1569         }
1570         return ret;
1571     }
1572
1573     private static final int getWordType(ExpressionTarget target, int begin, int end,
1574                                          int offset, int opts) {
1575         if (offset < begin || offset >= end) {
1576             return WT_OTHER;
1577         }
1578         return getWordType0(target.charAt(offset) , opts);
1579     }
1580
1581
1582     /**
1583      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1584      *
1585      * @return true if the target is matched to this regular expression.
1586      */
1587     public boolean matches(CharacterIterator target) {
1588         return this.matches(target, (Match)null);
1589     }
1590
1591
1592     /**
1593      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1594      *
1595      * @param match A Match instance for storing matching result.
1596      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1597      */
1598     public boolean matches(CharacterIterator  target, Match match) {
1599         int start = target.getBeginIndex();
1600         int end = target.getEndIndex();
1601
1602
1603
1604         synchronized (this) {
1605             if (this.operations == null) {
1606                 this.prepare();
1607             }
1608             if (this.context == null) {
1609                 this.context = new Context();
1610             }
1611         }
1612         Context con = null;
1613         synchronized (this.context) {
1614             con = this.context.inuse ? new Context() : this.context;
1615             con.reset(target, start, end, this.numberOfClosures);
1616         }
1617         if (match != null) {
1618             match.setNumberOfGroups(this.nofparen);
1619             match.setSource(target);
1620         } else if (this.hasBackReferences) {
1621             match = new Match();
1622             match.setNumberOfGroups(this.nofparen);
1623             // Need not to call setSource() because
1624             // a caller can not access this match instance.
1625         }
1626         con.match = match;
1627
1628         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
1629             int matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1630             //System.err.println("DEBUG: matchEnd="+matchEnd);
1631             if (matchEnd == con.limit) {
1632                 if (con.match != null) {
1633                     con.match.setBeginning(0, con.start);
1634                     con.match.setEnd(0, matchEnd);
1635                 }
1636                 con.setInUse(false);
1637                 return true;
1638             }
1639             return false;
1640         }
1641
1642         /*
1643          * The pattern has only fixed string.
1644          * The engine uses Boyer-Moore.
1645          */
1646         if (this.fixedStringOnly) {
1647             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1648             int o = this.fixedStringTable.matches(target, con.start, con.limit);
1649             if (o >= 0) {
1650                 if (con.match != null) {
1651                     con.match.setBeginning(0, o);
1652                     con.match.setEnd(0, o+this.fixedString.length());
1653                 }
1654                 con.setInUse(false);
1655                 return true;
1656             }
1657             con.setInUse(false);
1658             return false;
1659         }
1660
1661         /*
1662          * The pattern contains a fixed string.
1663          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1664          * If not, it return with false.
1665          */
1666         if (this.fixedString != null) {
1667             int o = this.fixedStringTable.matches(target, con.start, con.limit);
1668             if (o < 0) {
1669                 //System.err.println("Non-match in fixed-string search.");
1670                 con.setInUse(false);
1671                 return false;
1672             }
1673         }
1674
1675         int limit = con.limit-this.minlength;
1676         int matchStart;
1677         int matchEnd = -1;
1678
1679         /*
1680          * Checks whether the expression starts with ".*".
1681          */
1682         if (this.operations != null
1683             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
1684             if (isSet(this.options, SINGLE_LINE)) {
1685                 matchStart = con.start;
1686                 matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1687             } else {
1688                 boolean previousIsEOL = true;
1689                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1690                     int ch =  target .setIndex(  matchStart ) ;
1691                     if (isEOLChar(ch)) {
1692                         previousIsEOL = true;
1693                     } else {
1694                         if (previousIsEOL) {
1695                             if (0 <= (matchEnd = this.match(con, this.operations,
1696                                                             matchStart, 1, this.options))) {
1697                                 break;
1698                             }
1699                         }
1700                         previousIsEOL = false;
1701                     }
1702                 }
1703             }
1704         }
1705
1706         /*
1707          * Optimization against the first character.
1708          */
1709         else if (this.firstChar != null) {
1710             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1711             RangeToken range = this.firstChar;
1712             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1713                 int ch =  target .setIndex(  matchStart ) ;
1714                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1715                     ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1));
1716                 }
1717                 if (!range.match(ch)) {
1718                     continue;
1719                 }
1720                 if (0 <= (matchEnd = this.match(con, this.operations,
1721                                                 matchStart, 1, this.options))) {
1722                     break;
1723                 }
1724             }
1725         }
1726
1727         /*
1728          * Straightforward matching.
1729          */
1730         else {
1731             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1732                 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) {
1733                     break;
1734                 }
1735             }
1736         }
1737
1738         if (matchEnd >= 0) {
1739             if (con.match != null) {
1740                 con.match.setBeginning(0, matchStart);
1741                 con.match.setEnd(0, matchEnd);
1742             }
1743             con.setInUse(false);
1744             return true;
1745         } else {
1746             con.setInUse(false);
1747             return false;
1748         }
1749     }
1750
1751     // ================================================================
1752
1753     /**
1754      * A regular expression.
1755      * @serial
1756      */
1757     String regex;
1758     /**
1759      * @serial
1760      */
1761     int options;
1762
1763     /**
1764      * The number of parenthesis in the regular expression.
1765      * @serial
1766      */
1767     int nofparen;
1768     /**
1769      * Internal representation of the regular expression.
1770      * @serial
1771      */
1772     Token tokentree;
1773
1774     boolean hasBackReferences = false;
1775
1776     transient int minlength;
1777     transient Op operations = null;
1778     transient int numberOfClosures;
1779     transient Context context = null;
1780     transient RangeToken firstChar = null;
1781
1782     transient String fixedString = null;
1783     transient int fixedStringOptions;
1784     transient BMPattern fixedStringTable = null;
1785     transient boolean fixedStringOnly = false;
1786
1787     static abstract class ExpressionTarget {
1788         abstract char charAt(int index);
1789         abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen);
1790         abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen);
1791     }
1792
1793     static final class StringTarget extends ExpressionTarget {
1794
1795         private String target;
1796
1797         StringTarget(String target) {
1798             this.target = target;
1799         }
1800
1801         final void resetTarget(String target) {
1802             this.target = target;
1803         }
1804
1805         @Override
1806         final char charAt(int index) {
1807             return target.charAt(index);
1808         }
1809
1810         @Override
1811         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1812                               String part, int partlen) {
1813             if (limit-offset < partlen) {
1814                 return false;
1815             }
1816             return (ignoreCase) ? target.regionMatches(true, offset, part, 0, partlen) : target.regionMatches(offset, part, 0, partlen);
1817         }
1818
1819         @Override
1820         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1821                                     int offset2, int partlen) {
1822             if (limit-offset < partlen) {
1823                 return false;
1824             }
1825             return (ignoreCase) ? target.regionMatches(true, offset, target, offset2, partlen)
1826                                 : target.regionMatches(offset, target, offset2, partlen);
1827         }
1828     }
1829
1830     static final class CharArrayTarget extends ExpressionTarget {
1831
1832         char[] target;
1833
1834         CharArrayTarget(char[] target) {
1835             this.target = target;
1836         }
1837
1838         final void resetTarget(char[] target) {
1839             this.target = target;
1840         }
1841
1842         @Override
1843         char charAt(int index) {
1844             return target[index];
1845         }
1846
1847         @Override
1848         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1849                 String part, int partlen) {
1850             if (offset < 0 || limit-offset < partlen)  {
1851                 return false;
1852             }
1853             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1854                                 : regionMatches(offset, limit, part, partlen);
1855         }
1856
1857         private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1858             int i = 0;
1859             while (partlen-- > 0) {
1860                 if (target[offset++] != part.charAt(i++)) {
1861                     return false;
1862                 }
1863             }
1864             return true;
1865         }
1866
1867         private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1868             int i = 0;
1869             while (partlen-- > 0) {
1870                 final char ch1 = target[offset++] ;
1871                 final char ch2 = part.charAt(i++);
1872                 if (ch1 == ch2) {
1873                     continue;
1874                 }
1875                 final char uch1 = Character.toUpperCase(ch1);
1876                 final char uch2 = Character.toUpperCase(ch2);
1877                 if (uch1 == uch2) {
1878                     continue;
1879                 }
1880                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1881                     return false;
1882                 }
1883             }
1884             return true;
1885         }
1886
1887         @Override
1888         final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1889             if (offset < 0 || limit-offset < partlen) {
1890                 return false;
1891             }
1892             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1893                                 : regionMatches(offset, limit, offset2, partlen);
1894         }
1895
1896         private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1897             int i = offset2;
1898             while (partlen-- > 0) {
1899                 if ( target [  offset++ ]  !=  target [  i++ ] ) {
1900                     return false;
1901                 }
1902             }
1903             return true;
1904         }
1905
1906         private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
1907             int i = offset2;
1908             while (partlen-- > 0) {
1909                 final char ch1 =  target[offset++] ;
1910                 final char ch2 =  target[i++] ;
1911                 if (ch1 == ch2) {
1912                     continue;
1913                 }
1914                 final char uch1 = Character.toUpperCase(ch1);
1915                 final char uch2 = Character.toUpperCase(ch2);
1916                 if (uch1 == uch2) {
1917                     continue;
1918                 }
1919                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1920                     return false;
1921                 }
1922             }
1923             return true;
1924         }
1925     }
1926
1927     static final class CharacterIteratorTarget extends ExpressionTarget {
1928         CharacterIterator target;
1929
1930         CharacterIteratorTarget(CharacterIterator target) {
1931             this.target = target;
1932         }
1933
1934         final void resetTarget(CharacterIterator target) {
1935             this.target = target;
1936         }
1937
1938         @Override
1939         final char charAt(int index) {
1940             return target.setIndex(index);
1941         }
1942
1943         @Override
1944         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1945                 String part, int partlen) {
1946             if (offset < 0 || limit-offset < partlen)  {
1947                 return false;
1948             }
1949             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1950                                 : regionMatches(offset, limit, part, partlen);
1951         }
1952
1953         private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1954             int i = 0;
1955             while (partlen-- > 0) {
1956                 if (target.setIndex(offset++) != part.charAt(i++)) {
1957                     return false;
1958                 }
1959             }
1960             return true;
1961         }
1962
1963         private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1964             int i = 0;
1965             while (partlen-- > 0) {
1966                 final char ch1 = target.setIndex(offset++) ;
1967                 final char ch2 = part.charAt(i++);
1968                 if (ch1 == ch2) {
1969                     continue;
1970                 }
1971                 final char uch1 = Character.toUpperCase(ch1);
1972                 final char uch2 = Character.toUpperCase(ch2);
1973                 if (uch1 == uch2) {
1974                     continue;
1975                 }
1976                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1977                     return false;
1978                 }
1979             }
1980             return true;
1981         }
1982
1983         @Override
1984         final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1985             if (offset < 0 || limit-offset < partlen) {
1986                 return false;
1987             }
1988             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1989                                 : regionMatches(offset, limit, offset2, partlen);
1990         }
1991
1992         private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1993             int i = offset2;
1994             while (partlen-- > 0) {
1995                 if (target.setIndex(offset++) != target.setIndex(i++)) {
1996                     return false;
1997                 }
1998             }
1999             return true;
2000         }
2001
2002         private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
2003             int i = offset2;
2004             while (partlen-- > 0) {
2005                 final char ch1 = target.setIndex(offset++) ;
2006                 final char ch2 = target.setIndex(i++) ;
2007                 if (ch1 == ch2) {
2008                     continue;
2009                 }
2010                 final char uch1 = Character.toUpperCase(ch1);
2011                 final char uch2 = Character.toUpperCase(ch2);
2012                 if (uch1 == uch2) {
2013                     continue;
2014                 }
2015                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
2016                     return false;
2017                 }
2018             }
2019             return true;
2020         }
2021     }
2022
2023     static final class ClosureContext {
2024
2025         int[] offsets = new int[4];
2026         int currentIndex = 0;
2027
2028         boolean contains(int offset) {
2029             for (int i=0; i<currentIndex;++i) {
2030                 if (offsets[i] == offset) {
2031                     return true;
2032                 }
2033             }
2034             return false;
2035         }
2036
2037         void reset() {
2038             currentIndex = 0;
2039         }
2040
2041         void addOffset(int offset) {
2042             // We do not check for duplicates, caller is responsible for that
2043             if (currentIndex == offsets.length) {
2044                 offsets = expandOffsets();
2045             }
2046             offsets[currentIndex++] = offset;
2047         }
2048
2049         private int[] expandOffsets() {
2050             final int len = offsets.length;
2051             final int newLen = len << 1;
2052             int[] newOffsets = new int[newLen];
2053
2054             System.arraycopy(offsets, 0, newOffsets, 0, currentIndex);
2055             return newOffsets;
2056         }
2057     }
2058
2059     static final class Context {
2060         int start;
2061         int limit;
2062         int length;
2063         Match match;
2064         boolean inuse = false;
2065         ClosureContext[] closureContexts;
2066
2067         private StringTarget stringTarget;
2068         private CharArrayTarget charArrayTarget;
2069         private CharacterIteratorTarget characterIteratorTarget;
2070
2071         ExpressionTarget target;
2072
2073         Context() {
2074         }
2075
2076         private void resetCommon(int nofclosures) {
2077             this.length = this.limit-this.start;
2078             setInUse(true);
2079             this.match = null;
2080             if (this.closureContexts == null || this.closureContexts.length != nofclosures) {
2081                 this.closureContexts = new ClosureContext[nofclosures];
2082             }
2083             for (int i = 0;  i < nofclosures;  i ++)  {
2084                 if (this.closureContexts[i] == null) {
2085                     this.closureContexts[i] = new ClosureContext();
2086                 }
2087                 else {
2088                     this.closureContexts[i].reset();
2089                 }
2090             }
2091         }
2092
2093         void reset(CharacterIterator target, int start, int limit, int nofclosures) {
2094             if (characterIteratorTarget == null) {
2095                 characterIteratorTarget = new CharacterIteratorTarget(target);
2096             }
2097             else {
2098                 characterIteratorTarget.resetTarget(target);
2099             }
2100             this.target = characterIteratorTarget;
2101             this.start = start;
2102             this.limit = limit;
2103             this.resetCommon(nofclosures);
2104         }
2105
2106         void reset(String target, int start, int limit, int nofclosures) {
2107             if (stringTarget == null) {
2108                 stringTarget = new StringTarget(target);
2109             }
2110             else {
2111                 stringTarget.resetTarget(target);
2112             }
2113             this.target = stringTarget;
2114             this.start = start;
2115             this.limit = limit;
2116             this.resetCommon(nofclosures);
2117         }
2118
2119         void reset(char[] target, int start, int limit, int nofclosures) {
2120             if (charArrayTarget == null) {
2121                 charArrayTarget = new CharArrayTarget(target);
2122             }
2123             else {
2124                 charArrayTarget.resetTarget(target);
2125             }
2126             this.target = charArrayTarget;
2127             this.start = start;
2128             this.limit = limit;
2129             this.resetCommon(nofclosures);
2130         }
2131         synchronized void setInUse(boolean inUse) {
2132             this.inuse = inUse;
2133         }
2134     }
2135
2136     /**
2137      * Prepares for matching.  This method is called just before starting matching.
2138      */
2139     void prepare() {
2140         if (Op.COUNT) {
2141             Op.nofinstances = 0;
2142         }
2143         this.compile(this.tokentree);
2144         /*
2145         if  (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
2146             Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
2147             anchor.next = this.operations;
2148             this.operations = anchor;
2149         }
2150         */
2151         if (Op.COUNT) {
2152             System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
2153         }
2154
2155         this.minlength = this.tokentree.getMinLength();
2156
2157         this.firstChar = null;
2158         if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
2159             && !isSet(this.options, XMLSCHEMA_MODE)) {
2160             RangeToken firstChar = Token.createRange();
2161             int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
2162             if (fresult == Token.FC_TERMINAL) {
2163                 firstChar.compactRanges();
2164                 this.firstChar = firstChar;
2165                 if (DEBUG) {
2166                     System.err.println("DEBUG: Use the first character optimization: "+firstChar);
2167                 }
2168             }
2169         }
2170
2171         if (this.operations != null
2172             && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
2173             && this.operations.next == null) {
2174             if (DEBUG) {
2175                 System.err.print(" *** Only fixed string! *** ");
2176             }
2177             this.fixedStringOnly = true;
2178             if (this.operations.type == Op.STRING) {
2179                 this.fixedString = this.operations.getString();
2180             } else if (this.operations.getData() >= 0x10000) { // Op.CHAR
2181                 this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
2182             } else {
2183                 char[] ac = new char[1];
2184                 ac[0] = (char)this.operations.getData();
2185                 this.fixedString = new String(ac);
2186             }
2187             this.fixedStringOptions = this.options;
2188             this.fixedStringTable = new BMPattern(this.fixedString, 256,
2189                                                   isSet(this.fixedStringOptions, IGNORE_CASE));
2190         } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
2191                    && !isSet(this.options, XMLSCHEMA_MODE)) {
2192             Token.FixedStringContainer container = new Token.FixedStringContainer();
2193             this.tokentree.findFixedString(container, this.options);
2194             this.fixedString = container.token == null ? null : container.token.getString();
2195             this.fixedStringOptions = container.options;
2196             if (this.fixedString != null && this.fixedString.length() < 2) {
2197                 this.fixedString = null;
2198             }
2199             // This pattern has a fixed string of which length is more than one.
2200             if (this.fixedString != null) {
2201                 this.fixedStringTable = new BMPattern(this.fixedString, 256,
2202                                                       isSet(this.fixedStringOptions, IGNORE_CASE));
2203                 if (DEBUG) {
2204                     System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
2205                                        +"/" //+this.fixedString
2206                                        +"/"+REUtil.createOptionString(this.fixedStringOptions));
2207                     System.err.print("String: ");
2208                     REUtil.dumpString(this.fixedString);
2209                 }
2210             }
2211         }
2212     }
2213
2214     /**
2215      * An option.
     * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
     * captures matched text, and <span class="REGEX"><kbd>(?:</kbd><var>X</var><kbd>)</kbd></span>
     * does not capture.
2219      *
2220      * @see #RegularExpression(java.lang.String,int)
2221      * @see #setPattern(java.lang.String,int)
2222     static final int MARK_PARENS = 1<<0;
2223      */
2224
2225     /**
2226      * "i"
2227      */
2228     static final int IGNORE_CASE = 1<<1;
2229
2230     /**
2231      * "s"
2232      */
2233     static final int SINGLE_LINE = 1<<2;
2234
2235     /**
2236      * "m"
2237      */
2238     static final int MULTIPLE_LINES = 1<<3;
2239
2240     /**
2241      * "x"
2242      */
2243     static final int EXTENDED_COMMENT = 1<<4;
2244
2245     /**
2246      * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
2247      *
2248      * @see #RegularExpression(java.lang.String,int)
2249      * @see #setPattern(java.lang.String,int)
2250      * @see #UNICODE_WORD_BOUNDARY
2251      */
2252     static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
2253
2254     /**
2255      * An option.
2256      * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \&lt; \></kbd></span>.
2257      * <p>By default, the engine considers a position between a word character
2258      * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character
2259      * is a word boundary.
2260      * <p>By this option, the engine checks word boundaries with the method of
2261      * 'Unicode Regular Expression Guidelines' Revision 4.
2262      *
2263      * @see #RegularExpression(java.lang.String,int)
2264      * @see #setPattern(java.lang.String,int)
2265      */
2266     static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
2267
2268     /**
2269      * "H"
2270      */
2271     static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
2272     /**
2273      * "F"
2274      */
2275     static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
2276     /**
2277      * "X". XML Schema mode.
2278      */
2279     static final int XMLSCHEMA_MODE = 1<<9;
2280     /**
2281      * ",".
2282      */
2283     static final int SPECIAL_COMMA = 1<<10;
2284
2285
2286     private static final boolean isSet(int options, int flag) {
2287         return (options & flag) == flag;
2288     }
2289
2290     /**
2291      * Creates a new RegularExpression instance.
2292      *
2293      * @param regex A regular expression
2294      * @exception ParseException <VAR>regex</VAR> is not conforming to the syntax.
2295      */
2296     public RegularExpression(String regex) throws ParseException {
2297         this(regex, null);
2298     }
2299
2300     /**
2301      * Creates a new RegularExpression instance with options.
2302      *
2303      * @param regex A regular expression
2304      * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
2305      * @exception ParseException <VAR>regex</VAR> is not conforming to the syntax.
2306      */
2307     public RegularExpression(String regex, String options) throws ParseException {
2308         this.setPattern(regex, options);
2309     }
2310
2311     /**
2312      * Creates a new RegularExpression instance with options.
2313      *
2314      * @param regex A regular expression
2315      * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
2316      * @exception ParseException <VAR>regex</VAR> is not conforming to the syntax.
2317      */
2318     public RegularExpression(String regex, String options, Locale locale) throws ParseException {
2319         this.setPattern(regex, options, locale);
2320     }
2321
2322     RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
2323         this.regex = regex;
2324         this.tokentree = tok;
2325         this.nofparen = parens;
2326         this.options = options;
2327         this.hasBackReferences = hasBackReferences;
2328     }
2329
2330     /**
2331      *
2332      */
2333     public void setPattern(String newPattern) throws ParseException {
2334         this.setPattern(newPattern, Locale.getDefault());
2335     }
2336
2337     public void setPattern(String newPattern, Locale locale) throws ParseException {
2338         this.setPattern(newPattern, this.options, locale);
2339     }
2340
2341     private void setPattern(String newPattern, int options, Locale locale) throws ParseException {
2342         this.regex = newPattern;
2343         this.options = options;
2344         RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
2345                          ? new ParserForXMLSchema(locale) : new RegexParser(locale);
2346         this.tokentree = rp.parse(this.regex, this.options);
2347         this.nofparen = rp.parennumber;
2348         this.hasBackReferences = rp.hasBackReferences;
2349
2350         this.operations = null;
2351         this.context = null;
2352     }
2353     /**
2354      *
2355      */
2356     public void setPattern(String newPattern, String options) throws ParseException {
2357         this.setPattern(newPattern, options, Locale.getDefault());
2358     }
2359
2360     public void setPattern(String newPattern, String options, Locale locale) throws ParseException {
2361         this.setPattern(newPattern, REUtil.parseOptions(options), locale);
2362     }
2363
2364     /**
2365      *
2366      */
2367     public String getPattern() {
2368         return this.regex;
2369     }
2370
2371     /**
2372      * Represents this instence in String.
2373      */
2374     @Override
2375     public String toString() {
2376         return this.tokentree.toString(this.options);
2377     }
2378
2379     /**
2380      * Returns a option string.
2381      * The order of letters in it may be different from a string specified
2382      * in a constructor or <code>setPattern()</code>.
2383      *
2384      * @see #RegularExpression(java.lang.String,java.lang.String)
2385      * @see #setPattern(java.lang.String,java.lang.String)
2386      */
2387     public String getOptions() {
2388         return REUtil.createOptionString(this.options);
2389     }
2390
2391     /**
2392      *  Return true if patterns are the same and the options are equivalent.
2393      */
2394     @Override
2395     public boolean equals(Object obj) {
2396         if (obj == null) {
2397             return false;
2398         }
2399         if (!(obj instanceof RegularExpression)) {
2400             return false;
2401         }
2402         RegularExpression r = (RegularExpression)obj;
2403         return this.regex.equals(r.regex) && this.options == r.options;
2404     }
2405
2406     boolean equals(String pattern, int options) {
2407         return this.regex.equals(pattern) && this.options == options;
2408     }
2409
2410     /**
2411      *
2412      */
2413     @Override
2414     public int hashCode() {
2415         return (this.regex+"/"+this.getOptions()).hashCode();
2416     }
2417
2418     /**
2419      * Return the number of regular expression groups.
2420      * This method returns 1 when the regular expression has no capturing-parenthesis.
2421      *
2422      */
2423     public int getNumberOfGroups() {
2424         return this.nofparen;
2425     }
2426
2427     // ================================================================
2428
2429     private static final int WT_IGNORE = 0;
2430     private static final int WT_LETTER = 1;
2431     private static final int WT_OTHER = 2;
2432     private static final int getWordType0(char ch, int opts) {
2433         if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
2434             if (isSet(opts, USE_UNICODE_CATEGORY)) {
2435                 return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
2436             }
2437             return isWordChar(ch) ? WT_LETTER : WT_OTHER;
2438         }
2439
2440         switch (Character.getType(ch)) {
2441         case Character.UPPERCASE_LETTER:      // L
2442         case Character.LOWERCASE_LETTER:      // L
2443         case Character.TITLECASE_LETTER:      // L
2444         case Character.MODIFIER_LETTER:       // L
2445         case Character.OTHER_LETTER:          // L
2446         case Character.LETTER_NUMBER:         // N
2447         case Character.DECIMAL_DIGIT_NUMBER:  // N
2448         case Character.OTHER_NUMBER:          // N
2449         case Character.COMBINING_SPACING_MARK: // Mc
2450             return WT_LETTER;
2451
2452         case Character.FORMAT:                // Cf
2453         case Character.NON_SPACING_MARK:      // Mn
2454         case Character.ENCLOSING_MARK:        // Mc
2455             return WT_IGNORE;
2456
2457         case Character.CONTROL:               // Cc
2458             switch (ch) {
2459             case '\t':
2460             case '\n':
2461             case '\u000B':
2462             case '\f':
2463             case '\r':
2464                 return WT_OTHER;
2465             default:
2466                 return WT_IGNORE;
2467             }
2468
2469         default:
2470             return WT_OTHER;
2471         }
2472     }
2473
2474     // ================================================================
2475
2476     static final int LINE_FEED = 0x000A;
2477     static final int CARRIAGE_RETURN = 0x000D;
2478     static final int LINE_SEPARATOR = 0x2028;
2479     static final int PARAGRAPH_SEPARATOR = 0x2029;
2480
2481     private static final boolean isEOLChar(int ch) {
2482         return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
2483         || ch == PARAGRAPH_SEPARATOR;
2484     }
2485
2486     private static final boolean isWordChar(int ch) { // Legacy word characters
2487         if (ch == '_') {
2488             return true;
2489         }
2490         if (ch < '0') {
2491             return false;
2492         }
2493         if (ch > 'z') {
2494             return false;
2495         }
2496         if (ch <= '9') {
2497             return true;
2498         }
2499         if (ch < 'A') {
2500             return false;
2501         }
2502         if (ch <= 'Z') {
2503             return true;
2504         }
2505         if (ch < 'a') {
2506             return false;
2507         }
2508         return true;
2509     }
2510
2511     private static final boolean matchIgnoreCase(int chardata, int ch) {
2512         if (chardata == ch) {
2513             return true;
2514         }
2515         if (chardata > 0xffff || ch > 0xffff) {
2516             return false;
2517         }
2518         char uch1 = Character.toUpperCase((char)chardata);
2519         char uch2 = Character.toUpperCase((char)ch);
2520         if (uch1 == uch2) {
2521             return true;
2522         }
2523         return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
2524     }
2525 }