third-party/xsd-regex/src/main/java/org/opendaylight/yangtools/xsd/regex/RegularExpression.java

   1 /*
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.opendaylight.yangtools.xsd.regex;
  19
  20 import java.text.CharacterIterator;
  21 import java.util.Locale;
  22 import java.util.Stack;
  23
  24 /**
  25  * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
  26  * This engine does not conform to the POSIX regular expression.
  27  *
  28  * <hr width="50%">
  29  * <h3>How to use</h3>
  30  *
  31  * <dl>
  32  *   <dt>A. Standard way
  33  *   <dd>
  34  * <pre>
  35  * RegularExpression re = new RegularExpression(<var>regex</var>);
  36  * if (re.matches(text)) { ... }
  37  * </pre>
  38  *
  39  *   <dt>B. Capturing groups
  40  *   <dd>
  41  * <pre>
  42  * RegularExpression re = new RegularExpression(<var>regex</var>);
  43  * Match match = new Match();
  44  * if (re.matches(text, match)) {
  45  *     ... // You can refer captured texts with methods of the <code>Match</code> class.
  46  * }
  47  * </pre>
  48  *
  49  * </dl>
  50  *
  51  * <h4>Case-insensitive matching</h4>
  52  * <pre>
  53  * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
  54  * if (re.matches(text)) { ... }
  55  * </pre>
  56  *
  57  * <h4>Options</h4>
  58  * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
  59  *    or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
  60  *    This <var>options</var> parameter consists of the following characters.
  61  * </p>
  62  * <dl>
  63  *   <dt><a name="I_OPTION"><code>"i"</code></a>
  64  *   <dd>This option indicates case-insensitive matching.
  65  *   <dt><a name="M_OPTION"><code>"m"</code></a>
  66  *   <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
  67  *   <dt><a name="S_OPTION"><code>"s"</code></a>
  68  *   <dd class="REGEX"><kbd>.</kbd> matches any one character.
  69  *   <dt><a name="U_OPTION"><code>"u"</code></a>
  70  *   <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> as becoming to Unicode.
  71  *   <dt><a name="W_OPTION"><code>"w"</code></a>
  72  *   <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
  73  *      'Unicode Regular Expression Guidelines' Revision 4.
  74  *      When "w" and "u" are specified at the same time,
  75  *      <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
  76  *   <dt><a name="COMMA_OPTION"><code>","</code></a>
  77  *   <dd>The parser treats a comma in a character class as a range separator.
  78  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
  79  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
  80  *
  81  *   <dt><a name="X_OPTION"><code>"X"</code></a>
  82  *   <dd class="REGEX">
  83  *       By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
  84  *       The <code>match()</code> method does not do substring matching
  85  *       but entire string matching.
  86  *
  87  * </dl>
  88  *
  89  * <hr width="50%">
  90  * <h3>Syntax</h3>
  91  * <table border="1" bgcolor="#ddeeff">
  92  *   <tr>
  93  *    <td>
  94  *     <h4>Differences from the Perl 5 regular expression</h4>
  95  *     <ul>
  96  *      <li>There is 6-digit hexadecimal character representation  (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
  97  *      <li>Supports subtraction, union, and intersection operations for character classes.
  98  *      <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
  99  *          <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
 100  *          <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
 101  *          <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
 102  *          <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
 103  *     </ul>
 104  *    </td>
 105  *   </tr>
 106  * </table>
 107  *
 108  * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
 109  * <ul>
 110  *   <li>Character
 111  *     <dl>
 112  *       <dt class="REGEX"><kbd>.</kbd> (A period)
 113  *       <dd>Matches any one character except the following characters.
 114  *       <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
 115  *           PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
 116  *       <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
 117  *       <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
 118  *           it matches any character including the above four characters.
 119  *
 120  *       <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
 121  *       <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
 122  *           CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
 123  *
 124  *       <dt class="REGEX"><kbd>\c</kbd><var>C</var>
 125  *       <dd>Matches a control character.
 126  *           The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
 127  *           '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
 128  *           It matches a control character of which the character code is less than
 129  *           the character code of the <var>C</var> by 0x0040.
 130  *       <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
 131  *           and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
 132  *
 133  *       <dt class="REGEX">a non-meta character
 134  *       <dd>Matches the character.
 135  *
 136  *       <dt class="REGEX"><KBD>\</KBD> + a meta character
 137  *       <dd>Matches the meta character.
 138  *
 139  *       <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
 140  *       <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
 141  *           You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
 142  *           variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
 143  *
 144  *       <!--
 145  *       <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
 146  *       <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
 147  *       -->
 148  *
 149  *       <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
 150  *       <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
 151  *
 152  *       <dt class="REGEX"><kbd>\g</kbd>
 153  *       <dd>Matches a grapheme.
 154  *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
 155  *
 156  *       <dt class="REGEX"><kbd>\X</kbd>
 157  *       <dd class="REGEX">Matches a combining character sequence.
 158  *       It is equivalent to <kbd>(?:\PM\pM*)</kbd>
 159  *     </dl>
 160  *   </li>
 161  *
 162  *   <li>Character class
 163  *     <dl>
 164 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
 165 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
 166  *       <dd>Positive character class.  It matches a character in ranges.
 167  *       <dd><var>R<sub>n</sub></var>:
 168  *       <ul>
 169  *         <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
 170  *             <p>This range matches the character.
 171  *         <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
 172  *             <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
 173 + *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
 174 + *             and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
 175  *             <p>...
 176  *         <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
 177  *             <p>These expressions specify the same ranges as the following expressions.
 178  *       </ul>
 179  *       <p class="REGEX">Enumerated ranges are merged (union operation).
 180  *          <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
 181  *
 182  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
 183  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
 184  *       <dd>Negative character class.  It matches a character not in ranges.
 185  *
 186  *       <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
 187  *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
 188  *       <dd>Subtraction or union or intersection for character classes.
 189  *       <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
 190  *       <dd>The result of these operations is a <u>positive character class</u>
 191  *           even if an expression includes any negative character classes.
 192  *           You have to take care on this in case-insensitive matching.
 193  *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
 194  *           which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
 195  *           But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
 196  *           it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
 197  *           though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
 198  *
 199  *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
 200  *       <dd>Character class subtraction for the XML Schema.
 201  *           You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
 202  *
 203  *       <dt class="REGEX"><kbd>\d</kbd>
 204  *       <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
 205  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 206  *           <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
 207  *
 208  *       <dt class="REGEX"><kbd>\D</kbd>
 209  *       <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
 210  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 211  *           <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
 212  *
 213  *       <dt class="REGEX"><kbd>\s</kbd>
 214  *       <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
 215  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 216  *           <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
 217  *
 218  *       <dt class="REGEX"><kbd>\S</kbd>
 219  *       <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
 220  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 221  *           <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
 222  *
 223  *       <dt class="REGEX"><kbd>\w</kbd>
 224  *       <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
 225  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 226  *           <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
 227  *
 228  *       <dt class="REGEX"><kbd>\W</kbd>
 229  *       <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
 230  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 231  *           <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
 232  *
 233  *       <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
 234  *       <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
 235  *       The following names are available:
 236  *       <dl>
 237  *         <dt>Unicode General Categories:
 238  *         <dd><kbd>
 239  *       L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
 240  *       Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
 241  *         </kbd>
 242  *         <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
 243  *         <dt>Unicode Blocks:
 244  *         <dd><kbd>
 245  *       Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
 246  *       IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
 247  *       Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
 248  *       Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
 249  *       Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
 250  *       Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
 251  *       Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
 252  *       Miscellaneous Technical, Control Pictures, Optical Character Recognition,
 253  *       Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
 254  *       Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
 255  *       Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
 256  *       Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
 257  *       Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
 258  *       Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
 259  *       Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
 260  *       Small Form Variants, Arabic Presentation Forms-B, Specials,
 261  *       Halfwidth and Fullwidth Forms
 262  *         </kbd>
 263  *         <dt>Others:
 264  *         <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
 265  *         <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
 266  *         <dd><kbd>UNASSIGNED</kbd>
 267  *             (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
 268  *       </dl>
 269  *
 270  *       <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
 271  *       <dd>Matches one character not in the specified General Category or the specified Block.
 272  *     </dl>
 273  *   </li>
 274  *
 275  *   <li>Selection and Quantifier
 276  *     <dl>
 277  *       <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
 278  *       <dd>...
 279  *
 280  *       <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
 281  *       <dd>Matches 0 or more <var>X</var>.
 282  *
 283  *       <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
 284  *       <dd>Matches 1 or more <var>X</var>.
 285  *
 286  *       <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
 287  *       <dd>Matches 0 or 1 <var>X</var>.
 288  *
 289  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
 290  *       <dd>Matches <var>number</var> times.
 291  *
 292  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
 293  *       <dd>...
 294  *
 295  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
 296  *       <dd>...
 297  *
 298  *       <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
 299  *       <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
 300  *       <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
 301  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
 302  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
 303  *       <dd>Non-greedy matching.
 304  *     </dl>
 305  *   </li>
 306  *
 307  *   <li>Grouping, Capturing, and Back-reference
 308  *     <dl>
 309  *       <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
 310  *       <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
 311  *       If you want it to match "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
 312  *       you have to write "<KBD>(?:foo)+</KBD>".
 313  *
 314  *       <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
 315  *       <dd>Grouping with capturing.
 316  * It makes a group and applications can know
 317  * where in target text a group matched with methods of a <code>Match</code> instance
 318  * after <code><a href="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
 319  * The 0th group means whole of this regular expression.
 320  * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.
 321  *
 322  *   <p>For instance, a regular expression is
 323  *   "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
 324  *   and target text is
 325  *   "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
 326  *   <ul>
 327  *     <li><code>Match.getCapturedText(0)</code>:
 328  *     "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
 329  *     <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
 330  *     <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
 331  *   </ul>
 332  *
 333  *       <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
 334  *       <dd>
 335  *
 336  *       <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
 337  *       <dd>Independent expression group. ................
 338  *
 339  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
 340  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
 341  *       <dd>............................
 342  *       <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
 343  *           Note that it can not contain 'u'.
 344  *
 345  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
 346  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
 347  *       <dd>......
 348  *       <dd>These expressions must be at the beginning of a group.
 349  *     </dl>
 350  *   </li>
 351  *
 352  *   <li>Anchor
 353  *     <dl>
 354  *       <dt class="REGEX"><kbd>\A</kbd>
 355  *       <dd>Matches the beginning of the text.
 356  *
 357  *       <dt class="REGEX"><kbd>\Z</kbd>
 358  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
 359  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
 360  *
 361  *       <dt class="REGEX"><kbd>\z</kbd>
 362  *       <dd>Matches the end of the text.
 363  *
 364  *       <dt class="REGEX"><kbd>^</kbd>
 365  *       <dd>Matches the beginning of the text.  It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
 366  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
 367  *           it matches the beginning of the text, or after one of EOL characters (
 368  *           LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
 369  *           PARAGRAPH SEPARATOR (U+2029).)
 370  *
 371  *       <dt class="REGEX"><kbd>$</kbd>
 372  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
 373  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
 374  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
 375  *           it matches the end of the text, or before an EOL character.
 376  *
 377  *       <dt class="REGEX"><kbd>\b</kbd>
 378  *       <dd>Matches word boundary.
 379  *           (See <a href="#W_OPTION">a "w" option</a>)
 380  *
 381  *       <dt class="REGEX"><kbd>\B</kbd>
 382  *       <dd>Matches non word boundary.
 383  *           (See <a href="#W_OPTION">a "w" option</a>)
 384  *
 385  *       <dt class="REGEX"><kbd>\&lt;</kbd>
 386  *       <dd>Matches the beginning of a word.
 387  *           (See <a href="#W_OPTION">a "w" option</a>)
 388  *
 389  *       <dt class="REGEX"><kbd>\&gt;</kbd>
 390  *       <dd>Matches the end of a word.
 391  *           (See <a href="#W_OPTION">a "w" option</a>)
 392  *     </dl>
 393  *   </li>
 394  *   <li>Lookahead and lookbehind
 395  *     <dl>
 396  *       <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
 397  *       <dd>Lookahead.
 398  *
 399  *       <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
 400  *       <dd>Negative lookahead.
 401  *
 402  *       <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
 403  *       <dd>Lookbehind.
 404  *       <dd>(Note for text capturing......)
 405  *
 406  *       <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
 407  *       <dd>Negative lookbehind.
 408  *     </dl>
 409  *   </li>
 410  *
 411  *   <li>Misc.
 412  *     <dl>
 413  *       <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
 414  *       <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
 415  *       <dd>......
 416  *       <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
 417  *       <dd>Comment.  A comment string consists of characters except '<kbd>)</kbd>'.
 418  *           You can not write comments in character classes and before quantifiers.
 419  *     </dl>
 420  *   </li>
 421  * </ul>
 422  *
 423  *
 424  * <hr width="50%">
 425  * <h3>BNF for the regular expression</h3>
 426  * <pre>
 427  * regex ::= ('(?' options ')')? term ('|' term)*
 428  * term ::= factor+
 429  * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
 430  *            | '(?#' [^)]* ')'
 431  * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
 432  * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
 433  *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
 434  *          | '(?>' regex ')' | '(?' options ':' regex ')'
 435  *          | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
 436  * options ::= [imsw]* ('-' [imsw]+)?
 437  * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
 438  * looks ::= '(?=' regex ')'  | '(?!' regex ')'
 439  *           | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
 440  * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
 441  * category-block ::= '\' [pP] category-symbol-1
 442  *                    | ('\p{' | '\P{') (category-symbol | block-name
 443  *                                       | other-properties) '}'
 444  * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
 445  * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo'
 446  *                     | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
 447  *                     | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
 448  *                     | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
 449  *                     | 'Sm' | 'Sc' | 'Sk' | 'So'
 450  * block-name ::= (See above)
 451  * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
 452  * character-1 ::= (any character except meta-characters)
 453  *
 454  * char-class ::= '[' ranges ']'
 455  *                | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
 456  * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
 457  * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
 458  *           | range-char | range-char '-' range-char
 459  * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
 460  * code-point ::= '\x' hex-char hex-char
 461  *                | '\x{' hex-char+ '}'
 462  * <!--               | '\u005c u' hex-char hex-char hex-char hex-char
 463  * -->               | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
 464  * hex-char ::= [0-9a-fA-F]
 465  * character-2 ::= (any character except \[]-,)
 466  * </pre>
 467  *
 468  * <hr width="50%">
 469  * <h3>TODO</h3>
 470  * <ul>
 471  *   <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
 472  *     <ul>
 473  *       <li>2.4 Canonical Equivalents
 474  *       <li>Level 3
 475  *     </ul>
 476  *   <li>Parsing performance
 477  * </ul>
 478  *
 479  * <hr width="50%">
 480  *
 481  * @xerces.internal
 482  *
 483  * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
 484  * @version $Id: RegularExpression.java 961928 2010-07-08 20:43:46Z knoaman $
 485  */
 486 public class RegularExpression implements java.io.Serializable {
 487
 488     private static final long serialVersionUID = 6242499334195006401L;
 489
 490     static final boolean DEBUG = false;
 491
 492     /**
 493      * Compiles a token tree into an operation flow.
 494      */
 495     private synchronized void compile(Token tok) {
 496         if (this.operations != null) {
 497             return;
 498         }
 499         this.numberOfClosures = 0;
 500         this.operations = this.compile(tok, null, false);
 501     }
 502
 503     /**
 504      * Converts a token to an operation.
 505      */
 506     private Op compile(Token tok, Op next, boolean reverse) {
 507         Op ret;
 508         switch (tok.type) {
 509         case Token.DOT:
 510             ret = Op.createDot();
 511             ret.next = next;
 512             break;
 513
 514         case Token.CHAR:
 515             ret = Op.createChar(tok.getChar());
 516             ret.next = next;
 517             break;
 518
 519         case Token.ANCHOR:
 520             ret = Op.createAnchor(tok.getChar());
 521             ret.next = next;
 522             break;
 523
 524         case Token.RANGE:
 525         case Token.NRANGE:
 526             ret = Op.createRange(tok);
 527             ret.next = next;
 528             break;
 529
 530         case Token.CONCAT:
 531             ret = next;
 532             if (!reverse) {
 533                 for (int i = tok.size()-1;  i >= 0;  i --) {
 534                     ret = compile(tok.getChild(i), ret, false);
 535                 }
 536             } else {
 537                 for (int i = 0;  i < tok.size();  i ++) {
 538                     ret = compile(tok.getChild(i), ret, true);
 539                 }
 540             }
 541             break;
 542
 543         case Token.UNION:
 544             Op.UnionOp uni = Op.createUnion(tok.size());
 545             for (int i = 0;  i < tok.size();  i ++) {
 546                 uni.addElement(compile(tok.getChild(i), next, reverse));
 547             }
 548             ret = uni;                          // ret.next is null.
 549             break;
 550
 551         case Token.CLOSURE:
 552         case Token.NONGREEDYCLOSURE:
 553             Token child = tok.getChild(0);
 554             int min = tok.getMin();
 555             int max = tok.getMax();
 556             if (min >= 0 && min == max) { // {n}
 557                 ret = next;
 558                 for (int i = 0; i < min;  i ++) {
 559                     ret = compile(child, ret, reverse);
 560                 }
 561                 break;
 562             }
 563             if (min > 0 && max > 0) {
 564                 max -= min;
 565             }
 566             if (max > 0) {
 567                 // X{2,6} -> XX(X(X(XX?)?)?)?
 568                 ret = next;
 569                 for (int i = 0;  i < max;  i ++) {
 570                     Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
 571                     q.next = next;
 572                     q.setChild(compile(child, ret, reverse));
 573                     ret = q;
 574                 }
 575             } else {
 576                 Op.ChildOp op;
 577                 if (tok.type == Token.NONGREEDYCLOSURE) {
 578                     op = Op.createNonGreedyClosure();
 579                 } else {                        // Token.CLOSURE
 580                     op = Op.createClosure(this.numberOfClosures++);
 581                 }
 582                 op.next = next;
 583                 op.setChild(compile(child, op, reverse));
 584                 ret = op;
 585             }
 586             if (min > 0) {
 587                 for (int i = 0;  i < min;  i ++) {
 588                     ret = compile(child, ret, reverse);
 589                 }
 590             }
 591             break;
 592
 593         case Token.EMPTY:
 594             ret = next;
 595             break;
 596
 597         case Token.STRING:
 598             ret = Op.createString(tok.getString());
 599             ret.next = next;
 600             break;
 601
 602         case Token.BACKREFERENCE:
 603             ret = Op.createBackReference(tok.getReferenceNumber());
 604             ret.next = next;
 605             break;
 606
 607         case Token.PAREN:
 608             if (tok.getParenNumber() == 0) {
 609                 ret = compile(tok.getChild(0), next, reverse);
 610             } else if (reverse) {
 611                 next = Op.createCapture(tok.getParenNumber(), next);
 612                 next = compile(tok.getChild(0), next, reverse);
 613                 ret = Op.createCapture(-tok.getParenNumber(), next);
 614             } else {
 615                 next = Op.createCapture(-tok.getParenNumber(), next);
 616                 next = compile(tok.getChild(0), next, reverse);
 617                 ret = Op.createCapture(tok.getParenNumber(), next);
 618             }
 619             break;
 620
 621         case Token.LOOKAHEAD:
 622             ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
 623             break;
 624         case Token.NEGATIVELOOKAHEAD:
 625             ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
 626             break;
 627         case Token.LOOKBEHIND:
 628             ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
 629             break;
 630         case Token.NEGATIVELOOKBEHIND:
 631             ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
 632             break;
 633
 634         case Token.INDEPENDENT:
 635             ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
 636             break;
 637
 638         case Token.MODIFIERGROUP:
 639             ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
 640                                     ((Token.ModifierToken)tok).getOptions(),
 641                                     ((Token.ModifierToken)tok).getOptionsMask());
 642             break;
 643
 644         case Token.CONDITION:
 645             Token.ConditionToken ctok = (Token.ConditionToken)tok;
 646             int ref = ctok.refNumber;
 647             Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
 648             Op yes = compile(ctok.yes, next, reverse);
 649             Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
 650             ret = Op.createCondition(next, ref, condition, yes, no);
 651             break;
 652
 653         default:
 654             throw new RuntimeException("Unknown token type: "+tok.type);
 655         } // switch (tok.type)
 656         return ret;
 657     }
 658
 659
 660 //Public
 661
 662     /**
 663      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 664      *
 665      * @return true if the target is matched to this regular expression.
 666      */
 667     public boolean matches(char[]  target) {
 668         return this.matches(target, 0,  target .length , (Match)null);
 669     }
 670
 671     /**
 672      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 673      * in specified range or not.
 674      *
 675      * @param start Start offset of the range.
 676      * @param end  End offset +1 of the range.
 677      * @return true if the target is matched to this regular expression.
 678      */
 679     public boolean matches(char[]  target, int start, int end) {
 680         return this.matches(target, start, end, (Match)null);
 681     }
 682
 683     /**
 684      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 685      *
 686      * @param match A Match instance for storing matching result.
 687      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 688      */
 689     public boolean matches(char[]  target, Match match) {
 690         return this.matches(target, 0,  target .length , match);
 691     }
 692
 693
 694     /**
 695      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 696      * in specified range or not.
 697      *
 698      * @param start Start offset of the range.
 699      * @param end  End offset +1 of the range.
 700      * @param match A Match instance for storing matching result.
 701      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 702      */
 703     public boolean matches(char[] target, int start, int end, Match match) {
 704
 705         synchronized (this) {
 706             if (this.operations == null) {
 707                 this.prepare();
 708             }
 709             if (this.context == null) {
 710                 this.context = new Context();
 711             }
 712         }
 713         Context con = null;
 714         synchronized (this.context) {
 715             con = this.context.inuse ? new Context() : this.context;
 716             con.reset(target, start, end, this.numberOfClosures);
 717         }
 718         if (match != null) {
 719             match.setNumberOfGroups(this.nofparen);
 720             match.setSource(target);
 721         } else if (this.hasBackReferences) {
 722             match = new Match();
 723             match.setNumberOfGroups(this.nofparen);
 724             // Need not to call setSource() because
 725             // a caller can not access this match instance.
 726         }
 727         con.match = match;
 728
 729         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
 730             int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
 731             //System.err.println("DEBUG: matchEnd="+matchEnd);
 732             if (matchEnd == con.limit) {
 733                 if (con.match != null) {
 734                     con.match.setBeginning(0, con.start);
 735                     con.match.setEnd(0, matchEnd);
 736                 }
 737                 con.setInUse(false);
 738                 return true;
 739             }
 740             return false;
 741         }
 742
 743         /*
 744          * The pattern has only fixed string.
 745          * The engine uses Boyer-Moore.
 746          */
 747         if (this.fixedStringOnly) {
 748             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
 749             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 750             if (o >= 0) {
 751                 if (con.match != null) {
 752                     con.match.setBeginning(0, o);
 753                     con.match.setEnd(0, o+this.fixedString.length());
 754                 }
 755                 con.setInUse(false);
 756                 return true;
 757             }
 758             con.setInUse(false);
 759             return false;
 760         }
 761
 762         /*
 763          * The pattern contains a fixed string.
 764          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
 765          * If not, it return with false.
 766          */
 767         if (this.fixedString != null) {
 768             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 769             if (o < 0) {
 770                 //System.err.println("Non-match in fixed-string search.");
 771                 con.setInUse(false);
 772                 return false;
 773             }
 774         }
 775
 776         int limit = con.limit-this.minlength;
 777         int matchStart;
 778         int matchEnd = -1;
 779
 780         /*
 781          * Checks whether the expression starts with ".*".
 782          */
 783         if (this.operations != null
 784             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
 785             if (isSet(this.options, SINGLE_LINE)) {
 786                 matchStart = con.start;
 787                 matchEnd = this. match(con, this.operations, con.start, 1, this.options);
 788             } else {
 789                 boolean previousIsEOL = true;
 790                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 791                     int ch =  target [  matchStart ] ;
 792                     if (isEOLChar(ch)) {
 793                         previousIsEOL = true;
 794                     } else {
 795                         if (previousIsEOL) {
 796                             if (0 <= (matchEnd = this. match(con, this.operations,
 797                                                              matchStart, 1, this.options))) {
 798                                 break;
 799                             }
 800                         }
 801                         previousIsEOL = false;
 802                     }
 803                 }
 804             }
 805         }
 806
 807         /*
 808          * Optimization against the first character.
 809          */
 810         else if (this.firstChar != null) {
 811             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
 812             RangeToken range = this.firstChar;
 813             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 814                 int ch =  target [matchStart] ;
 815                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
 816                     ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]);
 817                 }
 818                 if (!range.match(ch))  {
 819                     continue;
 820                 }
 821                 if (0 <= (matchEnd = this. match(con, this.operations,
 822                                                  matchStart, 1, this.options))) {
 823                         break;
 824                 }
 825             }
 826         }
 827
 828         /*
 829          * Straightforward matching.
 830          */
 831         else {
 832             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 833                 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) {
 834                     break;
 835                 }
 836             }
 837         }
 838
 839         if (matchEnd >= 0) {
 840             if (con.match != null) {
 841                 con.match.setBeginning(0, matchStart);
 842                 con.match.setEnd(0, matchEnd);
 843             }
 844             con.setInUse(false);
 845             return true;
 846         } else {
 847             con.setInUse(false);
 848             return false;
 849         }
 850     }
 851
 852     /**
 853      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 854      *
 855      * @return true if the target is matched to this regular expression.
 856      */
 857     public boolean matches(String  target) {
 858         return this.matches(target, 0,  target .length() , (Match)null);
 859     }
 860
 861     /**
 862      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 863      * in specified range or not.
 864      *
 865      * @param start Start offset of the range.
 866      * @param end  End offset +1 of the range.
 867      * @return true if the target is matched to this regular expression.
 868      */
 869     public boolean matches(String  target, int start, int end) {
 870         return this.matches(target, start, end, (Match)null);
 871     }
 872
 873     /**
 874      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 875      *
 876      * @param match A Match instance for storing matching result.
 877      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 878      */
 879     public boolean matches(String  target, Match match) {
 880         return this.matches(target, 0,  target .length() , match);
 881     }
 882
 883     /**
 884      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 885      * in specified range or not.
 886      *
 887      * @param start Start offset of the range.
 888      * @param end  End offset +1 of the range.
 889      * @param match A Match instance for storing matching result.
 890      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 891      */
 892     public boolean matches(String  target, int start, int end, Match match) {
 893
 894         synchronized (this) {
 895             if (this.operations == null) {
 896                 this.prepare();
 897             }
 898             if (this.context == null) {
 899                 this.context = new Context();
 900             }
 901         }
 902         Context con = null;
 903         synchronized (this.context) {
 904             con = this.context.inuse ? new Context() : this.context;
 905             con.reset(target, start, end, this.numberOfClosures);
 906         }
 907         if (match != null) {
 908             match.setNumberOfGroups(this.nofparen);
 909             match.setSource(target);
 910         } else if (this.hasBackReferences) {
 911             match = new Match();
 912             match.setNumberOfGroups(this.nofparen);
 913             // Need not to call setSource() because
 914             // a caller can not access this match instance.
 915         }
 916         con.match = match;
 917
 918         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
 919             if (DEBUG) {
 920                 System.err.println("target string="+target);
 921             }
 922             int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
 923             if (DEBUG) {
 924                 System.err.println("matchEnd="+matchEnd);
 925                 System.err.println("con.limit="+con.limit);
 926             }
 927             if (matchEnd == con.limit) {
 928                 if (con.match != null) {
 929                     con.match.setBeginning(0, con.start);
 930                     con.match.setEnd(0, matchEnd);
 931                 }
 932                 con.setInUse(false);
 933                 return true;
 934             }
 935             return false;
 936         }
 937
 938         /*
 939          * The pattern has only fixed string.
 940          * The engine uses Boyer-Moore.
 941          */
 942         if (this.fixedStringOnly) {
 943             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
 944             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 945             if (o >= 0) {
 946                 if (con.match != null) {
 947                     con.match.setBeginning(0, o);
 948                     con.match.setEnd(0, o+this.fixedString.length());
 949                 }
 950                 con.setInUse(false);
 951                 return true;
 952             }
 953             con.setInUse(false);
 954             return false;
 955         }
 956
 957         /*
 958          * The pattern contains a fixed string.
 959          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
 960          * If not, it return with false.
 961          */
 962         if (this.fixedString != null) {
 963             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 964             if (o < 0) {
 965                 //System.err.println("Non-match in fixed-string search.");
 966                 con.setInUse(false);
 967                 return false;
 968             }
 969         }
 970
 971         int limit = con.limit-this.minlength;
 972         int matchStart;
 973         int matchEnd = -1;
 974
 975         /*
 976          * Checks whether the expression starts with ".*".
 977          */
 978         if (this.operations != null
 979             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
 980             if (isSet(this.options, SINGLE_LINE)) {
 981                 matchStart = con.start;
 982                 matchEnd = this.match(con, this.operations, con.start, 1, this.options);
 983             } else {
 984                 boolean previousIsEOL = true;
 985                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 986                     int ch =  target .charAt(  matchStart ) ;
 987                     if (isEOLChar(ch)) {
 988                         previousIsEOL = true;
 989                     } else {
 990                         if (previousIsEOL) {
 991                             if (0 <= (matchEnd = this.match(con, this.operations,
 992                                                             matchStart, 1, this.options))) {
 993                                 break;
 994                             }
 995                         }
 996                         previousIsEOL = false;
 997                     }
 998                 }
 999             }
1000         }
1001
1002         /*
1003          * Optimization against the first character.
1004          */
1005         else if (this.firstChar != null) {
1006             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1007             RangeToken range = this.firstChar;
1008             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1009                 int ch =  target .charAt(  matchStart ) ;
1010                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1011                     ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1));
1012                 }
1013                 if (!range.match(ch)) {
1014                     continue;
1015                 }
1016                 if (0 <= (matchEnd = this.match(con, this.operations,
1017                                                 matchStart, 1, this.options))) {
1018                         break;
1019                 }
1020             }
1021         }
1022
1023         /*
1024          * Straightforward matching.
1025          */
1026         else {
1027             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1028                 if (0 <= (matchEnd = this.match(con, this.operations, matchStart, 1, this.options))) {
1029                     break;
1030                 }
1031             }
1032         }
1033
1034         if (matchEnd >= 0) {
1035             if (con.match != null) {
1036                 con.match.setBeginning(0, matchStart);
1037                 con.match.setEnd(0, matchEnd);
1038             }
1039             con.setInUse(false);
1040             return true;
1041         } else {
1042             con.setInUse(false);
1043             return false;
1044         }
1045     }
1046
1047     /**
1048      * @return -1 when not match; offset of the end of matched string when match.
1049      */
1050     private int match(Context con, Op op, int offset, int dx, int opts) {
1051         final ExpressionTarget target = con.target;
1052         final Stack<Op> opStack = new Stack<>();
1053         final IntStack dataStack = new IntStack();
1054         final boolean isSetIgnoreCase = isSet(opts, IGNORE_CASE);
1055         int retValue = -1;
1056         boolean returned = false;
1057
1058         for (;;) {
1059             if (op == null || offset > con.limit || offset < con.start) {
1060                 if (op == null) {
1061                     retValue = isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
1062                 }
1063                 else {
1064                    retValue = -1;
1065                 }
1066                 returned = true;
1067             }
1068             else  {
1069                 retValue = -1;
1070                 // dx value is either 1 or -1
1071                 switch (op.type) {
1072                 case Op.CHAR:
1073                     {
1074                         final int o1 = (dx > 0) ? offset : offset -1;
1075                         if (o1 >= con.limit || o1 < 0 || !matchChar(op.getData(), target.charAt(o1), isSetIgnoreCase)) {
1076                             returned = true;
1077                             break;
1078                         }
1079                         offset += dx;
1080                         op = op.next;
1081                     }
1082                     break;
1083
1084                 case Op.DOT:
1085                     {
1086                         int o1 = (dx > 0) ? offset : offset - 1;
1087                         if (o1 >= con.limit || o1 < 0) {
1088                             returned = true;
1089                             break;
1090                         }
1091                         if (isSet(opts, SINGLE_LINE)) {
1092                             if (REUtil.isHighSurrogate(target.charAt(o1)) && o1+dx >= 0 && o1+dx < con.limit) {
1093                                 o1 += dx;
1094                             }
1095                         }
1096                         else {
1097                             int ch = target.charAt(o1);
1098                             if (REUtil.isHighSurrogate(ch) && o1+dx >= 0 && o1+dx < con.limit) {
1099                                 o1 += dx;
1100                                 ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1101                             }
1102                             if (isEOLChar(ch)) {
1103                                 returned = true;
1104                                 break;
1105                             }
1106                         }
1107                         offset = (dx > 0) ? o1 + 1 : o1;
1108                         op = op.next;
1109                     }
1110                     break;
1111
1112                 case Op.RANGE:
1113                 case Op.NRANGE:
1114                     {
1115                         int o1 = (dx > 0) ? offset : offset -1;
1116                         if (o1 >= con.limit || o1 < 0) {
1117                             returned = true;
1118                             break;
1119                         }
1120                         int ch = target.charAt(offset);
1121                         if (REUtil.isHighSurrogate(ch) && o1+dx < con.limit && o1+dx >=0) {
1122                             o1 += dx;
1123                             ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1124                         }
1125                         final RangeToken tok = op.getToken();
1126                         if (!tok.match(ch)) {
1127                             returned = true;
1128                             break;
1129                         }
1130                         offset = (dx > 0) ? o1+1 : o1;
1131                         op = op.next;
1132                     }
1133                     break;
1134
1135                 case Op.ANCHOR:
1136                     {
1137                         if (!matchAnchor(target, op, con, offset, opts)) {
1138                             returned = true;
1139                             break;
1140                         }
1141                         op = op.next;
1142                     }
1143                     break;
1144
1145                 case Op.BACKREFERENCE:
1146                     {
1147                         int refno = op.getData();
1148                         if (refno <= 0 || refno >= this.nofparen) {
1149                             throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
1150                         }
1151                         if (con.match.getBeginning(refno) < 0 || con.match.getEnd(refno) < 0) {
1152                             returned = true;
1153                             break;
1154                         }
1155                         int o2 = con.match.getBeginning(refno);
1156                         int literallen = con.match.getEnd(refno)-o2;
1157                         if (dx > 0) {
1158                             if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, o2, literallen)) {
1159                                 returned = true;
1160                                 break;
1161                             }
1162                             offset += literallen;
1163                         }
1164                         else {
1165                             if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, o2, literallen)) {
1166                                 returned = true;
1167                                 break;
1168                             }
1169                             offset -= literallen;
1170                         }
1171                         op = op.next;
1172                     }
1173                     break;
1174
1175                 case Op.STRING:
1176                     {
1177                         String literal = op.getString();
1178                         int literallen = literal.length();
1179                         if (dx > 0) {
1180                             if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, literal, literallen)) {
1181                                 returned = true;
1182                                 break;
1183                             }
1184                             offset += literallen;
1185                         }
1186                         else {
1187                             if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, literal, literallen)) {
1188                                 returned = true;
1189                                 break;
1190                             }
1191                             offset -= literallen;
1192                         }
1193                         op = op.next;
1194                     }
1195                     break;
1196
1197                 case Op.CLOSURE:
1198                     {
1199                         // Saves current position to avoid zero-width repeats.
1200                         final int id = op.getData();
1201                         if (con.closureContexts[id].contains(offset)) {
1202                             returned = true;
1203                             break;
1204                         }
1205
1206                         con.closureContexts[id].addOffset(offset);
1207                     }
1208                     // fall through
1209
1210                 case Op.QUESTION:
1211                     {
1212                         opStack.push(op);
1213                         dataStack.push(offset);
1214                         op = op.getChild();
1215                     }
1216                     break;
1217
1218                 case Op.NONGREEDYCLOSURE:
1219                 case Op.NONGREEDYQUESTION:
1220                     {
1221                         opStack.push(op);
1222                         dataStack.push(offset);
1223                         op = op.next;
1224                     }
1225                     break;
1226
1227                 case Op.UNION:
1228                     if (op.size() == 0) {
1229                         returned = true;
1230                     }
1231                     else {
1232                         opStack.push(op);
1233                         dataStack.push(0);
1234                         dataStack.push(offset);
1235                         op = op.elementAt(0);
1236                     }
1237                     break;
1238
1239                 case Op.CAPTURE:
1240                     {
1241                         final int refno = op.getData();
1242                         if (con.match != null) {
1243                             if (refno > 0) {
1244                                 dataStack.push(con.match.getBeginning(refno));
1245                                 con.match.setBeginning(refno, offset);
1246                             }
1247                             else {
1248                                 final int index = -refno;
1249                                 dataStack.push(con.match.getEnd(index));
1250                                 con.match.setEnd(index, offset);
1251                             }
1252                             opStack.push(op);
1253                             dataStack.push(offset);
1254                         }
1255                         op = op.next;
1256                     }
1257                     break;
1258
1259                 case Op.LOOKAHEAD:
1260                 case Op.NEGATIVELOOKAHEAD:
1261                 case Op.LOOKBEHIND:
1262                 case Op.NEGATIVELOOKBEHIND:
1263                     {
1264                         opStack.push(op);
1265                         dataStack.push(dx);
1266                         dataStack.push(offset);
1267                         dx = (op.type == Op.LOOKAHEAD || op.type == Op.NEGATIVELOOKAHEAD) ? 1 : -1;
1268                         op = op.getChild();
1269                     }
1270                     break;
1271
1272                 case Op.INDEPENDENT:
1273                     {
1274                         opStack.push(op);
1275                         dataStack.push(offset);
1276                         op = op.getChild();
1277                     }
1278                     break;
1279
1280                 case Op.MODIFIER:
1281                     {
1282                         int localopts = opts;
1283                         localopts |= op.getData();
1284                         localopts &= ~op.getData2();
1285                         opStack.push(op);
1286                         dataStack.push(opts);
1287                         dataStack.push(offset);
1288                         opts = localopts;
1289                         op = op.getChild();
1290                     }
1291                     break;
1292
1293                 case Op.CONDITION:
1294                     {
1295                         Op.ConditionOp cop = (Op.ConditionOp)op;
1296                         if (cop.refNumber > 0) {
1297                             if (cop.refNumber >= this.nofparen) {
1298                                 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
1299                             }
1300                             if (con.match.getBeginning(cop.refNumber) >= 0
1301                                     && con.match.getEnd(cop.refNumber) >= 0) {
1302                                 op = cop.yes;
1303                             }
1304                             else if (cop.no != null) {
1305                                 op = cop.no;
1306                             }
1307                             else {
1308                                 op = cop.next;
1309                             }
1310                         }
1311                         else {
1312                             opStack.push(op);
1313                             dataStack.push(offset);
1314                             op = cop.condition;
1315                         }
1316                     }
1317                     break;
1318
1319                 default:
1320                     throw new RuntimeException("Unknown operation type: " + op.type);
1321                 }
1322             }
1323
1324             // handle recursive operations
1325             while (returned) {
1326                 // exhausted all the operations
1327                 if (opStack.isEmpty()) {
1328                     return retValue;
1329                 }
1330
1331                 op = opStack.pop();
1332                 offset = dataStack.pop();
1333
1334                 switch (op.type) {
1335                 case Op.CLOSURE:
1336                 case Op.QUESTION:
1337                     if (retValue < 0) {
1338                         op = op.next;
1339                         returned = false;
1340                     }
1341                     break;
1342
1343                 case Op.NONGREEDYCLOSURE:
1344                 case Op.NONGREEDYQUESTION:
1345                     if (retValue < 0) {
1346                         op = op.getChild();
1347                         returned = false;
1348                     }
1349                     break;
1350
1351                 case Op.UNION:
1352                     {
1353                         int unionIndex = dataStack.pop();
1354                         if (DEBUG) {
1355                             System.err.println("UNION: "+unionIndex+", ret="+retValue);
1356                         }
1357
1358                         if (retValue < 0) {
1359                             if (++unionIndex < op.size()) {
1360                                 opStack.push(op);
1361                                 dataStack.push(unionIndex);
1362                                 dataStack.push(offset);
1363                                 op = op.elementAt(unionIndex);
1364                                 returned = false;
1365                             }
1366                             else {
1367                                 retValue = -1;
1368                             }
1369                         }
1370                     }
1371                     break;
1372
1373                 case Op.CAPTURE:
1374                     final int refno = op.getData();
1375                     final int saved = dataStack.pop();
1376                     if (retValue < 0) {
1377                         if (refno > 0) {
1378                             con.match.setBeginning(refno, saved);
1379                         }
1380                         else {
1381                             con.match.setEnd(-refno, saved);
1382                         }
1383                     }
1384                     break;
1385
1386                 case Op.LOOKAHEAD:
1387                 case Op.LOOKBEHIND:
1388                     {
1389                         dx = dataStack.pop();
1390                         if (0 <= retValue) {
1391                             op = op.next;
1392                             returned = false;
1393                         }
1394                         retValue = -1;
1395                     }
1396                     break;
1397
1398                 case Op.NEGATIVELOOKAHEAD:
1399                 case Op.NEGATIVELOOKBEHIND:
1400                     {
1401                         dx = dataStack.pop();
1402                         if (0 > retValue)  {
1403                             op = op.next;
1404                             returned = false;
1405                         }
1406                         retValue = -1;
1407                     }
1408                     break;
1409
1410                 case Op.MODIFIER:
1411                     opts = dataStack.pop();
1412                     // fall through
1413
1414                 case Op.INDEPENDENT:
1415                     if (retValue >= 0)  {
1416                         offset = retValue;
1417                         op = op.next;
1418                         returned = false;
1419                     }
1420                     break;
1421
1422                 case Op.CONDITION:
1423                     {
1424                         final Op.ConditionOp cop = (Op.ConditionOp)op;
1425                         if (0 <= retValue) {
1426                             op = cop.yes;
1427                         }
1428                         else if (cop.no != null) {
1429                             op = cop.no;
1430                         }
1431                         else {
1432                             op = cop.next;
1433                         }
1434                     }
1435                     returned = false;
1436                     break;
1437
1438                 default:
1439                     break;
1440                 }
1441             }
1442         }
1443     }
1444
1445     private static boolean matchChar(int ch, int other, boolean ignoreCase) {
1446         return (ignoreCase) ? matchIgnoreCase(ch, other) : ch == other;
1447     }
1448
1449     boolean matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts) {
1450         boolean go = false;
1451         switch (op.getData()) {
1452         case '^':
1453             if (isSet(opts, MULTIPLE_LINES)) {
1454                 if (!(offset == con.start
1455                       || offset > con.start && offset < con.limit && isEOLChar(target.charAt(offset-1)))) {
1456                     return false;
1457                 }
1458             } else {
1459                 if (offset != con.start) {
1460                     return false;
1461                 }
1462             }
1463             break;
1464
1465         case '@':                         // Internal use only.
1466             // The @ always matches line beginnings.
1467             if (!(offset == con.start
1468                   || offset > con.start && isEOLChar(target.charAt(offset-1)))) {
1469                 return false;
1470             }
1471             break;
1472
1473         case '$':
1474             if (isSet(opts, MULTIPLE_LINES)) {
1475                 if (!(offset == con.limit
1476                       || offset < con.limit && isEOLChar(target.charAt(offset)))) {
1477                     return false;
1478                 }
1479             } else {
1480                 if (!(offset == con.limit
1481                       || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1482                       || offset+2 == con.limit &&  target.charAt(offset) == CARRIAGE_RETURN
1483                       &&  target.charAt(offset+1) == LINE_FEED)) {
1484                     return false;
1485                 }
1486             }
1487             break;
1488
1489         case 'A':
1490             if (offset != con.start) {
1491                 return false;
1492             }
1493             break;
1494
1495         case 'Z':
1496             if (!(offset == con.limit
1497                   || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1498                   || offset+2 == con.limit &&  target.charAt(offset) == CARRIAGE_RETURN
1499                   &&  target.charAt(offset+1) == LINE_FEED)) {
1500                 return false;
1501             }
1502             break;
1503
1504         case 'z':
1505             if (offset != con.limit) {
1506                 return false;
1507             }
1508             break;
1509
1510         case 'b':
1511             if (con.length == 0) {
1512                 return false;
1513             }
1514             {
1515                 int after = getWordType(target, con.start, con.limit, offset, opts);
1516                 if (after == WT_IGNORE) {
1517                     return false;
1518                 }
1519                 int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1520                 if (after == before) {
1521                     return false;
1522                 }
1523             }
1524             break;
1525
1526         case 'B':
1527             if (con.length == 0) {
1528                 go = true;
1529             } else {
1530                 int after = getWordType(target, con.start, con.limit, offset, opts);
1531                 go = after == WT_IGNORE
1532                      || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1533             }
1534             if (!go) {
1535                 return false;
1536             }
1537             break;
1538
1539         case '<':
1540             if (con.length == 0 || offset == con.limit) {
1541                 return false;
1542             }
1543             if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1544                 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) {
1545                 return false;
1546             }
1547             break;
1548
1549         case '>':
1550             if (con.length == 0 || offset == con.start) {
1551                 return false;
1552             }
1553             if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1554                 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) {
1555                 return false;
1556             }
1557             break;
1558         } // switch anchor type
1559
1560         return true;
1561     }
1562
1563     private static final int getPreviousWordType(ExpressionTarget target, int begin, int end,
1564                                                  int offset, int opts) {
1565         int ret = getWordType(target, begin, end, --offset, opts);
1566         while (ret == WT_IGNORE) {
1567             ret = getWordType(target, begin, end, --offset, opts);
1568         }
1569         return ret;
1570     }
1571
1572     private static final int getWordType(ExpressionTarget target, int begin, int end,
1573                                          int offset, int opts) {
1574         if (offset < begin || offset >= end) {
1575             return WT_OTHER;
1576         }
1577         return getWordType0(target.charAt(offset) , opts);
1578     }
1579
1580
1581     /**
1582      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1583      *
1584      * @return true if the target is matched to this regular expression.
1585      */
1586     public boolean matches(CharacterIterator target) {
1587         return this.matches(target, (Match)null);
1588     }
1589
1590
1591     /**
1592      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1593      *
1594      * @param match A Match instance for storing matching result.
1595      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1596      */
1597     public boolean matches(CharacterIterator  target, Match match) {
1598         int start = target.getBeginIndex();
1599         int end = target.getEndIndex();
1600
1601
1602
1603         synchronized (this) {
1604             if (this.operations == null) {
1605                 this.prepare();
1606             }
1607             if (this.context == null) {
1608                 this.context = new Context();
1609             }
1610         }
1611         Context con = null;
1612         synchronized (this.context) {
1613             con = this.context.inuse ? new Context() : this.context;
1614             con.reset(target, start, end, this.numberOfClosures);
1615         }
1616         if (match != null) {
1617             match.setNumberOfGroups(this.nofparen);
1618             match.setSource(target);
1619         } else if (this.hasBackReferences) {
1620             match = new Match();
1621             match.setNumberOfGroups(this.nofparen);
1622             // Need not to call setSource() because
1623             // a caller can not access this match instance.
1624         }
1625         con.match = match;
1626
1627         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
1628             int matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1629             //System.err.println("DEBUG: matchEnd="+matchEnd);
1630             if (matchEnd == con.limit) {
1631                 if (con.match != null) {
1632                     con.match.setBeginning(0, con.start);
1633                     con.match.setEnd(0, matchEnd);
1634                 }
1635                 con.setInUse(false);
1636                 return true;
1637             }
1638             return false;
1639         }
1640
1641         /*
1642          * The pattern has only fixed string.
1643          * The engine uses Boyer-Moore.
1644          */
1645         if (this.fixedStringOnly) {
1646             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1647             int o = this.fixedStringTable.matches(target, con.start, con.limit);
1648             if (o >= 0) {
1649                 if (con.match != null) {
1650                     con.match.setBeginning(0, o);
1651                     con.match.setEnd(0, o+this.fixedString.length());
1652                 }
1653                 con.setInUse(false);
1654                 return true;
1655             }
1656             con.setInUse(false);
1657             return false;
1658         }
1659
1660         /*
1661          * The pattern contains a fixed string.
1662          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1663          * If not, it return with false.
1664          */
1665         if (this.fixedString != null) {
1666             int o = this.fixedStringTable.matches(target, con.start, con.limit);
1667             if (o < 0) {
1668                 //System.err.println("Non-match in fixed-string search.");
1669                 con.setInUse(false);
1670                 return false;
1671             }
1672         }
1673
1674         int limit = con.limit-this.minlength;
1675         int matchStart;
1676         int matchEnd = -1;
1677
1678         /*
1679          * Checks whether the expression starts with ".*".
1680          */
1681         if (this.operations != null
1682             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
1683             if (isSet(this.options, SINGLE_LINE)) {
1684                 matchStart = con.start;
1685                 matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1686             } else {
1687                 boolean previousIsEOL = true;
1688                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1689                     int ch =  target .setIndex(  matchStart ) ;
1690                     if (isEOLChar(ch)) {
1691                         previousIsEOL = true;
1692                     } else {
1693                         if (previousIsEOL) {
1694                             if (0 <= (matchEnd = this.match(con, this.operations,
1695                                                             matchStart, 1, this.options))) {
1696                                 break;
1697                             }
1698                         }
1699                         previousIsEOL = false;
1700                     }
1701                 }
1702             }
1703         }
1704
1705         /*
1706          * Optimization against the first character.
1707          */
1708         else if (this.firstChar != null) {
1709             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1710             RangeToken range = this.firstChar;
1711             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1712                 int ch =  target .setIndex(  matchStart ) ;
1713                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1714                     ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1));
1715                 }
1716                 if (!range.match(ch)) {
1717                     continue;
1718                 }
1719                 if (0 <= (matchEnd = this.match(con, this.operations,
1720                                                 matchStart, 1, this.options))) {
1721                     break;
1722                 }
1723             }
1724         }
1725
1726         /*
1727          * Straightforward matching.
1728          */
1729         else {
1730             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1731                 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) {
1732                     break;
1733                 }
1734             }
1735         }
1736
1737         if (matchEnd >= 0) {
1738             if (con.match != null) {
1739                 con.match.setBeginning(0, matchStart);
1740                 con.match.setEnd(0, matchEnd);
1741             }
1742             con.setInUse(false);
1743             return true;
1744         } else {
1745             con.setInUse(false);
1746             return false;
1747         }
1748     }
1749
1750     // ================================================================
1751
    /**
     * A regular expression.
     * @serial
     */
    String regex;
    /**
     * Option flags of this expression; individual flags are tested with
     * {@code isSet(options, ...)} by {@code prepare()} and the {@code matches} methods.
     * @serial
     */
    int options;

    /**
     * The number of parentheses in the regular expression.
     * {@code matches} passes this value to {@code Match.setNumberOfGroups}.
     * @serial
     */
    int nofparen;
    /**
     * Internal representation of the regular expression.
     * {@code prepare()} compiles it and queries it for the minimum length,
     * the first-character set and the fixed string.
     * @serial
     */
    Token tokentree;

    // When true and the caller supplies no Match, matches() allocates a private Match.
    boolean hasBackReferences = false;

    // The transient fields below are derived state; prepare() assigns minlength,
    // firstChar and the fixedString* group.

    // Result of tokentree.getMinLength(); matches() tries no start position beyond limit - minlength.
    transient int minlength;
    // matches() calls prepare() while this is still null (presumably assigned by compile(); not visible here).
    transient Op operations = null;
    // Passed to Context.reset() to size the per-closure contexts.
    transient int numberOfClosures;
    // Lazily created by matches() and reused whenever it is not flagged in use.
    transient Context context = null;
    // Non-null only when the first-character analysis in prepare() returned Token.FC_TERMINAL.
    transient RangeToken firstChar = null;

    // Literal text used for the Boyer-Moore pre-check in matches(); null when none is usable.
    transient String fixedString = null;
    // Options associated with fixedString; its IGNORE_CASE flag configures fixedStringTable.
    transient int fixedStringOptions;
    // Boyer-Moore pattern built from fixedString.
    transient BMPattern fixedStringTable = null;
    // True when the whole compiled pattern is a single literal string or character.
    transient boolean fixedStringOnly = false;
1785
1786     static abstract class ExpressionTarget {
1787         abstract char charAt(int index);
1788         abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen);
1789         abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen);
1790     }
1791
1792     static final class StringTarget extends ExpressionTarget {
1793
1794         private String target;
1795
1796         StringTarget(String target) {
1797             this.target = target;
1798         }
1799
1800         final void resetTarget(String target) {
1801             this.target = target;
1802         }
1803
1804         @Override
1805         final char charAt(int index) {
1806             return target.charAt(index);
1807         }
1808
1809         @Override
1810         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1811                               String part, int partlen) {
1812             if (limit-offset < partlen) {
1813                 return false;
1814             }
1815             return (ignoreCase) ? target.regionMatches(true, offset, part, 0, partlen) : target.regionMatches(offset, part, 0, partlen);
1816         }
1817
1818         @Override
1819         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1820                                     int offset2, int partlen) {
1821             if (limit-offset < partlen) {
1822                 return false;
1823             }
1824             return (ignoreCase) ? target.regionMatches(true, offset, target, offset2, partlen)
1825                                 : target.regionMatches(offset, target, offset2, partlen);
1826         }
1827     }
1828
1829     static final class CharArrayTarget extends ExpressionTarget {
1830
1831         char[] target;
1832
1833         CharArrayTarget(char[] target) {
1834             this.target = target;
1835         }
1836
1837         final void resetTarget(char[] target) {
1838             this.target = target;
1839         }
1840
1841         @Override
1842         char charAt(int index) {
1843             return target[index];
1844         }
1845
1846         @Override
1847         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1848                 String part, int partlen) {
1849             if (offset < 0 || limit-offset < partlen)  {
1850                 return false;
1851             }
1852             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1853                                 : regionMatches(offset, limit, part, partlen);
1854         }
1855
1856         private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1857             int i = 0;
1858             while (partlen-- > 0) {
1859                 if (target[offset++] != part.charAt(i++)) {
1860                     return false;
1861                 }
1862             }
1863             return true;
1864         }
1865
1866         private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1867             int i = 0;
1868             while (partlen-- > 0) {
1869                 final char ch1 = target[offset++] ;
1870                 final char ch2 = part.charAt(i++);
1871                 if (ch1 == ch2) {
1872                     continue;
1873                 }
1874                 final char uch1 = Character.toUpperCase(ch1);
1875                 final char uch2 = Character.toUpperCase(ch2);
1876                 if (uch1 == uch2) {
1877                     continue;
1878                 }
1879                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1880                     return false;
1881                 }
1882             }
1883             return true;
1884         }
1885
1886         @Override
1887         final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1888             if (offset < 0 || limit-offset < partlen) {
1889                 return false;
1890             }
1891             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1892                                 : regionMatches(offset, limit, offset2, partlen);
1893         }
1894
1895         private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1896             int i = offset2;
1897             while (partlen-- > 0) {
1898                 if ( target [  offset++ ]  !=  target [  i++ ] ) {
1899                     return false;
1900                 }
1901             }
1902             return true;
1903         }
1904
1905         private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
1906             int i = offset2;
1907             while (partlen-- > 0) {
1908                 final char ch1 =  target[offset++] ;
1909                 final char ch2 =  target[i++] ;
1910                 if (ch1 == ch2) {
1911                     continue;
1912                 }
1913                 final char uch1 = Character.toUpperCase(ch1);
1914                 final char uch2 = Character.toUpperCase(ch2);
1915                 if (uch1 == uch2) {
1916                     continue;
1917                 }
1918                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1919                     return false;
1920                 }
1921             }
1922             return true;
1923         }
1924     }
1925
1926     static final class CharacterIteratorTarget extends ExpressionTarget {
1927         CharacterIterator target;
1928
1929         CharacterIteratorTarget(CharacterIterator target) {
1930             this.target = target;
1931         }
1932
1933         final void resetTarget(CharacterIterator target) {
1934             this.target = target;
1935         }
1936
1937         @Override
1938         final char charAt(int index) {
1939             return target.setIndex(index);
1940         }
1941
1942         @Override
1943         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1944                 String part, int partlen) {
1945             if (offset < 0 || limit-offset < partlen)  {
1946                 return false;
1947             }
1948             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1949                                 : regionMatches(offset, limit, part, partlen);
1950         }
1951
1952         private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1953             int i = 0;
1954             while (partlen-- > 0) {
1955                 if (target.setIndex(offset++) != part.charAt(i++)) {
1956                     return false;
1957                 }
1958             }
1959             return true;
1960         }
1961
1962         private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1963             int i = 0;
1964             while (partlen-- > 0) {
1965                 final char ch1 = target.setIndex(offset++) ;
1966                 final char ch2 = part.charAt(i++);
1967                 if (ch1 == ch2) {
1968                     continue;
1969                 }
1970                 final char uch1 = Character.toUpperCase(ch1);
1971                 final char uch2 = Character.toUpperCase(ch2);
1972                 if (uch1 == uch2) {
1973                     continue;
1974                 }
1975                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1976                     return false;
1977                 }
1978             }
1979             return true;
1980         }
1981
1982         @Override
1983         final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1984             if (offset < 0 || limit-offset < partlen) {
1985                 return false;
1986             }
1987             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1988                                 : regionMatches(offset, limit, offset2, partlen);
1989         }
1990
1991         private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1992             int i = offset2;
1993             while (partlen-- > 0) {
1994                 if (target.setIndex(offset++) != target.setIndex(i++)) {
1995                     return false;
1996                 }
1997             }
1998             return true;
1999         }
2000
2001         private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
2002             int i = offset2;
2003             while (partlen-- > 0) {
2004                 final char ch1 = target.setIndex(offset++) ;
2005                 final char ch2 = target.setIndex(i++) ;
2006                 if (ch1 == ch2) {
2007                     continue;
2008                 }
2009                 final char uch1 = Character.toUpperCase(ch1);
2010                 final char uch2 = Character.toUpperCase(ch2);
2011                 if (uch1 == uch2) {
2012                     continue;
2013                 }
2014                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
2015                     return false;
2016                 }
2017             }
2018             return true;
2019         }
2020     }
2021
2022     static final class ClosureContext {
2023
2024         int[] offsets = new int[4];
2025         int currentIndex = 0;
2026
2027         boolean contains(int offset) {
2028             for (int i=0; i<currentIndex;++i) {
2029                 if (offsets[i] == offset) {
2030                     return true;
2031                 }
2032             }
2033             return false;
2034         }
2035
2036         void reset() {
2037             currentIndex = 0;
2038         }
2039
2040         void addOffset(int offset) {
2041             // We do not check for duplicates, caller is responsible for that
2042             if (currentIndex == offsets.length) {
2043                 offsets = expandOffsets();
2044             }
2045             offsets[currentIndex++] = offset;
2046         }
2047
2048         private int[] expandOffsets() {
2049             final int len = offsets.length;
2050             final int newLen = len << 1;
2051             int[] newOffsets = new int[newLen];
2052
2053             System.arraycopy(offsets, 0, newOffsets, 0, currentIndex);
2054             return newOffsets;
2055         }
2056     }
2057
2058     static final class Context {
2059         int start;
2060         int limit;
2061         int length;
2062         Match match;
2063         boolean inuse = false;
2064         ClosureContext[] closureContexts;
2065
2066         private StringTarget stringTarget;
2067         private CharArrayTarget charArrayTarget;
2068         private CharacterIteratorTarget characterIteratorTarget;
2069
2070         ExpressionTarget target;
2071
2072         Context() {
2073         }
2074
2075         private void resetCommon(int nofclosures) {
2076             this.length = this.limit-this.start;
2077             setInUse(true);
2078             this.match = null;
2079             if (this.closureContexts == null || this.closureContexts.length != nofclosures) {
2080                 this.closureContexts = new ClosureContext[nofclosures];
2081             }
2082             for (int i = 0;  i < nofclosures;  i ++)  {
2083                 if (this.closureContexts[i] == null) {
2084                     this.closureContexts[i] = new ClosureContext();
2085                 }
2086                 else {
2087                     this.closureContexts[i].reset();
2088                 }
2089             }
2090         }
2091
2092         void reset(CharacterIterator target, int start, int limit, int nofclosures) {
2093             if (characterIteratorTarget == null) {
2094                 characterIteratorTarget = new CharacterIteratorTarget(target);
2095             }
2096             else {
2097                 characterIteratorTarget.resetTarget(target);
2098             }
2099             this.target = characterIteratorTarget;
2100             this.start = start;
2101             this.limit = limit;
2102             this.resetCommon(nofclosures);
2103         }
2104
2105         void reset(String target, int start, int limit, int nofclosures) {
2106             if (stringTarget == null) {
2107                 stringTarget = new StringTarget(target);
2108             }
2109             else {
2110                 stringTarget.resetTarget(target);
2111             }
2112             this.target = stringTarget;
2113             this.start = start;
2114             this.limit = limit;
2115             this.resetCommon(nofclosures);
2116         }
2117
2118         void reset(char[] target, int start, int limit, int nofclosures) {
2119             if (charArrayTarget == null) {
2120                 charArrayTarget = new CharArrayTarget(target);
2121             }
2122             else {
2123                 charArrayTarget.resetTarget(target);
2124             }
2125             this.target = charArrayTarget;
2126             this.start = start;
2127             this.limit = limit;
2128             this.resetCommon(nofclosures);
2129         }
2130         synchronized void setInUse(boolean inUse) {
2131             this.inuse = inUse;
2132         }
2133     }
2134
2135     /**
2136      * Prepares for matching.  This method is called just before starting matching.
2137      */
2138     void prepare() {
2139         if (Op.COUNT) {
2140             Op.nofinstances = 0;
2141         }
2142         this.compile(this.tokentree);
2143         /*
2144         if  (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
2145             Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
2146             anchor.next = this.operations;
2147             this.operations = anchor;
2148         }
2149         */
2150         if (Op.COUNT) {
2151             System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
2152         }
2153
2154         this.minlength = this.tokentree.getMinLength();
2155
2156         this.firstChar = null;
2157         if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
2158             && !isSet(this.options, XMLSCHEMA_MODE)) {
2159             RangeToken firstChar = Token.createRange();
2160             int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
2161             if (fresult == Token.FC_TERMINAL) {
2162                 firstChar.compactRanges();
2163                 this.firstChar = firstChar;
2164                 if (DEBUG) {
2165                     System.err.println("DEBUG: Use the first character optimization: "+firstChar);
2166                 }
2167             }
2168         }
2169
2170         if (this.operations != null
2171             && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
2172             && this.operations.next == null) {
2173             if (DEBUG) {
2174                 System.err.print(" *** Only fixed string! *** ");
2175             }
2176             this.fixedStringOnly = true;
2177             if (this.operations.type == Op.STRING) {
2178                 this.fixedString = this.operations.getString();
2179             } else if (this.operations.getData() >= 0x10000) { // Op.CHAR
2180                 this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
2181             } else {
2182                 char[] ac = new char[1];
2183                 ac[0] = (char)this.operations.getData();
2184                 this.fixedString = new String(ac);
2185             }
2186             this.fixedStringOptions = this.options;
2187             this.fixedStringTable = new BMPattern(this.fixedString, 256,
2188                                                   isSet(this.fixedStringOptions, IGNORE_CASE));
2189         } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
2190                    && !isSet(this.options, XMLSCHEMA_MODE)) {
2191             Token.FixedStringContainer container = new Token.FixedStringContainer();
2192             this.tokentree.findFixedString(container, this.options);
2193             this.fixedString = container.token == null ? null : container.token.getString();
2194             this.fixedStringOptions = container.options;
2195             if (this.fixedString != null && this.fixedString.length() < 2) {
2196                 this.fixedString = null;
2197             }
2198             // This pattern has a fixed string of which length is more than one.
2199             if (this.fixedString != null) {
2200                 this.fixedStringTable = new BMPattern(this.fixedString, 256,
2201                                                       isSet(this.fixedStringOptions, IGNORE_CASE));
2202                 if (DEBUG) {
2203                     System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
2204                                        +"/" //+this.fixedString
2205                                        +"/"+REUtil.createOptionString(this.fixedStringOptions));
2206                     System.err.print("String: ");
2207                     REUtil.dumpString(this.fixedString);
2208                 }
2209             }
2210         }
2211     }
2212
2213     /**
2214      * An option.
2215      * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
2216      * captures matched text, and <span class="REGEX"><kbd>(?:</kbd><var>X</var><kbd>)</kbd></span>
2217      * does not capture.
2218      *
2219      * @see #RegularExpression(java.lang.String,int)
2220      * @see #setPattern(java.lang.String,int)
2221     static final int MARK_PARENS = 1<<0;
2222      */
2223
2224     /**
2225      * "i"
2226      */
2227     static final int IGNORE_CASE = 1<<1;
2228
2229     /**
2230      * "s"
2231      */
2232     static final int SINGLE_LINE = 1<<2;
2233
2234     /**
2235      * "m"
2236      */
2237     static final int MULTIPLE_LINES = 1<<3;
2238
2239     /**
2240      * "x"
2241      */
2242     static final int EXTENDED_COMMENT = 1<<4;
2243
2244     /**
2245      * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
2246      *
2247      * @see #RegularExpression(java.lang.String,int)
2248      * @see #setPattern(java.lang.String,int)
2249      * @see #UNICODE_WORD_BOUNDARY
2250      */
2251     static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
2252
2253     /**
2254      * An option.
2255      * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \&lt; \></kbd></span>.
2256      * <p>By default, the engine considers a position between a word character
2257      * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character
2258      * is a word boundary.
2259      * <p>By this option, the engine checks word boundaries with the method of
2260      * 'Unicode Regular Expression Guidelines' Revision 4.
2261      *
2262      * @see #RegularExpression(java.lang.String,int)
2263      * @see #setPattern(java.lang.String,int)
2264      */
2265     static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
2266
2267     /**
2268      * "H"
2269      */
2270     static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
2271     /**
2272      * "F"
2273      */
2274     static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
2275     /**
2276      * "X". XML Schema mode.
2277      */
2278     static final int XMLSCHEMA_MODE = 1<<9;
2279     /**
2280      * ",".
2281      */
2282     static final int SPECIAL_COMMA = 1<<10;
2283
2284
2285     private static final boolean isSet(int options, int flag) {
2286         return (options & flag) == flag;
2287     }
2288
2289     /**
2290      * Creates a new RegularExpression instance.
2291      *
2292      * @param regex A regular expression
2293      * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2294      */
2295     public RegularExpression(String regex) throws ParseException {
2296         this(regex, null);
2297     }
2298
2299     /**
2300      * Creates a new RegularExpression instance with options.
2301      *
2302      * @param regex A regular expression
2303      * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
2304      * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2305      */
2306     public RegularExpression(String regex, String options) throws ParseException {
2307         this.setPattern(regex, options);
2308     }
2309
2310     /**
2311      * Creates a new RegularExpression instance with options.
2312      *
2313      * @param regex A regular expression
2314      * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
2315      * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2316      */
2317     public RegularExpression(String regex, String options, Locale locale) throws ParseException {
2318         this.setPattern(regex, options, locale);
2319     }
2320
2321     RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
2322         this.regex = regex;
2323         this.tokentree = tok;
2324         this.nofparen = parens;
2325         this.options = options;
2326         this.hasBackReferences = hasBackReferences;
2327     }
2328
2329     /**
2330      *
2331      */
2332     public void setPattern(String newPattern) throws ParseException {
2333         this.setPattern(newPattern, Locale.getDefault());
2334     }
2335
2336     public void setPattern(String newPattern, Locale locale) throws ParseException {
2337         this.setPattern(newPattern, this.options, locale);
2338     }
2339
2340     private void setPattern(String newPattern, int options, Locale locale) throws ParseException {
2341         this.regex = newPattern;
2342         this.options = options;
2343         RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
2344                          ? new ParserForXMLSchema(locale) : new RegexParser(locale);
2345         this.tokentree = rp.parse(this.regex, this.options);
2346         this.nofparen = rp.parennumber;
2347         this.hasBackReferences = rp.hasBackReferences;
2348
2349         this.operations = null;
2350         this.context = null;
2351     }
2352     /**
2353      *
2354      */
2355     public void setPattern(String newPattern, String options) throws ParseException {
2356         this.setPattern(newPattern, options, Locale.getDefault());
2357     }
2358
2359     public void setPattern(String newPattern, String options, Locale locale) throws ParseException {
2360         this.setPattern(newPattern, REUtil.parseOptions(options), locale);
2361     }
2362
2363     /**
2364      *
2365      */
2366     public String getPattern() {
2367         return this.regex;
2368     }
2369
2370     /**
2371      * Represents this instence in String.
2372      */
2373     @Override
2374     public String toString() {
2375         return this.tokentree.toString(this.options);
2376     }
2377
2378     /**
2379      * Returns a option string.
2380      * The order of letters in it may be different from a string specified
2381      * in a constructor or <code>setPattern()</code>.
2382      *
2383      * @see #RegularExpression(java.lang.String,java.lang.String)
2384      * @see #setPattern(java.lang.String,java.lang.String)
2385      */
2386     public String getOptions() {
2387         return REUtil.createOptionString(this.options);
2388     }
2389
2390     /**
2391      *  Return true if patterns are the same and the options are equivalent.
2392      */
2393     @Override
2394     public boolean equals(Object obj) {
2395         if (obj == null) {
2396             return false;
2397         }
2398         if (!(obj instanceof RegularExpression)) {
2399             return false;
2400         }
2401         RegularExpression r = (RegularExpression)obj;
2402         return this.regex.equals(r.regex) && this.options == r.options;
2403     }
2404
2405     boolean equals(String pattern, int options) {
2406         return this.regex.equals(pattern) && this.options == options;
2407     }
2408
2409     /**
2410      *
2411      */
2412     @Override
2413     public int hashCode() {
2414         return (this.regex+"/"+this.getOptions()).hashCode();
2415     }
2416
2417     /**
2418      * Return the number of regular expression groups.
2419      * This method returns 1 when the regular expression has no capturing-parenthesis.
2420      *
2421      */
2422     public int getNumberOfGroups() {
2423         return this.nofparen;
2424     }
2425
2426     // ================================================================
2427
2428     private static final int WT_IGNORE = 0;
2429     private static final int WT_LETTER = 1;
2430     private static final int WT_OTHER = 2;
2431     private static final int getWordType0(char ch, int opts) {
2432         if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
2433             if (isSet(opts, USE_UNICODE_CATEGORY)) {
2434                 return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
2435             }
2436             return isWordChar(ch) ? WT_LETTER : WT_OTHER;
2437         }
2438
2439         switch (Character.getType(ch)) {
2440         case Character.UPPERCASE_LETTER:      // L
2441         case Character.LOWERCASE_LETTER:      // L
2442         case Character.TITLECASE_LETTER:      // L
2443         case Character.MODIFIER_LETTER:       // L
2444         case Character.OTHER_LETTER:          // L
2445         case Character.LETTER_NUMBER:         // N
2446         case Character.DECIMAL_DIGIT_NUMBER:  // N
2447         case Character.OTHER_NUMBER:          // N
2448         case Character.COMBINING_SPACING_MARK: // Mc
2449             return WT_LETTER;
2450
2451         case Character.FORMAT:                // Cf
2452         case Character.NON_SPACING_MARK:      // Mn
2453         case Character.ENCLOSING_MARK:        // Mc
2454             return WT_IGNORE;
2455
2456         case Character.CONTROL:               // Cc
2457             switch (ch) {
2458             case '\t':
2459             case '\n':
2460             case '\u000B':
2461             case '\f':
2462             case '\r':
2463                 return WT_OTHER;
2464             default:
2465                 return WT_IGNORE;
2466             }
2467
2468         default:
2469             return WT_OTHER;
2470         }
2471     }
2472
2473     // ================================================================
2474
2475     static final int LINE_FEED = 0x000A;
2476     static final int CARRIAGE_RETURN = 0x000D;
2477     static final int LINE_SEPARATOR = 0x2028;
2478     static final int PARAGRAPH_SEPARATOR = 0x2029;
2479
2480     private static final boolean isEOLChar(int ch) {
2481         return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
2482         || ch == PARAGRAPH_SEPARATOR;
2483     }
2484
2485     private static final boolean isWordChar(int ch) { // Legacy word characters
2486         if (ch == '_') {
2487             return true;
2488         }
2489         if (ch < '0') {
2490             return false;
2491         }
2492         if (ch > 'z') {
2493             return false;
2494         }
2495         if (ch <= '9') {
2496             return true;
2497         }
2498         if (ch < 'A') {
2499             return false;
2500         }
2501         if (ch <= 'Z') {
2502             return true;
2503         }
2504         if (ch < 'a') {
2505             return false;
2506         }
2507         return true;
2508     }
2509
2510     private static final boolean matchIgnoreCase(int chardata, int ch) {
2511         if (chardata == ch) {
2512             return true;
2513         }
2514         if (chardata > 0xffff || ch > 0xffff) {
2515             return false;
2516         }
2517         char uch1 = Character.toUpperCase((char)chardata);
2518         char uch2 = Character.toUpperCase((char)ch);
2519         if (uch1 == uch2) {
2520             return true;
2521         }
2522         return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
2523     }
2524 }