BUG-5410: initial import of Xerces RegularExpression
[yangtools.git] / third-party / xsd-regex / src / main / java / org / opendaylight / yangtools / xsd / regex / Token.java
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  * 
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  * 
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 package org.opendaylight.yangtools.xsd.regex;
19
20 import java.util.Hashtable;
21 import java.util.Vector;
22
23 /**
24  * This class represents a node in parse tree.
25  * 
26  * @xerces.internal
27  *
28  * @version $Id: Token.java 1638344 2014-11-11 20:15:46Z mrglavas $
29  */
30 class Token implements java.io.Serializable {
31
32     private static final long serialVersionUID = 8484976002585487481L;
33
34     static final boolean COUNTTOKENS = true;
35     static int tokens = 0;
36
37     static final int CHAR = 0;                  // Literal char
38     static final int DOT = 11;                  // .
39     static final int CONCAT = 1;                // XY
40     static final int UNION = 2;                 // X|Y|Z
41     static final int CLOSURE = 3;               // X*
42     static final int RANGE = 4;                 // [a-zA-Z] etc.
43     static final int NRANGE = 5;                // [^a-zA-Z] etc.
44     static final int PAREN = 6;                 // (X) or (?:X)
45     static final int EMPTY = 7;                 //
46     static final int ANCHOR = 8;                // ^ $ \b \B \< \> \A \Z \z
47     static final int NONGREEDYCLOSURE = 9;      // *? +?
48     static final int STRING = 10;               // strings
49     static final int BACKREFERENCE = 12;        // back references
50     static final int LOOKAHEAD = 20;            // (?=...)
51     static final int NEGATIVELOOKAHEAD = 21;    // (?!...)
52     static final int LOOKBEHIND = 22;           // (?<=...)
53     static final int NEGATIVELOOKBEHIND = 23;   // (?<!...)
54     static final int INDEPENDENT = 24;          // (?>...)
55     static final int MODIFIERGROUP = 25;        // (?ims-ims:...)
56     static final int CONDITION = 26;            // (?(...)yes|no)
57
58     static final int UTF16_MAX = 0x10ffff;
59
60     final int type;
61
62     static Token token_dot;
63     static Token token_0to9;
64     static Token token_wordchars;
65     static Token token_not_0to9;
66     static Token token_not_wordchars;
67     static Token token_spaces;
68     static Token token_not_spaces;
69     static Token token_empty;
70     static Token token_linebeginning;
71     static Token token_linebeginning2;
72     static Token token_lineend;
73     static Token token_stringbeginning;
74     static Token token_stringend;
75     static Token token_stringend2;
76     static Token token_wordedge;
77     static Token token_not_wordedge;
78     static Token token_wordbeginning;
79     static Token token_wordend;
80     static {
81         Token.token_empty = new Token(Token.EMPTY);
82
83         Token.token_linebeginning = Token.createAnchor('^');
84         Token.token_linebeginning2 = Token.createAnchor('@');
85         Token.token_lineend = Token.createAnchor('$');
86         Token.token_stringbeginning = Token.createAnchor('A');
87         Token.token_stringend = Token.createAnchor('z');
88         Token.token_stringend2 = Token.createAnchor('Z');
89         Token.token_wordedge = Token.createAnchor('b');
90         Token.token_not_wordedge = Token.createAnchor('B');
91         Token.token_wordbeginning = Token.createAnchor('<');
92         Token.token_wordend = Token.createAnchor('>');
93
94         Token.token_dot = new Token(Token.DOT);
95
96         Token.token_0to9 = Token.createRange();
97         Token.token_0to9.addRange('0', '9');
98         Token.token_wordchars = Token.createRange();
99         Token.token_wordchars.addRange('0', '9');
100         Token.token_wordchars.addRange('A', 'Z');
101         Token.token_wordchars.addRange('_', '_');
102         Token.token_wordchars.addRange('a', 'z');
103         Token.token_spaces = Token.createRange();
104         Token.token_spaces.addRange('\t', '\t');
105         Token.token_spaces.addRange('\n', '\n');
106         Token.token_spaces.addRange('\f', '\f');
107         Token.token_spaces.addRange('\r', '\r');
108         Token.token_spaces.addRange(' ', ' ');
109
110         Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
111         Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
112         Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
113     }
114
115     static Token.ParenToken createLook(int type, Token child) {
116         if (COUNTTOKENS)  Token.tokens ++;
117         return new Token.ParenToken(type, child, 0);
118     }
119     static Token.ParenToken createParen(Token child, int pnumber) {
120         if (COUNTTOKENS)  Token.tokens ++;
121         return new Token.ParenToken(Token.PAREN, child, pnumber);
122     }
123     static Token.ClosureToken createClosure(Token tok) {
124         if (COUNTTOKENS)  Token.tokens ++;
125         return new Token.ClosureToken(Token.CLOSURE, tok);
126     }
127     static Token.ClosureToken createNGClosure(Token tok) {
128         if (COUNTTOKENS)  Token.tokens ++;
129         return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
130     }
131     static Token.ConcatToken createConcat(Token tok1, Token tok2) {
132         if (COUNTTOKENS)  Token.tokens ++;
133         return new Token.ConcatToken(tok1, tok2);
134     }
135     static Token.UnionToken createConcat() {
136         if (COUNTTOKENS)  Token.tokens ++;
137         return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
138     }
139     static Token.UnionToken createUnion() {
140         if (COUNTTOKENS)  Token.tokens ++;
141         return new Token.UnionToken(Token.UNION);
142     }
143     static Token createEmpty() {
144         return Token.token_empty;
145     }
146     static RangeToken createRange() {
147         if (COUNTTOKENS)  Token.tokens ++;
148         return new RangeToken(Token.RANGE);
149     }
150     static RangeToken createNRange() {
151         if (COUNTTOKENS)  Token.tokens ++;
152         return new RangeToken(Token.NRANGE);
153     }
154     static Token.CharToken createChar(int ch) {
155         if (COUNTTOKENS)  Token.tokens ++;
156         return new Token.CharToken(Token.CHAR, ch);
157     }
158     static private Token.CharToken createAnchor(int ch) {
159         if (COUNTTOKENS)  Token.tokens ++;
160         return new Token.CharToken(Token.ANCHOR, ch);
161     }
162     static Token.StringToken createBackReference(int refno) {
163         if (COUNTTOKENS)  Token.tokens ++;
164         return new Token.StringToken(Token.BACKREFERENCE, null, refno);
165     }
166     static Token.StringToken createString(String str) {
167         if (COUNTTOKENS)  Token.tokens ++;
168         return new Token.StringToken(Token.STRING, str, 0);
169     }
170     static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
171         if (COUNTTOKENS)  Token.tokens ++;
172         return new Token.ModifierToken(child, add, mask);
173     }
174     static Token.ConditionToken createCondition(int refno, Token condition,
175                                                 Token yespat, Token nopat) {
176         if (COUNTTOKENS)  Token.tokens ++;
177         return new Token.ConditionToken(refno, condition, yespat, nopat);
178     }
179
180     protected Token(int type) {
181         this.type = type;
182     }
183
184     /**
185      * A number of children.
186      */
187     int size() {
188         return 0;
189     }
190     Token getChild(int index) {
191         return null;
192     }
193     void addChild(Token tok) {
194         throw new RuntimeException("Not supported.");
195     }
196
197                                                 // for RANGE or NRANGE
198     protected void addRange(int start, int end) {
199         throw new RuntimeException("Not supported.");
200     }
201     protected void sortRanges() {
202         throw new RuntimeException("Not supported.");
203     }
204     protected void compactRanges() {
205         throw new RuntimeException("Not supported.");
206     }
207     protected void mergeRanges(Token tok) {
208         throw new RuntimeException("Not supported.");
209     }
210     protected void subtractRanges(Token tok) {
211         throw new RuntimeException("Not supported.");
212     }
213     protected void intersectRanges(Token tok) {
214         throw new RuntimeException("Not supported.");
215     }
216     static Token complementRanges(Token tok) {
217         return RangeToken.complementRanges(tok);
218     }
219
220
221     void setMin(int min) {                      // for CLOSURE
222     }
223     void setMax(int max) {                      // for CLOSURE
224     }
225     int getMin() {                              // for CLOSURE
226         return -1;
227     }
228     int getMax() {                              // for CLOSURE
229         return -1;
230     }
231     int getReferenceNumber() {                  // for STRING
232         return 0;
233     }
234     String getString() {                        // for STRING
235         return null;
236     }
237
238     int getParenNumber() {
239         return 0;
240     }
241     int getChar() {
242         return -1;
243     }
244
245     public String toString() {
246         return this.toString(0);
247     }
248     public String toString(int options) {
249         return this.type == Token.DOT ? "." : "";
250     }
251
252     /**
253      * How many characters are needed?
254      */
255     final int getMinLength() {
256         switch (this.type) {
257           case CONCAT:
258             int sum = 0;
259             for (int i = 0;  i < this.size();  i ++)
260                 sum += this.getChild(i).getMinLength();
261             return sum;
262
263           case CONDITION:
264           case UNION:
265             if (this.size() == 0)
266                 return 0;
267             int ret = this.getChild(0).getMinLength();
268             for (int i = 1;  i < this.size();  i ++) {
269                 int min = this.getChild(i).getMinLength();
270                 if (min < ret)  ret = min;
271             }
272             return ret;
273
274           case CLOSURE:
275           case NONGREEDYCLOSURE:
276             if (this.getMin() >= 0)
277                 return this.getMin() * this.getChild(0).getMinLength();
278             return 0;
279
280           case EMPTY:
281           case ANCHOR:
282             return 0;
283
284           case DOT:
285           case CHAR:
286           case RANGE:
287           case NRANGE:
288             return 1;
289
290           case INDEPENDENT:
291           case PAREN:
292           case MODIFIERGROUP:
293             return this.getChild(0).getMinLength();
294
295           case BACKREFERENCE:
296             return 0;                           // *******
297
298           case STRING:
299             return this.getString().length();
300
301           case LOOKAHEAD:
302           case NEGATIVELOOKAHEAD:
303           case LOOKBEHIND:
304           case NEGATIVELOOKBEHIND:
305             return 0;                           // ***** Really?
306
307           default:
308             throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
309         }
310     }
311
312     final int getMaxLength() {
313         switch (this.type) {
314           case CONCAT:
315             int sum = 0;
316             for (int i = 0;  i < this.size();  i ++) {
317                 int d = this.getChild(i).getMaxLength();
318                 if (d < 0)  return -1;
319                 sum += d;
320             }
321             return sum;
322
323           case CONDITION:
324           case UNION:
325             if (this.size() == 0)
326                 return 0;
327             int ret = this.getChild(0).getMaxLength();
328             for (int i = 1;  ret >= 0 && i < this.size();  i ++) {
329                 int max = this.getChild(i).getMaxLength();
330                 if (max < 0) {                  // infinity
331                     ret = -1;
332                     break;
333                 }
334                 if (max > ret)  ret = max;
335             }
336             return ret;
337
338           case CLOSURE:
339           case NONGREEDYCLOSURE:
340             if (this.getMax() >= 0)
341                                                 // When this.child.getMaxLength() < 0,
342                                                 // this returns minus value
343                 return this.getMax() * this.getChild(0).getMaxLength();
344             return -1;
345
346           case EMPTY:
347           case ANCHOR:
348             return 0;
349
350           case CHAR:
351             return 1;
352           case DOT:
353           case RANGE:
354           case NRANGE:
355             return 2;
356
357           case INDEPENDENT:
358           case PAREN:
359           case MODIFIERGROUP:
360             return this.getChild(0).getMaxLength();
361
362           case BACKREFERENCE:
363             return -1;                          // ******
364
365           case STRING:
366             return this.getString().length();
367
368           case LOOKAHEAD:
369           case NEGATIVELOOKAHEAD:
370           case LOOKBEHIND:
371           case NEGATIVELOOKBEHIND:
372             return 0;                           // ***** Really?
373
374           default:
375             throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
376         }
377     }
378
379     static final int FC_CONTINUE = 0;
380     static final int FC_TERMINAL = 1;
381     static final int FC_ANY = 2;
382     private static final boolean isSet(int options, int flag) {
383         return (options & flag) == flag;
384     }
385     final int analyzeFirstCharacter(RangeToken result, int options) {
386         switch (this.type) {
387           case CONCAT:
388             int ret = FC_CONTINUE;
389             for (int i = 0;  i < this.size();  i ++)
390                 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
391                     break;
392             return ret;
393
394           case UNION:
395             if (this.size() == 0)
396                 return FC_CONTINUE;
397             /*
398              *  a|b|c -> FC_TERMINAL
399              *  a|.|c -> FC_ANY
400              *  a|b|  -> FC_CONTINUE
401              */
402             int ret2 = FC_CONTINUE;
403             boolean hasEmpty = false;
404             for (int i = 0;  i < this.size();  i ++) {
405                 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
406                 if (ret2 == FC_ANY)
407                     break;
408                 else if (ret2 == FC_CONTINUE)
409                     hasEmpty = true;
410             }
411             return hasEmpty ? FC_CONTINUE : ret2;
412
413           case CONDITION:
414             int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
415             if (this.size() == 1)  return FC_CONTINUE;
416             if (ret3 == FC_ANY)  return ret3;
417             int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
418             if (ret4 == FC_ANY)  return ret4;
419             return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
420
421           case CLOSURE:
422           case NONGREEDYCLOSURE:
423             this.getChild(0).analyzeFirstCharacter(result, options);
424             return FC_CONTINUE;
425
426           case EMPTY:
427           case ANCHOR:
428             return FC_CONTINUE;
429
430           case CHAR:
431             int ch = this.getChar();
432             result.addRange(ch, ch);
433             if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
434                 ch = Character.toUpperCase((char)ch);
435                 result.addRange(ch, ch);
436                 ch = Character.toLowerCase((char)ch);
437                 result.addRange(ch, ch);
438             }
439             return FC_TERMINAL;
440
441           case DOT:
442               return FC_ANY;
443
444           case RANGE:
445             result.mergeRanges(this);
446             return FC_TERMINAL;
447
448           case NRANGE:                          // ****
449             result.mergeRanges(Token.complementRanges(this));
450             return FC_TERMINAL;
451
452           case INDEPENDENT:
453           case PAREN:
454             return this.getChild(0).analyzeFirstCharacter(result, options);
455
456           case MODIFIERGROUP:
457             options |= ((ModifierToken)this).getOptions();
458             options &= ~((ModifierToken)this).getOptionsMask();
459             return this.getChild(0).analyzeFirstCharacter(result, options);
460
461           case BACKREFERENCE:
462             result.addRange(0, UTF16_MAX);  // **** We can not optimize.
463             return FC_ANY;
464
465           case STRING:
466             int cha = this.getString().charAt(0);
467             int ch2;
468             if (REUtil.isHighSurrogate(cha)
469                 && this.getString().length() >= 2
470                 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
471                 cha = REUtil.composeFromSurrogates(cha, ch2);
472             result.addRange(cha, cha);
473             if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
474                 cha = Character.toUpperCase((char)cha);
475                 result.addRange(cha, cha);
476                 cha = Character.toLowerCase((char)cha);
477                 result.addRange(cha, cha);
478             }
479             return FC_TERMINAL;
480
481           case LOOKAHEAD:
482           case NEGATIVELOOKAHEAD:
483           case LOOKBEHIND:
484           case NEGATIVELOOKBEHIND:
485             return FC_CONTINUE;
486
487           default:
488             throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
489         }
490     }
491
492     private final boolean isShorterThan(Token tok) {
493         if (tok == null)  return false;
494         /*
495         int mylength;
496         if (this.type == STRING)  mylength = this.getString().length();
497         else if (this.type == CHAR)  mylength = this.getChar() >= 0x10000 ? 2 : 1;
498         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
499         int otherlength;
500         if (tok.type == STRING)  otherlength = tok.getString().length();
501         else if (tok.type == CHAR)  otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
502         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
503         */
504         int mylength;
505         if (this.type == STRING)  mylength = this.getString().length();
506         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
507         int otherlength;
508         if (tok.type == STRING)  otherlength = tok.getString().length();
509         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
510         return mylength < otherlength;
511     }
512
513     static class FixedStringContainer {
514         Token token = null;
515         int options = 0;
516         FixedStringContainer() {
517         }
518     }
519
520     final void findFixedString(FixedStringContainer container, int options) {
521         switch (this.type) {
522           case CONCAT:
523             Token prevToken = null;
524             int prevOptions = 0;
525             for (int i = 0;  i < this.size();  i ++) {
526                 this.getChild(i).findFixedString(container, options);
527                 if (prevToken == null || prevToken.isShorterThan(container.token)) {
528                     prevToken = container.token;
529                     prevOptions = container.options;
530                 }
531             }
532             container.token = prevToken;
533             container.options = prevOptions;
534             return;
535
536           case UNION:
537           case CLOSURE:
538           case NONGREEDYCLOSURE:
539           case EMPTY:
540           case ANCHOR:
541           case RANGE:
542           case DOT:
543           case NRANGE:
544           case BACKREFERENCE:
545           case LOOKAHEAD:
546           case NEGATIVELOOKAHEAD:
547           case LOOKBEHIND:
548           case NEGATIVELOOKBEHIND:
549           case CONDITION:
550             container.token = null;
551             return;
552
553           case CHAR:                            // Ignore CHAR tokens.
554             container.token = null;             // **
555             return;                             // **
556
557           case STRING:
558             container.token = this;
559             container.options = options;
560             return;
561
562           case INDEPENDENT:
563           case PAREN:
564             this.getChild(0).findFixedString(container, options);
565             return;
566
567           case MODIFIERGROUP:
568             options |= ((ModifierToken)this).getOptions();
569             options &= ~((ModifierToken)this).getOptionsMask();
570             this.getChild(0).findFixedString(container, options);
571             return;
572
573           default:
574             throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
575         }
576     }
577
578     boolean match(int ch) {
579         throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
580     }
581
582     // ------------------------------------------------------
583     private final static Hashtable categories = new Hashtable();
584     private final static Hashtable categories2 = new Hashtable();
585     private static final String[] categoryNames = {
586         "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
587         "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
588         "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
589         "Pi", "Pf",  // 29, 30
590         "L", "M", "N", "Z", "C", "P", "S",      // 31-37
591     };
592
593     // Schema Rec. {Datatypes} - Punctuation 
594     static final int CHAR_INIT_QUOTE  = 29;     // Pi - initial quote
595     static final int CHAR_FINAL_QUOTE = 30;     // Pf - final quote
596     static final int CHAR_LETTER = 31;
597     static final int CHAR_MARK = 32;
598     static final int CHAR_NUMBER = 33;
599     static final int CHAR_SEPARATOR = 34;
600     static final int CHAR_OTHER = 35;
601     static final int CHAR_PUNCTUATION = 36;
602     static final int CHAR_SYMBOL = 37;
603     
604     //blockNames in UNICODE 3.1 that supported by XML Schema REC             
605     private static final String[] blockNames = {
606         /*0000..007F;*/ "Basic Latin",
607         /*0080..00FF;*/ "Latin-1 Supplement",
608         /*0100..017F;*/ "Latin Extended-A",
609         /*0180..024F;*/ "Latin Extended-B",
610         /*0250..02AF;*/ "IPA Extensions",
611         /*02B0..02FF;*/ "Spacing Modifier Letters",
612         /*0300..036F;*/ "Combining Diacritical Marks",
613         /*0370..03FF;*/ "Greek",
614         /*0400..04FF;*/ "Cyrillic",
615         /*0530..058F;*/ "Armenian",
616         /*0590..05FF;*/ "Hebrew",
617         /*0600..06FF;*/ "Arabic",
618         /*0700..074F;*/ "Syriac",  
619         /*0780..07BF;*/ "Thaana",
620         /*0900..097F;*/ "Devanagari",
621         /*0980..09FF;*/ "Bengali",
622         /*0A00..0A7F;*/ "Gurmukhi",
623         /*0A80..0AFF;*/ "Gujarati",
624         /*0B00..0B7F;*/ "Oriya",
625         /*0B80..0BFF;*/ "Tamil",
626         /*0C00..0C7F;*/ "Telugu",
627         /*0C80..0CFF;*/ "Kannada",
628         /*0D00..0D7F;*/ "Malayalam",
629         /*0D80..0DFF;*/ "Sinhala",
630         /*0E00..0E7F;*/ "Thai",
631         /*0E80..0EFF;*/ "Lao",
632         /*0F00..0FFF;*/ "Tibetan",
633         /*1000..109F;*/ "Myanmar", 
634         /*10A0..10FF;*/ "Georgian",
635         /*1100..11FF;*/ "Hangul Jamo",
636         /*1200..137F;*/ "Ethiopic",
637         /*13A0..13FF;*/ "Cherokee",
638         /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
639         /*1680..169F;*/ "Ogham",
640         /*16A0..16FF;*/ "Runic",
641         /*1780..17FF;*/ "Khmer",
642         /*1800..18AF;*/ "Mongolian",
643         /*1E00..1EFF;*/ "Latin Extended Additional",
644         /*1F00..1FFF;*/ "Greek Extended",
645         /*2000..206F;*/ "General Punctuation",
646         /*2070..209F;*/ "Superscripts and Subscripts",
647         /*20A0..20CF;*/ "Currency Symbols",
648         /*20D0..20FF;*/ "Combining Marks for Symbols",
649         /*2100..214F;*/ "Letterlike Symbols",
650         /*2150..218F;*/ "Number Forms",
651         /*2190..21FF;*/ "Arrows",
652         /*2200..22FF;*/ "Mathematical Operators",
653         /*2300..23FF;*/ "Miscellaneous Technical",
654         /*2400..243F;*/ "Control Pictures",
655         /*2440..245F;*/ "Optical Character Recognition",
656         /*2460..24FF;*/ "Enclosed Alphanumerics",
657         /*2500..257F;*/ "Box Drawing",
658         /*2580..259F;*/ "Block Elements",
659         /*25A0..25FF;*/ "Geometric Shapes",
660         /*2600..26FF;*/ "Miscellaneous Symbols",
661         /*2700..27BF;*/ "Dingbats",
662         /*2800..28FF;*/ "Braille Patterns",
663         /*2E80..2EFF;*/ "CJK Radicals Supplement",
664         /*2F00..2FDF;*/ "Kangxi Radicals",
665         /*2FF0..2FFF;*/ "Ideographic Description Characters",
666         /*3000..303F;*/ "CJK Symbols and Punctuation",
667         /*3040..309F;*/ "Hiragana",
668         /*30A0..30FF;*/ "Katakana",
669         /*3100..312F;*/ "Bopomofo",
670         /*3130..318F;*/ "Hangul Compatibility Jamo",
671         /*3190..319F;*/ "Kanbun",
672         /*31A0..31BF;*/ "Bopomofo Extended",
673         /*3200..32FF;*/ "Enclosed CJK Letters and Months",
674         /*3300..33FF;*/ "CJK Compatibility",
675         /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
676         /*4E00..9FFF;*/ "CJK Unified Ideographs",
677         /*A000..A48F;*/ "Yi Syllables",
678         /*A490..A4CF;*/ "Yi Radicals",
679         /*AC00..D7A3;*/ "Hangul Syllables",
680         /*E000..F8FF;*/ "Private Use",
681         /*F900..FAFF;*/ "CJK Compatibility Ideographs",
682         /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
683         /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
684         /*FE20..FE2F;*/ "Combining Half Marks",
685         /*FE30..FE4F;*/ "CJK Compatibility Forms",
686         /*FE50..FE6F;*/ "Small Form Variants",
687         /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
688         /*FEFF..FEFF;*/ "Specials",
689         /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
690          //missing Specials add manually
691         /*10300..1032F;*/ "Old Italic",         // 84
692         /*10330..1034F;*/ "Gothic",
693         /*10400..1044F;*/ "Deseret",
694         /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
695         /*1D100..1D1FF;*/ "Musical Symbols",
696         /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
697         /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
698         /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
699         /*E0000..E007F;*/ "Tags",
700         //missing 2 private use add manually
701
702     };
703     //ADD THOSE MANUALLY
704     //F0000..FFFFD; "Private Use",
705     //100000..10FFFD; "Private Use"
706     //FFF0..FFFD; "Specials", 
707     static final String blockRanges = 
708        "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
709         +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
710         +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
711         +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
712         +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
713         +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
714         +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
715         +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
716         +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
717         +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
718         +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
719     static final int[] nonBMPBlockRanges = {
720         0x10300, 0x1032F,       // 84
721         0x10330, 0x1034F,
722         0x10400, 0x1044F,
723         0x1D000, 0x1D0FF,
724         0x1D100, 0x1D1FF,
725         0x1D400, 0x1D7FF,
726         0x20000, 0x2A6D6,
727         0x2F800, 0x2FA1F,
728         0xE0000, 0xE007F
729     };
730     private static final int NONBMP_BLOCK_START = 84;
731
732     static protected RangeToken getRange(String name, boolean positive) {
733         if (Token.categories.size() == 0) {
734             synchronized (Token.categories) {
735                 Token[] ranges = new Token[Token.categoryNames.length];
736                 for (int i = 0;  i < ranges.length;  i ++) {
737                     ranges[i] = Token.createRange();
738                 }
739                 int type;
740                 for (int i = 0;  i < 0x10000;  i ++) {
741                     type = Character.getType((char)i);
742                     if (type == Character.START_PUNCTUATION || 
743                         type == Character.END_PUNCTUATION) {
744                         //build table of Pi values
745                         if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
746                             i == 0x201F || i == 0x2039) {
747                             type = CHAR_INIT_QUOTE;
748                         }
749                         //build table of Pf values
750                         if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
751                             type = CHAR_FINAL_QUOTE;
752                         }
753                     }
754                     ranges[type].addRange(i, i);
755                     switch (type) {
756                       case Character.UPPERCASE_LETTER:
757                       case Character.LOWERCASE_LETTER:
758                       case Character.TITLECASE_LETTER:
759                       case Character.MODIFIER_LETTER:
760                       case Character.OTHER_LETTER:
761                         type = CHAR_LETTER;
762                         break;
763                       case Character.NON_SPACING_MARK:
764                       case Character.COMBINING_SPACING_MARK:
765                       case Character.ENCLOSING_MARK:
766                         type = CHAR_MARK;
767                         break;
768                       case Character.DECIMAL_DIGIT_NUMBER:
769                       case Character.LETTER_NUMBER:
770                       case Character.OTHER_NUMBER:
771                         type = CHAR_NUMBER;
772                         break;
773                       case Character.SPACE_SEPARATOR:
774                       case Character.LINE_SEPARATOR:
775                       case Character.PARAGRAPH_SEPARATOR:
776                         type = CHAR_SEPARATOR;
777                         break;
778                       case Character.CONTROL:
779                       case Character.FORMAT:
780                       case Character.SURROGATE:
781                       case Character.PRIVATE_USE:
782                       case Character.UNASSIGNED:
783                         type = CHAR_OTHER;
784                         break;
785                       case Character.CONNECTOR_PUNCTUATION:
786                       case Character.DASH_PUNCTUATION:
787                       case Character.START_PUNCTUATION:
788                       case Character.END_PUNCTUATION:
789                       case CHAR_INIT_QUOTE:
790                       case CHAR_FINAL_QUOTE:
791                       case Character.OTHER_PUNCTUATION:
792                         type = CHAR_PUNCTUATION;
793                         break;
794                       case Character.MATH_SYMBOL:
795                       case Character.CURRENCY_SYMBOL:
796                       case Character.MODIFIER_SYMBOL:
797                       case Character.OTHER_SYMBOL:
798                         type = CHAR_SYMBOL;
799                         break;
800                       default:
801                         throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
802                     }
803                     ranges[type].addRange(i, i);
804                 } // for all characters
805                 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
806                 ranges[CHAR_OTHER].addRange(0x10000, Token.UTF16_MAX);
807
808                 for (int i = 0;  i < ranges.length;  i ++) {
809                     if (Token.categoryNames[i] != null) {
810                         if (i == Character.UNASSIGNED) { // Unassigned
811                             ranges[i].addRange(0x10000, Token.UTF16_MAX);
812                         }
813                         Token.categories.put(Token.categoryNames[i], ranges[i]);
814                         Token.categories2.put(Token.categoryNames[i],
815                                               Token.complementRanges(ranges[i]));
816                     }
817                 }
818                 //REVISIT: do we really need to support block names as in Unicode 3.1
819                 //         or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
820                 //
821                 StringBuffer buffer = new StringBuffer(50);
822                 for (int i = 0;  i < Token.blockNames.length;  i ++) {
823                     Token r1 = Token.createRange();
824                     int location;
825                     if (i < NONBMP_BLOCK_START) {
826                         location = i*2;
827                         int rstart = Token.blockRanges.charAt(location);
828                         int rend = Token.blockRanges.charAt(location+1);
829                         //DEBUGING
830                         //System.out.println(n+" " +Integer.toHexString(rstart)
831                         //                     +"-"+ Integer.toHexString(rend));
832                         r1.addRange(rstart, rend);
833                     } else {
834                         location = (i - NONBMP_BLOCK_START) * 2;
835                         r1.addRange(Token.nonBMPBlockRanges[location],
836                                     Token.nonBMPBlockRanges[location + 1]);
837                     }
838                     String n = Token.blockNames[i];
839                     if (n.equals("Specials"))
840                         r1.addRange(0xfff0, 0xfffd);
841                     if (n.equals("Private Use")) {
842                         r1.addRange(0xF0000,0xFFFFD);
843                         r1.addRange(0x100000,0x10FFFD);
844                     }
845                     Token.categories.put(n, r1);
846                     Token.categories2.put(n, Token.complementRanges(r1));
847                     buffer.setLength(0);
848                     buffer.append("Is");
849                     if (n.indexOf(' ') >= 0) {
850                         for (int ci = 0;  ci < n.length();  ci ++)
851                             if (n.charAt(ci) != ' ')  buffer.append((char)n.charAt(ci));
852                     }
853                     else {
854                         buffer.append(n);
855                     }
856                     Token.setAlias(buffer.toString(), n, true);
857                 }
858
859                 // TR#18 1.2
860                 Token.setAlias("ASSIGNED", "Cn", false);
861                 Token.setAlias("UNASSIGNED", "Cn", true);
862                 Token all = Token.createRange();
863                 all.addRange(0, Token.UTF16_MAX);
864                 Token.categories.put("ALL", all);
865                 Token.categories2.put("ALL", Token.complementRanges(all));
866                 Token.registerNonXS("ASSIGNED");
867                 Token.registerNonXS("UNASSIGNED");
868                 Token.registerNonXS("ALL");
869
870                 Token isalpha = Token.createRange();
871                 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
872                 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
873                 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
874                 Token.categories.put("IsAlpha", isalpha);
875                 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
876                 Token.registerNonXS("IsAlpha");
877
878                 Token isalnum = Token.createRange();
879                 isalnum.mergeRanges(isalpha);   // Lu Ll Lo
880                 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
881                 Token.categories.put("IsAlnum", isalnum);
882                 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
883                 Token.registerNonXS("IsAlnum");
884
885                 Token isspace = Token.createRange();
886                 isspace.mergeRanges(Token.token_spaces);
887                 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
888                 Token.categories.put("IsSpace", isspace);
889                 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
890                 Token.registerNonXS("IsSpace");
891
892                 Token isword = Token.createRange();
893                 isword.mergeRanges(isalnum);     // Lu Ll Lo Nd
894                 isword.addRange('_', '_');
895                 Token.categories.put("IsWord", isword);
896                 Token.categories2.put("IsWord", Token.complementRanges(isword));
897                 Token.registerNonXS("IsWord");
898
899                 Token isascii = Token.createRange();
900                 isascii.addRange(0, 127);
901                 Token.categories.put("IsASCII", isascii);
902                 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
903                 Token.registerNonXS("IsASCII");
904
905                 Token isnotgraph = Token.createRange();
906                 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
907                 isnotgraph.addRange(' ', ' ');
908                 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
909                 Token.categories2.put("IsGraph", isnotgraph);
910                 Token.registerNonXS("IsGraph");
911
912                 Token isxdigit = Token.createRange();
913                 isxdigit.addRange('0', '9');
914                 isxdigit.addRange('A', 'F');
915                 isxdigit.addRange('a', 'f');
916                 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
917                 Token.categories2.put("IsXDigit", isxdigit);
918                 Token.registerNonXS("IsXDigit");
919
920                 Token.setAlias("IsDigit", "Nd", true);
921                 Token.setAlias("IsUpper", "Lu", true);
922                 Token.setAlias("IsLower", "Ll", true);
923                 Token.setAlias("IsCntrl", "C", true);
924                 Token.setAlias("IsPrint", "C", false);
925                 Token.setAlias("IsPunct", "P", true);
926                 Token.registerNonXS("IsDigit");
927                 Token.registerNonXS("IsUpper");
928                 Token.registerNonXS("IsLower");
929                 Token.registerNonXS("IsCntrl");
930                 Token.registerNonXS("IsPrint");
931                 Token.registerNonXS("IsPunct");
932
933                 Token.setAlias("alpha", "IsAlpha", true);
934                 Token.setAlias("alnum", "IsAlnum", true);
935                 Token.setAlias("ascii", "IsASCII", true);
936                 Token.setAlias("cntrl", "IsCntrl", true);
937                 Token.setAlias("digit", "IsDigit", true);
938                 Token.setAlias("graph", "IsGraph", true);
939                 Token.setAlias("lower", "IsLower", true);
940                 Token.setAlias("print", "IsPrint", true);
941                 Token.setAlias("punct", "IsPunct", true);
942                 Token.setAlias("space", "IsSpace", true);
943                 Token.setAlias("upper", "IsUpper", true);
944                 Token.setAlias("word", "IsWord", true); // Perl extension
945                 Token.setAlias("xdigit", "IsXDigit", true);
946                 Token.registerNonXS("alpha");
947                 Token.registerNonXS("alnum");
948                 Token.registerNonXS("ascii");
949                 Token.registerNonXS("cntrl");
950                 Token.registerNonXS("digit");
951                 Token.registerNonXS("graph");
952                 Token.registerNonXS("lower");
953                 Token.registerNonXS("print");
954                 Token.registerNonXS("punct");
955                 Token.registerNonXS("space");
956                 Token.registerNonXS("upper");
957                 Token.registerNonXS("word");
958                 Token.registerNonXS("xdigit");
959             } // synchronized
960         } // if null
961         RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
962             : (RangeToken)Token.categories2.get(name);
963         //if (tok == null) System.out.println(name);
964         return tok;
965     }
966     static protected RangeToken getRange(String name, boolean positive, boolean xs) {
967         RangeToken range = Token.getRange(name, positive);
968         if (xs && range != null && Token.isRegisterNonXS(name))
969             range = null;
970         return range;
971     }
972
973     static Hashtable nonxs = null;
974     /**
975      * This method is called by only getRange().
976      * So this method need not MT-safe.
977      */
978     static protected void registerNonXS(String name) {
979         if (Token.nonxs == null)
980             Token.nonxs = new Hashtable();
981         Token.nonxs.put(name, name);
982     }
983     static protected boolean isRegisterNonXS(String name) {
984         if (Token.nonxs == null)
985             return false;
986         //DEBUG
987         //System.err.println("isRegisterNonXS: "+name);
988         return Token.nonxs.containsKey(name);
989     }
990
991     private static void setAlias(String newName, String name, boolean positive) {
992         Token t1 = (Token)Token.categories.get(name);
993         Token t2 = (Token)Token.categories2.get(name);
994         if (positive) {
995             Token.categories.put(newName, t1);
996             Token.categories2.put(newName, t2);
997         } else {
998             Token.categories2.put(newName, t1);
999             Token.categories.put(newName, t2);
1000         }
1001     }
1002
1003     // ------------------------------------------------------
1004
1005     static final String viramaString =
1006     "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1007     +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1008     +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1009     +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1010     +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1011     +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1012     +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1013     +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1014     +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1015     +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1016     +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1017
1018     static private Token token_grapheme = null;
1019     static synchronized Token getGraphemePattern() {
1020         if (Token.token_grapheme != null)
1021             return Token.token_grapheme;
1022
1023         Token base_char = Token.createRange();  // [{ASSIGNED}]-[{M},{C}]
1024         base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1025         base_char.subtractRanges(Token.getRange("M", true));
1026         base_char.subtractRanges(Token.getRange("C", true));
1027
1028         Token virama = Token.createRange();
1029         for (int i = 0;  i < Token.viramaString.length(); i++) {
1030             virama.addRange(i, i);
1031         }
1032
1033         Token combiner_wo_virama = Token.createRange();
1034         combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1035         combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1036         combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1037
1038         Token left = Token.createUnion();       // base_char?
1039         left.addChild(base_char);
1040         left.addChild(Token.token_empty);
1041
1042         Token foo = Token.createUnion();
1043         foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1044         foo.addChild(combiner_wo_virama);
1045
1046         foo = Token.createClosure(foo);
1047
1048         foo = Token.createConcat(left, foo);
1049
1050         Token.token_grapheme = foo;
1051         return Token.token_grapheme;
1052     }
1053
1054     /**
1055      * Combing Character Sequence in Perl 5.6.
1056      */
1057     static private Token token_ccs = null;
1058     static synchronized Token getCombiningCharacterSequence() {
1059         if (Token.token_ccs != null)
1060             return Token.token_ccs;
1061
1062         Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1063         foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1064         Token.token_ccs = foo;
1065         return Token.token_ccs;
1066     }
1067
1068     // ------------------------------------------------------
1069
1070     // ------------------------------------------------------
1071     /**
1072      * This class represents a node in parse tree.
1073      */
1074     static class StringToken extends Token implements java.io.Serializable {
1075
1076         private static final long serialVersionUID = -4614366944218504172L;
1077         
1078         String string;
1079         final int refNumber;
1080
1081         StringToken(int type, String str, int n) {
1082             super(type);
1083             this.string = str;
1084             this.refNumber = n;
1085         }
1086
1087         int getReferenceNumber() {              // for STRING
1088             return this.refNumber;
1089         }
1090         String getString() {                    // for STRING
1091             return this.string;
1092         }
1093         
1094         public String toString(int options) {
1095             if (this.type == BACKREFERENCE)
1096                 return "\\"+this.refNumber;
1097             else
1098                 return REUtil.quoteMeta(this.string);
1099         }
1100     }
1101
1102     /**
1103      * This class represents a node in parse tree.
1104      */
1105     static class ConcatToken extends Token implements java.io.Serializable {
1106
1107         private static final long serialVersionUID = 8717321425541346381L;
1108         
1109         final Token child;
1110         final Token child2;
1111         
1112         ConcatToken(Token t1, Token t2) {
1113             super(Token.CONCAT);
1114             this.child = t1;
1115             this.child2 = t2;
1116         }
1117
1118         int size() {
1119             return 2;
1120         }
1121         Token getChild(int index) {
1122             return index == 0 ? this.child : this.child2;
1123         }
1124
1125         public String toString(int options) {
1126             String ret;
1127             if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1128                 ret = this.child.toString(options)+"+";
1129             } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1130                 ret = this.child.toString(options)+"+?";
1131             } else
1132                 ret = this.child.toString(options)+this.child2.toString(options);
1133             return ret;
1134         }
1135     }
1136
1137     /**
1138      * This class represents a node in parse tree.
1139      */
1140     static class CharToken extends Token implements java.io.Serializable {
1141
1142         private static final long serialVersionUID = -4394272816279496989L;
1143         
1144         final int chardata;
1145
1146         CharToken(int type, int ch) {
1147             super(type);
1148             this.chardata = ch;
1149         }
1150
1151         int getChar() {
1152             return this.chardata;
1153         }
1154
1155         public String toString(int options) {
1156             String ret;
1157             switch (this.type) {
1158               case CHAR:
1159                 switch (this.chardata) {
1160                   case '|':  case '*':  case '+':  case '?':
1161                   case '(':  case ')':  case '.':  case '[':
1162                   case '{':  case '\\':
1163                     ret = "\\"+(char)this.chardata;
1164                     break;
1165                   case '\f':  ret = "\\f";  break;
1166                   case '\n':  ret = "\\n";  break;
1167                   case '\r':  ret = "\\r";  break;
1168                   case '\t':  ret = "\\t";  break;
1169                   case 0x1b:  ret = "\\e";  break;
1170                     //case 0x0b:  ret = "\\v";  break;
1171                   default:
1172                     if (this.chardata >= 0x10000) {
1173                         String pre = "0"+Integer.toHexString(this.chardata);
1174                         ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1175                     } else
1176                         ret = ""+(char)this.chardata;
1177                 }
1178                 break;
1179
1180               case ANCHOR:
1181                 if (this == Token.token_linebeginning || this == Token.token_lineend)
1182                     ret = ""+(char)this.chardata;
1183                 else 
1184                     ret = "\\"+(char)this.chardata;
1185                 break;
1186
1187               default:
1188                 ret = null;
1189             }
1190             return ret;
1191         }
1192
1193         boolean match(int ch) {
1194             if (this.type == CHAR) {
1195                 return ch == this.chardata;
1196             } else
1197                 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
1198         }
1199     }
1200
1201     /**
1202      * This class represents a node in parse tree.
1203      */
1204     static class ClosureToken extends Token implements java.io.Serializable {
1205
1206         private static final long serialVersionUID = 1308971930673997452L;
1207         
1208         int min;
1209         int max;
1210         final Token child;
1211
1212         ClosureToken(int type, Token tok) {
1213             super(type);
1214             this.child = tok;
1215             this.setMin(-1);
1216             this.setMax(-1);
1217         }
1218
1219         int size() {
1220             return 1;
1221         }
1222         Token getChild(int index) {
1223             return this.child;
1224         }
1225
1226         final void setMin(int min) {
1227             this.min = min;
1228         }
1229         final void setMax(int max) {
1230             this.max = max;
1231         }
1232         final int getMin() {
1233             return this.min;
1234         }
1235         final int getMax() {
1236             return this.max;
1237         }
1238
1239         public String toString(int options) {
1240             String ret;
1241             if (this.type == CLOSURE) {
1242                 if (this.getMin() < 0 && this.getMax() < 0) {
1243                     ret = this.child.toString(options)+"*";
1244                 } else if (this.getMin() == this.getMax()) {
1245                     ret = this.child.toString(options)+"{"+this.getMin()+"}";
1246                 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1247                     ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1248                 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1249                     ret = this.child.toString(options)+"{"+this.getMin()+",}";
1250                 } else
1251                     throw new RuntimeException("Token#toString(): CLOSURE "
1252                                                +this.getMin()+", "+this.getMax());
1253             } else {
1254                 if (this.getMin() < 0 && this.getMax() < 0) {
1255                     ret = this.child.toString(options)+"*?";
1256                 } else if (this.getMin() == this.getMax()) {
1257                     ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1258                 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1259                     ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1260                 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1261                     ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1262                 } else
1263                     throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
1264                                                +this.getMin()+", "+this.getMax());
1265             }
1266             return ret;
1267         }
1268     }
1269
1270     /**
1271      * This class represents a node in parse tree.
1272      */
1273     static class ParenToken extends Token implements java.io.Serializable {
1274
1275         private static final long serialVersionUID = -5938014719827987704L;
1276         
1277         final Token child;
1278         final int parennumber;
1279
1280         ParenToken(int type, Token tok, int paren) {
1281             super(type);
1282             this.child = tok;
1283             this.parennumber = paren;
1284         }
1285
1286         int size() {
1287             return 1;
1288         }
1289         Token getChild(int index) {
1290             return this.child;
1291         }
1292
1293         int getParenNumber() {
1294             return this.parennumber;
1295         }
1296
1297         public String toString(int options) {
1298             String ret = null;
1299             switch (this.type) {
1300               case PAREN:
1301                 if (this.parennumber == 0) {
1302                     ret = "(?:"+this.child.toString(options)+")";
1303                 } else {
1304                     ret = "("+this.child.toString(options)+")";
1305                 }
1306                 break;
1307
1308               case LOOKAHEAD:
1309                 ret = "(?="+this.child.toString(options)+")";
1310                 break;
1311               case NEGATIVELOOKAHEAD:
1312                 ret = "(?!"+this.child.toString(options)+")";
1313                 break;
1314               case LOOKBEHIND:
1315                 ret = "(?<="+this.child.toString(options)+")";
1316                 break;
1317               case NEGATIVELOOKBEHIND:
1318                 ret = "(?<!"+this.child.toString(options)+")";
1319                 break;
1320               case INDEPENDENT:
1321                 ret = "(?>"+this.child.toString(options)+")";
1322                 break;
1323             }
1324             return ret;
1325         }
1326     }
1327
1328     /**
1329      * (?(condition)yes-pattern|no-pattern)
1330      */
1331     static class ConditionToken extends Token implements java.io.Serializable {
1332
1333         private static final long serialVersionUID = 4353765277910594411L;
1334         
1335         final int refNumber;
1336         final Token condition;
1337         final Token yes;
1338         final Token no;
1339         ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1340             super(Token.CONDITION);
1341             this.refNumber = refno;
1342             this.condition = cond;
1343             this.yes = yespat;
1344             this.no = nopat;
1345         }
1346         int size() {
1347             return this.no == null ? 1 : 2;
1348         }
1349         Token getChild(int index) {
1350             if (index == 0)  return this.yes;
1351             if (index == 1)  return this.no;
1352             throw new RuntimeException("Internal Error: "+index);
1353         }
1354
1355         public String toString(int options) {
1356             String ret;
1357             if (refNumber > 0) {
1358                 ret = "(?("+refNumber+")";
1359             } else if (this.condition.type == Token.ANCHOR) {
1360                 ret = "(?("+this.condition+")";
1361             } else {
1362                 ret = "(?"+this.condition;
1363             }
1364
1365             if (this.no == null) {
1366                 ret += this.yes+")";
1367             } else {
1368                 ret += this.yes+"|"+this.no+")";
1369             }
1370             return ret;
1371         }
1372     }
1373
1374     /**
1375      * (ims-ims: .... )
1376      */
1377     static class ModifierToken extends Token implements java.io.Serializable {
1378
1379         private static final long serialVersionUID = -9114536559696480356L;
1380         
1381         final Token child;
1382         final int add;
1383         final int mask;
1384
1385         ModifierToken(Token tok, int add, int mask) {
1386             super(Token.MODIFIERGROUP);
1387             this.child = tok;
1388             this.add = add;
1389             this.mask = mask;
1390         }
1391
1392         int size() {
1393             return 1;
1394         }
1395         Token getChild(int index) {
1396             return this.child;
1397         }
1398
1399         int getOptions() {
1400             return this.add;
1401         }
1402         int getOptionsMask() {
1403             return this.mask;
1404         }
1405
1406         public String toString(int options) {
1407             return "(?"
1408                 +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1409                 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1410                 +":"
1411                 +this.child.toString(options)
1412                 +")";
1413         }
1414     }
1415
1416     /**
1417      * This class represents a node in parse tree.
1418      * for UNION or CONCAT.
1419      */
1420     static class UnionToken extends Token implements java.io.Serializable {
1421
1422         private static final long serialVersionUID = -2568843945989489861L;
1423         
1424         Vector children;
1425
1426         UnionToken(int type) {
1427             super(type);
1428         }
1429
1430         void addChild(Token tok) {
1431             if (tok == null)  return;
1432             if (this.children == null)  this.children = new Vector();
1433             if (this.type == UNION) {
1434                 this.children.addElement(tok);
1435                 return;
1436             }
1437                                                 // This is CONCAT, and new child is CONCAT.
1438             if (tok.type == CONCAT) {
1439                 for (int i = 0;  i < tok.size();  i ++)
1440                     this.addChild(tok.getChild(i)); // Recursion
1441                 return;
1442             }
1443             int size = this.children.size();
1444             if (size == 0) {
1445                 this.children.addElement(tok);
1446                 return;
1447             }
1448             Token previous = (Token)this.children.elementAt(size-1);
1449             if (!((previous.type == CHAR || previous.type == STRING)
1450                   && (tok.type == CHAR || tok.type == STRING))) {
1451                 this.children.addElement(tok);
1452                 return;
1453             }
1454             
1455             //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1456
1457             StringBuffer buffer;
1458             int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1459             if (previous.type == CHAR) {        // Replace previous token by STRING
1460                 buffer = new StringBuffer(2 + nextMaxLength);
1461                 int ch = previous.getChar();
1462                 if (ch >= 0x10000)
1463                     buffer.append(REUtil.decomposeToSurrogates(ch));
1464                 else
1465                     buffer.append((char)ch);
1466                 previous = Token.createString(null);
1467                 this.children.setElementAt(previous, size-1);
1468             } else {                            // STRING
1469                 buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
1470                 buffer.append(previous.getString());
1471             }
1472
1473             if (tok.type == CHAR) {
1474                 int ch = tok.getChar();
1475                 if (ch >= 0x10000)
1476                     buffer.append(REUtil.decomposeToSurrogates(ch));
1477                 else
1478                     buffer.append((char)ch);
1479             } else {
1480                 buffer.append(tok.getString());
1481             }
1482
1483             ((StringToken)previous).string = new String(buffer);
1484         }
1485
1486         int size() {
1487             return this.children == null ? 0 : this.children.size();
1488         }
1489         Token getChild(int index) {
1490             return (Token)this.children.elementAt(index);
1491         }
1492
1493         public String toString(int options) {
1494             String ret;
1495             if (this.type == CONCAT) {
1496                 if (this.children.size() == 2) {
1497                     Token ch = this.getChild(0);
1498                     Token ch2 = this.getChild(1);
1499                     if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1500                         ret = ch.toString(options)+"+";
1501                     } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1502                         ret = ch.toString(options)+"+?";
1503                     } else
1504                         ret = ch.toString(options)+ch2.toString(options);
1505                 } else {
1506                     StringBuffer sb = new StringBuffer();
1507                     for (int i = 0;  i < this.children.size();  i ++) {
1508                         sb.append(((Token)this.children.elementAt(i)).toString(options));
1509                     }
1510                     ret = new String(sb);
1511                 }
1512                 return ret;
1513             }
1514             if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1515                 ret = this.getChild(0).toString(options)+"?";
1516             } else if (this.children.size() == 2
1517                        && this.getChild(0).type == EMPTY) {
1518                 ret = this.getChild(1).toString(options)+"??";
1519             } else {
1520                 StringBuffer sb = new StringBuffer();
1521                 sb.append(((Token)this.children.elementAt(0)).toString(options));
1522                 for (int i = 1;  i < this.children.size();  i ++) {
1523                     sb.append((char)'|');
1524                     sb.append(((Token)this.children.elementAt(i)).toString(options));
1525                 }
1526                 ret = new String(sb);
1527             }
1528             return ret;
1529         }
1530     }
1531 }