BUG-5410: introduce RegularExpression.toPatternString()
[yangtools.git] / third-party / xsd-regex / src / main / java / org / opendaylight / yangtools / xsd / regex / Token.java
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 package org.opendaylight.yangtools.xsd.regex;
19
20 import java.util.Hashtable;
21 import java.util.Vector;
22
23 /**
24  * This class represents a node in parse tree.
25  *
26  * @xerces.internal
27  *
28  * @version $Id: Token.java 1638344 2014-11-11 20:15:46Z mrglavas $
29  */
30 class Token implements java.io.Serializable {
31
32     private static final long serialVersionUID = 8484976002585487481L;
33
34     static final boolean COUNTTOKENS = true;
35     static int tokens = 0;
36
37     static final int CHAR = 0;                  // Literal char
38     static final int DOT = 11;                  // .
39     static final int CONCAT = 1;                // XY
40     static final int UNION = 2;                 // X|Y|Z
41     static final int CLOSURE = 3;               // X*
42     static final int RANGE = 4;                 // [a-zA-Z] etc.
43     static final int NRANGE = 5;                // [^a-zA-Z] etc.
44     static final int PAREN = 6;                 // (X) or (?:X)
45     static final int EMPTY = 7;                 //
46     static final int ANCHOR = 8;                // ^ $ \b \B \< \> \A \Z \z
47     static final int NONGREEDYCLOSURE = 9;      // *? +?
48     static final int STRING = 10;               // strings
49     static final int BACKREFERENCE = 12;        // back references
50     static final int LOOKAHEAD = 20;            // (?=...)
51     static final int NEGATIVELOOKAHEAD = 21;    // (?!...)
52     static final int LOOKBEHIND = 22;           // (?<=...)
53     static final int NEGATIVELOOKBEHIND = 23;   // (?<!...)
54     static final int INDEPENDENT = 24;          // (?>...)
55     static final int MODIFIERGROUP = 25;        // (?ims-ims:...)
56     static final int CONDITION = 26;            // (?(...)yes|no)
57
58     static final int UTF16_MAX = 0x10ffff;
59
60     final int type;
61
62     static final Token token_dot;
63     static final Token token_0to9;
64     static final Token token_wordchars;
65     static final Token token_not_0to9;
66     static final Token token_not_wordchars;
67     static final Token token_spaces;
68     static final Token token_not_spaces;
69     static final Token token_empty;
70     static final Token token_linebeginning;
71     static final Token token_linebeginning2;
72     static final Token token_lineend;
73     static final Token token_stringbeginning;
74     static final Token token_stringend;
75     static final Token token_stringend2;
76     static final Token token_wordedge;
77     static final Token token_not_wordedge;
78     static final Token token_wordbeginning;
79     static final Token token_wordend;
80     static {
81         token_empty = new Token(Token.EMPTY);
82
83         token_linebeginning = Token.createAnchor('^');
84         token_linebeginning2 = Token.createAnchor('@');
85         token_lineend = Token.createAnchor('$');
86         token_stringbeginning = Token.createAnchor('A');
87         token_stringend = Token.createAnchor('z');
88         token_stringend2 = Token.createAnchor('Z');
89         token_wordedge = Token.createAnchor('b');
90         token_not_wordedge = Token.createAnchor('B');
91         token_wordbeginning = Token.createAnchor('<');
92         token_wordend = Token.createAnchor('>');
93
94         token_dot = new Token(Token.DOT);
95
96         token_0to9 = Token.createRange();
97         token_0to9.addRange('0', '9');
98         token_wordchars = Token.createRange();
99         token_wordchars.addRange('0', '9');
100         token_wordchars.addRange('A', 'Z');
101         token_wordchars.addRange('_', '_');
102         token_wordchars.addRange('a', 'z');
103         token_spaces = Token.createRange();
104         token_spaces.addRange('\t', '\t');
105         token_spaces.addRange('\n', '\n');
106         token_spaces.addRange('\f', '\f');
107         token_spaces.addRange('\r', '\r');
108         token_spaces.addRange(' ', ' ');
109
110         token_not_0to9 = Token.complementRanges(Token.token_0to9);
111         token_not_wordchars = Token.complementRanges(Token.token_wordchars);
112         token_not_spaces = Token.complementRanges(Token.token_spaces);
113     }
114
115     static Token.ParenToken createLook(int type, Token child) {
116         if (COUNTTOKENS) {
117             Token.tokens ++;
118         }
119         return new Token.ParenToken(type, child, 0);
120     }
121     static Token.ParenToken createParen(Token child, int pnumber) {
122         if (COUNTTOKENS) {
123             Token.tokens ++;
124         }
125         return new Token.ParenToken(Token.PAREN, child, pnumber);
126     }
127     static Token.ClosureToken createClosure(Token tok) {
128         if (COUNTTOKENS) {
129             Token.tokens ++;
130         }
131         return new Token.ClosureToken(Token.CLOSURE, tok);
132     }
133     static Token.ClosureToken createNGClosure(Token tok) {
134         if (COUNTTOKENS) {
135             Token.tokens ++;
136         }
137         return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
138     }
139     static Token.ConcatToken createConcat(Token tok1, Token tok2) {
140         if (COUNTTOKENS) {
141             Token.tokens ++;
142         }
143         return new Token.ConcatToken(tok1, tok2);
144     }
145     static Token.UnionToken createConcat() {
146         if (COUNTTOKENS) {
147             Token.tokens ++;
148         }
149         return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
150     }
151     static Token.UnionToken createUnion() {
152         if (COUNTTOKENS) {
153             Token.tokens ++;
154         }
155         return new Token.UnionToken(Token.UNION);
156     }
157     static Token createEmpty() {
158         return Token.token_empty;
159     }
160     static RangeToken createRange() {
161         if (COUNTTOKENS) {
162             Token.tokens ++;
163         }
164         return new RangeToken(Token.RANGE);
165     }
166     static RangeToken createNRange() {
167         if (COUNTTOKENS) {
168             Token.tokens ++;
169         }
170         return new RangeToken(Token.NRANGE);
171     }
172     static Token.CharToken createChar(int ch) {
173         if (COUNTTOKENS) {
174             Token.tokens ++;
175         }
176         return new Token.CharToken(Token.CHAR, ch);
177     }
178     static private Token.CharToken createAnchor(int ch) {
179         if (COUNTTOKENS) {
180             Token.tokens ++;
181         }
182         return new Token.CharToken(Token.ANCHOR, ch);
183     }
184     static Token.StringToken createBackReference(int refno) {
185         if (COUNTTOKENS) {
186             Token.tokens ++;
187         }
188         return new Token.StringToken(Token.BACKREFERENCE, null, refno);
189     }
190     static Token.StringToken createString(String str) {
191         if (COUNTTOKENS) {
192             Token.tokens ++;
193         }
194         return new Token.StringToken(Token.STRING, str, 0);
195     }
196     static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
197         if (COUNTTOKENS) {
198             Token.tokens ++;
199         }
200         return new Token.ModifierToken(child, add, mask);
201     }
202     static Token.ConditionToken createCondition(int refno, Token condition,
203                                                 Token yespat, Token nopat) {
204         if (COUNTTOKENS) {
205             Token.tokens ++;
206         }
207         return new Token.ConditionToken(refno, condition, yespat, nopat);
208     }
209
210     protected Token(int type) {
211         this.type = type;
212     }
213
214     /**
215      * A number of children.
216      */
217     int size() {
218         return 0;
219     }
220     Token getChild(int index) {
221         return null;
222     }
223     void addChild(Token tok) {
224         throw new RuntimeException("Not supported.");
225     }
226
227                                                 // for RANGE or NRANGE
228     protected void addRange(int start, int end) {
229         throw new RuntimeException("Not supported.");
230     }
231     protected void sortRanges() {
232         throw new RuntimeException("Not supported.");
233     }
234     protected void compactRanges() {
235         throw new RuntimeException("Not supported.");
236     }
237     protected void mergeRanges(Token tok) {
238         throw new RuntimeException("Not supported.");
239     }
240     protected void subtractRanges(Token tok) {
241         throw new RuntimeException("Not supported.");
242     }
243     protected void intersectRanges(Token tok) {
244         throw new RuntimeException("Not supported.");
245     }
246     static Token complementRanges(Token tok) {
247         return RangeToken.complementRanges(tok);
248     }
249
250
251     void setMin(int min) {                      // for CLOSURE
252     }
253     void setMax(int max) {                      // for CLOSURE
254     }
255     int getMin() {                              // for CLOSURE
256         return -1;
257     }
258     int getMax() {                              // for CLOSURE
259         return -1;
260     }
261     int getReferenceNumber() {                  // for STRING
262         return 0;
263     }
264     String getString() {                        // for STRING
265         return null;
266     }
267
268     int getParenNumber() {
269         return 0;
270     }
271     int getChar() {
272         return -1;
273     }
274
275     @Override
276     public String toString() {
277         return this.toString(0);
278     }
279     public String toString(int options) {
280         return this.type == Token.DOT ? "." : "";
281     }
282
283     /**
284      * How many characters are needed?
285      */
286     final int getMinLength() {
287         switch (this.type) {
288           case CONCAT:
289             int sum = 0;
290             for (int i = 0;  i < this.size();  i ++) {
291                 sum += this.getChild(i).getMinLength();
292             }
293             return sum;
294
295           case CONDITION:
296           case UNION:
297             if (this.size() == 0) {
298                 return 0;
299             }
300             int ret = this.getChild(0).getMinLength();
301             for (int i = 1;  i < this.size();  i ++) {
302                 int min = this.getChild(i).getMinLength();
303                 if (min < ret) {
304                     ret = min;
305                 }
306             }
307             return ret;
308
309           case CLOSURE:
310           case NONGREEDYCLOSURE:
311             if (this.getMin() >= 0) {
312                 return this.getMin() * this.getChild(0).getMinLength();
313             }
314             return 0;
315
316           case EMPTY:
317           case ANCHOR:
318             return 0;
319
320           case DOT:
321           case CHAR:
322           case RANGE:
323           case NRANGE:
324             return 1;
325
326           case INDEPENDENT:
327           case PAREN:
328           case MODIFIERGROUP:
329             return this.getChild(0).getMinLength();
330
331           case BACKREFERENCE:
332             return 0;                           // *******
333
334           case STRING:
335             return this.getString().length();
336
337           case LOOKAHEAD:
338           case NEGATIVELOOKAHEAD:
339           case LOOKBEHIND:
340           case NEGATIVELOOKBEHIND:
341             return 0;                           // ***** Really?
342
343           default:
344             throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
345         }
346     }
347
348     final int getMaxLength() {
349         switch (this.type) {
350           case CONCAT:
351             int sum = 0;
352             for (int i = 0;  i < this.size();  i ++) {
353                 int d = this.getChild(i).getMaxLength();
354                 if (d < 0) {
355                     return -1;
356                 }
357                 sum += d;
358             }
359             return sum;
360
361           case CONDITION:
362           case UNION:
363             if (this.size() == 0) {
364                 return 0;
365             }
366             int ret = this.getChild(0).getMaxLength();
367             for (int i = 1;  ret >= 0 && i < this.size();  i ++) {
368                 int max = this.getChild(i).getMaxLength();
369                 if (max < 0) {                  // infinity
370                     ret = -1;
371                     break;
372                 }
373                 if (max > ret) {
374                     ret = max;
375                 }
376             }
377             return ret;
378
379           case CLOSURE:
380           case NONGREEDYCLOSURE:
381             if (this.getMax() >= 0) {
382                 // When this.child.getMaxLength() < 0,
383                                                 // this returns minus value
384                 return this.getMax() * this.getChild(0).getMaxLength();
385             }
386             return -1;
387
388           case EMPTY:
389           case ANCHOR:
390             return 0;
391
392           case CHAR:
393             return 1;
394           case DOT:
395           case RANGE:
396           case NRANGE:
397             return 2;
398
399           case INDEPENDENT:
400           case PAREN:
401           case MODIFIERGROUP:
402             return this.getChild(0).getMaxLength();
403
404           case BACKREFERENCE:
405             return -1;                          // ******
406
407           case STRING:
408             return this.getString().length();
409
410           case LOOKAHEAD:
411           case NEGATIVELOOKAHEAD:
412           case LOOKBEHIND:
413           case NEGATIVELOOKBEHIND:
414             return 0;                           // ***** Really?
415
416           default:
417             throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
418         }
419     }
420
421     static final int FC_CONTINUE = 0;
422     static final int FC_TERMINAL = 1;
423     static final int FC_ANY = 2;
424     static final boolean isSet(int options, int flag) {
425         return (options & flag) == flag;
426     }
427     final int analyzeFirstCharacter(RangeToken result, int options) {
428         switch (this.type) {
429           case CONCAT:
430             int ret = FC_CONTINUE;
431             for (int i = 0;  i < this.size();  i ++) {
432                 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) {
433                     break;
434                 }
435             }
436             return ret;
437
438           case UNION:
439             if (this.size() == 0) {
440                 return FC_CONTINUE;
441             }
442             /*
443              *  a|b|c -> FC_TERMINAL
444              *  a|.|c -> FC_ANY
445              *  a|b|  -> FC_CONTINUE
446              */
447             int ret2 = FC_CONTINUE;
448             boolean hasEmpty = false;
449             for (int i = 0;  i < this.size();  i ++) {
450                 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
451                 if (ret2 == FC_ANY) {
452                     break;
453                 } else if (ret2 == FC_CONTINUE) {
454                     hasEmpty = true;
455                 }
456             }
457             return hasEmpty ? FC_CONTINUE : ret2;
458
459           case CONDITION:
460             int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
461             if (this.size() == 1) {
462                 return FC_CONTINUE;
463             }
464             if (ret3 == FC_ANY) {
465                 return ret3;
466             }
467             int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
468             if (ret4 == FC_ANY) {
469                 return ret4;
470             }
471             return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
472
473           case CLOSURE:
474           case NONGREEDYCLOSURE:
475             this.getChild(0).analyzeFirstCharacter(result, options);
476             return FC_CONTINUE;
477
478           case EMPTY:
479           case ANCHOR:
480             return FC_CONTINUE;
481
482           case CHAR:
483             int ch = this.getChar();
484             result.addRange(ch, ch);
485             if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
486                 ch = Character.toUpperCase((char)ch);
487                 result.addRange(ch, ch);
488                 ch = Character.toLowerCase((char)ch);
489                 result.addRange(ch, ch);
490             }
491             return FC_TERMINAL;
492
493           case DOT:
494               return FC_ANY;
495
496           case RANGE:
497             result.mergeRanges(this);
498             return FC_TERMINAL;
499
500           case NRANGE:                          // ****
501             result.mergeRanges(Token.complementRanges(this));
502             return FC_TERMINAL;
503
504           case INDEPENDENT:
505           case PAREN:
506             return this.getChild(0).analyzeFirstCharacter(result, options);
507
508           case MODIFIERGROUP:
509             options |= ((ModifierToken)this).getOptions();
510             options &= ~((ModifierToken)this).getOptionsMask();
511             return this.getChild(0).analyzeFirstCharacter(result, options);
512
513           case BACKREFERENCE:
514             result.addRange(0, UTF16_MAX);  // **** We can not optimize.
515             return FC_ANY;
516
517           case STRING:
518             int cha = this.getString().charAt(0);
519             int ch2;
520             if (REUtil.isHighSurrogate(cha)
521                 && this.getString().length() >= 2
522                 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) {
523                 cha = REUtil.composeFromSurrogates(cha, ch2);
524             }
525             result.addRange(cha, cha);
526             if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
527                 cha = Character.toUpperCase((char)cha);
528                 result.addRange(cha, cha);
529                 cha = Character.toLowerCase((char)cha);
530                 result.addRange(cha, cha);
531             }
532             return FC_TERMINAL;
533
534           case LOOKAHEAD:
535           case NEGATIVELOOKAHEAD:
536           case LOOKBEHIND:
537           case NEGATIVELOOKBEHIND:
538             return FC_CONTINUE;
539
540           default:
541             throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
542         }
543     }
544
545     private final boolean isShorterThan(Token tok) {
546         if (tok == null) {
547             return false;
548         }
549         /*
550         int mylength;
551         if (this.type == STRING)  mylength = this.getString().length();
552         else if (this.type == CHAR)  mylength = this.getChar() >= 0x10000 ? 2 : 1;
553         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
554         int otherlength;
555         if (tok.type == STRING)  otherlength = tok.getString().length();
556         else if (tok.type == CHAR)  otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
557         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
558         */
559         int mylength;
560         if (this.type == STRING) {
561             mylength = this.getString().length();
562         } else {
563             throw new RuntimeException("Internal Error: Illegal type: "+this.type);
564         }
565         int otherlength;
566         if (tok.type == STRING) {
567             otherlength = tok.getString().length();
568         } else {
569             throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
570         }
571         return mylength < otherlength;
572     }
573
574     static class FixedStringContainer {
575         Token token = null;
576         int options = 0;
577         FixedStringContainer() {
578         }
579     }
580
581     final void findFixedString(FixedStringContainer container, int options) {
582         switch (this.type) {
583           case CONCAT:
584             Token prevToken = null;
585             int prevOptions = 0;
586             for (int i = 0;  i < this.size();  i ++) {
587                 this.getChild(i).findFixedString(container, options);
588                 if (prevToken == null || prevToken.isShorterThan(container.token)) {
589                     prevToken = container.token;
590                     prevOptions = container.options;
591                 }
592             }
593             container.token = prevToken;
594             container.options = prevOptions;
595             return;
596
597           case UNION:
598           case CLOSURE:
599           case NONGREEDYCLOSURE:
600           case EMPTY:
601           case ANCHOR:
602           case RANGE:
603           case DOT:
604           case NRANGE:
605           case BACKREFERENCE:
606           case LOOKAHEAD:
607           case NEGATIVELOOKAHEAD:
608           case LOOKBEHIND:
609           case NEGATIVELOOKBEHIND:
610           case CONDITION:
611             container.token = null;
612             return;
613
614           case CHAR:                            // Ignore CHAR tokens.
615             container.token = null;             // **
616             return;                             // **
617
618           case STRING:
619             container.token = this;
620             container.options = options;
621             return;
622
623           case INDEPENDENT:
624           case PAREN:
625             this.getChild(0).findFixedString(container, options);
626             return;
627
628           case MODIFIERGROUP:
629             options |= ((ModifierToken)this).getOptions();
630             options &= ~((ModifierToken)this).getOptionsMask();
631             this.getChild(0).findFixedString(container, options);
632             return;
633
634           default:
635             throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
636         }
637     }
638
639     boolean match(int ch) {
640         throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
641     }
642
643     // ------------------------------------------------------
644     private final static Hashtable<String, Token> categories = new Hashtable<>();
645     private final static Hashtable<String, Token> categories2 = new Hashtable<>();
646     private static final String[] categoryNames = {
647         "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
648         "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
649         "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
650         "Pi", "Pf",  // 29, 30
651         "L", "M", "N", "Z", "C", "P", "S",      // 31-37
652     };
653
654     // Schema Rec. {Datatypes} - Punctuation
655     static final int CHAR_INIT_QUOTE  = 29;     // Pi - initial quote
656     static final int CHAR_FINAL_QUOTE = 30;     // Pf - final quote
657     static final int CHAR_LETTER = 31;
658     static final int CHAR_MARK = 32;
659     static final int CHAR_NUMBER = 33;
660     static final int CHAR_SEPARATOR = 34;
661     static final int CHAR_OTHER = 35;
662     static final int CHAR_PUNCTUATION = 36;
663     static final int CHAR_SYMBOL = 37;
664
665     //blockNames in UNICODE 3.1 that supported by XML Schema REC
666     private static final String[] blockNames = {
667         /*0000..007F;*/ "Basic Latin",
668         /*0080..00FF;*/ "Latin-1 Supplement",
669         /*0100..017F;*/ "Latin Extended-A",
670         /*0180..024F;*/ "Latin Extended-B",
671         /*0250..02AF;*/ "IPA Extensions",
672         /*02B0..02FF;*/ "Spacing Modifier Letters",
673         /*0300..036F;*/ "Combining Diacritical Marks",
674         /*0370..03FF;*/ "Greek",
675         /*0400..04FF;*/ "Cyrillic",
676         /*0530..058F;*/ "Armenian",
677         /*0590..05FF;*/ "Hebrew",
678         /*0600..06FF;*/ "Arabic",
679         /*0700..074F;*/ "Syriac",
680         /*0780..07BF;*/ "Thaana",
681         /*0900..097F;*/ "Devanagari",
682         /*0980..09FF;*/ "Bengali",
683         /*0A00..0A7F;*/ "Gurmukhi",
684         /*0A80..0AFF;*/ "Gujarati",
685         /*0B00..0B7F;*/ "Oriya",
686         /*0B80..0BFF;*/ "Tamil",
687         /*0C00..0C7F;*/ "Telugu",
688         /*0C80..0CFF;*/ "Kannada",
689         /*0D00..0D7F;*/ "Malayalam",
690         /*0D80..0DFF;*/ "Sinhala",
691         /*0E00..0E7F;*/ "Thai",
692         /*0E80..0EFF;*/ "Lao",
693         /*0F00..0FFF;*/ "Tibetan",
694         /*1000..109F;*/ "Myanmar",
695         /*10A0..10FF;*/ "Georgian",
696         /*1100..11FF;*/ "Hangul Jamo",
697         /*1200..137F;*/ "Ethiopic",
698         /*13A0..13FF;*/ "Cherokee",
699         /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
700         /*1680..169F;*/ "Ogham",
701         /*16A0..16FF;*/ "Runic",
702         /*1780..17FF;*/ "Khmer",
703         /*1800..18AF;*/ "Mongolian",
704         /*1E00..1EFF;*/ "Latin Extended Additional",
705         /*1F00..1FFF;*/ "Greek Extended",
706         /*2000..206F;*/ "General Punctuation",
707         /*2070..209F;*/ "Superscripts and Subscripts",
708         /*20A0..20CF;*/ "Currency Symbols",
709         /*20D0..20FF;*/ "Combining Marks for Symbols",
710         /*2100..214F;*/ "Letterlike Symbols",
711         /*2150..218F;*/ "Number Forms",
712         /*2190..21FF;*/ "Arrows",
713         /*2200..22FF;*/ "Mathematical Operators",
714         /*2300..23FF;*/ "Miscellaneous Technical",
715         /*2400..243F;*/ "Control Pictures",
716         /*2440..245F;*/ "Optical Character Recognition",
717         /*2460..24FF;*/ "Enclosed Alphanumerics",
718         /*2500..257F;*/ "Box Drawing",
719         /*2580..259F;*/ "Block Elements",
720         /*25A0..25FF;*/ "Geometric Shapes",
721         /*2600..26FF;*/ "Miscellaneous Symbols",
722         /*2700..27BF;*/ "Dingbats",
723         /*2800..28FF;*/ "Braille Patterns",
724         /*2E80..2EFF;*/ "CJK Radicals Supplement",
725         /*2F00..2FDF;*/ "Kangxi Radicals",
726         /*2FF0..2FFF;*/ "Ideographic Description Characters",
727         /*3000..303F;*/ "CJK Symbols and Punctuation",
728         /*3040..309F;*/ "Hiragana",
729         /*30A0..30FF;*/ "Katakana",
730         /*3100..312F;*/ "Bopomofo",
731         /*3130..318F;*/ "Hangul Compatibility Jamo",
732         /*3190..319F;*/ "Kanbun",
733         /*31A0..31BF;*/ "Bopomofo Extended",
734         /*3200..32FF;*/ "Enclosed CJK Letters and Months",
735         /*3300..33FF;*/ "CJK Compatibility",
736         /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
737         /*4E00..9FFF;*/ "CJK Unified Ideographs",
738         /*A000..A48F;*/ "Yi Syllables",
739         /*A490..A4CF;*/ "Yi Radicals",
740         /*AC00..D7A3;*/ "Hangul Syllables",
741         /*E000..F8FF;*/ "Private Use",
742         /*F900..FAFF;*/ "CJK Compatibility Ideographs",
743         /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
744         /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
745         /*FE20..FE2F;*/ "Combining Half Marks",
746         /*FE30..FE4F;*/ "CJK Compatibility Forms",
747         /*FE50..FE6F;*/ "Small Form Variants",
748         /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
749         /*FEFF..FEFF;*/ "Specials",
750         /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
751          //missing Specials add manually
752         /*10300..1032F;*/ "Old Italic",         // 84
753         /*10330..1034F;*/ "Gothic",
754         /*10400..1044F;*/ "Deseret",
755         /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
756         /*1D100..1D1FF;*/ "Musical Symbols",
757         /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
758         /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
759         /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
760         /*E0000..E007F;*/ "Tags",
761         //missing 2 private use add manually
762
763     };
764     //ADD THOSE MANUALLY
765     //F0000..FFFFD; "Private Use",
766     //100000..10FFFD; "Private Use"
767     //FFF0..FFFD; "Specials",
768     static final String blockRanges =
769        "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
770         +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
771         +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
772         +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
773         +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
774         +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
775         +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
776         +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
777         +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
778         +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
779         +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
780     static final int[] nonBMPBlockRanges = {
781         0x10300, 0x1032F,       // 84
782         0x10330, 0x1034F,
783         0x10400, 0x1044F,
784         0x1D000, 0x1D0FF,
785         0x1D100, 0x1D1FF,
786         0x1D400, 0x1D7FF,
787         0x20000, 0x2A6D6,
788         0x2F800, 0x2FA1F,
789         0xE0000, 0xE007F
790     };
791     private static final int NONBMP_BLOCK_START = 84;
792
793     static protected RangeToken getRange(String name, boolean positive) {
794         if (Token.categories.size() == 0) {
795             synchronized (Token.categories) {
796                 Token[] ranges = new Token[Token.categoryNames.length];
797                 for (int i = 0;  i < ranges.length;  i ++) {
798                     ranges[i] = Token.createRange();
799                 }
800                 int type;
801                 for (int i = 0;  i < 0x10000;  i ++) {
802                     type = Character.getType((char)i);
803                     if (type == Character.START_PUNCTUATION ||
804                         type == Character.END_PUNCTUATION) {
805                         //build table of Pi values
806                         if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
807                             i == 0x201F || i == 0x2039) {
808                             type = CHAR_INIT_QUOTE;
809                         }
810                         //build table of Pf values
811                         if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
812                             type = CHAR_FINAL_QUOTE;
813                         }
814                     }
815                     ranges[type].addRange(i, i);
816                     switch (type) {
817                       case Character.UPPERCASE_LETTER:
818                       case Character.LOWERCASE_LETTER:
819                       case Character.TITLECASE_LETTER:
820                       case Character.MODIFIER_LETTER:
821                       case Character.OTHER_LETTER:
822                         type = CHAR_LETTER;
823                         break;
824                       case Character.NON_SPACING_MARK:
825                       case Character.COMBINING_SPACING_MARK:
826                       case Character.ENCLOSING_MARK:
827                         type = CHAR_MARK;
828                         break;
829                       case Character.DECIMAL_DIGIT_NUMBER:
830                       case Character.LETTER_NUMBER:
831                       case Character.OTHER_NUMBER:
832                         type = CHAR_NUMBER;
833                         break;
834                       case Character.SPACE_SEPARATOR:
835                       case Character.LINE_SEPARATOR:
836                       case Character.PARAGRAPH_SEPARATOR:
837                         type = CHAR_SEPARATOR;
838                         break;
839                       case Character.CONTROL:
840                       case Character.FORMAT:
841                       case Character.SURROGATE:
842                       case Character.PRIVATE_USE:
843                       case Character.UNASSIGNED:
844                         type = CHAR_OTHER;
845                         break;
846                       case Character.CONNECTOR_PUNCTUATION:
847                       case Character.DASH_PUNCTUATION:
848                       case Character.START_PUNCTUATION:
849                       case Character.END_PUNCTUATION:
850                       case CHAR_INIT_QUOTE:
851                       case CHAR_FINAL_QUOTE:
852                       case Character.OTHER_PUNCTUATION:
853                         type = CHAR_PUNCTUATION;
854                         break;
855                       case Character.MATH_SYMBOL:
856                       case Character.CURRENCY_SYMBOL:
857                       case Character.MODIFIER_SYMBOL:
858                       case Character.OTHER_SYMBOL:
859                         type = CHAR_SYMBOL;
860                         break;
861                       default:
862                         throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
863                     }
864                     ranges[type].addRange(i, i);
865                 } // for all characters
866                 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
867                 ranges[CHAR_OTHER].addRange(0x10000, Token.UTF16_MAX);
868
869                 for (int i = 0;  i < ranges.length;  i ++) {
870                     if (Token.categoryNames[i] != null) {
871                         if (i == Character.UNASSIGNED) { // Unassigned
872                             ranges[i].addRange(0x10000, Token.UTF16_MAX);
873                         }
874                         Token.categories.put(Token.categoryNames[i], ranges[i]);
875                         Token.categories2.put(Token.categoryNames[i],
876                                               Token.complementRanges(ranges[i]));
877                     }
878                 }
879                 //REVISIT: do we really need to support block names as in Unicode 3.1
880                 //         or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
881                 //
882                 StringBuffer buffer = new StringBuffer(50);
883                 for (int i = 0;  i < Token.blockNames.length;  i ++) {
884                     Token r1 = Token.createRange();
885                     int location;
886                     if (i < NONBMP_BLOCK_START) {
887                         location = i*2;
888                         int rstart = Token.blockRanges.charAt(location);
889                         int rend = Token.blockRanges.charAt(location+1);
890                         //DEBUGING
891                         //System.out.println(n+" " +Integer.toHexString(rstart)
892                         //                     +"-"+ Integer.toHexString(rend));
893                         r1.addRange(rstart, rend);
894                     } else {
895                         location = (i - NONBMP_BLOCK_START) * 2;
896                         r1.addRange(Token.nonBMPBlockRanges[location],
897                                     Token.nonBMPBlockRanges[location + 1]);
898                     }
899                     String n = Token.blockNames[i];
900                     if (n.equals("Specials")) {
901                         r1.addRange(0xfff0, 0xfffd);
902                     }
903                     if (n.equals("Private Use")) {
904                         r1.addRange(0xF0000,0xFFFFD);
905                         r1.addRange(0x100000,0x10FFFD);
906                     }
907                     Token.categories.put(n, r1);
908                     Token.categories2.put(n, Token.complementRanges(r1));
909                     buffer.setLength(0);
910                     buffer.append("Is");
911                     if (n.indexOf(' ') >= 0) {
912                         for (int ci = 0;  ci < n.length();  ci ++) {
913                             if (n.charAt(ci) != ' ') {
914                                 buffer.append(n.charAt(ci));
915                             }
916                         }
917                     }
918                     else {
919                         buffer.append(n);
920                     }
921                     Token.setAlias(buffer.toString(), n, true);
922                 }
923
924                 // TR#18 1.2
925                 Token.setAlias("ASSIGNED", "Cn", false);
926                 Token.setAlias("UNASSIGNED", "Cn", true);
927                 Token all = Token.createRange();
928                 all.addRange(0, Token.UTF16_MAX);
929                 Token.categories.put("ALL", all);
930                 Token.categories2.put("ALL", Token.complementRanges(all));
931                 Token.registerNonXS("ASSIGNED");
932                 Token.registerNonXS("UNASSIGNED");
933                 Token.registerNonXS("ALL");
934
935                 Token isalpha = Token.createRange();
936                 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
937                 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
938                 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
939                 Token.categories.put("IsAlpha", isalpha);
940                 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
941                 Token.registerNonXS("IsAlpha");
942
943                 Token isalnum = Token.createRange();
944                 isalnum.mergeRanges(isalpha);   // Lu Ll Lo
945                 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
946                 Token.categories.put("IsAlnum", isalnum);
947                 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
948                 Token.registerNonXS("IsAlnum");
949
950                 Token isspace = Token.createRange();
951                 isspace.mergeRanges(Token.token_spaces);
952                 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
953                 Token.categories.put("IsSpace", isspace);
954                 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
955                 Token.registerNonXS("IsSpace");
956
957                 Token isword = Token.createRange();
958                 isword.mergeRanges(isalnum);     // Lu Ll Lo Nd
959                 isword.addRange('_', '_');
960                 Token.categories.put("IsWord", isword);
961                 Token.categories2.put("IsWord", Token.complementRanges(isword));
962                 Token.registerNonXS("IsWord");
963
964                 Token isascii = Token.createRange();
965                 isascii.addRange(0, 127);
966                 Token.categories.put("IsASCII", isascii);
967                 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
968                 Token.registerNonXS("IsASCII");
969
970                 Token isnotgraph = Token.createRange();
971                 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
972                 isnotgraph.addRange(' ', ' ');
973                 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
974                 Token.categories2.put("IsGraph", isnotgraph);
975                 Token.registerNonXS("IsGraph");
976
977                 Token isxdigit = Token.createRange();
978                 isxdigit.addRange('0', '9');
979                 isxdigit.addRange('A', 'F');
980                 isxdigit.addRange('a', 'f');
981                 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
982                 Token.categories2.put("IsXDigit", isxdigit);
983                 Token.registerNonXS("IsXDigit");
984
985                 Token.setAlias("IsDigit", "Nd", true);
986                 Token.setAlias("IsUpper", "Lu", true);
987                 Token.setAlias("IsLower", "Ll", true);
988                 Token.setAlias("IsCntrl", "C", true);
989                 Token.setAlias("IsPrint", "C", false);
990                 Token.setAlias("IsPunct", "P", true);
991                 Token.registerNonXS("IsDigit");
992                 Token.registerNonXS("IsUpper");
993                 Token.registerNonXS("IsLower");
994                 Token.registerNonXS("IsCntrl");
995                 Token.registerNonXS("IsPrint");
996                 Token.registerNonXS("IsPunct");
997
998                 Token.setAlias("alpha", "IsAlpha", true);
999                 Token.setAlias("alnum", "IsAlnum", true);
1000                 Token.setAlias("ascii", "IsASCII", true);
1001                 Token.setAlias("cntrl", "IsCntrl", true);
1002                 Token.setAlias("digit", "IsDigit", true);
1003                 Token.setAlias("graph", "IsGraph", true);
1004                 Token.setAlias("lower", "IsLower", true);
1005                 Token.setAlias("print", "IsPrint", true);
1006                 Token.setAlias("punct", "IsPunct", true);
1007                 Token.setAlias("space", "IsSpace", true);
1008                 Token.setAlias("upper", "IsUpper", true);
1009                 Token.setAlias("word", "IsWord", true); // Perl extension
1010                 Token.setAlias("xdigit", "IsXDigit", true);
1011                 Token.registerNonXS("alpha");
1012                 Token.registerNonXS("alnum");
1013                 Token.registerNonXS("ascii");
1014                 Token.registerNonXS("cntrl");
1015                 Token.registerNonXS("digit");
1016                 Token.registerNonXS("graph");
1017                 Token.registerNonXS("lower");
1018                 Token.registerNonXS("print");
1019                 Token.registerNonXS("punct");
1020                 Token.registerNonXS("space");
1021                 Token.registerNonXS("upper");
1022                 Token.registerNonXS("word");
1023                 Token.registerNonXS("xdigit");
1024             } // synchronized
1025         } // if null
1026         RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
1027             : (RangeToken)Token.categories2.get(name);
1028         //if (tok == null) System.out.println(name);
1029         return tok;
1030     }
1031     static protected RangeToken getRange(String name, boolean positive, boolean xs) {
1032         RangeToken range = Token.getRange(name, positive);
1033         if (xs && range != null && Token.isRegisterNonXS(name)) {
1034             range = null;
1035         }
1036         return range;
1037     }
1038
1039     static Hashtable<String, String> nonxs = null;
1040     /**
1041      * This method is called by only getRange().
1042      * So this method need not MT-safe.
1043      */
1044     static protected void registerNonXS(String name) {
1045         if (Token.nonxs == null) {
1046             Token.nonxs = new Hashtable<>();
1047         }
1048         Token.nonxs.put(name, name);
1049     }
1050     static protected boolean isRegisterNonXS(String name) {
1051         if (Token.nonxs == null) {
1052             return false;
1053         }
1054         //DEBUG
1055         //System.err.println("isRegisterNonXS: "+name);
1056         return Token.nonxs.containsKey(name);
1057     }
1058
1059     private static void setAlias(String newName, String name, boolean positive) {
1060         Token t1 = Token.categories.get(name);
1061         Token t2 = Token.categories2.get(name);
1062         if (positive) {
1063             Token.categories.put(newName, t1);
1064             Token.categories2.put(newName, t2);
1065         } else {
1066             Token.categories2.put(newName, t1);
1067             Token.categories.put(newName, t2);
1068         }
1069     }
1070
1071     // ------------------------------------------------------
1072
1073     static final String viramaString =
1074     "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1075     +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1076     +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1077     +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1078     +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1079     +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1080     +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1081     +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1082     +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1083     +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1084     +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1085
1086     static private Token token_grapheme = null;
1087     static synchronized Token getGraphemePattern() {
1088         if (Token.token_grapheme != null) {
1089             return Token.token_grapheme;
1090         }
1091
1092         Token base_char = Token.createRange();  // [{ASSIGNED}]-[{M},{C}]
1093         base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1094         base_char.subtractRanges(Token.getRange("M", true));
1095         base_char.subtractRanges(Token.getRange("C", true));
1096
1097         Token virama = Token.createRange();
1098         for (int i = 0;  i < Token.viramaString.length(); i++) {
1099             virama.addRange(i, i);
1100         }
1101
1102         Token combiner_wo_virama = Token.createRange();
1103         combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1104         combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1105         combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1106
1107         Token left = Token.createUnion();       // base_char?
1108         left.addChild(base_char);
1109         left.addChild(Token.token_empty);
1110
1111         Token foo = Token.createUnion();
1112         foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1113         foo.addChild(combiner_wo_virama);
1114
1115         foo = Token.createClosure(foo);
1116
1117         foo = Token.createConcat(left, foo);
1118
1119         Token.token_grapheme = foo;
1120         return Token.token_grapheme;
1121     }
1122
1123     /**
1124      * Combing Character Sequence in Perl 5.6.
1125      */
1126     static private Token token_ccs = null;
1127     static synchronized Token getCombiningCharacterSequence() {
1128         if (Token.token_ccs != null) {
1129             return Token.token_ccs;
1130         }
1131
1132         Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1133         foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1134         Token.token_ccs = foo;
1135         return Token.token_ccs;
1136     }
1137
1138     // ------------------------------------------------------
1139
1140     // ------------------------------------------------------
1141     /**
1142      * This class represents a node in parse tree.
1143      */
1144     static class StringToken extends Token implements java.io.Serializable {
1145
1146         private static final long serialVersionUID = -4614366944218504172L;
1147
1148         String string;
1149         final int refNumber;
1150
1151         StringToken(int type, String str, int n) {
1152             super(type);
1153             this.string = str;
1154             this.refNumber = n;
1155         }
1156
1157         @Override
1158         int getReferenceNumber() {              // for STRING
1159             return this.refNumber;
1160         }
1161         @Override
1162         String getString() {                    // for STRING
1163             return this.string;
1164         }
1165
1166         @Override
1167         public String toString(int options) {
1168             if (this.type == BACKREFERENCE) {
1169                 return "\\"+this.refNumber;
1170             } else {
1171                 return REUtil.quoteMeta(this.string);
1172             }
1173         }
1174     }
1175
1176     /**
1177      * This class represents a node in parse tree.
1178      */
1179     static class ConcatToken extends Token implements java.io.Serializable {
1180
1181         private static final long serialVersionUID = 8717321425541346381L;
1182
1183         final Token child;
1184         final Token child2;
1185
1186         ConcatToken(Token t1, Token t2) {
1187             super(Token.CONCAT);
1188             this.child = t1;
1189             this.child2 = t2;
1190         }
1191
1192         @Override
1193         int size() {
1194             return 2;
1195         }
1196         @Override
1197         Token getChild(int index) {
1198             return index == 0 ? this.child : this.child2;
1199         }
1200
1201         @Override
1202         public String toString(int options) {
1203             String ret;
1204             if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1205                 ret = this.child.toString(options)+"+";
1206             } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1207                 ret = this.child.toString(options)+"+?";
1208             } else {
1209                 ret = this.child.toString(options)+this.child2.toString(options);
1210             }
1211             return ret;
1212         }
1213     }
1214
1215     /**
1216      * This class represents a node in parse tree.
1217      */
1218     static class CharToken extends Token implements java.io.Serializable {
1219
1220         private static final long serialVersionUID = -4394272816279496989L;
1221
1222         final int chardata;
1223
1224         CharToken(int type, int ch) {
1225             super(type);
1226             this.chardata = ch;
1227         }
1228
1229         @Override
1230         int getChar() {
1231             return this.chardata;
1232         }
1233
1234         @Override
1235         public String toString(int options) {
1236             String ret;
1237             switch (this.type) {
1238               case CHAR:
1239                 switch (this.chardata) {
1240                   case '|':  case '*':  case '+':  case '?':
1241                   case '(':  case ')':  case '.':  case '[':
1242                   case '{':  case '\\':
1243                     ret = "\\"+(char)this.chardata;
1244                     break;
1245                   case '\f':  ret = "\\f";  break;
1246                   case '\n':  ret = "\\n";  break;
1247                   case '\r':  ret = "\\r";  break;
1248                   case '\t':  ret = "\\t";  break;
1249                   case 0x1b:  ret = "\\e";  break;
1250                     //case 0x0b:  ret = "\\v";  break;
1251                   default:
1252                     if (this.chardata >= 0x10000) {
1253                         String pre = "0"+Integer.toHexString(this.chardata);
1254                         ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1255                     } else {
1256                         ret = ""+(char)this.chardata;
1257                     }
1258                 }
1259                 break;
1260
1261               case ANCHOR:
1262                 if (this == Token.token_linebeginning || this == Token.token_lineend) {
1263                     ret = ""+(char)this.chardata;
1264                 } else {
1265                     ret = "\\"+(char)this.chardata;
1266                 }
1267                 break;
1268
1269               default:
1270                 ret = null;
1271             }
1272
1273             if (isSet(options, RegularExpression.XMLSCHEMA_MODE)) {
1274                 if ("$".equals(ret)) {
1275                     return "\\$";
1276                 }
1277                 if ("^".equals(ret)) {
1278                     return "\\^";
1279                 }
1280             }
1281
1282             return ret;
1283         }
1284
1285         @Override
1286         boolean match(int ch) {
1287             if (this.type == CHAR) {
1288                 return ch == this.chardata;
1289             } else {
1290                 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
1291             }
1292         }
1293     }
1294
1295     /**
1296      * This class represents a node in parse tree.
1297      */
1298     static class ClosureToken extends Token implements java.io.Serializable {
1299
1300         private static final long serialVersionUID = 1308971930673997452L;
1301
1302         int min;
1303         int max;
1304         final Token child;
1305
1306         ClosureToken(int type, Token tok) {
1307             super(type);
1308             this.child = tok;
1309             this.setMin(-1);
1310             this.setMax(-1);
1311         }
1312
1313         @Override
1314         int size() {
1315             return 1;
1316         }
1317         @Override
1318         Token getChild(int index) {
1319             return this.child;
1320         }
1321
1322         @Override
1323         final void setMin(int min) {
1324             this.min = min;
1325         }
1326         @Override
1327         final void setMax(int max) {
1328             this.max = max;
1329         }
1330         @Override
1331         final int getMin() {
1332             return this.min;
1333         }
1334         @Override
1335         final int getMax() {
1336             return this.max;
1337         }
1338
1339         @Override
1340         public String toString(int options) {
1341             String ret;
1342             if (this.type == CLOSURE) {
1343                 if (this.getMin() < 0 && this.getMax() < 0) {
1344                     ret = this.child.toString(options)+"*";
1345                 } else if (this.getMin() == this.getMax()) {
1346                     ret = this.child.toString(options)+"{"+this.getMin()+"}";
1347                 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1348                     ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1349                 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1350                     ret = this.child.toString(options)+"{"+this.getMin()+",}";
1351                 } else {
1352                     throw new RuntimeException("Token#toString(): CLOSURE "
1353                                                +this.getMin()+", "+this.getMax());
1354                 }
1355             } else {
1356                 if (this.getMin() < 0 && this.getMax() < 0) {
1357                     ret = this.child.toString(options)+"*?";
1358                 } else if (this.getMin() == this.getMax()) {
1359                     ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1360                 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1361                     ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1362                 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1363                     ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1364                 } else {
1365                     throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
1366                                                +this.getMin()+", "+this.getMax());
1367                 }
1368             }
1369             return ret;
1370         }
1371     }
1372
1373     /**
1374      * This class represents a node in parse tree.
1375      */
1376     static class ParenToken extends Token implements java.io.Serializable {
1377
1378         private static final long serialVersionUID = -5938014719827987704L;
1379
1380         final Token child;
1381         final int parennumber;
1382
1383         ParenToken(int type, Token tok, int paren) {
1384             super(type);
1385             this.child = tok;
1386             this.parennumber = paren;
1387         }
1388
1389         @Override
1390         int size() {
1391             return 1;
1392         }
1393         @Override
1394         Token getChild(int index) {
1395             return this.child;
1396         }
1397
1398         @Override
1399         int getParenNumber() {
1400             return this.parennumber;
1401         }
1402
1403         @Override
1404         public String toString(int options) {
1405             String ret = null;
1406             switch (this.type) {
1407               case PAREN:
1408                 if (this.parennumber == 0) {
1409                     ret = "(?:"+this.child.toString(options)+")";
1410                 } else {
1411                     ret = "("+this.child.toString(options)+")";
1412                 }
1413                 break;
1414
1415               case LOOKAHEAD:
1416                 ret = "(?="+this.child.toString(options)+")";
1417                 break;
1418               case NEGATIVELOOKAHEAD:
1419                 ret = "(?!"+this.child.toString(options)+")";
1420                 break;
1421               case LOOKBEHIND:
1422                 ret = "(?<="+this.child.toString(options)+")";
1423                 break;
1424               case NEGATIVELOOKBEHIND:
1425                 ret = "(?<!"+this.child.toString(options)+")";
1426                 break;
1427               case INDEPENDENT:
1428                 ret = "(?>"+this.child.toString(options)+")";
1429                 break;
1430             }
1431             return ret;
1432         }
1433     }
1434
1435     /**
1436      * (?(condition)yes-pattern|no-pattern)
1437      */
1438     static class ConditionToken extends Token implements java.io.Serializable {
1439
1440         private static final long serialVersionUID = 4353765277910594411L;
1441
1442         final int refNumber;
1443         final Token condition;
1444         final Token yes;
1445         final Token no;
1446         ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1447             super(Token.CONDITION);
1448             this.refNumber = refno;
1449             this.condition = cond;
1450             this.yes = yespat;
1451             this.no = nopat;
1452         }
1453         @Override
1454         int size() {
1455             return this.no == null ? 1 : 2;
1456         }
1457         @Override
1458         Token getChild(int index) {
1459             if (index == 0) {
1460                 return this.yes;
1461             }
1462             if (index == 1) {
1463                 return this.no;
1464             }
1465             throw new RuntimeException("Internal Error: "+index);
1466         }
1467
1468         @Override
1469         public String toString(int options) {
1470             String ret;
1471             if (refNumber > 0) {
1472                 ret = "(?("+refNumber+")";
1473             } else if (this.condition.type == Token.ANCHOR) {
1474                 ret = "(?("+this.condition+")";
1475             } else {
1476                 ret = "(?"+this.condition;
1477             }
1478
1479             if (this.no == null) {
1480                 ret += this.yes+")";
1481             } else {
1482                 ret += this.yes+"|"+this.no+")";
1483             }
1484             return ret;
1485         }
1486     }
1487
1488     /**
1489      * (ims-ims: .... )
1490      */
1491     static class ModifierToken extends Token implements java.io.Serializable {
1492
1493         private static final long serialVersionUID = -9114536559696480356L;
1494
1495         final Token child;
1496         final int add;
1497         final int mask;
1498
1499         ModifierToken(Token tok, int add, int mask) {
1500             super(Token.MODIFIERGROUP);
1501             this.child = tok;
1502             this.add = add;
1503             this.mask = mask;
1504         }
1505
1506         @Override
1507         int size() {
1508             return 1;
1509         }
1510         @Override
1511         Token getChild(int index) {
1512             return this.child;
1513         }
1514
1515         int getOptions() {
1516             return this.add;
1517         }
1518         int getOptionsMask() {
1519             return this.mask;
1520         }
1521
1522         @Override
1523         public String toString(int options) {
1524             return "(?"
1525                 +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1526                 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1527                 +":"
1528                 +this.child.toString(options)
1529                 +")";
1530         }
1531     }
1532
1533     /**
1534      * This class represents a node in parse tree.
1535      * for UNION or CONCAT.
1536      */
1537     static class UnionToken extends Token implements java.io.Serializable {
1538
1539         private static final long serialVersionUID = -2568843945989489861L;
1540
1541         Vector<Token> children;
1542
1543         UnionToken(int type) {
1544             super(type);
1545         }
1546
1547         @Override
1548         void addChild(Token tok) {
1549             if (tok == null) {
1550                 return;
1551             }
1552             if (this.children == null) {
1553                 this.children = new Vector<>();
1554             }
1555             if (this.type == UNION) {
1556                 this.children.addElement(tok);
1557                 return;
1558             }
1559                                                 // This is CONCAT, and new child is CONCAT.
1560             if (tok.type == CONCAT) {
1561                 for (int i = 0;  i < tok.size();  i ++)
1562                  {
1563                     this.addChild(tok.getChild(i)); // Recursion
1564                 }
1565                 return;
1566             }
1567             int size = this.children.size();
1568             if (size == 0) {
1569                 this.children.addElement(tok);
1570                 return;
1571             }
1572             Token previous = this.children.elementAt(size-1);
1573             if (!((previous.type == CHAR || previous.type == STRING)
1574                   && (tok.type == CHAR || tok.type == STRING))) {
1575                 this.children.addElement(tok);
1576                 return;
1577             }
1578
1579             //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1580
1581             StringBuffer buffer;
1582             int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1583             if (previous.type == CHAR) {        // Replace previous token by STRING
1584                 buffer = new StringBuffer(2 + nextMaxLength);
1585                 int ch = previous.getChar();
1586                 if (ch >= 0x10000) {
1587                     buffer.append(REUtil.decomposeToSurrogates(ch));
1588                 } else {
1589                     buffer.append((char)ch);
1590                 }
1591                 previous = Token.createString(null);
1592                 this.children.setElementAt(previous, size-1);
1593             } else {                            // STRING
1594                 buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
1595                 buffer.append(previous.getString());
1596             }
1597
1598             if (tok.type == CHAR) {
1599                 int ch = tok.getChar();
1600                 if (ch >= 0x10000) {
1601                     buffer.append(REUtil.decomposeToSurrogates(ch));
1602                 } else {
1603                     buffer.append((char)ch);
1604                 }
1605             } else {
1606                 buffer.append(tok.getString());
1607             }
1608
1609             ((StringToken)previous).string = new String(buffer);
1610         }
1611
1612         @Override
1613         int size() {
1614             return this.children == null ? 0 : this.children.size();
1615         }
1616         @Override
1617         Token getChild(int index) {
1618             return this.children.elementAt(index);
1619         }
1620
1621         @Override
1622         public String toString(int options) {
1623             String ret;
1624             if (this.type == CONCAT) {
1625                 if (this.children.size() == 2) {
1626                     Token ch = this.getChild(0);
1627                     Token ch2 = this.getChild(1);
1628                     if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1629                         ret = ch.toString(options)+"+";
1630                     } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1631                         ret = ch.toString(options)+"+?";
1632                     } else {
1633                         ret = ch.toString(options)+ch2.toString(options);
1634                     }
1635                 } else {
1636                     StringBuffer sb = new StringBuffer();
1637                     for (int i = 0;  i < this.children.size();  i ++) {
1638                         sb.append(this.children.elementAt(i).toString(options));
1639                     }
1640                     ret = new String(sb);
1641                 }
1642                 return ret;
1643             }
1644             if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1645                 ret = this.getChild(0).toString(options)+"?";
1646             } else if (this.children.size() == 2
1647                        && this.getChild(0).type == EMPTY) {
1648                 ret = this.getChild(1).toString(options)+"??";
1649             } else {
1650                 StringBuffer sb = new StringBuffer();
1651                 sb.append(this.children.elementAt(0).toString(options));
1652                 for (int i = 1;  i < this.children.size();  i ++) {
1653                     sb.append('|');
1654                     sb.append(this.children.elementAt(i).toString(options));
1655                 }
1656                 ret = new String(sb);
1657             }
1658             return ret;
1659         }
1660     }
1661 }