2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.opendaylight.yangtools.xsd.regex;
20 import java.util.Hashtable;
21 import java.util.Vector;
24 * This class represents a node in parse tree.
28 * @version $Id: Token.java 1638344 2014-11-11 20:15:46Z mrglavas $
30 class Token implements java.io.Serializable {
32 private static final long serialVersionUID = 8484976002585487481L;
34 static final boolean COUNTTOKENS = true;
35 static int tokens = 0;
37 static final int CHAR = 0; // Literal char
38 static final int DOT = 11; // .
39 static final int CONCAT = 1; // XY
40 static final int UNION = 2; // X|Y|Z
41 static final int CLOSURE = 3; // X*
42 static final int RANGE = 4; // [a-zA-Z] etc.
43 static final int NRANGE = 5; // [^a-zA-Z] etc.
44 static final int PAREN = 6; // (X) or (?:X)
45 static final int EMPTY = 7; //
46 static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
47 static final int NONGREEDYCLOSURE = 9; // *? +?
48 static final int STRING = 10; // strings
49 static final int BACKREFERENCE = 12; // back references
50 static final int LOOKAHEAD = 20; // (?=...)
51 static final int NEGATIVELOOKAHEAD = 21; // (?!...)
52 static final int LOOKBEHIND = 22; // (?<=...)
53 static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
54 static final int INDEPENDENT = 24; // (?>...)
55 static final int MODIFIERGROUP = 25; // (?ims-ims:...)
56 static final int CONDITION = 26; // (?(...)yes|no)
58 static final int UTF16_MAX = 0x10ffff;
62 static Token token_dot;
63 static Token token_0to9;
64 static Token token_wordchars;
65 static Token token_not_0to9;
66 static Token token_not_wordchars;
67 static Token token_spaces;
68 static Token token_not_spaces;
69 static Token token_empty;
70 static Token token_linebeginning;
71 static Token token_linebeginning2;
72 static Token token_lineend;
73 static Token token_stringbeginning;
74 static Token token_stringend;
75 static Token token_stringend2;
76 static Token token_wordedge;
77 static Token token_not_wordedge;
78 static Token token_wordbeginning;
79 static Token token_wordend;
81 Token.token_empty = new Token(Token.EMPTY);
83 Token.token_linebeginning = Token.createAnchor('^');
84 Token.token_linebeginning2 = Token.createAnchor('@');
85 Token.token_lineend = Token.createAnchor('$');
86 Token.token_stringbeginning = Token.createAnchor('A');
87 Token.token_stringend = Token.createAnchor('z');
88 Token.token_stringend2 = Token.createAnchor('Z');
89 Token.token_wordedge = Token.createAnchor('b');
90 Token.token_not_wordedge = Token.createAnchor('B');
91 Token.token_wordbeginning = Token.createAnchor('<');
92 Token.token_wordend = Token.createAnchor('>');
94 Token.token_dot = new Token(Token.DOT);
96 Token.token_0to9 = Token.createRange();
97 Token.token_0to9.addRange('0', '9');
98 Token.token_wordchars = Token.createRange();
99 Token.token_wordchars.addRange('0', '9');
100 Token.token_wordchars.addRange('A', 'Z');
101 Token.token_wordchars.addRange('_', '_');
102 Token.token_wordchars.addRange('a', 'z');
103 Token.token_spaces = Token.createRange();
104 Token.token_spaces.addRange('\t', '\t');
105 Token.token_spaces.addRange('\n', '\n');
106 Token.token_spaces.addRange('\f', '\f');
107 Token.token_spaces.addRange('\r', '\r');
108 Token.token_spaces.addRange(' ', ' ');
110 Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
111 Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
112 Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
115 static Token.ParenToken createLook(int type, Token child) {
116 if (COUNTTOKENS) Token.tokens ++;
117 return new Token.ParenToken(type, child, 0);
119 static Token.ParenToken createParen(Token child, int pnumber) {
120 if (COUNTTOKENS) Token.tokens ++;
121 return new Token.ParenToken(Token.PAREN, child, pnumber);
123 static Token.ClosureToken createClosure(Token tok) {
124 if (COUNTTOKENS) Token.tokens ++;
125 return new Token.ClosureToken(Token.CLOSURE, tok);
127 static Token.ClosureToken createNGClosure(Token tok) {
128 if (COUNTTOKENS) Token.tokens ++;
129 return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
131 static Token.ConcatToken createConcat(Token tok1, Token tok2) {
132 if (COUNTTOKENS) Token.tokens ++;
133 return new Token.ConcatToken(tok1, tok2);
135 static Token.UnionToken createConcat() {
136 if (COUNTTOKENS) Token.tokens ++;
137 return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
139 static Token.UnionToken createUnion() {
140 if (COUNTTOKENS) Token.tokens ++;
141 return new Token.UnionToken(Token.UNION);
143 static Token createEmpty() {
144 return Token.token_empty;
146 static RangeToken createRange() {
147 if (COUNTTOKENS) Token.tokens ++;
148 return new RangeToken(Token.RANGE);
150 static RangeToken createNRange() {
151 if (COUNTTOKENS) Token.tokens ++;
152 return new RangeToken(Token.NRANGE);
154 static Token.CharToken createChar(int ch) {
155 if (COUNTTOKENS) Token.tokens ++;
156 return new Token.CharToken(Token.CHAR, ch);
158 static private Token.CharToken createAnchor(int ch) {
159 if (COUNTTOKENS) Token.tokens ++;
160 return new Token.CharToken(Token.ANCHOR, ch);
162 static Token.StringToken createBackReference(int refno) {
163 if (COUNTTOKENS) Token.tokens ++;
164 return new Token.StringToken(Token.BACKREFERENCE, null, refno);
166 static Token.StringToken createString(String str) {
167 if (COUNTTOKENS) Token.tokens ++;
168 return new Token.StringToken(Token.STRING, str, 0);
170 static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
171 if (COUNTTOKENS) Token.tokens ++;
172 return new Token.ModifierToken(child, add, mask);
174 static Token.ConditionToken createCondition(int refno, Token condition,
175 Token yespat, Token nopat) {
176 if (COUNTTOKENS) Token.tokens ++;
177 return new Token.ConditionToken(refno, condition, yespat, nopat);
180 protected Token(int type) {
185 * A number of children.
190 Token getChild(int index) {
193 void addChild(Token tok) {
194 throw new RuntimeException("Not supported.");
197 // for RANGE or NRANGE
198 protected void addRange(int start, int end) {
199 throw new RuntimeException("Not supported.");
201 protected void sortRanges() {
202 throw new RuntimeException("Not supported.");
204 protected void compactRanges() {
205 throw new RuntimeException("Not supported.");
207 protected void mergeRanges(Token tok) {
208 throw new RuntimeException("Not supported.");
210 protected void subtractRanges(Token tok) {
211 throw new RuntimeException("Not supported.");
213 protected void intersectRanges(Token tok) {
214 throw new RuntimeException("Not supported.");
216 static Token complementRanges(Token tok) {
217 return RangeToken.complementRanges(tok);
221 void setMin(int min) { // for CLOSURE
223 void setMax(int max) { // for CLOSURE
225 int getMin() { // for CLOSURE
228 int getMax() { // for CLOSURE
231 int getReferenceNumber() { // for STRING
234 String getString() { // for STRING
238 int getParenNumber() {
245 public String toString() {
246 return this.toString(0);
248 public String toString(int options) {
249 return this.type == Token.DOT ? "." : "";
253 * How many characters are needed?
255 final int getMinLength() {
259 for (int i = 0; i < this.size(); i ++)
260 sum += this.getChild(i).getMinLength();
265 if (this.size() == 0)
267 int ret = this.getChild(0).getMinLength();
268 for (int i = 1; i < this.size(); i ++) {
269 int min = this.getChild(i).getMinLength();
270 if (min < ret) ret = min;
275 case NONGREEDYCLOSURE:
276 if (this.getMin() >= 0)
277 return this.getMin() * this.getChild(0).getMinLength();
293 return this.getChild(0).getMinLength();
299 return this.getString().length();
302 case NEGATIVELOOKAHEAD:
304 case NEGATIVELOOKBEHIND:
305 return 0; // ***** Really?
308 throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
312 final int getMaxLength() {
316 for (int i = 0; i < this.size(); i ++) {
317 int d = this.getChild(i).getMaxLength();
318 if (d < 0) return -1;
325 if (this.size() == 0)
327 int ret = this.getChild(0).getMaxLength();
328 for (int i = 1; ret >= 0 && i < this.size(); i ++) {
329 int max = this.getChild(i).getMaxLength();
330 if (max < 0) { // infinity
334 if (max > ret) ret = max;
339 case NONGREEDYCLOSURE:
340 if (this.getMax() >= 0)
341 // When this.child.getMaxLength() < 0,
342 // this returns minus value
343 return this.getMax() * this.getChild(0).getMaxLength();
360 return this.getChild(0).getMaxLength();
366 return this.getString().length();
369 case NEGATIVELOOKAHEAD:
371 case NEGATIVELOOKBEHIND:
372 return 0; // ***** Really?
375 throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
379 static final int FC_CONTINUE = 0;
380 static final int FC_TERMINAL = 1;
381 static final int FC_ANY = 2;
382 private static final boolean isSet(int options, int flag) {
383 return (options & flag) == flag;
385 final int analyzeFirstCharacter(RangeToken result, int options) {
388 int ret = FC_CONTINUE;
389 for (int i = 0; i < this.size(); i ++)
390 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
395 if (this.size() == 0)
398 * a|b|c -> FC_TERMINAL
400 * a|b| -> FC_CONTINUE
402 int ret2 = FC_CONTINUE;
403 boolean hasEmpty = false;
404 for (int i = 0; i < this.size(); i ++) {
405 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
408 else if (ret2 == FC_CONTINUE)
411 return hasEmpty ? FC_CONTINUE : ret2;
414 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
415 if (this.size() == 1) return FC_CONTINUE;
416 if (ret3 == FC_ANY) return ret3;
417 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
418 if (ret4 == FC_ANY) return ret4;
419 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
422 case NONGREEDYCLOSURE:
423 this.getChild(0).analyzeFirstCharacter(result, options);
431 int ch = this.getChar();
432 result.addRange(ch, ch);
433 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
434 ch = Character.toUpperCase((char)ch);
435 result.addRange(ch, ch);
436 ch = Character.toLowerCase((char)ch);
437 result.addRange(ch, ch);
445 result.mergeRanges(this);
449 result.mergeRanges(Token.complementRanges(this));
454 return this.getChild(0).analyzeFirstCharacter(result, options);
457 options |= ((ModifierToken)this).getOptions();
458 options &= ~((ModifierToken)this).getOptionsMask();
459 return this.getChild(0).analyzeFirstCharacter(result, options);
462 result.addRange(0, UTF16_MAX); // **** We can not optimize.
466 int cha = this.getString().charAt(0);
468 if (REUtil.isHighSurrogate(cha)
469 && this.getString().length() >= 2
470 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
471 cha = REUtil.composeFromSurrogates(cha, ch2);
472 result.addRange(cha, cha);
473 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
474 cha = Character.toUpperCase((char)cha);
475 result.addRange(cha, cha);
476 cha = Character.toLowerCase((char)cha);
477 result.addRange(cha, cha);
482 case NEGATIVELOOKAHEAD:
484 case NEGATIVELOOKBEHIND:
488 throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
492 private final boolean isShorterThan(Token tok) {
493 if (tok == null) return false;
496 if (this.type == STRING) mylength = this.getString().length();
497 else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
498 else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
500 if (tok.type == STRING) otherlength = tok.getString().length();
501 else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
502 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
505 if (this.type == STRING) mylength = this.getString().length();
506 else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
508 if (tok.type == STRING) otherlength = tok.getString().length();
509 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
510 return mylength < otherlength;
513 static class FixedStringContainer {
516 FixedStringContainer() {
520 final void findFixedString(FixedStringContainer container, int options) {
523 Token prevToken = null;
525 for (int i = 0; i < this.size(); i ++) {
526 this.getChild(i).findFixedString(container, options);
527 if (prevToken == null || prevToken.isShorterThan(container.token)) {
528 prevToken = container.token;
529 prevOptions = container.options;
532 container.token = prevToken;
533 container.options = prevOptions;
538 case NONGREEDYCLOSURE:
546 case NEGATIVELOOKAHEAD:
548 case NEGATIVELOOKBEHIND:
550 container.token = null;
553 case CHAR: // Ignore CHAR tokens.
554 container.token = null; // **
558 container.token = this;
559 container.options = options;
564 this.getChild(0).findFixedString(container, options);
568 options |= ((ModifierToken)this).getOptions();
569 options &= ~((ModifierToken)this).getOptionsMask();
570 this.getChild(0).findFixedString(container, options);
574 throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
578 boolean match(int ch) {
579 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
582 // ------------------------------------------------------
583 private final static Hashtable categories = new Hashtable();
584 private final static Hashtable categories2 = new Hashtable();
585 private static final String[] categoryNames = {
586 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
587 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
588 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
589 "Pi", "Pf", // 29, 30
590 "L", "M", "N", "Z", "C", "P", "S", // 31-37
593 // Schema Rec. {Datatypes} - Punctuation
594 static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
595 static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
596 static final int CHAR_LETTER = 31;
597 static final int CHAR_MARK = 32;
598 static final int CHAR_NUMBER = 33;
599 static final int CHAR_SEPARATOR = 34;
600 static final int CHAR_OTHER = 35;
601 static final int CHAR_PUNCTUATION = 36;
602 static final int CHAR_SYMBOL = 37;
604 //blockNames in UNICODE 3.1 that supported by XML Schema REC
605 private static final String[] blockNames = {
606 /*0000..007F;*/ "Basic Latin",
607 /*0080..00FF;*/ "Latin-1 Supplement",
608 /*0100..017F;*/ "Latin Extended-A",
609 /*0180..024F;*/ "Latin Extended-B",
610 /*0250..02AF;*/ "IPA Extensions",
611 /*02B0..02FF;*/ "Spacing Modifier Letters",
612 /*0300..036F;*/ "Combining Diacritical Marks",
613 /*0370..03FF;*/ "Greek",
614 /*0400..04FF;*/ "Cyrillic",
615 /*0530..058F;*/ "Armenian",
616 /*0590..05FF;*/ "Hebrew",
617 /*0600..06FF;*/ "Arabic",
618 /*0700..074F;*/ "Syriac",
619 /*0780..07BF;*/ "Thaana",
620 /*0900..097F;*/ "Devanagari",
621 /*0980..09FF;*/ "Bengali",
622 /*0A00..0A7F;*/ "Gurmukhi",
623 /*0A80..0AFF;*/ "Gujarati",
624 /*0B00..0B7F;*/ "Oriya",
625 /*0B80..0BFF;*/ "Tamil",
626 /*0C00..0C7F;*/ "Telugu",
627 /*0C80..0CFF;*/ "Kannada",
628 /*0D00..0D7F;*/ "Malayalam",
629 /*0D80..0DFF;*/ "Sinhala",
630 /*0E00..0E7F;*/ "Thai",
631 /*0E80..0EFF;*/ "Lao",
632 /*0F00..0FFF;*/ "Tibetan",
633 /*1000..109F;*/ "Myanmar",
634 /*10A0..10FF;*/ "Georgian",
635 /*1100..11FF;*/ "Hangul Jamo",
636 /*1200..137F;*/ "Ethiopic",
637 /*13A0..13FF;*/ "Cherokee",
638 /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
639 /*1680..169F;*/ "Ogham",
640 /*16A0..16FF;*/ "Runic",
641 /*1780..17FF;*/ "Khmer",
642 /*1800..18AF;*/ "Mongolian",
643 /*1E00..1EFF;*/ "Latin Extended Additional",
644 /*1F00..1FFF;*/ "Greek Extended",
645 /*2000..206F;*/ "General Punctuation",
646 /*2070..209F;*/ "Superscripts and Subscripts",
647 /*20A0..20CF;*/ "Currency Symbols",
648 /*20D0..20FF;*/ "Combining Marks for Symbols",
649 /*2100..214F;*/ "Letterlike Symbols",
650 /*2150..218F;*/ "Number Forms",
651 /*2190..21FF;*/ "Arrows",
652 /*2200..22FF;*/ "Mathematical Operators",
653 /*2300..23FF;*/ "Miscellaneous Technical",
654 /*2400..243F;*/ "Control Pictures",
655 /*2440..245F;*/ "Optical Character Recognition",
656 /*2460..24FF;*/ "Enclosed Alphanumerics",
657 /*2500..257F;*/ "Box Drawing",
658 /*2580..259F;*/ "Block Elements",
659 /*25A0..25FF;*/ "Geometric Shapes",
660 /*2600..26FF;*/ "Miscellaneous Symbols",
661 /*2700..27BF;*/ "Dingbats",
662 /*2800..28FF;*/ "Braille Patterns",
663 /*2E80..2EFF;*/ "CJK Radicals Supplement",
664 /*2F00..2FDF;*/ "Kangxi Radicals",
665 /*2FF0..2FFF;*/ "Ideographic Description Characters",
666 /*3000..303F;*/ "CJK Symbols and Punctuation",
667 /*3040..309F;*/ "Hiragana",
668 /*30A0..30FF;*/ "Katakana",
669 /*3100..312F;*/ "Bopomofo",
670 /*3130..318F;*/ "Hangul Compatibility Jamo",
671 /*3190..319F;*/ "Kanbun",
672 /*31A0..31BF;*/ "Bopomofo Extended",
673 /*3200..32FF;*/ "Enclosed CJK Letters and Months",
674 /*3300..33FF;*/ "CJK Compatibility",
675 /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
676 /*4E00..9FFF;*/ "CJK Unified Ideographs",
677 /*A000..A48F;*/ "Yi Syllables",
678 /*A490..A4CF;*/ "Yi Radicals",
679 /*AC00..D7A3;*/ "Hangul Syllables",
680 /*E000..F8FF;*/ "Private Use",
681 /*F900..FAFF;*/ "CJK Compatibility Ideographs",
682 /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
683 /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
684 /*FE20..FE2F;*/ "Combining Half Marks",
685 /*FE30..FE4F;*/ "CJK Compatibility Forms",
686 /*FE50..FE6F;*/ "Small Form Variants",
687 /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
688 /*FEFF..FEFF;*/ "Specials",
689 /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
690 //missing Specials add manually
691 /*10300..1032F;*/ "Old Italic", // 84
692 /*10330..1034F;*/ "Gothic",
693 /*10400..1044F;*/ "Deseret",
694 /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
695 /*1D100..1D1FF;*/ "Musical Symbols",
696 /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
697 /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
698 /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
699 /*E0000..E007F;*/ "Tags",
700 //missing 2 private use add manually
704 //F0000..FFFFD; "Private Use",
705 //100000..10FFFD; "Private Use"
706 //FFF0..FFFD; "Specials",
707 static final String blockRanges =
708 "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
709 +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
710 +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
711 +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
712 +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
713 +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
714 +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
715 +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
716 +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
717 +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
718 +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
719 static final int[] nonBMPBlockRanges = {
720 0x10300, 0x1032F, // 84
730 private static final int NONBMP_BLOCK_START = 84;
732 static protected RangeToken getRange(String name, boolean positive) {
733 if (Token.categories.size() == 0) {
734 synchronized (Token.categories) {
735 Token[] ranges = new Token[Token.categoryNames.length];
736 for (int i = 0; i < ranges.length; i ++) {
737 ranges[i] = Token.createRange();
740 for (int i = 0; i < 0x10000; i ++) {
741 type = Character.getType((char)i);
742 if (type == Character.START_PUNCTUATION ||
743 type == Character.END_PUNCTUATION) {
744 //build table of Pi values
745 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
746 i == 0x201F || i == 0x2039) {
747 type = CHAR_INIT_QUOTE;
749 //build table of Pf values
750 if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
751 type = CHAR_FINAL_QUOTE;
754 ranges[type].addRange(i, i);
756 case Character.UPPERCASE_LETTER:
757 case Character.LOWERCASE_LETTER:
758 case Character.TITLECASE_LETTER:
759 case Character.MODIFIER_LETTER:
760 case Character.OTHER_LETTER:
763 case Character.NON_SPACING_MARK:
764 case Character.COMBINING_SPACING_MARK:
765 case Character.ENCLOSING_MARK:
768 case Character.DECIMAL_DIGIT_NUMBER:
769 case Character.LETTER_NUMBER:
770 case Character.OTHER_NUMBER:
773 case Character.SPACE_SEPARATOR:
774 case Character.LINE_SEPARATOR:
775 case Character.PARAGRAPH_SEPARATOR:
776 type = CHAR_SEPARATOR;
778 case Character.CONTROL:
779 case Character.FORMAT:
780 case Character.SURROGATE:
781 case Character.PRIVATE_USE:
782 case Character.UNASSIGNED:
785 case Character.CONNECTOR_PUNCTUATION:
786 case Character.DASH_PUNCTUATION:
787 case Character.START_PUNCTUATION:
788 case Character.END_PUNCTUATION:
789 case CHAR_INIT_QUOTE:
790 case CHAR_FINAL_QUOTE:
791 case Character.OTHER_PUNCTUATION:
792 type = CHAR_PUNCTUATION;
794 case Character.MATH_SYMBOL:
795 case Character.CURRENCY_SYMBOL:
796 case Character.MODIFIER_SYMBOL:
797 case Character.OTHER_SYMBOL:
801 throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
803 ranges[type].addRange(i, i);
804 } // for all characters
805 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
806 ranges[CHAR_OTHER].addRange(0x10000, Token.UTF16_MAX);
808 for (int i = 0; i < ranges.length; i ++) {
809 if (Token.categoryNames[i] != null) {
810 if (i == Character.UNASSIGNED) { // Unassigned
811 ranges[i].addRange(0x10000, Token.UTF16_MAX);
813 Token.categories.put(Token.categoryNames[i], ranges[i]);
814 Token.categories2.put(Token.categoryNames[i],
815 Token.complementRanges(ranges[i]));
818 //REVISIT: do we really need to support block names as in Unicode 3.1
819 // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
821 StringBuffer buffer = new StringBuffer(50);
822 for (int i = 0; i < Token.blockNames.length; i ++) {
823 Token r1 = Token.createRange();
825 if (i < NONBMP_BLOCK_START) {
827 int rstart = Token.blockRanges.charAt(location);
828 int rend = Token.blockRanges.charAt(location+1);
830 //System.out.println(n+" " +Integer.toHexString(rstart)
831 // +"-"+ Integer.toHexString(rend));
832 r1.addRange(rstart, rend);
834 location = (i - NONBMP_BLOCK_START) * 2;
835 r1.addRange(Token.nonBMPBlockRanges[location],
836 Token.nonBMPBlockRanges[location + 1]);
838 String n = Token.blockNames[i];
839 if (n.equals("Specials"))
840 r1.addRange(0xfff0, 0xfffd);
841 if (n.equals("Private Use")) {
842 r1.addRange(0xF0000,0xFFFFD);
843 r1.addRange(0x100000,0x10FFFD);
845 Token.categories.put(n, r1);
846 Token.categories2.put(n, Token.complementRanges(r1));
849 if (n.indexOf(' ') >= 0) {
850 for (int ci = 0; ci < n.length(); ci ++)
851 if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci));
856 Token.setAlias(buffer.toString(), n, true);
860 Token.setAlias("ASSIGNED", "Cn", false);
861 Token.setAlias("UNASSIGNED", "Cn", true);
862 Token all = Token.createRange();
863 all.addRange(0, Token.UTF16_MAX);
864 Token.categories.put("ALL", all);
865 Token.categories2.put("ALL", Token.complementRanges(all));
866 Token.registerNonXS("ASSIGNED");
867 Token.registerNonXS("UNASSIGNED");
868 Token.registerNonXS("ALL");
870 Token isalpha = Token.createRange();
871 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
872 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
873 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
874 Token.categories.put("IsAlpha", isalpha);
875 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
876 Token.registerNonXS("IsAlpha");
878 Token isalnum = Token.createRange();
879 isalnum.mergeRanges(isalpha); // Lu Ll Lo
880 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
881 Token.categories.put("IsAlnum", isalnum);
882 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
883 Token.registerNonXS("IsAlnum");
885 Token isspace = Token.createRange();
886 isspace.mergeRanges(Token.token_spaces);
887 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
888 Token.categories.put("IsSpace", isspace);
889 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
890 Token.registerNonXS("IsSpace");
892 Token isword = Token.createRange();
893 isword.mergeRanges(isalnum); // Lu Ll Lo Nd
894 isword.addRange('_', '_');
895 Token.categories.put("IsWord", isword);
896 Token.categories2.put("IsWord", Token.complementRanges(isword));
897 Token.registerNonXS("IsWord");
899 Token isascii = Token.createRange();
900 isascii.addRange(0, 127);
901 Token.categories.put("IsASCII", isascii);
902 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
903 Token.registerNonXS("IsASCII");
905 Token isnotgraph = Token.createRange();
906 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
907 isnotgraph.addRange(' ', ' ');
908 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
909 Token.categories2.put("IsGraph", isnotgraph);
910 Token.registerNonXS("IsGraph");
912 Token isxdigit = Token.createRange();
913 isxdigit.addRange('0', '9');
914 isxdigit.addRange('A', 'F');
915 isxdigit.addRange('a', 'f');
916 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
917 Token.categories2.put("IsXDigit", isxdigit);
918 Token.registerNonXS("IsXDigit");
920 Token.setAlias("IsDigit", "Nd", true);
921 Token.setAlias("IsUpper", "Lu", true);
922 Token.setAlias("IsLower", "Ll", true);
923 Token.setAlias("IsCntrl", "C", true);
924 Token.setAlias("IsPrint", "C", false);
925 Token.setAlias("IsPunct", "P", true);
926 Token.registerNonXS("IsDigit");
927 Token.registerNonXS("IsUpper");
928 Token.registerNonXS("IsLower");
929 Token.registerNonXS("IsCntrl");
930 Token.registerNonXS("IsPrint");
931 Token.registerNonXS("IsPunct");
933 Token.setAlias("alpha", "IsAlpha", true);
934 Token.setAlias("alnum", "IsAlnum", true);
935 Token.setAlias("ascii", "IsASCII", true);
936 Token.setAlias("cntrl", "IsCntrl", true);
937 Token.setAlias("digit", "IsDigit", true);
938 Token.setAlias("graph", "IsGraph", true);
939 Token.setAlias("lower", "IsLower", true);
940 Token.setAlias("print", "IsPrint", true);
941 Token.setAlias("punct", "IsPunct", true);
942 Token.setAlias("space", "IsSpace", true);
943 Token.setAlias("upper", "IsUpper", true);
944 Token.setAlias("word", "IsWord", true); // Perl extension
945 Token.setAlias("xdigit", "IsXDigit", true);
946 Token.registerNonXS("alpha");
947 Token.registerNonXS("alnum");
948 Token.registerNonXS("ascii");
949 Token.registerNonXS("cntrl");
950 Token.registerNonXS("digit");
951 Token.registerNonXS("graph");
952 Token.registerNonXS("lower");
953 Token.registerNonXS("print");
954 Token.registerNonXS("punct");
955 Token.registerNonXS("space");
956 Token.registerNonXS("upper");
957 Token.registerNonXS("word");
958 Token.registerNonXS("xdigit");
961 RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
962 : (RangeToken)Token.categories2.get(name);
963 //if (tok == null) System.out.println(name);
966 static protected RangeToken getRange(String name, boolean positive, boolean xs) {
967 RangeToken range = Token.getRange(name, positive);
968 if (xs && range != null && Token.isRegisterNonXS(name))
973 static Hashtable nonxs = null;
975 * This method is called by only getRange().
976 * So this method need not MT-safe.
978 static protected void registerNonXS(String name) {
979 if (Token.nonxs == null)
980 Token.nonxs = new Hashtable();
981 Token.nonxs.put(name, name);
983 static protected boolean isRegisterNonXS(String name) {
984 if (Token.nonxs == null)
987 //System.err.println("isRegisterNonXS: "+name);
988 return Token.nonxs.containsKey(name);
991 private static void setAlias(String newName, String name, boolean positive) {
992 Token t1 = (Token)Token.categories.get(name);
993 Token t2 = (Token)Token.categories2.get(name);
995 Token.categories.put(newName, t1);
996 Token.categories2.put(newName, t2);
998 Token.categories2.put(newName, t1);
999 Token.categories.put(newName, t2);
1003 // ------------------------------------------------------
1005 static final String viramaString =
1006 "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1007 +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1008 +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1009 +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1010 +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1011 +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1012 +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1013 +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1014 +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1015 +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1016 +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1018 static private Token token_grapheme = null;
1019 static synchronized Token getGraphemePattern() {
1020 if (Token.token_grapheme != null)
1021 return Token.token_grapheme;
1023 Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
1024 base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1025 base_char.subtractRanges(Token.getRange("M", true));
1026 base_char.subtractRanges(Token.getRange("C", true));
1028 Token virama = Token.createRange();
1029 for (int i = 0; i < Token.viramaString.length(); i++) {
1030 virama.addRange(i, i);
1033 Token combiner_wo_virama = Token.createRange();
1034 combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1035 combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1036 combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1038 Token left = Token.createUnion(); // base_char?
1039 left.addChild(base_char);
1040 left.addChild(Token.token_empty);
1042 Token foo = Token.createUnion();
1043 foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1044 foo.addChild(combiner_wo_virama);
1046 foo = Token.createClosure(foo);
1048 foo = Token.createConcat(left, foo);
1050 Token.token_grapheme = foo;
1051 return Token.token_grapheme;
1055 * Combing Character Sequence in Perl 5.6.
1057 static private Token token_ccs = null;
1058 static synchronized Token getCombiningCharacterSequence() {
1059 if (Token.token_ccs != null)
1060 return Token.token_ccs;
1062 Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1063 foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1064 Token.token_ccs = foo;
1065 return Token.token_ccs;
1068 // ------------------------------------------------------
1070 // ------------------------------------------------------
1072 * This class represents a node in parse tree.
1074 static class StringToken extends Token implements java.io.Serializable {
1076 private static final long serialVersionUID = -4614366944218504172L;
1079 final int refNumber;
1081 StringToken(int type, String str, int n) {
1087 int getReferenceNumber() { // for STRING
1088 return this.refNumber;
1090 String getString() { // for STRING
1094 public String toString(int options) {
1095 if (this.type == BACKREFERENCE)
1096 return "\\"+this.refNumber;
1098 return REUtil.quoteMeta(this.string);
1103 * This class represents a node in parse tree.
1105 static class ConcatToken extends Token implements java.io.Serializable {
1107 private static final long serialVersionUID = 8717321425541346381L;
1112 ConcatToken(Token t1, Token t2) {
1113 super(Token.CONCAT);
1121 Token getChild(int index) {
1122 return index == 0 ? this.child : this.child2;
1125 public String toString(int options) {
1127 if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1128 ret = this.child.toString(options)+"+";
1129 } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1130 ret = this.child.toString(options)+"+?";
1132 ret = this.child.toString(options)+this.child2.toString(options);
1138 * This class represents a node in parse tree.
1140 static class CharToken extends Token implements java.io.Serializable {
1142 private static final long serialVersionUID = -4394272816279496989L;
1146 CharToken(int type, int ch) {
1152 return this.chardata;
1155 public String toString(int options) {
1157 switch (this.type) {
1159 switch (this.chardata) {
1160 case '|': case '*': case '+': case '?':
1161 case '(': case ')': case '.': case '[':
1162 case '{': case '\\':
1163 ret = "\\"+(char)this.chardata;
1165 case '\f': ret = "\\f"; break;
1166 case '\n': ret = "\\n"; break;
1167 case '\r': ret = "\\r"; break;
1168 case '\t': ret = "\\t"; break;
1169 case 0x1b: ret = "\\e"; break;
1170 //case 0x0b: ret = "\\v"; break;
1172 if (this.chardata >= 0x10000) {
1173 String pre = "0"+Integer.toHexString(this.chardata);
1174 ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1176 ret = ""+(char)this.chardata;
1181 if (this == Token.token_linebeginning || this == Token.token_lineend)
1182 ret = ""+(char)this.chardata;
1184 ret = "\\"+(char)this.chardata;
1193 boolean match(int ch) {
1194 if (this.type == CHAR) {
1195 return ch == this.chardata;
1197 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
1202 * This class represents a node in parse tree.
1204 static class ClosureToken extends Token implements java.io.Serializable {
1206 private static final long serialVersionUID = 1308971930673997452L;
1212 ClosureToken(int type, Token tok) {
1222 Token getChild(int index) {
1226 final void setMin(int min) {
1229 final void setMax(int max) {
1232 final int getMin() {
1235 final int getMax() {
1239 public String toString(int options) {
1241 if (this.type == CLOSURE) {
1242 if (this.getMin() < 0 && this.getMax() < 0) {
1243 ret = this.child.toString(options)+"*";
1244 } else if (this.getMin() == this.getMax()) {
1245 ret = this.child.toString(options)+"{"+this.getMin()+"}";
1246 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1247 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1248 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1249 ret = this.child.toString(options)+"{"+this.getMin()+",}";
1251 throw new RuntimeException("Token#toString(): CLOSURE "
1252 +this.getMin()+", "+this.getMax());
1254 if (this.getMin() < 0 && this.getMax() < 0) {
1255 ret = this.child.toString(options)+"*?";
1256 } else if (this.getMin() == this.getMax()) {
1257 ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1258 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1259 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1260 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1261 ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1263 throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
1264 +this.getMin()+", "+this.getMax());
1271 * This class represents a node in parse tree.
1273 static class ParenToken extends Token implements java.io.Serializable {
1275 private static final long serialVersionUID = -5938014719827987704L;
1278 final int parennumber;
1280 ParenToken(int type, Token tok, int paren) {
1283 this.parennumber = paren;
1289 Token getChild(int index) {
1293 int getParenNumber() {
1294 return this.parennumber;
1297 public String toString(int options) {
1299 switch (this.type) {
1301 if (this.parennumber == 0) {
1302 ret = "(?:"+this.child.toString(options)+")";
1304 ret = "("+this.child.toString(options)+")";
1309 ret = "(?="+this.child.toString(options)+")";
1311 case NEGATIVELOOKAHEAD:
1312 ret = "(?!"+this.child.toString(options)+")";
1315 ret = "(?<="+this.child.toString(options)+")";
1317 case NEGATIVELOOKBEHIND:
1318 ret = "(?<!"+this.child.toString(options)+")";
1321 ret = "(?>"+this.child.toString(options)+")";
1329 * (?(condition)yes-pattern|no-pattern)
1331 static class ConditionToken extends Token implements java.io.Serializable {
1333 private static final long serialVersionUID = 4353765277910594411L;
1335 final int refNumber;
1336 final Token condition;
1339 ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1340 super(Token.CONDITION);
1341 this.refNumber = refno;
1342 this.condition = cond;
1347 return this.no == null ? 1 : 2;
1349 Token getChild(int index) {
1350 if (index == 0) return this.yes;
1351 if (index == 1) return this.no;
1352 throw new RuntimeException("Internal Error: "+index);
1355 public String toString(int options) {
1357 if (refNumber > 0) {
1358 ret = "(?("+refNumber+")";
1359 } else if (this.condition.type == Token.ANCHOR) {
1360 ret = "(?("+this.condition+")";
1362 ret = "(?"+this.condition;
1365 if (this.no == null) {
1366 ret += this.yes+")";
1368 ret += this.yes+"|"+this.no+")";
1377 static class ModifierToken extends Token implements java.io.Serializable {
1379 private static final long serialVersionUID = -9114536559696480356L;
1385 ModifierToken(Token tok, int add, int mask) {
1386 super(Token.MODIFIERGROUP);
1395 Token getChild(int index) {
1402 int getOptionsMask() {
1406 public String toString(int options) {
1408 +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1409 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1411 +this.child.toString(options)
1417 * This class represents a node in parse tree.
1418 * for UNION or CONCAT.
1420 static class UnionToken extends Token implements java.io.Serializable {
1422 private static final long serialVersionUID = -2568843945989489861L;
1426 UnionToken(int type) {
1430 void addChild(Token tok) {
1431 if (tok == null) return;
1432 if (this.children == null) this.children = new Vector();
1433 if (this.type == UNION) {
1434 this.children.addElement(tok);
1437 // This is CONCAT, and new child is CONCAT.
1438 if (tok.type == CONCAT) {
1439 for (int i = 0; i < tok.size(); i ++)
1440 this.addChild(tok.getChild(i)); // Recursion
1443 int size = this.children.size();
1445 this.children.addElement(tok);
1448 Token previous = (Token)this.children.elementAt(size-1);
1449 if (!((previous.type == CHAR || previous.type == STRING)
1450 && (tok.type == CHAR || tok.type == STRING))) {
1451 this.children.addElement(tok);
1455 //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1457 StringBuffer buffer;
1458 int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1459 if (previous.type == CHAR) { // Replace previous token by STRING
1460 buffer = new StringBuffer(2 + nextMaxLength);
1461 int ch = previous.getChar();
1463 buffer.append(REUtil.decomposeToSurrogates(ch));
1465 buffer.append((char)ch);
1466 previous = Token.createString(null);
1467 this.children.setElementAt(previous, size-1);
1469 buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
1470 buffer.append(previous.getString());
1473 if (tok.type == CHAR) {
1474 int ch = tok.getChar();
1476 buffer.append(REUtil.decomposeToSurrogates(ch));
1478 buffer.append((char)ch);
1480 buffer.append(tok.getString());
1483 ((StringToken)previous).string = new String(buffer);
1487 return this.children == null ? 0 : this.children.size();
1489 Token getChild(int index) {
1490 return (Token)this.children.elementAt(index);
1493 public String toString(int options) {
1495 if (this.type == CONCAT) {
1496 if (this.children.size() == 2) {
1497 Token ch = this.getChild(0);
1498 Token ch2 = this.getChild(1);
1499 if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1500 ret = ch.toString(options)+"+";
1501 } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1502 ret = ch.toString(options)+"+?";
1504 ret = ch.toString(options)+ch2.toString(options);
1506 StringBuffer sb = new StringBuffer();
1507 for (int i = 0; i < this.children.size(); i ++) {
1508 sb.append(((Token)this.children.elementAt(i)).toString(options));
1510 ret = new String(sb);
1514 if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1515 ret = this.getChild(0).toString(options)+"?";
1516 } else if (this.children.size() == 2
1517 && this.getChild(0).type == EMPTY) {
1518 ret = this.getChild(1).toString(options)+"??";
1520 StringBuffer sb = new StringBuffer();
1521 sb.append(((Token)this.children.elementAt(0)).toString(options));
1522 for (int i = 1; i < this.children.size(); i ++) {
1523 sb.append((char)'|');
1524 sb.append(((Token)this.children.elementAt(i)).toString(options));
1526 ret = new String(sb);