2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.opendaylight.yangtools.xsd.regex;
20 import java.util.Hashtable;
21 import java.util.Vector;
24 * This class represents a node in parse tree.
28 * @version $Id: Token.java 1638344 2014-11-11 20:15:46Z mrglavas $
30 class Token implements java.io.Serializable {
32 private static final long serialVersionUID = 8484976002585487481L;
34 static final boolean COUNTTOKENS = true;
35 static int tokens = 0;
37 static final int CHAR = 0; // Literal char
38 static final int DOT = 11; // .
39 static final int CONCAT = 1; // XY
40 static final int UNION = 2; // X|Y|Z
41 static final int CLOSURE = 3; // X*
42 static final int RANGE = 4; // [a-zA-Z] etc.
43 static final int NRANGE = 5; // [^a-zA-Z] etc.
44 static final int PAREN = 6; // (X) or (?:X)
45 static final int EMPTY = 7; //
46 static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
47 static final int NONGREEDYCLOSURE = 9; // *? +?
48 static final int STRING = 10; // strings
49 static final int BACKREFERENCE = 12; // back references
50 static final int LOOKAHEAD = 20; // (?=...)
51 static final int NEGATIVELOOKAHEAD = 21; // (?!...)
52 static final int LOOKBEHIND = 22; // (?<=...)
53 static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
54 static final int INDEPENDENT = 24; // (?>...)
55 static final int MODIFIERGROUP = 25; // (?ims-ims:...)
56 static final int CONDITION = 26; // (?(...)yes|no)
58 static final int UTF16_MAX = 0x10ffff;
62 static Token token_dot;
63 static Token token_0to9;
64 static Token token_wordchars;
65 static Token token_not_0to9;
66 static Token token_not_wordchars;
67 static Token token_spaces;
68 static Token token_not_spaces;
69 static Token token_empty;
70 static Token token_linebeginning;
71 static Token token_linebeginning2;
72 static Token token_lineend;
73 static Token token_stringbeginning;
74 static Token token_stringend;
75 static Token token_stringend2;
76 static Token token_wordedge;
77 static Token token_not_wordedge;
78 static Token token_wordbeginning;
79 static Token token_wordend;
81 Token.token_empty = new Token(Token.EMPTY);
83 Token.token_linebeginning = Token.createAnchor('^');
84 Token.token_linebeginning2 = Token.createAnchor('@');
85 Token.token_lineend = Token.createAnchor('$');
86 Token.token_stringbeginning = Token.createAnchor('A');
87 Token.token_stringend = Token.createAnchor('z');
88 Token.token_stringend2 = Token.createAnchor('Z');
89 Token.token_wordedge = Token.createAnchor('b');
90 Token.token_not_wordedge = Token.createAnchor('B');
91 Token.token_wordbeginning = Token.createAnchor('<');
92 Token.token_wordend = Token.createAnchor('>');
94 Token.token_dot = new Token(Token.DOT);
96 Token.token_0to9 = Token.createRange();
97 Token.token_0to9.addRange('0', '9');
98 Token.token_wordchars = Token.createRange();
99 Token.token_wordchars.addRange('0', '9');
100 Token.token_wordchars.addRange('A', 'Z');
101 Token.token_wordchars.addRange('_', '_');
102 Token.token_wordchars.addRange('a', 'z');
103 Token.token_spaces = Token.createRange();
104 Token.token_spaces.addRange('\t', '\t');
105 Token.token_spaces.addRange('\n', '\n');
106 Token.token_spaces.addRange('\f', '\f');
107 Token.token_spaces.addRange('\r', '\r');
108 Token.token_spaces.addRange(' ', ' ');
110 Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
111 Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
112 Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
// Builds a ParenToken of the caller-supplied type around child, passing 0 in the
// constructor position where createParen() passes its paren number.
115 static Token.ParenToken createLook(int type, Token child) {
119 return new Token.ParenToken(type, child, 0);
// Builds a PAREN-typed ParenToken around child, forwarding pnumber as the third
// constructor argument.
121 static Token.ParenToken createParen(Token child, int pnumber) {
125 return new Token.ParenToken(Token.PAREN, child, pnumber);
// Builds a ClosureToken of type CLOSURE (X*) around tok.
127 static Token.ClosureToken createClosure(Token tok) {
131 return new Token.ClosureToken(Token.CLOSURE, tok);
// Builds a ClosureToken of type NONGREEDYCLOSURE (*? +?) around tok.
133 static Token.ClosureToken createNGClosure(Token tok) {
137 return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
// Builds a two-child ConcatToken from tok1 and tok2, in that argument order.
139 static Token.ConcatToken createConcat(Token tok1, Token tok2) {
143 return new Token.ConcatToken(tok1, tok2);
// Builds a CONCAT-typed token using the UnionToken class; the inline remark
// states that using UnionToken here is deliberate.
145 static Token.UnionToken createConcat() {
149 return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
// Builds a UnionToken of type UNION (X|Y|Z).
151 static Token.UnionToken createUnion() {
155 return new Token.UnionToken(Token.UNION);
// Returns the shared Token.token_empty instance; no new object is created.
157 static Token createEmpty() {
158 return Token.token_empty;
// Builds a new RangeToken of type RANGE ([a-zA-Z] etc.).
160 static RangeToken createRange() {
164 return new RangeToken(Token.RANGE);
// Builds a new RangeToken of type NRANGE ([^a-zA-Z] etc.).
166 static RangeToken createNRange() {
170 return new RangeToken(Token.NRANGE);
// Builds a CharToken of type CHAR carrying the value ch.
172 static Token.CharToken createChar(int ch) {
176 return new Token.CharToken(Token.CHAR, ch);
// Builds a CharToken of type ANCHOR whose character value identifies the anchor
// (the static initializer passes '^', '$', 'A', 'z', 'Z', 'b', 'B', '<', '>' and '@').
178 static private Token.CharToken createAnchor(int ch) {
182 return new Token.CharToken(Token.ANCHOR, ch);
// Builds a StringToken of type BACKREFERENCE with a null string and refno as its
// reference number.
184 static Token.StringToken createBackReference(int refno) {
188 return new Token.StringToken(Token.BACKREFERENCE, null, refno);
// Builds a StringToken of type STRING holding str, with 0 as its reference number.
190 static Token.StringToken createString(String str) {
194 return new Token.StringToken(Token.STRING, str, 0);
// Builds a ModifierToken around child, forwarding add and mask unchanged to the
// ModifierToken constructor.
196 static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
200 return new Token.ModifierToken(child, add, mask);
// Builds a ConditionToken, forwarding refno, condition, yespat and nopat unchanged
// to the ConditionToken constructor.
202 static Token.ConditionToken createCondition(int refno, Token condition,
203 Token yespat, Token nopat) {
207 return new Token.ConditionToken(refno, condition, yespat, nopat);
210 protected Token(int type) {
215 * A number of children.
220 Token getChild(int index) {
// Base implementation: always throws RuntimeException("Not supported.").
// Presumably overridden by token types that keep a child list - confirm in the subclasses.
223 void addChild(Token tok) {
224 throw new RuntimeException("Not supported.");
227 // for RANGE or NRANGE
// Base implementation: always throws RuntimeException("Not supported.").
// Presumably overridden by the range token class - confirm there.
228 protected void addRange(int start, int end) {
229 throw new RuntimeException("Not supported.");
// Base implementation: always throws RuntimeException("Not supported.").
231 protected void sortRanges() {
232 throw new RuntimeException("Not supported.");
// Base implementation: always throws RuntimeException("Not supported.").
234 protected void compactRanges() {
235 throw new RuntimeException("Not supported.");
// Base implementation: always throws RuntimeException("Not supported.").
237 protected void mergeRanges(Token tok) {
238 throw new RuntimeException("Not supported.");
// Base implementation: always throws RuntimeException("Not supported.").
240 protected void subtractRanges(Token tok) {
241 throw new RuntimeException("Not supported.");
// Base implementation: always throws RuntimeException("Not supported.").
243 protected void intersectRanges(Token tok) {
244 throw new RuntimeException("Not supported.");
// Convenience forwarder: returns whatever RangeToken.complementRanges(tok) returns.
246 static Token complementRanges(Token tok) {
247 return RangeToken.complementRanges(tok);
251 void setMin(int min) { // for CLOSURE
253 void setMax(int max) { // for CLOSURE
255 int getMin() { // for CLOSURE
258 int getMax() { // for CLOSURE
261 int getReferenceNumber() { // for STRING
264 String getString() { // for STRING
268 int getParenNumber() {
// Renders this token by delegating to toString(int) with options 0.
276 public String toString() {
277 return this.toString(0);
// Base rendering: "." for a DOT token, the empty string for every other type.
// The options argument is not consulted here.
279 public String toString(int options) {
280 return this.type == Token.DOT ? "." : "";
284 * How many characters are needed?
// Computes a minimum length for this token, branching per token type.
// NOTE(review): the switch header, several case labels and short statements fall on
// lines that are not part of this listing; comments below describe only what is visible.
286 final int getMinLength() {
// Adds up the minimum lengths of all children.
290 for (int i = 0; i < this.size(); i ++) {
291 sum += this.getChild(i).getMinLength();
// A token without children is treated separately from the loop below.
297 if (this.size() == 0) {
// Starts from the first child's minimum and then inspects each remaining child.
300 int ret = this.getChild(0).getMinLength();
301 for (int i = 1; i < this.size(); i ++) {
302 int min = this.getChild(i).getMinLength();
310 case NONGREEDYCLOSURE:
// With a non-negative getMin(), the result is getMin() times the child's minimum length.
311 if (this.getMin() >= 0) {
312 return this.getMin() * this.getChild(0).getMinLength();
// Single-child delegation.
329 return this.getChild(0).getMinLength();
// Length of the token's string, in UTF-16 code units (String.length()).
335 return this.getString().length();
338 case NEGATIVELOOKAHEAD:
340 case NEGATIVELOOKBEHIND:
341 return 0; // ***** Really?
// Reports the offending type value in the exception message.
344 throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
// Computes a maximum length for this token, branching per token type; a negative
// value stands for "infinity" (see the inline remark on the max < 0 test).
// NOTE(review): the switch header, several case labels and short statements fall on
// lines that are not part of this listing; comments below describe only what is visible.
348 final int getMaxLength() {
// Visits every child and reads its maximum length into d.
352 for (int i = 0; i < this.size(); i ++) {
353 int d = this.getChild(i).getMaxLength();
// A token without children is treated separately from the loop below.
363 if (this.size() == 0) {
// Starts from the first child's maximum; the loop stops early once ret is negative.
366 int ret = this.getChild(0).getMaxLength();
367 for (int i = 1; ret >= 0 && i < this.size(); i ++) {
368 int max = this.getChild(i).getMaxLength();
369 if (max < 0) { // infinity
380 case NONGREEDYCLOSURE:
// With a non-negative getMax(), the result is getMax() times the child's maximum length.
381 if (this.getMax() >= 0) {
382 // When this.child.getMaxLength() < 0,
383 // this returns minus value
384 return this.getMax() * this.getChild(0).getMaxLength();
// Single-child delegation.
402 return this.getChild(0).getMaxLength();
// Length of the token's string, in UTF-16 code units (String.length()).
408 return this.getString().length();
411 case NEGATIVELOOKAHEAD:
413 case NEGATIVELOOKBEHIND:
414 return 0; // ***** Really?
// Reports the offending type value in the exception message.
417 throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
421 static final int FC_CONTINUE = 0;
422 static final int FC_TERMINAL = 1;
423 static final int FC_ANY = 2;
424 private static final boolean isSet(int options, int flag) {
425 return (options & flag) == flag;
// Walks this token and adds to result the characters the token can start with,
// returning one of FC_CONTINUE, FC_TERMINAL or FC_ANY.
// NOTE(review): the switch header, several case labels and short statements fall on
// lines that are not part of this listing; comments below describe only what is visible.
427 final int analyzeFirstCharacter(RangeToken result, int options) {
// Analyzes the children in order and reacts to the first child whose result is not FC_CONTINUE.
430 int ret = FC_CONTINUE;
431 for (int i = 0; i < this.size(); i ++) {
432 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) {
// A token without children is treated separately.
439 if (this.size() == 0) {
443 * a|b|c -> FC_TERMINAL
445 * a|b| -> FC_CONTINUE
// Analyzes every alternative; hasEmpty decides below whether FC_CONTINUE is returned.
447 int ret2 = FC_CONTINUE;
448 boolean hasEmpty = false;
449 for (int i = 0; i < this.size(); i ++) {
450 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
451 if (ret2 == FC_ANY) {
453 } else if (ret2 == FC_CONTINUE) {
457 return hasEmpty ? FC_CONTINUE : ret2;
// Analyzes child 0 and, when there is more than one child, child 1 as well.
460 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
461 if (this.size() == 1) {
464 if (ret3 == FC_ANY) {
467 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
468 if (ret4 == FC_ANY) {
// FC_CONTINUE if either branch returned it, otherwise FC_TERMINAL.
471 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
474 case NONGREEDYCLOSURE:
// The child's return value is discarded here.
475 this.getChild(0).analyzeFirstCharacter(result, options);
// Adds the token's own character; for a BMP character under IGNORE_CASE it also adds
// the upper-case form and the lower-case form of that upper-case character.
483 int ch = this.getChar();
484 result.addRange(ch, ch);
485 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
486 ch = Character.toUpperCase((char)ch);
487 result.addRange(ch, ch);
488 ch = Character.toLowerCase((char)ch);
489 result.addRange(ch, ch);
// Merges this token's own ranges into result.
497 result.mergeRanges(this);
// Merges the complement of this token's ranges into result.
501 result.mergeRanges(Token.complementRanges(this));
// Single-child delegation with unchanged options.
506 return this.getChild(0).analyzeFirstCharacter(result, options);
// Modifier group: turns on the group's option bits, clears the bits of its mask,
// then analyzes the child with the adjusted options.
509 options |= ((ModifierToken)this).getOptions();
510 options &= ~((ModifierToken)this).getOptionsMask();
511 return this.getChild(0).analyzeFirstCharacter(result, options);
514 result.addRange(0, UTF16_MAX); // **** We can not optimize.
// Takes the first UTF-16 unit of the string and, when it is a high surrogate followed
// by a low surrogate, combines the pair into one code point before adding it.
518 int cha = this.getString().charAt(0);
520 if (REUtil.isHighSurrogate(cha)
521 && this.getString().length() >= 2
522 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) {
523 cha = REUtil.composeFromSurrogates(cha, ch2);
525 result.addRange(cha, cha);
// Same case handling as for a single character above.
526 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
527 cha = Character.toUpperCase((char)cha);
528 result.addRange(cha, cha);
529 cha = Character.toLowerCase((char)cha);
530 result.addRange(cha, cha);
535 case NEGATIVELOOKAHEAD:
537 case NEGATIVELOOKBEHIND:
// NOTE(review): the message names analyzeHeadCharacter(), but this method is analyzeFirstCharacter().
541 throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
// Compares the length of this token with the length of tok and returns true when
// this one is strictly shorter (see the final return).
// NOTE(review): the length computation appears twice. The first variant (original
// lines 551-557, accepting STRING and CHAR) looks like a commented-out block whose
// delimiters are not part of this listing - confirm against the full file.
545 private final boolean isShorterThan(Token tok) {
551 if (this.type == STRING) mylength = this.getString().length();
552 else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
553 else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
555 if (tok.type == STRING) otherlength = tok.getString().length();
556 else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
557 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
// Second variant: only STRING tokens are accepted; any other type is reported as an internal error.
560 if (this.type == STRING) {
561 mylength = this.getString().length();
563 throw new RuntimeException("Internal Error: Illegal type: "+this.type);
566 if (tok.type == STRING) {
567 otherlength = tok.getString().length();
569 throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
571 return mylength < otherlength;
574 static class FixedStringContainer {
577 FixedStringContainer() {
// Searches this token for a fixed string and reports it through container:
// container.token receives the chosen token (or null) and container.options the
// options in effect for it.
// NOTE(review): the switch header, several case labels and short statements fall on
// lines that are not part of this listing; comments below describe only what is visible.
581 final void findFixedString(FixedStringContainer container, int options) {
584 Token prevToken = null;
// Lets every child report its candidate and keeps the candidate selected by
// isShorterThan(), together with the options reported for it.
586 for (int i = 0; i < this.size(); i ++) {
587 this.getChild(i).findFixedString(container, options);
588 if (prevToken == null || prevToken.isShorterThan(container.token)) {
589 prevToken = container.token;
590 prevOptions = container.options;
// Publishes the retained candidate.
593 container.token = prevToken;
594 container.options = prevOptions;
599 case NONGREEDYCLOSURE:
607 case NEGATIVELOOKAHEAD:
609 case NEGATIVELOOKBEHIND:
// No fixed string is reported for these types.
611 container.token = null;
614 case CHAR: // Ignore CHAR tokens.
615 container.token = null; // **
// This token itself is the candidate, with the options currently in effect.
619 container.token = this;
620 container.options = options;
// Single-child delegation with unchanged options.
625 this.getChild(0).findFixedString(container, options);
// Modifier group: turns on the group's option bits, clears the bits of its mask,
// then searches the child with the adjusted options.
629 options |= ((ModifierToken)this).getOptions();
630 options &= ~((ModifierToken)this).getOptionsMask();
631 this.getChild(0).findFixedString(container, options);
635 throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
// Base implementation: always throws; CharToken overrides it for CHAR tokens.
// NOTE(review): the message is prefixed "NFAArrow#match()" although this is Token.match().
639 boolean match(int ch) {
640 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
643 // ------------------------------------------------------
644 private final static Hashtable<String, Token> categories = new Hashtable<>();
645 private final static Hashtable<String, Token> categories2 = new Hashtable<>();
646 private static final String[] categoryNames = {
647 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
648 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
649 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
650 "Pi", "Pf", // 29, 30
651 "L", "M", "N", "Z", "C", "P", "S", // 31-37
654 // Schema Rec. {Datatypes} - Punctuation
655 static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
656 static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
657 static final int CHAR_LETTER = 31;
658 static final int CHAR_MARK = 32;
659 static final int CHAR_NUMBER = 33;
660 static final int CHAR_SEPARATOR = 34;
661 static final int CHAR_OTHER = 35;
662 static final int CHAR_PUNCTUATION = 36;
663 static final int CHAR_SYMBOL = 37;
665 // Block names in Unicode 3.1 that are supported by the XML Schema REC
666 private static final String[] blockNames = {
667 /*0000..007F;*/ "Basic Latin",
668 /*0080..00FF;*/ "Latin-1 Supplement",
669 /*0100..017F;*/ "Latin Extended-A",
670 /*0180..024F;*/ "Latin Extended-B",
671 /*0250..02AF;*/ "IPA Extensions",
672 /*02B0..02FF;*/ "Spacing Modifier Letters",
673 /*0300..036F;*/ "Combining Diacritical Marks",
674 /*0370..03FF;*/ "Greek",
675 /*0400..04FF;*/ "Cyrillic",
676 /*0530..058F;*/ "Armenian",
677 /*0590..05FF;*/ "Hebrew",
678 /*0600..06FF;*/ "Arabic",
679 /*0700..074F;*/ "Syriac",
680 /*0780..07BF;*/ "Thaana",
681 /*0900..097F;*/ "Devanagari",
682 /*0980..09FF;*/ "Bengali",
683 /*0A00..0A7F;*/ "Gurmukhi",
684 /*0A80..0AFF;*/ "Gujarati",
685 /*0B00..0B7F;*/ "Oriya",
686 /*0B80..0BFF;*/ "Tamil",
687 /*0C00..0C7F;*/ "Telugu",
688 /*0C80..0CFF;*/ "Kannada",
689 /*0D00..0D7F;*/ "Malayalam",
690 /*0D80..0DFF;*/ "Sinhala",
691 /*0E00..0E7F;*/ "Thai",
692 /*0E80..0EFF;*/ "Lao",
693 /*0F00..0FFF;*/ "Tibetan",
694 /*1000..109F;*/ "Myanmar",
695 /*10A0..10FF;*/ "Georgian",
696 /*1100..11FF;*/ "Hangul Jamo",
697 /*1200..137F;*/ "Ethiopic",
698 /*13A0..13FF;*/ "Cherokee",
699 /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
700 /*1680..169F;*/ "Ogham",
701 /*16A0..16FF;*/ "Runic",
702 /*1780..17FF;*/ "Khmer",
703 /*1800..18AF;*/ "Mongolian",
704 /*1E00..1EFF;*/ "Latin Extended Additional",
705 /*1F00..1FFF;*/ "Greek Extended",
706 /*2000..206F;*/ "General Punctuation",
707 /*2070..209F;*/ "Superscripts and Subscripts",
708 /*20A0..20CF;*/ "Currency Symbols",
709 /*20D0..20FF;*/ "Combining Marks for Symbols",
710 /*2100..214F;*/ "Letterlike Symbols",
711 /*2150..218F;*/ "Number Forms",
712 /*2190..21FF;*/ "Arrows",
713 /*2200..22FF;*/ "Mathematical Operators",
714 /*2300..23FF;*/ "Miscellaneous Technical",
715 /*2400..243F;*/ "Control Pictures",
716 /*2440..245F;*/ "Optical Character Recognition",
717 /*2460..24FF;*/ "Enclosed Alphanumerics",
718 /*2500..257F;*/ "Box Drawing",
719 /*2580..259F;*/ "Block Elements",
720 /*25A0..25FF;*/ "Geometric Shapes",
721 /*2600..26FF;*/ "Miscellaneous Symbols",
722 /*2700..27BF;*/ "Dingbats",
723 /*2800..28FF;*/ "Braille Patterns",
724 /*2E80..2EFF;*/ "CJK Radicals Supplement",
725 /*2F00..2FDF;*/ "Kangxi Radicals",
726 /*2FF0..2FFF;*/ "Ideographic Description Characters",
727 /*3000..303F;*/ "CJK Symbols and Punctuation",
728 /*3040..309F;*/ "Hiragana",
729 /*30A0..30FF;*/ "Katakana",
730 /*3100..312F;*/ "Bopomofo",
731 /*3130..318F;*/ "Hangul Compatibility Jamo",
732 /*3190..319F;*/ "Kanbun",
733 /*31A0..31BF;*/ "Bopomofo Extended",
734 /*3200..32FF;*/ "Enclosed CJK Letters and Months",
735 /*3300..33FF;*/ "CJK Compatibility",
736 /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
737 /*4E00..9FFF;*/ "CJK Unified Ideographs",
738 /*A000..A48F;*/ "Yi Syllables",
739 /*A490..A4CF;*/ "Yi Radicals",
740 /*AC00..D7A3;*/ "Hangul Syllables",
741 /*E000..F8FF;*/ "Private Use",
742 /*F900..FAFF;*/ "CJK Compatibility Ideographs",
743 /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
744 /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
745 /*FE20..FE2F;*/ "Combining Half Marks",
746 /*FE30..FE4F;*/ "CJK Compatibility Forms",
747 /*FE50..FE6F;*/ "Small Form Variants",
748 /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
749 /*FEFF..FEFF;*/ "Specials",
750 /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
751 //missing Specials add manually
752 /*10300..1032F;*/ "Old Italic", // 84
753 /*10330..1034F;*/ "Gothic",
754 /*10400..1044F;*/ "Deseret",
755 /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
756 /*1D100..1D1FF;*/ "Musical Symbols",
757 /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
758 /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
759 /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
760 /*E0000..E007F;*/ "Tags",
761 //missing 2 private use add manually
765 //F0000..FFFFD; "Private Use",
766 //100000..10FFFD; "Private Use"
767 //FFF0..FFFD; "Specials",
768 static final String blockRanges =
769 "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
770 +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
771 +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
772 +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
773 +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
774 +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
775 +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
776 +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
777 +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
778 +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
779 +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
780 static final int[] nonBMPBlockRanges = {
781 0x10300, 0x1032F, // 84
791 private static final int NONBMP_BLOCK_START = 84;
// Looks up the range registered under name: from categories when positive is true,
// from categories2 (which holds the complements) otherwise. On the first call, while
// categories is still empty, both tables are populated with the Unicode general
// categories, the block names, and a set of POSIX/Perl-style aliases.
// NOTE(review): short statements (declarations, case-branch assignments, breaks,
// closing braces, the final return) fall on lines that are not part of this listing.
// NOTE(review): the emptiness test runs outside the lock and no second test is visible
// directly inside the synchronized block, so two threads may both run the
// initialization, and a thread arriving while it is in progress sees a non-empty
// table and may look up a name that is not registered yet. Confirm and consider
// guarding with a dedicated "initialized" flag checked under the lock.
793 static protected RangeToken getRange(String name, boolean positive) {
794 if (Token.categories.size() == 0) {
795 synchronized (Token.categories) {
// One range per entry of categoryNames, indexed by the category number.
796 Token[] ranges = new Token[Token.categoryNames.length];
797 for (int i = 0; i < ranges.length; i ++) {
798 ranges[i] = Token.createRange();
// Classifies every BMP code unit by Character.getType().
801 for (int i = 0; i < 0x10000; i ++) {
802 type = Character.getType((char)i);
803 if (type == Character.START_PUNCTUATION ||
804 type == Character.END_PUNCTUATION) {
805 //build table of Pi values
806 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
807 i == 0x201F || i == 0x2039) {
808 type = CHAR_INIT_QUOTE;
810 //build table of Pf values
811 if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
812 type = CHAR_FINAL_QUOTE;
// Records the character under its specific category ...
815 ranges[type].addRange(i, i);
// ... then maps the specific category to its one-letter group (L, M, N, Z, C, P, S).
817 case Character.UPPERCASE_LETTER:
818 case Character.LOWERCASE_LETTER:
819 case Character.TITLECASE_LETTER:
820 case Character.MODIFIER_LETTER:
821 case Character.OTHER_LETTER:
824 case Character.NON_SPACING_MARK:
825 case Character.COMBINING_SPACING_MARK:
826 case Character.ENCLOSING_MARK:
829 case Character.DECIMAL_DIGIT_NUMBER:
830 case Character.LETTER_NUMBER:
831 case Character.OTHER_NUMBER:
834 case Character.SPACE_SEPARATOR:
835 case Character.LINE_SEPARATOR:
836 case Character.PARAGRAPH_SEPARATOR:
837 type = CHAR_SEPARATOR;
839 case Character.CONTROL:
840 case Character.FORMAT:
841 case Character.SURROGATE:
842 case Character.PRIVATE_USE:
843 case Character.UNASSIGNED:
846 case Character.CONNECTOR_PUNCTUATION:
847 case Character.DASH_PUNCTUATION:
848 case Character.START_PUNCTUATION:
849 case Character.END_PUNCTUATION:
850 case CHAR_INIT_QUOTE:
851 case CHAR_FINAL_QUOTE:
852 case Character.OTHER_PUNCTUATION:
853 type = CHAR_PUNCTUATION;
855 case Character.MATH_SYMBOL:
856 case Character.CURRENCY_SYMBOL:
857 case Character.MODIFIER_SYMBOL:
858 case Character.OTHER_SYMBOL:
// NOTE(review): the message still carries the org.apache.xerces.utils.regex package name,
// while this file declares package org.opendaylight.yangtools.xsd.regex.
862 throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
// Records the character under its one-letter group as well.
864 ranges[type].addRange(i, i);
865 } // for all characters
// Everything above the BMP is registered as unassigned (Cn) and as "other" (C);
// supplementary characters are not classified individually.
866 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
867 ranges[CHAR_OTHER].addRange(0x10000, Token.UTF16_MAX);
// Publishes every named category together with its complement.
869 for (int i = 0; i < ranges.length; i ++) {
870 if (Token.categoryNames[i] != null) {
// NOTE(review): this repeats the addRange already done for UNASSIGNED a few lines above.
871 if (i == Character.UNASSIGNED) { // Unassigned
872 ranges[i].addRange(0x10000, Token.UTF16_MAX);
874 Token.categories.put(Token.categoryNames[i], ranges[i]);
875 Token.categories2.put(Token.categoryNames[i],
876 Token.complementRanges(ranges[i]));
879 //REVISIT: do we really need to support block names as in Unicode 3.1
880 // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
// Registers one range per block name: BMP blocks read their bounds from the
// blockRanges string, the remaining blocks from the nonBMPBlockRanges array.
882 StringBuffer buffer = new StringBuffer(50);
883 for (int i = 0; i < Token.blockNames.length; i ++) {
884 Token r1 = Token.createRange();
886 if (i < NONBMP_BLOCK_START) {
888 int rstart = Token.blockRanges.charAt(location);
889 int rend = Token.blockRanges.charAt(location+1);
891 //System.out.println(n+" " +Integer.toHexString(rstart)
892 // +"-"+ Integer.toHexString(rend));
893 r1.addRange(rstart, rend);
895 location = (i - NONBMP_BLOCK_START) * 2;
896 r1.addRange(Token.nonBMPBlockRanges[location],
897 Token.nonBMPBlockRanges[location + 1]);
// "Specials" and "Private Use" each receive additional ranges that the tables do not list.
899 String n = Token.blockNames[i];
900 if (n.equals("Specials")) {
901 r1.addRange(0xfff0, 0xfffd);
903 if (n.equals("Private Use")) {
904 r1.addRange(0xF0000,0xFFFFD);
905 r1.addRange(0x100000,0x10FFFD);
907 Token.categories.put(n, r1);
908 Token.categories2.put(n, Token.complementRanges(r1));
// For a block name containing spaces, an alias made of its non-space characters is registered.
911 if (n.indexOf(' ') >= 0) {
912 for (int ci = 0; ci < n.length(); ci ++) {
913 if (n.charAt(ci) != ' ') {
914 buffer.append(n.charAt(ci));
921 Token.setAlias(buffer.toString(), n, true);
// ASSIGNED is the negation of Cn, UNASSIGNED is Cn itself; ALL spans 0..UTF16_MAX.
925 Token.setAlias("ASSIGNED", "Cn", false);
926 Token.setAlias("UNASSIGNED", "Cn", true);
927 Token all = Token.createRange();
928 all.addRange(0, Token.UTF16_MAX);
929 Token.categories.put("ALL", all);
930 Token.categories2.put("ALL", Token.complementRanges(all));
931 Token.registerNonXS("ASSIGNED");
932 Token.registerNonXS("UNASSIGNED");
933 Token.registerNonXS("ALL");
// IsAlpha = Lu + Ll + Lo (Lt and Lm are not merged in).
935 Token isalpha = Token.createRange();
936 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
937 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
938 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
939 Token.categories.put("IsAlpha", isalpha);
940 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
941 Token.registerNonXS("IsAlpha");
// IsAlnum = IsAlpha + Nd.
943 Token isalnum = Token.createRange();
944 isalnum.mergeRanges(isalpha); // Lu Ll Lo
945 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
946 Token.categories.put("IsAlnum", isalnum);
947 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
948 Token.registerNonXS("IsAlnum");
// IsSpace = token_spaces + Z.
950 Token isspace = Token.createRange();
951 isspace.mergeRanges(Token.token_spaces);
952 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
953 Token.categories.put("IsSpace", isspace);
954 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
955 Token.registerNonXS("IsSpace");
// IsWord = IsAlnum + '_'.
957 Token isword = Token.createRange();
958 isword.mergeRanges(isalnum); // Lu Ll Lo Nd
959 isword.addRange('_', '_');
960 Token.categories.put("IsWord", isword);
961 Token.categories2.put("IsWord", Token.complementRanges(isword));
962 Token.registerNonXS("IsWord");
// IsASCII = 0..127.
964 Token isascii = Token.createRange();
965 isascii.addRange(0, 127);
966 Token.categories.put("IsASCII", isascii);
967 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
968 Token.registerNonXS("IsASCII");
// IsGraph is defined through its negation (C + ' '), hence the swapped tables below.
970 Token isnotgraph = Token.createRange();
971 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
972 isnotgraph.addRange(' ', ' ');
973 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
974 Token.categories2.put("IsGraph", isnotgraph);
975 Token.registerNonXS("IsGraph");
// NOTE(review): isxdigit holds the hex digits themselves (0-9, A-F, a-f), yet the
// positive table receives its complement and the negative table the digits - the
// same swap as IsGraph, where the variable is the negation. As written a positive
// "IsXDigit" lookup yields every character that is NOT a hex digit; this looks
// inverted - confirm and swap the two put() targets if so.
977 Token isxdigit = Token.createRange();
978 isxdigit.addRange('0', '9');
979 isxdigit.addRange('A', 'F');
980 isxdigit.addRange('a', 'f');
981 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
982 Token.categories2.put("IsXDigit", isxdigit);
983 Token.registerNonXS("IsXDigit");
// Aliases onto existing categories; IsPrint is the negation of C.
985 Token.setAlias("IsDigit", "Nd", true);
986 Token.setAlias("IsUpper", "Lu", true);
987 Token.setAlias("IsLower", "Ll", true);
988 Token.setAlias("IsCntrl", "C", true);
989 Token.setAlias("IsPrint", "C", false);
990 Token.setAlias("IsPunct", "P", true);
991 Token.registerNonXS("IsDigit");
992 Token.registerNonXS("IsUpper");
993 Token.registerNonXS("IsLower");
994 Token.registerNonXS("IsCntrl");
995 Token.registerNonXS("IsPrint");
996 Token.registerNonXS("IsPunct");
// Lower-case short names for the Is* entries registered above.
998 Token.setAlias("alpha", "IsAlpha", true);
999 Token.setAlias("alnum", "IsAlnum", true);
1000 Token.setAlias("ascii", "IsASCII", true);
1001 Token.setAlias("cntrl", "IsCntrl", true);
1002 Token.setAlias("digit", "IsDigit", true);
1003 Token.setAlias("graph", "IsGraph", true);
1004 Token.setAlias("lower", "IsLower", true);
1005 Token.setAlias("print", "IsPrint", true);
1006 Token.setAlias("punct", "IsPunct", true);
1007 Token.setAlias("space", "IsSpace", true);
1008 Token.setAlias("upper", "IsUpper", true);
1009 Token.setAlias("word", "IsWord", true); // Perl extension
1010 Token.setAlias("xdigit", "IsXDigit", true);
1011 Token.registerNonXS("alpha");
1012 Token.registerNonXS("alnum");
1013 Token.registerNonXS("ascii");
1014 Token.registerNonXS("cntrl");
1015 Token.registerNonXS("digit");
1016 Token.registerNonXS("graph");
1017 Token.registerNonXS("lower");
1018 Token.registerNonXS("print");
1019 Token.registerNonXS("punct");
1020 Token.registerNonXS("space");
1021 Token.registerNonXS("upper");
1022 Token.registerNonXS("word");
1023 Token.registerNonXS("xdigit");
// The actual lookup; Hashtable.get() yields null for a name that was never registered.
1026 RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
1027 : (RangeToken)Token.categories2.get(name);
1028 //if (tok == null) System.out.println(name);
// Variant of getRange(name, positive) with an extra xs switch. The visible condition
// singles out lookups with xs == true that found a range whose name was recorded
// through registerNonXS().
// NOTE(review): the statement guarded by that condition and the final return are on
// lines that are not part of this listing.
1031 static protected RangeToken getRange(String name, boolean positive, boolean xs) {
1032 RangeToken range = Token.getRange(name, positive);
1033 if (xs && range != null && Token.isRegisterNonXS(name)) {
1039 static Hashtable<String, String> nonxs = null;
1041 * This method is called only by getRange(),
1042 * so it does not need to be MT-safe.
1044 static protected void registerNonXS(String name) {
1045 if (Token.nonxs == null) {
1046 Token.nonxs = new Hashtable<>();
1048 Token.nonxs.put(name, name);
// Reports whether name was recorded through registerNonXS(). The null test covers
// the case where the nonxs table has not been created yet.
// NOTE(review): the statement executed in that case is on a line not shown in this listing.
1050 static protected boolean isRegisterNonXS(String name) {
1051 if (Token.nonxs == null) {
1055 //System.err.println("isRegisterNonXS: "+name);
1056 return Token.nonxs.containsKey(name);
// Registers newName as another name for the already registered entry name.
// t1 is name's entry in categories, t2 its entry in categories2. The first pair of
// put() calls stores them in the same tables, the second pair stores them swapped,
// which makes newName the negation of name.
// NOTE(review): the branch choosing between the two pairs (presumably on positive)
// is on lines not shown in this listing - confirm.
// NOTE(review): Hashtable.put() rejects null values, so calling this with a name
// that is not registered yet fails with a NullPointerException.
1059 private static void setAlias(String newName, String name, boolean positive) {
1060 Token t1 = Token.categories.get(name);
1061 Token t2 = Token.categories2.get(name);
1063 Token.categories.put(newName, t1);
1064 Token.categories2.put(newName, t2);
1066 Token.categories2.put(newName, t1);
1067 Token.categories.put(newName, t2);
1071 // ------------------------------------------------------
1073 static final String viramaString =
1074 "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1075 +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1076 +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1077 +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1078 +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1079 +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1080 +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1081 +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1082 +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1083 +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1084 +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1086 static private Token token_grapheme = null;
1087 static synchronized Token getGraphemePattern() {
1088 if (Token.token_grapheme != null) {
1089 return Token.token_grapheme;
1092 Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
1093 base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1094 base_char.subtractRanges(Token.getRange("M", true));
1095 base_char.subtractRanges(Token.getRange("C", true));
1097 Token virama = Token.createRange();
1098 for (int i = 0; i < Token.viramaString.length(); i++) {
1099 virama.addRange(i, i);
1102 Token combiner_wo_virama = Token.createRange();
1103 combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1104 combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1105 combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1107 Token left = Token.createUnion(); // base_char?
1108 left.addChild(base_char);
1109 left.addChild(Token.token_empty);
1111 Token foo = Token.createUnion();
1112 foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1113 foo.addChild(combiner_wo_virama);
1115 foo = Token.createClosure(foo);
1117 foo = Token.createConcat(left, foo);
1119 Token.token_grapheme = foo;
1120 return Token.token_grapheme;
1124 * Combining Character Sequence in Perl 5.6.
1126 static private Token token_ccs = null;
1127 static synchronized Token getCombiningCharacterSequence() {
1128 if (Token.token_ccs != null) {
1129 return Token.token_ccs;
1132 Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1133 foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1134 Token.token_ccs = foo;
1135 return Token.token_ccs;
1138 // ------------------------------------------------------
1140 // ------------------------------------------------------
1142 * This class represents a node in parse tree.
// Token carrying either a literal string (STRING) or a back-reference number
// (BACKREFERENCE); see Token.createString() and Token.createBackReference().
// NOTE(review): the string field, the constructor body and the getString() body are
// on lines not shown in this listing.
1144 static class StringToken extends Token implements java.io.Serializable {
1146 private static final long serialVersionUID = -4614366944218504172L;
// Reference number returned by getReferenceNumber() and printed for BACKREFERENCE tokens.
1149 final int refNumber;
1151 StringToken(int type, String str, int n) {
1158 int getReferenceNumber() { // for STRING
1159 return this.refNumber;
1162 String getString() { // for STRING
// A back reference renders as a backslash followed by its number; any other
// token renders as its string passed through REUtil.quoteMeta().
1167 public String toString(int options) {
1168 if (this.type == BACKREFERENCE) {
1169 return "\\"+this.refNumber;
1171 return REUtil.quoteMeta(this.string);
1177 * This class represents a node in parse tree.
// CONCAT token with exactly two children, child and child2.
// NOTE(review): the field declarations and the rest of the constructor body are on
// lines not shown in this listing.
1179 static class ConcatToken extends Token implements java.io.Serializable {
1181 private static final long serialVersionUID = 8717321425541346381L;
1186 ConcatToken(Token t1, Token t2) {
1187 super(Token.CONCAT);
// Index 0 selects child; every other index selects child2 (no range check).
1197 Token getChild(int index) {
1198 return index == 0 ? this.child : this.child2;
// Renders X followed by a closure over the very same X instance as "X+"
// (or "X+?" for a non-greedy closure); otherwise renders both children in order.
1202 public String toString(int options) {
1204 if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1205 ret = this.child.toString(options)+"+";
1206 } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1207 ret = this.child.toString(options)+"+?";
1209 ret = this.child.toString(options)+this.child2.toString(options);
1216 * This class represents a node in parse tree.
1218 static class CharToken extends Token implements java.io.Serializable {
1220 private static final long serialVersionUID = -4394272816279496989L;
1224 CharToken(int type, int ch) {
1231 return this.chardata;
1235 public String toString(int options) {
1237 switch (this.type) {
1239 switch (this.chardata) {
1240 case '|': case '*': case '+': case '?':
1241 case '(': case ')': case '.': case '[':
1242 case '{': case '\\':
1243 ret = "\\"+(char)this.chardata;
1245 case '\f': ret = "\\f"; break;
1246 case '\n': ret = "\\n"; break;
1247 case '\r': ret = "\\r"; break;
1248 case '\t': ret = "\\t"; break;
1249 case 0x1b: ret = "\\e"; break;
1250 //case 0x0b: ret = "\\v"; break;
1252 if (this.chardata >= 0x10000) {
1253 String pre = "0"+Integer.toHexString(this.chardata);
1254 ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1256 ret = ""+(char)this.chardata;
1262 if (this == Token.token_linebeginning || this == Token.token_lineend) {
1263 ret = ""+(char)this.chardata;
1265 ret = "\\"+(char)this.chardata;
1276 boolean match(int ch) {
1277 if (this.type == CHAR) {
1278 return ch == this.chardata;
1280 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
1286 * This class represents a node in parse tree.
1288 static class ClosureToken extends Token implements java.io.Serializable {
1290 private static final long serialVersionUID = 1308971930673997452L;
1296 ClosureToken(int type, Token tok) {
1308 Token getChild(int index) {
1313 final void setMin(int min) {
1317 final void setMax(int max) {
1321 final int getMin() {
1325 final int getMax() {
1330 public String toString(int options) {
1332 if (this.type == CLOSURE) {
1333 if (this.getMin() < 0 && this.getMax() < 0) {
1334 ret = this.child.toString(options)+"*";
1335 } else if (this.getMin() == this.getMax()) {
1336 ret = this.child.toString(options)+"{"+this.getMin()+"}";
1337 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1338 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1339 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1340 ret = this.child.toString(options)+"{"+this.getMin()+",}";
1342 throw new RuntimeException("Token#toString(): CLOSURE "
1343 +this.getMin()+", "+this.getMax());
1346 if (this.getMin() < 0 && this.getMax() < 0) {
1347 ret = this.child.toString(options)+"*?";
1348 } else if (this.getMin() == this.getMax()) {
1349 ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1350 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1351 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1352 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1353 ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1355 throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
1356 +this.getMin()+", "+this.getMax());
1364 * This class represents a node in parse tree.
1366 static class ParenToken extends Token implements java.io.Serializable {
1368 private static final long serialVersionUID = -5938014719827987704L;
1371 final int parennumber;
1373 ParenToken(int type, Token tok, int paren) {
1376 this.parennumber = paren;
1384 Token getChild(int index) {
1389 int getParenNumber() {
1390 return this.parennumber;
1394 public String toString(int options) {
1396 switch (this.type) {
1398 if (this.parennumber == 0) {
1399 ret = "(?:"+this.child.toString(options)+")";
1401 ret = "("+this.child.toString(options)+")";
1406 ret = "(?="+this.child.toString(options)+")";
1408 case NEGATIVELOOKAHEAD:
1409 ret = "(?!"+this.child.toString(options)+")";
1412 ret = "(?<="+this.child.toString(options)+")";
1414 case NEGATIVELOOKBEHIND:
1415 ret = "(?<!"+this.child.toString(options)+")";
1418 ret = "(?>"+this.child.toString(options)+")";
1426 * (?(condition)yes-pattern|no-pattern)
1428 static class ConditionToken extends Token implements java.io.Serializable {
1430 private static final long serialVersionUID = 4353765277910594411L;
1432 final int refNumber;
1433 final Token condition;
1436 ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1437 super(Token.CONDITION);
1438 this.refNumber = refno;
1439 this.condition = cond;
1445 return this.no == null ? 1 : 2;
1448 Token getChild(int index) {
1455 throw new RuntimeException("Internal Error: "+index);
1459 public String toString(int options) {
1461 if (refNumber > 0) {
1462 ret = "(?("+refNumber+")";
1463 } else if (this.condition.type == Token.ANCHOR) {
1464 ret = "(?("+this.condition+")";
1466 ret = "(?"+this.condition;
1469 if (this.no == null) {
1470 ret += this.yes+")";
1472 ret += this.yes+"|"+this.no+")";
1481 static class ModifierToken extends Token implements java.io.Serializable {
1483 private static final long serialVersionUID = -9114536559696480356L;
1489 ModifierToken(Token tok, int add, int mask) {
1490 super(Token.MODIFIERGROUP);
1501 Token getChild(int index) {
1508 int getOptionsMask() {
1513 public String toString(int options) {
1515 +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1516 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1518 +this.child.toString(options)
1524 * This class represents a node in parse tree.
1525 * for UNION or CONCAT.
1527 static class UnionToken extends Token implements java.io.Serializable {
1529 private static final long serialVersionUID = -2568843945989489861L;
1531 Vector<Token> children;
1533 UnionToken(int type) {
1538 void addChild(Token tok) {
1542 if (this.children == null) {
1543 this.children = new Vector<>();
1545 if (this.type == UNION) {
1546 this.children.addElement(tok);
1549 // This is CONCAT, and new child is CONCAT.
1550 if (tok.type == CONCAT) {
1551 for (int i = 0; i < tok.size(); i ++)
1553 this.addChild(tok.getChild(i)); // Recursion
1557 int size = this.children.size();
1559 this.children.addElement(tok);
1562 Token previous = this.children.elementAt(size-1);
1563 if (!((previous.type == CHAR || previous.type == STRING)
1564 && (tok.type == CHAR || tok.type == STRING))) {
1565 this.children.addElement(tok);
1569 //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1571 StringBuffer buffer;
1572 int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1573 if (previous.type == CHAR) { // Replace previous token by STRING
1574 buffer = new StringBuffer(2 + nextMaxLength);
1575 int ch = previous.getChar();
1576 if (ch >= 0x10000) {
1577 buffer.append(REUtil.decomposeToSurrogates(ch));
1579 buffer.append((char)ch);
1581 previous = Token.createString(null);
1582 this.children.setElementAt(previous, size-1);
1584 buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
1585 buffer.append(previous.getString());
1588 if (tok.type == CHAR) {
1589 int ch = tok.getChar();
1590 if (ch >= 0x10000) {
1591 buffer.append(REUtil.decomposeToSurrogates(ch));
1593 buffer.append((char)ch);
1596 buffer.append(tok.getString());
1599 ((StringToken)previous).string = new String(buffer);
1604 return this.children == null ? 0 : this.children.size();
1607 Token getChild(int index) {
1608 return this.children.elementAt(index);
1612 public String toString(int options) {
1614 if (this.type == CONCAT) {
1615 if (this.children.size() == 2) {
1616 Token ch = this.getChild(0);
1617 Token ch2 = this.getChild(1);
1618 if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1619 ret = ch.toString(options)+"+";
1620 } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1621 ret = ch.toString(options)+"+?";
1623 ret = ch.toString(options)+ch2.toString(options);
1626 StringBuffer sb = new StringBuffer();
1627 for (int i = 0; i < this.children.size(); i ++) {
1628 sb.append(this.children.elementAt(i).toString(options));
1630 ret = new String(sb);
1634 if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1635 ret = this.getChild(0).toString(options)+"?";
1636 } else if (this.children.size() == 2
1637 && this.getChild(0).type == EMPTY) {
1638 ret = this.getChild(1).toString(options)+"??";
1640 StringBuffer sb = new StringBuffer();
1641 sb.append(this.children.elementAt(0).toString(options));
1642 for (int i = 1; i < this.children.size(); i ++) {
1644 sb.append(this.children.elementAt(i).toString(options));
1646 ret = new String(sb);