2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.opendaylight.yangtools.xsd.regex;
20 import java.util.Hashtable;
21 import java.util.Vector;
24 * This class represents a node in parse tree.
28 * @version $Id: Token.java 1638344 2014-11-11 20:15:46Z mrglavas $
30 class Token implements java.io.Serializable {
32 private static final long serialVersionUID = 8484976002585487481L;
34 static final boolean COUNTTOKENS = true;
35 static int tokens = 0;
37 static final int CHAR = 0; // Literal char
38 static final int DOT = 11; // .
39 static final int CONCAT = 1; // XY
40 static final int UNION = 2; // X|Y|Z
41 static final int CLOSURE = 3; // X*
42 static final int RANGE = 4; // [a-zA-Z] etc.
43 static final int NRANGE = 5; // [^a-zA-Z] etc.
44 static final int PAREN = 6; // (X) or (?:X)
45 static final int EMPTY = 7; //
46 static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
47 static final int NONGREEDYCLOSURE = 9; // *? +?
48 static final int STRING = 10; // strings
49 static final int BACKREFERENCE = 12; // back references
50 static final int LOOKAHEAD = 20; // (?=...)
51 static final int NEGATIVELOOKAHEAD = 21; // (?!...)
52 static final int LOOKBEHIND = 22; // (?<=...)
53 static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
54 static final int INDEPENDENT = 24; // (?>...)
55 static final int MODIFIERGROUP = 25; // (?ims-ims:...)
56 static final int CONDITION = 26; // (?(...)yes|no)
58 static final int UTF16_MAX = 0x10ffff;
62 static final Token token_dot;
63 static final Token token_0to9;
64 static final Token token_wordchars;
65 static final Token token_not_0to9;
66 static final Token token_not_wordchars;
67 static final Token token_spaces;
68 static final Token token_not_spaces;
69 static final Token token_empty;
70 static final Token token_linebeginning;
71 static final Token token_linebeginning2;
72 static final Token token_lineend;
73 static final Token token_stringbeginning;
74 static final Token token_stringend;
75 static final Token token_stringend2;
76 static final Token token_wordedge;
77 static final Token token_not_wordedge;
78 static final Token token_wordbeginning;
79 static final Token token_wordend;
81 token_empty = new Token(Token.EMPTY);
83 token_linebeginning = Token.createAnchor('^');
84 token_linebeginning2 = Token.createAnchor('@');
85 token_lineend = Token.createAnchor('$');
86 token_stringbeginning = Token.createAnchor('A');
87 token_stringend = Token.createAnchor('z');
88 token_stringend2 = Token.createAnchor('Z');
89 token_wordedge = Token.createAnchor('b');
90 token_not_wordedge = Token.createAnchor('B');
91 token_wordbeginning = Token.createAnchor('<');
92 token_wordend = Token.createAnchor('>');
94 token_dot = new Token(Token.DOT);
96 token_0to9 = Token.createRange();
97 token_0to9.addRange('0', '9');
98 token_wordchars = Token.createRange();
99 token_wordchars.addRange('0', '9');
100 token_wordchars.addRange('A', 'Z');
101 token_wordchars.addRange('_', '_');
102 token_wordchars.addRange('a', 'z');
103 token_spaces = Token.createRange();
104 token_spaces.addRange('\t', '\t');
105 token_spaces.addRange('\n', '\n');
106 token_spaces.addRange('\f', '\f');
107 token_spaces.addRange('\r', '\r');
108 token_spaces.addRange(' ', ' ');
110 token_not_0to9 = Token.complementRanges(Token.token_0to9);
111 token_not_wordchars = Token.complementRanges(Token.token_wordchars);
112 token_not_spaces = Token.complementRanges(Token.token_spaces);
115 static Token.ParenToken createLook(int type, Token child) {
119 return new Token.ParenToken(type, child, 0);
121 static Token.ParenToken createParen(Token child, int pnumber) {
125 return new Token.ParenToken(Token.PAREN, child, pnumber);
127 static Token.ClosureToken createClosure(Token tok) {
131 return new Token.ClosureToken(Token.CLOSURE, tok);
133 static Token.ClosureToken createNGClosure(Token tok) {
137 return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
139 static Token.ConcatToken createConcat(Token tok1, Token tok2) {
143 return new Token.ConcatToken(tok1, tok2);
145 static Token.UnionToken createConcat() {
149 return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
151 static Token.UnionToken createUnion() {
155 return new Token.UnionToken(Token.UNION);
157 static Token createEmpty() {
158 return Token.token_empty;
160 static RangeToken createRange() {
164 return new RangeToken(Token.RANGE);
166 static RangeToken createNRange() {
170 return new RangeToken(Token.NRANGE);
172 static Token.CharToken createChar(int ch) {
176 return new Token.CharToken(Token.CHAR, ch);
178 static private Token.CharToken createAnchor(int ch) {
182 return new Token.CharToken(Token.ANCHOR, ch);
184 static Token.StringToken createBackReference(int refno) {
188 return new Token.StringToken(Token.BACKREFERENCE, null, refno);
190 static Token.StringToken createString(String str) {
194 return new Token.StringToken(Token.STRING, str, 0);
196 static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
200 return new Token.ModifierToken(child, add, mask);
202 static Token.ConditionToken createCondition(int refno, Token condition,
203 Token yespat, Token nopat) {
207 return new Token.ConditionToken(refno, condition, yespat, nopat);
210 protected Token(int type) {
215 * A number of children.
220 Token getChild(int index) {
223 void addChild(Token tok) {
224 throw new RuntimeException("Not supported.");
227 // for RANGE or NRANGE
228 protected void addRange(int start, int end) {
229 throw new RuntimeException("Not supported.");
231 protected void sortRanges() {
232 throw new RuntimeException("Not supported.");
234 protected void compactRanges() {
235 throw new RuntimeException("Not supported.");
237 protected void mergeRanges(Token tok) {
238 throw new RuntimeException("Not supported.");
240 protected void subtractRanges(Token tok) {
241 throw new RuntimeException("Not supported.");
243 protected void intersectRanges(Token tok) {
244 throw new RuntimeException("Not supported.");
246 static Token complementRanges(Token tok) {
247 return RangeToken.complementRanges(tok);
251 void setMin(int min) { // for CLOSURE
253 void setMax(int max) { // for CLOSURE
255 int getMin() { // for CLOSURE
258 int getMax() { // for CLOSURE
261 int getReferenceNumber() { // for STRING
264 String getString() { // for STRING
268 int getParenNumber() {
276 public String toString() {
277 return this.toString(0);
279 public String toString(int options) {
280 return this.type == Token.DOT ? "." : "";
284 * How many characters are needed?
286 final int getMinLength() {
290 for (int i = 0; i < this.size(); i ++) {
291 sum += this.getChild(i).getMinLength();
297 if (this.size() == 0) {
300 int ret = this.getChild(0).getMinLength();
301 for (int i = 1; i < this.size(); i ++) {
302 int min = this.getChild(i).getMinLength();
310 case NONGREEDYCLOSURE:
311 if (this.getMin() >= 0) {
312 return this.getMin() * this.getChild(0).getMinLength();
329 return this.getChild(0).getMinLength();
335 return this.getString().length();
338 case NEGATIVELOOKAHEAD:
340 case NEGATIVELOOKBEHIND:
341 return 0; // ***** Really?
344 throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
348 final int getMaxLength() {
352 for (int i = 0; i < this.size(); i ++) {
353 int d = this.getChild(i).getMaxLength();
363 if (this.size() == 0) {
366 int ret = this.getChild(0).getMaxLength();
367 for (int i = 1; ret >= 0 && i < this.size(); i ++) {
368 int max = this.getChild(i).getMaxLength();
369 if (max < 0) { // infinity
380 case NONGREEDYCLOSURE:
381 if (this.getMax() >= 0) {
382 // When this.child.getMaxLength() < 0,
383 // this returns minus value
384 return this.getMax() * this.getChild(0).getMaxLength();
402 return this.getChild(0).getMaxLength();
408 return this.getString().length();
411 case NEGATIVELOOKAHEAD:
413 case NEGATIVELOOKBEHIND:
414 return 0; // ***** Really?
417 throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
421 static final int FC_CONTINUE = 0;
422 static final int FC_TERMINAL = 1;
423 static final int FC_ANY = 2;
424 static final boolean isSet(int options, int flag) {
425 return (options & flag) == flag;
427 final int analyzeFirstCharacter(RangeToken result, int options) {
430 int ret = FC_CONTINUE;
431 for (int i = 0; i < this.size(); i ++) {
432 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) {
439 if (this.size() == 0) {
443 * a|b|c -> FC_TERMINAL
445 * a|b| -> FC_CONTINUE
447 int ret2 = FC_CONTINUE;
448 boolean hasEmpty = false;
449 for (int i = 0; i < this.size(); i ++) {
450 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
451 if (ret2 == FC_ANY) {
453 } else if (ret2 == FC_CONTINUE) {
457 return hasEmpty ? FC_CONTINUE : ret2;
460 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
461 if (this.size() == 1) {
464 if (ret3 == FC_ANY) {
467 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
468 if (ret4 == FC_ANY) {
471 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
474 case NONGREEDYCLOSURE:
475 this.getChild(0).analyzeFirstCharacter(result, options);
483 int ch = this.getChar();
484 result.addRange(ch, ch);
485 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
486 ch = Character.toUpperCase((char)ch);
487 result.addRange(ch, ch);
488 ch = Character.toLowerCase((char)ch);
489 result.addRange(ch, ch);
497 result.mergeRanges(this);
501 result.mergeRanges(Token.complementRanges(this));
506 return this.getChild(0).analyzeFirstCharacter(result, options);
509 options |= ((ModifierToken)this).getOptions();
510 options &= ~((ModifierToken)this).getOptionsMask();
511 return this.getChild(0).analyzeFirstCharacter(result, options);
514 result.addRange(0, UTF16_MAX); // **** We can not optimize.
518 int cha = this.getString().charAt(0);
520 if (REUtil.isHighSurrogate(cha)
521 && this.getString().length() >= 2
522 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) {
523 cha = REUtil.composeFromSurrogates(cha, ch2);
525 result.addRange(cha, cha);
526 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
527 cha = Character.toUpperCase((char)cha);
528 result.addRange(cha, cha);
529 cha = Character.toLowerCase((char)cha);
530 result.addRange(cha, cha);
535 case NEGATIVELOOKAHEAD:
537 case NEGATIVELOOKBEHIND:
541 throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
545 private final boolean isShorterThan(Token tok) {
551 if (this.type == STRING) mylength = this.getString().length();
552 else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
553 else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
555 if (tok.type == STRING) otherlength = tok.getString().length();
556 else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
557 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
560 if (this.type == STRING) {
561 mylength = this.getString().length();
563 throw new RuntimeException("Internal Error: Illegal type: "+this.type);
566 if (tok.type == STRING) {
567 otherlength = tok.getString().length();
569 throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
571 return mylength < otherlength;
574 static class FixedStringContainer {
577 FixedStringContainer() {
581 final void findFixedString(FixedStringContainer container, int options) {
584 Token prevToken = null;
586 for (int i = 0; i < this.size(); i ++) {
587 this.getChild(i).findFixedString(container, options);
588 if (prevToken == null || prevToken.isShorterThan(container.token)) {
589 prevToken = container.token;
590 prevOptions = container.options;
593 container.token = prevToken;
594 container.options = prevOptions;
599 case NONGREEDYCLOSURE:
607 case NEGATIVELOOKAHEAD:
609 case NEGATIVELOOKBEHIND:
611 container.token = null;
614 case CHAR: // Ignore CHAR tokens.
615 container.token = null; // **
619 container.token = this;
620 container.options = options;
625 this.getChild(0).findFixedString(container, options);
629 options |= ((ModifierToken)this).getOptions();
630 options &= ~((ModifierToken)this).getOptionsMask();
631 this.getChild(0).findFixedString(container, options);
635 throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
639 boolean match(int ch) {
640 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
643 // ------------------------------------------------------
644 private final static Hashtable<String, Token> categories = new Hashtable<>();
645 private final static Hashtable<String, Token> categories2 = new Hashtable<>();
646 private static final String[] categoryNames = {
647 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
648 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
649 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
650 "Pi", "Pf", // 29, 30
651 "L", "M", "N", "Z", "C", "P", "S", // 31-37
654 // Schema Rec. {Datatypes} - Punctuation
655 static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
656 static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
657 static final int CHAR_LETTER = 31;
658 static final int CHAR_MARK = 32;
659 static final int CHAR_NUMBER = 33;
660 static final int CHAR_SEPARATOR = 34;
661 static final int CHAR_OTHER = 35;
662 static final int CHAR_PUNCTUATION = 36;
663 static final int CHAR_SYMBOL = 37;
665 //blockNames in UNICODE 3.1 that supported by XML Schema REC
666 private static final String[] blockNames = {
667 /*0000..007F;*/ "Basic Latin",
668 /*0080..00FF;*/ "Latin-1 Supplement",
669 /*0100..017F;*/ "Latin Extended-A",
670 /*0180..024F;*/ "Latin Extended-B",
671 /*0250..02AF;*/ "IPA Extensions",
672 /*02B0..02FF;*/ "Spacing Modifier Letters",
673 /*0300..036F;*/ "Combining Diacritical Marks",
674 /*0370..03FF;*/ "Greek",
675 /*0400..04FF;*/ "Cyrillic",
676 /*0530..058F;*/ "Armenian",
677 /*0590..05FF;*/ "Hebrew",
678 /*0600..06FF;*/ "Arabic",
679 /*0700..074F;*/ "Syriac",
680 /*0780..07BF;*/ "Thaana",
681 /*0900..097F;*/ "Devanagari",
682 /*0980..09FF;*/ "Bengali",
683 /*0A00..0A7F;*/ "Gurmukhi",
684 /*0A80..0AFF;*/ "Gujarati",
685 /*0B00..0B7F;*/ "Oriya",
686 /*0B80..0BFF;*/ "Tamil",
687 /*0C00..0C7F;*/ "Telugu",
688 /*0C80..0CFF;*/ "Kannada",
689 /*0D00..0D7F;*/ "Malayalam",
690 /*0D80..0DFF;*/ "Sinhala",
691 /*0E00..0E7F;*/ "Thai",
692 /*0E80..0EFF;*/ "Lao",
693 /*0F00..0FFF;*/ "Tibetan",
694 /*1000..109F;*/ "Myanmar",
695 /*10A0..10FF;*/ "Georgian",
696 /*1100..11FF;*/ "Hangul Jamo",
697 /*1200..137F;*/ "Ethiopic",
698 /*13A0..13FF;*/ "Cherokee",
699 /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
700 /*1680..169F;*/ "Ogham",
701 /*16A0..16FF;*/ "Runic",
702 /*1780..17FF;*/ "Khmer",
703 /*1800..18AF;*/ "Mongolian",
704 /*1E00..1EFF;*/ "Latin Extended Additional",
705 /*1F00..1FFF;*/ "Greek Extended",
706 /*2000..206F;*/ "General Punctuation",
707 /*2070..209F;*/ "Superscripts and Subscripts",
708 /*20A0..20CF;*/ "Currency Symbols",
709 /*20D0..20FF;*/ "Combining Marks for Symbols",
710 /*2100..214F;*/ "Letterlike Symbols",
711 /*2150..218F;*/ "Number Forms",
712 /*2190..21FF;*/ "Arrows",
713 /*2200..22FF;*/ "Mathematical Operators",
714 /*2300..23FF;*/ "Miscellaneous Technical",
715 /*2400..243F;*/ "Control Pictures",
716 /*2440..245F;*/ "Optical Character Recognition",
717 /*2460..24FF;*/ "Enclosed Alphanumerics",
718 /*2500..257F;*/ "Box Drawing",
719 /*2580..259F;*/ "Block Elements",
720 /*25A0..25FF;*/ "Geometric Shapes",
721 /*2600..26FF;*/ "Miscellaneous Symbols",
722 /*2700..27BF;*/ "Dingbats",
723 /*2800..28FF;*/ "Braille Patterns",
724 /*2E80..2EFF;*/ "CJK Radicals Supplement",
725 /*2F00..2FDF;*/ "Kangxi Radicals",
726 /*2FF0..2FFF;*/ "Ideographic Description Characters",
727 /*3000..303F;*/ "CJK Symbols and Punctuation",
728 /*3040..309F;*/ "Hiragana",
729 /*30A0..30FF;*/ "Katakana",
730 /*3100..312F;*/ "Bopomofo",
731 /*3130..318F;*/ "Hangul Compatibility Jamo",
732 /*3190..319F;*/ "Kanbun",
733 /*31A0..31BF;*/ "Bopomofo Extended",
734 /*3200..32FF;*/ "Enclosed CJK Letters and Months",
735 /*3300..33FF;*/ "CJK Compatibility",
736 /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
737 /*4E00..9FFF;*/ "CJK Unified Ideographs",
738 /*A000..A48F;*/ "Yi Syllables",
739 /*A490..A4CF;*/ "Yi Radicals",
740 /*AC00..D7A3;*/ "Hangul Syllables",
741 /*E000..F8FF;*/ "Private Use",
742 /*F900..FAFF;*/ "CJK Compatibility Ideographs",
743 /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
744 /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
745 /*FE20..FE2F;*/ "Combining Half Marks",
746 /*FE30..FE4F;*/ "CJK Compatibility Forms",
747 /*FE50..FE6F;*/ "Small Form Variants",
748 /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
749 /*FEFF..FEFF;*/ "Specials",
750 /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
751 //missing Specials add manually
752 /*10300..1032F;*/ "Old Italic", // 84
753 /*10330..1034F;*/ "Gothic",
754 /*10400..1044F;*/ "Deseret",
755 /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
756 /*1D100..1D1FF;*/ "Musical Symbols",
757 /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
758 /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
759 /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
760 /*E0000..E007F;*/ "Tags",
761 //missing 2 private use add manually
765 //F0000..FFFFD; "Private Use",
766 //100000..10FFFD; "Private Use"
767 //FFF0..FFFD; "Specials",
768 static final String blockRanges =
769 "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
770 +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
771 +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
772 +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
773 +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
774 +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
775 +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
776 +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
777 +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
778 +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
779 +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
780 static final int[] nonBMPBlockRanges = {
781 0x10300, 0x1032F, // 84
791 private static final int NONBMP_BLOCK_START = 84;
793 static protected RangeToken getRange(String name, boolean positive) {
794 if (Token.categories.size() == 0) {
795 synchronized (Token.categories) {
796 Token[] ranges = new Token[Token.categoryNames.length];
797 for (int i = 0; i < ranges.length; i ++) {
798 ranges[i] = Token.createRange();
801 for (int i = 0; i < 0x10000; i ++) {
802 type = Character.getType((char)i);
803 if (type == Character.START_PUNCTUATION ||
804 type == Character.END_PUNCTUATION) {
805 //build table of Pi values
806 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
807 i == 0x201F || i == 0x2039) {
808 type = CHAR_INIT_QUOTE;
810 //build table of Pf values
811 if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
812 type = CHAR_FINAL_QUOTE;
815 ranges[type].addRange(i, i);
817 case Character.UPPERCASE_LETTER:
818 case Character.LOWERCASE_LETTER:
819 case Character.TITLECASE_LETTER:
820 case Character.MODIFIER_LETTER:
821 case Character.OTHER_LETTER:
824 case Character.NON_SPACING_MARK:
825 case Character.COMBINING_SPACING_MARK:
826 case Character.ENCLOSING_MARK:
829 case Character.DECIMAL_DIGIT_NUMBER:
830 case Character.LETTER_NUMBER:
831 case Character.OTHER_NUMBER:
834 case Character.SPACE_SEPARATOR:
835 case Character.LINE_SEPARATOR:
836 case Character.PARAGRAPH_SEPARATOR:
837 type = CHAR_SEPARATOR;
839 case Character.CONTROL:
840 case Character.FORMAT:
841 case Character.SURROGATE:
842 case Character.PRIVATE_USE:
843 case Character.UNASSIGNED:
846 case Character.CONNECTOR_PUNCTUATION:
847 case Character.DASH_PUNCTUATION:
848 case Character.START_PUNCTUATION:
849 case Character.END_PUNCTUATION:
850 case CHAR_INIT_QUOTE:
851 case CHAR_FINAL_QUOTE:
852 case Character.OTHER_PUNCTUATION:
853 type = CHAR_PUNCTUATION;
855 case Character.MATH_SYMBOL:
856 case Character.CURRENCY_SYMBOL:
857 case Character.MODIFIER_SYMBOL:
858 case Character.OTHER_SYMBOL:
862 throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
864 ranges[type].addRange(i, i);
865 } // for all characters
866 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
867 ranges[CHAR_OTHER].addRange(0x10000, Token.UTF16_MAX);
869 for (int i = 0; i < ranges.length; i ++) {
870 if (Token.categoryNames[i] != null) {
871 if (i == Character.UNASSIGNED) { // Unassigned
872 ranges[i].addRange(0x10000, Token.UTF16_MAX);
874 Token.categories.put(Token.categoryNames[i], ranges[i]);
875 Token.categories2.put(Token.categoryNames[i],
876 Token.complementRanges(ranges[i]));
879 //REVISIT: do we really need to support block names as in Unicode 3.1
880 // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
882 StringBuffer buffer = new StringBuffer(50);
883 for (int i = 0; i < Token.blockNames.length; i ++) {
884 Token r1 = Token.createRange();
886 if (i < NONBMP_BLOCK_START) {
888 int rstart = Token.blockRanges.charAt(location);
889 int rend = Token.blockRanges.charAt(location+1);
891 //System.out.println(n+" " +Integer.toHexString(rstart)
892 // +"-"+ Integer.toHexString(rend));
893 r1.addRange(rstart, rend);
895 location = (i - NONBMP_BLOCK_START) * 2;
896 r1.addRange(Token.nonBMPBlockRanges[location],
897 Token.nonBMPBlockRanges[location + 1]);
899 String n = Token.blockNames[i];
900 if (n.equals("Specials")) {
901 r1.addRange(0xfff0, 0xfffd);
903 if (n.equals("Private Use")) {
904 r1.addRange(0xF0000,0xFFFFD);
905 r1.addRange(0x100000,0x10FFFD);
907 Token.categories.put(n, r1);
908 Token.categories2.put(n, Token.complementRanges(r1));
911 if (n.indexOf(' ') >= 0) {
912 for (int ci = 0; ci < n.length(); ci ++) {
913 if (n.charAt(ci) != ' ') {
914 buffer.append(n.charAt(ci));
921 Token.setAlias(buffer.toString(), n, true);
925 Token.setAlias("ASSIGNED", "Cn", false);
926 Token.setAlias("UNASSIGNED", "Cn", true);
927 Token all = Token.createRange();
928 all.addRange(0, Token.UTF16_MAX);
929 Token.categories.put("ALL", all);
930 Token.categories2.put("ALL", Token.complementRanges(all));
931 Token.registerNonXS("ASSIGNED");
932 Token.registerNonXS("UNASSIGNED");
933 Token.registerNonXS("ALL");
935 Token isalpha = Token.createRange();
936 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
937 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
938 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
939 Token.categories.put("IsAlpha", isalpha);
940 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
941 Token.registerNonXS("IsAlpha");
943 Token isalnum = Token.createRange();
944 isalnum.mergeRanges(isalpha); // Lu Ll Lo
945 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
946 Token.categories.put("IsAlnum", isalnum);
947 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
948 Token.registerNonXS("IsAlnum");
950 Token isspace = Token.createRange();
951 isspace.mergeRanges(Token.token_spaces);
952 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
953 Token.categories.put("IsSpace", isspace);
954 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
955 Token.registerNonXS("IsSpace");
957 Token isword = Token.createRange();
958 isword.mergeRanges(isalnum); // Lu Ll Lo Nd
959 isword.addRange('_', '_');
960 Token.categories.put("IsWord", isword);
961 Token.categories2.put("IsWord", Token.complementRanges(isword));
962 Token.registerNonXS("IsWord");
964 Token isascii = Token.createRange();
965 isascii.addRange(0, 127);
966 Token.categories.put("IsASCII", isascii);
967 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
968 Token.registerNonXS("IsASCII");
970 Token isnotgraph = Token.createRange();
971 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
972 isnotgraph.addRange(' ', ' ');
973 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
974 Token.categories2.put("IsGraph", isnotgraph);
975 Token.registerNonXS("IsGraph");
977 Token isxdigit = Token.createRange();
978 isxdigit.addRange('0', '9');
979 isxdigit.addRange('A', 'F');
980 isxdigit.addRange('a', 'f');
981 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
982 Token.categories2.put("IsXDigit", isxdigit);
983 Token.registerNonXS("IsXDigit");
985 Token.setAlias("IsDigit", "Nd", true);
986 Token.setAlias("IsUpper", "Lu", true);
987 Token.setAlias("IsLower", "Ll", true);
988 Token.setAlias("IsCntrl", "C", true);
989 Token.setAlias("IsPrint", "C", false);
990 Token.setAlias("IsPunct", "P", true);
991 Token.registerNonXS("IsDigit");
992 Token.registerNonXS("IsUpper");
993 Token.registerNonXS("IsLower");
994 Token.registerNonXS("IsCntrl");
995 Token.registerNonXS("IsPrint");
996 Token.registerNonXS("IsPunct");
998 Token.setAlias("alpha", "IsAlpha", true);
999 Token.setAlias("alnum", "IsAlnum", true);
1000 Token.setAlias("ascii", "IsASCII", true);
1001 Token.setAlias("cntrl", "IsCntrl", true);
1002 Token.setAlias("digit", "IsDigit", true);
1003 Token.setAlias("graph", "IsGraph", true);
1004 Token.setAlias("lower", "IsLower", true);
1005 Token.setAlias("print", "IsPrint", true);
1006 Token.setAlias("punct", "IsPunct", true);
1007 Token.setAlias("space", "IsSpace", true);
1008 Token.setAlias("upper", "IsUpper", true);
1009 Token.setAlias("word", "IsWord", true); // Perl extension
1010 Token.setAlias("xdigit", "IsXDigit", true);
1011 Token.registerNonXS("alpha");
1012 Token.registerNonXS("alnum");
1013 Token.registerNonXS("ascii");
1014 Token.registerNonXS("cntrl");
1015 Token.registerNonXS("digit");
1016 Token.registerNonXS("graph");
1017 Token.registerNonXS("lower");
1018 Token.registerNonXS("print");
1019 Token.registerNonXS("punct");
1020 Token.registerNonXS("space");
1021 Token.registerNonXS("upper");
1022 Token.registerNonXS("word");
1023 Token.registerNonXS("xdigit");
1026 RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
1027 : (RangeToken)Token.categories2.get(name);
1028 //if (tok == null) System.out.println(name);
1031 static protected RangeToken getRange(String name, boolean positive, boolean xs) {
1032 RangeToken range = Token.getRange(name, positive);
1033 if (xs && range != null && Token.isRegisterNonXS(name)) {
1039 static Hashtable<String, String> nonxs = null;
1041 * This method is called by only getRange().
1042 * So this method need not MT-safe.
1044 static protected void registerNonXS(String name) {
1045 if (Token.nonxs == null) {
1046 Token.nonxs = new Hashtable<>();
1048 Token.nonxs.put(name, name);
1050 static protected boolean isRegisterNonXS(String name) {
1051 if (Token.nonxs == null) {
1055 //System.err.println("isRegisterNonXS: "+name);
1056 return Token.nonxs.containsKey(name);
1059 private static void setAlias(String newName, String name, boolean positive) {
1060 Token t1 = Token.categories.get(name);
1061 Token t2 = Token.categories2.get(name);
1063 Token.categories.put(newName, t1);
1064 Token.categories2.put(newName, t2);
1066 Token.categories2.put(newName, t1);
1067 Token.categories.put(newName, t2);
1071 // ------------------------------------------------------
1073 static final String viramaString =
1074 "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1075 +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1076 +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1077 +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1078 +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1079 +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1080 +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1081 +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1082 +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1083 +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1084 +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1086 static private Token token_grapheme = null;
1087 static synchronized Token getGraphemePattern() {
1088 if (Token.token_grapheme != null) {
1089 return Token.token_grapheme;
1092 Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
1093 base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1094 base_char.subtractRanges(Token.getRange("M", true));
1095 base_char.subtractRanges(Token.getRange("C", true));
1097 Token virama = Token.createRange();
1098 for (int i = 0; i < Token.viramaString.length(); i++) {
1099 virama.addRange(i, i);
1102 Token combiner_wo_virama = Token.createRange();
1103 combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1104 combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1105 combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1107 Token left = Token.createUnion(); // base_char?
1108 left.addChild(base_char);
1109 left.addChild(Token.token_empty);
1111 Token foo = Token.createUnion();
1112 foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1113 foo.addChild(combiner_wo_virama);
1115 foo = Token.createClosure(foo);
1117 foo = Token.createConcat(left, foo);
1119 Token.token_grapheme = foo;
1120 return Token.token_grapheme;
1124 * Combing Character Sequence in Perl 5.6.
1126 static private Token token_ccs = null;
1127 static synchronized Token getCombiningCharacterSequence() {
1128 if (Token.token_ccs != null) {
1129 return Token.token_ccs;
1132 Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1133 foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1134 Token.token_ccs = foo;
1135 return Token.token_ccs;
1138 // ------------------------------------------------------
1140 // ------------------------------------------------------
1142 * This class represents a node in parse tree.
1144 static class StringToken extends Token implements java.io.Serializable {
1146 private static final long serialVersionUID = -4614366944218504172L;
1149 final int refNumber;
1151 StringToken(int type, String str, int n) {
1158 int getReferenceNumber() { // for STRING
1159 return this.refNumber;
1162 String getString() { // for STRING
1167 public String toString(int options) {
1168 if (this.type == BACKREFERENCE) {
1169 return "\\"+this.refNumber;
1171 return REUtil.quoteMeta(this.string);
1177 * This class represents a node in parse tree.
1179 static class ConcatToken extends Token implements java.io.Serializable {
1181 private static final long serialVersionUID = 8717321425541346381L;
1186 ConcatToken(Token t1, Token t2) {
1187 super(Token.CONCAT);
1197 Token getChild(int index) {
1198 return index == 0 ? this.child : this.child2;
1202 public String toString(int options) {
1204 if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1205 ret = this.child.toString(options)+"+";
1206 } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1207 ret = this.child.toString(options)+"+?";
1209 ret = this.child.toString(options)+this.child2.toString(options);
1216 * This class represents a node in parse tree.
1218 static class CharToken extends Token implements java.io.Serializable {
1220 private static final long serialVersionUID = -4394272816279496989L;
1224 CharToken(int type, int ch) {
1231 return this.chardata;
1235 public String toString(int options) {
1237 switch (this.type) {
1239 switch (this.chardata) {
1240 case '|': case '*': case '+': case '?':
1241 case '(': case ')': case '.': case '[':
1242 case '{': case '\\':
1243 ret = "\\"+(char)this.chardata;
1245 case '\f': ret = "\\f"; break;
1246 case '\n': ret = "\\n"; break;
1247 case '\r': ret = "\\r"; break;
1248 case '\t': ret = "\\t"; break;
1249 case 0x1b: ret = "\\e"; break;
1250 //case 0x0b: ret = "\\v"; break;
1252 if (this.chardata >= 0x10000) {
1253 String pre = "0"+Integer.toHexString(this.chardata);
1254 ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1256 ret = ""+(char)this.chardata;
1262 if (this == Token.token_linebeginning || this == Token.token_lineend) {
1263 ret = ""+(char)this.chardata;
1265 ret = "\\"+(char)this.chardata;
1273 if (isSet(options, RegularExpression.XMLSCHEMA_MODE)) {
1274 if ("$".equals(ret)) {
1277 if ("^".equals(ret)) {
1286 boolean match(int ch) {
1287 if (this.type == CHAR) {
1288 return ch == this.chardata;
1290 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
1296 * This class represents a node in parse tree.
1298 static class ClosureToken extends Token implements java.io.Serializable {
1300 private static final long serialVersionUID = 1308971930673997452L;
1306 ClosureToken(int type, Token tok) {
1318 Token getChild(int index) {
1323 final void setMin(int min) {
1327 final void setMax(int max) {
1331 final int getMin() {
1335 final int getMax() {
1340 public String toString(int options) {
1342 if (this.type == CLOSURE) {
1343 if (this.getMin() < 0 && this.getMax() < 0) {
1344 ret = this.child.toString(options)+"*";
1345 } else if (this.getMin() == this.getMax()) {
1346 ret = this.child.toString(options)+"{"+this.getMin()+"}";
1347 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1348 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1349 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1350 ret = this.child.toString(options)+"{"+this.getMin()+",}";
1352 throw new RuntimeException("Token#toString(): CLOSURE "
1353 +this.getMin()+", "+this.getMax());
1356 if (this.getMin() < 0 && this.getMax() < 0) {
1357 ret = this.child.toString(options)+"*?";
1358 } else if (this.getMin() == this.getMax()) {
1359 ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1360 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1361 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1362 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1363 ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1365 throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
1366 +this.getMin()+", "+this.getMax());
1374 * This class represents a node in parse tree.
1376 static class ParenToken extends Token implements java.io.Serializable {
1378 private static final long serialVersionUID = -5938014719827987704L;
1381 final int parennumber;
1383 ParenToken(int type, Token tok, int paren) {
1386 this.parennumber = paren;
1394 Token getChild(int index) {
1399 int getParenNumber() {
1400 return this.parennumber;
1404 public String toString(int options) {
1406 switch (this.type) {
1408 if (this.parennumber == 0) {
1409 ret = "(?:"+this.child.toString(options)+")";
1411 ret = "("+this.child.toString(options)+")";
1416 ret = "(?="+this.child.toString(options)+")";
1418 case NEGATIVELOOKAHEAD:
1419 ret = "(?!"+this.child.toString(options)+")";
1422 ret = "(?<="+this.child.toString(options)+")";
1424 case NEGATIVELOOKBEHIND:
1425 ret = "(?<!"+this.child.toString(options)+")";
1428 ret = "(?>"+this.child.toString(options)+")";
1436 * (?(condition)yes-pattern|no-pattern)
1438 static class ConditionToken extends Token implements java.io.Serializable {
1440 private static final long serialVersionUID = 4353765277910594411L;
1442 final int refNumber;
1443 final Token condition;
1446 ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1447 super(Token.CONDITION);
1448 this.refNumber = refno;
1449 this.condition = cond;
1455 return this.no == null ? 1 : 2;
1458 Token getChild(int index) {
1465 throw new RuntimeException("Internal Error: "+index);
1469 public String toString(int options) {
1471 if (refNumber > 0) {
1472 ret = "(?("+refNumber+")";
1473 } else if (this.condition.type == Token.ANCHOR) {
1474 ret = "(?("+this.condition+")";
1476 ret = "(?"+this.condition;
1479 if (this.no == null) {
1480 ret += this.yes+")";
1482 ret += this.yes+"|"+this.no+")";
1491 static class ModifierToken extends Token implements java.io.Serializable {
1493 private static final long serialVersionUID = -9114536559696480356L;
1499 ModifierToken(Token tok, int add, int mask) {
1500 super(Token.MODIFIERGROUP);
1511 Token getChild(int index) {
1518 int getOptionsMask() {
1523 public String toString(int options) {
1525 +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1526 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1528 +this.child.toString(options)
1534 * This class represents a node in parse tree.
1535 * for UNION or CONCAT.
1537 static class UnionToken extends Token implements java.io.Serializable {
1539 private static final long serialVersionUID = -2568843945989489861L;
1541 Vector<Token> children;
1543 UnionToken(int type) {
1548 void addChild(Token tok) {
1552 if (this.children == null) {
1553 this.children = new Vector<>();
1555 if (this.type == UNION) {
1556 this.children.addElement(tok);
1559 // This is CONCAT, and new child is CONCAT.
1560 if (tok.type == CONCAT) {
1561 for (int i = 0; i < tok.size(); i ++)
1563 this.addChild(tok.getChild(i)); // Recursion
1567 int size = this.children.size();
1569 this.children.addElement(tok);
1572 Token previous = this.children.elementAt(size-1);
1573 if (!((previous.type == CHAR || previous.type == STRING)
1574 && (tok.type == CHAR || tok.type == STRING))) {
1575 this.children.addElement(tok);
1579 //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1581 StringBuffer buffer;
1582 int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1583 if (previous.type == CHAR) { // Replace previous token by STRING
1584 buffer = new StringBuffer(2 + nextMaxLength);
1585 int ch = previous.getChar();
1586 if (ch >= 0x10000) {
1587 buffer.append(REUtil.decomposeToSurrogates(ch));
1589 buffer.append((char)ch);
1591 previous = Token.createString(null);
1592 this.children.setElementAt(previous, size-1);
1594 buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
1595 buffer.append(previous.getString());
1598 if (tok.type == CHAR) {
1599 int ch = tok.getChar();
1600 if (ch >= 0x10000) {
1601 buffer.append(REUtil.decomposeToSurrogates(ch));
1603 buffer.append((char)ch);
1606 buffer.append(tok.getString());
1609 ((StringToken)previous).string = new String(buffer);
1614 return this.children == null ? 0 : this.children.size();
1617 Token getChild(int index) {
1618 return this.children.elementAt(index);
1622 public String toString(int options) {
1624 if (this.type == CONCAT) {
1625 if (this.children.size() == 2) {
1626 Token ch = this.getChild(0);
1627 Token ch2 = this.getChild(1);
1628 if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1629 ret = ch.toString(options)+"+";
1630 } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1631 ret = ch.toString(options)+"+?";
1633 ret = ch.toString(options)+ch2.toString(options);
1636 StringBuffer sb = new StringBuffer();
1637 for (int i = 0; i < this.children.size(); i ++) {
1638 sb.append(this.children.elementAt(i).toString(options));
1640 ret = new String(sb);
1644 if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1645 ret = this.getChild(0).toString(options)+"?";
1646 } else if (this.children.size() == 2
1647 && this.getChild(0).type == EMPTY) {
1648 ret = this.getChild(1).toString(options)+"??";
1650 StringBuffer sb = new StringBuffer();
1651 sb.append(this.children.elementAt(0).toString(options));
1652 for (int i = 1; i < this.children.size(); i ++) {
1654 sb.append(this.children.elementAt(i).toString(options));
1656 ret = new String(sb);