2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.opendaylight.yangtools.xsd.regex;
20 import java.util.Locale;
21 import java.util.MissingResourceException;
22 import java.util.ResourceBundle;
23 import java.util.Vector;
26 * A Regular Expression Parser.
30 * @version $Id: RegexParser.java 1129306 2011-05-30 19:18:04Z sandygao $
// Token-type codes. The parser compares the value returned by read() against
// these constants; the trailing comment on each one shows the regex syntax it labels.
33 static final int T_CHAR = 0;
34 static final int T_EOF = 1;
35 static final int T_OR = 2; // '|'
36 static final int T_STAR = 3; // '*'
37 static final int T_PLUS = 4; // '+'
38 static final int T_QUESTION = 5; // '?'
39 static final int T_LPAREN = 6; // '('
40 static final int T_RPAREN = 7; // ')'
41 static final int T_DOT = 8; // '.'
42 static final int T_LBRACKET = 9; // '['
43 static final int T_BACKSOLIDUS = 10; // '\'
44 static final int T_CARET = 11; // '^'
45 static final int T_DOLLAR = 12; // '$'
46 static final int T_LPAREN2 = 13; // '(?:'
47 static final int T_LOOKAHEAD = 14; // '(?='
48 static final int T_NEGATIVELOOKAHEAD = 15; // '(?!'
49 static final int T_LOOKBEHIND = 16; // '(?<='
50 static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
51 static final int T_INDEPENDENT = 18; // '(?>'
52 static final int T_SET_OPERATIONS = 19; // '(?['
53 static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
54 static final int T_COMMENT = 21; // '(?#'
55 static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z]
56 static final int T_CONDITION = 23; // '(?('
57 static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
// Pairs a group reference number with a position in the pattern. parse() reads
// the refNumber and position members of each recorded instance when validating references.
// NOTE(review): the field declarations and constructor body are not visible in this excerpt.
59 static class ReferencePosition {
// n is the reference number, pos the pattern position -- presumably stored in refNumber/position; confirm.
62 ReferencePosition(int n, int pos) {
// Message bundle that ex() uses to look up error text; assigned in setLocale().
72 ResourceBundle resources;
// Lexer context values passed to setContext(); the tokenizer tests for S_INBRACKETS.
75 static protected final int S_NORMAL = 0;
76 static protected final int S_INBRACKETS = 1;
77 static protected final int S_INXBRACKETS = 2;
// Current lexer context; starts as S_NORMAL.
78 int context = S_NORMAL;
// Set to true whenever a back reference or a numbered condition is parsed.
81 boolean hasBackReferences;
// Recorded reference positions; created lazily, then checked and emptied in parse().
82 Vector<ReferencePosition> references = null;
// Creates a parser whose messages are loaded for Locale.getDefault().
84 public RegexParser() {
85 this.setLocale(Locale.getDefault());
// Creates a parser whose messages are loaded for the given locale (see setLocale).
87 public RegexParser(Locale locale) {
88 this.setLocale(locale);
// Loads the message ResourceBundle into this.resources, either for the given locale
// or through the no-locale overload (the condition choosing between them is not
// visible in this excerpt). A MissingResourceException is rethrown as a RuntimeException.
// NOTE(review): the bundle base name is under org.apache.xerces.impl.xpath.regex although
// this class is declared in org.opendaylight.yangtools.xsd.regex -- confirm that bundle
// is actually available on the classpath.
91 public void setLocale(Locale locale) {
94 this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message", locale);
97 this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message");
100 catch (MissingResourceException mre) {
101 throw new RuntimeException("Installation Problem??? Couldn't load messages: "
// Builds (does not throw) a ParseException whose message is looked up in the
// resource bundle under key; loc is passed through as the second constructor
// argument (call sites pass a pattern offset). Callers write "throw ex(...)".
106 final ParseException ex(String key, int loc) {
107 return new ParseException(this.resources.getString(key), loc);
// Returns true when every bit of flag is set in this.options.
110 protected final boolean isSet(int flag) {
111 return (this.options & flag) == flag;
// Parses regex under the given option flags, starting from parseRegex().
// The method is synchronized and resets the per-parse state (options, context,
// group counters, back-reference flag) before tokenizing.
// Throws ParseException "parser.parse.1" when input remains after the top-level
// regex, and "parser.parse.2" when a recorded reference names a group number that
// is not below parennumber.
// NOTE(review): the return statement and several assignments are not visible in this excerpt.
114 synchronized Token parse(String regex, int options) throws ParseException {
115 this.options = options;
117 this.setContext(S_NORMAL);
118 this.parennumber = 1;
119 this.parenOpened = 1;
120 this.hasBackReferences = false;
// With EXTENDED_COMMENT set, the pattern is preprocessed by REUtil.stripExtendedComment.
122 if (this.isSet(RegularExpression.EXTENDED_COMMENT)) {
123 this.regex = REUtil.stripExtendedComment(this.regex);
125 this.regexlen = this.regex.length();
129 Token ret = this.parseRegex();
// Anything left unconsumed after the top-level regex is an error.
130 if (this.offset != this.regexlen) {
131 throw ex("parser.parse.1", this.offset);
133 if (this.read() != T_EOF) {
134 throw ex("parser.parse.1", this.offset-1);
// Validate every recorded reference against the final group count, then clear the list.
136 if (this.references != null) {
137 for (int i = 0; i < this.references.size(); i ++) {
138 ReferencePosition position = this.references.elementAt(i);
139 if (this.parennumber <= position.refNumber) {
140 throw ex("parser.parse.2", position.position);
143 this.references.removeAllElements();
// Parses regex and wraps the resulting token tree in a RegularExpression, passing
// along this.parennumber, this.hasBackReferences and the options.
// NOTE(review): parse() is synchronized, but parennumber and hasBackReferences are
// read here after it returns, outside that lock -- confirm a parser instance is
// never shared between threads.
149 public RegularExpression createRegex(String regex, int options) throws ParseException {
150 Token tok = this.parse(regex, options);
151 return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
// Sets the lexer context; visible callers pass S_NORMAL or S_INBRACKETS.
// NOTE(review): the body is not visible in this excerpt.
155 protected final void setContext(int con) {
// Fragment: returns the token type cached in nexttoken. The enclosing method
// header is not visible in this excerpt (presumably read() -- confirm).
160 return this.nexttoken;
// Tokenizer body; the method header is not visible in this excerpt (presumably
// next() -- confirm). It reads from this.regex at this.offset, stores the token
// type in nexttoken and, for character tokens, the character in chardata.
// Errors are reported with the "parser.next.*" message keys.
164 if (this.offset >= this.regexlen) {
166 this.nexttoken = T_EOF;
171 int ch = this.regex.charAt(this.offset++);
// Inside a character class a separate set of tokenization rules applies.
174 if (this.context == S_INBRACKETS) {
175 // In a character class, this.chardata has one character, that is to say,
176 // a pair of surrogates is composed and stored to this.chardata.
180 if (this.offset >= this.regexlen) {
181 throw ex("parser.next.1", this.offset-1);
183 this.chardata = this.regex.charAt(this.offset++);
187 // Allow character class subtraction (regardless of whether we are in
188 // XML Schema mode or not)
189 if (this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
191 ret = T_XMLSCHEMA_CC_SUBTRACTION;
// A following ':' yields T_POSIX_CHARCLASS_START, but only outside XML Schema mode.
198 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
199 && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
201 ret = T_POSIX_CHARCLASS_START;
// A high surrogate followed by a low surrogate is composed into one value in chardata.
205 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
206 int low = this.regex.charAt(this.offset);
207 if (REUtil.isLowSurrogate(low)) {
208 this.chardata = REUtil.composeFromSurrogates(ch, low);
214 this.nexttoken = ret;
// Outside a character class: single-character operators map directly to token types.
219 case '|': ret = T_OR; break;
220 case '*': ret = T_STAR; break;
221 case '+': ret = T_PLUS; break;
222 case '?': ret = T_QUESTION; break;
223 case ')': ret = T_RPAREN; break;
224 case '.': ret = T_DOT; break;
225 case '[': ret = T_LBRACKET; break;
227 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
235 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
// Checks whether a '?' follows and, if so, classifies the "(?x" form by the
// character after it (presumably the '(' case -- its label is not visible here).
244 if (this.offset >= this.regexlen) {
247 if (this.regex.charAt(this.offset) != '?') {
250 if (++this.offset >= this.regexlen) {
251 throw ex("parser.next.2", this.offset-1);
253 ch = this.regex.charAt(this.offset++);
255 case ':': ret = T_LPAREN2; break;
256 case '=': ret = T_LOOKAHEAD; break;
257 case '!': ret = T_NEGATIVELOOKAHEAD; break;
258 case '[': ret = T_SET_OPERATIONS; break;
259 case '>': ret = T_INDEPENDENT; break;
261 if (this.offset >= this.regexlen) {
262 throw ex("parser.next.2", this.offset-3);
264 ch = this.regex.charAt(this.offset++);
267 } else if (ch == '!') {
268 ret = T_NEGATIVELOOKBEHIND;
270 throw ex("parser.next.3", this.offset-3);
274 while (this.offset < this.regexlen) {
275 ch = this.regex.charAt(this.offset++);
281 throw ex("parser.next.4", this.offset-1);
286 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
290 } else if (ch == '(') { // conditional
291 ret = T_CONDITION; // this.offset now points just past the '('.
294 throw ex("parser.next.2", this.offset-2);
// The next character is stored in chardata; running out of input here is an
// error (presumably the backslash case -- its label is not visible here).
300 if (this.offset >= this.regexlen) {
301 throw ex("parser.next.1", this.offset-1);
303 this.chardata = this.regex.charAt(this.offset++);
309 this.nexttoken = ret;
313 * regex ::= term (`|` term)*
315 * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
316 * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
317 * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
318 * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
319 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
// Parses a regex: a first term followed by further terms while the next token is T_OR.
// A union token is created on the first '|' and seeded with the first term.
// NOTE(review): the declaration of parent, the reassignment of tok and the return
// statement are not visible in this excerpt.
321 Token parseRegex() throws ParseException {
322 Token tok = this.parseTerm();
324 while (this.read() == T_OR) {
326 if (parent == null) {
327 parent = Token.createUnion();
328 parent.addChild(tok);
331 tok.addChild(this.parseTerm());
// Parses a term: a run of factors that ends at '|', ')' or end of input.
// Returns an empty token when the term starts at one of those terminators;
// otherwise a concat token is created on demand to hold the first factor and
// each following one.
339 Token parseTerm() throws ParseException {
340 int ch = this.read();
341 if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
342 return Token.createEmpty();
344 Token tok = this.parseFactor();
346 while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
347 if (concat == null) {
348 concat = Token.createConcat();
349 concat.addChild(tok);
352 concat.addChild(this.parseFactor());
353 //tok = Token.createConcat(tok, this.parseFactor());
359 // ----------------------------------------------------------------
// Handles a T_CARET ('^') token: returns the shared Token.token_linebeginning.
361 Token processCaret() throws ParseException {
363 return Token.token_linebeginning;
// Handles a T_DOLLAR ('$') token: returns the shared Token.token_lineend.
365 Token processDollar() throws ParseException {
367 return Token.token_lineend;
// Handles "(?=": wraps the parsed inner regex in a LOOKAHEAD look token.
// Throws ParseException "parser.factor.1" when the next token is not ')'.
369 Token processLookahead() throws ParseException {
371 Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
372 if (this.read() != T_RPAREN) {
373 throw ex("parser.factor.1", this.offset-1);
// Handles "(?!": wraps the parsed inner regex in a NEGATIVELOOKAHEAD look token.
// Throws ParseException "parser.factor.1" when the next token is not ')'.
378 Token processNegativelookahead() throws ParseException {
380 Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
381 if (this.read() != T_RPAREN) {
382 throw ex("parser.factor.1", this.offset-1);
// Handles "(?<=": wraps the parsed inner regex in a LOOKBEHIND look token.
// Throws ParseException "parser.factor.1" when the next token is not ')'.
387 Token processLookbehind() throws ParseException {
389 Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
390 if (this.read() != T_RPAREN) {
391 throw ex("parser.factor.1", this.offset-1);
// Handles "(?<!": wraps the parsed inner regex in a NEGATIVELOOKBEHIND look token.
// Throws ParseException "parser.factor.1" when the next token is not ')'.
396 Token processNegativelookbehind() throws ParseException {
398 Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
399 if (this.read() != T_RPAREN) {
400 throw ex("parser.factor.1", this.offset-1);
// Handles the \A escape: returns the shared Token.token_stringbeginning.
405 Token processBacksolidus_A() throws ParseException {
407 return Token.token_stringbeginning;
// Handles the \Z escape: returns the shared Token.token_stringend2.
409 Token processBacksolidus_Z() throws ParseException {
411 return Token.token_stringend2;
// Handles the \z escape: returns the shared Token.token_stringend.
413 Token processBacksolidus_z() throws ParseException {
415 return Token.token_stringend;
// Handles the \b escape: returns the shared Token.token_wordedge.
417 Token processBacksolidus_b() throws ParseException {
419 return Token.token_wordedge;
// Handles the \B escape: returns the shared Token.token_not_wordedge.
421 Token processBacksolidus_B() throws ParseException {
423 return Token.token_not_wordedge;
// Handles the \< escape: returns the shared Token.token_wordbeginning.
425 Token processBacksolidus_lt() throws ParseException {
427 return Token.token_wordbeginning;
// Handles the \> escape: returns the shared Token.token_wordend.
429 Token processBacksolidus_gt() throws ParseException {
431 return Token.token_wordend;
// Handles '*' applied to tok: returns Token.createNGClosure(tok) when a '?'
// token follows the star, otherwise Token.createClosure(tok).
433 Token processStar(Token tok) throws ParseException {
435 if (this.read() == T_QUESTION) {
437 return Token.createNGClosure(tok);
439 return Token.createClosure(tok);
// Handles '+' applied to tok: builds "tok followed by a closure of tok".
// The closure is created with createNGClosure when a '?' token follows the
// plus, otherwise with createClosure.
442 Token processPlus(Token tok) throws ParseException {
445 if (this.read() == T_QUESTION) {
447 return Token.createConcat(tok, Token.createNGClosure(tok));
449 return Token.createConcat(tok, Token.createClosure(tok));
// Handles '?' applied to tok: builds a union token that includes an empty
// alternative in both the "??" branch and the plain "?" branch.
// NOTE(review): the lines that add tok to the union are not visible in this
// excerpt, so the order of the alternatives cannot be confirmed from here.
452 Token processQuestion(Token tok) throws ParseException {
455 Token par = Token.createUnion();
456 if (this.read() == T_QUESTION) {
458 par.addChild(Token.createEmpty());
462 par.addChild(Token.createEmpty());
// Returns true when off is inside the pattern and the character there is '?'.
466 boolean checkQuestion(int off) {
467 return off < this.regexlen && this.regex.charAt(off) == '?';
// Handles '(': takes the group number from parenOpened (post-incrementing it),
// parses the inner regex into a paren token with that number, then requires
// and skips the closing ')'. Throws ParseException "parser.factor.1" when ')' is missing.
469 Token processParen() throws ParseException {
471 int p = this.parenOpened++;
472 Token tok = Token.createParen(this.parseRegex(), p);
473 if (this.read() != T_RPAREN) {
474 throw ex("parser.factor.1", this.offset-1);
477 this.next(); // Skips ')'
// Handles "(?:": parses the inner regex into a paren token with number 0,
// then requires and skips the closing ')'.
// Throws ParseException "parser.factor.1" when ')' is missing.
480 Token processParen2() throws ParseException {
482 Token tok = Token.createParen(this.parseRegex(), 0);
483 if (this.read() != T_RPAREN) {
484 throw ex("parser.factor.1", this.offset-1);
486 this.next(); // Skips ')'
// Handles "(?(": parses a condition, then the yes-pattern and an optional
// no-pattern, and returns Token.createCondition(refno, condition, yes, no).
// The condition is either a group number (when the first character is 1-9) or a
// factor that must be one of the four look-around token types.
// Throws ParseException with keys parser.factor.1/4/5/6 or parser.parse.2 on malformed input.
// NOTE(review): several lines of this method are not visible in this excerpt.
489 Token processCondition() throws ParseException {
490 // this.offset points the next of '('
491 if (this.offset+1 >= this.regexlen) {
492 throw ex("parser.factor.4", this.offset);
494 // Parses a condition.
496 Token condition = null;
497 int ch = this.regex.charAt(this.offset);
// Numeric condition: refers to a capture group by number.
498 if ('1' <= ch && ch <= '9') {
500 int finalRefno = refno;
502 if (this.parennumber <= refno) {
503 throw ex("parser.parse.2", this.offset);
// Extend the number with following digits; each extension is compared against parennumber.
506 while (this.offset + 1 < this.regexlen) {
507 ch = this.regex.charAt(this.offset + 1);
508 if ('0' <= ch && ch <= '9') {
509 refno = (refno * 10) + (ch - '0');
510 if (refno < this.parennumber) {
// Record the reference so parse() can validate it once the whole pattern is read.
523 this.hasBackReferences = true;
524 if (this.references == null) {
525 this.references = new Vector<>();
527 this.references.addElement(new ReferencePosition(finalRefno, this.offset));
529 if (this.regex.charAt(this.offset) != ')') {
530 throw ex("parser.factor.1", this.offset);
536 this.offset --; // Points '('.
// Non-numeric condition: parse a factor and accept only look-around tokens.
539 condition = this.parseFactor();
540 switch (condition.type) {
541 case Token.LOOKAHEAD:
542 case Token.NEGATIVELOOKAHEAD:
543 case Token.LOOKBEHIND:
544 case Token.NEGATIVELOOKBEHIND:
547 if (this.read() != T_RPAREN) {
548 throw ex("parser.factor.1", this.offset-1);
552 throw ex("parser.factor.5", this.offset);
555 // Parses yes/no-patterns.
557 Token yesPattern = this.parseRegex();
558 Token noPattern = null;
// A union must have exactly two children: the yes-pattern and the no-pattern.
559 if (yesPattern.type == Token.UNION) {
560 if (yesPattern.size() != 2) {
561 throw ex("parser.factor.6", this.offset);
563 noPattern = yesPattern.getChild(1);
564 yesPattern = yesPattern.getChild(0);
566 if (this.read() != T_RPAREN) {
567 throw ex("parser.factor.1", this.offset-1);
570 return Token.createCondition(refno, condition, yesPattern, noPattern);
// Handles "(?" followed by option letters. Per the grammar comment below, it reads
// option letters, optionally a '-' and more letters, then builds a modifier-group
// token from the following regex together with add and mask.
// Two forms are visible: one that requires a closing ')' after the regex, and the
// "(?-i)" form where ')' comes right after the modifiers.
// Throws ParseException with keys parser.factor.1/2/3 on malformed input.
// NOTE(review): the lines that update add and mask are not visible in this excerpt.
572 Token processModifiers() throws ParseException {
573 // this.offset points the next of '?'.
574 // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
575 int add = 0, mask = 0, ch = -1;
576 while (this.offset < this.regexlen) {
577 ch = this.regex.charAt(this.offset);
578 int v = REUtil.getOptionValue(ch);
581 break; // '-' or ':'?
586 if (this.offset >= this.regexlen) {
587 throw ex("parser.factor.2", this.offset-1);
591 while (this.offset < this.regexlen) {
592 ch = this.regex.charAt(this.offset);
593 int v = REUtil.getOptionValue(ch);
601 if (this.offset >= this.regexlen) {
602 throw ex("parser.factor.2", this.offset-1);
609 tok = Token.createModifierGroup(this.parseRegex(), add, mask);
610 if (this.read() != T_RPAREN) {
611 throw ex("parser.factor.1", this.offset-1);
614 } else if (ch == ')') { // such as (?-i)
617 tok = Token.createModifierGroup(this.parseRegex(), add, mask);
619 throw ex("parser.factor.3", this.offset);
// Handles "(?>": wraps the parsed inner regex in an INDEPENDENT look token,
// then requires and skips the closing ')'.
// Throws ParseException "parser.factor.1" when ')' is missing.
624 Token processIndependent() throws ParseException {
626 Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
627 if (this.read() != T_RPAREN) {
628 throw ex("parser.factor.1", this.offset-1);
630 this.next(); // Skips ')'
// Handles the \c escape: consumes the next character, which must lie in
// 0x0040-0x005f, and returns a char token for that character minus 0x40.
// Throws ParseException "parser.atom.1" at end of input or for a character outside that range.
633 Token processBacksolidus_c() throws ParseException {
634 int ch2; // Must be in 0x0040-0x005f
635 if (this.offset >= this.regexlen
636 || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040) {
637 throw ex("parser.atom.1", this.offset-1);
640 return Token.createChar(ch2-0x40);
// Handles the \C escape: always throws ParseException "parser.process.1".
642 Token processBacksolidus_C() throws ParseException {
643 throw ex("parser.process.1", this.offset);
// Handles the \i escape: creates a char token for the literal 'i'.
// NOTE(review): the rest of the body is not visible in this excerpt.
645 Token processBacksolidus_i() throws ParseException {
646 Token tok = Token.createChar('i');
// Handles the \I escape: always throws ParseException "parser.process.1".
650 Token processBacksolidus_I() throws ParseException {
651 throw ex("parser.process.1", this.offset);
// Handles the \g escape: returns Token.getGraphemePattern().
653 Token processBacksolidus_g() throws ParseException {
655 return Token.getGraphemePattern();
// Handles the \X escape: returns Token.getCombiningCharacterSequence().
657 Token processBacksolidus_X() throws ParseException {
659 return Token.getCombiningCharacterSequence();
// Handles a back reference (\1 .. \9, optionally followed by more digits).
// Starts from the digit held in chardata and extends the number with following
// digits while the extended value stays below parennumber. The final number is
// used to build the back-reference token; the reference is also recorded in
// this.references and hasBackReferences is set.
// Throws ParseException "parser.parse.2" when the first digit is not below parennumber.
661 Token processBackreference() throws ParseException {
662 int refnum = this.chardata-'0';
663 int finalRefnum = refnum;
665 if (this.parennumber <= refnum) {
666 throw ex("parser.parse.2", this.offset-2);
669 while (this.offset < this.regexlen) {
670 final int ch = this.regex.charAt(this.offset);
671 if ('0' <= ch && ch <= '9') {
672 refnum = (refnum * 10) + (ch - '0');
// Only accept the longer number while it is still below parennumber.
673 if (refnum < this.parennumber) {
675 finalRefnum = refnum;
687 Token tok = Token.createBackReference(finalRefnum);
688 this.hasBackReferences = true;
689 if (this.references == null) {
690 this.references = new Vector<>();
692 this.references.addElement(new ReferencePosition(finalRefnum, this.offset-2));
697 // ----------------------------------------------------------------
700 * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
701 * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
702 * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
704 * minmax ::= '{' min (',' max?)? '}'
// Parses one factor: an anchor or look-around (dispatched to the process* helpers),
// or an atom optionally followed by a quantifier ('*', '+', '?' or a '{min,max}' form).
// Throws ParseException with the "parser.quantifier.*" keys for a malformed '{...}' quantifier.
// NOTE(review): many lines of this method are not visible in this excerpt.
708 Token parseFactor() throws ParseException {
709 int ch = this.read();
712 case T_CARET: return this.processCaret();
713 case T_DOLLAR: return this.processDollar();
714 case T_LOOKAHEAD: return this.processLookahead();
715 case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
716 case T_LOOKBEHIND: return this.processLookbehind();
717 case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
721 return Token.createEmpty();
// Dispatch on the character stored in chardata (presumably after a backslash token -- confirm).
724 switch (this.chardata) {
725 case 'A': return this.processBacksolidus_A();
726 case 'Z': return this.processBacksolidus_Z();
727 case 'z': return this.processBacksolidus_z();
728 case 'b': return this.processBacksolidus_b();
729 case 'B': return this.processBacksolidus_B();
730 case '<': return this.processBacksolidus_lt();
731 case '>': return this.processBacksolidus_gt();
735 tok = this.parseAtom();
738 case T_STAR: return this.processStar(tok);
739 case T_PLUS: return this.processPlus(tok);
740 case T_QUESTION: return this.processQuestion(tok);
// '{' quantifier: scanned with a local cursor (off) rather than via the tokenizer.
742 if (this.chardata == '{' && this.offset < this.regexlen) {
744 int off = this.offset; // this.offset -> next of '{'
745 int min = 0, max = -1;
747 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
750 while (off < this.regexlen
751 && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
752 min = min*10 +ch-'0';
754 throw ex("parser.quantifier.5", this.offset);
759 throw ex("parser.quantifier.1", this.offset);
765 if (off >= this.regexlen) {
766 throw ex("parser.quantifier.3", this.offset);
768 else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
770 max = ch -'0'; // {min,max}
771 while (off < this.regexlen
772 && (ch = this.regex.charAt(off++)) >= '0'
774 max = max*10 +ch-'0';
776 throw ex("parser.quantifier.5", this.offset);
781 throw ex("parser.quantifier.4", this.offset);
784 else { // assume {min,}
790 throw ex("parser.quantifier.2", this.offset);
// A '?' right after the closing '}' selects createNGClosure instead of createClosure.
793 if (this.checkQuestion(off)) { // off -> next of '}'
794 tok = Token.createNGClosure(tok);
797 tok = Token.createClosure(tok);
803 //System.err.println("CLOSURE: "+min+", "+max);
811 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
812 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
814 * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
// Parses one atom by dispatching on the current token type: groups and "(?..."
// forms, '.', character classes, set operations, escapes and plain characters.
// Throws ParseException "parser.atom.4" for an unescaped ']', '{' or '}' and
// "parser.atom.5" in the \p / \P branch (its guarding condition is not visible here).
// NOTE(review): many case labels and statements are not visible in this excerpt.
816 Token parseAtom() throws ParseException {
817 int ch = this.read();
820 case T_LPAREN: return this.processParen();
821 case T_LPAREN2: return this.processParen2(); // '(?:'
822 case T_CONDITION: return this.processCondition(); // '(?('
823 case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... )
824 case T_INDEPENDENT: return this.processIndependent();
826 this.next(); // Skips '.'
827 tok = Token.token_dot;
831 * char-class ::= '[' ( '^'? range ','?)+ ']'
832 * range ::= '\d' | '\w' | '\s' | category-block | range-char
833 * | range-char '-' range-char
834 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
835 * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
837 case T_LBRACKET: return this.parseCharacterClass(true);
838 case T_SET_OPERATIONS: return this.parseSetOperations();
// Dispatch on the character stored in chardata (presumably after a backslash token -- confirm).
841 switch (this.chardata) {
845 tok = this.getTokenForShorthand(this.chardata);
849 case 'e': case 'f': case 'n': case 'r':
850 case 't': case 'u': case 'v': case 'x':
// decodeEscaped() returns a code point; it becomes either a char token or a
// string built from REUtil.decomposeToSurrogates (the selecting condition is not visible here).
852 int ch2 = this.decodeEscaped();
854 tok = Token.createChar(ch2);
856 tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
861 case 'c': return this.processBacksolidus_c();
862 case 'C': return this.processBacksolidus_C();
863 case 'i': return this.processBacksolidus_i();
864 case 'I': return this.processBacksolidus_I();
865 case 'g': return this.processBacksolidus_g();
866 case 'X': return this.processBacksolidus_X();
867 case '1': case '2': case '3': case '4':
868 case '5': case '6': case '7': case '8': case '9':
869 return this.processBackreference();
873 int pstart = this.offset;
874 tok = processBacksolidus_pP(this.chardata);
876 throw this.ex("parser.atom.5", pstart);
881 tok = Token.createChar(this.chardata);
887 if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}') {
888 throw this.ex("parser.atom.4", this.offset-1);
890 tok = Token.createChar(this.chardata);
891 int high = this.chardata;
// A high surrogate followed by a low-surrogate character token becomes a two-char string.
893 if (REUtil.isHighSurrogate(high)
894 && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
895 char[] sur = new char[2];
897 sur[1] = (char)this.chardata;
898 tok = Token.createParen(Token.createString(new String(sur)), 0);
904 throw this.ex("parser.atom.4", this.offset-1);
// Handles \p{name} and \P{name}. Requires a '{' character token, takes the text up
// to the next '}' as the name, moves offset just past that '}', and returns
// Token.getRange(name, positive, xmlSchemaMode), where positive is true for c == 'p'.
// Throws ParseException "parser.atom.2" when '{' is missing, and "parser.atom.3"
// (presumably when no '}' is found -- the guarding condition is not visible here).
909 protected RangeToken processBacksolidus_pP(int c) throws ParseException {
912 if (this.read() != T_CHAR || this.chardata != '{') {
913 throw this.ex("parser.atom.2", this.offset-1);
916 // handle category escape
917 boolean positive = c == 'p';
918 int namestart = this.offset;
919 int nameend = this.regex.indexOf('}', namestart);
922 throw this.ex("parser.atom.3", this.offset);
925 String pname = this.regex.substring(namestart, nameend);
926 this.offset = nameend+1;
928 return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
// Called from parseCharacterClass for certain escapes inside a character class.
// This implementation ignores both parameters and simply delegates to decodeEscaped().
931 int processCIinCharacterClass(RangeToken tok, int c) {
932 return this.decodeEscaped();
936 * char-class ::= '[' ( '^'? range ','?)+ ']'
937 * range ::= '\d' | '\w' | '\s' | category-block | range-char
938 * | range-char '-' range-char
939 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
940 * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
// Parses a bracketed character class into a RangeToken. The lexer context is set to
// S_INBRACKETS on entry and back to S_NORMAL before the closing ']' is skipped.
// A leading '^' marks the class as negated; in one visible branch a full-range
// "base" (0..Token.UTF16_MAX) is built, and when useNrange is false and the class
// is negated the collected ranges are subtracted from that base.
// Also handles escapes, POSIX "[:name:]" classes, "-[...]" subtraction and "a-b" ranges.
// Throws ParseException with the "parser.cc.*", "parser.atom.5" and "parser.ope.3" keys.
// NOTE(review): many lines of this method are not visible in this excerpt.
942 protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
943 this.setContext(S_INBRACKETS);
945 boolean nrange = false;
946 RangeToken base = null;
948 if (this.read() == T_CHAR && this.chardata == '^') {
952 tok = Token.createNRange();
954 base = Token.createRange();
955 base.addRange(0, Token.UTF16_MAX);
956 tok = Token.createRange();
959 tok = Token.createRange();
// firstloop lets a ']' in the very first position be treated as an ordinary character.
962 boolean firstloop = true;
963 while ((type = this.read()) != T_EOF) {
964 if (type == T_CHAR && this.chardata == ']' && !firstloop) {
967 int c = this.chardata;
969 if (type == T_BACKSOLIDUS) {
974 tok.mergeRanges(this.getTokenForShorthand(c));
980 c = this.processCIinCharacterClass(tok, c);
988 int pstart = this.offset;
989 RangeToken tok2 = this.processBacksolidus_pP(c);
991 throw this.ex("parser.atom.5", pstart);
993 tok.mergeRanges(tok2);
998 c = this.decodeEscaped();
1001 // POSIX Character class such as [:alnum:]
1002 else if (type == T_POSIX_CHARCLASS_START) {
1003 int nameend = this.regex.indexOf(':', this.offset);
1005 throw this.ex("parser.cc.1", this.offset);
1007 boolean positive = true;
1008 if (this.regex.charAt(this.offset) == '^') {
1012 String name = this.regex.substring(this.offset, nameend);
1013 RangeToken range = Token.getRange(name, positive,
1014 this.isSet(RegularExpression.XMLSCHEMA_MODE));
1015 if (range == null) {
1016 throw this.ex("parser.cc.3", this.offset);
1018 tok.mergeRanges(range);
// The name must be followed by ":]"; offset is then moved past both characters.
1020 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']') {
1021 throw this.ex("parser.cc.1", nameend);
1023 this.offset = nameend+2;
// "-[" subtraction: the nested class is parsed recursively and subtracted from tok.
1025 else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
1029 tok = (RangeToken) Token.complementRanges(tok);
1032 base.subtractRanges(tok);
1036 RangeToken range2 = this.parseCharacterClass(false);
1037 tok.subtractRanges(range2);
1038 if (this.read() != T_CHAR || this.chardata != ']') {
1039 throw this.ex("parser.cc.5", this.offset);
1041 break; // Exit this loop
1044 if (!end) { // if not shorthands...
1045 if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
1046 if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
1050 addCaseInsensitiveChar(tok, c);
1053 else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
1054 throw this.ex("parser.cc.8", this.offset-1);
1057 this.next(); // Skips '-'
1058 if ((type = this.read()) == T_EOF) {
1059 throw this.ex("parser.cc.2", this.offset);
// A '-' directly before the closing ']' is a literal hyphen, not a range operator.
1061 if (type == T_CHAR && this.chardata == ']') {
1062 if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
1066 addCaseInsensitiveChar(tok, c);
1068 tok.addRange('-', '-');
1070 int rangeend = this.chardata;
1071 if (type == T_BACKSOLIDUS) {
1072 rangeend = this.decodeEscaped();
1076 throw this.ex("parser.ope.3", this.offset-1);
1078 if (!this.isSet(RegularExpression.IGNORE_CASE) ||
1079 (c > 0xffff && rangeend > 0xffff)) {
1080 tok.addRange(c, rangeend);
1083 addCaseInsensitiveCharRange(tok, c, rangeend);
1088 if (this.isSet(RegularExpression.SPECIAL_COMMA)
1089 && this.read() == T_CHAR && this.chardata == ',') {
1094 if (this.read() == T_EOF) {
1095 throw this.ex("parser.cc.2", this.offset);
1098 if (!useNrange && nrange) {
1099 base.subtractRanges(tok);
1103 tok.compactRanges();
1104 this.setContext(S_NORMAL);
1105 this.next(); // Skips ']'
1111 * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
// Parses the "(?[...] op [...] ...)" form: a first character class followed, until ')'
// is read, by operator/class pairs. '+' merges, '-' subtracts and '&' intersects
// the next class into the accumulated range.
// Throws ParseException "parser.ope.1" when an operator is not followed by '[' and
// "parser.ope.2" for any other token in operator position.
1113 protected RangeToken parseSetOperations() throws ParseException {
1114 RangeToken tok = this.parseCharacterClass(false);
1116 while ((type = this.read()) != T_RPAREN) {
1117 int ch = this.chardata;
1118 if (type == T_CHAR && (ch == '-' || ch == '&')
1119 || type == T_PLUS) {
1121 if (this.read() != T_LBRACKET) {
1122 throw ex("parser.ope.1", this.offset-1);
1124 RangeToken t2 = this.parseCharacterClass(false);
1125 if (type == T_PLUS) {
1126 tok.mergeRanges(t2);
1127 } else if (ch == '-') {
1128 tok.subtractRanges(t2);
1129 } else if (ch == '&') {
1130 tok.intersectRanges(t2);
// Not reachable given the operator check above; kept as an internal assertion.
1132 throw new RuntimeException("ASSERT");
1135 throw ex("parser.ope.2", this.offset-1);
// Maps a shorthand escape character to a token. With USE_UNICODE_CATEGORY set the
// token comes from Token.getRange ("Nd", "IsWord", "IsSpace", positive or negated);
// otherwise a predefined Token constant is used.
// Throws RuntimeException for a character that is not a known shorthand.
// NOTE(review): the case labels are not visible in this excerpt; judging by the
// token names the pairs are presumably \d/\D, \w/\W and \s/\S -- confirm.
1142 Token getTokenForShorthand(int ch) {
1146 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1147 ? Token.getRange("Nd", true) : Token.token_0to9;
1150 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1151 ? Token.getRange("Nd", false) : Token.token_not_0to9;
1154 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1155 ? Token.getRange("IsWord", true) : Token.token_wordchars;
1158 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1159 ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
1162 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1163 ? Token.getRange("IsSpace", true) : Token.token_spaces;
1166 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1167 ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
1171 throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
// Decodes the escape whose backslash token is current and returns the resulting
// character value. Simple escapes (\e \f \n \r \t) map to fixed code points; the
// remaining branches read hexadecimal digits through hexChar(), including a
// brace-delimited form that must end with '}'.
// Throws ParseException: "parser.next.1" when the current token is not a backslash,
// "parser.descape.1"/"parser.descape.2"/"parser.descape.3" for malformed hex input,
// "parser.descape.4" when the value exceeds Token.UTF16_MAX, and "parser.descape.5".
// The second overflow check used to pass the misspelled key "parser.descappe.4";
// it now uses the same key as the first check, so the message lookup in ex() finds it.
// NOTE(review): many lines of this method are not visible in this excerpt.
1178 int decodeEscaped() throws ParseException {
1179 if (this.read() != T_BACKSOLIDUS) {
1180 throw ex("parser.next.1", this.offset-1);
1182 int c = this.chardata;
1184 case 'e': c = 0x1b; break; // ESCAPE U+001B
1185 case 'f': c = '\f'; break; // FORM FEED U+000C
1186 case 'n': c = '\n'; break; // LINE FEED U+000A
1187 case 'r': c = '\r'; break; // CARRIAGE RETURN U+000D
1188 case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009
1189 //case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B
1192 if (this.read() != T_CHAR) {
1193 throw ex("parser.descape.1", this.offset-1);
// Brace-delimited hex form: digits are read until '}' and the value is range-checked.
1195 if (this.chardata == '{') {
1200 if (this.read() != T_CHAR) {
1201 throw ex("parser.descape.1", this.offset-1);
1203 if ((v1 = hexChar(this.chardata)) < 0) {
1207 throw ex("parser.descape.2", this.offset-1);
1211 if (this.chardata != '}') {
1212 throw ex("parser.descape.3", this.offset-1);
1214 if (uv > Token.UTF16_MAX) {
1215 throw ex("parser.descape.4", this.offset-1);
// Fixed-length hex forms: each digit must be a character token accepted by hexChar().
1220 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1221 throw ex("parser.descape.1", this.offset-1);
1225 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1226 throw ex("parser.descape.1", this.offset-1);
1236 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1237 throw ex("parser.descape.1", this.offset-1);
1241 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1242 throw ex("parser.descape.1", this.offset-1);
1246 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1247 throw ex("parser.descape.1", this.offset-1);
1251 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1252 throw ex("parser.descape.1", this.offset-1);
1260 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1261 throw ex("parser.descape.1", this.offset-1);
1265 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1266 throw ex("parser.descape.1", this.offset-1);
1270 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1271 throw ex("parser.descape.1", this.offset-1);
1275 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1276 throw ex("parser.descape.1", this.offset-1);
1280 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1281 throw ex("parser.descape.1", this.offset-1);
1285 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
1286 throw ex("parser.descape.1", this.offset-1);
// Same overflow condition as the brace form above, so it reports the same message key.
1289 if (uv > Token.UTF16_MAX) {
1290 throw ex("parser.descape.4", this.offset-1);
1297 throw ex("parser.descape.5", this.offset-2);
// Helper used by decodeEscaped(), which treats a negative return value as "not a hex digit".
// NOTE(review): the body is not visible in this excerpt; presumably it returns the
// numeric value of a hexadecimal digit character -- confirm.
1303 static private final int hexChar(int ch) {
// Adds case-insensitive equivalents of c to tok: looks up CaseInsensitiveMap.get(c)
// and, when a map exists, adds every even-indexed entry as a single-character range.
// NOTE(review): the odd-indexed entries are skipped by the i+=2 step; their meaning
// is not visible here. Other lines of the body are also not visible in this excerpt.
1325 static protected final void addCaseInsensitiveChar(RangeToken tok, int c) {
1326 final int[] caseMap = CaseInsensitiveMap.get(c);
1329 if (caseMap != null) {
1330 for (int i=0; i<caseMap.length; i+=2) {
1331 tok.addRange(caseMap[i], caseMap[i]);
// Adds a character range to tok together with case-insensitive equivalents: the
// range r1..r2 itself is added, then for every character in it the even-indexed
// entries of CaseInsensitiveMap.get(ch) are added as single-character ranges.
// NOTE(review): how r1 and r2 are derived from start and end is not visible in this excerpt.
1337 static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) {
1348 tok.addRange(r1, r2);
1349 for (int ch = r1; ch <= r2; ch++) {
1350 caseMap = CaseInsensitiveMap.get(ch);
1351 if (caseMap != null) {
1352 for (int i=0; i<caseMap.length; i+=2) {
1353 tok.addRange(caseMap[i], caseMap[i]);