BUG-5410: initial import of Xerces RegularExpression

[yangtools.git] / third-party / xsd-regex / src / main / java / org / opendaylight / yangtools / xsd / regex / RegexParser.java
diff --git a/third-party/xsd-regex/src/main/java/org/opendaylight/yangtools/xsd/regex/RegexParser.java b/third-party/xsd-regex/src/main/java/org/opendaylight/yangtools/xsd/regex/RegexParser.java

new file mode 100644 (file)

index 0000000..92c339c
--- /dev/null
+++ b/third-party/xsd-regex/src/main/java/org/opendaylight/yangtools/xsd/regex/RegexParser.java
@@ -0,0 +1,1232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.opendaylight.yangtools.xsd.regex;
+
+import java.util.Locale;
+import java.util.MissingResourceException;
+import java.util.ResourceBundle;
+import java.util.Vector;
+
+/**
+ * A Regular Expression Parser.
+ * 
+ * @xerces.internal
+ *
+ * @version $Id: RegexParser.java 1129306 2011-05-30 19:18:04Z sandygao $
+ */
+class RegexParser {
+    static final int T_CHAR = 0;                // token types stored in nexttoken by next(): an ordinary character, value in chardata
+    static final int T_EOF = 1;                 // end of the pattern has been reached
+    static final int T_OR = 2;                  // '|'
+    static final int T_STAR = 3;                // '*'
+    static final int T_PLUS = 4;                // '+'
+    static final int T_QUESTION = 5;            // '?'
+    static final int T_LPAREN = 6;              // '('
+    static final int T_RPAREN = 7;              // ')'
+    static final int T_DOT = 8;                 // '.'
+    static final int T_LBRACKET = 9;            // '['
+    static final int T_BACKSOLIDUS = 10;        // '\' (the escaped character is left in chardata)
+    static final int T_CARET = 11;              // '^'
+    static final int T_DOLLAR = 12;             // '$'
+    static final int T_LPAREN2 = 13;            // '(?:'
+    static final int T_LOOKAHEAD = 14;          // '(?='
+    static final int T_NEGATIVELOOKAHEAD = 15;  // '(?!'
+    static final int T_LOOKBEHIND = 16;         // '(?<='
+    static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
+    static final int T_INDEPENDENT = 18;        // '(?>'
+    static final int T_SET_OPERATIONS = 19;     // '(?['
+    static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
+    static final int T_COMMENT = 21;            // '(?#'
+    static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
+    static final int T_CONDITION = 23;          // '(?('
+    static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
+
+    static class ReferencePosition {
+        int refNumber;
+        int position;
+        ReferencePosition(int n, int pos) {
+            this.refNumber = n;
+            this.position = pos;
+        }
+    }
+
+    int offset;
+    String regex;
+    int regexlen;
+    int options;
+    ResourceBundle resources;
+    int chardata;
+    int nexttoken;
+    static protected final int S_NORMAL = 0;
+    static protected final int S_INBRACKETS = 1;
+    static protected final int S_INXBRACKETS = 2;
+    int context = S_NORMAL;
+    int parenOpened = 1;
+    int parennumber = 1;
+    boolean hasBackReferences;
+    Vector references = null;
+
+    /**
+     * Creates a parser that reports errors in the JVM default locale.
+     */
+    public RegexParser() {
+        this(Locale.getDefault());
+    }
+    /**
+     * Creates a parser that reports errors in the given locale.
+     *
+     * @param locale locale handed to {@link #setLocale(Locale)}; may be {@code null}
+     */
+    public RegexParser(Locale locale) {
+        setLocale(locale);
+    }
+
+    public void setLocale(Locale locale) {
+        try {
+            if (locale != null) {
+                this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message", locale);
+            }
+            else {
+                this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message");
+            }
+        } 
+        catch (MissingResourceException mre) {
+            throw new RuntimeException("Installation Problem???  Couldn't load messages: "
+                                       + mre.getMessage());
+        }
+    }
+
+    final ParseException ex(String key, int loc) {
+        return new ParseException(this.resources.getString(key), loc);
+    }
+
+    protected final boolean isSet(int flag) {
+        return (this.options & flag) == flag;
+    }
+
+    synchronized Token parse(String regex, int options) throws ParseException {
+        this.options = options;
+        this.offset = 0;
+        this.setContext(S_NORMAL);
+        this.parennumber = 1;
+        this.parenOpened = 1;
+        this.hasBackReferences = false;
+        this.regex = regex;
+        if (this.isSet(RegularExpression.EXTENDED_COMMENT))
+            this.regex = REUtil.stripExtendedComment(this.regex);
+        this.regexlen = this.regex.length();
+
+
+        this.next();
+        Token ret = this.parseRegex();
+        if (this.offset != this.regexlen)
+            throw ex("parser.parse.1", this.offset);
+        if (this.read() != T_EOF) {
+            throw ex("parser.parse.1", this.offset-1);
+        }
+        if (this.references != null) {
+            for (int i = 0;  i < this.references.size();  i ++) {
+                ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
+                if (this.parennumber <= position.refNumber)
+                    throw ex("parser.parse.2", position.position);
+            }
+            this.references.removeAllElements();
+        }
+        return ret;
+    }
+
+    /*
+    public RegularExpression createRegex(String regex, int options) throws ParseException {
+        Token tok = this.parse(regex, options);
+        return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
+    }
+    */
+
+    protected final void setContext(int con) {
+        this.context = con;
+    }
+
+    final int read() {
+        return this.nexttoken;
+    }
+    /**
+     * Scans the next token of the pattern, starting at {@code offset}.
+     *
+     * The token type is stored in {@code nexttoken} (see {@code read()}) and
+     * the character belonging to it in {@code chardata}; {@code offset} is
+     * advanced past everything that was consumed. At the end of the pattern
+     * the token is {@code T_EOF} and {@code chardata} is -1.
+     *
+     * In the {@code S_INBRACKETS} context only backslash, "-[" , "[:" and
+     * surrogate pairs get special treatment; everything else is a
+     * {@code T_CHAR}. In any other context the regex metacharacters and the
+     * various "(?" group openers are recognised.
+     *
+     * Malformed input is reported by throwing the exception built by
+     * {@code ex(...)}; this method declares no checked exception, so
+     * ParseException is presumably unchecked -- confirm against its
+     * declaration.
+     */
+    final void next() {
+        if (this.offset >= this.regexlen) { // nothing left to scan
+            this.chardata = -1;
+            this.nexttoken = T_EOF;
+            return;
+        }
+
+        int ret;
+        int ch = this.regex.charAt(this.offset++); // consume one UTF-16 code unit
+        this.chardata = ch;
+
+        if (this.context == S_INBRACKETS) {
+            // In a character class, this.chardata has one character, that is to say,
+            // a pair of surrogates is composed and stored to this.chardata.
+            switch (ch) {
+              case '\\':
+                ret = T_BACKSOLIDUS;
+                if (this.offset >= this.regexlen)
+                    throw ex("parser.next.1", this.offset-1);
+                this.chardata = this.regex.charAt(this.offset++); // the escaped character
+                break;
+
+              case '-':
+                // Allow character class subtraction (regardless of whether we are in
+                // XML Schema mode or not)
+                if (this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
+                    this.offset++;
+                    ret = T_XMLSCHEMA_CC_SUBTRACTION;
+                } else
+                    ret = T_CHAR;
+                break;
+
+              case '[':
+                if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
+                    && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
+                    this.offset++;
+                    ret = T_POSIX_CHARCLASS_START;
+                    break;
+                } // falls through to default: a plain '['
+              default:
+                if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
+                    int low = this.regex.charAt(this.offset); // peek, consumed only if it completes the pair
+                    if (REUtil.isLowSurrogate(low)) {
+                        this.chardata = REUtil.composeFromSurrogates(ch, low);
+                        this.offset ++;
+                    }
+                }
+                ret = T_CHAR;
+            }
+            this.nexttoken = ret;
+            return;
+        }
+
+        switch (ch) {
+          case '|': ret = T_OR;             break;
+          case '*': ret = T_STAR;           break;
+          case '+': ret = T_PLUS;           break;
+          case '?': ret = T_QUESTION;       break;
+          case ')': ret = T_RPAREN;         break;
+          case '.': ret = T_DOT;            break;
+          case '[': ret = T_LBRACKET;       break;
+          case '^':
+              if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
+                  ret = T_CHAR; // not an anchor in XML Schema mode
+              }
+              else {
+                  ret = T_CARET;
+              }
+              break;
+          case '$': 
+              if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
+                  ret = T_CHAR; // not an anchor in XML Schema mode
+              }
+              else {
+                  ret = T_DOLLAR;
+              }
+              break;
+          case '(':
+            ret = T_LPAREN;
+            if (this.offset >= this.regexlen)
+                break;
+            if (this.regex.charAt(this.offset) != '?') // plain capturing group
+                break;
+            if (++this.offset >= this.regexlen) // skip the '?'; something must follow it
+                throw ex("parser.next.2", this.offset-1);
+            ch = this.regex.charAt(this.offset++);
+            switch (ch) {
+              case ':':  ret = T_LPAREN2;            break;
+              case '=':  ret = T_LOOKAHEAD;          break;
+              case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
+              case '[':  ret = T_SET_OPERATIONS;     break;
+              case '>':  ret = T_INDEPENDENT;        break;
+              case '<':
+                if (this.offset >= this.regexlen)
+                    throw ex("parser.next.2", this.offset-3);
+                ch = this.regex.charAt(this.offset++);
+                if (ch == '=') {
+                    ret = T_LOOKBEHIND;
+                } else if (ch == '!') {
+                    ret = T_NEGATIVELOOKBEHIND;
+                } else
+                    throw ex("parser.next.3", this.offset-3);
+                break;
+              case '#':
+                while (this.offset < this.regexlen) { // skip the comment text up to and including ')'
+                    ch = this.regex.charAt(this.offset++);
+                    if (ch == ')')  break;
+                }
+                if (ch != ')') // pattern ended inside the comment
+                    throw ex("parser.next.4", this.offset-1);
+                ret = T_COMMENT;
+                break;
+              default:
+                if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
+                    this.offset --; // leave the first modifier character unread
+                    ret = T_MODIFIERS;
+                    break;
+                } else if (ch == '(') {         // conditional
+                    ret = T_CONDITION;          // this.offset now points just past the second '('.
+                    break;
+                }
+                throw ex("parser.next.2", this.offset-2);
+            }
+            break;
+            
+          case '\\':
+            ret = T_BACKSOLIDUS;
+            if (this.offset >= this.regexlen)
+                throw ex("parser.next.1", this.offset-1);
+            this.chardata = this.regex.charAt(this.offset++); // the escaped character
+            break;
+
+          default:
+            ret = T_CHAR;
+        }
+        this.nexttoken = ret;
+    }
+
+    /**
+     * regex ::= term (`|` term)*
+     * term ::= factor+
+     * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
+     *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
+     *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
+     * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
+     *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block 
+     */
+    Token parseRegex() throws ParseException {
+        Token tok = this.parseTerm();
+        Token parent = null;
+        while (this.read() == T_OR) {
+            this.next();                    // '|'
+            if (parent == null) {
+                parent = Token.createUnion();
+                parent.addChild(tok);
+                tok = parent;
+            }
+            tok.addChild(this.parseTerm());
+        }
+        return tok;
+    }
+
+    /**
+     * term ::= factor+
+     */
+    Token parseTerm() throws ParseException {
+        int ch = this.read();
+        if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
+            return Token.createEmpty();
+        } else {
+            Token tok = this.parseFactor();
+            Token concat = null;
+            while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
+                if (concat == null) {
+                    concat = Token.createConcat();
+                    concat.addChild(tok);
+                    tok = concat;
+                }
+                concat.addChild(this.parseFactor());
+                //tok = Token.createConcat(tok, this.parseFactor());
+            }
+            return tok;
+        }
+    }
+
+    // ----------------------------------------------------------------
+
+    Token processCaret() throws ParseException {
+        this.next();
+        return Token.token_linebeginning;
+    }
+    Token processDollar() throws ParseException {
+        this.next();
+        return Token.token_lineend;
+    }
+    Token processLookahead() throws ParseException {
+        this.next();
+        Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
+        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
+        this.next();                            // ')'
+        return tok;
+    }
+    Token processNegativelookahead() throws ParseException {
+        this.next();
+        Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
+        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
+        this.next();                            // ')'
+        return tok;
+    }
+    Token processLookbehind() throws ParseException {
+        this.next();
+        Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
+        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
+        this.next();                            // ')'
+        return tok;
+    }
+    Token processNegativelookbehind() throws ParseException {
+        this.next();
+        Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
+        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
+        this.next();                    // ')'
+        return tok;
+    }
+    Token processBacksolidus_A() throws ParseException {
+        this.next();
+        return Token.token_stringbeginning;
+    }
+    Token processBacksolidus_Z() throws ParseException {
+        this.next();
+        return Token.token_stringend2;
+    }
+    Token processBacksolidus_z() throws ParseException {
+        this.next();
+        return Token.token_stringend;
+    }
+    Token processBacksolidus_b() throws ParseException {
+        this.next();
+        return Token.token_wordedge;
+    }
+    Token processBacksolidus_B() throws ParseException {
+        this.next();
+        return Token.token_not_wordedge;
+    }
+    Token processBacksolidus_lt() throws ParseException {
+        this.next();
+        return Token.token_wordbeginning;
+    }
+    Token processBacksolidus_gt() throws ParseException {
+        this.next();
+        return Token.token_wordend;
+    }
+    /**
+     * Applies a '*' quantifier to {@code tok}; a directly following '?'
+     * selects the non-greedy form.
+     *
+     * @param tok the quantified atom
+     * @return closure token wrapping {@code tok}
+     */
+    Token processStar(Token tok) throws ParseException {
+        next();                                 // '*'
+        if (read() != T_QUESTION) {
+            return Token.createClosure(tok);
+        }
+        next();                                 // '?'
+        return Token.createNGClosure(tok);
+    }
+    Token processPlus(Token tok) throws ParseException {
+        // X+ -> XX*
+        this.next();
+        if (this.read() == T_QUESTION) {
+            this.next();
+            return Token.createConcat(tok, Token.createNGClosure(tok));
+        } else
+            return Token.createConcat(tok, Token.createClosure(tok));
+    }
+    Token processQuestion(Token tok) throws ParseException {
+        // X? -> X|
+        this.next();
+        Token par = Token.createUnion();
+        if (this.read() == T_QUESTION) {
+            this.next();
+            par.addChild(Token.createEmpty());
+            par.addChild(tok);
+        } else {
+            par.addChild(tok);
+            par.addChild(Token.createEmpty());
+        }
+        return par;
+    }
+    /**
+     * Tells whether the pattern has a '?' at the given index.
+     *
+     * @param off index into the pattern; may be at or past its end
+     * @return {@code true} if {@code off} is inside the pattern and holds '?'
+     */
+    boolean checkQuestion(int off) {
+        if (off >= regexlen) {
+            return false;
+        }
+        return regex.charAt(off) == '?';
+    }
+    Token processParen() throws ParseException {
+        this.next();
+        int p = this.parenOpened++;
+        Token tok = Token.createParen(this.parseRegex(), p);
+        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
+        this.parennumber++;
+        this.next();                            // Skips ')'
+        return tok;
+    }
+    Token processParen2() throws ParseException {
+        this.next();
+        Token tok = Token.createParen(this.parseRegex(), 0);
+        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
+        this.next();                            // Skips ')'
+        return tok;
+    }
+    /**
+     * Parses a conditional group '(?(' condition ')' yes-pattern ('|' no-pattern)? ')'.
+     *
+     * The condition is either a group number or a factor that must turn out
+     * to be a look-around or an anchor token. On entry {@code offset} points
+     * just past the second '('.
+     *
+     * @return the condition token
+     * @throws ParseException if the group is truncated, the condition has an
+     *         unsupported form, the referenced group does not exist, there are
+     *         more than two alternatives, or a ')' is missing
+     */
+    Token processCondition() throws ParseException {
+                                                // this.offset points the next of '('
+        if (this.offset+1 >= this.regexlen)  throw ex("parser.factor.4", this.offset);
+                                                // Parses a condition.
+        int refno = -1;
+        Token condition = null;
+        int ch = this.regex.charAt(this.offset);
+        if ('1' <= ch && ch <= '9') {
+            refno = ch-'0';
+            int finalRefno = refno;
+            
+            if (this.parennumber <= refno)
+                throw ex("parser.parse.2", this.offset);
+
+            // Take further digits only while the number still names a closed group.
+            while (this.offset + 1 < this.regexlen) {
+                ch = this.regex.charAt(this.offset + 1);
+                if ('0' <= ch && ch <= '9') {
+                    refno = (refno * 10) + (ch - '0');
+                    if (refno < this.parennumber) {
+                        finalRefno= refno;
+                        ++this.offset;
+                    }
+                    else {
+                        break;
+                    }
+                }
+                else {
+                    break;
+                }
+            }
+
+            this.hasBackReferences = true;
+            if (this.references == null)  this.references = new Vector();
+            this.references.addElement(new ReferencePosition(finalRefno, this.offset));
+            this.offset ++;
+            // The pattern may end right after a multi-digit number; report that as a
+            // missing ')' instead of running past the end of the string.
+            if (this.offset >= this.regexlen || this.regex.charAt(this.offset) != ')')
+                throw ex("parser.factor.1", this.offset);
+            this.offset ++;
+        } else {
+            if (ch == '?')  this.offset --; // Points '('.
+            this.next();
+            condition = this.parseFactor();
+            switch (condition.type) {
+              case Token.LOOKAHEAD:
+              case Token.NEGATIVELOOKAHEAD:
+              case Token.LOOKBEHIND:
+              case Token.NEGATIVELOOKBEHIND:
+                break;
+              case Token.ANCHOR:
+                if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
+                break;
+              default:
+                throw ex("parser.factor.5", this.offset);
+            }
+        }
+                                                // Parses yes/no-patterns.
+        this.next();
+        Token yesPattern = this.parseRegex();
+        Token noPattern = null;
+        if (yesPattern.type == Token.UNION) {
+            // "yes|no": exactly two alternatives are allowed.
+            if (yesPattern.size() != 2)  throw ex("parser.factor.6", this.offset);
+            noPattern = yesPattern.getChild(1);
+            yesPattern = yesPattern.getChild(0);
+        }
+        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
+        this.next();
+        return Token.createCondition(refno, condition, yesPattern, noPattern);
+    }
+    Token processModifiers() throws ParseException {
+                                                // this.offset points the next of '?'.
+                                                // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
+        int add = 0, mask = 0, ch = -1;
+        while (this.offset < this.regexlen) {
+            ch = this.regex.charAt(this.offset);
+            int v = REUtil.getOptionValue(ch);
+            if (v == 0)  break;                 // '-' or ':'?
+            add |= v;
+            this.offset ++;
+        }
+        if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
+        if (ch == '-') {
+            this.offset ++;
+            while (this.offset < this.regexlen) {
+                ch = this.regex.charAt(this.offset);
+                int v = REUtil.getOptionValue(ch);
+                if (v == 0)  break;             // ':'?
+                mask |= v;
+                this.offset ++;
+            }
+            if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
+        }
+        Token tok;
+        if (ch == ':') {
+            this.offset ++;
+            this.next();
+            tok = Token.createModifierGroup(this.parseRegex(), add, mask);
+            if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
+            this.next();
+        } else if (ch == ')') {                 // such as (?-i)
+            this.offset ++;
+            this.next();
+            tok = Token.createModifierGroup(this.parseRegex(), add, mask);
+        } else
+            throw ex("parser.factor.3", this.offset);
+
+        return tok;
+    }
+    Token processIndependent() throws ParseException {
+        this.next();
+        Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
+        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
+        this.next();                            // Skips ')'
+        return tok;
+    }
+    Token processBacksolidus_c() throws ParseException {
+        int ch2;                                // Must be in 0x0040-0x005f
+        if (this.offset >= this.regexlen
+            || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
+            throw ex("parser.atom.1", this.offset-1);
+        this.next();
+        return Token.createChar(ch2-0x40);
+    }
+    Token processBacksolidus_C() throws ParseException {
+        throw ex("parser.process.1", this.offset);
+    }
+    Token processBacksolidus_i() throws ParseException {
+        Token tok = Token.createChar('i');
+        this.next();
+        return tok;
+    }
+    Token processBacksolidus_I() throws ParseException {
+        throw ex("parser.process.1", this.offset);
+    }
+    Token processBacksolidus_g() throws ParseException {
+        this.next();
+        return Token.getGraphemePattern();
+    }
+    Token processBacksolidus_X() throws ParseException {
+        this.next();
+        return Token.getCombiningCharacterSequence();
+    }
+    Token processBackreference() throws ParseException {
+        int refnum = this.chardata-'0';
+        int finalRefnum = refnum;
+
+        if (this.parennumber <= refnum)
+            throw ex("parser.parse.2", this.offset-2);
+
+        while  (this.offset < this.regexlen) {
+            final int ch = this.regex.charAt(this.offset);
+            if ('0' <= ch && ch <= '9') {
+                refnum = (refnum * 10) + (ch - '0');
+                if (refnum < this.parennumber) {
+                    ++this.offset;
+                    finalRefnum = refnum;
+                    this.chardata = ch;
+                }
+                else {
+                    break;
+                }
+            }
+            else {
+                break;
+            }
+        }
+
+        Token tok = Token.createBackReference(finalRefnum);
+        this.hasBackReferences = true;
+        if (this.references == null)  this.references = new Vector();
+        this.references.addElement(new ReferencePosition(finalRefnum, this.offset-2));
+        this.next();
+        return tok;
+    }
+
+    // ----------------------------------------------------------------
+
+    /**
+     * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
+     *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
+     *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
+     *            | '(?#' [^)]* ')'
+     * minmax ::= '{' min (',' max?)? '}'
+     * min ::= [0-9]+
+     * max ::= [0-9]+
+     */
+    Token parseFactor() throws ParseException {        
+        int ch = this.read();
+        Token tok;
+        switch (ch) {
+          case T_CARET:         return this.processCaret();
+          case T_DOLLAR:        return this.processDollar();
+          case T_LOOKAHEAD:     return this.processLookahead();
+          case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
+          case T_LOOKBEHIND:    return this.processLookbehind();
+          case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
+
+          case T_COMMENT:
+            this.next();
+            return Token.createEmpty();
+
+          case T_BACKSOLIDUS:
+            switch (this.chardata) {
+              case 'A': return this.processBacksolidus_A();
+              case 'Z': return this.processBacksolidus_Z();
+              case 'z': return this.processBacksolidus_z();
+              case 'b': return this.processBacksolidus_b();
+              case 'B': return this.processBacksolidus_B();
+              case '<': return this.processBacksolidus_lt();
+              case '>': return this.processBacksolidus_gt();
+            }
+                                                // through down
+        }
+        tok = this.parseAtom();
+        ch = this.read();
+        switch (ch) {
+          case T_STAR:  return this.processStar(tok);
+          case T_PLUS:  return this.processPlus(tok);
+          case T_QUESTION: return this.processQuestion(tok);
+          case T_CHAR:
+            if (this.chardata == '{' && this.offset < this.regexlen) {
+
+                int off = this.offset;          // this.offset -> next of '{'
+                int min = 0, max = -1;
+
+                if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
+
+                    min = ch -'0';
+                    while (off < this.regexlen
+                           && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
+                        min = min*10 +ch-'0';
+                        if (min < 0)
+                            throw ex("parser.quantifier.5", this.offset);
+                    }
+                }
+                else {
+                    throw ex("parser.quantifier.1", this.offset);
+                }
+
+                max = min;
+                if (ch == ',') {
+
+                   if (off >= this.regexlen) {
+                       throw ex("parser.quantifier.3", this.offset);
+                   }
+                   else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {                       
+
+                        max = ch -'0';       // {min,max}
+                        while (off < this.regexlen
+                               && (ch = this.regex.charAt(off++)) >= '0'
+                               && ch <= '9') {
+                            max = max*10 +ch-'0';
+                            if (max < 0)
+                                throw ex("parser.quantifier.5", this.offset);
+                        }
+
+                        if (min > max)
+                            throw ex("parser.quantifier.4", this.offset);
+                   }
+                   else { // assume {min,}
+                        max = -1;           
+                    }
+                }
+
+               if (ch != '}')
+                   throw ex("parser.quantifier.2", this.offset);
+
+               if (this.checkQuestion(off)) {  // off -> next of '}'
+                    tok = Token.createNGClosure(tok);
+                    this.offset = off+1;
+                } else {
+                    tok = Token.createClosure(tok);
+                    this.offset = off;
+                }
+
+                tok.setMin(min);
+                tok.setMax(max);
+                //System.err.println("CLOSURE: "+min+", "+max);
+                this.next();
+            }
+        }
+        return tok;
+    }
+
+    /**
+     * Parses one atom and leaves the scanner on the token that follows it.
+     *
+     * Group openers, character classes and most escapes are delegated to
+     * their own methods; plain and escaped characters are turned into char
+     * tokens here. A high/low surrogate pair written as two plain characters
+     * becomes a single string token wrapped in a paren token with number 0.
+     *
+     * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
+     *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
+     *          | '(?>' regex ')'
+     * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
+     *
+     * Fails with parser.atom.4 when the current token cannot start an atom,
+     * and with parser.atom.5 when a '\p'/'\P' block yields no range.
+     */
+    Token parseAtom() throws ParseException {
+        int ch = this.read();
+        Token tok = null;
+        switch (ch) {
+          case T_LPAREN:        return this.processParen();
+          case T_LPAREN2:       return this.processParen2(); // '(?:'
+          case T_CONDITION:     return this.processCondition(); // '(?('
+          case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
+          case T_INDEPENDENT:   return this.processIndependent();
+          case T_DOT:
+            this.next();                    // Skips '.'
+            tok = Token.token_dot;
+            break;
+
+            /**
+             * char-class ::= '[' ( '^'? range ','?)+ ']'
+             * range ::= '\d' | '\w' | '\s' | category-block | range-char
+             *           | range-char '-' range-char
+             * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
+             * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
+             */
+          case T_LBRACKET:      return this.parseCharacterClass(true);
+          case T_SET_OPERATIONS: return this.parseSetOperations();
+
+          case T_BACKSOLIDUS:
+            switch (this.chardata) {
+              case 'd':  case 'D':
+              case 'w':  case 'W':
+              case 's':  case 'S':
+                tok = this.getTokenForShorthand(this.chardata);
+                this.next();
+                return tok;
+
+              case 'e':  case 'f':  case 'n':  case 'r':
+              case 't':  case 'u':  case 'v':  case 'x':
+                {
+                    int ch2 = this.decodeEscaped();
+                    if (ch2 < 0x10000) {
+                        tok = Token.createChar(ch2);
+                    } else { // does not fit into one char: emit the surrogate pair as a string
+                        tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
+                    }
+                }
+                break;
+
+              case 'c': return this.processBacksolidus_c();
+              case 'C': return this.processBacksolidus_C();
+              case 'i': return this.processBacksolidus_i();
+              case 'I': return this.processBacksolidus_I();
+              case 'g': return this.processBacksolidus_g();
+              case 'X': return this.processBacksolidus_X();
+              case '1':  case '2':  case '3':  case '4':
+              case '5':  case '6':  case '7':  case '8':  case '9':
+                return this.processBackreference();
+
+              case 'P':
+              case 'p':
+                int pstart = this.offset; // remembered for the error position
+                tok = processBacksolidus_pP(this.chardata);
+                if (tok == null)  throw this.ex("parser.atom.5", pstart);
+                break;
+
+              default:
+                tok = Token.createChar(this.chardata); // any other escaped character stands for itself
+            }
+            this.next();
+            break;
+
+          case T_CHAR:
+            if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
+                throw this.ex("parser.atom.4", this.offset-1); // these characters are rejected as bare atoms
+            tok = Token.createChar(this.chardata);
+            int high = this.chardata;
+            this.next();
+            if (REUtil.isHighSurrogate(high)
+                && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
+                char[] sur = new char[2];
+                sur[0] = (char)high;
+                sur[1] = (char)this.chardata;
+                tok = Token.createParen(Token.createString(new String(sur)), 0);
+                this.next();
+            }
+            break;
+
+          default:
+            throw this.ex("parser.atom.4", this.offset-1);
+        }
+        return tok;
+    }
+
+    protected RangeToken processBacksolidus_pP(int c) throws ParseException {
+
+        this.next();
+        if (this.read() != T_CHAR || this.chardata != '{')
+            throw this.ex("parser.atom.2", this.offset-1);
+
+        // handle category escape
+        boolean positive = c == 'p';
+        int namestart = this.offset;
+        int nameend = this.regex.indexOf('}', namestart);
+
+        if (nameend < 0)
+            throw this.ex("parser.atom.3", this.offset);
+
+        String pname = this.regex.substring(namestart, nameend);
+        this.offset = nameend+1;
+
+        return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
+    }
+
+    int processCIinCharacterClass(RangeToken tok, int c) {
+        return this.decodeEscaped();
+    }
+
+    /**
+     * Parses a bracketed character class and returns it as a single range token.
+     *
+     * char-class ::= '[' ( '^'? range ','?)+ ']'
+     * range ::= '\d' | '\w' | '\s' | category-block | range-char
+     *           | range-char '-' range-char
+     * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
+     * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
+     *
+     * Besides the grammar above, the loop also accepts POSIX-style classes
+     * (<code>[:name:]</code> / <code>[:^name:]</code>) and a trailing subtraction of a
+     * nested character class.
+     *
+     * @param useNrange when the class starts with '^': if <code>true</code> the result is
+     *                  built with <code>Token.createNRange()</code>; if <code>false</code>
+     *                  the parsed ranges are instead subtracted from a range covering
+     *                  <code>0..Token.UTF16_MAX</code>
+     * @return the parsed range, sorted and compacted
+     * @throws ParseException on an unterminated or malformed class
+     */
+    protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
+        // Tokenize with the in-brackets rules until the closing ']'.
+        this.setContext(S_INBRACKETS);
+        this.next();                            // '['
+        boolean nrange = false;
+        // Only used for a negated class when useNrange is false: the "everything" range
+        // from which the parsed ranges are subtracted.
+        RangeToken base = null;
+        RangeToken tok;
+        if (this.read() == T_CHAR && this.chardata == '^') {
+            nrange = true;
+            this.next();                        // '^'
+            if (useNrange) {
+                tok = Token.createNRange();
+            } else {
+                base = Token.createRange();
+                base.addRange(0, Token.UTF16_MAX);
+                tok = Token.createRange();
+            }
+        } else {
+            tok = Token.createRange();
+        }
+        int type;
+        // While true, a ']' is taken as a literal member instead of closing the class.
+        boolean firstloop = true;
+        while ((type = this.read()) != T_EOF) {
+            if (type == T_CHAR && this.chardata == ']' && !firstloop)
+                break;
+            int c = this.chardata;
+            // Set when the current item has already been merged into tok and therefore
+            // cannot act as the start of an 'a-b' range.
+            boolean end = false;
+            if (type == T_BACKSOLIDUS) {
+                switch (c) {
+                  case 'd':  case 'D':
+                  case 'w':  case 'W':
+                  case 's':  case 'S':
+                    tok.mergeRanges(this.getTokenForShorthand(c));
+                    end = true;
+                    break;
+
+                  case 'i':  case 'I':
+                  case 'c':  case 'C':
+                    c = this.processCIinCharacterClass(tok, c);
+                    // A negative result means there is no single character left to add.
+                    if (c < 0)  end = true;
+                    break;
+                    
+                  case 'p':
+                  case 'P':
+                    int pstart = this.offset;
+                    RangeToken tok2 = this.processBacksolidus_pP(c);
+                    if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
+                    tok.mergeRanges(tok2);
+                    end = true;
+                    break;
+
+                  default:
+                    // Any other escape is decoded to the character it denotes.
+                    c = this.decodeEscaped();
+                } // \ + c
+            } // backsolidus
+                                                // POSIX Character class such as [:alnum:]
+            else if (type == T_POSIX_CHARCLASS_START) {
+                int nameend = this.regex.indexOf(':', this.offset);
+                if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
+                boolean positive = true;
+                // A leading '^' inside the POSIX class negates it: [:^name:]
+                if (this.regex.charAt(this.offset) == '^') {
+                    this.offset ++;
+                    positive = false;
+                }
+                String name = this.regex.substring(this.offset, nameend);
+                RangeToken range = Token.getRange(name, positive,
+                                                  this.isSet(RegularExpression.XMLSCHEMA_MODE));
+                if (range == null)  throw this.ex("parser.cc.3", this.offset);
+                tok.mergeRanges(range);
+                end = true;
+                // The closing ':' must be followed immediately by ']'.
+                if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
+                    throw this.ex("parser.cc.1", nameend);
+                this.offset = nameend+2;
+            }
+            // Subtraction of a nested class; not taken for the first item, in which case
+            // the token falls through and is handled as an ordinary character below.
+            else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
+                if (nrange) {
+                    // Resolve the pending negation first so the subtraction applies to
+                    // the already-negated set.
+                    nrange = false;
+                    if (useNrange) {
+                        tok = (RangeToken) Token.complementRanges(tok);
+                    }
+                    else {
+                        base.subtractRanges(tok);
+                        tok = base;
+                    }
+                }
+                RangeToken range2 = this.parseCharacterClass(false);
+                tok.subtractRanges(range2);
+                // The nested class must be the last item: only ']' may follow it.
+                if (this.read() != T_CHAR || this.chardata != ']') {
+                    throw this.ex("parser.cc.5", this.offset);
+                }
+                break;                          // Exit this loop
+            }
+            // Move past the item just examined.
+            this.next();
+            if (!end) {                         // if not shorthands...
+                if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
+                    // Single character; case variants are only added for c <= 0xffff.
+                    if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
+                        tok.addRange(c, c);
+                    }
+                    else {
+                        addCaseInsensitiveChar(tok, c);
+                    }
+                }
+                else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
+                    throw this.ex("parser.cc.8", this.offset-1);
+                }
+                else {
+                    this.next(); // Skips '-'
+                    if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
+                    if (type == T_CHAR && this.chardata == ']') {
+                        // '-' directly before ']' is a literal: add both c and '-'.
+                        if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
+                            tok.addRange(c, c);
+                        }
+                        else {
+                            addCaseInsensitiveChar(tok, c);
+                        }
+                        tok.addRange('-', '-');
+                    } else {
+                        // A real range c-rangeend; the upper bound may itself be escaped.
+                        int rangeend = this.chardata;
+                        if (type == T_BACKSOLIDUS) {
+                            rangeend = this.decodeEscaped();
+                        }
+                        this.next();
+                        // Reject ranges whose start is greater than their end.
+                        if (c > rangeend) {
+                            throw this.ex("parser.ope.3", this.offset-1);
+                        }
+                        if (!this.isSet(RegularExpression.IGNORE_CASE) ||
+                                (c > 0xffff && rangeend > 0xffff)) {
+                            tok.addRange(c, rangeend);
+                        }
+                        else {
+                            addCaseInsensitiveCharRange(tok, c, rangeend);
+                        }
+                    }
+                }
+            }
+            // With SPECIAL_COMMA set, a ',' after an item is skipped as a separator.
+            if (this.isSet(RegularExpression.SPECIAL_COMMA)
+                && this.read() == T_CHAR && this.chardata == ',') {
+                this.next();
+            }
+            firstloop = false;
+        }
+        // The loop also ends on end of input, which means the class was never closed.
+        if (this.read() == T_EOF) {
+            throw this.ex("parser.cc.2", this.offset);
+        }
+        
+        // Negated class without NRange support: result = everything minus parsed ranges.
+        if (!useNrange && nrange) {
+            base.subtractRanges(tok);
+            tok = base;
+        }
+        tok.sortRanges();
+        tok.compactRanges();
+        // Back to the normal tokenizer rules before consuming the closing bracket.
+        this.setContext(S_NORMAL);
+        this.next();                    // Skips ']'
+
+        return tok;
+    }
+
+    /**
+     * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
+     */
+    protected RangeToken parseSetOperations() throws ParseException {
+        RangeToken tok = this.parseCharacterClass(false);
+        int type;
+        while ((type = this.read()) != T_RPAREN) {
+            int ch = this.chardata;
+            if (type == T_CHAR && (ch == '-' || ch == '&')
+                || type == T_PLUS) {
+                this.next();
+                if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
+                RangeToken t2 = this.parseCharacterClass(false);
+                if (type == T_PLUS)
+                    tok.mergeRanges(t2);
+                else if (ch == '-')
+                    tok.subtractRanges(t2);
+                else if (ch == '&')
+                    tok.intersectRanges(t2);
+                else
+                    throw new RuntimeException("ASSERT");
+            } else {
+                throw ex("parser.ope.2", this.offset-1);
+            }
+        }
+        this.next();
+        return tok;
+    }
+
+    /**
+     * Returns the token for one of the shorthand escapes d, D, w, W, s or S.
+     *
+     * When the USE_UNICODE_CATEGORY option is set the token is looked up by name through
+     * <code>Token.getRange</code> ("Nd", "IsWord", "IsSpace"; the upper-case letters
+     * request the non-positive variant); otherwise the predefined constant on
+     * <code>Token</code> is returned.
+     *
+     * @param ch the letter following the backslash
+     * @return the token for that shorthand
+     * @throws RuntimeException if <code>ch</code> is not one of the six letters
+     */
+    Token getTokenForShorthand(int ch) {
+        switch (ch) {
+          case 'd':
+            if (this.isSet(RegularExpression.USE_UNICODE_CATEGORY)) {
+                return Token.getRange("Nd", true);
+            }
+            return Token.token_0to9;
+
+          case 'D':
+            if (this.isSet(RegularExpression.USE_UNICODE_CATEGORY)) {
+                return Token.getRange("Nd", false);
+            }
+            return Token.token_not_0to9;
+
+          case 'w':
+            if (this.isSet(RegularExpression.USE_UNICODE_CATEGORY)) {
+                return Token.getRange("IsWord", true);
+            }
+            return Token.token_wordchars;
+
+          case 'W':
+            if (this.isSet(RegularExpression.USE_UNICODE_CATEGORY)) {
+                return Token.getRange("IsWord", false);
+            }
+            return Token.token_not_wordchars;
+
+          case 's':
+            if (this.isSet(RegularExpression.USE_UNICODE_CATEGORY)) {
+                return Token.getRange("IsSpace", true);
+            }
+            return Token.token_spaces;
+
+          case 'S':
+            if (this.isSet(RegularExpression.USE_UNICODE_CATEGORY)) {
+                return Token.getRange("IsSpace", false);
+            }
+            return Token.token_not_spaces;
+
+          default:
+            throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
+        }
+    }
+
+    /**
+     */
+    int decodeEscaped() throws ParseException {
+        if (this.read() != T_BACKSOLIDUS)  throw ex("parser.next.1", this.offset-1);
+        int c = this.chardata;
+        switch (c) {
+          case 'e':  c = 0x1b;  break; // ESCAPE U+001B
+          case 'f':  c = '\f';  break; // FORM FEED U+000C
+          case 'n':  c = '\n';  break; // LINE FEED U+000A
+          case 'r':  c = '\r';  break; // CRRIAGE RETURN U+000D
+          case 't':  c = '\t';  break; // HORIZONTAL TABULATION U+0009
+          //case 'v':  c = 0x0b;  break; // VERTICAL TABULATION U+000B
+          case 'x':
+            this.next();
+            if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
+            if (this.chardata == '{') {
+                int v1 = 0;
+                int uv = 0;
+                do {
+                    this.next();
+                    if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
+                    if ((v1 = hexChar(this.chardata)) < 0)
+                        break;
+                    if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
+                    uv = uv*16+v1;
+                } while (true);
+                if (this.chardata != '}')  throw ex("parser.descape.3", this.offset-1);
+                if (uv > Token.UTF16_MAX)  throw ex("parser.descape.4", this.offset-1);
+                c = uv;
+            } else {
+                int v1 = 0;
+                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                    throw ex("parser.descape.1", this.offset-1);
+                int uv = v1;
+                this.next();
+                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                    throw ex("parser.descape.1", this.offset-1);
+                uv = uv*16+v1;
+                c = uv;
+            }
+            break;
+
+          case 'u':
+            int v1 = 0;
+            this.next();
+            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                throw ex("parser.descape.1", this.offset-1);
+            int uv = v1;
+            this.next();
+            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                throw ex("parser.descape.1", this.offset-1);
+            uv = uv*16+v1;
+            this.next();
+            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                throw ex("parser.descape.1", this.offset-1);
+            uv = uv*16+v1;
+            this.next();
+            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                throw ex("parser.descape.1", this.offset-1);
+            uv = uv*16+v1;
+            c = uv;
+            break;
+
+          case 'v':
+            this.next();
+            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                throw ex("parser.descape.1", this.offset-1);
+            uv = v1;
+            this.next();
+            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                throw ex("parser.descape.1", this.offset-1);
+            uv = uv*16+v1;
+            this.next();
+            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                throw ex("parser.descape.1", this.offset-1);
+            uv = uv*16+v1;
+            this.next();
+            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                throw ex("parser.descape.1", this.offset-1);
+            uv = uv*16+v1;
+            this.next();
+            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                throw ex("parser.descape.1", this.offset-1);
+            uv = uv*16+v1;
+            this.next();
+            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
+                throw ex("parser.descape.1", this.offset-1);
+            uv = uv*16+v1;
+            if (uv > Token.UTF16_MAX)  throw ex("parser.descappe.4", this.offset-1);
+            c = uv;
+            break;
+          case 'A':
+          case 'Z':
+          case 'z':
+            throw ex("parser.descape.5", this.offset-2);
+          default:
+        }
+        return c;
+    }
+
+    static private final int hexChar(int ch) {
+        if (ch < '0')  return -1;
+        if (ch > 'f')  return -1;
+        if (ch <= '9')  return ch-'0';
+        if (ch < 'A')  return -1;
+        if (ch <= 'F')  return ch-'A'+10;
+        if (ch < 'a')  return -1;
+        return ch-'a'+10;
+    }
+    
+    static protected final void addCaseInsensitiveChar(RangeToken tok, int c) {
+        final int[] caseMap = CaseInsensitiveMap.get(c);
+        tok.addRange(c, c);
+        
+        if (caseMap != null) {
+            for (int i=0; i<caseMap.length; i+=2) {
+                tok.addRange(caseMap[i], caseMap[i]);
+            }
+        }
+
+    }
+    
+    static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) {
+        int[] caseMap;
+        int r1, r2;
+        if (start <= end) {
+            r1 = start;
+            r2 = end;
+        } else {
+            r1 = end;
+            r2 = start;
+        }
+
+        tok.addRange(r1, r2);
+        for (int ch = r1;  ch <= r2;  ch++) {
+            caseMap = CaseInsensitiveMap.get(ch);
+            if (caseMap != null) {
+                for (int i=0; i<caseMap.length; i+=2) {
+                    tok.addRange(caseMap[i], caseMap[i]);
+                }
+            }
+        }
+    }
+}