/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opendaylight.yangtools.xsd.regex; import java.text.CharacterIterator; import java.util.ArrayDeque; import java.util.Deque; import java.util.Locale; /** * A regular expression matching engine using Non-deterministic Finite Automaton (NFA). * This engine does not conform to the POSIX regular expression. * *
* RegularExpression re = new RegularExpression(regex); * if (re.matches(text)) { ... } ** *
* RegularExpression re = new RegularExpression(regex);
* Match match = new Match();
* if (re.matches(text, match)) {
* ... // You can refer captured texts with methods of the Match
class.
* }
*
*
* * RegularExpression re = new RegularExpression(regex, "i"); * if (re.matches(text) >= 0) { ...} ** *
You can specify options to RegularExpression(
regex,
options)
* or setPattern(
regex,
options)
.
* This options parameter consists of the following characters.
*
"i"
* "m"
* "s"
* "u"
* "w"
* ","
* "X"
* match()
method does not do substring matching
* but entire string matching.
*
*
* Differences from the Perl 5 regular expression*
|
*
Meta characters are `. * + ? { [ ( ) | \ ^ $'.
*This range matches the character. *
This range matches a character which has a code point that is >= C1's code point and <= C2's code point. + *
... *
These expressions specify the same ranges as the following expressions. *
Enumerated ranges are merged (union operation). * [a-ec-z] is equivalent to [a-z] * *
Match
instance
* after matches(String,Match)
.
* The 0th group means whole of this regular expression.
* The Nth group is the inside of the Nth left parenthesis.
*
* For instance, a regular expression is * " *([^<:]*) +<([^>]*)> *" * and target text is * "From: TAMURA Kent <kent@trl.ibm.co.jp>": *
Match.getCapturedText(0)
:
* " TAMURA Kent <kent@trl.ibm.co.jp>"
* Match.getCapturedText(1)
: "TAMURA Kent"
* Match.getCapturedText(2)
: "kent@trl.ibm.co.jp"
* * regex ::= ('(?' options ')')? term ('|' term)* * term ::= factor+ * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )? * | '(?#' [^)]* ')' * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}' * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X' * | '(?>' regex ')' | '(?' options ':' regex ')' * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')' * options ::= [imsw]* ('-' [imsw]+)? * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' * looks ::= '(?=' regex ')' | '(?!' regex ')' * | '(?<=' regex ')' | '(?<!' regex ')' * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1 * category-block ::= '\' [pP] category-symbol-1 * | ('\p{' | '\P{') (category-symbol | block-name * | other-properties) '}' * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S' * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo' * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No' * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs' * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po' * | 'Sm' | 'Sc' | 'Sk' | 'So' * block-name ::= (See above) * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED' * character-1 ::= (any character except meta-characters) * * char-class ::= '[' ranges ']' * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')' * ranges ::= '^'? (range ','?)+ * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block * | range-char | range-char '-' range-char * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2 * code-point ::= '\x' hex-char hex-char * | '\x{' hex-char+ '}' * | '\v' hex-char hex-char hex-char hex-char hex-char hex-char * hex-char ::= [0-9a-fA-F] * character-2 ::= (any character except \[]-,) ** *
By default, the engine considers a position between a word character * (\w) and a non word character * is a word boundary. *
By this option, the engine checks word boundaries with the method of
* 'Unicode Regular Expression Guidelines' Revision 4.
*
* @see #RegularExpression(java.lang.String,int)
* @see #setPattern(java.lang.String,int)
*/
static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w" (bit 6 of the options mask)
/**
* Option flag for the "H" option letter (bit 7 of the options mask).
* NOTE(review): going by the name, this disables the head-character
* optimization; the code consuming the flag is not visible here - confirm.
*/
static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
/**
* Option flag for the "F" option letter (bit 8 of the options mask).
* NOTE(review): going by the name, this disables the fixed-string
* optimization; the code consuming the flag is not visible here - confirm.
*/
static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
/**
* "X". XML Schema mode (bit 9 of the options mask).
* When this flag is set, setPattern() parses with ParserForXMLSchema instead
* of RegexParser, and toPatternString() wraps its result in "^" ... "$".
* toString() masks this flag out before rendering the token tree.
*/
static final int XMLSCHEMA_MODE = 1<<9;
/**
* Option flag for the "," option letter (bit 10 of the options mask).
* NOTE(review): presumably enables special treatment of ',' inside
* character ranges; the consuming code is not visible here - confirm.
*/
static final int SPECIAL_COMMA = 1<<10;
/**
 * Tests whether every bit of {@code flag} is also present in {@code options}.
 *
 * @param options the option bit mask to inspect
 * @param flag the bit, or combination of bits, to look for
 * @return {@code true} if no bit of {@code flag} is missing from {@code options}
 */
private static final boolean isSet(int options, int flag) {
    // Bits requested by the flag that the options mask does not carry.
    final int missingBits = flag & ~options;
    return missingBits == 0;
}
/**
 * Creates a new RegularExpression instance from a pattern alone.
 * Delegates to the (regex, options) constructor with a {@code null} options string.
 *
 * @param regex A regular expression
 * @exception ParseException regex is not conforming to the syntax.
 */
public RegularExpression(String regex) throws ParseException {
    this(regex, (String) null);
}
/**
 * Creates a new RegularExpression instance with options.
 * The work is done by {@link #setPattern(java.lang.String,java.lang.String)}.
 *
 * @param regex A regular expression
 * @param options A String consisting of "i" "m" "s" "u" "w" "," "X"
 * @exception ParseException regex is not conforming to the syntax.
 */
public RegularExpression(String regex, String options) throws ParseException {
    setPattern(regex, options);
}
/**
 * Creates a new RegularExpression instance with options and an explicit locale.
 * The work is done by
 * {@link #setPattern(java.lang.String,java.lang.String,java.util.Locale)}.
 *
 * @param regex A regular expression
 * @param options A String consisting of "i" "m" "s" "u" "w" "," "X"
 * @param locale The locale handed to the parser that is created for the pattern
 * @exception ParseException regex is not conforming to the syntax.
 */
public RegularExpression(String regex, String options, Locale locale) throws ParseException {
    setPattern(regex, options, locale);
}
/**
 * Creates an instance from pre-computed parse results; no parsing happens here,
 * the arguments are stored as given.
 *
 * @param regex the pattern text
 * @param tok the token tree stored for the pattern
 * @param parens the value later reported by {@link #getNumberOfGroups()}
 * @param hasBackReferences stored as the back-reference indicator of this instance
 * @param options the option bit mask
 */
RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
    // Pattern text and its option mask.
    this.regex = regex;
    this.options = options;
    // Results of a parse performed by the caller.
    this.tokentree = tok;
    this.hasBackReferences = hasBackReferences;
    this.nofparen = parens;
}
/**
 * Replaces the pattern of this instance, keeping the current options and
 * using the default locale.
 *
 * @param newPattern the new regular expression
 * @exception ParseException newPattern is not conforming to the syntax.
 */
public void setPattern(String newPattern) throws ParseException {
    final Locale defaultLocale = Locale.getDefault();
    setPattern(newPattern, defaultLocale);
}
/**
 * Replaces the pattern of this instance, keeping the current options.
 *
 * @param newPattern the new regular expression
 * @param locale the locale handed to the parser
 * @exception ParseException newPattern is not conforming to the syntax.
 */
public void setPattern(String newPattern, Locale locale) throws ParseException {
    // Re-use whatever option mask is currently in effect.
    final int currentOptions = this.options;
    setPattern(newPattern, currentOptions, locale);
}
/**
 * Parses {@code newPattern} with the given option mask and installs the result
 * on this instance.
 *
 * <p>The parse happens before any field is written, so when the pattern is
 * rejected this instance keeps its previous pattern, options and token tree
 * instead of ending up with the new pattern text paired with stale parse results.
 *
 * @param newPattern the regular expression to parse
 * @param options the option bit mask; {@code XMLSCHEMA_MODE} selects the XML Schema parser
 * @param locale the locale handed to the parser
 * @exception ParseException newPattern is not conforming to the syntax.
 */
private void setPattern(String newPattern, int options, Locale locale) throws ParseException {
    final RegexParser rp = RegularExpression.isSet(options, RegularExpression.XMLSCHEMA_MODE)
        ? new ParserForXMLSchema(locale) : new RegexParser(locale);
    // parse() is the step that can fail: if it throws, this assignment never
    // happens and no other field has been touched yet.
    this.tokentree = rp.parse(newPattern, options);
    this.regex = newPattern;
    this.options = options;
    this.nofparen = rp.parennumber;
    this.hasBackReferences = rp.hasBackReferences;
    // Drop state cached for the previous pattern.
    this.operations = null;
    this.context = null;
}
/**
 * Replaces both the pattern and the options of this instance, using the
 * default locale.
 *
 * @param newPattern the new regular expression
 * @param options the option string
 * @exception ParseException newPattern is not conforming to the syntax.
 */
public void setPattern(String newPattern, String options) throws ParseException {
    final Locale defaultLocale = Locale.getDefault();
    setPattern(newPattern, options, defaultLocale);
}
/**
 * Replaces both the pattern and the options of this instance.
 * The option string is converted with {@code REUtil.parseOptions} first.
 *
 * @param newPattern the new regular expression
 * @param options the option string
 * @param locale the locale handed to the parser
 * @exception ParseException newPattern is not conforming to the syntax.
 */
public void setPattern(String newPattern, String options, Locale locale) throws ParseException {
    final int optionMask = REUtil.parseOptions(options);
    setPattern(newPattern, optionMask, locale);
}
/**
 * Returns the pattern text most recently stored on this instance.
 *
 * @return the regular expression string
 */
public String getPattern() {
    return regex;
}
/**
 * Represents this instance in String.
 * The token tree is rendered with the current options, except that the
 * XML Schema mode bit is cleared first.
 */
@Override
public String toString() {
    final int optionsWithoutSchemaMode = this.options & ~XMLSCHEMA_MODE;
    return this.tokentree.toString(optionsWithoutSchemaMode);
}
/**
 * Returns a {@link java.util.regex.Pattern}-compatible string representation of this expression.
 * In XML Schema mode the rendered expression is enclosed in "^" and "$".
 *
 * @return A Pattern-compatible String representation
 */
public String toPatternString() {
    final String rendered = this.tokentree.toString(this.options);
    if (!isSet(this.options, XMLSCHEMA_MODE)) {
        return rendered;
    }
    return "^" + rendered + "$";
}
/**
 * Returns an option string built from the current option mask.
 * The order of letters in it may be different from a string specified
 * in a constructor or <code>setPattern()</code>.
 *
 * @return the option string produced by {@code REUtil.createOptionString}
 * @see #RegularExpression(java.lang.String,java.lang.String)
 * @see #setPattern(java.lang.String,java.lang.String)
 */
public String getOptions() {
    final String optionString = REUtil.createOptionString(this.options);
    return optionString;
}
/**
 * Return true if patterns are the same and the options are equivalent.
 *
 * @param obj the object to compare with
 * @return {@code true} only for another RegularExpression with an equal
 *         pattern string and an identical option mask
 */
@Override
public boolean equals(Object obj) {
    // instanceof is false for null, so this single guard rejects null as well.
    if (!(obj instanceof RegularExpression)) {
        return false;
    }
    final RegularExpression other = (RegularExpression) obj;
    if (!this.regex.equals(other.regex)) {
        return false;
    }
    return this.options == other.options;
}
/**
 * Tells whether this instance holds exactly the given pattern and option mask.
 *
 * @param pattern the pattern text to compare with
 * @param options the option bit mask to compare with
 * @return {@code true} if both the pattern text and the mask are equal
 */
boolean equals(String pattern, int options) {
    if (!this.regex.equals(pattern)) {
        return false;
    }
    return this.options == options;
}
/**
 * Returns the hash code of the string "pattern/options", where the options
 * part is the value of {@link #getOptions()}.
 */
@Override
public int hashCode() {
    final StringBuilder key = new StringBuilder();
    key.append(this.regex).append("/").append(this.getOptions());
    return key.toString().hashCode();
}
/**
 * Return the number of regular expression groups.
 * This method returns 1 when the regular expression has no capturing-parenthesis.
 *
 * @return the group count recorded when the pattern was last set
 */
public int getNumberOfGroups() {
    return nofparen;
}
// ================================================================
// Character classifications returned by getWordType0().
// Returned for format characters, non-spacing and enclosing marks, and most
// control characters. NOTE(review): presumably such characters are skipped
// when word boundaries are located - confirm against the callers.
private static final int WT_IGNORE = 0;
// Returned for characters that count as part of a word.
private static final int WT_LETTER = 1;
// Returned for every other character.
private static final int WT_OTHER = 2;
/**
 * Classifies a character as {@code WT_LETTER}, {@code WT_OTHER} or {@code WT_IGNORE}.
 *
 * <p>Without {@code UNICODE_WORD_BOUNDARY} the answer is only ever
 * {@code WT_LETTER} or {@code WT_OTHER}: the "IsWord" range is consulted when
 * {@code USE_UNICODE_CATEGORY} is set, the legacy {@code isWordChar} test otherwise.
 * With {@code UNICODE_WORD_BOUNDARY} the decision is based on the Unicode
 * general category reported by {@link Character#getType(char)}.
 *
 * @param ch the character to classify
 * @param opts the option bit mask
 * @return one of {@code WT_IGNORE}, {@code WT_LETTER}, {@code WT_OTHER}
 */
private static final int getWordType0(char ch, int opts) {
    if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
        // Simple mode: a character either is a word character or it is not.
        final boolean wordChar = isSet(opts, USE_UNICODE_CATEGORY)
            ? Token.getRange("IsWord", true).match(ch)
            : isWordChar(ch);
        return wordChar ? WT_LETTER : WT_OTHER;
    }

    final int category = Character.getType(ch);
    switch (category) {
        case Character.FORMAT:                  // Cf
        case Character.NON_SPACING_MARK:        // Mn
        case Character.ENCLOSING_MARK:          // Me
            return WT_IGNORE;

        case Character.CONTROL:                 // Cc
            // Tab, line feed, vertical tab, form feed and carriage return are
            // ordinary non-word characters; every other control is ignored.
            if (ch == '\t' || ch == '\n' || ch == '\u000B' || ch == '\f' || ch == '\r') {
                return WT_OTHER;
            }
            return WT_IGNORE;

        case Character.UPPERCASE_LETTER:        // Lu
        case Character.LOWERCASE_LETTER:        // Ll
        case Character.TITLECASE_LETTER:        // Lt
        case Character.MODIFIER_LETTER:         // Lm
        case Character.OTHER_LETTER:            // Lo
        case Character.LETTER_NUMBER:           // Nl
        case Character.DECIMAL_DIGIT_NUMBER:    // Nd
        case Character.OTHER_NUMBER:            // No
        case Character.COMBINING_SPACING_MARK:  // Mc
            return WT_LETTER;

        default:
            return WT_OTHER;
    }
}
// ================================================================
// Code points that isEOLChar() treats as end-of-line characters.
static final int LINE_FEED = 0x000A; // U+000A
static final int CARRIAGE_RETURN = 0x000D; // U+000D
static final int LINE_SEPARATOR = 0x2028; // U+2028
static final int PARAGRAPH_SEPARATOR = 0x2029; // U+2029
/**
 * Tells whether the given code point is one of the four end-of-line characters:
 * line feed, carriage return, line separator or paragraph separator.
 *
 * @param ch the code point to test
 * @return {@code true} if {@code ch} is an end-of-line character
 */
private static final boolean isEOLChar(int ch) {
    if (ch == LINE_FEED) {
        return true;
    }
    if (ch == CARRIAGE_RETURN) {
        return true;
    }
    if (ch == LINE_SEPARATOR) {
        return true;
    }
    return ch == PARAGRAPH_SEPARATOR;
}
/**
 * Legacy word-character test: accepts the underscore, the ASCII digits and
 * the ASCII letters, and nothing else.
 *
 * @param ch the code point to test
 * @return {@code true} if {@code ch} is '_', '0'-'9', 'A'-'Z' or 'a'-'z'
 */
private static final boolean isWordChar(int ch) { // Legacy word characters
    final boolean asciiDigit = ch >= '0' && ch <= '9';
    final boolean asciiUpper = ch >= 'A' && ch <= 'Z';
    final boolean asciiLower = ch >= 'a' && ch <= 'z';
    return ch == '_' || asciiDigit || asciiUpper || asciiLower;
}
/**
 * Compares two code points without regard to case.
 *
 * <p>Identical values always match. Beyond that, only values up to 0xffff are
 * case-folded: if either value is larger, the result is {@code false}. Folding
 * compares the upper-case forms first and then the lower-case forms of those
 * upper-case forms.
 *
 * @param chardata the first code point
 * @param ch the second code point
 * @return {@code true} if the two values are equal or equal after case folding
 */
private static final boolean matchIgnoreCase(int chardata, int ch) {
    if (chardata == ch) {
        return true;
    }
    final boolean bothFitInChar = chardata <= 0xffff && ch <= 0xffff;
    if (!bothFitInChar) {
        return false;
    }
    final char upperData = Character.toUpperCase((char) chardata);
    final char upperCh = Character.toUpperCase((char) ch);
    // The second comparison catches pairs whose upper-case forms differ
    // but whose lower-case forms coincide.
    return upperData == upperCh
        || Character.toLowerCase(upperData) == Character.toLowerCase(upperCh);
}
}