--- /dev/null
+/*
+ * Copyright (c) 2017 Cisco Systems, Inc. and others. All rights reserved.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License v1.0 which accompanies this distribution,
+ * and is available at http://www.eclipse.org/legal/epl-v10.html
+ */
+
+package org.opendaylight.yangtools.yang.model.util;
+
+import com.google.common.collect.ImmutableSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utilities for converting YANG XSD regexes into Java-compatible regexes
+ */
+public final class RegexUtils {
+ private static final Logger LOG = LoggerFactory.getLogger(RegexUtils.class);
+ private static final Pattern BETWEEN_CURLY_BRACES_PATTERN = Pattern.compile("\\{(.+?)\\}");
+ private static final Set<String> JAVA_UNICODE_BLOCKS = ImmutableSet.<String>builder()
+ .add("AegeanNumbers")
+ .add("AlchemicalSymbols")
+ .add("AlphabeticPresentationForms")
+ .add("AncientGreekMusicalNotation")
+ .add("AncientGreekNumbers")
+ .add("AncientSymbols")
+ .add("Arabic")
+ .add("ArabicPresentationForms-A")
+ .add("ArabicPresentationForms-B")
+ .add("ArabicSupplement")
+ .add("Armenian")
+ .add("Arrows")
+ .add("Avestan")
+ .add("Balinese")
+ .add("Bamum")
+ .add("BamumSupplement")
+ .add("BasicLatin")
+ .add("Batak")
+ .add("Bengali")
+ .add("BlockElements")
+ .add("Bopomofo")
+ .add("BopomofoExtended")
+ .add("BoxDrawing")
+ .add("Brahmi")
+ .add("BraillePatterns")
+ .add("Buginese")
+ .add("Buhid")
+ .add("ByzantineMusicalSymbols")
+ .add("Carian")
+ .add("Cham")
+ .add("Cherokee")
+ .add("CJKCompatibility")
+ .add("CJKCompatibilityForms")
+ .add("CJKCompatibilityIdeographs")
+ .add("CJKCompatibilityIdeographsSupplement")
+ .add("CJKRadicalsSupplement")
+ .add("CJKStrokes")
+ .add("CJKSymbolsandPunctuation")
+ .add("CJKUnifiedIdeographs")
+ .add("CJKUnifiedIdeographsExtensionA")
+ .add("CJKUnifiedIdeographsExtensionB")
+ .add("CJKUnifiedIdeographsExtensionC")
+ .add("CJKUnifiedIdeographsExtensionD")
+ .add("CombiningDiacriticalMarks")
+ .add("CombiningDiacriticalMarksSupplement")
+ .add("CombiningHalfMarks")
+ .add("CombiningDiacriticalMarksforSymbols")
+ .add("CommonIndicNumberForms")
+ .add("ControlPictures")
+ .add("Coptic")
+ .add("CountingRodNumerals")
+ .add("Cuneiform")
+ .add("CuneiformNumbersandPunctuation")
+ .add("CurrencySymbols")
+ .add("CypriotSyllabary")
+ .add("Cyrillic")
+ .add("CyrillicExtended-A")
+ .add("CyrillicExtended-B")
+ .add("CyrillicSupplementary")
+ .add("Deseret")
+ .add("Devanagari")
+ .add("DevanagariExtended")
+ .add("Dingbats")
+ .add("DominoTiles")
+ .add("EgyptianHieroglyphs")
+ .add("Emoticons")
+ .add("EnclosedAlphanumericSupplement")
+ .add("EnclosedAlphanumerics")
+ .add("EnclosedCJKLettersandMonths")
+ .add("EnclosedIdeographicSupplement")
+ .add("Ethiopic")
+ .add("EthiopicExtended")
+ .add("EthiopicExtended-A")
+ .add("EthiopicSupplement")
+ .add("GeneralPunctuation")
+ .add("GeometricShapes")
+ .add("Georgian")
+ .add("GeorgianSupplement")
+ .add("Glagolitic")
+ .add("Gothic")
+ .add("GreekandCoptic")
+ .add("GreekExtended")
+ .add("Gujarati")
+ .add("Gurmukhi")
+ .add("HalfwidthandFullwidthForms")
+ .add("HangulCompatibilityJamo")
+ .add("HangulJamo")
+ .add("HangulJamoExtended-A")
+ .add("HangulJamoExtended-B")
+ .add("HangulSyllables")
+ .add("Hanunoo")
+ .add("Hebrew")
+ .add("HighPrivateUseSurrogates")
+ .add("HighSurrogates")
+ .add("Hiragana")
+ .add("IdeographicDescriptionCharacters")
+ .add("ImperialAramaic")
+ .add("InscriptionalPahlavi")
+ .add("InscriptionalParthian")
+ .add("IPAExtensions")
+ .add("Javanese")
+ .add("Kaithi")
+ .add("KanaSupplement")
+ .add("Kanbun")
+ .add("Kangxi Radicals")
+ .add("Kannada")
+ .add("Katakana")
+ .add("KatakanaPhoneticExtensions")
+ .add("KayahLi")
+ .add("Kharoshthi")
+ .add("Khmer")
+ .add("KhmerSymbols")
+ .add("Lao")
+ .add("Latin-1Supplement")
+ .add("LatinExtended-A")
+ .add("LatinExtendedAdditional")
+ .add("LatinExtended-B")
+ .add("LatinExtended-C")
+ .add("LatinExtended-D")
+ .add("Lepcha")
+ .add("LetterlikeSymbols")
+ .add("Limbu")
+ .add("LinearBIdeograms")
+ .add("LinearBSyllabary")
+ .add("Lisu")
+ .add("LowSurrogates")
+ .add("Lycian")
+ .add("Lydian")
+ .add("MahjongTiles")
+ .add("Malayalam")
+ .add("Mandaic")
+ .add("MathematicalAlphanumericSymbols")
+ .add("MathematicalOperators")
+ .add("MeeteiMayek")
+ .add("MiscellaneousMathematicalSymbols-A")
+ .add("MiscellaneousMathematicalSymbols-B")
+ .add("MiscellaneousSymbols")
+ .add("MiscellaneousSymbolsandArrows")
+ .add("MiscellaneousSymbolsAndPictographs")
+ .add("MiscellaneousTechnical")
+ .add("ModifierToneLetters")
+ .add("Mongolian")
+ .add("MusicalSymbols")
+ .add("Myanmar")
+ .add("MyanmarExtended-A")
+ .add("NewTaiLue")
+ .add("NKo")
+ .add("NumberForms")
+ .add("Ogham")
+ .add("OlChiki")
+ .add("OldItalic")
+ .add("OldPersian")
+ .add("OldSouthArabian")
+ .add("OldTurkic")
+ .add("OpticalCharacterRecognition")
+ .add("Oriya")
+ .add("Osmanya")
+ .add("Phags-pa")
+ .add("PhaistosDisc")
+ .add("Phoenician")
+ .add("PhoneticExtensions")
+ .add("PhoneticExtensionsSupplement")
+ .add("PlayingCards")
+ .add("PrivateUseArea")
+ .add("Rejang")
+ .add("RumiNumeralSymbols")
+ .add("Runic")
+ .add("Samaritan")
+ .add("Saurashtra")
+ .add("Shavian")
+ .add("Sinhala")
+ .add("SmallFormVariants")
+ .add("SpacingModifierLetters")
+ .add("Specials")
+ .add("Sundanese")
+ .add("SuperscriptsandSubscripts")
+ .add("SupplementalArrows-A")
+ .add("SupplementalArrows-B")
+ .add("SupplementalMathematicalOperators")
+ .add("SupplementalPunctuation")
+ .add("SupplementaryPrivateUseArea-A")
+ .add("SupplementaryPrivateUseArea-B")
+ .add("SylotiNagri")
+ .add("Syriac")
+ .add("Tagalog")
+ .add("Tagbanwa")
+ .add("Tags")
+ .add("TaiLe")
+ .add("TaiTham")
+ .add("TaiViet")
+ .add("TaiXuanJingSymbols")
+ .add("Tamil")
+ .add("Telugu")
+ .add("Thaana")
+ .add("Thai")
+ .add("Tibetan")
+ .add("Tifinagh")
+ .add("TransportAndMapSymbols")
+ .add("Ugaritic")
+ .add("UnifiedCanadianAboriginalSyllabics")
+ .add("UnifiedCanadianAboriginalSyllabicsExtended")
+ .add("Vai")
+ .add("VariationSelectors")
+ .add("VariationSelectorsSupplement")
+ .add("VedicExtensions")
+ .add("VerticalForms")
+ .add("YiRadicals")
+ .add("YiSyllables")
+ .add("YijingHexagramSymbols").build();
+
+ private static final int UNICODE_SCRIPT_FIX_COUNTER = 30;
+
+ private RegexUtils() {
+ throw new UnsupportedOperationException("Utility class should not be instantiated.");
+ }
+
+ /**
+ * Converts XSD regex to Java-compatible regex
+ *
+ * @param xsdRegex XSD regex pattern as it is defined in a YANG source
+ * @return Java-compatible regex
+ */
+ public static String getJavaRegexFromXSD(final String xsdRegex) {
+ return "^" + fixUnicodeScriptPattern(escapeChars(xsdRegex)) + '$';
+ }
+
+ /*
+ * As both '^' and '$' are special anchor characters in java regular
+ * expressions which are implicitly present in XSD regular expressions,
+ * we need to escape them in case they are not defined as part of
+ * character ranges i.e. inside regular square brackets.
+ */
+ private static String escapeChars(final String regex) {
+ final StringBuilder result = new StringBuilder(regex.length());
+ int bracket = 0;
+ boolean escape = false;
+ for (int i = 0; i < regex.length(); i++) {
+ final char ch = regex.charAt(i);
+ switch (ch) {
+ case '[':
+ if (!escape) {
+ bracket++;
+ }
+ escape = false;
+ result.append(ch);
+ break;
+ case ']':
+ if (!escape) {
+ bracket--;
+ }
+ escape = false;
+ result.append(ch);
+ break;
+ case '\\':
+ escape = !escape;
+ result.append(ch);
+ break;
+ case '^':
+ case '$':
+ if (bracket == 0) {
+ result.append('\\');
+ }
+ escape = false;
+ result.append(ch);
+ break;
+ default:
+ escape = false;
+ result.append(ch);
+ }
+ }
+ return result.toString();
+ }
+
+ private static String fixUnicodeScriptPattern(String rawPattern) {
+ for (int i = 0; i < UNICODE_SCRIPT_FIX_COUNTER; i++) {
+ try {
+ Pattern.compile(rawPattern);
+ return rawPattern;
+ } catch(final PatternSyntaxException ex) {
+ LOG.debug("Invalid regex pattern syntax in: {}", rawPattern, ex);
+ if (ex.getMessage().contains("Unknown character script name")) {
+ rawPattern = fixUnknownScripts(ex.getMessage(), rawPattern);
+ } else {
+ return rawPattern;
+ }
+ }
+ }
+
+ LOG.warn("Regex pattern could not be fixed: {}", rawPattern);
+ return rawPattern;
+ }
+
+ private static String fixUnknownScripts(final String exMessage, final String rawPattern) {
+ StringBuilder result = new StringBuilder(rawPattern);
+ final Matcher matcher = BETWEEN_CURLY_BRACES_PATTERN.matcher(exMessage);
+ if (matcher.find()) {
+ final String capturedGroup = matcher.group(1);
+ if (JAVA_UNICODE_BLOCKS.contains(capturedGroup)) {
+ final int idx = rawPattern.indexOf("Is" + capturedGroup);
+ result = result.replace(idx, idx + 2, "In");
+ }
+ }
+ return result.toString();
+ }
+}
\ No newline at end of file