2 * Copyright (c) 2017 Cisco Systems, Inc. and others. All rights reserved.
4 * This program and the accompanying materials are made available under the
5 * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6 * and is available at http://www.eclipse.org/legal/epl-v10.html
9 package org.opendaylight.yangtools.yang.model.util;
11 import com.google.common.collect.ImmutableSet;
13 import java.util.regex.Matcher;
14 import java.util.regex.Pattern;
15 import java.util.regex.PatternSyntaxException;
16 import org.slf4j.Logger;
17 import org.slf4j.LoggerFactory;
/**
 * Utilities for converting YANG XSD regexes into Java-compatible regexes.
 */
22 public final class RegexUtils {
// Logger for this class.
23 private static final Logger LOG = LoggerFactory.getLogger(RegexUtils.class);
// Matches a '{...}' pair and captures (reluctantly) the non-empty text between the braces.
// fixUnknownScripts() runs it against a PatternSyntaxException message to extract the reported name.
24 private static final Pattern BETWEEN_CURLY_BRACES_PATTERN = Pattern.compile("\\{(.+?)\\}");
// Names consulted by fixUnknownScripts(): only when the name extracted from the error message is in
// this set does that method rewrite the "Is" prefix in the pattern to "In".
// NOTE(review): presumably these are the Unicode block names Java understands — confirm against
// java.lang.Character.UnicodeBlock; the list shown here may not be the complete set.
25 private static final Set<String> JAVA_UNICODE_BLOCKS = ImmutableSet.<String>builder()
27 .add("AlchemicalSymbols")
28 .add("AlphabeticPresentationForms")
29 .add("AncientGreekMusicalNotation")
30 .add("AncientGreekNumbers")
31 .add("AncientSymbols")
33 .add("ArabicPresentationForms-A")
34 .add("ArabicPresentationForms-B")
35 .add("ArabicSupplement")
41 .add("BamumSupplement")
47 .add("BopomofoExtended")
50 .add("BraillePatterns")
53 .add("ByzantineMusicalSymbols")
57 .add("CJKCompatibility")
58 .add("CJKCompatibilityForms")
59 .add("CJKCompatibilityIdeographs")
60 .add("CJKCompatibilityIdeographsSupplement")
61 .add("CJKRadicalsSupplement")
63 .add("CJKSymbolsandPunctuation")
64 .add("CJKUnifiedIdeographs")
65 .add("CJKUnifiedIdeographsExtensionA")
66 .add("CJKUnifiedIdeographsExtensionB")
67 .add("CJKUnifiedIdeographsExtensionC")
68 .add("CJKUnifiedIdeographsExtensionD")
69 .add("CombiningDiacriticalMarks")
70 .add("CombiningDiacriticalMarksSupplement")
71 .add("CombiningHalfMarks")
72 .add("CombiningDiacriticalMarksforSymbols")
73 .add("CommonIndicNumberForms")
74 .add("ControlPictures")
76 .add("CountingRodNumerals")
78 .add("CuneiformNumbersandPunctuation")
79 .add("CurrencySymbols")
80 .add("CypriotSyllabary")
82 .add("CyrillicExtended-A")
83 .add("CyrillicExtended-B")
84 .add("CyrillicSupplementary")
87 .add("DevanagariExtended")
90 .add("EgyptianHieroglyphs")
92 .add("EnclosedAlphanumericSupplement")
93 .add("EnclosedAlphanumerics")
94 .add("EnclosedCJKLettersandMonths")
95 .add("EnclosedIdeographicSupplement")
97 .add("EthiopicExtended")
98 .add("EthiopicExtended-A")
99 .add("EthiopicSupplement")
100 .add("GeneralPunctuation")
101 .add("GeometricShapes")
103 .add("GeorgianSupplement")
106 .add("GreekandCoptic")
107 .add("GreekExtended")
110 .add("HalfwidthandFullwidthForms")
111 .add("HangulCompatibilityJamo")
113 .add("HangulJamoExtended-A")
114 .add("HangulJamoExtended-B")
115 .add("HangulSyllables")
118 .add("HighPrivateUseSurrogates")
119 .add("HighSurrogates")
121 .add("IdeographicDescriptionCharacters")
122 .add("ImperialAramaic")
123 .add("InscriptionalPahlavi")
124 .add("InscriptionalParthian")
125 .add("IPAExtensions")
128 .add("KanaSupplement")
// NOTE(review): this is the only visible entry containing a space; every other name is written
// without one — confirm whether "KangxiRadicals" was intended.
130 .add("Kangxi Radicals")
133 .add("KatakanaPhoneticExtensions")
139 .add("Latin-1Supplement")
140 .add("LatinExtended-A")
141 .add("LatinExtendedAdditional")
142 .add("LatinExtended-B")
143 .add("LatinExtended-C")
144 .add("LatinExtended-D")
146 .add("LetterlikeSymbols")
148 .add("LinearBIdeograms")
149 .add("LinearBSyllabary")
151 .add("LowSurrogates")
157 .add("MathematicalAlphanumericSymbols")
158 .add("MathematicalOperators")
160 .add("MiscellaneousMathematicalSymbols-A")
161 .add("MiscellaneousMathematicalSymbols-B")
162 .add("MiscellaneousSymbols")
163 .add("MiscellaneousSymbolsandArrows")
164 .add("MiscellaneousSymbolsAndPictographs")
165 .add("MiscellaneousTechnical")
166 .add("ModifierToneLetters")
168 .add("MusicalSymbols")
170 .add("MyanmarExtended-A")
178 .add("OldSouthArabian")
180 .add("OpticalCharacterRecognition")
186 .add("PhoneticExtensions")
187 .add("PhoneticExtensionsSupplement")
189 .add("PrivateUseArea")
191 .add("RumiNumeralSymbols")
197 .add("SmallFormVariants")
198 .add("SpacingModifierLetters")
201 .add("SuperscriptsandSubscripts")
202 .add("SupplementalArrows-A")
203 .add("SupplementalArrows-B")
204 .add("SupplementalMathematicalOperators")
205 .add("SupplementalPunctuation")
206 .add("SupplementaryPrivateUseArea-A")
207 .add("SupplementaryPrivateUseArea-B")
216 .add("TaiXuanJingSymbols")
223 .add("TransportAndMapSymbols")
225 .add("UnifiedCanadianAboriginalSyllabics")
226 .add("UnifiedCanadianAboriginalSyllabicsExtended")
228 .add("VariationSelectors")
229 .add("VariationSelectorsSupplement")
230 .add("VedicExtensions")
231 .add("VerticalForms")
234 .add("YijingHexagramSymbols").build();
// Upper bound on the number of loop iterations performed by fixUnicodeScriptPattern().
236 private static final int UNICODE_SCRIPT_FIX_COUNTER = 30;
238 private RegexUtils() {
239 throw new UnsupportedOperationException("Utility class should not be instantiated.");
243 * Converts XSD regex to Java-compatible regex.
245 * @param xsdRegex XSD regex pattern as it is defined in a YANG source
246 * @return Java-compatible regex
248 public static String getJavaRegexFromXSD(final String xsdRegex) {
249 return "^" + fixUnicodeScriptPattern(escapeChars(xsdRegex)) + '$';
/*
 * As both '^' and '$' are special anchor characters in Java regular
 * expressions, whereas in XSD regular expressions the anchors are implicit,
 * we need to escape them unless they are defined as part of a character
 * range, i.e. inside regular square brackets.
 */
// Builds the returned string in a StringBuilder while scanning regex one char at a time.
// NOTE(review): the per-character handling inside the loop is not visible in this excerpt, so only
// the visible scaffolding is described here — consult the full file for the escaping rules.
258 private static String escapeChars(final String regex) {
// Presized to the input length; the buffer grows on demand if more is appended.
259 final StringBuilder result = new StringBuilder(regex.length());
// NOTE(review): presumably records whether the previous character was an unescaped backslash — confirm.
261 boolean escape = false;
262 for (int i = 0; i < regex.length(); i++) {
263 final char ch = regex.charAt(i);
296 return result.toString();
// Repeatedly test-compiles rawPattern and, on an "Unknown character script name" error, lets
// fixUnknownScripts() rewrite it before the next attempt. At most UNICODE_SCRIPT_FIX_COUNTER
// attempts are made. The parameter is deliberately non-final because it is reassigned in the loop.
// NOTE(review): the try opener, the return statements and whatever happens for other kinds of
// syntax errors are not visible in this excerpt — confirm against the full file.
299 private static String fixUnicodeScriptPattern(String rawPattern) {
300 for (int i = 0; i < UNICODE_SCRIPT_FIX_COUNTER; i++) {
// Compilation serves purely as a syntax check; the compiled Pattern is discarded.
302 Pattern.compile(rawPattern);
304 } catch (final PatternSyntaxException ex) {
305 LOG.debug("Invalid regex pattern syntax in: {}", rawPattern, ex);
// Only this specific error is handed to fixUnknownScripts() for a rewrite.
306 if (ex.getMessage().contains("Unknown character script name")) {
307 rawPattern = fixUnknownScripts(ex.getMessage(), rawPattern);
// Logs the pattern in its current (possibly partially rewritten) form.
// NOTE(review): presumably reached only when no attempt compiled cleanly — the success path is not shown.
314 LOG.warn("Regex pattern could not be fixed: {}", rawPattern);
318 private static String fixUnknownScripts(final String exMessage, final String rawPattern) {
319 StringBuilder result = new StringBuilder(rawPattern);
320 final Matcher matcher = BETWEEN_CURLY_BRACES_PATTERN.matcher(exMessage);
321 if (matcher.find()) {
322 final String capturedGroup = matcher.group(1);
323 if (JAVA_UNICODE_BLOCKS.contains(capturedGroup)) {
324 final int idx = rawPattern.indexOf("Is" + capturedGroup);
325 result = result.replace(idx, idx + 2, "In");
328 return result.toString();