2 * Copyright (c) 2017 Cisco Systems, Inc. and others. All rights reserved.
4 * This program and the accompanying materials are made available under the
5 * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6 * and is available at http://www.eclipse.org/legal/epl-v10.html
8 package org.opendaylight.yangtools.yang.parser.rfc7950.stmt.pattern;
10 import com.google.common.collect.ImmutableSet;
11 import java.util.regex.Matcher;
12 import java.util.regex.Pattern;
13 import java.util.regex.PatternSyntaxException;
14 import org.slf4j.Logger;
15 import org.slf4j.LoggerFactory;
18 * Utilities for converting YANG XSD regexes into Java-compatible regexes.
20 final class RegexUtils {
21 private static final Logger LOG = LoggerFactory.getLogger(RegexUtils.class);
22 private static final Pattern BETWEEN_CURLY_BRACES_PATTERN = Pattern.compile("\\{(.+?)\\}");
23 private static final ImmutableSet<String> JAVA_UNICODE_BLOCKS = ImmutableSet.<String>builder()
25 .add("AlchemicalSymbols")
26 .add("AlphabeticPresentationForms")
27 .add("AncientGreekMusicalNotation")
28 .add("AncientGreekNumbers")
29 .add("AncientSymbols")
31 .add("ArabicPresentationForms-A")
32 .add("ArabicPresentationForms-B")
33 .add("ArabicSupplement")
39 .add("BamumSupplement")
45 .add("BopomofoExtended")
48 .add("BraillePatterns")
51 .add("ByzantineMusicalSymbols")
55 .add("CJKCompatibility")
56 .add("CJKCompatibilityForms")
57 .add("CJKCompatibilityIdeographs")
58 .add("CJKCompatibilityIdeographsSupplement")
59 .add("CJKRadicalsSupplement")
61 .add("CJKSymbolsandPunctuation")
62 .add("CJKUnifiedIdeographs")
63 .add("CJKUnifiedIdeographsExtensionA")
64 .add("CJKUnifiedIdeographsExtensionB")
65 .add("CJKUnifiedIdeographsExtensionC")
66 .add("CJKUnifiedIdeographsExtensionD")
67 .add("CombiningDiacriticalMarks")
68 .add("CombiningDiacriticalMarksSupplement")
69 .add("CombiningHalfMarks")
70 .add("CombiningDiacriticalMarksforSymbols")
71 .add("CommonIndicNumberForms")
72 .add("ControlPictures")
74 .add("CountingRodNumerals")
76 .add("CuneiformNumbersandPunctuation")
77 .add("CurrencySymbols")
78 .add("CypriotSyllabary")
80 .add("CyrillicExtended-A")
81 .add("CyrillicExtended-B")
82 .add("CyrillicSupplementary")
85 .add("DevanagariExtended")
88 .add("EgyptianHieroglyphs")
90 .add("EnclosedAlphanumericSupplement")
91 .add("EnclosedAlphanumerics")
92 .add("EnclosedCJKLettersandMonths")
93 .add("EnclosedIdeographicSupplement")
95 .add("EthiopicExtended")
96 .add("EthiopicExtended-A")
97 .add("EthiopicSupplement")
98 .add("GeneralPunctuation")
99 .add("GeometricShapes")
101 .add("GeorgianSupplement")
104 .add("GreekandCoptic")
105 .add("GreekExtended")
108 .add("HalfwidthandFullwidthForms")
109 .add("HangulCompatibilityJamo")
111 .add("HangulJamoExtended-A")
112 .add("HangulJamoExtended-B")
113 .add("HangulSyllables")
116 .add("HighPrivateUseSurrogates")
117 .add("HighSurrogates")
119 .add("IdeographicDescriptionCharacters")
120 .add("ImperialAramaic")
121 .add("InscriptionalPahlavi")
122 .add("InscriptionalParthian")
123 .add("IPAExtensions")
126 .add("KanaSupplement")
128 .add("Kangxi Radicals")
131 .add("KatakanaPhoneticExtensions")
137 .add("Latin-1Supplement")
138 .add("LatinExtended-A")
139 .add("LatinExtendedAdditional")
140 .add("LatinExtended-B")
141 .add("LatinExtended-C")
142 .add("LatinExtended-D")
144 .add("LetterlikeSymbols")
146 .add("LinearBIdeograms")
147 .add("LinearBSyllabary")
149 .add("LowSurrogates")
155 .add("MathematicalAlphanumericSymbols")
156 .add("MathematicalOperators")
158 .add("MiscellaneousMathematicalSymbols-A")
159 .add("MiscellaneousMathematicalSymbols-B")
160 .add("MiscellaneousSymbols")
161 .add("MiscellaneousSymbolsandArrows")
162 .add("MiscellaneousSymbolsAndPictographs")
163 .add("MiscellaneousTechnical")
164 .add("ModifierToneLetters")
166 .add("MusicalSymbols")
168 .add("MyanmarExtended-A")
176 .add("OldSouthArabian")
178 .add("OpticalCharacterRecognition")
184 .add("PhoneticExtensions")
185 .add("PhoneticExtensionsSupplement")
187 .add("PrivateUseArea")
189 .add("RumiNumeralSymbols")
195 .add("SmallFormVariants")
196 .add("SpacingModifierLetters")
199 .add("SuperscriptsandSubscripts")
200 .add("SupplementalArrows-A")
201 .add("SupplementalArrows-B")
202 .add("SupplementalMathematicalOperators")
203 .add("SupplementalPunctuation")
204 .add("SupplementaryPrivateUseArea-A")
205 .add("SupplementaryPrivateUseArea-B")
214 .add("TaiXuanJingSymbols")
221 .add("TransportAndMapSymbols")
223 .add("UnifiedCanadianAboriginalSyllabics")
224 .add("UnifiedCanadianAboriginalSyllabicsExtended")
226 .add("VariationSelectors")
227 .add("VariationSelectorsSupplement")
228 .add("VedicExtensions")
229 .add("VerticalForms")
232 .add("YijingHexagramSymbols").build();
234 private static final int UNICODE_SCRIPT_FIX_COUNTER = 30;
// Private constructor: every member visible in this class is static, so instances cannot be created from outside.
// NOTE(review): the constructor body is not visible in this excerpt.
236 private RegexUtils() {
241 * Converts XSD regex to Java-compatible regex.
243 * @param xsdRegex XSD regex pattern as it is defined in a YANG source
244 * @return Java-compatible regex
246 static String getJavaRegexFromXSD(final String xsdRegex) {
247 // Note: we are using a non-capturing group to deal with internal structure issues, like branches and similar.
248 return "^(?:" + fixUnicodeScriptPattern(escapeChars(xsdRegex)) + ")$";
252 * Both '^' and '$' are special anchor characters in Java regular
253 * expressions, whereas in XSD regular expressions the anchors are implicit.
254 * We therefore need to escape them unless they are defined as part of
255 * a character range, i.e. inside regular square brackets.
// Produces a copy of the given regex, accumulated character by character in a StringBuilder.
// NOTE(review): the per-character handling inside the loop is not visible in this excerpt; going by the
// comment preceding this method it escapes '^' and '$' outside square brackets — confirm against the full source.
257 private static String escapeChars(final String regex) {
// Pre-size the buffer to the input length.
258 final StringBuilder result = new StringBuilder(regex.length());
// Presumably tracks whether the previous character was an unescaped backslash; its updates are not
// visible here — TODO confirm.
260 boolean escape = false;
// Walk the input one char at a time.
261 for (int i = 0; i < regex.length(); i++) {
262 final char ch = regex.charAt(i);
// Hand back whatever was accumulated in the buffer.
295 return result.toString();
// Tries to make rawPattern compilable by java.util.regex: the pattern is test-compiled and, when compilation
// fails with an unknown script/property name, rewritten through fixUnknownScripts() before the next attempt.
// NOTE(review): the try statement, the else branch and the return statements are not visible in this excerpt,
// so the exact exit conditions cannot be confirmed from here.
298 private static String fixUnicodeScriptPattern(String rawPattern) {
// Bounded retry: at most UNICODE_SCRIPT_FIX_COUNTER iterations.
299 for (int i = 0; i < UNICODE_SCRIPT_FIX_COUNTER; i++) {
// Compilation serves purely as a validity check; the compiled Pattern is discarded.
301 Pattern.compile(rawPattern);
303 } catch (final PatternSyntaxException ex) {
304 LOG.debug("Invalid regex pattern syntax in: {}", rawPattern, ex);
305 final String msg = ex.getMessage();
// Only failures whose message starts with one of these two texts are handed to fixUnknownScripts().
// NOTE(review): this relies on the exact wording of the regex engine's error messages.
306 if (msg.startsWith("Unknown character script name")
307 || msg.startsWith("Unknown character property name")) {
308 rawPattern = fixUnknownScripts(msg, rawPattern);
// Presumably reached only once the loop above is exhausted without a successful compile — TODO confirm.
315 LOG.warn("Regex pattern could not be fixed: {}", rawPattern);
319 private static String fixUnknownScripts(final String exMessage, final String rawPattern) {
320 StringBuilder result = new StringBuilder(rawPattern);
321 final Matcher matcher = BETWEEN_CURLY_BRACES_PATTERN.matcher(exMessage);
322 if (matcher.find()) {
323 String capturedGroup = matcher.group(1);
324 if (capturedGroup.startsWith("In/Is")) {
325 // Java 9 changed the reporting string
326 capturedGroup = capturedGroup.substring(5);
329 if (JAVA_UNICODE_BLOCKS.contains(capturedGroup)) {
330 final int idx = rawPattern.indexOf("Is" + capturedGroup);
331 result = result.replace(idx, idx + 2, "In");
334 return result.toString();