2 * Copyright (c) 2017 Cisco Systems, Inc. and others. All rights reserved.
4 * This program and the accompanying materials are made available under the
5 * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6 * and is available at http://www.eclipse.org/legal/epl-v10.html
9 package org.opendaylight.yangtools.yang.model.util;
11 import com.google.common.collect.ImmutableSet;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14 import java.util.regex.PatternSyntaxException;
15 import org.slf4j.Logger;
16 import org.slf4j.LoggerFactory;
19 * Utilities for converting YANG XSD regexes into Java-compatible regexes.
21 public final class RegexUtils {
22 private static final Logger LOG = LoggerFactory.getLogger(RegexUtils.class);
23 private static final Pattern BETWEEN_CURLY_BRACES_PATTERN = Pattern.compile("\\{(.+?)\\}");
24 private static final ImmutableSet<String> JAVA_UNICODE_BLOCKS = ImmutableSet.<String>builder()
26 .add("AlchemicalSymbols")
27 .add("AlphabeticPresentationForms")
28 .add("AncientGreekMusicalNotation")
29 .add("AncientGreekNumbers")
30 .add("AncientSymbols")
32 .add("ArabicPresentationForms-A")
33 .add("ArabicPresentationForms-B")
34 .add("ArabicSupplement")
40 .add("BamumSupplement")
46 .add("BopomofoExtended")
49 .add("BraillePatterns")
52 .add("ByzantineMusicalSymbols")
56 .add("CJKCompatibility")
57 .add("CJKCompatibilityForms")
58 .add("CJKCompatibilityIdeographs")
59 .add("CJKCompatibilityIdeographsSupplement")
60 .add("CJKRadicalsSupplement")
62 .add("CJKSymbolsandPunctuation")
63 .add("CJKUnifiedIdeographs")
64 .add("CJKUnifiedIdeographsExtensionA")
65 .add("CJKUnifiedIdeographsExtensionB")
66 .add("CJKUnifiedIdeographsExtensionC")
67 .add("CJKUnifiedIdeographsExtensionD")
68 .add("CombiningDiacriticalMarks")
69 .add("CombiningDiacriticalMarksSupplement")
70 .add("CombiningHalfMarks")
71 .add("CombiningDiacriticalMarksforSymbols")
72 .add("CommonIndicNumberForms")
73 .add("ControlPictures")
75 .add("CountingRodNumerals")
77 .add("CuneiformNumbersandPunctuation")
78 .add("CurrencySymbols")
79 .add("CypriotSyllabary")
81 .add("CyrillicExtended-A")
82 .add("CyrillicExtended-B")
83 .add("CyrillicSupplementary")
86 .add("DevanagariExtended")
89 .add("EgyptianHieroglyphs")
91 .add("EnclosedAlphanumericSupplement")
92 .add("EnclosedAlphanumerics")
93 .add("EnclosedCJKLettersandMonths")
94 .add("EnclosedIdeographicSupplement")
96 .add("EthiopicExtended")
97 .add("EthiopicExtended-A")
98 .add("EthiopicSupplement")
99 .add("GeneralPunctuation")
100 .add("GeometricShapes")
102 .add("GeorgianSupplement")
105 .add("GreekandCoptic")
106 .add("GreekExtended")
109 .add("HalfwidthandFullwidthForms")
110 .add("HangulCompatibilityJamo")
112 .add("HangulJamoExtended-A")
113 .add("HangulJamoExtended-B")
114 .add("HangulSyllables")
117 .add("HighPrivateUseSurrogates")
118 .add("HighSurrogates")
120 .add("IdeographicDescriptionCharacters")
121 .add("ImperialAramaic")
122 .add("InscriptionalPahlavi")
123 .add("InscriptionalParthian")
124 .add("IPAExtensions")
127 .add("KanaSupplement")
129 .add("Kangxi Radicals")
132 .add("KatakanaPhoneticExtensions")
138 .add("Latin-1Supplement")
139 .add("LatinExtended-A")
140 .add("LatinExtendedAdditional")
141 .add("LatinExtended-B")
142 .add("LatinExtended-C")
143 .add("LatinExtended-D")
145 .add("LetterlikeSymbols")
147 .add("LinearBIdeograms")
148 .add("LinearBSyllabary")
150 .add("LowSurrogates")
156 .add("MathematicalAlphanumericSymbols")
157 .add("MathematicalOperators")
159 .add("MiscellaneousMathematicalSymbols-A")
160 .add("MiscellaneousMathematicalSymbols-B")
161 .add("MiscellaneousSymbols")
162 .add("MiscellaneousSymbolsandArrows")
163 .add("MiscellaneousSymbolsAndPictographs")
164 .add("MiscellaneousTechnical")
165 .add("ModifierToneLetters")
167 .add("MusicalSymbols")
169 .add("MyanmarExtended-A")
177 .add("OldSouthArabian")
179 .add("OpticalCharacterRecognition")
185 .add("PhoneticExtensions")
186 .add("PhoneticExtensionsSupplement")
188 .add("PrivateUseArea")
190 .add("RumiNumeralSymbols")
196 .add("SmallFormVariants")
197 .add("SpacingModifierLetters")
200 .add("SuperscriptsandSubscripts")
201 .add("SupplementalArrows-A")
202 .add("SupplementalArrows-B")
203 .add("SupplementalMathematicalOperators")
204 .add("SupplementalPunctuation")
205 .add("SupplementaryPrivateUseArea-A")
206 .add("SupplementaryPrivateUseArea-B")
215 .add("TaiXuanJingSymbols")
222 .add("TransportAndMapSymbols")
224 .add("UnifiedCanadianAboriginalSyllabics")
225 .add("UnifiedCanadianAboriginalSyllabicsExtended")
227 .add("VariationSelectors")
228 .add("VariationSelectorsSupplement")
229 .add("VedicExtensions")
230 .add("VerticalForms")
233 .add("YijingHexagramSymbols").build();
235 private static final int UNICODE_SCRIPT_FIX_COUNTER = 30;
237 private RegexUtils() {
242 * Converts XSD regex to Java-compatible regex.
244 * @param xsdRegex XSD regex pattern as it is defined in a YANG source
245 * @return Java-compatible regex
247 public static String getJavaRegexFromXSD(final String xsdRegex) {
248 // Note: we are using a non-capturing group to deal with internal structure issues, like branches and similar.
249 return "^(?:" + fixUnicodeScriptPattern(escapeChars(xsdRegex)) + ")$";
253 * As both '^' and '$' are special anchor characters in java regular
254 * expressions which are implicitly present in XSD regular expressions,
255 * we need to escape them in case they are not defined as part of
256 * character ranges i.e. inside regular square brackets.
258 private static String escapeChars(final String regex) {
259 final StringBuilder result = new StringBuilder(regex.length());
261 boolean escape = false;
262 for (int i = 0; i < regex.length(); i++) {
263 final char ch = regex.charAt(i);
296 return result.toString();
299 private static String fixUnicodeScriptPattern(String rawPattern) {
300 for (int i = 0; i < UNICODE_SCRIPT_FIX_COUNTER; i++) {
302 Pattern.compile(rawPattern);
304 } catch (final PatternSyntaxException ex) {
305 LOG.debug("Invalid regex pattern syntax in: {}", rawPattern, ex);
306 final String msg = ex.getMessage();
307 if (msg.startsWith("Unknown character script name")
308 || msg.startsWith("Unknown character property name")) {
309 rawPattern = fixUnknownScripts(msg, rawPattern);
316 LOG.warn("Regex pattern could not be fixed: {}", rawPattern);
320 private static String fixUnknownScripts(final String exMessage, final String rawPattern) {
321 StringBuilder result = new StringBuilder(rawPattern);
322 final Matcher matcher = BETWEEN_CURLY_BRACES_PATTERN.matcher(exMessage);
323 if (matcher.find()) {
324 String capturedGroup = matcher.group(1);
325 if (capturedGroup.startsWith("In/Is")) {
326 // Java 9 changed the reporting string
327 capturedGroup = capturedGroup.substring(5);
330 if (JAVA_UNICODE_BLOCKS.contains(capturedGroup)) {
331 final int idx = rawPattern.indexOf("Is" + capturedGroup);
332 result = result.replace(idx, idx + 2, "In");
335 return result.toString();