2 * Copyright (c) 2017 Cisco Systems, Inc. and others. All rights reserved.
4 * This program and the accompanying materials are made available under the
5 * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6 * and is available at http://www.eclipse.org/legal/epl-v10.html
8 package org.opendaylight.yangtools.yang.parser.rfc7950.stmt.pattern;
10 import com.google.common.collect.ImmutableSet;
11 import java.lang.Character.UnicodeBlock;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14 import java.util.regex.PatternSyntaxException;
15 import org.slf4j.Logger;
16 import org.slf4j.LoggerFactory;
19 * Utilities for converting YANG XSD regexes into Java-compatible regexes.
21 final class RegexUtils {
// Class logger; used below for the debug and warning messages emitted while repairing patterns.
22 private static final Logger LOG = LoggerFactory.getLogger(RegexUtils.class);
// Reluctantly captures the text between a '{' and the next '}'. fixUnknownScripts applies it to an exception
// message to pull out the name reported by the regex compiler.
23 private static final Pattern BETWEEN_CURLY_BRACES_PATTERN = Pattern.compile("\\{(.+?)\\}");
26 * Unicode blocks known to Java. We do not use {@link UnicodeBlock#forName(String)} due to the need to differentiate
27 * runtime-supported and compile-time supported blocks. We are limited to the latter, i.e. even if we are running
28 * on (for example) Java 17, we must rely only on blocks supported by our compilation target (for example) Java 11.
31 * Furthermore we take a page from
32 * <a href="https://www.w3.org/TR/xmlschema11-2/#charcter-classes">G.4.2.3 Block escapes</a> and only match properly
33 * normalized names, which is different from what Java does.
35 private static final ImmutableSet<String> JAVA_UNICODE_BLOCKS = ImmutableSet.<String>builder()
38 .add("AlchemicalSymbols")
39 .add("AlphabeticPresentationForms")
40 .add("AncientGreekMusicalNotation")
41 .add("AncientGreekNumbers")
42 .add("AncientSymbols")
44 .add("ArabicPresentationForms-A")
45 .add("ArabicPresentationForms-B")
46 .add("ArabicSupplement")
52 .add("BamumSupplement")
58 .add("BopomofoExtended")
61 .add("BraillePatterns")
64 .add("ByzantineMusicalSymbols")
68 .add("CJKCompatibility")
69 .add("CJKCompatibilityForms")
70 .add("CJKCompatibilityIdeographs")
71 .add("CJKCompatibilityIdeographsSupplement")
72 .add("CJKRadicalsSupplement")
74 .add("CJKSymbolsandPunctuation")
75 .add("CJKUnifiedIdeographs")
76 .add("CJKUnifiedIdeographsExtensionA")
77 .add("CJKUnifiedIdeographsExtensionB")
78 .add("CJKUnifiedIdeographsExtensionC")
79 .add("CJKUnifiedIdeographsExtensionD")
80 .add("CombiningDiacriticalMarks")
81 .add("CombiningDiacriticalMarksSupplement")
82 .add("CombiningHalfMarks")
83 .add("CombiningDiacriticalMarksforSymbols")
84 .add("CommonIndicNumberForms")
85 .add("ControlPictures")
87 .add("CountingRodNumerals")
89 .add("CuneiformNumbersandPunctuation")
90 .add("CurrencySymbols")
91 .add("CypriotSyllabary")
93 .add("CyrillicExtended-A")
94 .add("CyrillicExtended-B")
95 .add("CyrillicSupplementary")
98 .add("DevanagariExtended")
101 .add("EgyptianHieroglyphs")
103 .add("EnclosedAlphanumericSupplement")
104 .add("EnclosedAlphanumerics")
105 .add("EnclosedCJKLettersandMonths")
106 .add("EnclosedIdeographicSupplement")
108 .add("EthiopicExtended")
109 .add("EthiopicExtended-A")
110 .add("EthiopicSupplement")
111 .add("GeneralPunctuation")
112 .add("GeometricShapes")
114 .add("GeorgianSupplement")
117 .add("GreekandCoptic")
118 .add("GreekExtended")
121 .add("HalfwidthandFullwidthForms")
122 .add("HangulCompatibilityJamo")
124 .add("HangulJamoExtended-A")
125 .add("HangulJamoExtended-B")
126 .add("HangulSyllables")
129 .add("HighPrivateUseSurrogates")
130 .add("HighSurrogates")
132 .add("IdeographicDescriptionCharacters")
133 .add("ImperialAramaic")
134 .add("InscriptionalPahlavi")
135 .add("InscriptionalParthian")
136 .add("IPAExtensions")
139 .add("KanaSupplement")
141 .add("Kangxi Radicals")
144 .add("KatakanaPhoneticExtensions")
150 .add("Latin-1Supplement")
151 .add("LatinExtended-A")
152 .add("LatinExtendedAdditional")
153 .add("LatinExtended-B")
154 .add("LatinExtended-C")
155 .add("LatinExtended-D")
157 .add("LetterlikeSymbols")
159 .add("LinearBIdeograms")
160 .add("LinearBSyllabary")
162 .add("LowSurrogates")
168 .add("MathematicalAlphanumericSymbols")
169 .add("MathematicalOperators")
171 .add("MiscellaneousMathematicalSymbols-A")
172 .add("MiscellaneousMathematicalSymbols-B")
173 .add("MiscellaneousSymbols")
174 .add("MiscellaneousSymbolsandArrows")
175 .add("MiscellaneousSymbolsAndPictographs")
176 .add("MiscellaneousTechnical")
177 .add("ModifierToneLetters")
179 .add("MusicalSymbols")
181 .add("MyanmarExtended-A")
189 .add("OldSouthArabian")
191 .add("OpticalCharacterRecognition")
197 .add("PhoneticExtensions")
198 .add("PhoneticExtensionsSupplement")
200 .add("PrivateUseArea")
202 .add("RumiNumeralSymbols")
208 .add("SmallFormVariants")
209 .add("SpacingModifierLetters")
212 .add("SuperscriptsandSubscripts")
213 .add("SupplementalArrows-A")
214 .add("SupplementalArrows-B")
215 .add("SupplementalMathematicalOperators")
216 .add("SupplementalPunctuation")
217 .add("SupplementaryPrivateUseArea-A")
218 .add("SupplementaryPrivateUseArea-B")
227 .add("TaiXuanJingSymbols")
234 .add("TransportAndMapSymbols")
236 .add("UnifiedCanadianAboriginalSyllabics")
237 .add("UnifiedCanadianAboriginalSyllabicsExtended")
239 .add("VariationSelectors")
240 .add("VariationSelectorsSupplement")
241 .add("VedicExtensions")
242 .add("VerticalForms")
245 .add("YijingHexagramSymbols")
248 .add("ArabicExtended-A")
249 .add("ArabicMathematicalAlphabeticSymbols")
251 .add("MeeteiMeyekExtensions")
252 .add("MeroiticCursive")
253 .add("MeroiticHieroglyphs")
257 .add("SundaneseSupplement")
262 .add("AnatolianHieroglyphs")
264 .add("CaucasianAlbanian")
265 .add("CherokeeSupplement")
266 .add("CJKUnifiedIdeographsExtensionE")
267 .add("CombiningDiacriticalMarksExtended")
268 .add("CopticEpactNumbers")
270 .add("EarlyDynasticCuneiform")
272 .add("GeometricShapesExtended")
277 .add("LatinExtended-E")
285 .add("MyanmarExtended-B")
288 .add("OldNorthArabian")
290 .add("OrnamentalDingbats")
294 .add("PsalterPahlavi")
295 .add("ShorthandFormatControls")
297 .add("SinhalaArchaicNumbers")
298 .add("SupplementalArrows-C")
299 .add("SupplementalSymbolsandPictographs")
300 .add("SuttonSignWriting")
307 .add("CJKUnifiedIdeographsExtensionF")
308 .add("CyrillicExtended-C")
309 .add("GlagoliticSupplement")
310 .add("IdeographicSymbolsandPunctuation")
311 .add("KanaExtended-A")
314 .add("MongolianSupplement")
319 .add("SyriacSupplement")
321 .add("TangutComponents")
322 .add("ZanabazarSquare")
// Upper bound on the number of compile-and-repair iterations fixUnicodeScriptPattern performs for one pattern.
325 private static final int UNICODE_SCRIPT_FIX_COUNTER = 30;
// Private constructor: every member visible in this class is static, so no instance is required.
// NOTE(review): the constructor body is not visible here — confirm it is empty (or only throws).
327 private RegexUtils() {
332 * Converts XSD regex to Java-compatible regex.
334 * @param xsdRegex XSD regex pattern as it is defined in a YANG source
335 * @return Java-compatible regex
337 static String getJavaRegexFromXSD(final String xsdRegex) {
338 // Note: we are using a non-capturing group to deal with internal structure issues, like branches and similar.
339 return "^(?:" + fixUnicodeScriptPattern(escapeChars(xsdRegex)) + ")$";
343 * As both '^' and '$' are special anchor characters in java regular
344 * expressions which are implicitly present in XSD regular expressions,
345 * we need to escape them in case they are not defined as part of
346 * character ranges i.e. inside regular square brackets.
// Returns a copy of regex assembled in a StringBuilder. According to the description preceding this method, the
// purpose is to escape '^' and '$' wherever they occur outside square brackets.
// NOTE(review): the per-character handling inside the loop is not visible here — confirm the exact escaping rules
// against the full method before relying on them.
348 private static String escapeChars(final String regex) {
// Initial capacity equals the input length.
349 final StringBuilder result = new StringBuilder(regex.length());
// Presumably records whether the character being examined is preceded by an unescaped backslash — TODO confirm.
351 boolean escape = false;
// Walk the input one char (UTF-16 code unit) at a time.
352 for (int i = 0; i < regex.length(); i++) {
353 final char ch = regex.charAt(i);
386 return result.toString();
// Tries to turn rawPattern into something java.util.regex.Pattern can compile: the pattern is compiled, and when
// the failure message reports an unknown character script/property name it is handed to fixUnknownScripts and
// tried again, for at most UNICODE_SCRIPT_FIX_COUNTER iterations.
// NOTE(review): the return statements and the handling of other syntax errors are not visible here — confirm.
389 private static String fixUnicodeScriptPattern(String rawPattern) {
// Bounded retry loop; each pass works on the pattern as rewritten by the previous pass.
390 for (int i = 0; i < UNICODE_SCRIPT_FIX_COUNTER; i++) {
// Compiled purely for validation: the resulting Pattern object is not kept.
392 Pattern.compile(rawPattern);
394 } catch (final PatternSyntaxException ex) {
395 LOG.debug("Invalid regex pattern syntax in: {}", rawPattern, ex);
// The decision below is keyed on the exception text, so it depends on the wording used by the running JDK.
396 final String msg = ex.getMessage();
397 if (msg.startsWith("Unknown character script name")
398 || msg.startsWith("Unknown character property name")) {
// Replace the working pattern with the (possibly) repaired one.
399 rawPattern = fixUnknownScripts(msg, rawPattern);
// Emitted with the last pattern that was attempted.
406 LOG.warn("Regex pattern could not be fixed: {}", rawPattern);
410 private static String fixUnknownScripts(final String exMessage, final String rawPattern) {
411 StringBuilder result = new StringBuilder(rawPattern);
412 final Matcher matcher = BETWEEN_CURLY_BRACES_PATTERN.matcher(exMessage);
413 if (matcher.find()) {
414 String capturedGroup = matcher.group(1);
415 if (capturedGroup.startsWith("In/Is")) {
416 // Java 9 changed the reporting string
417 capturedGroup = capturedGroup.substring(5);
418 } else if (capturedGroup.startsWith("Is")) {
419 // Java 14 changed the reporting string (https://bugs.openjdk.java.net/browse/JDK-8230338)
420 capturedGroup = capturedGroup.substring(2);
423 if (JAVA_UNICODE_BLOCKS.contains(capturedGroup)) {
424 final int idx = rawPattern.indexOf("Is" + capturedGroup);
425 result = result.replace(idx, idx + 2, "In");
428 return result.toString();