8a2a8fb65c35e6b9b0b76d433712de2df02b501c
[yangtools.git] / parser / yang-parser-rfc7950 / src / main / java / org / opendaylight / yangtools / yang / parser / rfc7950 / stmt / pattern / RegexUtils.java
1 /*
2  * Copyright (c) 2017 Cisco Systems, Inc. and others.  All rights reserved.
3  *
4  * This program and the accompanying materials are made available under the
5  * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6  * and is available at http://www.eclipse.org/legal/epl-v10.html
7  */
8 package org.opendaylight.yangtools.yang.parser.rfc7950.stmt.pattern;
9
10 import com.google.common.collect.ImmutableSet;
11 import java.util.regex.Matcher;
12 import java.util.regex.Pattern;
13 import java.util.regex.PatternSyntaxException;
14 import org.slf4j.Logger;
15 import org.slf4j.LoggerFactory;
16
17 /**
18  * Utilities for converting YANG XSD regexes into Java-compatible regexes.
19  */
20 final class RegexUtils {
21     private static final Logger LOG = LoggerFactory.getLogger(RegexUtils.class);
22     private static final Pattern BETWEEN_CURLY_BRACES_PATTERN = Pattern.compile("\\{(.+?)\\}");
23     private static final ImmutableSet<String> JAVA_UNICODE_BLOCKS = ImmutableSet.<String>builder()
24             .add("AegeanNumbers")
25             .add("AlchemicalSymbols")
26             .add("AlphabeticPresentationForms")
27             .add("AncientGreekMusicalNotation")
28             .add("AncientGreekNumbers")
29             .add("AncientSymbols")
30             .add("Arabic")
31             .add("ArabicPresentationForms-A")
32             .add("ArabicPresentationForms-B")
33             .add("ArabicSupplement")
34             .add("Armenian")
35             .add("Arrows")
36             .add("Avestan")
37             .add("Balinese")
38             .add("Bamum")
39             .add("BamumSupplement")
40             .add("BasicLatin")
41             .add("Batak")
42             .add("Bengali")
43             .add("BlockElements")
44             .add("Bopomofo")
45             .add("BopomofoExtended")
46             .add("BoxDrawing")
47             .add("Brahmi")
48             .add("BraillePatterns")
49             .add("Buginese")
50             .add("Buhid")
51             .add("ByzantineMusicalSymbols")
52             .add("Carian")
53             .add("Cham")
54             .add("Cherokee")
55             .add("CJKCompatibility")
56             .add("CJKCompatibilityForms")
57             .add("CJKCompatibilityIdeographs")
58             .add("CJKCompatibilityIdeographsSupplement")
59             .add("CJKRadicalsSupplement")
60             .add("CJKStrokes")
61             .add("CJKSymbolsandPunctuation")
62             .add("CJKUnifiedIdeographs")
63             .add("CJKUnifiedIdeographsExtensionA")
64             .add("CJKUnifiedIdeographsExtensionB")
65             .add("CJKUnifiedIdeographsExtensionC")
66             .add("CJKUnifiedIdeographsExtensionD")
67             .add("CombiningDiacriticalMarks")
68             .add("CombiningDiacriticalMarksSupplement")
69             .add("CombiningHalfMarks")
70             .add("CombiningDiacriticalMarksforSymbols")
71             .add("CommonIndicNumberForms")
72             .add("ControlPictures")
73             .add("Coptic")
74             .add("CountingRodNumerals")
75             .add("Cuneiform")
76             .add("CuneiformNumbersandPunctuation")
77             .add("CurrencySymbols")
78             .add("CypriotSyllabary")
79             .add("Cyrillic")
80             .add("CyrillicExtended-A")
81             .add("CyrillicExtended-B")
82             .add("CyrillicSupplementary")
83             .add("Deseret")
84             .add("Devanagari")
85             .add("DevanagariExtended")
86             .add("Dingbats")
87             .add("DominoTiles")
88             .add("EgyptianHieroglyphs")
89             .add("Emoticons")
90             .add("EnclosedAlphanumericSupplement")
91             .add("EnclosedAlphanumerics")
92             .add("EnclosedCJKLettersandMonths")
93             .add("EnclosedIdeographicSupplement")
94             .add("Ethiopic")
95             .add("EthiopicExtended")
96             .add("EthiopicExtended-A")
97             .add("EthiopicSupplement")
98             .add("GeneralPunctuation")
99             .add("GeometricShapes")
100             .add("Georgian")
101             .add("GeorgianSupplement")
102             .add("Glagolitic")
103             .add("Gothic")
104             .add("GreekandCoptic")
105             .add("GreekExtended")
106             .add("Gujarati")
107             .add("Gurmukhi")
108             .add("HalfwidthandFullwidthForms")
109             .add("HangulCompatibilityJamo")
110             .add("HangulJamo")
111             .add("HangulJamoExtended-A")
112             .add("HangulJamoExtended-B")
113             .add("HangulSyllables")
114             .add("Hanunoo")
115             .add("Hebrew")
116             .add("HighPrivateUseSurrogates")
117             .add("HighSurrogates")
118             .add("Hiragana")
119             .add("IdeographicDescriptionCharacters")
120             .add("ImperialAramaic")
121             .add("InscriptionalPahlavi")
122             .add("InscriptionalParthian")
123             .add("IPAExtensions")
124             .add("Javanese")
125             .add("Kaithi")
126             .add("KanaSupplement")
127             .add("Kanbun")
128             .add("Kangxi Radicals")
129             .add("Kannada")
130             .add("Katakana")
131             .add("KatakanaPhoneticExtensions")
132             .add("KayahLi")
133             .add("Kharoshthi")
134             .add("Khmer")
135             .add("KhmerSymbols")
136             .add("Lao")
137             .add("Latin-1Supplement")
138             .add("LatinExtended-A")
139             .add("LatinExtendedAdditional")
140             .add("LatinExtended-B")
141             .add("LatinExtended-C")
142             .add("LatinExtended-D")
143             .add("Lepcha")
144             .add("LetterlikeSymbols")
145             .add("Limbu")
146             .add("LinearBIdeograms")
147             .add("LinearBSyllabary")
148             .add("Lisu")
149             .add("LowSurrogates")
150             .add("Lycian")
151             .add("Lydian")
152             .add("MahjongTiles")
153             .add("Malayalam")
154             .add("Mandaic")
155             .add("MathematicalAlphanumericSymbols")
156             .add("MathematicalOperators")
157             .add("MeeteiMayek")
158             .add("MiscellaneousMathematicalSymbols-A")
159             .add("MiscellaneousMathematicalSymbols-B")
160             .add("MiscellaneousSymbols")
161             .add("MiscellaneousSymbolsandArrows")
162             .add("MiscellaneousSymbolsAndPictographs")
163             .add("MiscellaneousTechnical")
164             .add("ModifierToneLetters")
165             .add("Mongolian")
166             .add("MusicalSymbols")
167             .add("Myanmar")
168             .add("MyanmarExtended-A")
169             .add("NewTaiLue")
170             .add("NKo")
171             .add("NumberForms")
172             .add("Ogham")
173             .add("OlChiki")
174             .add("OldItalic")
175             .add("OldPersian")
176             .add("OldSouthArabian")
177             .add("OldTurkic")
178             .add("OpticalCharacterRecognition")
179             .add("Oriya")
180             .add("Osmanya")
181             .add("Phags-pa")
182             .add("PhaistosDisc")
183             .add("Phoenician")
184             .add("PhoneticExtensions")
185             .add("PhoneticExtensionsSupplement")
186             .add("PlayingCards")
187             .add("PrivateUseArea")
188             .add("Rejang")
189             .add("RumiNumeralSymbols")
190             .add("Runic")
191             .add("Samaritan")
192             .add("Saurashtra")
193             .add("Shavian")
194             .add("Sinhala")
195             .add("SmallFormVariants")
196             .add("SpacingModifierLetters")
197             .add("Specials")
198             .add("Sundanese")
199             .add("SuperscriptsandSubscripts")
200             .add("SupplementalArrows-A")
201             .add("SupplementalArrows-B")
202             .add("SupplementalMathematicalOperators")
203             .add("SupplementalPunctuation")
204             .add("SupplementaryPrivateUseArea-A")
205             .add("SupplementaryPrivateUseArea-B")
206             .add("SylotiNagri")
207             .add("Syriac")
208             .add("Tagalog")
209             .add("Tagbanwa")
210             .add("Tags")
211             .add("TaiLe")
212             .add("TaiTham")
213             .add("TaiViet")
214             .add("TaiXuanJingSymbols")
215             .add("Tamil")
216             .add("Telugu")
217             .add("Thaana")
218             .add("Thai")
219             .add("Tibetan")
220             .add("Tifinagh")
221             .add("TransportAndMapSymbols")
222             .add("Ugaritic")
223             .add("UnifiedCanadianAboriginalSyllabics")
224             .add("UnifiedCanadianAboriginalSyllabicsExtended")
225             .add("Vai")
226             .add("VariationSelectors")
227             .add("VariationSelectorsSupplement")
228             .add("VedicExtensions")
229             .add("VerticalForms")
230             .add("YiRadicals")
231             .add("YiSyllables")
232             .add("YijingHexagramSymbols").build();
233
234     private static final int UNICODE_SCRIPT_FIX_COUNTER = 30;
235
236     private RegexUtils() {
237         // Hidden on purpose
238     }
239
240     /**
241      * Converts XSD regex to Java-compatible regex.
242      *
243      * @param xsdRegex XSD regex pattern as it is defined in a YANG source
244      * @return Java-compatible regex
245      */
246     static String getJavaRegexFromXSD(final String xsdRegex) {
247         // Note: we are using a non-capturing group to deal with internal structure issues, like branches and similar.
248         return "^(?:" + fixUnicodeScriptPattern(escapeChars(xsdRegex)) + ")$";
249     }
250
251     /*
252      * As both '^' and '$' are special anchor characters in java regular
253      * expressions which are implicitly present in XSD regular expressions,
254      * we need to escape them in case they are not defined as part of
255      * character ranges i.e. inside regular square brackets.
256      */
257     private static String escapeChars(final String regex) {
258         final StringBuilder result = new StringBuilder(regex.length());
259         int bracket = 0;
260         boolean escape = false;
261         for (int i = 0; i < regex.length(); i++) {
262             final char ch = regex.charAt(i);
263             switch (ch) {
264                 case '[':
265                     if (!escape) {
266                         bracket++;
267                     }
268                     escape = false;
269                     result.append(ch);
270                     break;
271                 case ']':
272                     if (!escape) {
273                         bracket--;
274                     }
275                     escape = false;
276                     result.append(ch);
277                     break;
278                 case '\\':
279                     escape = !escape;
280                     result.append(ch);
281                     break;
282                 case '^':
283                 case '$':
284                     if (bracket == 0) {
285                         result.append('\\');
286                     }
287                     escape = false;
288                     result.append(ch);
289                     break;
290                 default:
291                     escape = false;
292                     result.append(ch);
293             }
294         }
295         return result.toString();
296     }
297
298     private static String fixUnicodeScriptPattern(String rawPattern) {
299         for (int i = 0; i < UNICODE_SCRIPT_FIX_COUNTER; i++) {
300             try {
301                 Pattern.compile(rawPattern);
302                 return rawPattern;
303             } catch (final PatternSyntaxException ex) {
304                 LOG.debug("Invalid regex pattern syntax in: {}", rawPattern, ex);
305                 final String msg = ex.getMessage();
306                 if (msg.startsWith("Unknown character script name")
307                         || msg.startsWith("Unknown character property name")) {
308                     rawPattern = fixUnknownScripts(msg, rawPattern);
309                 } else {
310                     return rawPattern;
311                 }
312             }
313         }
314
315         LOG.warn("Regex pattern could not be fixed: {}", rawPattern);
316         return rawPattern;
317     }
318
319     private static String fixUnknownScripts(final String exMessage, final String rawPattern) {
320         StringBuilder result = new StringBuilder(rawPattern);
321         final Matcher matcher = BETWEEN_CURLY_BRACES_PATTERN.matcher(exMessage);
322         if (matcher.find()) {
323             String capturedGroup = matcher.group(1);
324             if (capturedGroup.startsWith("In/Is")) {
325                 // Java 9 changed the reporting string
326                 capturedGroup = capturedGroup.substring(5);
327             } else if (capturedGroup.startsWith("Is")) {
328                 // Java 14 changed the reporting string (https://bugs.openjdk.java.net/browse/JDK-8230338)
329                 capturedGroup = capturedGroup.substring(2);
330             }
331
332             if (JAVA_UNICODE_BLOCKS.contains(capturedGroup)) {
333                 final int idx = rawPattern.indexOf("Is" + capturedGroup);
334                 result = result.replace(idx, idx + 2, "In");
335             }
336         }
337         return result.toString();
338     }
339 }