Encapsulate regexes in a non-capturing group
[yangtools.git] / yang / yang-model-util / src / main / java / org / opendaylight / yangtools / yang / model / util / RegexUtils.java
1 /*
2  * Copyright (c) 2017 Cisco Systems, Inc. and others.  All rights reserved.
3  *
4  * This program and the accompanying materials are made available under the
5  * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6  * and is available at http://www.eclipse.org/legal/epl-v10.html
7  */
8
9 package org.opendaylight.yangtools.yang.model.util;
10
11 import com.google.common.collect.ImmutableSet;
12 import java.util.Set;
13 import java.util.regex.Matcher;
14 import java.util.regex.Pattern;
15 import java.util.regex.PatternSyntaxException;
16 import org.slf4j.Logger;
17 import org.slf4j.LoggerFactory;
18
19 /**
20  * Utilities for converting YANG XSD regexes into Java-compatible regexes.
21  */
22 public final class RegexUtils {
23     private static final Logger LOG = LoggerFactory.getLogger(RegexUtils.class);
24     private static final Pattern BETWEEN_CURLY_BRACES_PATTERN = Pattern.compile("\\{(.+?)\\}");
25     private static final Set<String> JAVA_UNICODE_BLOCKS = ImmutableSet.<String>builder()
26             .add("AegeanNumbers")
27             .add("AlchemicalSymbols")
28             .add("AlphabeticPresentationForms")
29             .add("AncientGreekMusicalNotation")
30             .add("AncientGreekNumbers")
31             .add("AncientSymbols")
32             .add("Arabic")
33             .add("ArabicPresentationForms-A")
34             .add("ArabicPresentationForms-B")
35             .add("ArabicSupplement")
36             .add("Armenian")
37             .add("Arrows")
38             .add("Avestan")
39             .add("Balinese")
40             .add("Bamum")
41             .add("BamumSupplement")
42             .add("BasicLatin")
43             .add("Batak")
44             .add("Bengali")
45             .add("BlockElements")
46             .add("Bopomofo")
47             .add("BopomofoExtended")
48             .add("BoxDrawing")
49             .add("Brahmi")
50             .add("BraillePatterns")
51             .add("Buginese")
52             .add("Buhid")
53             .add("ByzantineMusicalSymbols")
54             .add("Carian")
55             .add("Cham")
56             .add("Cherokee")
57             .add("CJKCompatibility")
58             .add("CJKCompatibilityForms")
59             .add("CJKCompatibilityIdeographs")
60             .add("CJKCompatibilityIdeographsSupplement")
61             .add("CJKRadicalsSupplement")
62             .add("CJKStrokes")
63             .add("CJKSymbolsandPunctuation")
64             .add("CJKUnifiedIdeographs")
65             .add("CJKUnifiedIdeographsExtensionA")
66             .add("CJKUnifiedIdeographsExtensionB")
67             .add("CJKUnifiedIdeographsExtensionC")
68             .add("CJKUnifiedIdeographsExtensionD")
69             .add("CombiningDiacriticalMarks")
70             .add("CombiningDiacriticalMarksSupplement")
71             .add("CombiningHalfMarks")
72             .add("CombiningDiacriticalMarksforSymbols")
73             .add("CommonIndicNumberForms")
74             .add("ControlPictures")
75             .add("Coptic")
76             .add("CountingRodNumerals")
77             .add("Cuneiform")
78             .add("CuneiformNumbersandPunctuation")
79             .add("CurrencySymbols")
80             .add("CypriotSyllabary")
81             .add("Cyrillic")
82             .add("CyrillicExtended-A")
83             .add("CyrillicExtended-B")
84             .add("CyrillicSupplementary")
85             .add("Deseret")
86             .add("Devanagari")
87             .add("DevanagariExtended")
88             .add("Dingbats")
89             .add("DominoTiles")
90             .add("EgyptianHieroglyphs")
91             .add("Emoticons")
92             .add("EnclosedAlphanumericSupplement")
93             .add("EnclosedAlphanumerics")
94             .add("EnclosedCJKLettersandMonths")
95             .add("EnclosedIdeographicSupplement")
96             .add("Ethiopic")
97             .add("EthiopicExtended")
98             .add("EthiopicExtended-A")
99             .add("EthiopicSupplement")
100             .add("GeneralPunctuation")
101             .add("GeometricShapes")
102             .add("Georgian")
103             .add("GeorgianSupplement")
104             .add("Glagolitic")
105             .add("Gothic")
106             .add("GreekandCoptic")
107             .add("GreekExtended")
108             .add("Gujarati")
109             .add("Gurmukhi")
110             .add("HalfwidthandFullwidthForms")
111             .add("HangulCompatibilityJamo")
112             .add("HangulJamo")
113             .add("HangulJamoExtended-A")
114             .add("HangulJamoExtended-B")
115             .add("HangulSyllables")
116             .add("Hanunoo")
117             .add("Hebrew")
118             .add("HighPrivateUseSurrogates")
119             .add("HighSurrogates")
120             .add("Hiragana")
121             .add("IdeographicDescriptionCharacters")
122             .add("ImperialAramaic")
123             .add("InscriptionalPahlavi")
124             .add("InscriptionalParthian")
125             .add("IPAExtensions")
126             .add("Javanese")
127             .add("Kaithi")
128             .add("KanaSupplement")
129             .add("Kanbun")
130             .add("Kangxi Radicals")
131             .add("Kannada")
132             .add("Katakana")
133             .add("KatakanaPhoneticExtensions")
134             .add("KayahLi")
135             .add("Kharoshthi")
136             .add("Khmer")
137             .add("KhmerSymbols")
138             .add("Lao")
139             .add("Latin-1Supplement")
140             .add("LatinExtended-A")
141             .add("LatinExtendedAdditional")
142             .add("LatinExtended-B")
143             .add("LatinExtended-C")
144             .add("LatinExtended-D")
145             .add("Lepcha")
146             .add("LetterlikeSymbols")
147             .add("Limbu")
148             .add("LinearBIdeograms")
149             .add("LinearBSyllabary")
150             .add("Lisu")
151             .add("LowSurrogates")
152             .add("Lycian")
153             .add("Lydian")
154             .add("MahjongTiles")
155             .add("Malayalam")
156             .add("Mandaic")
157             .add("MathematicalAlphanumericSymbols")
158             .add("MathematicalOperators")
159             .add("MeeteiMayek")
160             .add("MiscellaneousMathematicalSymbols-A")
161             .add("MiscellaneousMathematicalSymbols-B")
162             .add("MiscellaneousSymbols")
163             .add("MiscellaneousSymbolsandArrows")
164             .add("MiscellaneousSymbolsAndPictographs")
165             .add("MiscellaneousTechnical")
166             .add("ModifierToneLetters")
167             .add("Mongolian")
168             .add("MusicalSymbols")
169             .add("Myanmar")
170             .add("MyanmarExtended-A")
171             .add("NewTaiLue")
172             .add("NKo")
173             .add("NumberForms")
174             .add("Ogham")
175             .add("OlChiki")
176             .add("OldItalic")
177             .add("OldPersian")
178             .add("OldSouthArabian")
179             .add("OldTurkic")
180             .add("OpticalCharacterRecognition")
181             .add("Oriya")
182             .add("Osmanya")
183             .add("Phags-pa")
184             .add("PhaistosDisc")
185             .add("Phoenician")
186             .add("PhoneticExtensions")
187             .add("PhoneticExtensionsSupplement")
188             .add("PlayingCards")
189             .add("PrivateUseArea")
190             .add("Rejang")
191             .add("RumiNumeralSymbols")
192             .add("Runic")
193             .add("Samaritan")
194             .add("Saurashtra")
195             .add("Shavian")
196             .add("Sinhala")
197             .add("SmallFormVariants")
198             .add("SpacingModifierLetters")
199             .add("Specials")
200             .add("Sundanese")
201             .add("SuperscriptsandSubscripts")
202             .add("SupplementalArrows-A")
203             .add("SupplementalArrows-B")
204             .add("SupplementalMathematicalOperators")
205             .add("SupplementalPunctuation")
206             .add("SupplementaryPrivateUseArea-A")
207             .add("SupplementaryPrivateUseArea-B")
208             .add("SylotiNagri")
209             .add("Syriac")
210             .add("Tagalog")
211             .add("Tagbanwa")
212             .add("Tags")
213             .add("TaiLe")
214             .add("TaiTham")
215             .add("TaiViet")
216             .add("TaiXuanJingSymbols")
217             .add("Tamil")
218             .add("Telugu")
219             .add("Thaana")
220             .add("Thai")
221             .add("Tibetan")
222             .add("Tifinagh")
223             .add("TransportAndMapSymbols")
224             .add("Ugaritic")
225             .add("UnifiedCanadianAboriginalSyllabics")
226             .add("UnifiedCanadianAboriginalSyllabicsExtended")
227             .add("Vai")
228             .add("VariationSelectors")
229             .add("VariationSelectorsSupplement")
230             .add("VedicExtensions")
231             .add("VerticalForms")
232             .add("YiRadicals")
233             .add("YiSyllables")
234             .add("YijingHexagramSymbols").build();
235
236     private static final int UNICODE_SCRIPT_FIX_COUNTER = 30;
237
238     private RegexUtils() {
239         throw new UnsupportedOperationException("Utility class should not be instantiated.");
240     }
241
242     /**
243      * Converts XSD regex to Java-compatible regex.
244      *
245      * @param xsdRegex XSD regex pattern as it is defined in a YANG source
246      * @return Java-compatible regex
247      */
248     public static String getJavaRegexFromXSD(final String xsdRegex) {
249         // Note: we are using a non-capturing group to deal with internal structure issues, like branches and similar.
250         return "^(?:" + fixUnicodeScriptPattern(escapeChars(xsdRegex)) + ")$";
251     }
252
253     /*
254      * As both '^' and '$' are special anchor characters in java regular
255      * expressions which are implicitly present in XSD regular expressions,
256      * we need to escape them in case they are not defined as part of
257      * character ranges i.e. inside regular square brackets.
258      */
259     private static String escapeChars(final String regex) {
260         final StringBuilder result = new StringBuilder(regex.length());
261         int bracket = 0;
262         boolean escape = false;
263         for (int i = 0; i < regex.length(); i++) {
264             final char ch = regex.charAt(i);
265             switch (ch) {
266                 case '[':
267                     if (!escape) {
268                         bracket++;
269                     }
270                     escape = false;
271                     result.append(ch);
272                     break;
273                 case ']':
274                     if (!escape) {
275                         bracket--;
276                     }
277                     escape = false;
278                     result.append(ch);
279                     break;
280                 case '\\':
281                     escape = !escape;
282                     result.append(ch);
283                     break;
284                 case '^':
285                 case '$':
286                     if (bracket == 0) {
287                         result.append('\\');
288                     }
289                     escape = false;
290                     result.append(ch);
291                     break;
292                 default:
293                     escape = false;
294                     result.append(ch);
295             }
296         }
297         return result.toString();
298     }
299
300     private static String fixUnicodeScriptPattern(String rawPattern) {
301         for (int i = 0; i < UNICODE_SCRIPT_FIX_COUNTER; i++) {
302             try {
303                 Pattern.compile(rawPattern);
304                 return rawPattern;
305             } catch (final PatternSyntaxException ex) {
306                 LOG.debug("Invalid regex pattern syntax in: {}", rawPattern, ex);
307                 if (ex.getMessage().contains("Unknown character script name")) {
308                     rawPattern = fixUnknownScripts(ex.getMessage(), rawPattern);
309                 } else {
310                     return rawPattern;
311                 }
312             }
313         }
314
315         LOG.warn("Regex pattern could not be fixed: {}", rawPattern);
316         return rawPattern;
317     }
318
319     private static String fixUnknownScripts(final String exMessage, final String rawPattern) {
320         StringBuilder result = new StringBuilder(rawPattern);
321         final Matcher matcher = BETWEEN_CURLY_BRACES_PATTERN.matcher(exMessage);
322         if (matcher.find()) {
323             final String capturedGroup = matcher.group(1);
324             if (JAVA_UNICODE_BLOCKS.contains(capturedGroup)) {
325                 final int idx = rawPattern.indexOf("Is" + capturedGroup);
326                 result = result.replace(idx, idx + 2, "In");
327             }
328         }
329         return result.toString();
330     }
331 }