Merge branch 'master' of ../controller
[yangtools.git] / yang / yang-model-util / src / main / java / org / opendaylight / yangtools / yang / model / util / RegexUtils.java
1 /*
2  * Copyright (c) 2017 Cisco Systems, Inc. and others.  All rights reserved.
3  *
4  * This program and the accompanying materials are made available under the
5  * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6  * and is available at http://www.eclipse.org/legal/epl-v10.html
7  */
8
9 package org.opendaylight.yangtools.yang.model.util;
10
11 import com.google.common.collect.ImmutableSet;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14 import java.util.regex.PatternSyntaxException;
15 import org.slf4j.Logger;
16 import org.slf4j.LoggerFactory;
17
18 /**
19  * Utilities for converting YANG XSD regexes into Java-compatible regexes.
20  */
21 public final class RegexUtils {
22     private static final Logger LOG = LoggerFactory.getLogger(RegexUtils.class);
23     private static final Pattern BETWEEN_CURLY_BRACES_PATTERN = Pattern.compile("\\{(.+?)\\}");
24     private static final ImmutableSet<String> JAVA_UNICODE_BLOCKS = ImmutableSet.<String>builder()
25             .add("AegeanNumbers")
26             .add("AlchemicalSymbols")
27             .add("AlphabeticPresentationForms")
28             .add("AncientGreekMusicalNotation")
29             .add("AncientGreekNumbers")
30             .add("AncientSymbols")
31             .add("Arabic")
32             .add("ArabicPresentationForms-A")
33             .add("ArabicPresentationForms-B")
34             .add("ArabicSupplement")
35             .add("Armenian")
36             .add("Arrows")
37             .add("Avestan")
38             .add("Balinese")
39             .add("Bamum")
40             .add("BamumSupplement")
41             .add("BasicLatin")
42             .add("Batak")
43             .add("Bengali")
44             .add("BlockElements")
45             .add("Bopomofo")
46             .add("BopomofoExtended")
47             .add("BoxDrawing")
48             .add("Brahmi")
49             .add("BraillePatterns")
50             .add("Buginese")
51             .add("Buhid")
52             .add("ByzantineMusicalSymbols")
53             .add("Carian")
54             .add("Cham")
55             .add("Cherokee")
56             .add("CJKCompatibility")
57             .add("CJKCompatibilityForms")
58             .add("CJKCompatibilityIdeographs")
59             .add("CJKCompatibilityIdeographsSupplement")
60             .add("CJKRadicalsSupplement")
61             .add("CJKStrokes")
62             .add("CJKSymbolsandPunctuation")
63             .add("CJKUnifiedIdeographs")
64             .add("CJKUnifiedIdeographsExtensionA")
65             .add("CJKUnifiedIdeographsExtensionB")
66             .add("CJKUnifiedIdeographsExtensionC")
67             .add("CJKUnifiedIdeographsExtensionD")
68             .add("CombiningDiacriticalMarks")
69             .add("CombiningDiacriticalMarksSupplement")
70             .add("CombiningHalfMarks")
71             .add("CombiningDiacriticalMarksforSymbols")
72             .add("CommonIndicNumberForms")
73             .add("ControlPictures")
74             .add("Coptic")
75             .add("CountingRodNumerals")
76             .add("Cuneiform")
77             .add("CuneiformNumbersandPunctuation")
78             .add("CurrencySymbols")
79             .add("CypriotSyllabary")
80             .add("Cyrillic")
81             .add("CyrillicExtended-A")
82             .add("CyrillicExtended-B")
83             .add("CyrillicSupplementary")
84             .add("Deseret")
85             .add("Devanagari")
86             .add("DevanagariExtended")
87             .add("Dingbats")
88             .add("DominoTiles")
89             .add("EgyptianHieroglyphs")
90             .add("Emoticons")
91             .add("EnclosedAlphanumericSupplement")
92             .add("EnclosedAlphanumerics")
93             .add("EnclosedCJKLettersandMonths")
94             .add("EnclosedIdeographicSupplement")
95             .add("Ethiopic")
96             .add("EthiopicExtended")
97             .add("EthiopicExtended-A")
98             .add("EthiopicSupplement")
99             .add("GeneralPunctuation")
100             .add("GeometricShapes")
101             .add("Georgian")
102             .add("GeorgianSupplement")
103             .add("Glagolitic")
104             .add("Gothic")
105             .add("GreekandCoptic")
106             .add("GreekExtended")
107             .add("Gujarati")
108             .add("Gurmukhi")
109             .add("HalfwidthandFullwidthForms")
110             .add("HangulCompatibilityJamo")
111             .add("HangulJamo")
112             .add("HangulJamoExtended-A")
113             .add("HangulJamoExtended-B")
114             .add("HangulSyllables")
115             .add("Hanunoo")
116             .add("Hebrew")
117             .add("HighPrivateUseSurrogates")
118             .add("HighSurrogates")
119             .add("Hiragana")
120             .add("IdeographicDescriptionCharacters")
121             .add("ImperialAramaic")
122             .add("InscriptionalPahlavi")
123             .add("InscriptionalParthian")
124             .add("IPAExtensions")
125             .add("Javanese")
126             .add("Kaithi")
127             .add("KanaSupplement")
128             .add("Kanbun")
129             .add("Kangxi Radicals")
130             .add("Kannada")
131             .add("Katakana")
132             .add("KatakanaPhoneticExtensions")
133             .add("KayahLi")
134             .add("Kharoshthi")
135             .add("Khmer")
136             .add("KhmerSymbols")
137             .add("Lao")
138             .add("Latin-1Supplement")
139             .add("LatinExtended-A")
140             .add("LatinExtendedAdditional")
141             .add("LatinExtended-B")
142             .add("LatinExtended-C")
143             .add("LatinExtended-D")
144             .add("Lepcha")
145             .add("LetterlikeSymbols")
146             .add("Limbu")
147             .add("LinearBIdeograms")
148             .add("LinearBSyllabary")
149             .add("Lisu")
150             .add("LowSurrogates")
151             .add("Lycian")
152             .add("Lydian")
153             .add("MahjongTiles")
154             .add("Malayalam")
155             .add("Mandaic")
156             .add("MathematicalAlphanumericSymbols")
157             .add("MathematicalOperators")
158             .add("MeeteiMayek")
159             .add("MiscellaneousMathematicalSymbols-A")
160             .add("MiscellaneousMathematicalSymbols-B")
161             .add("MiscellaneousSymbols")
162             .add("MiscellaneousSymbolsandArrows")
163             .add("MiscellaneousSymbolsAndPictographs")
164             .add("MiscellaneousTechnical")
165             .add("ModifierToneLetters")
166             .add("Mongolian")
167             .add("MusicalSymbols")
168             .add("Myanmar")
169             .add("MyanmarExtended-A")
170             .add("NewTaiLue")
171             .add("NKo")
172             .add("NumberForms")
173             .add("Ogham")
174             .add("OlChiki")
175             .add("OldItalic")
176             .add("OldPersian")
177             .add("OldSouthArabian")
178             .add("OldTurkic")
179             .add("OpticalCharacterRecognition")
180             .add("Oriya")
181             .add("Osmanya")
182             .add("Phags-pa")
183             .add("PhaistosDisc")
184             .add("Phoenician")
185             .add("PhoneticExtensions")
186             .add("PhoneticExtensionsSupplement")
187             .add("PlayingCards")
188             .add("PrivateUseArea")
189             .add("Rejang")
190             .add("RumiNumeralSymbols")
191             .add("Runic")
192             .add("Samaritan")
193             .add("Saurashtra")
194             .add("Shavian")
195             .add("Sinhala")
196             .add("SmallFormVariants")
197             .add("SpacingModifierLetters")
198             .add("Specials")
199             .add("Sundanese")
200             .add("SuperscriptsandSubscripts")
201             .add("SupplementalArrows-A")
202             .add("SupplementalArrows-B")
203             .add("SupplementalMathematicalOperators")
204             .add("SupplementalPunctuation")
205             .add("SupplementaryPrivateUseArea-A")
206             .add("SupplementaryPrivateUseArea-B")
207             .add("SylotiNagri")
208             .add("Syriac")
209             .add("Tagalog")
210             .add("Tagbanwa")
211             .add("Tags")
212             .add("TaiLe")
213             .add("TaiTham")
214             .add("TaiViet")
215             .add("TaiXuanJingSymbols")
216             .add("Tamil")
217             .add("Telugu")
218             .add("Thaana")
219             .add("Thai")
220             .add("Tibetan")
221             .add("Tifinagh")
222             .add("TransportAndMapSymbols")
223             .add("Ugaritic")
224             .add("UnifiedCanadianAboriginalSyllabics")
225             .add("UnifiedCanadianAboriginalSyllabicsExtended")
226             .add("Vai")
227             .add("VariationSelectors")
228             .add("VariationSelectorsSupplement")
229             .add("VedicExtensions")
230             .add("VerticalForms")
231             .add("YiRadicals")
232             .add("YiSyllables")
233             .add("YijingHexagramSymbols").build();
234
235     private static final int UNICODE_SCRIPT_FIX_COUNTER = 30;
236
237     private RegexUtils() {
238         throw new UnsupportedOperationException("Utility class should not be instantiated.");
239     }
240
241     /**
242      * Converts XSD regex to Java-compatible regex.
243      *
244      * @param xsdRegex XSD regex pattern as it is defined in a YANG source
245      * @return Java-compatible regex
246      */
247     public static String getJavaRegexFromXSD(final String xsdRegex) {
248         // Note: we are using a non-capturing group to deal with internal structure issues, like branches and similar.
249         return "^(?:" + fixUnicodeScriptPattern(escapeChars(xsdRegex)) + ")$";
250     }
251
252     /*
253      * As both '^' and '$' are special anchor characters in java regular
254      * expressions which are implicitly present in XSD regular expressions,
255      * we need to escape them in case they are not defined as part of
256      * character ranges i.e. inside regular square brackets.
257      */
258     private static String escapeChars(final String regex) {
259         final StringBuilder result = new StringBuilder(regex.length());
260         int bracket = 0;
261         boolean escape = false;
262         for (int i = 0; i < regex.length(); i++) {
263             final char ch = regex.charAt(i);
264             switch (ch) {
265                 case '[':
266                     if (!escape) {
267                         bracket++;
268                     }
269                     escape = false;
270                     result.append(ch);
271                     break;
272                 case ']':
273                     if (!escape) {
274                         bracket--;
275                     }
276                     escape = false;
277                     result.append(ch);
278                     break;
279                 case '\\':
280                     escape = !escape;
281                     result.append(ch);
282                     break;
283                 case '^':
284                 case '$':
285                     if (bracket == 0) {
286                         result.append('\\');
287                     }
288                     escape = false;
289                     result.append(ch);
290                     break;
291                 default:
292                     escape = false;
293                     result.append(ch);
294             }
295         }
296         return result.toString();
297     }
298
299     private static String fixUnicodeScriptPattern(String rawPattern) {
300         for (int i = 0; i < UNICODE_SCRIPT_FIX_COUNTER; i++) {
301             try {
302                 Pattern.compile(rawPattern);
303                 return rawPattern;
304             } catch (final PatternSyntaxException ex) {
305                 LOG.debug("Invalid regex pattern syntax in: {}", rawPattern, ex);
306                 final String msg = ex.getMessage();
307                 if (msg.startsWith("Unknown character script name")
308                         || msg.startsWith("Unknown character property name")) {
309                     rawPattern = fixUnknownScripts(msg, rawPattern);
310                 } else {
311                     return rawPattern;
312                 }
313             }
314         }
315
316         LOG.warn("Regex pattern could not be fixed: {}", rawPattern);
317         return rawPattern;
318     }
319
320     private static String fixUnknownScripts(final String exMessage, final String rawPattern) {
321         StringBuilder result = new StringBuilder(rawPattern);
322         final Matcher matcher = BETWEEN_CURLY_BRACES_PATTERN.matcher(exMessage);
323         if (matcher.find()) {
324             String capturedGroup = matcher.group(1);
325             if (capturedGroup.startsWith("In/Is")) {
326                 // Java 9 changed the reporting string
327                 capturedGroup = capturedGroup.substring(5);
328             }
329
330             if (JAVA_UNICODE_BLOCKS.contains(capturedGroup)) {
331                 final int idx = rawPattern.indexOf("Is" + capturedGroup);
332                 result = result.replace(idx, idx + 2, "In");
333             }
334         }
335         return result.toString();
336     }
337 }