0991750e6ca55ec6f282c582cf8196f762f439c5
[yangtools.git] / yang / yang-model-util / src / main / java / org / opendaylight / yangtools / yang / model / util / RegexUtils.java
1 /*
2  * Copyright (c) 2017 Cisco Systems, Inc. and others.  All rights reserved.
3  *
4  * This program and the accompanying materials are made available under the
5  * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6  * and is available at http://www.eclipse.org/legal/epl-v10.html
7  */
8
9 package org.opendaylight.yangtools.yang.model.util;
10
11 import com.google.common.collect.ImmutableSet;
12 import java.util.Set;
13 import java.util.regex.Matcher;
14 import java.util.regex.Pattern;
15 import java.util.regex.PatternSyntaxException;
16 import org.slf4j.Logger;
17 import org.slf4j.LoggerFactory;
18
19 /**
20  * Utilities for converting YANG XSD regexes into Java-compatible regexes.
21  */
22 public final class RegexUtils {
    // Logger used by fixUnicodeScriptPattern() to report patterns that fail to compile or cannot be fixed.
    private static final Logger LOG = LoggerFactory.getLogger(RegexUtils.class);
    // Reluctantly captures the text between a '{' and the nearest following '}'. fixUnknownScripts() applies
    // it to a PatternSyntaxException message to pull out the name the regex engine complained about.
    private static final Pattern BETWEEN_CURLY_BRACES_PATTERN = Pattern.compile("\\{(.+?)\\}");
25     private static final Set<String> JAVA_UNICODE_BLOCKS = ImmutableSet.<String>builder()
26             .add("AegeanNumbers")
27             .add("AlchemicalSymbols")
28             .add("AlphabeticPresentationForms")
29             .add("AncientGreekMusicalNotation")
30             .add("AncientGreekNumbers")
31             .add("AncientSymbols")
32             .add("Arabic")
33             .add("ArabicPresentationForms-A")
34             .add("ArabicPresentationForms-B")
35             .add("ArabicSupplement")
36             .add("Armenian")
37             .add("Arrows")
38             .add("Avestan")
39             .add("Balinese")
40             .add("Bamum")
41             .add("BamumSupplement")
42             .add("BasicLatin")
43             .add("Batak")
44             .add("Bengali")
45             .add("BlockElements")
46             .add("Bopomofo")
47             .add("BopomofoExtended")
48             .add("BoxDrawing")
49             .add("Brahmi")
50             .add("BraillePatterns")
51             .add("Buginese")
52             .add("Buhid")
53             .add("ByzantineMusicalSymbols")
54             .add("Carian")
55             .add("Cham")
56             .add("Cherokee")
57             .add("CJKCompatibility")
58             .add("CJKCompatibilityForms")
59             .add("CJKCompatibilityIdeographs")
60             .add("CJKCompatibilityIdeographsSupplement")
61             .add("CJKRadicalsSupplement")
62             .add("CJKStrokes")
63             .add("CJKSymbolsandPunctuation")
64             .add("CJKUnifiedIdeographs")
65             .add("CJKUnifiedIdeographsExtensionA")
66             .add("CJKUnifiedIdeographsExtensionB")
67             .add("CJKUnifiedIdeographsExtensionC")
68             .add("CJKUnifiedIdeographsExtensionD")
69             .add("CombiningDiacriticalMarks")
70             .add("CombiningDiacriticalMarksSupplement")
71             .add("CombiningHalfMarks")
72             .add("CombiningDiacriticalMarksforSymbols")
73             .add("CommonIndicNumberForms")
74             .add("ControlPictures")
75             .add("Coptic")
76             .add("CountingRodNumerals")
77             .add("Cuneiform")
78             .add("CuneiformNumbersandPunctuation")
79             .add("CurrencySymbols")
80             .add("CypriotSyllabary")
81             .add("Cyrillic")
82             .add("CyrillicExtended-A")
83             .add("CyrillicExtended-B")
84             .add("CyrillicSupplementary")
85             .add("Deseret")
86             .add("Devanagari")
87             .add("DevanagariExtended")
88             .add("Dingbats")
89             .add("DominoTiles")
90             .add("EgyptianHieroglyphs")
91             .add("Emoticons")
92             .add("EnclosedAlphanumericSupplement")
93             .add("EnclosedAlphanumerics")
94             .add("EnclosedCJKLettersandMonths")
95             .add("EnclosedIdeographicSupplement")
96             .add("Ethiopic")
97             .add("EthiopicExtended")
98             .add("EthiopicExtended-A")
99             .add("EthiopicSupplement")
100             .add("GeneralPunctuation")
101             .add("GeometricShapes")
102             .add("Georgian")
103             .add("GeorgianSupplement")
104             .add("Glagolitic")
105             .add("Gothic")
106             .add("GreekandCoptic")
107             .add("GreekExtended")
108             .add("Gujarati")
109             .add("Gurmukhi")
110             .add("HalfwidthandFullwidthForms")
111             .add("HangulCompatibilityJamo")
112             .add("HangulJamo")
113             .add("HangulJamoExtended-A")
114             .add("HangulJamoExtended-B")
115             .add("HangulSyllables")
116             .add("Hanunoo")
117             .add("Hebrew")
118             .add("HighPrivateUseSurrogates")
119             .add("HighSurrogates")
120             .add("Hiragana")
121             .add("IdeographicDescriptionCharacters")
122             .add("ImperialAramaic")
123             .add("InscriptionalPahlavi")
124             .add("InscriptionalParthian")
125             .add("IPAExtensions")
126             .add("Javanese")
127             .add("Kaithi")
128             .add("KanaSupplement")
129             .add("Kanbun")
130             .add("Kangxi Radicals")
131             .add("Kannada")
132             .add("Katakana")
133             .add("KatakanaPhoneticExtensions")
134             .add("KayahLi")
135             .add("Kharoshthi")
136             .add("Khmer")
137             .add("KhmerSymbols")
138             .add("Lao")
139             .add("Latin-1Supplement")
140             .add("LatinExtended-A")
141             .add("LatinExtendedAdditional")
142             .add("LatinExtended-B")
143             .add("LatinExtended-C")
144             .add("LatinExtended-D")
145             .add("Lepcha")
146             .add("LetterlikeSymbols")
147             .add("Limbu")
148             .add("LinearBIdeograms")
149             .add("LinearBSyllabary")
150             .add("Lisu")
151             .add("LowSurrogates")
152             .add("Lycian")
153             .add("Lydian")
154             .add("MahjongTiles")
155             .add("Malayalam")
156             .add("Mandaic")
157             .add("MathematicalAlphanumericSymbols")
158             .add("MathematicalOperators")
159             .add("MeeteiMayek")
160             .add("MiscellaneousMathematicalSymbols-A")
161             .add("MiscellaneousMathematicalSymbols-B")
162             .add("MiscellaneousSymbols")
163             .add("MiscellaneousSymbolsandArrows")
164             .add("MiscellaneousSymbolsAndPictographs")
165             .add("MiscellaneousTechnical")
166             .add("ModifierToneLetters")
167             .add("Mongolian")
168             .add("MusicalSymbols")
169             .add("Myanmar")
170             .add("MyanmarExtended-A")
171             .add("NewTaiLue")
172             .add("NKo")
173             .add("NumberForms")
174             .add("Ogham")
175             .add("OlChiki")
176             .add("OldItalic")
177             .add("OldPersian")
178             .add("OldSouthArabian")
179             .add("OldTurkic")
180             .add("OpticalCharacterRecognition")
181             .add("Oriya")
182             .add("Osmanya")
183             .add("Phags-pa")
184             .add("PhaistosDisc")
185             .add("Phoenician")
186             .add("PhoneticExtensions")
187             .add("PhoneticExtensionsSupplement")
188             .add("PlayingCards")
189             .add("PrivateUseArea")
190             .add("Rejang")
191             .add("RumiNumeralSymbols")
192             .add("Runic")
193             .add("Samaritan")
194             .add("Saurashtra")
195             .add("Shavian")
196             .add("Sinhala")
197             .add("SmallFormVariants")
198             .add("SpacingModifierLetters")
199             .add("Specials")
200             .add("Sundanese")
201             .add("SuperscriptsandSubscripts")
202             .add("SupplementalArrows-A")
203             .add("SupplementalArrows-B")
204             .add("SupplementalMathematicalOperators")
205             .add("SupplementalPunctuation")
206             .add("SupplementaryPrivateUseArea-A")
207             .add("SupplementaryPrivateUseArea-B")
208             .add("SylotiNagri")
209             .add("Syriac")
210             .add("Tagalog")
211             .add("Tagbanwa")
212             .add("Tags")
213             .add("TaiLe")
214             .add("TaiTham")
215             .add("TaiViet")
216             .add("TaiXuanJingSymbols")
217             .add("Tamil")
218             .add("Telugu")
219             .add("Thaana")
220             .add("Thai")
221             .add("Tibetan")
222             .add("Tifinagh")
223             .add("TransportAndMapSymbols")
224             .add("Ugaritic")
225             .add("UnifiedCanadianAboriginalSyllabics")
226             .add("UnifiedCanadianAboriginalSyllabicsExtended")
227             .add("Vai")
228             .add("VariationSelectors")
229             .add("VariationSelectorsSupplement")
230             .add("VedicExtensions")
231             .add("VerticalForms")
232             .add("YiRadicals")
233             .add("YiSyllables")
234             .add("YijingHexagramSymbols").build();
235
    // Upper bound on the compile-and-fix rounds attempted by fixUnicodeScriptPattern(); each round
    // rewrites at most one name, so this also caps how many names can be fixed in a single pattern.
    private static final int UNICODE_SCRIPT_FIX_COUNTER = 30;
237
238     private RegexUtils() {
239         throw new UnsupportedOperationException("Utility class should not be instantiated.");
240     }
241
242     /**
243      * Converts XSD regex to Java-compatible regex.
244      *
245      * @param xsdRegex XSD regex pattern as it is defined in a YANG source
246      * @return Java-compatible regex
247      */
248     public static String getJavaRegexFromXSD(final String xsdRegex) {
249         return "^" + fixUnicodeScriptPattern(escapeChars(xsdRegex)) + '$';
250     }
251
252     /*
253      * As both '^' and '$' are special anchor characters in java regular
254      * expressions which are implicitly present in XSD regular expressions,
255      * we need to escape them in case they are not defined as part of
256      * character ranges i.e. inside regular square brackets.
257      */
258     private static String escapeChars(final String regex) {
259         final StringBuilder result = new StringBuilder(regex.length());
260         int bracket = 0;
261         boolean escape = false;
262         for (int i = 0; i < regex.length(); i++) {
263             final char ch = regex.charAt(i);
264             switch (ch) {
265                 case '[':
266                     if (!escape) {
267                         bracket++;
268                     }
269                     escape = false;
270                     result.append(ch);
271                     break;
272                 case ']':
273                     if (!escape) {
274                         bracket--;
275                     }
276                     escape = false;
277                     result.append(ch);
278                     break;
279                 case '\\':
280                     escape = !escape;
281                     result.append(ch);
282                     break;
283                 case '^':
284                 case '$':
285                     if (bracket == 0) {
286                         result.append('\\');
287                     }
288                     escape = false;
289                     result.append(ch);
290                     break;
291                 default:
292                     escape = false;
293                     result.append(ch);
294             }
295         }
296         return result.toString();
297     }
298
299     private static String fixUnicodeScriptPattern(String rawPattern) {
300         for (int i = 0; i < UNICODE_SCRIPT_FIX_COUNTER; i++) {
301             try {
302                 Pattern.compile(rawPattern);
303                 return rawPattern;
304             } catch (final PatternSyntaxException ex) {
305                 LOG.debug("Invalid regex pattern syntax in: {}", rawPattern, ex);
306                 if (ex.getMessage().contains("Unknown character script name")) {
307                     rawPattern = fixUnknownScripts(ex.getMessage(), rawPattern);
308                 } else {
309                     return rawPattern;
310                 }
311             }
312         }
313
314         LOG.warn("Regex pattern could not be fixed: {}", rawPattern);
315         return rawPattern;
316     }
317
318     private static String fixUnknownScripts(final String exMessage, final String rawPattern) {
319         StringBuilder result = new StringBuilder(rawPattern);
320         final Matcher matcher = BETWEEN_CURLY_BRACES_PATTERN.matcher(exMessage);
321         if (matcher.find()) {
322             final String capturedGroup = matcher.group(1);
323             if (JAVA_UNICODE_BLOCKS.contains(capturedGroup)) {
324                 final int idx = rawPattern.indexOf("Is" + capturedGroup);
325                 result = result.replace(idx, idx + 2, "In");
326             }
327         }
328         return result.toString();
329     }
330 }