Bug 4079: Unable to compile pattern defined in module 88/34388/2
author Igor Foltin <ifoltin@cisco.com>
Wed, 10 Feb 2016 11:09:12 +0000 (12:09 +0100)
committer Gerrit Code Review <gerrit@opendaylight.org>
Wed, 10 Feb 2016 12:42:37 +0000 (12:42 +0000)
          when using Unicode blocks

The original patch for this bug fixed the problem
only in the old yang parser.

This patch fixes the issue in the new yang parser.

Change-Id: I07c7ad8d00ab0f8f39888afd1468054e74574cb4
Signed-off-by: Igor Foltin <ifoltin@cisco.com>
(cherry picked from commit ee7500aa1604b3c654511d9f41de8281957101a8)

yang/yang-parser-impl/src/main/java/org/opendaylight/yangtools/yang/parser/stmt/rfc6020/PatternStatementImpl.java
yang/yang-parser-impl/src/main/java/org/opendaylight/yangtools/yang/parser/stmt/rfc6020/Utils.java
yang/yang-parser-impl/src/test/java/org/opendaylight/yangtools/yang/stmt/retest/Bug4079Test.java [new file with mode: 0644]

index c5cf296150411230fec9fbd18c1a00c3c41bcebb..56f562410c1c83d6b338d3ca58487187413b8aa8 100644 (file)
@@ -52,7 +52,7 @@ public class PatternStatementImpl extends AbstractDeclaredStatement<PatternConst
 
         @Override
         public PatternConstraint parseArgumentValue(final StmtContext<?, ?, ?> ctx, final String value) {
-            final String pattern = "^" + value + '$';
+            final String pattern = "^" + Utils.fixUnicodeScriptPattern(value) + '$';
 
             try {
                 Pattern.compile(pattern);
index 3123445b478574bee0f9ac12a10eed0f48218abf..71ae3700a38b3e075a6bbb384b8adffe934de40c 100644 (file)
@@ -14,6 +14,7 @@ import com.google.common.base.Splitter;
 import com.google.common.base.Strings;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableMap.Builder;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterables;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -23,7 +24,9 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
 import javax.annotation.Nullable;
 import javax.xml.xpath.XPath;
 import javax.xml.xpath.XPathExpressionException;
@@ -61,6 +64,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 public final class Utils {
+    private static final int UNICODE_SCRIPT_FIX_COUNTER = 30;
     private static final Logger LOG = LoggerFactory.getLogger(Utils.class);
     private static final CharMatcher DOUBLE_QUOTE_MATCHER = CharMatcher.is('"');
     private static final CharMatcher SINGLE_QUOTE_MATCHER = CharMatcher.is('\'');
@@ -71,6 +75,217 @@ public final class Utils {
     private static final Splitter SLASH_SPLITTER = Splitter.on('/').omitEmptyStrings().trimResults();
     private static final Splitter SPACE_SPLITTER = Splitter.on(' ').omitEmptyStrings().trimResults();
     private static final Pattern PATH_ABS = Pattern.compile("/[^/].*");
+    /** Captures the text between the first pair of curly braces, e.g. {@code Arrows} in {@code {Arrows}}. */
+    private static final Pattern BETWEEN_CURLY_BRACES_PATTERN = Pattern.compile("\\{(.+?)\\}");
+
+    /**
+     * Unicode block names for which an {@code Is} prefix in a pattern is rewritten to {@code In}.
+     * Lookups are exact and case-sensitive: the name reported by the regex compiler has to appear
+     * here character for character.
+     */
+    private static final Set<String> JAVA_UNICODE_BLOCKS = ImmutableSet.<String>builder()
+            .add("AegeanNumbers")
+            .add("AlchemicalSymbols")
+            .add("AlphabeticPresentationForms")
+            .add("AncientGreekMusicalNotation")
+            .add("AncientGreekNumbers")
+            .add("AncientSymbols")
+            .add("Arabic")
+            .add("ArabicPresentationForms-A")
+            .add("ArabicPresentationForms-B")
+            .add("ArabicSupplement")
+            .add("Armenian")
+            .add("Arrows")
+            .add("Avestan")
+            .add("Balinese")
+            .add("Bamum")
+            .add("BamumSupplement")
+            .add("BasicLatin")
+            .add("Batak")
+            .add("Bengali")
+            .add("BlockElements")
+            .add("Bopomofo")
+            .add("BopomofoExtended")
+            .add("BoxDrawing")
+            .add("Brahmi")
+            .add("BraillePatterns")
+            .add("Buginese")
+            .add("Buhid")
+            .add("ByzantineMusicalSymbols")
+            .add("Carian")
+            .add("Cham")
+            .add("Cherokee")
+            .add("CJKCompatibility")
+            .add("CJKCompatibilityForms")
+            .add("CJKCompatibilityIdeographs")
+            .add("CJKCompatibilityIdeographsSupplement")
+            .add("CJKRadicalsSupplement")
+            .add("CJKStrokes")
+            .add("CJKSymbolsandPunctuation")
+            .add("CJKUnifiedIdeographs")
+            .add("CJKUnifiedIdeographsExtensionA")
+            .add("CJKUnifiedIdeographsExtensionB")
+            .add("CJKUnifiedIdeographsExtensionC")
+            .add("CJKUnifiedIdeographsExtensionD")
+            .add("CombiningDiacriticalMarks")
+            .add("CombiningDiacriticalMarksSupplement")
+            .add("CombiningHalfMarks")
+            .add("CombiningDiacriticalMarksforSymbols")
+            .add("CommonIndicNumberForms")
+            .add("ControlPictures")
+            .add("Coptic")
+            .add("CountingRodNumerals")
+            .add("Cuneiform")
+            .add("CuneiformNumbersandPunctuation")
+            .add("CurrencySymbols")
+            .add("CypriotSyllabary")
+            .add("Cyrillic")
+            .add("CyrillicExtended-A")
+            .add("CyrillicExtended-B")
+            .add("CyrillicSupplementary")
+            .add("Deseret")
+            .add("Devanagari")
+            .add("DevanagariExtended")
+            .add("Dingbats")
+            .add("DominoTiles")
+            .add("EgyptianHieroglyphs")
+            .add("Emoticons")
+            .add("EnclosedAlphanumericSupplement")
+            .add("EnclosedAlphanumerics")
+            .add("EnclosedCJKLettersandMonths")
+            .add("EnclosedIdeographicSupplement")
+            .add("Ethiopic")
+            .add("EthiopicExtended")
+            .add("EthiopicExtended-A")
+            .add("EthiopicSupplement")
+            .add("GeneralPunctuation")
+            .add("GeometricShapes")
+            .add("Georgian")
+            .add("GeorgianSupplement")
+            .add("Glagolitic")
+            .add("Gothic")
+            .add("GreekandCoptic")
+            .add("GreekExtended")
+            .add("Gujarati")
+            .add("Gurmukhi")
+            .add("HalfwidthandFullwidthForms")
+            .add("HangulCompatibilityJamo")
+            .add("HangulJamo")
+            .add("HangulJamoExtended-A")
+            .add("HangulJamoExtended-B")
+            .add("HangulSyllables")
+            .add("Hanunoo")
+            .add("Hebrew")
+            .add("HighPrivateUseSurrogates")
+            .add("HighSurrogates")
+            .add("Hiragana")
+            .add("IdeographicDescriptionCharacters")
+            .add("ImperialAramaic")
+            .add("InscriptionalPahlavi")
+            .add("InscriptionalParthian")
+            .add("IPAExtensions")
+            .add("Javanese")
+            .add("Kaithi")
+            .add("KanaSupplement")
+            .add("Kanbun")
+            // Every other entry is written without spaces, so a pattern using \p{IsKangxiRadicals}
+            // reports the name "KangxiRadicals"; the space-free spelling is required for it to be fixed.
+            .add("KangxiRadicals")
+            // Original spelling, kept so that patterns which were rewritten before still are.
+            .add("Kangxi Radicals")
+            .add("Kannada")
+            .add("Katakana")
+            .add("KatakanaPhoneticExtensions")
+            .add("KayahLi")
+            .add("Kharoshthi")
+            .add("Khmer")
+            .add("KhmerSymbols")
+            .add("Lao")
+            .add("Latin-1Supplement")
+            .add("LatinExtended-A")
+            .add("LatinExtendedAdditional")
+            .add("LatinExtended-B")
+            .add("LatinExtended-C")
+            .add("LatinExtended-D")
+            .add("Lepcha")
+            .add("LetterlikeSymbols")
+            .add("Limbu")
+            .add("LinearBIdeograms")
+            .add("LinearBSyllabary")
+            .add("Lisu")
+            .add("LowSurrogates")
+            .add("Lycian")
+            .add("Lydian")
+            .add("MahjongTiles")
+            .add("Malayalam")
+            .add("Mandaic")
+            .add("MathematicalAlphanumericSymbols")
+            .add("MathematicalOperators")
+            .add("MeeteiMayek")
+            .add("MiscellaneousMathematicalSymbols-A")
+            .add("MiscellaneousMathematicalSymbols-B")
+            .add("MiscellaneousSymbols")
+            .add("MiscellaneousSymbolsandArrows")
+            .add("MiscellaneousSymbolsAndPictographs")
+            .add("MiscellaneousTechnical")
+            .add("ModifierToneLetters")
+            .add("Mongolian")
+            .add("MusicalSymbols")
+            .add("Myanmar")
+            .add("MyanmarExtended-A")
+            .add("NewTaiLue")
+            .add("NKo")
+            .add("NumberForms")
+            .add("Ogham")
+            .add("OlChiki")
+            .add("OldItalic")
+            .add("OldPersian")
+            .add("OldSouthArabian")
+            .add("OldTurkic")
+            .add("OpticalCharacterRecognition")
+            .add("Oriya")
+            .add("Osmanya")
+            .add("Phags-pa")
+            .add("PhaistosDisc")
+            .add("Phoenician")
+            .add("PhoneticExtensions")
+            .add("PhoneticExtensionsSupplement")
+            .add("PlayingCards")
+            .add("PrivateUseArea")
+            .add("Rejang")
+            .add("RumiNumeralSymbols")
+            .add("Runic")
+            .add("Samaritan")
+            .add("Saurashtra")
+            .add("Shavian")
+            .add("Sinhala")
+            .add("SmallFormVariants")
+            .add("SpacingModifierLetters")
+            .add("Specials")
+            .add("Sundanese")
+            .add("SuperscriptsandSubscripts")
+            .add("SupplementalArrows-A")
+            .add("SupplementalArrows-B")
+            .add("SupplementalMathematicalOperators")
+            .add("SupplementalPunctuation")
+            .add("SupplementaryPrivateUseArea-A")
+            .add("SupplementaryPrivateUseArea-B")
+            .add("SylotiNagri")
+            .add("Syriac")
+            .add("Tagalog")
+            .add("Tagbanwa")
+            .add("Tags")
+            .add("TaiLe")
+            .add("TaiTham")
+            .add("TaiViet")
+            .add("TaiXuanJingSymbols")
+            .add("Tamil")
+            .add("Telugu")
+            .add("Thaana")
+            .add("Thai")
+            .add("Tibetan")
+            .add("Tifinagh")
+            .add("TransportAndMapSymbols")
+            .add("Ugaritic")
+            .add("UnifiedCanadianAboriginalSyllabics")
+            .add("UnifiedCanadianAboriginalSyllabicsExtended")
+            .add("Vai")
+            .add("VariationSelectors")
+            .add("VariationSelectorsSupplement")
+            .add("VedicExtensions")
+            .add("VerticalForms")
+            .add("YiRadicals")
+            .add("YiSyllables")
+            .add("YijingHexagramSymbols").build();
 
     private static final Map<String, Deviate> KEYWORD_TO_DEVIATE_MAP;
     static {
@@ -377,4 +592,36 @@ public final class Utils {
 
         return string;
     }
+
+    /**
+     * Tries to make a regular expression compilable by {@link Pattern} when it fails with an
+     * "Unknown character script name" error, by rewriting the offending {@code Is} prefix to
+     * {@code In} (for example {@code \p{IsBasicLatin}} becomes {@code \p{InBasicLatin}}).
+     *
+     * <p>One offending name is repaired per pass; at most {@code UNICODE_SCRIPT_FIX_COUNTER}
+     * passes are made. Patterns that fail for any other reason are returned unchanged.
+     *
+     * @param rawPattern regular expression to check
+     * @return the pattern as given if it already compiles or fails with a different error;
+     *         otherwise the pattern with as many prefixes rewritten as could be repaired
+     */
+    public static String fixUnicodeScriptPattern(String rawPattern) {
+        String pattern = rawPattern;
+        for (int i = 0; i < UNICODE_SCRIPT_FIX_COUNTER; i++) {
+            try {
+                Pattern.compile(pattern);
+                return pattern;
+            } catch (PatternSyntaxException ex) {
+                LOG.debug("Invalid regex pattern syntax in: {}", pattern, ex);
+                // getDescription() carries only the error text; getMessage() also embeds the pattern
+                // itself, whose contents could be mistaken for part of the error.
+                final String description = ex.getDescription();
+                if (description == null || !description.contains("Unknown character script name")) {
+                    return pattern;
+                }
+
+                final String fixed = fixUnknownScripts(description, pattern);
+                if (fixed.equals(pattern)) {
+                    // Nothing was rewritten, so another pass would fail in exactly the same way.
+                    break;
+                }
+                pattern = fixed;
+            }
+        }
+
+        LOG.warn("Regex pattern could not be fixed: {}", pattern);
+        return pattern;
+    }
+
+    /**
+     * Rewrites one {@code {Is<name>}} occurrence in {@code rawPattern} to {@code {In<name>}}, where
+     * {@code <name>} is the text between the first pair of curly braces in {@code exMessage}.
+     *
+     * @param exMessage error text of the failed compilation, expected to contain the unknown name
+     *                  in curly braces
+     * @param rawPattern pattern that failed to compile
+     * @return the rewritten pattern, or {@code rawPattern} unchanged when the message contains no
+     *         braced name, the name is not in {@code JAVA_UNICODE_BLOCKS}, or the pattern does not
+     *         contain {@code {Is<name>}}
+     */
+    private static String fixUnknownScripts(final String exMessage, final String rawPattern) {
+        final Matcher matcher = BETWEEN_CURLY_BRACES_PATTERN.matcher(exMessage);
+        if (!matcher.find()) {
+            return rawPattern;
+        }
+
+        final String capturedGroup = matcher.group(1);
+        if (!JAVA_UNICODE_BLOCKS.contains(capturedGroup)) {
+            return rawPattern;
+        }
+
+        // Anchor on the braces so that literal text which merely starts with "Is<name>" is left alone.
+        final int braceIdx = rawPattern.indexOf("{Is" + capturedGroup + "}");
+        if (braceIdx < 0) {
+            // The same error is raised without an "Is" prefix, e.g. for \p{script=<name>};
+            // there is nothing to rewrite, and replacing at a negative index would throw.
+            return rawPattern;
+        }
+
+        final int prefixIdx = braceIdx + 1;
+        return new StringBuilder(rawPattern).replace(prefixIdx, prefixIdx + 2, "In").toString();
+    }
 }
diff --git a/yang/yang-parser-impl/src/test/java/org/opendaylight/yangtools/yang/stmt/retest/Bug4079Test.java b/yang/yang-parser-impl/src/test/java/org/opendaylight/yangtools/yang/stmt/retest/Bug4079Test.java
new file mode 100644 (file)
index 0000000..7793a61
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016 Cisco Systems, Inc. and others.  All rights reserved.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License v1.0 which accompanies this distribution,
+ * and is available at http://www.eclipse.org/legal/epl-v10.html
+ */
+
+package org.opendaylight.yangtools.yang.stmt.retest;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+import org.junit.Test;
+import org.opendaylight.yangtools.yang.parser.stmt.rfc6020.Utils;
+
+/**
+ * Checks that {@link Utils#fixUnicodeScriptPattern(String)} rewrites {@code Is}-prefixed Unicode
+ * block names to the {@code In} prefix and leaves every other pattern untouched.
+ */
+public class Bug4079Test {
+
+    /**
+     * Runs {@code rawPattern} through the fixer, asserts the outcome equals {@code expected}
+     * and hands the outcome back to the caller.
+     */
+    private static String assertFixesTo(final String rawPattern, final String expected) {
+        final String actual = Utils.fixUnicodeScriptPattern(rawPattern);
+        assertEquals(expected, actual);
+        return actual;
+    }
+
+    /** Asserts that {@code rawPattern} is fixed to {@code expected} and that the result compiles. */
+    private static void assertFixedAndCompiles(final String rawPattern, final String expected) {
+        assertNotNull(Pattern.compile(assertFixesTo(rawPattern, expected)));
+    }
+
+    /** Asserts that {@code rawPattern} is returned as-is and compiles. */
+    private static void assertUnchangedAndCompiles(final String rawPattern) {
+        assertFixedAndCompiles(rawPattern, rawPattern);
+    }
+
+    @Test
+    public void testValidPatternFix() {
+        // single property, possessive quantifiers
+        assertFixedAndCompiles("(\\p{IsArrows})*+", "(\\p{InArrows})*+");
+        assertFixedAndCompiles("(\\p{IsDingbats})++", "(\\p{InDingbats})++");
+        assertFixedAndCompiles("(\\p{IsSpecials})?+", "(\\p{InSpecials})?+");
+        assertUnchangedAndCompiles("(\\p{IsBatak}){4}+");
+        assertUnchangedAndCompiles("(\\p{IsLatin}){4,6}+");
+        assertUnchangedAndCompiles("(\\p{IsTibetan}){4,}+");
+
+        // single property, reluctant quantifiers
+        assertUnchangedAndCompiles("(\\p{IsAlphabetic}){4}?");
+        assertUnchangedAndCompiles("(\\p{IsLowercase}){4,6}?");
+        assertUnchangedAndCompiles("(\\p{IsUppercase}){4,}?");
+
+        // alternations of two properties
+        assertFixedAndCompiles("(\\p{IsBasicLatin}|\\p{IsLatin-1Supplement})*",
+                "(\\p{InBasicLatin}|\\p{InLatin-1Supplement})*");
+        assertUnchangedAndCompiles("(\\p{InBasicLatin}|\\p{InLatin-1Supplement})+");
+        assertFixedAndCompiles("(\\p{IsBasicLatin}|\\p{InLatin-1Supplement})?",
+                "(\\p{InBasicLatin}|\\p{InLatin-1Supplement})?");
+        assertFixedAndCompiles("(\\p{InBasicLatin}|\\p{IsLatin-1Supplement}){4}",
+                "(\\p{InBasicLatin}|\\p{InLatin-1Supplement}){4}");
+        assertUnchangedAndCompiles("(\\p{IsLatin}|\\p{IsArmenian}){2,4}");
+        assertFixedAndCompiles("(\\p{IsLatin}|\\p{IsBasicLatin}){2,}", "(\\p{IsLatin}|\\p{InBasicLatin}){2,}");
+        assertFixedAndCompiles("(\\p{IsBasicLatin}|\\p{IsLatin})*?", "(\\p{InBasicLatin}|\\p{IsLatin})*?");
+
+        // alternations of three properties
+        assertFixedAndCompiles("(\\p{IsBasicLatin}|\\p{IsLatin-1Supplement}|\\p{IsArrows})+?",
+                "(\\p{InBasicLatin}|\\p{InLatin-1Supplement}|\\p{InArrows})+?");
+        assertFixedAndCompiles("(\\p{InBasicLatin}|\\p{IsLatin-1Supplement}|\\p{IsLatin})??",
+                "(\\p{InBasicLatin}|\\p{InLatin-1Supplement}|\\p{IsLatin})??");
+
+        // property preceded by one, two and three escaped backslashes
+        assertFixedAndCompiles("(\\\\\\p{IsBasicLatin})*+", "(\\\\\\p{InBasicLatin})*+");
+        assertFixedAndCompiles("(\\\\\\\\\\p{IsBasicLatin})*+", "(\\\\\\\\\\p{InBasicLatin})*+");
+        assertFixedAndCompiles("(\\\\\\\\\\\\\\p{IsBasicLatin})*+", "(\\\\\\\\\\\\\\p{InBasicLatin})*+");
+    }
+
+    @Test(expected = PatternSyntaxException.class)
+    public void testInvalidPattern() throws NoSuchMethodException, InvocationTargetException, IllegalAccessException {
+        final String untouched = "(\\\\p{IsBasicLatin})*+";
+        final String result = assertFixesTo(untouched, untouched);
+        // should throw exception
+        Pattern.compile(result);
+    }
+
+    @Test(expected = PatternSyntaxException.class)
+    public void testInvalidPattern2() throws NoSuchMethodException, InvocationTargetException, IllegalAccessException {
+        final String result = assertFixesTo("(\\p{IsSpecials}|\\\\\\\\p{IsBasicLatin})*+",
+                "(\\p{InSpecials}|\\\\\\\\p{IsBasicLatin})*+");
+        // should throw exception
+        Pattern.compile(result);
+    }
+
+    @Test(expected = PatternSyntaxException.class)
+    public void testInvalidPattern3() throws NoSuchMethodException, InvocationTargetException, IllegalAccessException {
+        final String untouched = "(\\\\\\\\\\\\p{IsBasicLatin}|\\p{IsTags})*+";
+        final String result = assertFixesTo(untouched, untouched);
+        // should throw exception
+        Pattern.compile(result);
+    }
+}