Rework YANG lexer/parser

author Robert Varga <robert.varga@pantheon.tech>

Thu, 13 Aug 2020 18:15:06 +0000 (20:15 +0200)

committer Robert Varga <robert.varga@pantheon.tech>

Fri, 14 Aug 2020 16:24:45 +0000 (18:24 +0200)
author Robert Varga <robert.varga@pantheon.tech>
Thu, 13 Aug 2020 18:15:06 +0000 (20:15 +0200)
committer Robert Varga <robert.varga@pantheon.tech>
Fri, 14 Aug 2020 16:24:45 +0000 (18:24 +0200)
diff --git a/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementLexer.g4 b/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementLexer.g4

index 3788c9cf928ff51c5863d8e6a8bf23c7458c80e3..fa44dd7d0e50a84e8b472691427f27126f13f3e0 100644 (file)
--- a/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementLexer.g4
+++ b/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementLexer.g4
@@ -14,7 +14,12 @@ tokens {
      SEP,
      IDENTIFIER,
      COLON,
-    PLUS
+    PLUS,
+    SLASH,
+    STAR,
+    DQUOT_STRING,
+    SQUOT_STRING,
+    UQUOT_STRING
  }
  
  SEMICOLON : ';' -> type(SEMICOLON);
@@ -23,16 +28,81 @@ RIGHT_BRACE : '}' -> type(RIGHT_BRACE);
  COLON : ':' -> type(COLON);
  PLUS : '+' -> type(PLUS);
  
-LINE_COMMENT : [ \n\r\t]* ('//' (~[\r\n]*)) [ \n\r\t]* -> skip;
+// RFC6020 section 6.1.1:
+//   Comments are C++ style.  A single line comment starts with "//" and
+//   ends at the end of the line.  A block comment is enclosed within "/*"
+//   and "*/".
+//
+// RFC7950 section 6.1.1:
+//   Comments are C++ style.  A single line comment starts with "//" and
+//   ends at the end of the line.  A block comment starts with "/*" and
+//   ends with the nearest following "*/".
+//
+//   Note that inside a quoted string (Section 6.1.3), these character
+//   pairs are never interpreted as the start or end of a comment.
+//
+// What constitutes 'end of the line' is not specified in RFC7950, hence
+// we are using RFC7950-clarified definition. Note we also need to handle
+// the case of EOF, as the user may not have included a newline.
+LINE_COMMENT : '//' .*? '\r'? ('\n' | EOF) -> skip;
  BLOCK_COMMENT : '/*' .*? '*/' -> skip;
  
  SEP: [ \n\r\t]+ -> type(SEP);
-IDENTIFIER : [a-zA-Z_/][a-zA-Z0-9_\-.:/]* -> type(IDENTIFIER);
  
-fragment SUB_STRING : ('"' (ESC | ~["])*? '"') | ('\'' (ESC | ~['])* '\'');
-fragment ESC : '\\' (["\\/bfnrt] | UNICODE);
-fragment UNICODE : 'u' HEX HEX HEX HEX;
-fragment HEX : [0-9a-fA-F] ;
+// Special-cased identifier string
+IDENTIFIER : [a-zA-Z_][a-zA-Z0-9_\-.]* -> type(IDENTIFIER);
+
+// RFC6020 section 6.1.3:
+//   If a string contains any space or tab characters, a semicolon (";"),
+//   braces ("{" or "}"), or comment sequences ("//", "/*", or "*/"), then
+//   it MUST be enclosed within double or single quotes.
+//
+// RFC7950 section 6.1.3:
+//   An unquoted string is any sequence of characters that does not
+//   contain any space, tab, carriage return, or line feed characters, a
+//   single or double quote character, a semicolon (";"), braces ("{" or
+//   "}"), or comment sequences ("//", "/*", or "*/").
+//
+// Since we need tokenization to work in both worlds, we are taking only
+// RFC6020 with CR/LF clarifications -- and allow quotes to appear in the body
+// of a string. We additionally exclude COLON, so as to prefer IDENTIFIER
+// tokenization -- which allows us to make keyword work properly.
+//
+// Furthermore we need to exclude PLUS so that concatenation works as expected
+// when + is not separated by whitespace -- even RFC7950 is far from being
+// well-specified in this regard.
+//
+// The most problematic here is the comment sequence exclusion, as we cannot
+// just exclude it from productions. We therefore provide single-char
+// tokenizations of both '*' and '/', and deal with them separately in the
+// parser.
+SLASH : '/' -> type(SLASH);
+STAR : '*' -> type(STAR);
+UQUOT_STRING :
+    // Any eager span that does not start with single/double quote and does not
+    // have slash/star.
+    ~([ \n\r\t] | [;{}:+] | [/*] | ['"])
+    ~([ \n\r\t] | [;{}:+] | [/*])*
+    -> type(UQUOT_STRING);
+
+// Double/single-quoted strings. We deal with these using specialized modes.
+DQUOT_START : '"' -> pushMode(DQUOT_STRING_MODE);
+SQUOT_START : '\'' -> pushMode(SQUOT_STRING_MODE);
  
-STRING: ((~( '\r' | '\n' | '\t' | ' ' | ';' | '{' | '"' | '\'' | '}' | '/' | '+')~( '\r' | '\n' | '\t' | ' ' | ';' | '{' | '}' )* ) | SUB_STRING );
+//
+// Double-quoted string lexing mode. We do not need to recognize all possible
+// escapes here -- just enough not to get confused by runs of backslashes and
+// recognize escaped double quotes.
+//
+mode DQUOT_STRING_MODE;
+DQUOT_STRING : (~["\\] | ('\\' .))+ -> type(DQUOT_STRING);
+DQUOT_END : '"' -> popMode;
+
+//
+// Single-quoted string lexing mode. We do not interpret anything within single
+// quotes.
+//
+mode SQUOT_STRING_MODE;
+SQUOT_STRING : ~[']+ -> type(SQUOT_STRING);
+SQUOT_END : '\'' -> popMode;
  
diff --git a/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementParser.g4 b/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementParser.g4

index 1d7554bf781b553beeff994d68a8d941788d8f84..cbfeee38c7b5211477ff370d074508732fb5f841 100644 (file)
--- a/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementParser.g4
+++ b/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementParser.g4
@@ -17,4 +17,28 @@ file : SEP* statement SEP* EOF;
  statement : keyword (SEP+ argument)? SEP* (SEMICOLON | LEFT_BRACE SEP* (statement SEP*)* RIGHT_BRACE);
  keyword : IDENTIFIER (COLON IDENTIFIER)?;
  
-argument : STRING (SEP* PLUS SEP* STRING)* | IDENTIFIER;
+// Alright, so what constitutes a string is rather funky. We need to deal with
+// the flaky definitions of RFC6020, which allow for insane quoting as well as
+// exclusion of comments. We also need to allow for stitching back tokens like
+// PLUS/COLON, which may end up being valid identifiers. Finally we need to allow
+// IDENTIFIER to be concatenated back to a string
+argument : unquotedString | quotedString (SEP* PLUS SEP* quotedString)*;
+
+quotedString :
+    DQUOT_START DQUOT_STRING? DQUOT_END
+    |
+    SQUOT_START SQUOT_STRING? SQUOT_END
+    ;
+
+unquotedString : SLASH | STAR+ | (SLASH? | STAR*) stringPart+ (SLASH? | STAR*);
+
+// A string which is guaranteed to not have slash/star in either start or end
+// and can thus be concatenated without allowing '/*', '//' and '*/' to appear.
+stringPart:
+    (IDENTIFIER | COLON | PLUS | UQUOT_STRING)+
+    |
+    stringPart SLASH stringPart
+    |
+    stringPart STAR+ stringPart
+    ;
+
diff --git a/yang/yang-parser-rfc7950/src/main/java/org/opendaylight/yangtools/yang/parser/rfc7950/repo/ArgumentContextUtils.java b/yang/yang-parser-rfc7950/src/main/java/org/opendaylight/yangtools/yang/parser/rfc7950/repo/ArgumentContextUtils.java

index ea51ac5149b14f58d3924034491a3331d9d3822c..8164c64dd07d378f116be990c01ad5cd00f20eeb 100644 (file)
--- a/yang/yang-parser-rfc7950/src/main/java/org/opendaylight/yangtools/yang/parser/rfc7950/repo/ArgumentContextUtils.java
+++ b/yang/yang-parser-rfc7950/src/main/java/org/opendaylight/yangtools/yang/parser/rfc7950/repo/ArgumentContextUtils.java
@@ -12,12 +12,15 @@ import static com.google.common.base.Verify.verify;
  import com.google.common.annotations.VisibleForTesting;
  import com.google.common.base.CharMatcher;
  import com.google.common.base.VerifyException;
+import org.antlr.v4.runtime.Token;
  import org.antlr.v4.runtime.tree.ParseTree;
  import org.antlr.v4.runtime.tree.TerminalNode;
  import org.eclipse.jdt.annotation.NonNull;
  import org.opendaylight.yangtools.yang.common.YangVersion;
  import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser;
  import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser.ArgumentContext;
+import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser.QuotedStringContext;
+import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser.UnquotedStringContext;
  import org.opendaylight.yangtools.yang.parser.spi.source.SourceException;
  import org.opendaylight.yangtools.yang.parser.spi.source.StatementSourceReference;
  
@@ -111,18 +114,41 @@ abstract class ArgumentContextUtils {
      final @NonNull String stringFromStringContext(final ArgumentContext context, final StatementSourceReference ref) {
          // Get first child, which we fully expect to exist and be a lexer token
          final ParseTree firstChild = context.getChild(0);
-        verify(firstChild instanceof TerminalNode, "Unexpected shape of %s", context);
-        final TerminalNode firstNode = (TerminalNode) firstChild;
-        final int firstType = firstNode.getSymbol().getType();
-        switch (firstType) {
-            case YangStatementParser.IDENTIFIER:
-                // Simple case, there is a simple string, which cannot contain anything that we would need to process.
-                return firstNode.getText();
-            case YangStatementParser.STRING:
-                // Complex case, defer to a separate method
-                return concatStrings(context, ref);
+        if (firstChild instanceof UnquotedStringContext) {
+            // Simple case, just grab the text, as ANTLR has done all the heavy lifting
+            final String str = firstChild.getText();
+            checkUnquoted(str, ref);
+            return str;
+        }
+
+        verify(firstChild instanceof QuotedStringContext, "Unexpected shape of %s", context);
+        if (context.getChildCount() == 1) {
+            // No concatenation needed, special-case
+            return unquoteString((QuotedStringContext) firstChild, ref);
+        }
+
+        // Potentially-complex case of string quoting, escaping and concatenation.
+        return concatStrings(context, ref);
+    }
+
+    private String unquoteString(final QuotedStringContext context, final StatementSourceReference ref) {
+        final ParseTree secondChild = context.getChild(1);
+        verify(secondChild instanceof TerminalNode, "Unexpected shape of %s", context);
+        final Token secondToken = ((TerminalNode) secondChild).getSymbol();
+        final int type = secondToken.getType();
+        switch (type) {
+            case YangStatementParser.DQUOT_END:
+            case YangStatementParser.SQUOT_END:
+                // We are missing actual body, hence this is an empty string
+                return "";
+            case YangStatementParser.SQUOT_STRING:
+                return secondChild.getText();
+            case YangStatementParser.DQUOT_STRING:
+                // We should be looking at the first token, which is DQUOT_START, but since it is a single-character
+                // token, let's not bother.
+                return normalizeDoubleQuoted(secondChild.getText(), secondToken.getCharPositionInLine() - 1, ref);
              default:
-                throw new VerifyException("Unexpected first symbol in " + context);
+                throw new VerifyException("Unhandled token type " + type);
          }
      }
  
@@ -130,56 +156,31 @@ abstract class ArgumentContextUtils {
          /*
           * We have multiple fragments. Just search the tree. This code is equivalent to
           *
-         *    context.STRING().forEach(stringNode -> appendString(sb, stringNode, ref))
+         *    context.quotedString().forEach(stringNode -> sb.append(unquoteString(stringNode, ref))
           *
           * except we minimize allocations which that would do.
           */
          final StringBuilder sb = new StringBuilder();
          for (ParseTree child : context.children) {
-            verify(child instanceof TerminalNode, "Unexpected fragment component %s", child);
-            final TerminalNode childNode = (TerminalNode) child;
-            switch (childNode.getSymbol().getType()) {
-                case YangStatementParser.SEP:
-                    // Ignore whitespace
-                    break;
-                case YangStatementParser.PLUS:
-                    // Operator, which we are handling by concat
-                    break;
-                case YangStatementParser.STRING:
-                    // a lexer string, could be pretty much anything
-                    // TODO: appendString() is a dispatch based on quotes, which we should be able to defer to lexer for
-                    //       a dedicated type. That would expand the switch table here, but since we have it anyway, it
-                    //       would be nice to have the quoting distinction already taken care of. The performance
-                    //       difference will need to be benchmarked, though.
-                    appendString(sb, childNode, ref);
-                    break;
-                default:
-                    throw new VerifyException("Unexpected symbol in " + childNode);
+            if (child instanceof TerminalNode) {
+                final TerminalNode childNode = (TerminalNode) child;
+                switch (childNode.getSymbol().getType()) {
+                    case YangStatementParser.SEP:
+                    case YangStatementParser.PLUS:
+                        // Operator, which we are handling by concat
+                        break;
+                    default:
+                        throw new VerifyException("Unexpected symbol in " + childNode);
+                }
+            } else {
+                verify(child instanceof QuotedStringContext, "Unexpected fragment component %s", child);
+                sb.append(unquoteString((QuotedStringContext) child, ref));
+                continue;
              }
          }
          return sb.toString();
      }
  
-    private void appendString(final StringBuilder sb, final TerminalNode stringNode,
-            final StatementSourceReference ref) {
-        final String str = stringNode.getText();
-        final char firstChar = str.charAt(0);
-        final char lastChar = str.charAt(str.length() - 1);
-        if (firstChar == '"' && lastChar == '"') {
-            sb.append(normalizeDoubleQuoted(str.substring(1, str.length() - 1),
-                stringNode.getSymbol().getCharPositionInLine(), ref));
-        } else if (firstChar == '\'' && lastChar == '\'') {
-            /*
-             * According to RFC6020 a single quote character cannot occur in a single-quoted string, even when preceded
-             * by a backslash.
-             */
-            sb.append(str, 1, str.length() - 1);
-        } else {
-            checkUnquoted(str, ref);
-            sb.append(str);
-        }
-    }
-
      private String normalizeDoubleQuoted(final String str, final int dquot, final StatementSourceReference ref) {
          // Whitespace normalization happens irrespective of further handling and has no effect on the result
          final String stripped = trimWhitespace(str, dquot);
diff --git a/yang/yang-parser-rfc7950/src/test/java/org/opendaylight/yangtools/yang/stmt/YT1089Test.java b/yang/yang-parser-rfc7950/src/test/java/org/opendaylight/yangtools/yang/stmt/YT1089Test.java

new file mode 100644 (file)

index 0000000..2b2a752
--- /dev/null
+++ b/yang/yang-parser-rfc7950/src/test/java/org/opendaylight/yangtools/yang/stmt/YT1089Test.java
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2020 PANTHEON.tech, s.r.o. and others.  All rights reserved.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License v1.0 which accompanies this distribution,
+ * and is available at http://www.eclipse.org/legal/epl-v10.html
+ */
+package org.opendaylight.yangtools.yang.stmt;
+
+import static org.hamcrest.CoreMatchers.instanceOf;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import com.google.common.collect.Iterables;
+import java.util.Iterator;
+import org.junit.Test;
+import org.opendaylight.yangtools.yang.common.QName;
+import org.opendaylight.yangtools.yang.model.api.SchemaContext;
+import org.opendaylight.yangtools.yang.model.api.meta.EffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.ContactEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.DescriptionEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.LeafEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.ModuleEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.NamespaceEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.OrganizationEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.PrefixEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.TypeEffectiveStatement;
+
+public class YT1089Test {
+    @Test
+    public void testPlusLexing() throws Exception {
+        final SchemaContext ctx = StmtTestUtils.parseYangSource("/bugs/YT1089/foo.yang");
+        assertEquals(1, ctx.getModules().size());
+
+        final Iterator<? extends EffectiveStatement<?, ?>> it =
+                ((ModuleEffectiveStatement)Iterables.getOnlyElement(ctx.getModules()))
+                    .effectiveSubstatements().iterator();
+
+        assertThat(it.next(), instanceOf(NamespaceEffectiveStatement.class));
+        assertThat(it.next(), instanceOf(PrefixEffectiveStatement.class));
+
+        EffectiveStatement<?, ?> stmt = it.next();
+        assertThat(stmt, instanceOf(DescriptionEffectiveStatement.class));
+        assertEquals("+something", stmt.argument());
+
+        stmt = it.next();
+        assertThat(stmt, instanceOf(ContactEffectiveStatement.class));
+        assertEquals("contact++", stmt.argument());
+
+        stmt = it.next();
+        assertThat(stmt, instanceOf(OrganizationEffectiveStatement.class));
+        assertEquals("organiza++tion", stmt.argument());
+
+        assertFoo(it.next());
+        assertBar(it.next());
+        assertBaz(it.next());
+        assertXyzzy(it.next());
+        assertFalse(it.hasNext());
+    }
+
+    private static void assertFoo(final EffectiveStatement<?, ?> stmt) {
+        assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+        assertEquals(QName.create("urn:foo", "foo"), stmt.argument());
+
+        final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+        assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+        assertEquals("+", it.next().argument());
+        assertEquals("squotdquot", it.next().argument());
+        assertEquals("++", it.next().argument());
+        assertFalse(it.hasNext());
+    }
+
+    private static void assertBar(final EffectiveStatement<?, ?> stmt) {
+        assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+        assertEquals(QName.create("urn:foo", "bar"), stmt.argument());
+        final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+        assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+        assertEquals("++", it.next().argument());
+        assertEquals("+ + ++", it.next().argument());
+        assertEquals("++ + +", it.next().argument());
+        assertFalse(it.hasNext());
+    }
+
+    private static void assertBaz(final EffectiveStatement<?, ?> stmt) {
+        assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+        assertEquals(QName.create("urn:foo", "baz"), stmt.argument());
+        final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+        assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+        assertEquals("/", it.next().argument());
+        assertEquals(":", it.next().argument());
+        assertEquals("*", it.next().argument());
+        assertFalse(it.hasNext());
+    }
+
+    private static void assertXyzzy(final EffectiveStatement<?, ?> stmt) {
+        assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+        assertEquals(QName.create("urn:foo", "xyzzy"), stmt.argument());
+        final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+        assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+        assertEquals("a weird concat", it.next().argument());
+        assertEquals("another weird concat", it.next().argument());
+        assertFalse(it.hasNext());
+    }
+}
diff --git a/yang/yang-parser-rfc7950/src/test/resources/bugs/YT1089/foo.yang b/yang/yang-parser-rfc7950/src/test/resources/bugs/YT1089/foo.yang

new file mode 100644 (file)

index 0000000..91bd884
--- /dev/null
+++ b/yang/yang-parser-rfc7950/src/test/resources/bugs/YT1089/foo.yang
@@ -0,0 +1,35 @@
+module foo {
+  namespace urn:foo;
+  prefix foo;
+
+  description +something;
+  contact contact++;
+  organization organiza++tion;
+
+  leaf foo {
+    type string;
+    default +;
+    description 'squot' + "dquot";
+    reference "+" + '+';
+  }
+
+  leaf bar {
+    type string;
+    default ++;
+    description "+ + ++";
+    reference "++ + +";
+  }
+
+  leaf baz {
+    type string;
+    default /;
+    description :;
+    reference *;
+  }
+
+  leaf xyzzy {
+    type string;
+    description "a weird" +" concat";
+    reference "another"+ " weird concat";
+  }
+}
author	Robert Varga <robert.varga@pantheon.tech>
	Thu, 13 Aug 2020 18:15:06 +0000 (20:15 +0200)
committer	Robert Varga <robert.varga@pantheon.tech>
	Fri, 14 Aug 2020 16:24:45 +0000 (18:24 +0200)
yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementLexer.g4		patch \| blob \| history
yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementParser.g4		patch \| blob \| history
yang/yang-parser-rfc7950/src/main/java/org/opendaylight/yangtools/yang/parser/rfc7950/repo/ArgumentContextUtils.java		patch \| blob \| history
yang/yang-parser-rfc7950/src/test/java/org/opendaylight/yangtools/yang/stmt/YT1089Test.java	[new file with mode: 0644]	patch \| blob
yang/yang-parser-rfc7950/src/test/resources/bugs/YT1089/foo.yang	[new file with mode: 0644]	patch \| blob