From 58de36f365950a14a17b89f4e3ab8e7902090375 Mon Sep 17 00:00:00 2001
From: Robert Varga <robert.varga@pantheon.tech>
Date: Thu, 13 Aug 2020 20:15:06 +0200
Subject: [PATCH] Rework YANG lexer/parser

The definitions in the parser and lexer are quite arcane and are
actually wrong, not allowing for a number of edge cases, which are
completely valid.

Furthermore the definition of IDENTIFIER is wrong, as it allows /
and : to appear in it -- effectively ruining the tokenization in
parser.

Refactor the lexer to perform correct tokenization in every situation.
This makes it more complicated, but also much more obvious as to
what is going on -- especially with regard as to what decisions
end up being parser's responsibility.

Refactor the parser so it recognizes YANG tokenization constructs,
notably quoted and unquoted strings, and assemble them from lexer
tokens.

JIRA: YANGTOOLS-1089
Change-Id: I34472bf0a7e262d4f633ce271952bbcd7639ef2f
Signed-off-by: Robert Varga <robert.varga@pantheon.tech>
---
 .../yang/parser/antlr/YangStatementLexer.g4   |  86 +++++++++++++--
 .../yang/parser/antlr/YangStatementParser.g4  |  26 ++++-
 .../rfc7950/repo/ArgumentContextUtils.java    | 103 +++++++++---------
 .../yangtools/yang/stmt/YT1089Test.java       | 103 ++++++++++++++++++
 .../src/test/resources/bugs/YT1089/foo.yang   |  35 ++++++
 5 files changed, 293 insertions(+), 60 deletions(-)
 create mode 100644 yang/yang-parser-rfc7950/src/test/java/org/opendaylight/yangtools/yang/stmt/YT1089Test.java
 create mode 100644 yang/yang-parser-rfc7950/src/test/resources/bugs/YT1089/foo.yang

diff --git a/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementLexer.g4 b/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementLexer.g4
index 3788c9cf92..fa44dd7d0e 100644
--- a/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementLexer.g4
+++ b/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementLexer.g4
@@ -14,7 +14,12 @@ tokens {
     SEP,
     IDENTIFIER,
     COLON,
-    PLUS
+    PLUS,
+    SLASH,
+    STAR,
+    DQUOT_STRING,
+    SQUOT_STRING,
+    UQUOT_STRING
 }
 
 SEMICOLON : ';' -> type(SEMICOLON);
@@ -23,16 +28,81 @@ RIGHT_BRACE : '}' -> type(RIGHT_BRACE);
 COLON : ':' -> type(COLON);
 PLUS : '+' -> type(PLUS);
 
-LINE_COMMENT : [ \n\r\t]* ('//' (~[\r\n]*)) [ \n\r\t]* -> skip;
+// RFC6020 section 6.1.1:
+//   Comments are C++ style.  A single line comment starts with "//" and
+//   ends at the end of the line.  A block comment is enclosed within "/*"
+//   and "*/".
+//
+// RFC7950 section 6.1.1:
+//   Comments are C++ style.  A single line comment starts with "//" and
+//   ends at the end of the line.  A block comment starts with "/*" and
+//   ends with the nearest following "*/".
+//
+//   Note that inside a quoted string (Section 6.1.3), these character
+//   pairs are never interpreted as the start or end of a comment.
+//
+// What constitutes 'end of the line' is not specified in RFC7950, hence
+// we are using RFC7950-clarified definition. Note we also need to handle
+// the case of EOF, as the user may not have included a newline.
+LINE_COMMENT : '//' .*? '\r'? ('\n' | EOF) -> skip;
 BLOCK_COMMENT : '/*' .*? '*/' -> skip;
 
 SEP: [ \n\r\t]+ -> type(SEP);
-IDENTIFIER : [a-zA-Z_/][a-zA-Z0-9_\-.:/]* -> type(IDENTIFIER);
 
-fragment SUB_STRING : ('"' (ESC | ~["])*? '"') | ('\'' (ESC | ~['])* '\'');
-fragment ESC : '\\' (["\\/bfnrt] | UNICODE);
-fragment UNICODE : 'u' HEX HEX HEX HEX;
-fragment HEX : [0-9a-fA-F] ;
+// Special-cased identifier string
+IDENTIFIER : [a-zA-Z_][a-zA-Z0-9_\-.]* -> type(IDENTIFIER);
+
+// RFC6020 section 6.1.3:
+//   If a string contains any space or tab characters, a semicolon (";"),
+//   braces ("{" or "}"), or comment sequences ("//", "/*", or "*/"), then
+//   it MUST be enclosed within double or single quotes.
+//
+// RFC7950 section 6.1.3:
+//   An unquoted string is any sequence of characters that does not
+//   contain any space, tab, carriage return, or line feed characters, a
+//   single or double quote character, a semicolon (";"), braces ("{" or
+//   "}"), or comment sequences ("//", "/*", or "*/").
+//
+// Since we need tokenization to work in both worlds, we are taking only
+// RFC6020 with CR/LF clarifications -- and allow quotes to appear in the body
+// of a string. We additionally exclude COLON, so as to prefer IDENTIFIER
+// tokenization -- which allows us to make keyword work properly.
+//
+// Furthermore we need to exclude PLUS so that concatenation works as expected
+// when + is not separated by whitespace -- even RFC7950 is far from being
+// well-specified in this regard.
+//
+// The most problematic here is the comment sequence exclusion, as we cannot
+// just exclude it from productions. We therefore provide single-char
+// tokenizations of both '*' and '/', and deal with them separately in the
+// parser.
+SLASH : '/' -> type(SLASH);
+STAR : '*' -> type(STAR);
+UQUOT_STRING :
+    // Any eager span that does not start with single/double quote and does not
+    // have slash/star.
+    ~([ \n\r\t] | [;{}:+] | [/*] | ['"])
+    ~([ \n\r\t] | [;{}:+] | [/*])*
+    -> type(UQUOT_STRING);
+
+// Double/single-quoted strings. We deal with these using specialized modes.
+DQUOT_START : '"' -> pushMode(DQUOT_STRING_MODE);
+SQUOT_START : '\'' -> pushMode(SQUOT_STRING_MODE);
 
-STRING: ((~( '\r' | '\n' | '\t' | ' ' | ';' | '{' | '"' | '\'' | '}' | '/' | '+')~( '\r' | '\n' | '\t' | ' ' | ';' | '{' | '}' )* ) | SUB_STRING );
+//
+// Double-quoted string lexing mode. We do not need to recognize all possible
+// escapes here -- just enough not to get confused by runs of backslashes and
+// recognize escaped double quotes.
+//
+mode DQUOT_STRING_MODE;
+DQUOT_STRING : (~["\\] | ('\\' .))+ -> type(DQUOT_STRING);
+DQUOT_END : '"' -> popMode;
+
+//
+// Single-quoted string lexing mode. We do not interpret anything within single
+// quotes.
+//
+mode SQUOT_STRING_MODE;
+SQUOT_STRING : ~[']+ -> type(SQUOT_STRING);
+SQUOT_END : '\'' -> popMode;
 
diff --git a/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementParser.g4 b/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementParser.g4
index 1d7554bf78..cbfeee38c7 100644
--- a/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementParser.g4
+++ b/yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementParser.g4
@@ -17,4 +17,28 @@ file : SEP* statement SEP* EOF;
 statement : keyword (SEP+ argument)? SEP* (SEMICOLON | LEFT_BRACE SEP* (statement SEP*)* RIGHT_BRACE);
 keyword : IDENTIFIER (COLON IDENTIFIER)?;
 
-argument : STRING (SEP* PLUS SEP* STRING)* | IDENTIFIER;
+// Alright, so what constitutes a string is rather funky. We need to deal with
+// the flaky definitions of RFC6020, which allow for insane quoting as well as
+// exclusion of comments. We also need to allow for stitching back tokens like
+// PLUS/COLON, which may end up being valid identifiers. Finally we need to allow
+// IDENTIFIER to be concatenated back to a string
+argument : unquotedString | quotedString (SEP* PLUS SEP* quotedString)*;
+
+quotedString :
+    DQUOT_START DQUOT_STRING? DQUOT_END
+    |
+    SQUOT_START SQUOT_STRING? SQUOT_END
+    ;
+
+unquotedString : SLASH | STAR+ | (SLASH? | STAR*) stringPart+ (SLASH? | STAR*);
+
+// A string which is guaranteed to not have slash/star in either start or end
+// and can thus be concatenated without allowing '/*', '//' and '*/' to appear.
+stringPart:
+    (IDENTIFIER | COLON | PLUS | UQUOT_STRING)+
+    |
+    stringPart SLASH stringPart
+    |
+    stringPart STAR+ stringPart
+    ;
+
diff --git a/yang/yang-parser-rfc7950/src/main/java/org/opendaylight/yangtools/yang/parser/rfc7950/repo/ArgumentContextUtils.java b/yang/yang-parser-rfc7950/src/main/java/org/opendaylight/yangtools/yang/parser/rfc7950/repo/ArgumentContextUtils.java
index ea51ac5149..8164c64dd0 100644
--- a/yang/yang-parser-rfc7950/src/main/java/org/opendaylight/yangtools/yang/parser/rfc7950/repo/ArgumentContextUtils.java
+++ b/yang/yang-parser-rfc7950/src/main/java/org/opendaylight/yangtools/yang/parser/rfc7950/repo/ArgumentContextUtils.java
@@ -12,12 +12,15 @@ import static com.google.common.base.Verify.verify;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.CharMatcher;
 import com.google.common.base.VerifyException;
+import org.antlr.v4.runtime.Token;
 import org.antlr.v4.runtime.tree.ParseTree;
 import org.antlr.v4.runtime.tree.TerminalNode;
 import org.eclipse.jdt.annotation.NonNull;
 import org.opendaylight.yangtools.yang.common.YangVersion;
 import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser;
 import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser.ArgumentContext;
+import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser.QuotedStringContext;
+import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser.UnquotedStringContext;
 import org.opendaylight.yangtools.yang.parser.spi.source.SourceException;
 import org.opendaylight.yangtools.yang.parser.spi.source.StatementSourceReference;
 
@@ -111,18 +114,41 @@ abstract class ArgumentContextUtils {
     final @NonNull String stringFromStringContext(final ArgumentContext context, final StatementSourceReference ref) {
         // Get first child, which we fully expect to exist and be a lexer token
         final ParseTree firstChild = context.getChild(0);
-        verify(firstChild instanceof TerminalNode, "Unexpected shape of %s", context);
-        final TerminalNode firstNode = (TerminalNode) firstChild;
-        final int firstType = firstNode.getSymbol().getType();
-        switch (firstType) {
-            case YangStatementParser.IDENTIFIER:
-                // Simple case, there is a simple string, which cannot contain anything that we would need to process.
-                return firstNode.getText();
-            case YangStatementParser.STRING:
-                // Complex case, defer to a separate method
-                return concatStrings(context, ref);
+        if (firstChild instanceof UnquotedStringContext) {
+            // Simple case, just grab the text, as ANTLR has done all the heavy lifting
+            final String str = firstChild.getText();
+            checkUnquoted(str, ref);
+            return str;
+        }
+
+        verify(firstChild instanceof QuotedStringContext, "Unexpected shape of %s", context);
+        if (context.getChildCount() == 1) {
+            // No concatenation needed, special-case
+            return unquoteString((QuotedStringContext) firstChild, ref);
+        }
+
+        // Potentially-complex case of string quoting, escaping and concatenation.
+        return concatStrings(context, ref);
+    }
+
+    private String unquoteString(final QuotedStringContext context, final StatementSourceReference ref) {
+        final ParseTree secondChild = context.getChild(1);
+        verify(secondChild instanceof TerminalNode, "Unexpected shape of %s", context);
+        final Token secondToken = ((TerminalNode) secondChild).getSymbol();
+        final int type = secondToken.getType();
+        switch (type) {
+            case YangStatementParser.DQUOT_END:
+            case YangStatementParser.SQUOT_END:
+                // We are missing actual body, hence this is an empty string
+                return "";
+            case YangStatementParser.SQUOT_STRING:
+                return secondChild.getText();
+            case YangStatementParser.DQUOT_STRING:
+                // We should be looking at the first token, which is DQUOT_START, but since it is a single-character
+                // token, let's not bother.
+                return normalizeDoubleQuoted(secondChild.getText(), secondToken.getCharPositionInLine() - 1, ref);
             default:
-                throw new VerifyException("Unexpected first symbol in " + context);
+                throw new VerifyException("Unhandled token type " + type);
         }
     }
 
@@ -130,56 +156,31 @@ abstract class ArgumentContextUtils {
         /*
          * We have multiple fragments. Just search the tree. This code is equivalent to
          *
-         *    context.STRING().forEach(stringNode -> appendString(sb, stringNode, ref))
+         *    context.quotedString().forEach(stringNode -> sb.append(unquoteString(stringNode, ref))
          *
          * except we minimize allocations which that would do.
          */
         final StringBuilder sb = new StringBuilder();
         for (ParseTree child : context.children) {
-            verify(child instanceof TerminalNode, "Unexpected fragment component %s", child);
-            final TerminalNode childNode = (TerminalNode) child;
-            switch (childNode.getSymbol().getType()) {
-                case YangStatementParser.SEP:
-                    // Ignore whitespace
-                    break;
-                case YangStatementParser.PLUS:
-                    // Operator, which we are handling by concat
-                    break;
-                case YangStatementParser.STRING:
-                    // a lexer string, could be pretty much anything
-                    // TODO: appendString() is a dispatch based on quotes, which we should be able to defer to lexer for
-                    //       a dedicated type. That would expand the switch table here, but since we have it anyway, it
-                    //       would be nice to have the quoting distinction already taken care of. The performance
-                    //       difference will need to be benchmarked, though.
-                    appendString(sb, childNode, ref);
-                    break;
-                default:
-                    throw new VerifyException("Unexpected symbol in " + childNode);
+            if (child instanceof TerminalNode) {
+                final TerminalNode childNode = (TerminalNode) child;
+                switch (childNode.getSymbol().getType()) {
+                    case YangStatementParser.SEP:
+                    case YangStatementParser.PLUS:
+                        // Operator, which we are handling by concat
+                        break;
+                    default:
+                        throw new VerifyException("Unexpected symbol in " + childNode);
+                }
+            } else {
+                verify(child instanceof QuotedStringContext, "Unexpected fragment component %s", child);
+                sb.append(unquoteString((QuotedStringContext) child, ref));
+                continue;
             }
         }
         return sb.toString();
     }
 
-    private void appendString(final StringBuilder sb, final TerminalNode stringNode,
-            final StatementSourceReference ref) {
-        final String str = stringNode.getText();
-        final char firstChar = str.charAt(0);
-        final char lastChar = str.charAt(str.length() - 1);
-        if (firstChar == '"' && lastChar == '"') {
-            sb.append(normalizeDoubleQuoted(str.substring(1, str.length() - 1),
-                stringNode.getSymbol().getCharPositionInLine(), ref));
-        } else if (firstChar == '\'' && lastChar == '\'') {
-            /*
-             * According to RFC6020 a single quote character cannot occur in a single-quoted string, even when preceded
-             * by a backslash.
-             */
-            sb.append(str, 1, str.length() - 1);
-        } else {
-            checkUnquoted(str, ref);
-            sb.append(str);
-        }
-    }
-
     private String normalizeDoubleQuoted(final String str, final int dquot, final StatementSourceReference ref) {
         // Whitespace normalization happens irrespective of further handling and has no effect on the result
         final String stripped = trimWhitespace(str, dquot);
diff --git a/yang/yang-parser-rfc7950/src/test/java/org/opendaylight/yangtools/yang/stmt/YT1089Test.java b/yang/yang-parser-rfc7950/src/test/java/org/opendaylight/yangtools/yang/stmt/YT1089Test.java
new file mode 100644
index 0000000000..7fd4ad1bfa
--- /dev/null
+++ b/yang/yang-parser-rfc7950/src/test/java/org/opendaylight/yangtools/yang/stmt/YT1089Test.java
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2020 PANTHEON.tech, s.r.o. and others.  All rights reserved.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License v1.0 which accompanies this distribution,
+ * and is available at http://www.eclipse.org/legal/epl-v10.html
+ */
+package org.opendaylight.yangtools.yang.stmt;
+
+import static org.hamcrest.CoreMatchers.instanceOf;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import com.google.common.collect.Iterables;
+import java.util.Iterator;
+import org.junit.Test;
+import org.opendaylight.yangtools.yang.common.QName;
+import org.opendaylight.yangtools.yang.model.api.EffectiveModelContext;
+import org.opendaylight.yangtools.yang.model.api.meta.EffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.ContactEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.DescriptionEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.LeafEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.NamespaceEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.OrganizationEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.PrefixEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.TypeEffectiveStatement;
+
+public class YT1089Test {
+    @Test
+    public void testPlusLexing() throws Exception {
+        final EffectiveModelContext ctx = StmtTestUtils.parseYangSource("/bugs/YT1089/foo.yang");
+        assertEquals(1, ctx.getModuleStatements().size());
+
+        final Iterator<? extends EffectiveStatement<?, ?>> it =
+                Iterables.getOnlyElement(ctx.getModuleStatements().values()).effectiveSubstatements().iterator();
+
+        assertThat(it.next(), instanceOf(NamespaceEffectiveStatement.class));
+        assertThat(it.next(), instanceOf(PrefixEffectiveStatement.class));
+
+        EffectiveStatement<?, ?> stmt = it.next();
+        assertThat(stmt, instanceOf(DescriptionEffectiveStatement.class));
+        assertEquals("+something", stmt.argument());
+
+        stmt = it.next();
+        assertThat(stmt, instanceOf(ContactEffectiveStatement.class));
+        assertEquals("contact++", stmt.argument());
+
+        stmt = it.next();
+        assertThat(stmt, instanceOf(OrganizationEffectiveStatement.class));
+        assertEquals("organiza++tion", stmt.argument());
+
+        assertFoo(it.next());
+        assertBar(it.next());
+        assertBaz(it.next());
+        assertXyzzy(it.next());
+        assertFalse(it.hasNext());
+    }
+
+    private static void assertFoo(final EffectiveStatement<?, ?> stmt) {
+        assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+        assertEquals(QName.create("urn:foo", "foo"), stmt.argument());
+
+        final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+        assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+        assertEquals("+", it.next().argument());
+        assertEquals("squotdquot", it.next().argument());
+        assertEquals("++", it.next().argument());
+        assertFalse(it.hasNext());
+    }
+
+    private static void assertBar(final EffectiveStatement<?, ?> stmt) {
+        assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+        assertEquals(QName.create("urn:foo", "bar"), stmt.argument());
+        final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+        assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+        assertEquals("++", it.next().argument());
+        assertEquals("+ + ++", it.next().argument());
+        assertEquals("++ + +", it.next().argument());
+        assertFalse(it.hasNext());
+    }
+
+    private static void assertBaz(final EffectiveStatement<?, ?> stmt) {
+        assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+        assertEquals(QName.create("urn:foo", "baz"), stmt.argument());
+        final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+        assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+        assertEquals("/", it.next().argument());
+        assertEquals(":", it.next().argument());
+        assertEquals("*", it.next().argument());
+        assertFalse(it.hasNext());
+    }
+
+    private static void assertXyzzy(final EffectiveStatement<?, ?> stmt) {
+        assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+        assertEquals(QName.create("urn:foo", "xyzzy"), stmt.argument());
+        final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+        assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+        assertEquals("a weird concat", it.next().argument());
+        assertEquals("another weird concat", it.next().argument());
+        assertFalse(it.hasNext());
+    }
+}
diff --git a/yang/yang-parser-rfc7950/src/test/resources/bugs/YT1089/foo.yang b/yang/yang-parser-rfc7950/src/test/resources/bugs/YT1089/foo.yang
new file mode 100644
index 0000000000..91bd88421c
--- /dev/null
+++ b/yang/yang-parser-rfc7950/src/test/resources/bugs/YT1089/foo.yang
@@ -0,0 +1,35 @@
+module foo {
+  namespace urn:foo;
+  prefix foo;
+
+  description +something;
+  contact contact++;
+  organization organiza++tion;
+
+  leaf foo {
+    type string;
+    default +;
+    description 'squot' + "dquot";
+    reference "+" + '+';
+  }
+
+  leaf bar {
+    type string;
+    default ++;
+    description "+ + ++";
+    reference "++ + +";
+  }
+
+  leaf baz {
+    type string;
+    default /;
+    description :;
+    reference *;
+  }
+
+  leaf xyzzy {
+    type string;
+    description "a weird" +" concat";
+    reference "another"+ " weird concat";
+  }
+}
-- 
2.36.6