SEP,
IDENTIFIER,
COLON,
- PLUS
+ PLUS,
+ SLASH,
+ STAR,
+ DQUOT_STRING,
+ SQUOT_STRING,
+ UQUOT_STRING
}
SEMICOLON : ';' -> type(SEMICOLON);
COLON : ':' -> type(COLON);
PLUS : '+' -> type(PLUS);
-LINE_COMMENT : [ \n\r\t]* ('//' (~[\r\n]*)) [ \n\r\t]* -> skip;
+// RFC6020 section 6.1.1:
+// Comments are C++ style. A single line comment starts with "//" and
+// ends at the end of the line. A block comment is enclosed within "/*"
+// and "*/".
+//
+// RFC7950 section 6.1.1:
+// Comments are C++ style. A single line comment starts with "//" and
+// ends at the end of the line. A block comment starts with "/*" and
+// ends with the nearest following "*/".
+//
+// Note that inside a quoted string (Section 6.1.3), these character
+// pairs are never interpreted as the start or end of a comment.
+//
+// What constitutes 'end of the line' is not specified in RFC7950, hence
+// we are using RFC7950-clarified definition. Note we also need to handle
+// the case of EOF, as the user may not have included a newline.
+LINE_COMMENT : '//' .*? '\r'? ('\n' | EOF) -> skip;
BLOCK_COMMENT : '/*' .*? '*/' -> skip;
SEP: [ \n\r\t]+ -> type(SEP);
-IDENTIFIER : [a-zA-Z_/][a-zA-Z0-9_\-.:/]* -> type(IDENTIFIER);
-fragment SUB_STRING : ('"' (ESC | ~["])*? '"') | ('\'' (ESC | ~['])* '\'');
-fragment ESC : '\\' (["\\/bfnrt] | UNICODE);
-fragment UNICODE : 'u' HEX HEX HEX HEX;
-fragment HEX : [0-9a-fA-F] ;
+// Special-cased identifier string
+IDENTIFIER : [a-zA-Z_][a-zA-Z0-9_\-.]* -> type(IDENTIFIER);
+
+// RFC6020 section 6.1.3:
+// If a string contains any space or tab characters, a semicolon (";"),
+// braces ("{" or "}"), or comment sequences ("//", "/*", or "*/"), then
+// it MUST be enclosed within double or single quotes.
+//
+// RFC7950 section 6.1.3:
+// An unquoted string is any sequence of characters that does not
+// contain any space, tab, carriage return, or line feed characters, a
+// single or double quote character, a semicolon (";"), braces ("{" or
+// "}"), or comment sequences ("//", "/*", or "*/").
+//
+// Since we need tokenization to work in both worlds, we are taking only
+// RFC6020 with CR/LF clarifications -- and allow quotes to appear in the body
+// of a string. We additionally exclude COLON, so as to prefer IDENTIFIER
+// tokenization -- which allows us to make keyword work properly.
+//
+// Furthermore we need to exclude PLUS so that concatenation works as expected
+// when + is not separated by whitespace -- even RFC7950 is far from being
+// well-specified in this regard.
+//
+// The most problematic here is the comment sequence exclusion, as we cannot
+// just exclude it from productions. We therefore provide single-char
+// tokenizations of both '*' and '/', and deal with them separately in the
+// parser.
+SLASH : '/' -> type(SLASH);
+STAR : '*' -> type(STAR);
+UQUOT_STRING :
+ // Any eager span that does not start with single/double quote and does not
+ // have slash/star.
+ ~([ \n\r\t] | [;{}:+] | [/*] | ['"])
+ ~([ \n\r\t] | [;{}:+] | [/*])*
+ -> type(UQUOT_STRING);
+
+// Double/single-quoted strings. We deal with these using specialized modes.
+DQUOT_START : '"' -> pushMode(DQUOT_STRING_MODE);
+SQUOT_START : '\'' -> pushMode(SQUOT_STRING_MODE);
-STRING: ((~( '\r' | '\n' | '\t' | ' ' | ';' | '{' | '"' | '\'' | '}' | '/' | '+')~( '\r' | '\n' | '\t' | ' ' | ';' | '{' | '}' )* ) | SUB_STRING );
+//
+// Double-quoted string lexing mode. We do not need to recognize all possible
+// escapes here -- just enough not to get confused by runs of backslashes and
+// recognize escaped double quotes.
+//
+mode DQUOT_STRING_MODE;
+DQUOT_STRING : (~["\\] | ('\\' .))+ -> type(DQUOT_STRING);
+DQUOT_END : '"' -> popMode;
+
+//
+// Single-quoted string lexing mode. We do not interpret anything within single
+// quotes.
+//
+mode SQUOT_STRING_MODE;
+SQUOT_STRING : ~[']+ -> type(SQUOT_STRING);
+SQUOT_END : '\'' -> popMode;
statement : keyword (SEP+ argument)? SEP* (SEMICOLON | LEFT_BRACE SEP* (statement SEP*)* RIGHT_BRACE);
keyword : IDENTIFIER (COLON IDENTIFIER)?;
-argument : STRING (SEP* PLUS SEP* STRING)* | IDENTIFIER;
+// Alright, so what constitutes a string is rather funky. We need to deal with
+// the flaky definitions of RFC6020, which allow for insane quoting as well as
+// exclusion of comments. We also need to allow for stitching back tokens like
+// PLUS/COLON, which may end up being valid identifiers. Finally we need to allow
+// IDENTIFIER to be concatenated back to a string
+argument : unquotedString | quotedString (SEP* PLUS SEP* quotedString)*;
+
+quotedString :
+ DQUOT_START DQUOT_STRING? DQUOT_END
+ |
+ SQUOT_START SQUOT_STRING? SQUOT_END
+ ;
+
+unquotedString : SLASH | STAR+ | (SLASH? | STAR*) stringPart+ (SLASH? | STAR*);
+
+// A string which is guaranteed to not have slash/star in either start or end
+// and can thus be concatenated without allowing '/*', '//' and '*/' to appear.
+stringPart:
+ (IDENTIFIER | COLON | PLUS | UQUOT_STRING)+
+ |
+ stringPart SLASH stringPart
+ |
+ stringPart STAR+ stringPart
+ ;
+
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.CharMatcher;
import com.google.common.base.VerifyException;
+import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.tree.ParseTree;
import org.antlr.v4.runtime.tree.TerminalNode;
import org.eclipse.jdt.annotation.NonNull;
import org.opendaylight.yangtools.yang.common.YangVersion;
import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser;
import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser.ArgumentContext;
+import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser.QuotedStringContext;
+import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser.UnquotedStringContext;
import org.opendaylight.yangtools.yang.parser.spi.source.SourceException;
import org.opendaylight.yangtools.yang.parser.spi.source.StatementSourceReference;
final @NonNull String stringFromStringContext(final ArgumentContext context, final StatementSourceReference ref) {
// Get first child, which we fully expect to exist and be a lexer token
final ParseTree firstChild = context.getChild(0);
- verify(firstChild instanceof TerminalNode, "Unexpected shape of %s", context);
- final TerminalNode firstNode = (TerminalNode) firstChild;
- final int firstType = firstNode.getSymbol().getType();
- switch (firstType) {
- case YangStatementParser.IDENTIFIER:
- // Simple case, there is a simple string, which cannot contain anything that we would need to process.
- return firstNode.getText();
- case YangStatementParser.STRING:
- // Complex case, defer to a separate method
- return concatStrings(context, ref);
+ if (firstChild instanceof UnquotedStringContext) {
+ // Simple case, just grab the text, as ANTLR has done all the heavy lifting
+ final String str = firstChild.getText();
+ checkUnquoted(str, ref);
+ return str;
+ }
+
+ verify(firstChild instanceof QuotedStringContext, "Unexpected shape of %s", context);
+ if (context.getChildCount() == 1) {
+ // No concatenation needed, special-case
+ return unquoteString((QuotedStringContext) firstChild, ref);
+ }
+
+ // Potentially-complex case of string quoting, escaping and concatenation.
+ return concatStrings(context, ref);
+ }
+
+ private String unquoteString(final QuotedStringContext context, final StatementSourceReference ref) {
+ final ParseTree secondChild = context.getChild(1);
+ verify(secondChild instanceof TerminalNode, "Unexpected shape of %s", context);
+ final Token secondToken = ((TerminalNode) secondChild).getSymbol();
+ final int type = secondToken.getType();
+ switch (type) {
+ case YangStatementParser.DQUOT_END:
+ case YangStatementParser.SQUOT_END:
+ // We are missing actual body, hence this is an empty string
+ return "";
+ case YangStatementParser.SQUOT_STRING:
+ return secondChild.getText();
+ case YangStatementParser.DQUOT_STRING:
+ // We should be looking at the first token, which is DQUOT_START, but since it is a single-character
+ // token, let's not bother.
+ return normalizeDoubleQuoted(secondChild.getText(), secondToken.getCharPositionInLine() - 1, ref);
default:
- throw new VerifyException("Unexpected first symbol in " + context);
+ throw new VerifyException("Unhandled token type " + type);
}
}
/*
* We have multiple fragments. Just search the tree. This code is equivalent to
*
- * context.STRING().forEach(stringNode -> appendString(sb, stringNode, ref))
+ * context.quotedString().forEach(stringNode -> sb.append(unquoteString(stringNode, ref))
*
* except we minimize allocations which that would do.
*/
final StringBuilder sb = new StringBuilder();
for (ParseTree child : context.children) {
- verify(child instanceof TerminalNode, "Unexpected fragment component %s", child);
- final TerminalNode childNode = (TerminalNode) child;
- switch (childNode.getSymbol().getType()) {
- case YangStatementParser.SEP:
- // Ignore whitespace
- break;
- case YangStatementParser.PLUS:
- // Operator, which we are handling by concat
- break;
- case YangStatementParser.STRING:
- // a lexer string, could be pretty much anything
- // TODO: appendString() is a dispatch based on quotes, which we should be able to defer to lexer for
- // a dedicated type. That would expand the switch table here, but since we have it anyway, it
- // would be nice to have the quoting distinction already taken care of. The performance
- // difference will need to be benchmarked, though.
- appendString(sb, childNode, ref);
- break;
- default:
- throw new VerifyException("Unexpected symbol in " + childNode);
+ if (child instanceof TerminalNode) {
+ final TerminalNode childNode = (TerminalNode) child;
+ switch (childNode.getSymbol().getType()) {
+ case YangStatementParser.SEP:
+ case YangStatementParser.PLUS:
+ // Operator, which we are handling by concat
+ break;
+ default:
+ throw new VerifyException("Unexpected symbol in " + childNode);
+ }
+ } else {
+ verify(child instanceof QuotedStringContext, "Unexpected fragment component %s", child);
+ sb.append(unquoteString((QuotedStringContext) child, ref));
+ continue;
}
}
return sb.toString();
}
- private void appendString(final StringBuilder sb, final TerminalNode stringNode,
- final StatementSourceReference ref) {
- final String str = stringNode.getText();
- final char firstChar = str.charAt(0);
- final char lastChar = str.charAt(str.length() - 1);
- if (firstChar == '"' && lastChar == '"') {
- sb.append(normalizeDoubleQuoted(str.substring(1, str.length() - 1),
- stringNode.getSymbol().getCharPositionInLine(), ref));
- } else if (firstChar == '\'' && lastChar == '\'') {
- /*
- * According to RFC6020 a single quote character cannot occur in a single-quoted string, even when preceded
- * by a backslash.
- */
- sb.append(str, 1, str.length() - 1);
- } else {
- checkUnquoted(str, ref);
- sb.append(str);
- }
- }
-
private String normalizeDoubleQuoted(final String str, final int dquot, final StatementSourceReference ref) {
// Whitespace normalization happens irrespective of further handling and has no effect on the result
final String stripped = trimWhitespace(str, dquot);
--- /dev/null
+/*
+ * Copyright (c) 2020 PANTHEON.tech, s.r.o. and others. All rights reserved.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License v1.0 which accompanies this distribution,
+ * and is available at http://www.eclipse.org/legal/epl-v10.html
+ */
+package org.opendaylight.yangtools.yang.stmt;
+
+import static org.hamcrest.CoreMatchers.instanceOf;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import com.google.common.collect.Iterables;
+import java.util.Iterator;
+import org.junit.Test;
+import org.opendaylight.yangtools.yang.common.QName;
+import org.opendaylight.yangtools.yang.model.api.SchemaContext;
+import org.opendaylight.yangtools.yang.model.api.meta.EffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.ContactEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.DescriptionEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.LeafEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.ModuleEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.NamespaceEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.OrganizationEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.PrefixEffectiveStatement;
+import org.opendaylight.yangtools.yang.model.api.stmt.TypeEffectiveStatement;
+
+public class YT1089Test {
+ @Test
+ public void testPlusLexing() throws Exception {
+ final SchemaContext ctx = StmtTestUtils.parseYangSource("/bugs/YT1089/foo.yang");
+ assertEquals(1, ctx.getModules().size());
+
+ final Iterator<? extends EffectiveStatement<?, ?>> it =
+ ((ModuleEffectiveStatement)Iterables.getOnlyElement(ctx.getModules()))
+ .effectiveSubstatements().iterator();
+
+ assertThat(it.next(), instanceOf(NamespaceEffectiveStatement.class));
+ assertThat(it.next(), instanceOf(PrefixEffectiveStatement.class));
+
+ EffectiveStatement<?, ?> stmt = it.next();
+ assertThat(stmt, instanceOf(DescriptionEffectiveStatement.class));
+ assertEquals("+something", stmt.argument());
+
+ stmt = it.next();
+ assertThat(stmt, instanceOf(ContactEffectiveStatement.class));
+ assertEquals("contact++", stmt.argument());
+
+ stmt = it.next();
+ assertThat(stmt, instanceOf(OrganizationEffectiveStatement.class));
+ assertEquals("organiza++tion", stmt.argument());
+
+ assertFoo(it.next());
+ assertBar(it.next());
+ assertBaz(it.next());
+ assertXyzzy(it.next());
+ assertFalse(it.hasNext());
+ }
+
+ private static void assertFoo(final EffectiveStatement<?, ?> stmt) {
+ assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+ assertEquals(QName.create("urn:foo", "foo"), stmt.argument());
+
+ final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+ assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+ assertEquals("+", it.next().argument());
+ assertEquals("squotdquot", it.next().argument());
+ assertEquals("++", it.next().argument());
+ assertFalse(it.hasNext());
+ }
+
+ private static void assertBar(final EffectiveStatement<?, ?> stmt) {
+ assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+ assertEquals(QName.create("urn:foo", "bar"), stmt.argument());
+ final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+ assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+ assertEquals("++", it.next().argument());
+ assertEquals("+ + ++", it.next().argument());
+ assertEquals("++ + +", it.next().argument());
+ assertFalse(it.hasNext());
+ }
+
+ private static void assertBaz(final EffectiveStatement<?, ?> stmt) {
+ assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+ assertEquals(QName.create("urn:foo", "baz"), stmt.argument());
+ final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+ assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+ assertEquals("/", it.next().argument());
+ assertEquals(":", it.next().argument());
+ assertEquals("*", it.next().argument());
+ assertFalse(it.hasNext());
+ }
+
+ private static void assertXyzzy(final EffectiveStatement<?, ?> stmt) {
+ assertThat(stmt, instanceOf(LeafEffectiveStatement.class));
+ assertEquals(QName.create("urn:foo", "xyzzy"), stmt.argument());
+ final Iterator<? extends EffectiveStatement<?, ?>> it = stmt.effectiveSubstatements().iterator();
+ assertThat(it.next(), instanceOf(TypeEffectiveStatement.class));
+ assertEquals("a weird concat", it.next().argument());
+ assertEquals("another weird concat", it.next().argument());
+ assertFalse(it.hasNext());
+ }
+}
--- /dev/null
+module foo {
+ namespace urn:foo;
+ prefix foo;
+
+ description +something;
+ contact contact++;
+ organization organiza++tion;
+
+ leaf foo {
+ type string;
+ default +;
+ description 'squot' + "dquot";
+ reference "+" + '+';
+ }
+
+ leaf bar {
+ type string;
+ default ++;
+ description "+ + ++";
+ reference "++ + +";
+ }
+
+ leaf baz {
+ type string;
+ default /;
+ description :;
+ reference *;
+ }
+
+ leaf xyzzy {
+ type string;
+ description "a weird" +" concat";
+ reference "another"+ " weird concat";
+ }
+}