Rework YANG lexer/parser

[yangtools.git] / yang / yang-parser-rfc7950 / src / main / antlr / org / opendaylight / yangtools / antlrv4 / code / gen / YangStatementLexer.g4
diff --git a/yang/yang-parser-rfc7950/src/main/antlr/org/opendaylight/yangtools/antlrv4/code/gen/YangStatementLexer.g4 b/yang/yang-parser-rfc7950/src/main/antlr/org/opendaylight/yangtools/antlrv4/code/gen/YangStatementLexer.g4

index 3788c9cf928ff51c5863d8e6a8bf23c7458c80e3..fa44dd7d0e50a84e8b472691427f27126f13f3e0 100644 (file)
--- a/yang/yang-parser-rfc7950/src/main/antlr/org/opendaylight/yangtools/antlrv4/code/gen/YangStatementLexer.g4
+++ b/yang/yang-parser-rfc7950/src/main/antlr/org/opendaylight/yangtools/antlrv4/code/gen/YangStatementLexer.g4
@@ -14,7 +14,12 @@ tokens {
      SEP,
      IDENTIFIER,
      COLON,
-    PLUS
+    PLUS,
+    SLASH,
+    STAR,
+    DQUOT_STRING,
+    SQUOT_STRING,
+    UQUOT_STRING
  }
  
  SEMICOLON : ';' -> type(SEMICOLON);
@@ -23,16 +28,81 @@ RIGHT_BRACE : '}' -> type(RIGHT_BRACE);
  COLON : ':' -> type(COLON);
  PLUS : '+' -> type(PLUS);
  
-LINE_COMMENT : [ \n\r\t]* ('//' (~[\r\n]*)) [ \n\r\t]* -> skip;
+// RFC6020 section 6.1.1:
+//   Comments are C++ style.  A single line comment starts with "//" and
+//   ends at the end of the line.  A block comment is enclosed within "/*"
+//   and "*/".
+//
+// RFC7950 section 6.1.1:
+//   Comments are C++ style.  A single line comment starts with "//" and
+//   ends at the end of the line.  A block comment starts with "/*" and
+//   ends with the nearest following "*/".
+//
+//   Note that inside a quoted string (Section 6.1.3), these character
+//   pairs are never interpreted as the start or end of a comment.
+//
+// What constitutes 'end of the line' is not specified in RFC7950, hence
+// we are using RFC7950-clarified definition. Note we also need to handle
+// the case of EOF, as the user may not have included a newline.
+LINE_COMMENT : '//' .*? '\r'? ('\n' | EOF) -> skip;
  BLOCK_COMMENT : '/*' .*? '*/' -> skip;
  
  SEP: [ \n\r\t]+ -> type(SEP);
-IDENTIFIER : [a-zA-Z_/][a-zA-Z0-9_\-.:/]* -> type(IDENTIFIER);
  
-fragment SUB_STRING : ('"' (ESC | ~["])*? '"') | ('\'' (ESC | ~['])* '\'');
-fragment ESC : '\\' (["\\/bfnrt] | UNICODE);
-fragment UNICODE : 'u' HEX HEX HEX HEX;
-fragment HEX : [0-9a-fA-F] ;
+// Special-cased identifier string
+IDENTIFIER : [a-zA-Z_][a-zA-Z0-9_\-.]* -> type(IDENTIFIER);
+
+// RFC6020 section 6.1.3:
+//   If a string contains any space or tab characters, a semicolon (";"),
+//   braces ("{" or "}"), or comment sequences ("//", "/*", or "*/"), then
+//   it MUST be enclosed within double or single quotes.
+//
+// RFC7950 section 6.1.3:
+//   An unquoted string is any sequence of characters that does not
+//   contain any space, tab, carriage return, or line feed characters, a
+//   single or double quote character, a semicolon (";"), braces ("{" or
+//   "}"), or comment sequences ("//", "/*", or "*/").
+//
+// Since we need tokenization to work in both worlds, we are taking only
+// RFC6020 with CR/LF clarifications -- and allow quotes to appear in the body
+// of a string. We additionally exclude COLON, so as to prefer IDENTIFIER
+// tokenization -- which allows us to make keyword work properly.
+//
+// Furthermore we need to exclude PLUS so that concatenation works as expected
+// when + is not separated by whitespace -- even RFC7950 is far from being
+// well-specified in this regard.
+//
+// The most problematic here is the comment sequence exclusion, as we cannot
+// just exclude it from productions. We therefore provide single-char
+// tokenizations of both '*' and '/', and deal with them separately in the
+// parser.
+SLASH : '/' -> type(SLASH);
+STAR : '*' -> type(STAR);
+UQUOT_STRING :
+    // Any eager span that does not start with single/double quote and does not
+    // have slash/star.
+    ~([ \n\r\t] | [;{}:+] | [/*] | ['"])
+    ~([ \n\r\t] | [;{}:+] | [/*])*
+    -> type(UQUOT_STRING);
+
+// Double/single-quoted strings. We deal with these using specialized modes.
+DQUOT_START : '"' -> pushMode(DQUOT_STRING_MODE);
+SQUOT_START : '\'' -> pushMode(SQUOT_STRING_MODE);
  
-STRING: ((~( '\r' | '\n' | '\t' | ' ' | ';' | '{' | '"' | '\'' | '}' | '/' | '+')~( '\r' | '\n' | '\t' | ' ' | ';' | '{' | '}' )* ) | SUB_STRING );
+//
+// Double-quoted string lexing mode. We do not need to recognize all possible
+// escapes here -- just enough not to get confused by runs of backslashes and
+// recognize escaped double quotes.
+//
+mode DQUOT_STRING_MODE;
+DQUOT_STRING : (~["\\] | ('\\' .))+ -> type(DQUOT_STRING);
+DQUOT_END : '"' -> popMode;
+
+//
+// Single-quoted string lexing mode. We do not interpret anything within single
+// quotes.
+//
+mode SQUOT_STRING_MODE;
+SQUOT_STRING : ~[']+ -> type(SQUOT_STRING);
+SQUOT_END : '\'' -> popMode;