//
// Copyright (c) 2015 Cisco Systems, Inc. and others.  All rights reserved.
//
// This program and the accompanying materials are made available under the
// terms of the Eclipse Public License v1.0 which accompanies this distribution,
// and is available at http://www.eclipse.org/legal/epl-v10.html
//
lexer grammar YangStatementLexer;

tokens {
    SEMICOLON,
    LEFT_BRACE,
    RIGHT_BRACE,
    SEP,
    IDENTIFIER,
    COLON,
    PLUS,
    SLASH,
    STAR,
    DQUOT_STRING,
    SQUOT_STRING,
    UQUOT_STRING
}

SEMICOLON : ';' -> type(SEMICOLON);
LEFT_BRACE : '{' -> type(LEFT_BRACE);
RIGHT_BRACE : '}' -> type(RIGHT_BRACE);
COLON : ':' -> type(COLON);
PLUS : '+' -> type(PLUS);

// RFC6020 section 6.1.1:
//   Comments are C++ style.  A single line comment starts with "//" and
//   ends at the end of the line.  A block comment is enclosed within "/*"
//   and "*/".
//
// RFC7950 section 6.1.1:
//   Comments are C++ style.  A single line comment starts with "//" and
//   ends at the end of the line.  A block comment starts with "/*" and
//   ends with the nearest following "*/".
//
//   Note that inside a quoted string (Section 6.1.3), these character
//   pairs are never interpreted as the start or end of a comment.
//
// What constitutes 'end of the line' is not specified in RFC7950, hence
// we are using RFC7950-clarified definition. Note we also need to handle
// the case of EOF, as the user may not have included a newline.
LINE_COMMENT : '//' .*? '\r'? ('\n' | EOF) -> skip;
BLOCK_COMMENT : '/*' .*? '*/' -> skip;

SEP: [ \n\r\t]+ -> type(SEP);

// Special-cased identifier string
IDENTIFIER : [a-zA-Z_][a-zA-Z0-9_\-.]* -> type(IDENTIFIER);

// RFC6020 section 6.1.3:
//   If a string contains any space or tab characters, a semicolon (";"),
//   braces ("{" or "}"), or comment sequences ("//", "/*", or "*/"), then
//   it MUST be enclosed within double or single quotes.
//
// RFC7950 section 6.1.3:
//   An unquoted string is any sequence of characters that does not
//   contain any space, tab, carriage return, or line feed characters, a
//   single or double quote character, a semicolon (";"), braces ("{" or
//   "}"), or comment sequences ("//", "/*", or "*/").
//
// Since we need tokenization to work in both worlds, we are taking only
// RFC6020 with CR/LF clarifications -- and allow quotes to appear in the body
// of a string. We additionally exclude COLON, so as to prefer IDENTIFIER
// tokenization -- which allows us to make keyword work properly.
//
// Furthermore we need to exclude PLUS so that concatenation works as expected
// when + is not separated by whitespace -- even RFC7950 is far from being
// well-specified in this regard.
//
// The most problematic here is the comment sequence exclusion, as we cannot
// just exclude it from productions. We therefore provide single-char
// tokenizations of both '*' and '/', and deal with them separately in the
// parser.
SLASH : '/' -> type(SLASH);
STAR : '*' -> type(STAR);
UQUOT_STRING :
    // Any eager span that does not start with single/double quote and does not
    // have slash/star.
    ~([ \n\r\t] | [;{}:+] | [/*] | ['"])
    ~([ \n\r\t] | [;{}:+] | [/*])*
    -> type(UQUOT_STRING);

// Double/single-quoted strings. We deal with these using specialized modes.
DQUOT_START : '"' -> pushMode(DQUOT_STRING_MODE), skip;
SQUOT_START : '\'' -> pushMode(SQUOT_STRING_MODE), skip;

//
// Double-quoted string lexing mode. We do not need to recognize all possible
// escapes here -- just enough not to get confused by runs of backslashes and
// recognize escaped double quotes.
//
mode DQUOT_STRING_MODE;
DQUOT_STRING : (~["\\] | ('\\' .))+ -> type(DQUOT_STRING);
DQUOT_END : '"' -> popMode;

//
// Single-quoted string lexing mode. We do not interpret anything within single
// quotes.
//
mode SQUOT_STRING_MODE;
SQUOT_STRING : ~[']+ -> type(SQUOT_STRING);
SQUOT_END : '\'' -> popMode;