yang/yang-parser-antlr/src/main/antlr4/org/opendaylight/yangtools/yang/parser/antlr/YangStatementParser.g4

   1 //
   2 // Copyright (c) 2015 Cisco Systems, Inc. and others.  All rights reserved.
   3 //
   4 // This program and the accompanying materials are made available under the
   5 // terms of the Eclipse Public License v1.0 which accompanies this distribution,
   6 // and is available at http://www.eclipse.org/legal/epl-v10.html
   7 //
   8 parser grammar YangStatementParser;
   9
  10 options {
  11     tokenVocab = YangStatementLexer;
  12 }
  13
  14 // NOTE: we need to use SEP*/SEP+ because comments end up breaking whitespace
  15 //       sequences into two.
     //
     // file:      exactly one top-level statement, optionally surrounded by
     //            separators, followed by end of input.
  16 file : SEP* statement SEP* EOF;
     // statement: a keyword, optionally followed by separator(s) and an argument,
     //            terminated either by a semicolon or by a brace-delimited body
     //            holding zero or more nested statements.
  17 statement : keyword (SEP+ argument)? SEP* (SEMICOLON | LEFT_BRACE SEP* (statement SEP*)* RIGHT_BRACE);
     // keyword:   an identifier, optionally followed by a colon and a second
     //            identifier.
  18 keyword : IDENTIFIER (COLON IDENTIFIER)?;
  19
  20 // Alright, so what constitutes a string is rather funky. We need to deal with
  21 // the flaky definitions of RFC6020, which allow for insane quoting as well as
  22 // exclusion of comments. We also need to allow for stitching back tokens like
  23 // PLUS/COLON, which may end up being valid identifiers.
  24 argument :
  25     // Note on optimization: we are allowing a single IDENTIFIER, although it
  26     // is already part of unquotedString. This is strictly superfluous, but
  27     // single-IDENTIFIER arguments are very common and this eliminates an
  28     // indirection costing us at least two objects. This is not quite a case
  29     // of premature optimization, but rather IDENTIFIER is really so very
  30     // special and deserving of this treatment.
  31     IDENTIFIER
  32     |
  33     // Quoted string and concatenations thereof. We are sacrificing brevity
  34     // here to eliminate the need for another parser construct. Quoted strings
  35     // account for about 50% of all arguments encountered -- hence an added
  36     // parse tree indirection would be very visible.
  37     (DQUOT_START DQUOT_STRING? DQUOT_END | SQUOT_START SQUOT_STRING? SQUOT_END)
         // Zero or more further quoted strings, each joined by a PLUS which may
         // have separators on either side.
  38     (SEP* PLUS SEP* (DQUOT_START DQUOT_STRING? DQUOT_END | SQUOT_START SQUOT_STRING? SQUOT_END))*
  39     |
         // Everything else is handled by the unquotedString rule.
  40     unquotedString
  41     ;
  42
  43 unquotedString :
         // A lone SLASH or a run of STARs, with no other content, is accepted
         // as a complete unquoted string.
  44     SLASH | STAR+
  45     |
  46
  47     // Alright this is written in a non-trivial manner due to us wanting to
  48     // keep the number of parser objects (and hence memory pressure) down.
  49     //
  50     // Our aim is to forbid '//', '/*' and '*/' from being accepted as a
  51     // valid unquoted string. Normally we would write this as a recursive
  52     // parser rule for concatenating on '*' and '/' and let ANTLR figure it
  53     // out. Unfortunately that results in a deep parse tree, essentially
  54     // having one level for each such concatenation. For a test case imagine
  55     // how "a*b/c*d*e**f" would get parsed with a recursive grammar.
  56     //
  57     // Now we cannot do much about tokenization, but we can statically express
  58     // the shape we are looking for:
  59
  60     //   so an unquoted string may optionally start with a single SLASH or any
  61     //   number of STARs ...
  62     (SLASH? | STAR*)
  63
  64     //   ... but that needs to be followed by at least one span of other
  65     //       content, which is what we are really aiming for. This ensures
  66     //       any leading SLASH/STAR is followed by a non-(SLASH|STAR) ...
  67     (COLON | PLUS | IDENTIFIER | UQUOT_STRING)+
  68
  69     //   ... and based on that knowledge, we allow another SLASH or run of
  70     //       STARs to follow, but it has to be again followed by a run of
  71     //       other tokens -- and rinse&repeat that any number of times.
  72     //       We still have ensured that the span matched does not end with
  73     //       a SLASH or a STAR, always retaining the 'does not end with
  74     //       SLASH or STAR' invariant ...
  75     ((SLASH | STAR+) (COLON | PLUS | IDENTIFIER | UQUOT_STRING)+)*
  76
  77     //   ... and therefore it is always safe to have such a span end with
  78     //       a SLASH or STARs.
  79     (SLASH? | STAR*)
  80     ;