yang/yang-parser-rfc7950/src/main/java/org/opendaylight/yangtools/yang/parser/rfc7950/repo/ArgumentContextUtils.java

   1 /*
   2  * Copyright (c) 2015 Cisco Systems, Inc. and others.  All rights reserved.
   3  *
   4  * This program and the accompanying materials are made available under the
   5  * terms of the Eclipse Public License v1.0 which accompanies this distribution,
   6  * and is available at http://www.eclipse.org/legal/epl-v10.html
   7  */
   8 package org.opendaylight.yangtools.yang.parser.rfc7950.repo;
   9
  10 import static com.google.common.base.Verify.verify;
  11
  12 import com.google.common.annotations.VisibleForTesting;
  13 import com.google.common.base.CharMatcher;
  14 import com.google.common.base.VerifyException;
  15 import java.util.regex.Pattern;
  16 import org.antlr.v4.runtime.tree.ParseTree;
  17 import org.antlr.v4.runtime.tree.TerminalNode;
  18 import org.eclipse.jdt.annotation.NonNull;
  19 import org.opendaylight.yangtools.yang.common.YangVersion;
  20 import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser;
  21 import org.opendaylight.yangtools.yang.parser.antlr.YangStatementParser.ArgumentContext;
  22 import org.opendaylight.yangtools.yang.parser.spi.source.SourceException;
  23 import org.opendaylight.yangtools.yang.parser.spi.source.StatementSourceReference;
  24
  25 /**
  26  * Utilities for dealing with YANG statement argument strings, encapsulated in ANTLR grammar's ArgumentContext.
  27  */
  28 enum ArgumentContextUtils {
  29     /**
  30      * YANG 1.0 version of strings, which were not completely clarified in RFC6020.
  31      */
  32     RFC6020 {
  33         @Override
  34         void checkDoubleQuoted(final String str, final StatementSourceReference ref) {
  35             // No-op
  36         }
  37
  38         @Override
  39         void checkUnquoted(final String str, final StatementSourceReference ref) {
  40             // No-op
  41         }
  42     },
  43     /**
  44      * YANG 1.1 version of strings, which were clarified in RFC7950.
  45      */
  46     // NOTE: the differences clarified lead to a proper ability to delegate this to ANTLR lexer, but that does not
  47     //       understand versions and needs to work with both.
  48     RFC7950 {
  49         @Override
  50         void checkDoubleQuoted(final String str, final StatementSourceReference ref) {
  51             // FIXME: YANGTOOLS-1079: we should forward backslash to this method, so that it does not start from the
  52             //                        start from the start of the string. Furthermore this logic should operate on spans
  53             //                        of characters -- i.e. the check for backslash should be a search instead -- as
  54             //                        String knows how to do that and can do it more efficiently than this loop.
  55             for (int i = 0; i < str.length() - 1; i++) {
  56                 if (str.charAt(i) == '\\') {
  57                     switch (str.charAt(i + 1)) {
  58                         case 'n':
  59                         case 't':
  60                         case '\\':
  61                         case '\"':
  62                             i++;
  63                             break;
  64                         default:
  65                             throw new SourceException(ref, "YANG 1.1: illegal double quoted string (%s). In double "
  66                                     + "quoted string the backslash must be followed by one of the following character "
  67                                     + "[n,t,\",\\], but was '%s'.", str, str.charAt(i + 1));
  68                     }
  69                 }
  70             }
  71         }
  72
  73         @Override
  74         void checkUnquoted(final String str, final StatementSourceReference ref) {
  75             SourceException.throwIf(ANYQUOTE_MATCHER.matchesAnyOf(str), ref,
  76                 "YANG 1.1: unquoted string (%s) contains illegal characters", str);
  77         }
  78     };
  79
  80     private static final CharMatcher WHITESPACE_MATCHER = CharMatcher.whitespace();
  81     private static final CharMatcher ANYQUOTE_MATCHER = CharMatcher.anyOf("'\"");
  82     private static final Pattern ESCAPED_DQUOT = Pattern.compile("\\\"", Pattern.LITERAL);
  83     private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\", Pattern.LITERAL);
  84     private static final Pattern ESCAPED_LF = Pattern.compile("\\n", Pattern.LITERAL);
  85     private static final Pattern ESCAPED_TAB = Pattern.compile("\\t", Pattern.LITERAL);
  86
  87     static @NonNull ArgumentContextUtils forVersion(final YangVersion version) {
  88         switch (version) {
  89             case VERSION_1:
  90                 return RFC6020;
  91             case VERSION_1_1:
  92                 return RFC7950;
  93             default:
  94                 throw new IllegalStateException("Unhandled version " + version);
  95         }
  96     }
  97
  98     /*
  99      * NOTE: this method we do not use convenience methods provided by generated parser code, but instead are making
 100      *       based on the grammar assumptions. While this is more verbose, it cuts out a number of unnecessary code,
 101      *       such as intermediate List allocation et al.
 102      */
 103     final @NonNull String stringFromStringContext(final ArgumentContext context, final StatementSourceReference ref) {
 104         // Get first child, which we fully expect to exist and be a lexer token
 105         final ParseTree firstChild = context.getChild(0);
 106         verify(firstChild instanceof TerminalNode, "Unexpected shape of %s", context);
 107         final TerminalNode firstNode = (TerminalNode) firstChild;
 108         final int firstType = firstNode.getSymbol().getType();
 109         switch (firstType) {
 110             case YangStatementParser.IDENTIFIER:
 111                 // Simple case, there is a simple string, which cannot contain anything that we would need to process.
 112                 return firstNode.getText();
 113             case YangStatementParser.STRING:
 114                 // Complex case, defer to a separate method
 115                 return concatStrings(context, ref);
 116             default:
 117                 throw new VerifyException("Unexpected first symbol in " + context);
 118         }
 119     }
 120
 121     private String concatStrings(final ArgumentContext context, final StatementSourceReference ref) {
 122         /*
 123          * We have multiple fragments. Just search the tree. This code is equivalent to
 124          *
 125          *    context.STRING().forEach(stringNode -> appendString(sb, stringNode, ref))
 126          *
 127          * except we minimize allocations which that would do.
 128          */
 129         final StringBuilder sb = new StringBuilder();
 130         for (ParseTree child : context.children) {
 131             verify(child instanceof TerminalNode, "Unexpected fragment component %s", child);
 132             final TerminalNode childNode = (TerminalNode) child;
 133             switch (childNode.getSymbol().getType()) {
 134                 case YangStatementParser.SEP:
 135                     // Ignore whitespace
 136                     break;
 137                 case YangStatementParser.PLUS:
 138                     // Operator, which we are handling by concat
 139                     break;
 140                 case YangStatementParser.STRING:
 141                     // a lexer string, could be pretty much anything
 142                     // FIXME: YANGTOOLS-1079: appendString() is a dispatch based on quotes, which we should be able to
 143                     //                        defer to lexer for a dedicated type. That would expand the switch table
 144                     //                        here, but since we have it anyway, it would be nice to have the quoting
 145                     //                        distinction already taken care of. The performance difference will need to
 146                     //                        be benchmarked, though.
 147                     appendString(sb, childNode, ref);
 148                     break;
 149                 default:
 150                     throw new VerifyException("Unexpected symbol in " + childNode);
 151             }
 152         }
 153         return sb.toString();
 154     }
 155
 156     private void appendString(final StringBuilder sb, final TerminalNode stringNode,
 157             final StatementSourceReference ref) {
 158         final String str = stringNode.getText();
 159         final char firstChar = str.charAt(0);
 160         final char lastChar = str.charAt(str.length() - 1);
 161         if (firstChar == '"' && lastChar == '"') {
 162             sb.append(normalizeDoubleQuoted(str.substring(1, str.length() - 1),
 163                 stringNode.getSymbol().getCharPositionInLine(), ref));
 164         } else if (firstChar == '\'' && lastChar == '\'') {
 165             /*
 166              * According to RFC6020 a single quote character cannot occur in a single-quoted string, even when preceded
 167              * by a backslash.
 168              */
 169             sb.append(str, 1, str.length() - 1);
 170         } else {
 171             checkUnquoted(str, ref);
 172             sb.append(str);
 173         }
 174     }
 175
 176     private String normalizeDoubleQuoted(final String str, final int dquot, final StatementSourceReference ref) {
 177         // Whitespace normalization happens irrespective of further handling and has no effect on the result
 178         final String stripped = trimWhitespace(str, dquot);
 179
 180         // Now we need to perform some amount of unescaping. This serves as a pre-check before we dispatch
 181         // validation and processing (which will reuse the work we have done)
 182         final int backslash = stripped.indexOf('\\');
 183         return backslash == -1 ? stripped : unescape(stripped, backslash, ref);
 184     }
 185
 186     /*
 187      * NOTE: Enforcement and transformation logic done by these methods should logically reside in the lexer and ANTLR
 188      *       account the for it with lexer modes. We do not want to force a re-lexing phase in the parser just because
 189      *       we decided to let ANTLR do the work.
 190      */
 191     // FIXME: YANGTOOLS-1079: Re-evaluate above comment once our integration surface with lexer has been decided
 192     abstract void checkDoubleQuoted(String str, StatementSourceReference ref);
 193
 194     abstract void checkUnquoted(String str, StatementSourceReference ref);
 195
 196     /*
 197      * Unescape escaped double quotes, tabs, new line and backslash in the inner string and trim the result.
 198      */
 199     private String unescape(final String str, final int backslash, final StatementSourceReference ref) {
 200         checkDoubleQuoted(str, ref);
 201
 202         // FIXME: YANGTOOLS-1079: given we the leading backslash, it would be more efficient to walk the string and
 203         //                        unescape in one go
 204         return ESCAPED_TAB.matcher(
 205                     ESCAPED_LF.matcher(
 206                         ESCAPED_BACKSLASH.matcher(
 207                             ESCAPED_DQUOT.matcher(str).replaceAll("\\\""))
 208                         .replaceAll("\\\\"))
 209                     .replaceAll("\\\n"))
 210                .replaceAll("\\\t");
 211     }
 212
 213     @VisibleForTesting
 214     static String trimWhitespace(final String str, final int dquot) {
 215         final int firstBrk = str.indexOf('\n');
 216         if (firstBrk == -1) {
 217             return str;
 218         }
 219
 220         // Okay, we may need to do some trimming, set up a builder and append the first segment
 221         final int length = str.length();
 222         final StringBuilder sb = new StringBuilder(length);
 223
 224         // Append first segment, which needs only tail-trimming
 225         sb.append(str, 0, trimTrailing(str, 0, firstBrk)).append('\n');
 226
 227         // With that out of the way, setup our iteration state. The string segment we are looking at is
 228         // str.substring(start, end), which is guaranteed not to include any line breaks, i.e. end <= brk unless we are
 229         // at the last segment.
 230         int start = firstBrk + 1;
 231         int brk = str.indexOf('\n', start);
 232
 233         // Loop over inner strings
 234         while (brk != -1) {
 235             trimLeadingAndAppend(sb, dquot, str, start, trimTrailing(str, start, brk)).append('\n');
 236             start = brk + 1;
 237             brk = str.indexOf('\n', start);
 238         }
 239
 240         return trimLeadingAndAppend(sb, dquot, str, start, length).toString();
 241     }
 242
 243     private static StringBuilder trimLeadingAndAppend(final StringBuilder sb, final int dquot, final String str,
 244             final int start, final int end) {
 245         int offset = start;
 246         int pos = 0;
 247
 248         while (pos <= dquot) {
 249             if (offset == end) {
 250                 // We ran out of data, nothing to append
 251                 return sb;
 252             }
 253
 254             final char ch = str.charAt(offset);
 255             if (ch == '\t') {
 256                 // tabs are to be treated as 8 spaces
 257                 pos += 8;
 258             } else if (WHITESPACE_MATCHER.matches(ch)) {
 259                 pos++;
 260             } else {
 261                 break;
 262             }
 263
 264             offset++;
 265         }
 266
 267         // We have expanded beyond double quotes, push equivalent spaces
 268         while (pos - 1 > dquot) {
 269             sb.append(' ');
 270             pos--;
 271         }
 272
 273         return sb.append(str, offset, end);
 274     }
 275
 276     private static int trimTrailing(final String str, final int start, final int end) {
 277         int ret = end;
 278         while (ret > start) {
 279             final int prev = ret - 1;
 280             if (!WHITESPACE_MATCHER.matches(str.charAt(prev))) {
 281                 break;
 282             }
 283             ret = prev;
 284         }
 285         return ret;
 286     }
 287 }