[llvm-rc] Implement escape sequences in .rc files.

This allows the escape sequences (\a, \n, \r, \t, \\, \x[0-9a-f]*, \[0-7]*, "") to appear in .rc scripts. These are parsed and output in the same way as in the original Microsoft implementation. The way these sequences are processed depends on the type of the resource they reside in, and on whether the user declared the string to be "wide" or "narrow". I tried to maintain maximum compatibility with the original tool (and to fail in some erroneous situations that are accepted by rc.exe). However, there are some (extremely rare) cases where the Microsoft tool outputs nonsense. I found it infeasible to detect such cases. Patch by Marek Sokolowski. Differential Revision: https://reviews.llvm.org/D38426 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@315118 91177308-0d34-0410-b5e6-96231b3b80d8
author: Zachary Turner <zturner@google.com> 2017-10-06 22:05:15 +0000
committer: Zachary Turner <zturner@google.com> 2017-10-06 22:05:15 +0000
commit: b231046a7d1522b4e26d167f111aa5ca06e3d585 (patch)
tree: 9265b29d0224c58ecf8788739ddc94ae72cc9bb2 /tools/llvm-rc
parent: 080f10e7ac1ed9aa77d848d86397614ca1c32045 (diff)
2 files changed, 159 insertions, 5 deletions
diff --git a/tools/llvm-rc/ResourceFileWriter.cpp b/tools/llvm-rc/ResourceFileWriter.cpp
index 1d23fb966f6..8b234ae96d6 100644
--- a/tools/llvm-rc/ResourceFileWriter.cpp
+++ b/tools/llvm-rc/ResourceFileWriter.cpp
@@ -122,24 +122,175 @@ enum class NullHandlingMethod {
   CutAtDoubleNull // Terminate string on '\0\0'; strip final '\0'.
 };
 
-// Parses an identifier or string and returns a processed version of it.
-// For now, it only strips the string boundaries, but TODO:
+// Parses an identifier or string and returns a processed version of it:
+//   * Strip the string boundary quotes.
 //   * Squash "" to a single ".
 //   * Replace the escape sequences with their processed version.
 // For identifiers, this is no-op.
 static Error processString(StringRef Str, NullHandlingMethod NullHandler,
                            bool &IsLongString, SmallVectorImpl<UTF16> &Result) {
   bool IsString = stripQuotes(Str, IsLongString);
-  convertUTF8ToUTF16String(Str, Result);
+  SmallVector<UTF16, 128> Chars;
+  convertUTF8ToUTF16String(Str, Chars);
 
   if (!IsString) {
     // It's an identifier if it's not a string. Make all characters uppercase.
-    for (UTF16 &Ch : Result) {
+    for (UTF16 &Ch : Chars) {
       assert(Ch <= 0x7F && "We didn't allow identifiers to be non-ASCII");
       Ch = toupper(Ch);
     }
+    Result.swap(Chars);
     return Error::success();
   }
+  Result.reserve(Chars.size());
+  size_t Pos = 0;
+
+  auto AddRes = [&Result, NullHandler, IsLongString](UTF16 Char) -> Error {
+    if (!IsLongString) {
+      if (NullHandler == NullHandlingMethod::UserResource) {
+        // Narrow strings in user-defined resources are *not* output in
+        // UTF-16 format.
+        if (Char > 0xFF)
+          return createError("Non-8-bit codepoint (" + Twine(Char) +
+                             ") can't occur in a user-defined narrow string");
+
+      } else {
+        // In case of narrow non-user strings, Windows RC converts
+        // [0x80, 0xFF] chars according to the current codepage.
+        // There is no 'codepage' concept settled in every supported platform,
+        // so we should reject such inputs.
+        if (Char > 0x7F && Char <= 0xFF)
+          return createError("Non-ASCII 8-bit codepoint (" + Twine(Char) +
+                             ") can't "
+                             "occur in a non-Unicode string");
+      }
+    }
+
+    Result.push_back(Char);
+    return Error::success();
+  };
+
+  while (Pos < Chars.size()) {
+    UTF16 CurChar = Chars[Pos];
+    ++Pos;
+
+    // Strip double "".
+    if (CurChar == '"') {
+      if (Pos == Chars.size() || Chars[Pos] != '"')
+        return createError("Expected \"\"");
+      ++Pos;
+      RETURN_IF_ERROR(AddRes('"'));
+      continue;
+    }
+
+    if (CurChar == '\\') {
+      UTF16 TypeChar = Chars[Pos];
+      ++Pos;
+
+      if (TypeChar == 'x' || TypeChar == 'X') {
+        // Read a hex number. Max number of characters to read differs between
+        // narrow and wide strings.
+        UTF16 ReadInt = 0;
+        size_t RemainingChars = IsLongString ? 4 : 2;
+        // We don't want to read non-ASCII hex digits. std:: functions past
+        // 0xFF invoke UB.
+        //
+        // FIXME: actually, Microsoft version probably doesn't check this
+        // condition and uses their Unicode version of 'isxdigit'. However,
+        // there are some hex-digit Unicode characters outside of ASCII, and
+        // some of these are actually accepted by rc.exe, the notable example
+        // being fullwidth forms (U+FF10..U+FF19 etc.) These can be written
+        // instead of ASCII digits in \x... escape sequence and get accepted.
+        // However, the resulting hexcodes seem totally unpredictable.
+        // We think it's infeasible to try to reproduce this behavior, nor to
+        // put effort in order to detect it.
+        while (RemainingChars && Pos < Chars.size() && Chars[Pos] < 0x80) {
+          if (!isxdigit(Chars[Pos]))
+            break;
+          char Digit = tolower(Chars[Pos]);
+          ++Pos;
+
+          ReadInt <<= 4;
+          if (isdigit(Digit))
+            ReadInt |= Digit - '0';
+          else
+            ReadInt |= Digit - 'a' + 10;
+
+          --RemainingChars;
+        }
+
+        RETURN_IF_ERROR(AddRes(ReadInt));
+        continue;
+      }
+
+      if (TypeChar >= '0' && TypeChar < '8') {
+        // Read an octal number. Note that we've already read the first digit.
+        UTF16 ReadInt = TypeChar - '0';
+        size_t RemainingChars = IsLongString ? 6 : 2;
+
+        while (RemainingChars && Pos < Chars.size() && Chars[Pos] >= '0' &&
+               Chars[Pos] < '8') {
+          ReadInt <<= 3;
+          ReadInt |= Chars[Pos] - '0';
+          --RemainingChars;
+          ++Pos;
+        }
+
+        RETURN_IF_ERROR(AddRes(ReadInt));
+
+        continue;
+      }
+
+      switch (TypeChar) {
+      case 'A':
+      case 'a':
+        // Windows '\a' translates into '\b' (Backspace).
+        RETURN_IF_ERROR(AddRes('\b'));
+        break;
+
+      case 'n': // Somehow, RC doesn't recognize '\N' and '\R'.
+        RETURN_IF_ERROR(AddRes('\n'));
+        break;
+
+      case 'r':
+        RETURN_IF_ERROR(AddRes('\r'));
+        break;
+
+      case 'T':
+      case 't':
+        RETURN_IF_ERROR(AddRes('\t'));
+        break;
+
+      case '\\':
+        RETURN_IF_ERROR(AddRes('\\'));
+        break;
+
+      case '"':
+        // RC accepts \" only if another " comes afterwards; then, \"" means
+        // a single ".
+        if (Pos == Chars.size() || Chars[Pos] != '"')
+          return createError("Expected \\\"\"");
+        ++Pos;
+        RETURN_IF_ERROR(AddRes('"'));
+        break;
+
+      default:
+        // If TypeChar means nothing, \ should be output to stdout with the
+        // following char. However, rc.exe consumes these characters when
+        // dealing with wide strings.
+        if (!IsLongString) {
+          RETURN_IF_ERROR(AddRes('\\'));
+          RETURN_IF_ERROR(AddRes(TypeChar));
+        }
+        break;
+      }
+
+      continue;
+    }
+
+    // If nothing interesting happens, just output the character.
+    RETURN_IF_ERROR(AddRes(CurChar));
+  }
 
   switch (NullHandler) {
   case NullHandlingMethod::CutAtNull:
diff --git a/tools/llvm-rc/ResourceScriptToken.cpp b/tools/llvm-rc/ResourceScriptToken.cpp
index 36027d14ba0..061070b479e 100644
--- a/tools/llvm-rc/ResourceScriptToken.cpp
+++ b/tools/llvm-rc/ResourceScriptToken.cpp
@@ -219,7 +219,10 @@ Error Tokenizer::consumeToken(const Kind TokenKind) {
       } else if (Data[Pos] == '"') {
         // Consume the ending double-quote.
         advance();
-        return Error::success();
+        // However, if another '"' follows this double-quote, the string didn't
+        // end and we just included '"' into the string.
+        if (!willNowRead("\""))
+          return Error::success();
       } else if (Data[Pos] == '\n') {
         return getStringError("String literal not terminated in the line.");
       }
author	Zachary Turner <zturner@google.com>	2017-10-06 22:05:15 +0000
committer	Zachary Turner <zturner@google.com>	2017-10-06 22:05:15 +0000
commit	b231046a7d1522b4e26d167f111aa5ca06e3d585 (patch)
tree	9265b29d0224c58ecf8788739ddc94ae72cc9bb2 /tools/llvm-rc
parent	080f10e7ac1ed9aa77d848d86397614ca1c32045 (diff)