Escape control chars even if emitting UTF8 (#1178)

* Escape control chars even if emitting UTF8 See #1176 Fixes #1175 * review comments * fix test by stopping early enough to punt on utf8-input.
2024-12-26 10:41:03 +08:00 · 2020-05-21 11:30:59 -04:00 · 2020-05-21 11:30:59 -04:00 · c161f4ac69
commit c161f4ac69
parent 75b360af4a
2 changed files with 89 additions and 22 deletions
--- a/src/lib_json/json_writer.cpp
+++ b/src/lib_json/json_writer.cpp
@ -262,6 +262,14 @@ static String toHex16Bit(unsigned int x) {
  return result;
 }

+static void appendRaw(String& result, unsigned ch) {
+  result += static_cast<char>(ch);
+}
+
+static void appendHex(String& result, unsigned ch) {
+  result.append("\\u").append(toHex16Bit(ch));
+}
+
 static String valueToQuotedStringN(const char* value, unsigned length,
                                   bool emitUTF8 = false) {
  if (value == nullptr)
@ -310,29 +318,26 @@ static String valueToQuotedStringN(const char* value, unsigned length,
    // sequence from occurring.
    default: {
      if (emitUTF8) {
-        result += *c;
+        unsigned codepoint = static_cast<unsigned char>(*c);
+        if (codepoint < 0x20) {
+          appendHex(result, codepoint);
+        } else {
+          appendRaw(result, codepoint);
+        }
      } else {
-        unsigned int codepoint = utf8ToCodepoint(c, end);
-        const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
-        const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
-        const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
-        // don't escape non-control characters
-        // (short escape sequence are applied above)
-        if (FIRST_NON_CONTROL_CODEPOINT <= codepoint &&
-            codepoint <= LAST_NON_CONTROL_CODEPOINT) {
-          result += static_cast<char>(codepoint);
-        } else if (codepoint <
-                   FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic
-                                                     // Multilingual Plane
-          result += "\\u";
-          result += toHex16Bit(codepoint);
-        } else { // codepoint is not in Basic Multilingual Plane
-                 // convert to surrogate pair first
-          codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
-          result += "\\u";
-          result += toHex16Bit((codepoint >> 10) + 0xD800);
-          result += "\\u";
-          result += toHex16Bit((codepoint & 0x3FF) + 0xDC00);
+        unsigned codepoint = utf8ToCodepoint(c, end); // modifies `c`
+        if (codepoint < 0x20) {
+          appendHex(result, codepoint);
+        } else if (codepoint < 0x80) {
+          appendRaw(result, codepoint);
+        } else if (codepoint < 0x10000) {
+          // Basic Multilingual Plane
+          appendHex(result, codepoint);
+        } else {
+          // Extended Unicode. Encode 20 bits as a surrogate pair.
+          codepoint -= 0x10000;
+          appendHex(result, 0xd800 + ((codepoint >> 10) & 0x3ff));
+          appendHex(result, 0xdc00 + (codepoint & 0x3ff));
        }
      }
    } break;
--- a/src/test_lib_json/main.cpp
+++ b/src/test_lib_json/main.cpp
@ -2640,6 +2640,68 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
                  "\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
 }

+// Control chars should be escaped regardless of UTF-8 input encoding.
+JSONTEST_FIXTURE_LOCAL(StreamWriterTest, escapeControlCharacters) {
+  auto uEscape = [](unsigned ch) {
+    static const char h[] = "0123456789abcdef";
+    std::string r = "\\u";
+    r += h[(ch >> (3 * 4)) & 0xf];
+    r += h[(ch >> (2 * 4)) & 0xf];
+    r += h[(ch >> (1 * 4)) & 0xf];
+    r += h[(ch >> (0 * 4)) & 0xf];
+    return r;
+  };
+  auto shortEscape = [](unsigned ch) -> const char* {
+    switch (ch) {
+    case '\"':
+      return "\\\"";
+    case '\\':
+      return "\\\\";
+    case '\b':
+      return "\\b";
+    case '\f':
+      return "\\f";
+    case '\n':
+      return "\\n";
+    case '\r':
+      return "\\r";
+    case '\t':
+      return "\\t";
+    default:
+      return nullptr;
+    }
+  };
+
+  Json::StreamWriterBuilder b;
+
+  for (bool emitUTF8 : {true, false}) {
+    b.settings_["emitUTF8"] = emitUTF8;
+
+    for (unsigned i = 0; i != 0x100; ++i) {
+      if (!emitUTF8 && i >= 0x80)
+        break; // The algorithm would try to parse UTF-8, so stop here.
+
+      std::string raw({static_cast<char>(i)});
+      std::string esc = raw;
+      if (i < 0x20)
+        esc = uEscape(i);
+      if (const char* shEsc = shortEscape(i))
+        esc = shEsc;
+
+      // std::cout << "emit=" << emitUTF8 << ", i=" << std::hex << i << std::dec
+      //          << std::endl;
+
+      Json::Value root;
+      root["test"] = raw;
+      JSONTEST_ASSERT_STRING_EQUAL(
+          std::string("{\n\t\"test\" : \"").append(esc).append("\"\n}"),
+          Json::writeString(b, root))
+          << ", emit=" << emitUTF8 << ", i=" << i << ", raw=\"" << raw << "\""
+          << ", esc=\"" << esc << "\"";
+    }
+  }
+}
+
 struct ReaderTest : JsonTest::TestCase {
  void setStrictMode() {
    reader = std::unique_ptr<Json::Reader>(