From c161f4ac69633deb2ed43bc8569cb9b183f63c32 Mon Sep 17 00:00:00 2001
From: Billy Donahue
Date: Thu, 21 May 2020 11:30:59 -0400
Subject: [PATCH] Escape control chars even if emitting UTF8 (#1178)

* Escape control chars even if emitting UTF8

See #1176
Fixes #1175

* review comments

* fix test by stopping early enough to punt on utf8-input.
---
 src/lib_json/json_writer.cpp | 49 +++++++++++++++-------------
 src/test_lib_json/main.cpp   | 62 ++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 22 deletions(-)

diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp
index 56ee65e..03a777f 100644
--- a/src/lib_json/json_writer.cpp
+++ b/src/lib_json/json_writer.cpp
@@ -262,6 +262,14 @@ static String toHex16Bit(unsigned int x) {
   return result;
 }
 
+static void appendRaw(String& result, unsigned ch) {
+  result += static_cast<char>(ch);
+}
+
+static void appendHex(String& result, unsigned ch) {
+  result.append("\\u").append(toHex16Bit(ch));
+}
+
 static String valueToQuotedStringN(const char* value, unsigned length,
                                    bool emitUTF8 = false) {
   if (value == nullptr)
@@ -310,29 +318,26 @@ static String valueToQuotedStringN(const char* value, unsigned length,
     // sequence from occurring.
     default: {
       if (emitUTF8) {
-        result += *c;
+        unsigned codepoint = static_cast<unsigned char>(*c);
+        if (codepoint < 0x20) {
+          appendHex(result, codepoint);
+        } else {
+          appendRaw(result, codepoint);
+        }
       } else {
-        unsigned int codepoint = utf8ToCodepoint(c, end);
-        const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
-        const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
-        const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
-        // don't escape non-control characters
-        // (short escape sequence are applied above)
-        if (FIRST_NON_CONTROL_CODEPOINT <= codepoint &&
-            codepoint <= LAST_NON_CONTROL_CODEPOINT) {
-          result += static_cast<char>(codepoint);
-        } else if (codepoint <
-                   FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic
-                                                     // Multilingual Plane
-          result += "\\u";
-          result += toHex16Bit(codepoint);
-        } else { // codepoint is not in Basic Multilingual Plane
-                 // convert to surrogate pair first
-          codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
-          result += "\\u";
-          result += toHex16Bit((codepoint >> 10) + 0xD800);
-          result += "\\u";
-          result += toHex16Bit((codepoint & 0x3FF) + 0xDC00);
+        unsigned codepoint = utf8ToCodepoint(c, end); // modifies `c`
+        if (codepoint < 0x20) {
+          appendHex(result, codepoint);
+        } else if (codepoint < 0x80) {
+          appendRaw(result, codepoint);
+        } else if (codepoint < 0x10000) {
+          // Basic Multilingual Plane
+          appendHex(result, codepoint);
+        } else {
+          // Extended Unicode. Encode 20 bits as a surrogate pair.
+          codepoint -= 0x10000;
+          appendHex(result, 0xd800 + ((codepoint >> 10) & 0x3ff));
+          appendHex(result, 0xdc00 + (codepoint & 0x3ff));
         }
       }
     } break;
diff --git a/src/test_lib_json/main.cpp b/src/test_lib_json/main.cpp
index 73850cf..639b5a2 100644
--- a/src/test_lib_json/main.cpp
+++ b/src/test_lib_json/main.cpp
@@ -2640,6 +2640,68 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
                   "\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
 }
 
+// Control chars should be escaped regardless of UTF-8 input encoding.
+JSONTEST_FIXTURE_LOCAL(StreamWriterTest, escapeControlCharacters) {
+  auto uEscape = [](unsigned ch) {
+    static const char h[] = "0123456789abcdef";
+    std::string r = "\\u";
+    r += h[(ch >> (3 * 4)) & 0xf];
+    r += h[(ch >> (2 * 4)) & 0xf];
+    r += h[(ch >> (1 * 4)) & 0xf];
+    r += h[(ch >> (0 * 4)) & 0xf];
+    return r;
+  };
+  auto shortEscape = [](unsigned ch) -> const char* {
+    switch (ch) {
+    case '\"':
+      return "\\\"";
+    case '\\':
+      return "\\\\";
+    case '\b':
+      return "\\b";
+    case '\f':
+      return "\\f";
+    case '\n':
+      return "\\n";
+    case '\r':
+      return "\\r";
+    case '\t':
+      return "\\t";
+    default:
+      return nullptr;
+    }
+  };
+
+  Json::StreamWriterBuilder b;
+
+  for (bool emitUTF8 : {true, false}) {
+    b.settings_["emitUTF8"] = emitUTF8;
+
+    for (unsigned i = 0; i != 0x100; ++i) {
+      if (!emitUTF8 && i >= 0x80)
+        break; // The algorithm would try to parse UTF-8, so stop here.
+
+      std::string raw({static_cast<char>(i)});
+      std::string esc = raw;
+      if (i < 0x20)
+        esc = uEscape(i);
+      if (const char* shEsc = shortEscape(i))
+        esc = shEsc;
+
+      // std::cout << "emit=" << emitUTF8 << ", i=" << std::hex << i << std::dec
+      //           << std::endl;
+
+      Json::Value root;
+      root["test"] = raw;
+      JSONTEST_ASSERT_STRING_EQUAL(
+          std::string("{\n\t\"test\" : \"").append(esc).append("\"\n}"),
+          Json::writeString(b, root))
+          << ", emit=" << emitUTF8 << ", i=" << i << ", raw=\"" << raw << "\""
+          << ", esc=\"" << esc << "\"";
+    }
+  }
+}
+
 struct ReaderTest : JsonTest::TestCase {
   void setStrictMode() {
     reader = std::unique_ptr<Json::Reader>(