mirror of
https://github.com/open-source-parsers/jsoncpp.git
synced 2024-12-26 10:41:03 +08:00
Escape control chars even if emitting UTF8 (#1178)
* Escape control chars even if emitting UTF8
  See #1176
  Fixes #1175
* Review comments
* Fix test by stopping early enough to punt on UTF-8 input.
This commit is contained in:
parent
75b360af4a
commit
c161f4ac69
@ -262,6 +262,14 @@ static String toHex16Bit(unsigned int x) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Appends `ch` to `result` as a single char, narrowing it with
// static_cast<char> first. No escaping is applied here.
static void appendRaw(String& result, unsigned ch) {
  const char narrowed = static_cast<char>(ch);
  result.push_back(narrowed);
}
|
||||
|
||||
// Appends a backslash-u prefix to `result`, followed by whatever
// toHex16Bit(ch) produces for `ch`.
static void appendHex(String& result, unsigned ch) {
  result += "\\u";
  result += toHex16Bit(ch);
}
|
||||
|
||||
static String valueToQuotedStringN(const char* value, unsigned length,
|
||||
bool emitUTF8 = false) {
|
||||
if (value == nullptr)
|
||||
@ -310,29 +318,26 @@ static String valueToQuotedStringN(const char* value, unsigned length,
|
||||
// sequence from occurring.
|
||||
default: {
|
||||
if (emitUTF8) {
|
||||
result += *c;
|
||||
unsigned codepoint = static_cast<unsigned char>(*c);
|
||||
if (codepoint < 0x20) {
|
||||
appendHex(result, codepoint);
|
||||
} else {
|
||||
appendRaw(result, codepoint);
|
||||
}
|
||||
} else {
|
||||
unsigned int codepoint = utf8ToCodepoint(c, end);
|
||||
const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
|
||||
const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
|
||||
const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
|
||||
// don't escape non-control characters
|
||||
// (short escape sequence are applied above)
|
||||
if (FIRST_NON_CONTROL_CODEPOINT <= codepoint &&
|
||||
codepoint <= LAST_NON_CONTROL_CODEPOINT) {
|
||||
result += static_cast<char>(codepoint);
|
||||
} else if (codepoint <
|
||||
FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic
|
||||
// Multilingual Plane
|
||||
result += "\\u";
|
||||
result += toHex16Bit(codepoint);
|
||||
} else { // codepoint is not in Basic Multilingual Plane
|
||||
// convert to surrogate pair first
|
||||
codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
|
||||
result += "\\u";
|
||||
result += toHex16Bit((codepoint >> 10) + 0xD800);
|
||||
result += "\\u";
|
||||
result += toHex16Bit((codepoint & 0x3FF) + 0xDC00);
|
||||
unsigned codepoint = utf8ToCodepoint(c, end); // modifies `c`
|
||||
if (codepoint < 0x20) {
|
||||
appendHex(result, codepoint);
|
||||
} else if (codepoint < 0x80) {
|
||||
appendRaw(result, codepoint);
|
||||
} else if (codepoint < 0x10000) {
|
||||
// Basic Multilingual Plane
|
||||
appendHex(result, codepoint);
|
||||
} else {
|
||||
// Extended Unicode. Encode 20 bits as a surrogate pair.
|
||||
codepoint -= 0x10000;
|
||||
appendHex(result, 0xd800 + ((codepoint >> 10) & 0x3ff));
|
||||
appendHex(result, 0xdc00 + (codepoint & 0x3ff));
|
||||
}
|
||||
}
|
||||
} break;
|
||||
|
@ -2640,6 +2640,68 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
|
||||
"\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
|
||||
}
|
||||
|
||||
// Control chars should be escaped regardless of UTF-8 input encoding.
|
||||
JSONTEST_FIXTURE_LOCAL(StreamWriterTest, escapeControlCharacters) {
|
||||
auto uEscape = [](unsigned ch) {
|
||||
static const char h[] = "0123456789abcdef";
|
||||
std::string r = "\\u";
|
||||
r += h[(ch >> (3 * 4)) & 0xf];
|
||||
r += h[(ch >> (2 * 4)) & 0xf];
|
||||
r += h[(ch >> (1 * 4)) & 0xf];
|
||||
r += h[(ch >> (0 * 4)) & 0xf];
|
||||
return r;
|
||||
};
|
||||
auto shortEscape = [](unsigned ch) -> const char* {
|
||||
switch (ch) {
|
||||
case '\"':
|
||||
return "\\\"";
|
||||
case '\\':
|
||||
return "\\\\";
|
||||
case '\b':
|
||||
return "\\b";
|
||||
case '\f':
|
||||
return "\\f";
|
||||
case '\n':
|
||||
return "\\n";
|
||||
case '\r':
|
||||
return "\\r";
|
||||
case '\t':
|
||||
return "\\t";
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
Json::StreamWriterBuilder b;
|
||||
|
||||
for (bool emitUTF8 : {true, false}) {
|
||||
b.settings_["emitUTF8"] = emitUTF8;
|
||||
|
||||
for (unsigned i = 0; i != 0x100; ++i) {
|
||||
if (!emitUTF8 && i >= 0x80)
|
||||
break; // The algorithm would try to parse UTF-8, so stop here.
|
||||
|
||||
std::string raw({static_cast<char>(i)});
|
||||
std::string esc = raw;
|
||||
if (i < 0x20)
|
||||
esc = uEscape(i);
|
||||
if (const char* shEsc = shortEscape(i))
|
||||
esc = shEsc;
|
||||
|
||||
// std::cout << "emit=" << emitUTF8 << ", i=" << std::hex << i << std::dec
|
||||
// << std::endl;
|
||||
|
||||
Json::Value root;
|
||||
root["test"] = raw;
|
||||
JSONTEST_ASSERT_STRING_EQUAL(
|
||||
std::string("{\n\t\"test\" : \"").append(esc).append("\"\n}"),
|
||||
Json::writeString(b, root))
|
||||
<< ", emit=" << emitUTF8 << ", i=" << i << ", raw=\"" << raw << "\""
|
||||
<< ", esc=\"" << esc << "\"";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ReaderTest : JsonTest::TestCase {
|
||||
void setStrictMode() {
|
||||
reader = std::unique_ptr<Json::Reader>(
|
||||
|
Loading…
x
Reference in New Issue
Block a user