mirror of
https://github.com/zeux/pugixml.git
synced 2024-12-27 13:33:17 +08:00
Introduced parse_ws_pcdata_single flag: only parses whitespace-only PCDATA if it's the only child of the parent node (middle ground between default flags and parse_ws_pcdata)
git-svn-id: http://pugixml.googlecode.com/svn/trunk@825 99668b35-9821-0410-8761-19e4c4f06640
This commit is contained in:
parent
fbfd2ae25a
commit
1b87d3dcbf
@ -1898,7 +1898,7 @@ namespace
|
|||||||
|
|
||||||
// Parser utilities.
|
// Parser utilities.
|
||||||
#define SKIPWS() { while (IS_CHARTYPE(*s, ct_space)) ++s; }
|
#define SKIPWS() { while (IS_CHARTYPE(*s, ct_space)) ++s; }
|
||||||
#define OPTSET(OPT) ( optmsk & OPT )
|
#define OPTSET(OPT) ( optmsk & (OPT) )
|
||||||
#define PUSHNODE(TYPE) { cursor = append_node(cursor, alloc, TYPE); if (!cursor) THROW_ERROR(status_out_of_memory, s); }
|
#define PUSHNODE(TYPE) { cursor = append_node(cursor, alloc, TYPE); if (!cursor) THROW_ERROR(status_out_of_memory, s); }
|
||||||
#define POPNODE() { cursor = cursor->parent; }
|
#define POPNODE() { cursor = cursor->parent; }
|
||||||
#define SCANFOR(X) { while (*s != 0 && !(X)) ++s; }
|
#define SCANFOR(X) { while (*s != 0 && !(X)) ++s; }
|
||||||
@ -2402,10 +2402,20 @@ namespace
|
|||||||
|
|
||||||
SKIPWS(); // Eat whitespace if no genuine PCDATA here.
|
SKIPWS(); // Eat whitespace if no genuine PCDATA here.
|
||||||
|
|
||||||
if ((!OPTSET(parse_ws_pcdata) || mark == s) && (*s == '<' || !*s))
|
if (*s == '<')
|
||||||
{
|
{
|
||||||
continue;
|
// We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
|
||||||
}
|
assert(mark != s);
|
||||||
|
|
||||||
|
if (!OPTSET(parse_ws_pcdata | parse_ws_pcdata_single))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else if (OPTSET(parse_ws_pcdata_single))
|
||||||
|
{
|
||||||
|
if (s[1] != '/' || cursor->first_child) continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
s = mark;
|
s = mark;
|
||||||
|
|
||||||
|
@ -164,6 +164,11 @@ namespace pugi
|
|||||||
// This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
|
// This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
|
||||||
const unsigned int parse_doctype = 0x0200;
|
const unsigned int parse_doctype = 0x0200;
|
||||||
|
|
||||||
|
// This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only
|
||||||
|
// of whitespace is added to the DOM tree.
|
||||||
|
// This flag is off by default; turning it on may result in slower parsing and more memory consumption.
|
||||||
|
const unsigned int parse_ws_pcdata_single = 0x0400;
|
||||||
|
|
||||||
// The default parsing mode.
|
// The default parsing mode.
|
||||||
// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
|
// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
|
||||||
// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
|
// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
|
||||||
|
@ -263,6 +263,85 @@ TEST(parse_ws_pcdata_parse)
|
|||||||
CHECK_STRING(c2.first_child().value(), STR(" "));
|
CHECK_STRING(c2.first_child().value(), STR(" "));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the number of nodes in the subtree rooted at n, counting n itself.
static int get_tree_node_count(xml_node n)
{
	// start at one to account for the root of this subtree
	int total = 1;

	xml_node child = n.first_child();

	while (child)
	{
		total += get_tree_node_count(child);
		child = child.next_sibling();
	}

	return total;
}
|
||||||
|
|
||||||
|
// Table-driven check of how PCDATA (and whitespace-only PCDATA in particular) is handled
// under parse_default, parse_default | parse_ws_pcdata and parse_default | parse_ws_pcdata_single.
// Every entry is parsed with each mode selected by its mask; the load outcome, the resulting
// tree (via CHECK_NODE) and the total node count are verified.
TEST(parse_ws_pcdata_permutations)
{
	struct test_data_t
	{
		unsigned int mask; // 1 = default flags, 2 = parse_ws_pcdata, 4 = parse_ws_pcdata_single
		const pugi::char_t* source;
		const pugi::char_t* result;
		int nodes; // negative if parsing should fail
	};

	test_data_t test_data[] =
	{
		// pcdata outside of any element is dropped, whitespace or not
		{7, STR("ext1"), STR(""), 1},
		{7, STR(" "), STR(""), 1},
		{7, STR("ext1<node/>"), STR("<node />"), 2},
		{7, STR("ext1<node/>ext2"), STR("<node />"), 2},
		{7, STR(" <node/>"), STR("<node />"), 2},
		{7, STR("<node/> "), STR("<node />"), 2},
		{7, STR(" <node/> "), STR("<node />"), 2},
		// pcdata inside an element is kept
		{7, STR("<node>inner</node>"), STR("<node>inner</node>"), 3},
		{7, STR("<node>inner1<child/>inner2</node>"), STR("<node>inner1<child />inner2</node>"), 5},
		{7, STR("<node>inner1<child>deep</child>inner2</node>"), STR("<node>inner1<child>deep</child>inner2</node>"), 6},
		// no pcdata node is ever created for empty content
		{7, STR("<node>inner1<child></child>inner2</node>"), STR("<node>inner1<child />inner2</node>"), 5},
		{7, STR("<node><child></child>inner2</node>"), STR("<node><child />inner2</node>"), 4},
		{7, STR("<node>inner1<child></child></node>"), STR("<node>inner1<child /></node>"), 4},
		{7, STR("<node><child></child></node>"), STR("<node><child /></node>"), 3},
		// comments, pi or other nodes do not trigger pcdata creation either
		{7, STR("<node><!----><child><?pi?></child><![CDATA[x]]></node>"), STR("<node><child /><![CDATA[x]]></node>"), 4},
		// whitespace at the edges of non-empty pcdata is kept (note: this will change if parse_ws_pcdata_trim is introduced)
		{7, STR("<node>\t \tinner1<child> deep </child>\t\ninner2\n\t</node>"), STR("<node>\t \tinner1<child> deep </child>\t\ninner2\n\t</node>"), 6},
		// whether whitespace-only pcdata survives depends on the parsing mode
		{1, STR("<node>\n\t<child> </child>\n\t<child> <deep> </deep> </child>\n\t<!---->\n\t</node>"), STR("<node><child /><child><deep /></child></node>"), 5},
		{2, STR("<node>\n\t<child> </child>\n\t<child> <deep> </deep> </child>\n\t<!---->\n\t</node>"), STR("<node>\n\t<child> </child>\n\t<child> <deep> </deep> </child>\n\t\n\t</node>"), 13},
		{4, STR("<node>\n\t<child> </child>\n\t<child> <deep> </deep> </child>\n\t<!---->\n\t</node>"), STR("<node><child> </child><child><deep> </deep></child></node>"), 7},
		// known bug in the current parse_ws_pcdata_single implementation, pinned here:
		// the whitespace following the (discarded) comment is still kept as pcdata
		{4, STR("<node>\t\t<!---->\n\n</node>"), STR("<node>\n\n</node>"), 3},
		// error case: input ends in the middle of PCDATA
		{7, STR("<node>abcdef"), STR("<node>abcde</node>"), -3},
		{7, STR("<node> "), STR("<node> </node>"), -3},
		// error case: input ends as early as possible
		{7, STR("<node>"), STR("<node />"), -2},
		{7, STR("<node>a"), STR("<node />"), -2},
		{7, STR("<node> "), STR("<node />"), -2},
	};

	// parse flags for each mask bit: bit 0 -> default, bit 1 -> parse_ws_pcdata, bit 2 -> parse_ws_pcdata_single
	unsigned int flags[] = {parse_default, parse_default | parse_ws_pcdata, parse_default | parse_ws_pcdata_single};

	const size_t test_count = sizeof(test_data) / sizeof(test_data[0]);

	for (size_t i = 0; i < test_count; ++i)
	{
		const test_data_t& td = test_data[i];

		for (int flag = 0; flag < 3; ++flag)
		{
			// skip the modes this entry does not apply to
			if (!(td.mask & (1 << flag))) continue;

			xml_document doc;

			// the sign of td.nodes encodes whether the load is expected to succeed
			CHECK((td.nodes > 0) == doc.load(td.source, flags[flag]));
			CHECK_NODE(doc, td.result);

			// the count includes the document node itself
			int nodes = get_tree_node_count(doc);
			CHECK((td.nodes < 0 ? -td.nodes : td.nodes) == nodes);
		}
	}
}
|
||||||
|
|
||||||
TEST(parse_pcdata_no_eol)
|
TEST(parse_pcdata_no_eol)
|
||||||
{
|
{
|
||||||
xml_document doc;
|
xml_document doc;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user