mirror of
https://github.com/zeux/pugixml.git
synced 2025-01-14 09:57:57 +08:00
Removed comments (cppguru does not want them), refactored chartype_symbol parsing, documentation fixes
git-svn-id: http://pugixml.googlecode.com/svn/trunk@28 99668b35-9821-0410-8761-19e4c4f06640
This commit is contained in:
parent
9433bd5d62
commit
58be69c574
@ -42,7 +42,7 @@
|
||||
<h2>Introduction</h2>
|
||||
<p><i>pugixml</i> is just another XML parser. This is a successor to
|
||||
<a href="http://www.codeproject.com/soap/pugxml.asp">pugxml</a> (well, to be honest, the only part
|
||||
that is left as is is wildcard matching code, the rest was either heavily refactored or rewritten
|
||||
that is left as is is wildcard matching code; the rest was either heavily refactored or rewritten
|
||||
from scratch). The main features (call it USP) are:</p>
|
||||
|
||||
<ul>
|
||||
@ -59,7 +59,7 @@ mode, with the exception of DTD related issues and XML namespaces)</li>
|
||||
like <i>expat</i> will; it will try to recover the state even if meeting an error (like finding matching
|
||||
tags for closing ones); it will parse files with data in wrong encoding; and so on)</li>
|
||||
<li>clean interface (a heavily refactored pugxml's one)</li>
|
||||
<li>more or less unicode-aware (actually, it assumes UTF-8 encoding of the input data, though
|
||||
<li>more or less Unicode-aware (actually, it assumes UTF-8 encoding of the input data, though
|
||||
it will readily work with ANSI - no UTF-16 for now (see <a href="#Future_work">Future work</a>), with
|
||||
helper conversion functions (UTF-8 <-> UTF-16/32 (whatever is the default for std::wstring & wchar_t)))</li>
|
||||
<li>fully standard compliant code (approved by <a href="http://www.comeaucomputing.com/tryitout/">Comeau</a>
|
||||
@ -238,16 +238,16 @@ be just skipped:</p>
|
||||
|
||||
<ul>
|
||||
<li>If <b>parse_pi</b> is on, then processing instructions (<b><? ... ?></b>) are put into DOM
|
||||
tree (with node type <b>node_pi</b>, otherwise they are discarded. Note that for now the prolog
|
||||
tree (with node type <b>node_pi</b>), otherwise they are discarded. Note that for now the prolog
|
||||
(<?xml ... ?>) is parsed as a processing instruction.
|
||||
<br>Default value: off
|
||||
<br>In W3C mode: on</li>
|
||||
<li>If <b>parse_comments</b> is on, then comments (<b><!-- ... --></b>) are put into DOM
|
||||
tree (with node type <b>node_comment</b>, otherwise they are discarded.
|
||||
tree (with node type <b>node_comment</b>), otherwise they are discarded.
|
||||
<br>Default value: off
|
||||
<br>In W3C mode: on</li>
|
||||
<li>If <b>parse_cdata</b> is on, then the content of CDATA section (<b><![CDATA[ ... ]]></b>)
|
||||
is put into DOM tree (with node type <b>node_cdata</b>, otherwise it is discarded.
|
||||
is put into DOM tree (with node type <b>node_cdata</b>), otherwise it is discarded.
|
||||
<br>Default value: on
|
||||
<br>In W3C mode: on</li>
|
||||
<li>If <b>parse_ws_pcdata</b> is off, then the content of PCDATA section (it's the plain text
|
||||
@ -282,7 +282,7 @@ and for attribute values (replacing &lt; with <, &#x4c; with L, etc.).
|
||||
<li>If <b>parse_wnorm_attribute</b> is on, then the whitespace normalisation is done for attribute
|
||||
values (this includes replacing any space-like character by a space character, converting sequences of
|
||||
spaces into a single space and trimming of leading/trailing spaces)
|
||||
<br>Default value: on
|
||||
<br>Default value: off
|
||||
<br>In W3C mode: off</li>
|
||||
<li>If <b>parse_wconv_attribute</b> is on, then the whitespace conversion is done for attribute
|
||||
values (this is a subset of whitespace normalization, and includes only replacing space-like characters
|
||||
@ -324,7 +324,7 @@ These are:
|
||||
<p>A couple of words on flag usage. The parsing options are just a set of bits, with each bit corresponding
|
||||
to one flag. You can turn the flag on by OR-ing the options value with this flag's constant:
|
||||
<pre>
|
||||
parse_w3c | parse_wnorm_pcdata
|
||||
parse_w3c | parse_wnorm_attribute
|
||||
</pre>
|
||||
or turn the flag off by AND-ing the options value with the NEGation of this flag's constant:
|
||||
<pre>
|
||||
|
@ -3,8 +3,6 @@
|
||||
// Pug Improved XML Parser - Version 0.2
|
||||
// --------------------------------------------------------
|
||||
// Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
|
||||
// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing
|
||||
// conversion functions.
|
||||
// This work is based on the pugxml parser, which is:
|
||||
// Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
|
||||
// Released into the Public Domain. Use at your own risk.
|
||||
@ -120,29 +118,30 @@ namespace
|
||||
ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, space, tab
|
||||
ct_space = 8, // \r, \n, space, tab
|
||||
ct_parse_cdata = 16, // \0, ], >, \r
|
||||
ct_parse_comment = 32 // \0, -, >, \r
|
||||
ct_parse_comment = 32, // \0, -, >, \r
|
||||
ct_symbol = 64 // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
|
||||
|
||||
};
|
||||
|
||||
|
||||
static unsigned char chartype_table[256] =
|
||||
{
|
||||
55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31
|
||||
12, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 32, 0, 0, // 32-47
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 48, 0, // 48-63
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 64-79
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, // 80-95
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 96-111
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 112-127
|
||||
55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31
|
||||
12, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 1, 0, 48, 0, // 48-63
|
||||
0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 64-79
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 0, 16, 0, 64, // 80-95
|
||||
0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 96-111
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 0, 0, 0, 0, // 112-127
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
|
||||
};
|
||||
|
||||
bool is_chartype(char c, chartype ct)
|
||||
@ -275,9 +274,6 @@ namespace pugi
|
||||
struct xml_parser_impl
|
||||
{
|
||||
xml_allocator& alloc;
|
||||
bool chartype_symbol_table[256];
|
||||
|
||||
bool chartype_symbol(char c) const { return chartype_symbol_table[(unsigned char)c]; }
|
||||
|
||||
struct gap
|
||||
{
|
||||
@ -724,8 +720,6 @@ namespace pugi
|
||||
{
|
||||
for (unsigned int c = 0; c < 256; ++c)
|
||||
{
|
||||
chartype_symbol_table[c] = c > 127 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
|
||||
(c >= '0' && c <= '9') || c == '_' || c == ':' || c == '-' || c == '.';
|
||||
}
|
||||
}
|
||||
|
||||
@ -756,10 +750,10 @@ namespace pugi
|
||||
if(*s == '?') // '<?...'
|
||||
{
|
||||
++s;
|
||||
if(chartype_symbol(*s) && OPTSET(parse_pi))
|
||||
if(is_chartype(*s, ct_symbol) && OPTSET(parse_pi))
|
||||
{
|
||||
mark = s;
|
||||
SCANWHILE(chartype_symbol(*s)); // Read PI target
|
||||
SCANWHILE(is_chartype(*s, ct_symbol)); // Read PI target
|
||||
ENDSEG();
|
||||
|
||||
PUSHNODE(node_pi); // Append a new node on the tree.
|
||||
@ -900,12 +894,12 @@ namespace pugi
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if(chartype_symbol(*s)) // '<#...'
|
||||
else if(is_chartype(*s, ct_symbol)) // '<#...'
|
||||
{
|
||||
cursor = append_node(cursor); // Append a new node to the tree.
|
||||
|
||||
cursor->name = s;
|
||||
SCANWHILE(chartype_symbol(*s)); // Scan for a terminator.
|
||||
SCANWHILE(is_chartype(*s, ct_symbol)); // Scan for a terminator.
|
||||
ENDSEG(); // Save char in 'ch', terminate & step over.
|
||||
if (*s!=0 && ch == '/') // '</...'
|
||||
{
|
||||
@ -923,11 +917,11 @@ namespace pugi
|
||||
{
|
||||
SKIPWS(); // Eat any whitespace.
|
||||
LOC_ATTRIBUTE:
|
||||
if(chartype_symbol(*s)) // <... #...
|
||||
if(is_chartype(*s, ct_symbol)) // <... #...
|
||||
{
|
||||
xml_attribute_struct* a = append_attribute(cursor); // Make space for this attribute.
|
||||
a->name = s; // Save the offset.
|
||||
SCANWHILE(chartype_symbol(*s)); // Scan for a terminator.
|
||||
SCANWHILE(is_chartype(*s, ct_symbol)); // Scan for a terminator.
|
||||
ENDSEG(); // Save char in 'ch', terminate & step over.
|
||||
if(*s!=0 && is_chartype(ch, ct_space)) SKIPWS(); // Eat any whitespace.
|
||||
if(*s!=0 && (ch == '=' || *s == '=')) // '<... #=...'
|
||||
@ -1040,7 +1034,7 @@ namespace pugi
|
||||
|
||||
if (name)
|
||||
{
|
||||
while (*tagname && chartype_symbol(*tagname))
|
||||
while (*tagname && is_chartype(*tagname, ct_symbol))
|
||||
{
|
||||
if (*tagname++ != *name++) goto TAG_NEXTMATCH;
|
||||
}
|
||||
@ -1063,7 +1057,7 @@ namespace pugi
|
||||
char* name = cursor->name;
|
||||
if (!name) return s;
|
||||
|
||||
while (*s && chartype_symbol(*s))
|
||||
while (*s && is_chartype(*s, ct_symbol))
|
||||
{
|
||||
if (*s++ != *name++) return s;
|
||||
}
|
||||
|
@ -3,8 +3,6 @@
|
||||
// Pug Improved XML Parser - Version 0.2
|
||||
// --------------------------------------------------------
|
||||
// Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
|
||||
// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing
|
||||
// conversion functions.
|
||||
// This work is based on the pugxml parser, which is:
|
||||
// Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
|
||||
// Released into the Public Domain. Use at your own risk.
|
||||
|
Loading…
x
Reference in New Issue
Block a user