Removed comments (cppguru does not want them), refactored chartype_symbol parsing, documentation fixes

git-svn-id: http://pugixml.googlecode.com/svn/trunk@28 99668b35-9821-0410-8761-19e4c4f06640
2025-01-14 09:57:57 +08:00 · 2007-01-08 16:24:53 +00:00 · 2007-01-08 16:24:53 +00:00 · 58be69c574
commit 58be69c574
parent 9433bd5d62
3 changed files with 34 additions and 42 deletions
--- a/docs/index.html
+++ b/docs/index.html
@ -42,7 +42,7 @@
 <h2>Introduction</h2>
 <p><i>pugixml</i> is just another XML parser. This is a successor to
 <a href="http://www.codeproject.com/soap/pugxml.asp">pugxml</a> (well, to be honest, the only part
-that is left as is is wildcard matching code, the rest was either heavily refactored or rewritten
+that is left as-is is the wildcard matching code; the rest was either heavily refactored or rewritten
 from scratch). The main features (call it USP) are:</p>

 <ul>
@ -59,7 +59,7 @@ mode, with the exception of DTD related issues and XML namespaces)</li>
 like <i>expat</i> will; it will try to recover the state even if meeting an error (like finding matching
 tags for closing ones); it will parse files with data in wrong encoding; and so on)</li>
 <li>clean interface (a heavily refactored pugxml's one)</li>
-<li>more or less unicode-aware (actually, it assumes UTF-8 encoding of the input data, though
+<li>more or less Unicode-aware (actually, it assumes UTF-8 encoding of the input data, though
 it will readily work with ANSI - no UTF-16 for now (see <a href="#Future_work">Future work</a>), with
 helper conversion functions (UTF-8 &lt;-&gt; UTF-16/32 (whatever is the default for std::wstring &amp; wchar_t)))</li>
 <li>fully standard compliant code (approved by <a href="http://www.comeaucomputing.com/tryitout/">Comeau</a>
@ -238,16 +238,16 @@ be just skipped:</p>

 <ul>
 <li>If <b>parse_pi</b> is on, then processing instructions (<b>&lt;? ... ?&gt;</b>) are put into DOM
-tree (with node type <b>node_pi</b>, otherwise they are discarded. Note that for now the prolog
+tree (with node type <b>node_pi</b>), otherwise they are discarded. Note that for now the prolog
 (&lt;?xml ... ?&gt;) is parsed as a processing instruction.
 <br>Default value: off
 <br>In W3C mode: on</li>
 <li>If <b>parse_comments</b> is on, then comments (<b>&lt;!-- ... --&gt;</b>) are put into DOM
-tree (with node type <b>node_comment</b>, otherwise they are discarded.
+tree (with node type <b>node_comment</b>), otherwise they are discarded.
 <br>Default value: off
 <br>In W3C mode: on</li>
 <li>If <b>parse_cdata</b> is on, then the content of CDATA section (<b>&lt;![CDATA[ ... ]]&gt;</b>)
-is put into DOM tree (with node type <b>node_cdata</b>, otherwise it is discarded.
+is put into DOM tree (with node type <b>node_cdata</b>), otherwise it is discarded.
 <br>Default value: on
 <br>In W3C mode: on</li>
 <li>If <b>parse_ws_pcdata</b> is off, then the content of PCDATA section (it's the plain text
@ -282,7 +282,7 @@ and for attribute values (replacing &lt;lt; with &lt;, &amp;#4c; with L, etc.).
 <li>If <b>parse_wnorm_attribute</b> is on, then the whitespace normalisation is done for attribute
 values (this includes replacing any space-like character by a space character, converting sequences of
 spaces into a single space and trimming of leading/trailing spaces)
-<br>Default value: on
+<br>Default value: off
 <br>In W3C mode: off</li>
 <li>If <b>parse_wconv_attribute</b> is on, then the whitespace conversion is done for attribute
 values (this is a subset of whitespace normalization, and includes only replacing space-like characters
@ -324,7 +324,7 @@ These are:
 <p>A couple of words on flag usage. The parsing options are just a set of bits, with each bit corresponding
 to one flag. You can turn the flag on by OR-ing the options value with this flag's constant:
 <pre>
-	parse_w3c | parse_wnorm_pcdata
+	parse_w3c | parse_wnorm_attribute
 </pre>
 or turn the flag off by AND-ing the options value with the NEGation of this flag's constant:
 <pre>
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@ -3,8 +3,6 @@
 // Pug Improved XML Parser - Version 0.2
 // --------------------------------------------------------
 // Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
-// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing
-// conversion functions.
 // This work is based on the pugxml parser, which is:
 // Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
 // Released into the Public Domain. Use at your own risk.
@ -120,29 +118,30 @@ namespace
 		ct_parse_attr_ws = 4,	// \0, &, \r, ', ", \n, space, tab
 		ct_space = 8,			// \r, \n, space, tab
 		ct_parse_cdata = 16,	// \0, ], >, \r
-		ct_parse_comment = 32	// \0, -, >, \r
+		ct_parse_comment = 32,	// \0, -, >, \r
+		ct_symbol = 64			// Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
 		
 	};
-	
+
 	static unsigned char chartype_table[256] =
 	{
-		55, 0, 0, 0, 0, 0, 0, 0,		0, 12, 12, 0, 0, 63, 0, 0,	// 0-15
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,		// 16-31
-		12, 0, 6, 0, 0, 0, 7, 6,		0, 0, 0, 0, 0, 32, 0, 0,	// 32-47
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 1, 0, 48, 0,	// 48-63
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,		// 64-79
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 16, 0, 0,	// 80-95
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,		// 96-111
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,		// 112-127
+		55, 0, 0, 0, 0, 0, 0, 0,				0, 12, 12, 0, 0, 63, 0, 0,			// 0-15
+		0, 0, 0, 0, 0, 0, 0, 0,					0, 0, 0, 0, 0, 0, 0, 0,				// 16-31
+		12, 0, 6, 0, 0, 0, 7, 6,				0, 0, 0, 0, 0, 96, 64, 0,			// 32-47
+		64, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 0, 1, 0, 48, 0,			// 48-63
+		0, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 64, 64, 64, 64, 64,		// 64-79
+		64, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 0, 0, 16, 0, 64,		// 80-95
+		0, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 64, 64, 64, 64, 64,		// 96-111
+		64, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 0, 0, 0, 0, 0,			// 112-127

-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0
+		64, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 64, 64, 64, 64, 64,
+		64, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 64, 64, 64, 64, 64,
+		64, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 64, 64, 64, 64, 64,
+		64, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 64, 64, 64, 64, 64,
+		64, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 64, 64, 64, 64, 64,
+		64, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 64, 64, 64, 64, 64,
+		64, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 64, 64, 64, 64, 64,
+		64, 64, 64, 64, 64, 64, 64, 64,			64, 64, 64, 64, 64, 64, 64, 64
 	};
 	
 	bool is_chartype(char c, chartype ct)
@ -275,9 +274,6 @@ namespace pugi
 	struct xml_parser_impl
 	{
 		xml_allocator& alloc;
-		bool chartype_symbol_table[256];
-		
-		bool chartype_symbol(char c) const { return chartype_symbol_table[(unsigned char)c]; }
 		
 		struct gap
 		{
@ -724,8 +720,6 @@ namespace pugi
 		{
 			for (unsigned int c = 0; c < 256; ++c)
 			{
-				chartype_symbol_table[c] = c > 127 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
-										(c >= '0' && c <= '9') || c == '_' || c == ':' || c == '-' || c == '.';
 			}
 		}
 		
@ -756,10 +750,10 @@ namespace pugi
 					if(*s == '?') // '<?...'
 					{
 						++s;
-						if(chartype_symbol(*s) && OPTSET(parse_pi))
+						if(is_chartype(*s, ct_symbol) && OPTSET(parse_pi))
 						{
 							mark = s;
-							SCANWHILE(chartype_symbol(*s)); // Read PI target
+							SCANWHILE(is_chartype(*s, ct_symbol)); // Read PI target
 							ENDSEG();
 							
 							PUSHNODE(node_pi); // Append a new node on the tree.
@ -900,12 +894,12 @@ namespace pugi
 							continue;
 						}
 					}
-					else if(chartype_symbol(*s)) // '<#...'
+					else if(is_chartype(*s, ct_symbol)) // '<#...'
 					{
 						cursor = append_node(cursor); // Append a new node to the tree.

 						cursor->name = s;
-						SCANWHILE(chartype_symbol(*s)); // Scan for a terminator.
+						SCANWHILE(is_chartype(*s, ct_symbol)); // Scan for a terminator.
 						ENDSEG(); // Save char in 'ch', terminate & step over.
 						if (*s!=0 && ch == '/') // '</...'
 						{
@ -923,11 +917,11 @@ namespace pugi
 						{
 							SKIPWS(); // Eat any whitespace.
 						LOC_ATTRIBUTE:
-							if(chartype_symbol(*s)) // <... #...
+							if(is_chartype(*s, ct_symbol)) // <... #...
 							{
 								xml_attribute_struct* a = append_attribute(cursor); // Make space for this attribute.
 								a->name = s; // Save the offset.
-								SCANWHILE(chartype_symbol(*s)); // Scan for a terminator.
+								SCANWHILE(is_chartype(*s, ct_symbol)); // Scan for a terminator.
 								ENDSEG(); // Save char in 'ch', terminate & step over.
 								if(*s!=0 && is_chartype(ch, ct_space)) SKIPWS(); // Eat any whitespace.
 								if(*s!=0 && (ch == '=' || *s == '=')) // '<... #=...'
@ -1040,7 +1034,7 @@ namespace pugi
 								
 								if (name)
 								{
-									while (*tagname && chartype_symbol(*tagname))
+									while (*tagname && is_chartype(*tagname, ct_symbol))
 									{
 										if (*tagname++ != *name++) goto TAG_NEXTMATCH;
 									}
@ -1063,7 +1057,7 @@ namespace pugi
 							char* name = cursor->name;
 							if (!name) return s;
 						
-							while (*s && chartype_symbol(*s))
+							while (*s && is_chartype(*s, ct_symbol))
 							{
 								if (*s++ != *name++) return s;
 							}
--- a/src/pugixml.hpp
+++ b/src/pugixml.hpp
@ -3,8 +3,6 @@
 // Pug Improved XML Parser - Version 0.2
 // --------------------------------------------------------
 // Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
-// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing
-// conversion functions.
 // This work is based on the pugxml parser, which is:
 // Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
 // Released into the Public Domain. Use at your own risk.