mirror of
https://github.com/zeux/pugixml.git
synced 2024-12-26 21:04:25 +08:00
XPath: Make remove_duplicates generate stable order
Given an unsorted sequence, remove_duplicates would sort it using the pointer value of attributes/nodes and then remove consecutive duplicates. This was problematic because it meant that the result of XPath queries was dependent on the memory allocation pattern. While it's technically incorrect to rely on the order, this results in easy to miss bugs. This is particularly common when XPath queries use union operators - although we also will call remove_duplicates in other cases. This change reworks the code to use a hash set instead, using the same hash function we use for compact storage. To make sure it performs well, we allocate enough buckets for count * 1.5 (assuming all elements are unique); since each bucket is a single pointer unlike xpath_node which is two pointers, we need somewhere between size * 0.75 and size * 1.5 temporary storage. The resulting filtering is stable - we remove elements that we have seen before but we don't change the order - and is actually significantly faster than sorting was. With a large union operation, before this change it took ~56 ms per 100 query invocations to remove duplicates, and after this change it takes ~20ms. Fixes #254.
This commit is contained in:
parent
930a701f1f
commit
c55ea3bc1e
@ -7465,6 +7465,41 @@ PUGI__NS_BEGIN
|
||||
// insertion sort small chunk
|
||||
insertion_sort(begin, end, pred);
|
||||
}
|
||||
|
||||
PUGI__FN bool hash_insert(const void** table, size_t size, const void* key)
|
||||
{
|
||||
assert(key);
|
||||
|
||||
unsigned int h = static_cast<unsigned int>(reinterpret_cast<uintptr_t>(key));
|
||||
|
||||
// MurmurHash3 32-bit finalizer
|
||||
h ^= h >> 16;
|
||||
h *= 0x85ebca6bu;
|
||||
h ^= h >> 13;
|
||||
h *= 0xc2b2ae35u;
|
||||
h ^= h >> 16;
|
||||
|
||||
size_t hashmod = size - 1;
|
||||
size_t bucket = h & hashmod;
|
||||
|
||||
for (size_t probe = 0; probe <= hashmod; ++probe)
|
||||
{
|
||||
if (table[bucket] == 0)
|
||||
{
|
||||
table[bucket] = key;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (table[bucket] == key)
|
||||
return false;
|
||||
|
||||
// hash collision, quadratic probing
|
||||
bucket = (bucket + probe + 1) & hashmod;
|
||||
}
|
||||
|
||||
assert(false && "Hash table is full"); // unreachable
|
||||
return false;
|
||||
}
|
||||
PUGI__NS_END
|
||||
|
||||
// Allocator used for AST and evaluation stacks
|
||||
@ -8054,15 +8089,6 @@ PUGI__NS_BEGIN
|
||||
}
|
||||
};
|
||||
|
||||
struct duplicate_comparator
|
||||
{
|
||||
bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
|
||||
{
|
||||
if (lhs.attribute()) return rhs.attribute() ? lhs.attribute() < rhs.attribute() : true;
|
||||
else return rhs.attribute() ? false : lhs.node() < rhs.node();
|
||||
}
|
||||
};
|
||||
|
||||
PUGI__FN double gen_nan()
|
||||
{
|
||||
#if defined(__STDC_IEC_559__) || ((FLT_RADIX - 0 == 2) && (FLT_MAX_EXP - 0 == 128) && (FLT_MANT_DIG - 0 == 24))
|
||||
@ -8850,12 +8876,42 @@ PUGI__NS_BEGIN
|
||||
_end = pos;
|
||||
}
|
||||
|
||||
void remove_duplicates()
|
||||
void remove_duplicates(xpath_allocator* alloc)
|
||||
{
|
||||
if (_type == xpath_node_set::type_unsorted)
|
||||
sort(_begin, _end, duplicate_comparator());
|
||||
if (_type == xpath_node_set::type_unsorted && _end - _begin > 2)
|
||||
{
|
||||
xpath_allocator_capture cr(alloc);
|
||||
|
||||
_end = unique(_begin, _end);
|
||||
size_t size_ = static_cast<size_t>(_end - _begin);
|
||||
|
||||
size_t hash_size = 1;
|
||||
while (hash_size < size_ + size_ / 2) hash_size *= 2;
|
||||
|
||||
const void** hash_data = static_cast<const void**>(alloc->allocate(hash_size * sizeof(void**)));
|
||||
if (!hash_data) return;
|
||||
|
||||
memset(hash_data, 0, hash_size * sizeof(const void**));
|
||||
|
||||
xpath_node* write = _begin;
|
||||
|
||||
for (xpath_node* it = _begin; it != _end; ++it)
|
||||
{
|
||||
const void* attr = it->attribute().internal_object();
|
||||
const void* node = it->node().internal_object();
|
||||
const void* key = attr ? attr : node;
|
||||
|
||||
if (key && hash_insert(hash_data, hash_size, key))
|
||||
{
|
||||
*write++ = *it;
|
||||
}
|
||||
}
|
||||
|
||||
_end = write;
|
||||
}
|
||||
else
|
||||
{
|
||||
_end = unique(_begin, _end);
|
||||
}
|
||||
}
|
||||
|
||||
xpath_node_set::type_t type() const
|
||||
@ -10117,7 +10173,7 @@ PUGI__NS_BEGIN
|
||||
// child, attribute and self axes always generate unique set of nodes
|
||||
// for other axis, if the set stayed sorted, it stayed unique because the traversal algorithms do not visit the same node twice
|
||||
if (axis != axis_child && axis != axis_attribute && axis != axis_self && ns.type() == xpath_node_set::type_unsorted)
|
||||
ns.remove_duplicates();
|
||||
ns.remove_duplicates(stack.temp);
|
||||
|
||||
return ns;
|
||||
}
|
||||
@ -10744,7 +10800,7 @@ PUGI__NS_BEGIN
|
||||
ls.set_type(xpath_node_set::type_unsorted);
|
||||
|
||||
ls.append(rs.begin(), rs.end(), stack.result);
|
||||
ls.remove_duplicates();
|
||||
ls.remove_duplicates(stack.temp);
|
||||
|
||||
return ls;
|
||||
}
|
||||
|
@ -435,6 +435,24 @@ TEST_XML(xpath_operators_union, "<node><employee/><employee secretary=''/><emplo
|
||||
CHECK_XPATH_NODESET(n, STR(". | tail/preceding-sibling::employee | .")) % 2 % 3 % 4 % 6 % 8 % 11;
|
||||
}
|
||||
|
||||
TEST_XML(xpath_operators_union_order, "<node />")
|
||||
{
|
||||
xml_node n = doc.child(STR("node"));
|
||||
|
||||
n.append_child(STR("c"));
|
||||
n.prepend_child(STR("b"));
|
||||
n.append_child(STR("d"));
|
||||
n.prepend_child(STR("a"));
|
||||
|
||||
xpath_node_set ns = n.select_nodes(STR("d | d | b | c | b | a | c | d | b"));
|
||||
|
||||
CHECK(ns.size() == 4);
|
||||
CHECK_STRING(ns[0].node().name(), STR("d"));
|
||||
CHECK_STRING(ns[1].node().name(), STR("b"));
|
||||
CHECK_STRING(ns[2].node().name(), STR("c"));
|
||||
CHECK_STRING(ns[3].node().name(), STR("a"));
|
||||
}
|
||||
|
||||
TEST(xpath_operators_union_error)
|
||||
{
|
||||
CHECK_XPATH_FAIL(STR(". | true()"));
|
||||
|
Loading…
x
Reference in New Issue
Block a user