Added bloom filter support.

In particular, we add a new FilterPolicy class.  An instance
of this class can be supplied in Options when opening a
database.  If supplied, the instance is used to generate
summaries of keys (e.g., a bloom filter) which are placed in
sstables.  These summaries are consulted by DB::Get() so we
can avoid reading sstable blocks that are guaranteed to not
contain the key we are looking for.

This change provides one implementation of FilterPolicy
based on bloom filters.

Other changes:
- Updated version number to 1.4.
- Some build tweaks.
- C binding for CompactRange.
- A few more benchmarks: deleteseq, deleterandom, readmissing, seekrandom.
- Minor .gitignore update.
This commit is contained in:
Sanjay Ghemawat 2012-04-17 08:36:46 -07:00
parent bc1ee4d25e
commit 85584d497e
38 changed files with 2070 additions and 570 deletions

3
.gitignore vendored
View File

@ -1,5 +1,8 @@
build_config.mk build_config.mk
*.a *.a
*.o *.o
*.dylib*
*.so
*.so.*
*_test *_test
db_bench db_bench

View File

@ -17,8 +17,8 @@ OPT ?= -O2 -DNDEBUG # (A) Production use (optimized mode)
#----------------------------------------------- #-----------------------------------------------
# detect what platform we're building on # detect what platform we're building on
$(shell sh ./build_detect_platform) $(shell ./build_detect_platform build_config.mk)
# this file is generated by build_detect_platform to set build flags and sources # this file is generated by the previous line to set build flags and sources
include build_config.mk include build_config.mk
CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
@ -34,6 +34,7 @@ TESTHARNESS = ./util/testharness.o $(TESTUTIL)
TESTS = \ TESTS = \
arena_test \ arena_test \
bloom_test \
c_test \ c_test \
cache_test \ cache_test \
coding_test \ coding_test \
@ -43,6 +44,7 @@ TESTS = \
dbformat_test \ dbformat_test \
env_test \ env_test \
filename_test \ filename_test \
filter_block_test \
log_test \ log_test \
memenv_test \ memenv_test \
skiplist_test \ skiplist_test \
@ -63,7 +65,7 @@ default: all
ifneq ($(PLATFORM_SHARED_EXT),) ifneq ($(PLATFORM_SHARED_EXT),)
# Update db.h if you change these. # Update db.h if you change these.
SHARED_MAJOR = 1 SHARED_MAJOR = 1
SHARED_MINOR = 3 SHARED_MINOR = 4
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED2 = $(SHARED1).$(SHARED_MAJOR)
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
@ -101,6 +103,9 @@ db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL)
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
@ -128,6 +133,9 @@ env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)

38
build_detect_platform Normal file → Executable file
View File

@ -1,9 +1,9 @@
#!/bin/sh #!/bin/sh
# #
# Detects OS we're compiling on and generates build_config.mk, # Detects OS we're compiling on and outputs a file specified by the first
# which in turn gets read while processing Makefile. # argument, which in turn gets read while processing Makefile.
# #
# build_config.mk will set the following variables: # The output will set the following variables:
# PLATFORM_LDFLAGS Linker flags # PLATFORM_LDFLAGS Linker flags
# PLATFORM_SHARED_EXT Extension for shared libraries # PLATFORM_SHARED_EXT Extension for shared libraries
# PLATFORM_SHARED_LDFLAGS Flags for building shared library # PLATFORM_SHARED_LDFLAGS Flags for building shared library
@ -13,11 +13,15 @@
# -DLEVELDB_PLATFORM_POSIX if cstdatomic is present # -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
# -DLEVELDB_PLATFORM_NOATOMIC if it is not # -DLEVELDB_PLATFORM_NOATOMIC if it is not
SCRIPT_DIR=`dirname $0` OUTPUT=$1
if test -z "$OUTPUT"; then
echo "usage: $0 <output-filename>"
exit 1
fi
# Delete existing build_config.mk # Delete existing output, if it exists
rm -f build_config.mk rm -f $OUTPUT
touch build_config.mk touch $OUTPUT
if test -z "$CXX"; then if test -z "$CXX"; then
CXX=g++ CXX=g++
@ -96,7 +100,7 @@ esac
# except for the test and benchmark files. By default, find will output a list # except for the test and benchmark files. By default, find will output a list
# of all files matching either rule, so we need to append -print to make the # of all files matching either rule, so we need to append -print to make the
# prune take effect. # prune take effect.
DIRS="$SCRIPT_DIR/util $SCRIPT_DIR/db $SCRIPT_DIR/table" DIRS="util db table"
set -f # temporarily disable globbing so that our patterns aren't expanded set -f # temporarily disable globbing so that our patterns aren't expanded
PRUNE_TEST="-name *test*.cc -prune" PRUNE_TEST="-name *test*.cc -prune"
PRUNE_BENCH="-name *_bench.cc -prune" PRUNE_BENCH="-name *_bench.cc -prune"
@ -105,8 +109,8 @@ set +f # re-enable globbing
# The sources consist of the portable files, plus the platform-specific port # The sources consist of the portable files, plus the platform-specific port
# file. # file.
echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> build_config.mk echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> build_config.mk echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
if [ "$PLATFORM" = "OS_ANDROID_CROSSCOMPILE" ]; then if [ "$PLATFORM" = "OS_ANDROID_CROSSCOMPILE" ]; then
# Cross-compiling; do not try any compilation tests. # Cross-compiling; do not try any compilation tests.
@ -147,10 +151,10 @@ fi
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
echo "PLATFORM=$PLATFORM" >> build_config.mk echo "PLATFORM=$PLATFORM" >> $OUTPUT
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> build_config.mk echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> build_config.mk echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> build_config.mk echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> build_config.mk echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> build_config.mk echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> build_config.mk echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT

110
db/c.cc
View File

@ -10,6 +10,7 @@
#include "leveldb/comparator.h" #include "leveldb/comparator.h"
#include "leveldb/db.h" #include "leveldb/db.h"
#include "leveldb/env.h" #include "leveldb/env.h"
#include "leveldb/filter_policy.h"
#include "leveldb/iterator.h" #include "leveldb/iterator.h"
#include "leveldb/options.h" #include "leveldb/options.h"
#include "leveldb/status.h" #include "leveldb/status.h"
@ -21,8 +22,10 @@ using leveldb::CompressionType;
using leveldb::DB; using leveldb::DB;
using leveldb::Env; using leveldb::Env;
using leveldb::FileLock; using leveldb::FileLock;
using leveldb::FilterPolicy;
using leveldb::Iterator; using leveldb::Iterator;
using leveldb::Logger; using leveldb::Logger;
using leveldb::NewBloomFilterPolicy;
using leveldb::NewLRUCache; using leveldb::NewLRUCache;
using leveldb::Options; using leveldb::Options;
using leveldb::RandomAccessFile; using leveldb::RandomAccessFile;
@ -78,6 +81,47 @@ struct leveldb_comparator_t : public Comparator {
virtual void FindShortSuccessor(std::string* key) const { } virtual void FindShortSuccessor(std::string* key) const { }
}; };
struct leveldb_filterpolicy_t : public FilterPolicy {
void* state_;
void (*destructor_)(void*);
const char* (*name_)(void*);
char* (*create_)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length);
unsigned char (*key_match_)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length);
virtual ~leveldb_filterpolicy_t() {
(*destructor_)(state_);
}
virtual const char* Name() const {
return (*name_)(state_);
}
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
std::vector<const char*> key_pointers(n);
std::vector<size_t> key_sizes(n);
for (int i = 0; i < n; i++) {
key_pointers[i] = keys[i].data();
key_sizes[i] = keys[i].size();
}
size_t len;
char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
dst->append(filter, len);
free(filter);
}
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
return (*key_match_)(state_, key.data(), key.size(),
filter.data(), filter.size());
}
};
struct leveldb_env_t { struct leveldb_env_t {
Env* rep; Env* rep;
bool is_default; bool is_default;
@ -218,6 +262,17 @@ void leveldb_approximate_sizes(
delete[] ranges; delete[] ranges;
} }
void leveldb_compact_range(
leveldb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len) {
Slice a, b;
db->rep->CompactRange(
// Pass NULL Slice if corresponding "const char*" is NULL
(start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
}
void leveldb_destroy_db( void leveldb_destroy_db(
const leveldb_options_t* options, const leveldb_options_t* options,
const char* name, const char* name,
@ -340,6 +395,12 @@ void leveldb_options_set_comparator(
opt->rep.comparator = cmp; opt->rep.comparator = cmp;
} }
void leveldb_options_set_filter_policy(
leveldb_options_t* opt,
leveldb_filterpolicy_t* policy) {
opt->rep.filter_policy = policy;
}
void leveldb_options_set_create_if_missing( void leveldb_options_set_create_if_missing(
leveldb_options_t* opt, unsigned char v) { leveldb_options_t* opt, unsigned char v) {
opt->rep.create_if_missing = v; opt->rep.create_if_missing = v;
@ -407,6 +468,55 @@ void leveldb_comparator_destroy(leveldb_comparator_t* cmp) {
delete cmp; delete cmp;
} }
leveldb_filterpolicy_t* leveldb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length),
unsigned char (*key_may_match)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length),
const char* (*name)(void*)) {
leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;
result->state_ = state;
result->destructor_ = destructor;
result->create_ = create_filter;
result->key_match_ = key_may_match;
result->name_ = name;
return result;
}
void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
delete filter;
}
leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
// Make a leveldb_filterpolicy_t, but override all of its methods so
// they delegate to a NewBloomFilterPolicy() instead of user
// supplied C functions.
struct Wrapper : public leveldb_filterpolicy_t {
const FilterPolicy* rep_;
~Wrapper() { delete rep_; }
const char* Name() const { return rep_->Name(); }
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
return rep_->CreateFilter(keys, n, dst);
}
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
return rep_->KeyMayMatch(key, filter);
}
static void DoNothing(void*) { }
};
Wrapper* wrapper = new Wrapper;
wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
wrapper->state_ = NULL;
wrapper->destructor_ = &Wrapper::DoNothing;
return wrapper;
}
leveldb_readoptions_t* leveldb_readoptions_create() { leveldb_readoptions_t* leveldb_readoptions_create() {
return new leveldb_readoptions_t; return new leveldb_readoptions_t;
} }

View File

@ -122,6 +122,31 @@ static const char* CmpName(void* arg) {
return "foo"; return "foo";
} }
// Custom filter policy
static unsigned char fake_filter_result = 1;
static void FilterDestroy(void* arg) { }
static const char* FilterName(void* arg) {
return "TestFilter";
}
static char* FilterCreate(
void* arg,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length) {
*filter_length = 4;
char* result = malloc(4);
memcpy(result, "fake", 4);
return result;
}
unsigned char FilterKeyMatch(
void* arg,
const char* key, size_t length,
const char* filter, size_t filter_length) {
CheckCondition(filter_length == 4);
CheckCondition(memcmp(filter, "fake", 4) == 0);
return fake_filter_result;
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
leveldb_t* db; leveldb_t* db;
leveldb_comparator_t* cmp; leveldb_comparator_t* cmp;
@ -131,6 +156,7 @@ int main(int argc, char** argv) {
leveldb_readoptions_t* roptions; leveldb_readoptions_t* roptions;
leveldb_writeoptions_t* woptions; leveldb_writeoptions_t* woptions;
char* err = NULL; char* err = NULL;
int run = -1;
snprintf(dbname, sizeof(dbname), "/tmp/leveldb_c_test-%d", snprintf(dbname, sizeof(dbname), "/tmp/leveldb_c_test-%d",
((int) geteuid())); ((int) geteuid()));
@ -180,6 +206,14 @@ int main(int argc, char** argv) {
CheckNoError(err); CheckNoError(err);
CheckGet(db, roptions, "foo", "hello"); CheckGet(db, roptions, "foo", "hello");
StartPhase("compactall");
leveldb_compact_range(db, NULL, 0, NULL, 0);
CheckGet(db, roptions, "foo", "hello");
StartPhase("compactrange");
leveldb_compact_range(db, "a", 1, "z", 1);
CheckGet(db, roptions, "foo", "hello");
StartPhase("writebatch"); StartPhase("writebatch");
{ {
leveldb_writebatch_t* wb = leveldb_writebatch_create(); leveldb_writebatch_t* wb = leveldb_writebatch_create();
@ -279,6 +313,49 @@ int main(int argc, char** argv) {
CheckGet(db, roptions, "foo", NULL); CheckGet(db, roptions, "foo", NULL);
CheckGet(db, roptions, "bar", NULL); CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c"); CheckGet(db, roptions, "box", "c");
leveldb_options_set_create_if_missing(options, 1);
leveldb_options_set_error_if_exists(options, 1);
}
StartPhase("filter");
for (run = 0; run < 2; run++) {
// First run uses custom filter, second run uses bloom filter
CheckNoError(err);
leveldb_filterpolicy_t* policy;
if (run == 0) {
policy = leveldb_filterpolicy_create(
NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
} else {
policy = leveldb_filterpolicy_create_bloom(10);
}
// Create new database
leveldb_close(db);
leveldb_destroy_db(options, dbname, &err);
leveldb_options_set_filter_policy(options, policy);
db = leveldb_open(options, dbname, &err);
CheckNoError(err);
leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
CheckNoError(err);
leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
CheckNoError(err);
leveldb_compact_range(db, NULL, 0, NULL, 0);
fake_filter_result = 1;
CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue");
if (phase == 0) {
// Must not find value when custom filter returns false
fake_filter_result = 0;
CheckGet(db, roptions, "foo", NULL);
CheckGet(db, roptions, "bar", NULL);
fake_filter_result = 1;
CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue");
}
leveldb_options_set_filter_policy(options, NULL);
leveldb_filterpolicy_destroy(policy);
} }
StartPhase("cleanup"); StartPhase("cleanup");

View File

@ -25,15 +25,20 @@
// overwrite -- overwrite N values in random key order in async mode // overwrite -- overwrite N values in random key order in async mode
// fillsync -- write N/100 values in random key order in sync mode // fillsync -- write N/100 values in random key order in sync mode
// fill100K -- write N/1000 100K values in random order in async mode // fill100K -- write N/1000 100K values in random order in async mode
// deleteseq -- delete N keys in sequential order
// deleterandom -- delete N keys in random order
// readseq -- read N times sequentially // readseq -- read N times sequentially
// readreverse -- read N times in reverse order // readreverse -- read N times in reverse order
// readrandom -- read N times in random order // readrandom -- read N times in random order
// readmissing -- read N missing keys in random order
// readhot -- read N times in random order from 1% section of DB // readhot -- read N times in random order from 1% section of DB
// seekrandom -- N random seeks
// crc32c -- repeated crc32c of 4K of data // crc32c -- repeated crc32c of 4K of data
// acquireload -- load N*1000 times // acquireload -- load N*1000 times
// Meta operations: // Meta operations:
// compact -- Compact the entire DB // compact -- Compact the entire DB
// stats -- Print DB stats // stats -- Print DB stats
// sstables -- Print sstable info
// heapprofile -- Dump a heap profile (if supported by this port) // heapprofile -- Dump a heap profile (if supported by this port)
static const char* FLAGS_benchmarks = static const char* FLAGS_benchmarks =
"fillseq," "fillseq,"
@ -85,6 +90,10 @@ static int FLAGS_cache_size = -1;
// Maximum number of files to keep open at the same time (use default if == 0) // Maximum number of files to keep open at the same time (use default if == 0)
static int FLAGS_open_files = 0; static int FLAGS_open_files = 0;
// Bloom filter bits per key.
// Negative means use default settings.
static int FLAGS_bloom_bits = -1;
// If true, do not destroy the existing database. If you set this // If true, do not destroy the existing database. If you set this
// flag and also specify a benchmark that wants a fresh database, that // flag and also specify a benchmark that wants a fresh database, that
// benchmark will fail. // benchmark will fail.
@ -293,6 +302,7 @@ struct ThreadState {
class Benchmark { class Benchmark {
private: private:
Cache* cache_; Cache* cache_;
const FilterPolicy* filter_policy_;
DB* db_; DB* db_;
int num_; int num_;
int value_size_; int value_size_;
@ -378,6 +388,9 @@ class Benchmark {
public: public:
Benchmark() Benchmark()
: cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
filter_policy_(FLAGS_bloom_bits >= 0
? NewBloomFilterPolicy(FLAGS_bloom_bits)
: NULL),
db_(NULL), db_(NULL),
num_(FLAGS_num), num_(FLAGS_num),
value_size_(FLAGS_value_size), value_size_(FLAGS_value_size),
@ -399,6 +412,7 @@ class Benchmark {
~Benchmark() { ~Benchmark() {
delete db_; delete db_;
delete cache_; delete cache_;
delete filter_policy_;
} }
void Run() { void Run() {
@ -457,11 +471,19 @@ class Benchmark {
method = &Benchmark::ReadReverse; method = &Benchmark::ReadReverse;
} else if (name == Slice("readrandom")) { } else if (name == Slice("readrandom")) {
method = &Benchmark::ReadRandom; method = &Benchmark::ReadRandom;
} else if (name == Slice("readmissing")) {
method = &Benchmark::ReadMissing;
} else if (name == Slice("seekrandom")) {
method = &Benchmark::SeekRandom;
} else if (name == Slice("readhot")) { } else if (name == Slice("readhot")) {
method = &Benchmark::ReadHot; method = &Benchmark::ReadHot;
} else if (name == Slice("readrandomsmall")) { } else if (name == Slice("readrandomsmall")) {
reads_ /= 1000; reads_ /= 1000;
method = &Benchmark::ReadRandom; method = &Benchmark::ReadRandom;
} else if (name == Slice("deleteseq")) {
method = &Benchmark::DeleteSeq;
} else if (name == Slice("deleterandom")) {
method = &Benchmark::DeleteRandom;
} else if (name == Slice("readwhilewriting")) { } else if (name == Slice("readwhilewriting")) {
num_threads++; // Add extra thread for writing num_threads++; // Add extra thread for writing
method = &Benchmark::ReadWhileWriting; method = &Benchmark::ReadWhileWriting;
@ -478,7 +500,9 @@ class Benchmark {
} else if (name == Slice("heapprofile")) { } else if (name == Slice("heapprofile")) {
HeapProfile(); HeapProfile();
} else if (name == Slice("stats")) { } else if (name == Slice("stats")) {
PrintStats(); PrintStats("leveldb.stats");
} else if (name == Slice("sstables")) {
PrintStats("leveldb.sstables");
} else { } else {
if (name != Slice()) { // No error message for empty name if (name != Slice()) { // No error message for empty name
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
@ -669,6 +693,7 @@ class Benchmark {
options.create_if_missing = !FLAGS_use_existing_db; options.create_if_missing = !FLAGS_use_existing_db;
options.block_cache = cache_; options.block_cache = cache_;
options.write_buffer_size = FLAGS_write_buffer_size; options.write_buffer_size = FLAGS_write_buffer_size;
options.filter_policy = filter_policy_;
Status s = DB::Open(options, FLAGS_db, &db_); Status s = DB::Open(options, FLAGS_db, &db_);
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "open error: %s\n", s.ToString().c_str()); fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@ -743,10 +768,28 @@ class Benchmark {
void ReadRandom(ThreadState* thread) { void ReadRandom(ThreadState* thread) {
ReadOptions options; ReadOptions options;
std::string value; std::string value;
int found = 0;
for (int i = 0; i < reads_; i++) { for (int i = 0; i < reads_; i++) {
char key[100]; char key[100];
const int k = thread->rand.Next() % FLAGS_num; const int k = thread->rand.Next() % FLAGS_num;
snprintf(key, sizeof(key), "%016d", k); snprintf(key, sizeof(key), "%016d", k);
if (db_->Get(options, key, &value).ok()) {
found++;
}
thread->stats.FinishedSingleOp();
}
char msg[100];
snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
thread->stats.AddMessage(msg);
}
void ReadMissing(ThreadState* thread) {
ReadOptions options;
std::string value;
for (int i = 0; i < reads_; i++) {
char key[100];
const int k = thread->rand.Next() % FLAGS_num;
snprintf(key, sizeof(key), "%016d.", k);
db_->Get(options, key, &value); db_->Get(options, key, &value);
thread->stats.FinishedSingleOp(); thread->stats.FinishedSingleOp();
} }
@ -765,6 +808,54 @@ class Benchmark {
} }
} }
void SeekRandom(ThreadState* thread) {
ReadOptions options;
std::string value;
int found = 0;
for (int i = 0; i < reads_; i++) {
Iterator* iter = db_->NewIterator(options);
char key[100];
const int k = thread->rand.Next() % FLAGS_num;
snprintf(key, sizeof(key), "%016d", k);
iter->Seek(key);
if (iter->Valid() && iter->key() == key) found++;
delete iter;
thread->stats.FinishedSingleOp();
}
char msg[100];
snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
thread->stats.AddMessage(msg);
}
void DoDelete(ThreadState* thread, bool seq) {
RandomGenerator gen;
WriteBatch batch;
Status s;
for (int i = 0; i < num_; i += entries_per_batch_) {
batch.Clear();
for (int j = 0; j < entries_per_batch_; j++) {
const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
char key[100];
snprintf(key, sizeof(key), "%016d", k);
batch.Delete(key);
thread->stats.FinishedSingleOp();
}
s = db_->Write(write_options_, &batch);
if (!s.ok()) {
fprintf(stderr, "del error: %s\n", s.ToString().c_str());
exit(1);
}
}
}
void DeleteSeq(ThreadState* thread) {
DoDelete(thread, true);
}
void DeleteRandom(ThreadState* thread) {
DoDelete(thread, false);
}
void ReadWhileWriting(ThreadState* thread) { void ReadWhileWriting(ThreadState* thread) {
if (thread->tid > 0) { if (thread->tid > 0) {
ReadRandom(thread); ReadRandom(thread);
@ -799,9 +890,9 @@ class Benchmark {
db_->CompactRange(NULL, NULL); db_->CompactRange(NULL, NULL);
} }
void PrintStats() { void PrintStats(const char* key) {
std::string stats; std::string stats;
if (!db_->GetProperty("leveldb.stats", &stats)) { if (!db_->GetProperty(key, &stats)) {
stats = "(failed)"; stats = "(failed)";
} }
fprintf(stdout, "\n%s\n", stats.c_str()); fprintf(stdout, "\n%s\n", stats.c_str());
@ -861,6 +952,8 @@ int main(int argc, char** argv) {
FLAGS_write_buffer_size = n; FLAGS_write_buffer_size = n;
} else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
FLAGS_cache_size = n; FLAGS_cache_size = n;
} else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
FLAGS_bloom_bits = n;
} else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
FLAGS_open_files = n; FLAGS_open_files = n;
} else if (strncmp(argv[i], "--db=", 5) == 0) { } else if (strncmp(argv[i], "--db=", 5) == 0) {

View File

@ -87,9 +87,11 @@ static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
} }
Options SanitizeOptions(const std::string& dbname, Options SanitizeOptions(const std::string& dbname,
const InternalKeyComparator* icmp, const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const Options& src) { const Options& src) {
Options result = src; Options result = src;
result.comparator = icmp; result.comparator = icmp;
result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
ClipToRange(&result.max_open_files, 20, 50000); ClipToRange(&result.max_open_files, 20, 50000);
ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
ClipToRange(&result.block_size, 1<<10, 4<<20); ClipToRange(&result.block_size, 1<<10, 4<<20);
@ -112,7 +114,9 @@ Options SanitizeOptions(const std::string& dbname,
DBImpl::DBImpl(const Options& options, const std::string& dbname) DBImpl::DBImpl(const Options& options, const std::string& dbname)
: env_(options.env), : env_(options.env),
internal_comparator_(options.comparator), internal_comparator_(options.comparator),
options_(SanitizeOptions(dbname, &internal_comparator_, options)), internal_filter_policy_(options.filter_policy),
options_(SanitizeOptions(
dbname, &internal_comparator_, &internal_filter_policy_, options)),
owns_info_log_(options_.info_log != options.info_log), owns_info_log_(options_.info_log != options.info_log),
owns_cache_(options_.block_cache != options.block_cache), owns_cache_(options_.block_cache != options.block_cache),
dbname_(dbname), dbname_(dbname),

View File

@ -105,6 +105,7 @@ class DBImpl : public DB {
// Constant after construction // Constant after construction
Env* const env_; Env* const env_;
const InternalKeyComparator internal_comparator_; const InternalKeyComparator internal_comparator_;
const InternalFilterPolicy internal_filter_policy_;
const Options options_; // options_.comparator == &internal_comparator_ const Options options_; // options_.comparator == &internal_comparator_
bool owns_info_log_; bool owns_info_log_;
bool owns_cache_; bool owns_cache_;
@ -185,6 +186,7 @@ class DBImpl : public DB {
// it is not equal to src.info_log. // it is not equal to src.info_log.
extern Options SanitizeOptions(const std::string& db, extern Options SanitizeOptions(const std::string& db,
const InternalKeyComparator* icmp, const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const Options& src); const Options& src);
} // namespace leveldb } // namespace leveldb

View File

@ -3,12 +3,15 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/db.h" #include "leveldb/db.h"
#include "leveldb/filter_policy.h"
#include "db/db_impl.h" #include "db/db_impl.h"
#include "db/filename.h" #include "db/filename.h"
#include "db/version_set.h" #include "db/version_set.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "leveldb/cache.h"
#include "leveldb/env.h" #include "leveldb/env.h"
#include "leveldb/table.h" #include "leveldb/table.h"
#include "util/hash.h"
#include "util/logging.h" #include "util/logging.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/testharness.h" #include "util/testharness.h"
@ -22,6 +25,28 @@ static std::string RandomString(Random* rnd, int len) {
return r; return r;
} }
namespace {
class AtomicCounter {
private:
port::Mutex mu_;
int count_;
public:
AtomicCounter() : count_(0) { }
void Increment() {
MutexLock l(&mu_);
count_++;
}
int Read() {
MutexLock l(&mu_);
return count_;
}
void Reset() {
MutexLock l(&mu_);
count_ = 0;
}
};
}
// Special Env used to delay background operations // Special Env used to delay background operations
class SpecialEnv : public EnvWrapper { class SpecialEnv : public EnvWrapper {
public: public:
@ -31,9 +56,13 @@ class SpecialEnv : public EnvWrapper {
// Simulate no-space errors while this pointer is non-NULL. // Simulate no-space errors while this pointer is non-NULL.
port::AtomicPointer no_space_; port::AtomicPointer no_space_;
bool count_random_reads_;
AtomicCounter random_read_counter_;
explicit SpecialEnv(Env* base) : EnvWrapper(base) { explicit SpecialEnv(Env* base) : EnvWrapper(base) {
delay_sstable_sync_.Release_Store(NULL); delay_sstable_sync_.Release_Store(NULL);
no_space_.Release_Store(NULL); no_space_.Release_Store(NULL);
count_random_reads_ = false;
} }
Status NewWritableFile(const std::string& f, WritableFile** r) { Status NewWritableFile(const std::string& f, WritableFile** r) {
@ -74,9 +103,44 @@ class SpecialEnv : public EnvWrapper {
} }
return s; return s;
} }
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
class CountingFile : public RandomAccessFile {
private:
RandomAccessFile* target_;
AtomicCounter* counter_;
public:
CountingFile(RandomAccessFile* target, AtomicCounter* counter)
: target_(target), counter_(counter) {
}
virtual ~CountingFile() { delete target_; }
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
counter_->Increment();
return target_->Read(offset, n, result, scratch);
}
};
Status s = target()->NewRandomAccessFile(f, r);
if (s.ok() && count_random_reads_) {
*r = new CountingFile(*r, &random_read_counter_);
}
return s;
}
}; };
class DBTest { class DBTest {
private:
const FilterPolicy* filter_policy_;
// Sequence of option configurations to try
enum OptionConfig {
kDefault,
kFilter,
kEnd
};
int option_config_;
public: public:
std::string dbname_; std::string dbname_;
SpecialEnv* env_; SpecialEnv* env_;
@ -84,7 +148,9 @@ class DBTest {
Options last_options_; Options last_options_;
DBTest() : env_(new SpecialEnv(Env::Default())) { DBTest() : option_config_(kDefault),
env_(new SpecialEnv(Env::Default())) {
filter_policy_ = NewBloomFilterPolicy(10);
dbname_ = test::TmpDir() + "/db_test"; dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, Options()); DestroyDB(dbname_, Options());
db_ = NULL; db_ = NULL;
@ -95,6 +161,32 @@ class DBTest {
delete db_; delete db_;
DestroyDB(dbname_, Options()); DestroyDB(dbname_, Options());
delete env_; delete env_;
delete filter_policy_;
}
// Switch to a fresh database with the next option configuration to
// test. Return false if there are no more configurations to test.
bool ChangeOptions() {
if (option_config_ == kEnd) {
return false;
} else {
option_config_++;
DestroyAndReopen();
return true;
}
}
// Return the current option configuration.
Options CurrentOptions() {
Options options;
switch (option_config_) {
case kFilter:
options.filter_policy = filter_policy_;
break;
default:
break;
}
return options;
} }
DBImpl* dbfull() { DBImpl* dbfull() {
@ -105,6 +197,11 @@ class DBTest {
ASSERT_OK(TryReopen(options)); ASSERT_OK(TryReopen(options));
} }
void Close() {
delete db_;
db_ = NULL;
}
void DestroyAndReopen(Options* options = NULL) { void DestroyAndReopen(Options* options = NULL) {
delete db_; delete db_;
db_ = NULL; db_ = NULL;
@ -119,6 +216,7 @@ class DBTest {
if (options != NULL) { if (options != NULL) {
opts = *options; opts = *options;
} else { } else {
opts = CurrentOptions();
opts.create_if_missing = true; opts.create_if_missing = true;
} }
last_options_ = opts; last_options_ = opts;
@ -189,8 +287,7 @@ class DBTest {
if (!ParseInternalKey(iter->key(), &ikey)) { if (!ParseInternalKey(iter->key(), &ikey)) {
result += "CORRUPTED"; result += "CORRUPTED";
} else { } else {
if (last_options_.comparator->Compare( if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) {
ikey.user_key, user_key) != 0) {
break; break;
} }
if (!first) { if (!first) {
@ -314,30 +411,37 @@ class DBTest {
}; };
TEST(DBTest, Empty) { TEST(DBTest, Empty) {
do {
ASSERT_TRUE(db_ != NULL); ASSERT_TRUE(db_ != NULL);
ASSERT_EQ("NOT_FOUND", Get("foo")); ASSERT_EQ("NOT_FOUND", Get("foo"));
} while (ChangeOptions());
} }
TEST(DBTest, ReadWrite) { TEST(DBTest, ReadWrite) {
do {
ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(Put("bar", "v2")); ASSERT_OK(Put("bar", "v2"));
ASSERT_OK(Put("foo", "v3")); ASSERT_OK(Put("foo", "v3"));
ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v3", Get("foo"));
ASSERT_EQ("v2", Get("bar")); ASSERT_EQ("v2", Get("bar"));
} while (ChangeOptions());
} }
TEST(DBTest, PutDeleteGet) { TEST(DBTest, PutDeleteGet) {
do {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
ASSERT_EQ("v2", Get("foo")); ASSERT_EQ("v2", Get("foo"));
ASSERT_OK(db_->Delete(WriteOptions(), "foo")); ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
ASSERT_EQ("NOT_FOUND", Get("foo")); ASSERT_EQ("NOT_FOUND", Get("foo"));
} while (ChangeOptions());
} }
TEST(DBTest, GetFromImmutableLayer) { TEST(DBTest, GetFromImmutableLayer) {
Options options; do {
Options options = CurrentOptions();
options.env = env_; options.env = env_;
options.write_buffer_size = 100000; // Small write buffer options.write_buffer_size = 100000; // Small write buffer
Reopen(&options); Reopen(&options);
@ -350,15 +454,19 @@ TEST(DBTest, GetFromImmutableLayer) {
Put("k2", std::string(100000, 'y')); // Trigger compaction Put("k2", std::string(100000, 'y')); // Trigger compaction
ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("foo"));
env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls
} while (ChangeOptions());
} }
TEST(DBTest, GetFromVersions) { TEST(DBTest, GetFromVersions) {
do {
ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("foo", "v1"));
dbfull()->TEST_CompactMemTable(); dbfull()->TEST_CompactMemTable();
ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("foo"));
} while (ChangeOptions());
} }
TEST(DBTest, GetSnapshot) { TEST(DBTest, GetSnapshot) {
do {
// Try with both a short key and a long key // Try with both a short key and a long key
for (int i = 0; i < 2; i++) { for (int i = 0; i < 2; i++) {
std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
@ -372,9 +480,11 @@ TEST(DBTest, GetSnapshot) {
ASSERT_EQ("v1", Get(key, s1)); ASSERT_EQ("v1", Get(key, s1));
db_->ReleaseSnapshot(s1); db_->ReleaseSnapshot(s1);
} }
} while (ChangeOptions());
} }
TEST(DBTest, GetLevel0Ordering) { TEST(DBTest, GetLevel0Ordering) {
do {
// Check that we process level-0 files in correct order. The code // Check that we process level-0 files in correct order. The code
// below generates two level-0 files where the earlier one comes // below generates two level-0 files where the earlier one comes
// before the later one in the level-0 file list since the earlier // before the later one in the level-0 file list since the earlier
@ -385,9 +495,11 @@ TEST(DBTest, GetLevel0Ordering) {
ASSERT_OK(Put("foo", "v2")); ASSERT_OK(Put("foo", "v2"));
dbfull()->TEST_CompactMemTable(); dbfull()->TEST_CompactMemTable();
ASSERT_EQ("v2", Get("foo")); ASSERT_EQ("v2", Get("foo"));
} while (ChangeOptions());
} }
TEST(DBTest, GetOrderedByLevels) { TEST(DBTest, GetOrderedByLevels) {
do {
ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("foo", "v1"));
Compact("a", "z"); Compact("a", "z");
ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("foo"));
@ -395,9 +507,11 @@ TEST(DBTest, GetOrderedByLevels) {
ASSERT_EQ("v2", Get("foo")); ASSERT_EQ("v2", Get("foo"));
dbfull()->TEST_CompactMemTable(); dbfull()->TEST_CompactMemTable();
ASSERT_EQ("v2", Get("foo")); ASSERT_EQ("v2", Get("foo"));
} while (ChangeOptions());
} }
TEST(DBTest, GetPicksCorrectFile) { TEST(DBTest, GetPicksCorrectFile) {
do {
// Arrange to have multiple files in a non-level-0 level. // Arrange to have multiple files in a non-level-0 level.
ASSERT_OK(Put("a", "va")); ASSERT_OK(Put("a", "va"));
Compact("a", "b"); Compact("a", "b");
@ -408,9 +522,11 @@ TEST(DBTest, GetPicksCorrectFile) {
ASSERT_EQ("va", Get("a")); ASSERT_EQ("va", Get("a"));
ASSERT_EQ("vf", Get("f")); ASSERT_EQ("vf", Get("f"));
ASSERT_EQ("vx", Get("x")); ASSERT_EQ("vx", Get("x"));
} while (ChangeOptions());
} }
TEST(DBTest, GetEncountersEmptyLevel) { TEST(DBTest, GetEncountersEmptyLevel) {
do {
// Arrange for the following to happen: // Arrange for the following to happen:
// * sstable A in level 0 // * sstable A in level 0
// * nothing in level 1 // * nothing in level 1
@ -443,6 +559,7 @@ TEST(DBTest, GetEncountersEmptyLevel) {
read_count++; read_count++;
ASSERT_EQ("NOT_FOUND", Get("missing")); ASSERT_EQ("NOT_FOUND", Get("missing"));
} }
} while (ChangeOptions());
} }
TEST(DBTest, IterEmpty) { TEST(DBTest, IterEmpty) {
@ -620,6 +737,7 @@ TEST(DBTest, IterSmallAndLargeMix) {
} }
TEST(DBTest, IterMultiWithDelete) { TEST(DBTest, IterMultiWithDelete) {
do {
ASSERT_OK(Put("a", "va")); ASSERT_OK(Put("a", "va"));
ASSERT_OK(Put("b", "vb")); ASSERT_OK(Put("b", "vb"));
ASSERT_OK(Put("c", "vc")); ASSERT_OK(Put("c", "vc"));
@ -632,9 +750,11 @@ TEST(DBTest, IterMultiWithDelete) {
iter->Prev(); iter->Prev();
ASSERT_EQ(IterStatus(iter), "a->va"); ASSERT_EQ(IterStatus(iter), "a->va");
delete iter; delete iter;
} while (ChangeOptions());
} }
TEST(DBTest, Recover) { TEST(DBTest, Recover) {
do {
ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("foo", "v1"));
ASSERT_OK(Put("baz", "v5")); ASSERT_OK(Put("baz", "v5"));
@ -652,9 +772,11 @@ TEST(DBTest, Recover) {
ASSERT_EQ("v4", Get("foo")); ASSERT_EQ("v4", Get("foo"));
ASSERT_EQ("v2", Get("bar")); ASSERT_EQ("v2", Get("bar"));
ASSERT_EQ("v5", Get("baz")); ASSERT_EQ("v5", Get("baz"));
} while (ChangeOptions());
} }
TEST(DBTest, RecoveryWithEmptyLog) { TEST(DBTest, RecoveryWithEmptyLog) {
do {
ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("foo", "v1"));
ASSERT_OK(Put("foo", "v2")); ASSERT_OK(Put("foo", "v2"));
Reopen(); Reopen();
@ -662,12 +784,14 @@ TEST(DBTest, RecoveryWithEmptyLog) {
ASSERT_OK(Put("foo", "v3")); ASSERT_OK(Put("foo", "v3"));
Reopen(); Reopen();
ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v3", Get("foo"));
} while (ChangeOptions());
} }
// Check that writes done during a memtable compaction are recovered // Check that writes done during a memtable compaction are recovered
// if the database is shutdown during the memtable compaction. // if the database is shutdown during the memtable compaction.
TEST(DBTest, RecoverDuringMemtableCompaction) { TEST(DBTest, RecoverDuringMemtableCompaction) {
Options options; do {
Options options = CurrentOptions();
options.env = env_; options.env = env_;
options.write_buffer_size = 1000000; options.write_buffer_size = 1000000;
Reopen(&options); Reopen(&options);
@ -683,6 +807,7 @@ TEST(DBTest, RecoverDuringMemtableCompaction) {
ASSERT_EQ("v2", Get("bar")); ASSERT_EQ("v2", Get("bar"));
ASSERT_EQ(std::string(10000000, 'x'), Get("big1")); ASSERT_EQ(std::string(10000000, 'x'), Get("big1"));
ASSERT_EQ(std::string(1000, 'y'), Get("big2")); ASSERT_EQ(std::string(1000, 'y'), Get("big2"));
} while (ChangeOptions());
} }
static std::string Key(int i) { static std::string Key(int i) {
@ -692,7 +817,7 @@ static std::string Key(int i) {
} }
TEST(DBTest, MinorCompactionsHappen) { TEST(DBTest, MinorCompactionsHappen) {
Options options; Options options = CurrentOptions();
options.write_buffer_size = 10000; options.write_buffer_size = 10000;
Reopen(&options); Reopen(&options);
@ -718,7 +843,7 @@ TEST(DBTest, MinorCompactionsHappen) {
TEST(DBTest, RecoverWithLargeLog) { TEST(DBTest, RecoverWithLargeLog) {
{ {
Options options; Options options = CurrentOptions();
Reopen(&options); Reopen(&options);
ASSERT_OK(Put("big1", std::string(200000, '1'))); ASSERT_OK(Put("big1", std::string(200000, '1')));
ASSERT_OK(Put("big2", std::string(200000, '2'))); ASSERT_OK(Put("big2", std::string(200000, '2')));
@ -729,7 +854,7 @@ TEST(DBTest, RecoverWithLargeLog) {
// Make sure that if we re-open with a small write buffer size that // Make sure that if we re-open with a small write buffer size that
// we flush table files in the middle of a large log file. // we flush table files in the middle of a large log file.
Options options; Options options = CurrentOptions();
options.write_buffer_size = 100000; options.write_buffer_size = 100000;
Reopen(&options); Reopen(&options);
ASSERT_EQ(NumTableFilesAtLevel(0), 3); ASSERT_EQ(NumTableFilesAtLevel(0), 3);
@ -741,7 +866,7 @@ TEST(DBTest, RecoverWithLargeLog) {
} }
TEST(DBTest, CompactionsGenerateMultipleFiles) { TEST(DBTest, CompactionsGenerateMultipleFiles) {
Options options; Options options = CurrentOptions();
options.write_buffer_size = 100000000; // Large write buffer options.write_buffer_size = 100000000; // Large write buffer
Reopen(&options); Reopen(&options);
@ -767,7 +892,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
} }
TEST(DBTest, RepeatedWritesToSameKey) { TEST(DBTest, RepeatedWritesToSameKey) {
Options options; Options options = CurrentOptions();
options.env = env_; options.env = env_;
options.write_buffer_size = 100000; // Small write buffer options.write_buffer_size = 100000; // Small write buffer
Reopen(&options); Reopen(&options);
@ -786,7 +911,7 @@ TEST(DBTest, RepeatedWritesToSameKey) {
} }
TEST(DBTest, SparseMerge) { TEST(DBTest, SparseMerge) {
Options options; Options options = CurrentOptions();
options.compression = kNoCompression; options.compression = kNoCompression;
Reopen(&options); Reopen(&options);
@ -837,7 +962,8 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) {
} }
TEST(DBTest, ApproximateSizes) { TEST(DBTest, ApproximateSizes) {
Options options; do {
Options options = CurrentOptions();
options.write_buffer_size = 100000000; // Large write buffer options.write_buffer_size = 100000000; // Large write buffer
options.compression = kNoCompression; options.compression = kNoCompression;
DestroyAndReopen(); DestroyAndReopen();
@ -849,9 +975,11 @@ TEST(DBTest, ApproximateSizes) {
// Write 8MB (80 values, each 100K) // Write 8MB (80 values, each 100K)
ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(0), 0);
const int N = 80; const int N = 80;
static const int S1 = 100000;
static const int S2 = 105000; // Allow some expansion from metadata
Random rnd(301); Random rnd(301);
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000))); ASSERT_OK(Put(Key(i), RandomString(&rnd, S1)));
} }
// 0 because GetApproximateSizes() does not account for memtable space // 0 because GetApproximateSizes() does not account for memtable space
@ -863,14 +991,12 @@ TEST(DBTest, ApproximateSizes) {
for (int compact_start = 0; compact_start < N; compact_start += 10) { for (int compact_start = 0; compact_start < N; compact_start += 10) {
for (int i = 0; i < N; i += 10) { for (int i = 0; i < N; i += 10) {
ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000)); ASSERT_TRUE(Between(Size("", Key(i)), S1*i, S2*i));
ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), S1*(i+1), S2*(i+1)));
100000 * (i+1), 100000 * (i+1) + 10000)); ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), S1*10, S2*10));
ASSERT_TRUE(Between(Size(Key(i), Key(i+10)),
100000 * 10, 100000 * 10 + 10000));
} }
ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); ASSERT_TRUE(Between(Size("", Key(50)), S1*50, S2*50));
ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), S1*50, S2*50));
std::string cstart_str = Key(compact_start); std::string cstart_str = Key(compact_start);
std::string cend_str = Key(compact_start + 9); std::string cend_str = Key(compact_start + 9);
@ -882,10 +1008,12 @@ TEST(DBTest, ApproximateSizes) {
ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GT(NumTableFilesAtLevel(1), 0); ASSERT_GT(NumTableFilesAtLevel(1), 0);
} }
} while (ChangeOptions());
} }
TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
Options options; do {
Options options = CurrentOptions();
options.compression = kNoCompression; options.compression = kNoCompression;
Reopen(); Reopen();
@ -912,12 +1040,13 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000)); ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000));
ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
dbfull()->TEST_CompactRange(0, NULL, NULL); dbfull()->TEST_CompactRange(0, NULL, NULL);
} }
} while (ChangeOptions());
} }
TEST(DBTest, IteratorPinsRef) { TEST(DBTest, IteratorPinsRef) {
@ -943,6 +1072,7 @@ TEST(DBTest, IteratorPinsRef) {
} }
TEST(DBTest, Snapshot) { TEST(DBTest, Snapshot) {
do {
Put("foo", "v1"); Put("foo", "v1");
const Snapshot* s1 = db_->GetSnapshot(); const Snapshot* s1 = db_->GetSnapshot();
Put("foo", "v2"); Put("foo", "v2");
@ -967,9 +1097,11 @@ TEST(DBTest, Snapshot) {
db_->ReleaseSnapshot(s2); db_->ReleaseSnapshot(s2);
ASSERT_EQ("v4", Get("foo")); ASSERT_EQ("v4", Get("foo"));
} while (ChangeOptions());
} }
TEST(DBTest, HiddenValuesAreRemoved) { TEST(DBTest, HiddenValuesAreRemoved) {
do {
Random rnd(301); Random rnd(301);
FillLevels("a", "z"); FillLevels("a", "z");
@ -996,6 +1128,7 @@ TEST(DBTest, HiddenValuesAreRemoved) {
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
} while (ChangeOptions());
} }
TEST(DBTest, DeletionMarkers1) { TEST(DBTest, DeletionMarkers1) {
@ -1054,6 +1187,7 @@ TEST(DBTest, DeletionMarkers2) {
} }
TEST(DBTest, OverlapInLevel0) { TEST(DBTest, OverlapInLevel0) {
do {
ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config"; ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
// Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
@ -1090,6 +1224,7 @@ TEST(DBTest, OverlapInLevel0) {
dbfull()->TEST_CompactMemTable(); dbfull()->TEST_CompactMemTable();
ASSERT_EQ("3", FilesPerLevel()); ASSERT_EQ("3", FilesPerLevel());
ASSERT_EQ("NOT_FOUND", Get("600")); ASSERT_EQ("NOT_FOUND", Get("600"));
} while (ChangeOptions());
} }
TEST(DBTest, L0_CompactionBug_Issue44_a) { TEST(DBTest, L0_CompactionBug_Issue44_a) {
@ -1150,7 +1285,7 @@ TEST(DBTest, ComparatorCheck) {
} }
}; };
NewComparator cmp; NewComparator cmp;
Options new_options; Options new_options = CurrentOptions();
new_options.comparator = &cmp; new_options.comparator = &cmp;
Status s = TryReopen(&new_options); Status s = TryReopen(&new_options);
ASSERT_TRUE(!s.ok()); ASSERT_TRUE(!s.ok());
@ -1185,9 +1320,10 @@ TEST(DBTest, CustomComparator) {
} }
}; };
NumberComparator cmp; NumberComparator cmp;
Options new_options; Options new_options = CurrentOptions();
new_options.create_if_missing = true; new_options.create_if_missing = true;
new_options.comparator = &cmp; new_options.comparator = &cmp;
new_options.filter_policy = NULL; // Cannot use bloom filters
new_options.write_buffer_size = 1000; // Compact more often new_options.write_buffer_size = 1000; // Compact more often
DestroyAndReopen(&new_options); DestroyAndReopen(&new_options);
ASSERT_OK(Put("[10]", "ten")); ASSERT_OK(Put("[10]", "ten"));
@ -1197,6 +1333,8 @@ TEST(DBTest, CustomComparator) {
ASSERT_EQ("ten", Get("[0xa]")); ASSERT_EQ("ten", Get("[0xa]"));
ASSERT_EQ("twenty", Get("[20]")); ASSERT_EQ("twenty", Get("[20]"));
ASSERT_EQ("twenty", Get("[0x14]")); ASSERT_EQ("twenty", Get("[0x14]"));
ASSERT_EQ("NOT_FOUND", Get("[15]"));
ASSERT_EQ("NOT_FOUND", Get("[0xf]"));
Compact("[0]", "[9999]"); Compact("[0]", "[9999]");
} }
@ -1285,7 +1423,7 @@ TEST(DBTest, DBOpen_Options) {
// Check that number of files does not grow when we are out of space // Check that number of files does not grow when we are out of space
TEST(DBTest, NoSpace) { TEST(DBTest, NoSpace) {
Options options; Options options = CurrentOptions();
options.env = env_; options.env = env_;
Reopen(&options); Reopen(&options);
@ -1314,6 +1452,53 @@ TEST(DBTest, FilesDeletedAfterCompaction) {
ASSERT_EQ(CountFiles(), num_files); ASSERT_EQ(CountFiles(), num_files);
} }
TEST(DBTest, BloomFilter) {
env_->count_random_reads_ = true;
Options options = CurrentOptions();
options.env = env_;
options.block_cache = NewLRUCache(0); // Prevent cache hits
options.filter_policy = NewBloomFilterPolicy(10);
Reopen(&options);
// Populate multiple layers
const int N = 10000;
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), Key(i)));
}
Compact("a", "z");
for (int i = 0; i < N; i += 100) {
ASSERT_OK(Put(Key(i), Key(i)));
}
dbfull()->TEST_CompactMemTable();
// Prevent auto compactions triggered by seeks
env_->delay_sstable_sync_.Release_Store(env_);
// Lookup present keys. Should rarely read from small sstable.
env_->random_read_counter_.Reset();
for (int i = 0; i < N; i++) {
ASSERT_EQ(Key(i), Get(Key(i)));
}
int reads = env_->random_read_counter_.Read();
fprintf(stderr, "%d present => %d reads\n", N, reads);
ASSERT_GE(reads, N);
ASSERT_LE(reads, N + 2*N/100);
// Lookup present keys. Should rarely read from either sstable.
env_->random_read_counter_.Reset();
for (int i = 0; i < N; i++) {
ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing"));
}
reads = env_->random_read_counter_.Read();
fprintf(stderr, "%d missing => %d reads\n", N, reads);
ASSERT_LE(reads, 3*N/100);
env_->delay_sstable_sync_.Release_Store(NULL);
Close();
delete options.block_cache;
delete options.filter_policy;
}
// Multi-threaded test: // Multi-threaded test:
namespace { namespace {
@ -1381,6 +1566,7 @@ static void MTThreadBody(void* arg) {
} // namespace } // namespace
TEST(DBTest, MultiThreaded) { TEST(DBTest, MultiThreaded) {
do {
// Initialize state // Initialize state
MTState mt; MTState mt;
mt.test = this; mt.test = this;
@ -1408,6 +1594,7 @@ TEST(DBTest, MultiThreaded) {
env_->SleepForMicroseconds(100000); env_->SleepForMicroseconds(100000);
} }
} }
} while (ChangeOptions());
} }
namespace { namespace {
@ -1573,7 +1760,8 @@ static bool CompareIterators(int step,
TEST(DBTest, Randomized) { TEST(DBTest, Randomized) {
Random rnd(test::RandomSeed()); Random rnd(test::RandomSeed());
ModelDB model(last_options_); do {
ModelDB model(CurrentOptions());
const int N = 10000; const int N = 10000;
const Snapshot* model_snap = NULL; const Snapshot* model_snap = NULL;
const Snapshot* db_snap = NULL; const Snapshot* db_snap = NULL;
@ -1582,6 +1770,7 @@ TEST(DBTest, Randomized) {
if (step % 100 == 0) { if (step % 100 == 0) {
fprintf(stderr, "Step %d of %d\n", step, N); fprintf(stderr, "Step %d of %d\n", step, N);
} }
// TODO(sanjay): Test Get() works
int p = rnd.Uniform(100); int p = rnd.Uniform(100);
if (p < 45) { // Put if (p < 45) { // Put
k = RandomKey(&rnd); k = RandomKey(&rnd);
@ -1637,6 +1826,7 @@ TEST(DBTest, Randomized) {
} }
if (model_snap != NULL) model.ReleaseSnapshot(model_snap); if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
} while (ChangeOptions());
} }
std::string MakeKey(unsigned int num) { std::string MakeKey(unsigned int num) {

View File

@ -98,6 +98,26 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
} }
} }
const char* InternalFilterPolicy::Name() const {
return user_policy_->Name();
}
void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
std::string* dst) const {
// We rely on the fact that the code in table.cc does not mind us
// adjusting keys[].
Slice* mkey = const_cast<Slice*>(keys);
for (int i = 0; i < n; i++) {
mkey[i] = ExtractUserKey(keys[i]);
// TODO(sanjay): Suppress dups?
}
user_policy_->CreateFilter(keys, n, dst);
}
bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
}
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
size_t usize = user_key.size(); size_t usize = user_key.size();
size_t needed = usize + 13; // A conservative estimate size_t needed = usize + 13; // A conservative estimate

View File

@ -8,6 +8,7 @@
#include <stdio.h> #include <stdio.h>
#include "leveldb/comparator.h" #include "leveldb/comparator.h"
#include "leveldb/db.h" #include "leveldb/db.h"
#include "leveldb/filter_policy.h"
#include "leveldb/slice.h" #include "leveldb/slice.h"
#include "leveldb/table_builder.h" #include "leveldb/table_builder.h"
#include "util/coding.h" #include "util/coding.h"
@ -123,6 +124,17 @@ class InternalKeyComparator : public Comparator {
int Compare(const InternalKey& a, const InternalKey& b) const; int Compare(const InternalKey& a, const InternalKey& b) const;
}; };
// Filter policy wrapper that converts from internal keys to user keys
class InternalFilterPolicy : public FilterPolicy {
private:
const FilterPolicy* const user_policy_;
public:
explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
virtual const char* Name() const;
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
};
// Modules in this directory should keep internal keys wrapped inside // Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not // the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator. // incorrectly use string comparisons instead of an InternalKeyComparator.

View File

@ -48,7 +48,8 @@ class Repairer {
: dbname_(dbname), : dbname_(dbname),
env_(options.env), env_(options.env),
icmp_(options.comparator), icmp_(options.comparator),
options_(SanitizeOptions(dbname, &icmp_, options)), ipolicy_(options.filter_policy),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
owns_info_log_(options_.info_log != options.info_log), owns_info_log_(options_.info_log != options.info_log),
owns_cache_(options_.block_cache != options.block_cache), owns_cache_(options_.block_cache != options.block_cache),
next_file_number_(1) { next_file_number_(1) {
@ -99,6 +100,7 @@ class Repairer {
std::string const dbname_; std::string const dbname_;
Env* const env_; Env* const env_;
InternalKeyComparator const icmp_; InternalKeyComparator const icmp_;
InternalFilterPolicy const ipolicy_;
Options const options_; Options const options_;
bool owns_info_log_; bool owns_info_log_;
bool owns_cache_; bool owns_cache_;

View File

@ -42,23 +42,18 @@ TableCache::~TableCache() {
delete cache_; delete cache_;
} }
Iterator* TableCache::NewIterator(const ReadOptions& options, Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
uint64_t file_number, Cache::Handle** handle) {
uint64_t file_size, Status s;
Table** tableptr) {
if (tableptr != NULL) {
*tableptr = NULL;
}
char buf[sizeof(file_number)]; char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number); EncodeFixed64(buf, file_number);
Slice key(buf, sizeof(buf)); Slice key(buf, sizeof(buf));
Cache::Handle* handle = cache_->Lookup(key); *handle = cache_->Lookup(key);
if (handle == NULL) { if (*handle == NULL) {
std::string fname = TableFileName(dbname_, file_number); std::string fname = TableFileName(dbname_, file_number);
RandomAccessFile* file = NULL; RandomAccessFile* file = NULL;
Table* table = NULL; Table* table = NULL;
Status s = env_->NewRandomAccessFile(fname, &file); s = env_->NewRandomAccessFile(fname, &file);
if (s.ok()) { if (s.ok()) {
s = Table::Open(*options_, file, file_size, &table); s = Table::Open(*options_, file, file_size, &table);
} }
@ -68,13 +63,28 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
delete file; delete file;
// We do not cache error results so that if the error is transient, // We do not cache error results so that if the error is transient,
// or somebody repairs the file, we recover automatically. // or somebody repairs the file, we recover automatically.
return NewErrorIterator(s); } else {
}
TableAndFile* tf = new TableAndFile; TableAndFile* tf = new TableAndFile;
tf->file = file; tf->file = file;
tf->table = table; tf->table = table;
handle = cache_->Insert(key, tf, 1, &DeleteEntry); *handle = cache_->Insert(key, tf, 1, &DeleteEntry);
}
}
return s;
}
Iterator* TableCache::NewIterator(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
Table** tableptr) {
if (tableptr != NULL) {
*tableptr = NULL;
}
Cache::Handle* handle = NULL;
Status s = FindTable(file_number, file_size, &handle);
if (!s.ok()) {
return NewErrorIterator(s);
} }
Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table; Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
@ -86,6 +96,22 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
return result; return result;
} }
Status TableCache::Get(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
const Slice& k,
void* arg,
void (*saver)(void*, const Slice&, const Slice&)) {
Cache::Handle* handle = NULL;
Status s = FindTable(file_number, file_size, &handle);
if (s.ok()) {
Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
s = t->InternalGet(options, k, arg, saver);
cache_->Release(handle);
}
return s;
}
void TableCache::Evict(uint64_t file_number) { void TableCache::Evict(uint64_t file_number) {
char buf[sizeof(file_number)]; char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number); EncodeFixed64(buf, file_number);

View File

@ -35,6 +35,15 @@ class TableCache {
uint64_t file_size, uint64_t file_size,
Table** tableptr = NULL); Table** tableptr = NULL);
// If a seek to internal key "k" in specified file finds an entry,
// call (*handle_result)(arg, found_key, found_value).
Status Get(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
const Slice& k,
void* arg,
void (*handle_result)(void*, const Slice&, const Slice&));
// Evict any entry for the specified file number // Evict any entry for the specified file number
void Evict(uint64_t file_number); void Evict(uint64_t file_number);
@ -43,6 +52,8 @@ class TableCache {
const std::string dbname_; const std::string dbname_;
const Options* options_; const Options* options_;
Cache* cache_; Cache* cache_;
Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
}; };
} // namespace leveldb } // namespace leveldb

View File

@ -255,35 +255,34 @@ void Version::AddIterators(const ReadOptions& options,
} }
} }
// If "*iter" points at a value or deletion for user_key, store // Callback from TableCache::Get()
// either the value, or a NotFound error and return true. namespace {
// Else return false. enum SaverState {
static bool GetValue(const Comparator* cmp, kNotFound,
Iterator* iter, const Slice& user_key, kFound,
std::string* value, kDeleted,
Status* s) { kCorrupt,
if (!iter->Valid()) { };
return false; struct Saver {
SaverState state;
const Comparator* ucmp;
Slice user_key;
std::string* value;
};
} }
static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
Saver* s = reinterpret_cast<Saver*>(arg);
ParsedInternalKey parsed_key; ParsedInternalKey parsed_key;
if (!ParseInternalKey(iter->key(), &parsed_key)) { if (!ParseInternalKey(ikey, &parsed_key)) {
*s = Status::Corruption("corrupted key for ", user_key); s->state = kCorrupt;
return true; } else {
} if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
if (cmp->Compare(parsed_key.user_key, user_key) != 0) { s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
return false; if (s->state == kFound) {
} s->value->assign(v.data(), v.size());
switch (parsed_key.type) { }
case kTypeDeletion:
*s = Status::NotFound(Slice()); // Use an empty error message for speed
break;
case kTypeValue: {
Slice v = iter->value();
value->assign(v.data(), v.size());
break;
} }
} }
return true;
} }
static bool NewestFirst(FileMetaData* a, FileMetaData* b) { static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
@ -361,21 +360,27 @@ Status Version::Get(const ReadOptions& options,
last_file_read = f; last_file_read = f;
last_file_read_level = level; last_file_read_level = level;
Iterator* iter = vset_->table_cache_->NewIterator( Saver saver;
options, saver.state = kNotFound;
f->number, saver.ucmp = ucmp;
f->file_size); saver.user_key = user_key;
iter->Seek(ikey); saver.value = value;
const bool done = GetValue(ucmp, iter, user_key, value, &s); s = vset_->table_cache_->Get(options, f->number, f->file_size,
if (!iter->status().ok()) { ikey, &saver, SaveValue);
s = iter->status(); if (!s.ok()) {
delete iter;
return s;
} else {
delete iter;
if (done) {
return s; return s;
} }
switch (saver.state) {
case kNotFound:
break; // Keep searching in other files
case kFound:
return s;
case kDeleted:
s = Status::NotFound(Slice()); // Use empty error message for speed
return s;
case kCorrupt:
s = Status::Corruption("corrupted key for ", user_key);
return s;
} }
} }
} }

View File

@ -400,6 +400,69 @@ We might want to prefix <code>filename</code> keys with one letter (say '/') and
over just the metadata do not force us to fetch and cache bulky file over just the metadata do not force us to fetch and cache bulky file
contents. contents.
<p> <p>
<h2>Filters</h2>
<p>
Because of the way <code>leveldb</code> data is organized on disk,
a single <code>Get()</code> call may involve multiple reads from disk.
The optional <code>FilterPolicy</code> mechanism can be used to reduce
the number of disk reads substantially.
<pre>
leveldb::Options options;
options.filter_policy = NewBloomFilter(10);
leveldb::DB* db;
leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
... use the database ...
delete db;
delete options.filter_policy;
</pre>
The preceding code associates a
<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
based filtering policy with the database. Bloom filter based
filtering relies on keeping some number of bits of data in memory per
key (in this case 10 bits per key since that is the argument we passed
to NewBloomFilter). This filter will reduce the number of unnecessary
disk reads needed for <code>Get()</code> calls by a factor of
approximately a 100. Increasing the bits per key will lead to a
larger reduction at the cost of more memory usage. We recommend that
applications whose working set does not fit in memory and that do a
lot of random reads set a filter policy.
<p>
If you are using a custom comparator, you should ensure that the filter
policy you are using is compatible with your comparator. For example,
consider a comparator that ignores trailing spaces when comparing keys.
<code>NewBloomFilter</code> must not be used with such a comparator.
Instead, the application should provide a custom filter policy that
also ignores trailing spaces. For example:
<pre>
class CustomFilterPolicy : public leveldb::FilterPolicy {
private:
FilterPolicy* builtin_policy_;
public:
CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
~CustomFilterPolicy() { delete builtin_policy_; }
const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
// Use builtin bloom filter code after removing trailing spaces
std::vector&lt;Slice&gt; trimmed(n);
for (int i = 0; i &lt; n; i++) {
trimmed[i] = RemoveTrailingSpaces(keys[i]);
}
return builtin_policy_-&gt;CreateFilter(&amp;trimmed[i], n, dst);
}
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
// Use builtin bloom filter code after removing trailing spaces
return builtin_policy_-&gt;KeyMayMatch(RemoveTrailingSpaces(key), filter);
}
};
</pre>
<p>
Advanced applications may provide a filter policy that does not use
a bloom filter but uses some other mechanism for summarizing a set
of keys. See <code>leveldb/filter_policy.h</code> for detail.
<p>
<h1>Checksums</h1> <h1>Checksums</h1>
<p> <p>
<code>leveldb</code> associates checksums with all data it stores in the file system. <code>leveldb</code> associates checksums with all data it stores in the file system.

View File

@ -47,6 +47,47 @@ the BlockHandle of the metaindex and index blocks as well as a magic number.
// (40==2*BlockHandle::kMaxEncodedLength) // (40==2*BlockHandle::kMaxEncodedLength)
magic: fixed64; // == 0xdb4775248b80fb57 magic: fixed64; // == 0xdb4775248b80fb57
"filter" Meta Block
-------------------
If a "FilterPolicy" was specified when the database was opened, a
filter block is stored in each table. The "metaindex" block contains
an entry that maps from "filter.<N>" to the BlockHandle for the filter
block where "<N>" is the string returned by the filter policy's
"Name()" method.
The filter block stores a sequence of filters, where filter i contains
the output of FilterPolicy::CreateFilter() on all keys that are stored
in a block whose file offset falls within the range
[ i*base ... (i+1)*base-1 ]
Currently, "base" is 2KB. So for example, if blocks X and Y start in
the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be
converted to a filter by calling FilterPolicy::CreateFilter(), and the
resulting filter will be stored as the first filter in the filter
block.
The filter block is formatted as follows:
[filter 0]
[filter 1]
[filter 2]
...
[filter N-1]
[offset of filter 0] : 4 bytes
[offset of filter 1] : 4 bytes
[offset of filter 2] : 4 bytes
...
[offset of filter N-1] : 4 bytes
[offset of beginning of offset array] : 4 bytes
lg(base) : 1 byte
The offset array at the end of the filter block allows efficient
mapping from a data block offset to the corresponding filter.
"stats" Meta Block "stats" Meta Block
------------------ ------------------

View File

@ -55,6 +55,7 @@ typedef struct leveldb_cache_t leveldb_cache_t;
typedef struct leveldb_comparator_t leveldb_comparator_t; typedef struct leveldb_comparator_t leveldb_comparator_t;
typedef struct leveldb_env_t leveldb_env_t; typedef struct leveldb_env_t leveldb_env_t;
typedef struct leveldb_filelock_t leveldb_filelock_t; typedef struct leveldb_filelock_t leveldb_filelock_t;
typedef struct leveldb_filterpolicy_t leveldb_filterpolicy_t;
typedef struct leveldb_iterator_t leveldb_iterator_t; typedef struct leveldb_iterator_t leveldb_iterator_t;
typedef struct leveldb_logger_t leveldb_logger_t; typedef struct leveldb_logger_t leveldb_logger_t;
typedef struct leveldb_options_t leveldb_options_t; typedef struct leveldb_options_t leveldb_options_t;
@ -127,6 +128,11 @@ extern void leveldb_approximate_sizes(
const char* const* range_limit_key, const size_t* range_limit_key_len, const char* const* range_limit_key, const size_t* range_limit_key_len,
uint64_t* sizes); uint64_t* sizes);
extern void leveldb_compact_range(
leveldb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len);
/* Management operations */ /* Management operations */
extern void leveldb_destroy_db( extern void leveldb_destroy_db(
@ -177,6 +183,9 @@ extern void leveldb_options_destroy(leveldb_options_t*);
extern void leveldb_options_set_comparator( extern void leveldb_options_set_comparator(
leveldb_options_t*, leveldb_options_t*,
leveldb_comparator_t*); leveldb_comparator_t*);
extern void leveldb_options_set_filter_policy(
leveldb_options_t*,
leveldb_filterpolicy_t*);
extern void leveldb_options_set_create_if_missing( extern void leveldb_options_set_create_if_missing(
leveldb_options_t*, unsigned char); leveldb_options_t*, unsigned char);
extern void leveldb_options_set_error_if_exists( extern void leveldb_options_set_error_if_exists(
@ -209,6 +218,26 @@ extern leveldb_comparator_t* leveldb_comparator_create(
const char* (*name)(void*)); const char* (*name)(void*));
extern void leveldb_comparator_destroy(leveldb_comparator_t*); extern void leveldb_comparator_destroy(leveldb_comparator_t*);
/* Filter policy */
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length),
unsigned char (*key_may_match)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length),
const char* (*name)(void*));
extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
int bits_per_key);
/* Read options */ /* Read options */
extern leveldb_readoptions_t* leveldb_readoptions_create(); extern leveldb_readoptions_t* leveldb_readoptions_create();

View File

@ -14,7 +14,7 @@ namespace leveldb {
// Update Makefile if you change these // Update Makefile if you change these
static const int kMajorVersion = 1; static const int kMajorVersion = 1;
static const int kMinorVersion = 3; static const int kMinorVersion = 4;
struct Options; struct Options;
struct ReadOptions; struct ReadOptions;

View File

@ -0,0 +1,70 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A database can be configured with a custom FilterPolicy object.
// This object is responsible for creating a small filter from a set
// of keys. These filters are stored in leveldb and are consulted
// automatically by leveldb to decide whether or not to read some
// information from disk. In many cases, a filter can cut down the
// number of disk seeks form a handful to a single disk seek per
// DB::Get() call.
//
// Most people will want to use the builtin bloom filter support (see
// NewBloomFilterPolicy() below).
#ifndef STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
#define STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
#include <string>
namespace leveldb {
class Slice;
class FilterPolicy {
public:
virtual ~FilterPolicy();
// Return the name of this policy. Note that if the filter encoding
// changes in an incompatible way, the name returned by this method
// must be changed. Otherwise, old incompatible filters may be
// passed to methods of this type.
virtual const char* Name() const = 0;
// keys[0,n-1] contains a list of keys (potentially with duplicates)
// that are ordered according to the user supplied comparator.
// Append a filter that summarizes keys[0,n-1] to *dst.
//
// Warning: do not change the initial contents of *dst. Instead,
// append the newly constructed filter to *dst.
virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
const = 0;
// "filter" contains the data appended by a preceding call to
// CreateFilter() on this class. This method must return true if
// the key was in the list of keys passed to CreateFilter().
// This method may return true or false if the key was not on the
// list, but it should aim to return false with a high probability.
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
};
// Return a new filter policy that uses a bloom filter with approximately
// the specified number of bits per key. A good value for bits_per_key
// is 10, which yields a filter with ~ 1% false positive rate.
//
// Callers must delete the result after any database that is using the
// result has been closed.
//
// Note: if you are using a custom comparator that ignores some parts
// of the keys being compared, you must not use NewBloomFilterPolicy()
// and must provide your own FilterPolicy that also ignores the
// corresponding parts of the keys. For example, if the comparator
// ignores trailing spaces, it would be incorrect to use a
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
// trailing spaces in keys.
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
}
#endif // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_

View File

@ -12,6 +12,7 @@ namespace leveldb {
class Cache; class Cache;
class Comparator; class Comparator;
class Env; class Env;
class FilterPolicy;
class Logger; class Logger;
class Snapshot; class Snapshot;
@ -127,6 +128,13 @@ struct Options {
// efficiently detect that and will switch to uncompressed mode. // efficiently detect that and will switch to uncompressed mode.
CompressionType compression; CompressionType compression;
// If non-NULL, use the specified filter policy to reduce disk reads.
// Many applications will benefit from passing the result of
// NewBloomFilterPolicy() here.
//
// Default: NULL
const FilterPolicy* filter_policy;
// Create an Options object with default values for all fields. // Create an Options object with default values for all fields.
Options(); Options();
}; };

View File

@ -12,9 +12,11 @@ namespace leveldb {
class Block; class Block;
class BlockHandle; class BlockHandle;
class Footer;
struct Options; struct Options;
class RandomAccessFile; class RandomAccessFile;
struct ReadOptions; struct ReadOptions;
class TableCache;
// A Table is a sorted map from strings to strings. Tables are // A Table is a sorted map from strings to strings. Tables are
// immutable and persistent. A Table may be safely accessed from // immutable and persistent. A Table may be safely accessed from
@ -60,6 +62,19 @@ class Table {
explicit Table(Rep* rep) { rep_ = rep; } explicit Table(Rep* rep) { rep_ = rep; }
static Iterator* BlockReader(void*, const ReadOptions&, const Slice&); static Iterator* BlockReader(void*, const ReadOptions&, const Slice&);
// Calls (*handle_result)(arg, ...) with the entry found after a call
// to Seek(key). May not make such a call if filter policy says
// that key is not present.
friend class TableCache;
Status InternalGet(
const ReadOptions&, const Slice& key,
void* arg,
void (*handle_result)(void* arg, const Slice& k, const Slice& v));
void ReadMeta(const Footer& footer);
void ReadFilter(const Slice& filter_handle_value);
// No copying allowed // No copying allowed
Table(const Table&); Table(const Table&);
void operator=(const Table&); void operator=(const Table&);

View File

@ -77,6 +77,7 @@ class TableBuilder {
private: private:
bool ok() const { return status().ok(); } bool ok() const { return status().ok(); }
void WriteBlock(BlockBuilder* block, BlockHandle* handle); void WriteBlock(BlockBuilder* block, BlockHandle* handle);
void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle);
struct Rep; struct Rep;
Rep* rep_; Rep* rep_;

View File

@ -78,6 +78,9 @@ class CondVar {
// On ARM chipsets <V6, 0xffff0fa0 is the hard coded address of a // On ARM chipsets <V6, 0xffff0fa0 is the hard coded address of a
// memory barrier function provided by the kernel. // memory barrier function provided by the kernel.
typedef void (*LinuxKernelMemoryBarrierFunc)(void); typedef void (*LinuxKernelMemoryBarrierFunc)(void);
// TODO(user): ATTRIBUTE_WEAK is undefined, so this fails to build on
// non-ARMV6_OR_7. We may be able to replace it with __attribute__((weak)) for
// older ARM builds, but x86 builds will require a different memory barrier.
LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier ATTRIBUTE_WEAK = LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier ATTRIBUTE_WEAK =
(LinuxKernelMemoryBarrierFunc) 0xffff0fa0; (LinuxKernelMemoryBarrierFunc) 0xffff0fa0;
#endif #endif

View File

@ -9,6 +9,7 @@
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
#include "leveldb/comparator.h" #include "leveldb/comparator.h"
#include "table/format.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/logging.h" #include "util/logging.h"
@ -19,10 +20,10 @@ inline uint32_t Block::NumRestarts() const {
return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
} }
Block::Block(const char* data, size_t size, bool take_ownership) Block::Block(const BlockContents& contents)
: data_(data), : data_(contents.data.data()),
size_(size), size_(contents.data.size()),
owned_(take_ownership) { owned_(contents.heap_allocated) {
if (size_ < sizeof(uint32_t)) { if (size_ < sizeof(uint32_t)) {
size_ = 0; // Error marker size_ = 0; // Error marker
} else { } else {

View File

@ -11,14 +11,13 @@
namespace leveldb { namespace leveldb {
struct BlockContents;
class Comparator; class Comparator;
class Block { class Block {
public: public:
// Initialize the block with the specified contents. // Initialize the block with the specified contents.
// Takes ownership of data[] and will delete[] it when done iff explicit Block(const BlockContents& contents);
// "take_ownership is true.
Block(const char* data, size_t size, bool take_ownership);
~Block(); ~Block();

111
table/filter_block.cc Normal file
View File

@ -0,0 +1,111 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/filter_block.h"
#include "leveldb/filter_policy.h"
#include "util/coding.h"
namespace leveldb {
// See doc/table_format.txt for an explanation of the filter block format.
// Generate new filter every 2KB of data
static const size_t kFilterBaseLg = 11;
static const size_t kFilterBase = 1 << kFilterBaseLg;
FilterBlockBuilder::FilterBlockBuilder(const FilterPolicy* policy)
: policy_(policy) {
}
void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
uint64_t filter_index = (block_offset / kFilterBase);
assert(filter_index >= filter_offsets_.size());
while (filter_index > filter_offsets_.size()) {
GenerateFilter();
}
}
void FilterBlockBuilder::AddKey(const Slice& key) {
Slice k = key;
start_.push_back(keys_.size());
keys_.append(k.data(), k.size());
}
Slice FilterBlockBuilder::Finish() {
if (!start_.empty()) {
GenerateFilter();
}
// Append array of per-filter offsets
const uint32_t array_offset = result_.size();
for (size_t i = 0; i < filter_offsets_.size(); i++) {
PutFixed32(&result_, filter_offsets_[i]);
}
PutFixed32(&result_, array_offset);
result_.push_back(kFilterBaseLg); // Save encoding parameter in result
return Slice(result_);
}
void FilterBlockBuilder::GenerateFilter() {
const size_t num_keys = start_.size();
if (num_keys == 0) {
// Fast path if there are no keys for this filter
filter_offsets_.push_back(result_.size());
return;
}
// Make list of keys from flattened key structure
start_.push_back(keys_.size()); // Simplify length computation
tmp_keys_.resize(num_keys);
for (size_t i = 0; i < num_keys; i++) {
const char* base = keys_.data() + start_[i];
size_t length = start_[i+1] - start_[i];
tmp_keys_[i] = Slice(base, length);
}
// Generate filter for current set of keys and append to result_.
filter_offsets_.push_back(result_.size());
policy_->CreateFilter(&tmp_keys_[0], num_keys, &result_);
tmp_keys_.clear();
keys_.clear();
start_.clear();
}
FilterBlockReader::FilterBlockReader(const FilterPolicy* policy,
const Slice& contents)
: policy_(policy),
data_(NULL),
offset_(NULL),
num_(0),
base_lg_(0) {
size_t n = contents.size();
if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array
base_lg_ = contents[n-1];
uint32_t last_word = DecodeFixed32(contents.data() + n - 5);
if (last_word > n - 5) return;
data_ = contents.data();
offset_ = data_ + last_word;
num_ = (n - 5 - last_word) / 4;
}
bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) {
uint64_t index = block_offset >> base_lg_;
if (index < num_) {
uint32_t start = DecodeFixed32(offset_ + index*4);
uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
if (start <= limit && limit <= (offset_ - data_)) {
Slice filter = Slice(data_ + start, limit - start);
return policy_->KeyMayMatch(key, filter);
} else if (start == limit) {
// Empty filters do not match any keys
return false;
}
}
return true; // Errors are treated as potential matches
}
}

68
table/filter_block.h Normal file
View File

@ -0,0 +1,68 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A filter block is stored near the end of a Table file. It contains
// filters (e.g., bloom filters) for all data blocks in the table combined
// into a single filter block.
#ifndef STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_
#define STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <vector>
#include "leveldb/slice.h"
#include "util/hash.h"
namespace leveldb {
class FilterPolicy;
// A FilterBlockBuilder is used to construct all of the filters for a
// particular Table. It generates a single string which is stored as
// a special block in the Table.
//
// The sequence of calls to FilterBlockBuilder must match the regexp:
// (StartBlock AddKey*)* Finish
class FilterBlockBuilder {
public:
explicit FilterBlockBuilder(const FilterPolicy*);
void StartBlock(uint64_t block_offset);
void AddKey(const Slice& key);
Slice Finish();
private:
void GenerateFilter();
const FilterPolicy* policy_;
std::string keys_; // Flattened key contents
std::vector<size_t> start_; // Starting index in keys_ of each key
std::string result_; // Filter data computed so far
std::vector<Slice> tmp_keys_; // policy_->CreateFilter() argument
std::vector<uint32_t> filter_offsets_;
// No copying allowed
FilterBlockBuilder(const FilterBlockBuilder&);
void operator=(const FilterBlockBuilder&);
};
class FilterBlockReader {
public:
// REQUIRES: "contents" and *policy must stay live while *this is live.
FilterBlockReader(const FilterPolicy* policy, const Slice& contents);
bool KeyMayMatch(uint64_t block_offset, const Slice& key);
private:
const FilterPolicy* policy_;
const char* data_; // Pointer to filter data (at block-start)
const char* offset_; // Pointer to beginning of offset array (at block-end)
size_t num_; // Number of entries in offset array
size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file)
};
}
#endif // STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_

128
table/filter_block_test.cc Normal file
View File

@ -0,0 +1,128 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/filter_block.h"
#include "leveldb/filter_policy.h"
#include "util/coding.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
// For testing: emit an array with one hash value per key
class TestHashFilter : public FilterPolicy {
public:
virtual const char* Name() const {
return "TestHashFilter";
}
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
for (int i = 0; i < n; i++) {
uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
PutFixed32(dst, h);
}
}
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
uint32_t h = Hash(key.data(), key.size(), 1);
for (int i = 0; i + 4 <= filter.size(); i += 4) {
if (h == DecodeFixed32(filter.data() + i)) {
return true;
}
}
return false;
}
};
class FilterBlockTest {
public:
TestHashFilter policy_;
};
TEST(FilterBlockTest, EmptyBuilder) {
FilterBlockBuilder builder(&policy_);
Slice block = builder.Finish();
ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
FilterBlockReader reader(&policy_, block);
ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
ASSERT_TRUE(reader.KeyMayMatch(100000, "foo"));
}
TEST(FilterBlockTest, SingleChunk) {
FilterBlockBuilder builder(&policy_);
builder.StartBlock(100);
builder.AddKey("foo");
builder.AddKey("bar");
builder.AddKey("box");
builder.StartBlock(200);
builder.AddKey("box");
builder.StartBlock(300);
builder.AddKey("hello");
Slice block = builder.Finish();
FilterBlockReader reader(&policy_, block);
ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
ASSERT_TRUE(reader.KeyMayMatch(100, "bar"));
ASSERT_TRUE(reader.KeyMayMatch(100, "box"));
ASSERT_TRUE(reader.KeyMayMatch(100, "hello"));
ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
ASSERT_TRUE(! reader.KeyMayMatch(100, "missing"));
ASSERT_TRUE(! reader.KeyMayMatch(100, "other"));
}
TEST(FilterBlockTest, MultiChunk) {
FilterBlockBuilder builder(&policy_);
// First filter
builder.StartBlock(0);
builder.AddKey("foo");
builder.StartBlock(2000);
builder.AddKey("bar");
// Second filter
builder.StartBlock(3100);
builder.AddKey("box");
// Third filter is empty
// Last filter
builder.StartBlock(9000);
builder.AddKey("box");
builder.AddKey("hello");
Slice block = builder.Finish();
FilterBlockReader reader(&policy_, block);
// Check first filter
ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
ASSERT_TRUE(reader.KeyMayMatch(2000, "bar"));
ASSERT_TRUE(! reader.KeyMayMatch(0, "box"));
ASSERT_TRUE(! reader.KeyMayMatch(0, "hello"));
// Check second filter
ASSERT_TRUE(reader.KeyMayMatch(3100, "box"));
ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo"));
ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar"));
ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello"));
// Check third filter (empty)
ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo"));
ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar"));
ASSERT_TRUE(! reader.KeyMayMatch(4100, "box"));
ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello"));
// Check last filter
ASSERT_TRUE(reader.KeyMayMatch(9000, "box"));
ASSERT_TRUE(reader.KeyMayMatch(9000, "hello"));
ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo"));
ASSERT_TRUE(! reader.KeyMayMatch(9000, "bar"));
}
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

View File

@ -66,10 +66,10 @@ Status Footer::DecodeFrom(Slice* input) {
Status ReadBlock(RandomAccessFile* file, Status ReadBlock(RandomAccessFile* file,
const ReadOptions& options, const ReadOptions& options,
const BlockHandle& handle, const BlockHandle& handle,
Block** block, BlockContents* result) {
bool* may_cache) { result->data = Slice();
*block = NULL; result->cachable = false;
*may_cache = false; result->heap_allocated = false;
// Read the block contents as well as the type/crc footer. // Read the block contents as well as the type/crc footer.
// See table_builder.cc for the code that built this structure. // See table_builder.cc for the code that built this structure.
@ -105,11 +105,13 @@ Status ReadBlock(RandomAccessFile* file,
// Use it directly under the assumption that it will be live // Use it directly under the assumption that it will be live
// while the file is open. // while the file is open.
delete[] buf; delete[] buf;
*block = new Block(data, n, false /* do not take ownership */); result->data = Slice(data, n);
*may_cache = false; // Do not double-cache result->heap_allocated = false;
result->cachable = false; // Do not double-cache
} else { } else {
*block = new Block(buf, n, true /* take ownership */); result->data = Slice(buf, n);
*may_cache = true; result->heap_allocated = true;
result->cachable = true;
} }
// Ok // Ok
@ -127,8 +129,9 @@ Status ReadBlock(RandomAccessFile* file,
return Status::Corruption("corrupted compressed block contents"); return Status::Corruption("corrupted compressed block contents");
} }
delete[] buf; delete[] buf;
*block = new Block(ubuf, ulength, true /* take ownership */); result->data = Slice(ubuf, ulength);
*may_cache = true; result->heap_allocated = true;
result->cachable = true;
break; break;
} }
default: default:

View File

@ -83,16 +83,18 @@ static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull;
// 1-byte type + 32-bit crc // 1-byte type + 32-bit crc
static const size_t kBlockTrailerSize = 5; static const size_t kBlockTrailerSize = 5;
// Read the block identified by "handle" from "file". On success, struct BlockContents {
// store a pointer to the heap-allocated result in *block and return Slice data; // Actual contents of data
// OK. On failure store NULL in *block and return non-OK. bool cachable; // True iff data can be cached
// On success, stores true in *may_cache if the result may be bool heap_allocated; // True iff caller should delete[] data.data()
// cached, false if it must not be cached. };
// Read the block identified by "handle" from "file". On failure
// return non-OK. On success fill *result and return OK.
extern Status ReadBlock(RandomAccessFile* file, extern Status ReadBlock(RandomAccessFile* file,
const ReadOptions& options, const ReadOptions& options,
const BlockHandle& handle, const BlockHandle& handle,
Block** block, BlockContents* result);
bool* may_cache);
// Implementation details follow. Clients should ignore, // Implementation details follow. Clients should ignore,

View File

@ -5,8 +5,12 @@
#include "leveldb/table.h" #include "leveldb/table.h"
#include "leveldb/cache.h" #include "leveldb/cache.h"
#include "leveldb/comparator.h"
#include "leveldb/env.h" #include "leveldb/env.h"
#include "leveldb/filter_policy.h"
#include "leveldb/options.h"
#include "table/block.h" #include "table/block.h"
#include "table/filter_block.h"
#include "table/format.h" #include "table/format.h"
#include "table/two_level_iterator.h" #include "table/two_level_iterator.h"
#include "util/coding.h" #include "util/coding.h"
@ -15,6 +19,8 @@ namespace leveldb {
struct Table::Rep { struct Table::Rep {
~Rep() { ~Rep() {
delete filter;
delete [] filter_data;
delete index_block; delete index_block;
} }
@ -22,6 +28,8 @@ struct Table::Rep {
Status status; Status status;
RandomAccessFile* file; RandomAccessFile* file;
uint64_t cache_id; uint64_t cache_id;
FilterBlockReader* filter;
const char* filter_data;
BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer
Block* index_block; Block* index_block;
@ -47,11 +55,13 @@ Status Table::Open(const Options& options,
if (!s.ok()) return s; if (!s.ok()) return s;
// Read the index block // Read the index block
BlockContents contents;
Block* index_block = NULL; Block* index_block = NULL;
if (s.ok()) { if (s.ok()) {
bool may_cache; // Ignored result s = ReadBlock(file, ReadOptions(), footer.index_handle(), &contents);
s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block, if (s.ok()) {
&may_cache); index_block = new Block(contents);
}
} }
if (s.ok()) { if (s.ok()) {
@ -63,7 +73,10 @@ Status Table::Open(const Options& options,
rep->metaindex_handle = footer.metaindex_handle(); rep->metaindex_handle = footer.metaindex_handle();
rep->index_block = index_block; rep->index_block = index_block;
rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0); rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
rep->filter_data = NULL;
rep->filter = NULL;
*table = new Table(rep); *table = new Table(rep);
(*table)->ReadMeta(footer);
} else { } else {
if (index_block) delete index_block; if (index_block) delete index_block;
} }
@ -71,6 +84,52 @@ Status Table::Open(const Options& options,
return s; return s;
} }
void Table::ReadMeta(const Footer& footer) {
if (rep_->options.filter_policy == NULL) {
return; // Do not need any metadata
}
// TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
// it is an empty block.
ReadOptions opt;
BlockContents contents;
if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) {
// Do not propagate errors since meta info is not needed for operation
return;
}
Block* meta = new Block(contents);
Iterator* iter = meta->NewIterator(BytewiseComparator());
std::string key = "filter.";
key.append(rep_->options.filter_policy->Name());
iter->Seek(key);
if (iter->Valid() && iter->key() == Slice(key)) {
ReadFilter(iter->value());
}
delete iter;
delete meta;
}
void Table::ReadFilter(const Slice& filter_handle_value) {
Slice v = filter_handle_value;
BlockHandle filter_handle;
if (!filter_handle.DecodeFrom(&v).ok()) {
return;
}
// We might want to unify with ReadBlock() if we start
// requiring checksum verification in Table::Open.
ReadOptions opt;
BlockContents block;
if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) {
return;
}
if (block.heap_allocated) {
rep_->filter_data = block.data.data(); // Will need to delete later
}
rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data);
}
Table::~Table() { Table::~Table() {
delete rep_; delete rep_;
} }
@ -107,7 +166,7 @@ Iterator* Table::BlockReader(void* arg,
// can add more features in the future. // can add more features in the future.
if (s.ok()) { if (s.ok()) {
bool may_cache; BlockContents contents;
if (block_cache != NULL) { if (block_cache != NULL) {
char cache_key_buffer[16]; char cache_key_buffer[16];
EncodeFixed64(cache_key_buffer, table->rep_->cache_id); EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
@ -117,14 +176,20 @@ Iterator* Table::BlockReader(void* arg,
if (cache_handle != NULL) { if (cache_handle != NULL) {
block = reinterpret_cast<Block*>(block_cache->Value(cache_handle)); block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
} else { } else {
s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache); s = ReadBlock(table->rep_->file, options, handle, &contents);
if (s.ok() && may_cache && options.fill_cache) { if (s.ok()) {
block = new Block(contents);
if (contents.cachable && options.fill_cache) {
cache_handle = block_cache->Insert( cache_handle = block_cache->Insert(
key, block, block->size(), &DeleteCachedBlock); key, block, block->size(), &DeleteCachedBlock);
} }
} }
}
} else { } else {
s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache); s = ReadBlock(table->rep_->file, options, handle, &contents);
if (s.ok()) {
block = new Block(contents);
}
} }
} }
@ -148,6 +213,39 @@ Iterator* Table::NewIterator(const ReadOptions& options) const {
&Table::BlockReader, const_cast<Table*>(this), options); &Table::BlockReader, const_cast<Table*>(this), options);
} }
Status Table::InternalGet(const ReadOptions& options, const Slice& k,
void* arg,
void (*saver)(void*, const Slice&, const Slice&)) {
Status s;
Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator);
iiter->Seek(k);
if (iiter->Valid()) {
Slice handle_value = iiter->value();
FilterBlockReader* filter = rep_->filter;
BlockHandle handle;
if (filter != NULL &&
handle.DecodeFrom(&handle_value).ok() &&
!filter->KeyMayMatch(handle.offset(), k)) {
// Not found
} else {
Slice handle = iiter->value();
Iterator* block_iter = BlockReader(this, options, iiter->value());
block_iter->Seek(k);
if (block_iter->Valid()) {
(*saver)(arg, block_iter->key(), block_iter->value());
}
s = block_iter->status();
delete block_iter;
}
}
if (s.ok()) {
s = iiter->status();
}
delete iiter;
return s;
}
uint64_t Table::ApproximateOffsetOf(const Slice& key) const { uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
Iterator* index_iter = Iterator* index_iter =
rep_->index_block->NewIterator(rep_->options.comparator); rep_->index_block->NewIterator(rep_->options.comparator);

View File

@ -5,14 +5,15 @@
#include "leveldb/table_builder.h" #include "leveldb/table_builder.h"
#include <assert.h> #include <assert.h>
#include <stdio.h>
#include "leveldb/comparator.h" #include "leveldb/comparator.h"
#include "leveldb/env.h" #include "leveldb/env.h"
#include "leveldb/filter_policy.h"
#include "leveldb/options.h"
#include "table/block_builder.h" #include "table/block_builder.h"
#include "table/filter_block.h"
#include "table/format.h" #include "table/format.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/crc32c.h" #include "util/crc32c.h"
#include "util/logging.h"
namespace leveldb { namespace leveldb {
@ -27,6 +28,7 @@ struct TableBuilder::Rep {
std::string last_key; std::string last_key;
int64_t num_entries; int64_t num_entries;
bool closed; // Either Finish() or Abandon() has been called. bool closed; // Either Finish() or Abandon() has been called.
FilterBlockBuilder* filter_block;
// We do not emit the index entry for a block until we have seen the // We do not emit the index entry for a block until we have seen the
// first key for the next data block. This allows us to use shorter // first key for the next data block. This allows us to use shorter
@ -51,6 +53,8 @@ struct TableBuilder::Rep {
index_block(&index_block_options), index_block(&index_block_options),
num_entries(0), num_entries(0),
closed(false), closed(false),
filter_block(opt.filter_policy == NULL ? NULL
: new FilterBlockBuilder(opt.filter_policy)),
pending_index_entry(false) { pending_index_entry(false) {
index_block_options.block_restart_interval = 1; index_block_options.block_restart_interval = 1;
} }
@ -58,10 +62,14 @@ struct TableBuilder::Rep {
TableBuilder::TableBuilder(const Options& options, WritableFile* file) TableBuilder::TableBuilder(const Options& options, WritableFile* file)
: rep_(new Rep(options, file)) { : rep_(new Rep(options, file)) {
if (rep_->filter_block != NULL) {
rep_->filter_block->StartBlock(0);
}
} }
TableBuilder::~TableBuilder() { TableBuilder::~TableBuilder() {
assert(rep_->closed); // Catch errors where caller forgot to call Finish() assert(rep_->closed); // Catch errors where caller forgot to call Finish()
delete rep_->filter_block;
delete rep_; delete rep_;
} }
@ -98,6 +106,10 @@ void TableBuilder::Add(const Slice& key, const Slice& value) {
r->pending_index_entry = false; r->pending_index_entry = false;
} }
if (r->filter_block != NULL) {
r->filter_block->AddKey(key);
}
r->last_key.assign(key.data(), key.size()); r->last_key.assign(key.data(), key.size());
r->num_entries++; r->num_entries++;
r->data_block.Add(key, value); r->data_block.Add(key, value);
@ -119,6 +131,9 @@ void TableBuilder::Flush() {
r->pending_index_entry = true; r->pending_index_entry = true;
r->status = r->file->Flush(); r->status = r->file->Flush();
} }
if (r->filter_block != NULL) {
r->filter_block->StartBlock(r->offset);
}
} }
void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
@ -152,6 +167,15 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
break; break;
} }
} }
WriteRawBlock(block_contents, type, handle);
r->compressed_output.clear();
block->Reset();
}
void TableBuilder::WriteRawBlock(const Slice& block_contents,
CompressionType type,
BlockHandle* handle) {
Rep* r = rep_;
handle->set_offset(r->offset); handle->set_offset(r->offset);
handle->set_size(block_contents.size()); handle->set_size(block_contents.size());
r->status = r->file->Append(block_contents); r->status = r->file->Append(block_contents);
@ -166,8 +190,6 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
r->offset += block_contents.size() + kBlockTrailerSize; r->offset += block_contents.size() + kBlockTrailerSize;
} }
} }
r->compressed_output.clear();
block->Reset();
} }
Status TableBuilder::status() const { Status TableBuilder::status() const {
@ -179,13 +201,32 @@ Status TableBuilder::Finish() {
Flush(); Flush();
assert(!r->closed); assert(!r->closed);
r->closed = true; r->closed = true;
BlockHandle metaindex_block_handle;
BlockHandle index_block_handle; BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle;
// Write filter block
if (ok() && r->filter_block != NULL) {
WriteRawBlock(r->filter_block->Finish(), kNoCompression,
&filter_block_handle);
}
// Write metaindex block
if (ok()) { if (ok()) {
BlockBuilder meta_index_block(&r->options); BlockBuilder meta_index_block(&r->options);
if (r->filter_block != NULL) {
// Add mapping from "filter.Name" to location of filter data
std::string key = "filter.";
key.append(r->options.filter_policy->Name());
std::string handle_encoding;
filter_block_handle.EncodeTo(&handle_encoding);
meta_index_block.Add(key, handle_encoding);
}
// TODO(postrelease): Add stats and other meta blocks // TODO(postrelease): Add stats and other meta blocks
WriteBlock(&meta_index_block, &metaindex_block_handle); WriteBlock(&meta_index_block, &metaindex_block_handle);
} }
// Write index block
if (ok()) { if (ok()) {
if (r->pending_index_entry) { if (r->pending_index_entry) {
r->options.comparator->FindShortSuccessor(&r->last_key); r->options.comparator->FindShortSuccessor(&r->last_key);
@ -196,6 +237,8 @@ Status TableBuilder::Finish() {
} }
WriteBlock(&r->index_block, &index_block_handle); WriteBlock(&r->index_block, &index_block_handle);
} }
// Write footer
if (ok()) { if (ok()) {
Footer footer; Footer footer;
footer.set_metaindex_handle(metaindex_block_handle); footer.set_metaindex_handle(metaindex_block_handle);

View File

@ -168,8 +168,6 @@ class Constructor {
// Construct the data structure from the data in "data" // Construct the data structure from the data in "data"
virtual Status FinishImpl(const Options& options, const KVMap& data) = 0; virtual Status FinishImpl(const Options& options, const KVMap& data) = 0;
virtual size_t NumBytes() const = 0;
virtual Iterator* NewIterator() const = 0; virtual Iterator* NewIterator() const = 0;
virtual const KVMap& data() { return data_; } virtual const KVMap& data() { return data_; }
@ -185,7 +183,6 @@ class BlockConstructor: public Constructor {
explicit BlockConstructor(const Comparator* cmp) explicit BlockConstructor(const Comparator* cmp)
: Constructor(cmp), : Constructor(cmp),
comparator_(cmp), comparator_(cmp),
block_size_(-1),
block_(NULL) { } block_(NULL) { }
~BlockConstructor() { ~BlockConstructor() {
delete block_; delete block_;
@ -201,22 +198,21 @@ class BlockConstructor: public Constructor {
builder.Add(it->first, it->second); builder.Add(it->first, it->second);
} }
// Open the block // Open the block
Slice block_data = builder.Finish(); data_ = builder.Finish().ToString();
block_size_ = block_data.size(); BlockContents contents;
char* block_data_copy = new char[block_size_]; contents.data = data_;
memcpy(block_data_copy, block_data.data(), block_size_); contents.cachable = false;
block_ = new Block(block_data_copy, block_size_, true /* take ownership */); contents.heap_allocated = false;
block_ = new Block(contents);
return Status::OK(); return Status::OK();
} }
virtual size_t NumBytes() const { return block_size_; }
virtual Iterator* NewIterator() const { virtual Iterator* NewIterator() const {
return block_->NewIterator(comparator_); return block_->NewIterator(comparator_);
} }
private: private:
const Comparator* comparator_; const Comparator* comparator_;
int block_size_; std::string data_;
Block* block_; Block* block_;
BlockConstructor(); BlockConstructor();
@ -253,7 +249,6 @@ class TableConstructor: public Constructor {
table_options.comparator = options.comparator; table_options.comparator = options.comparator;
return Table::Open(table_options, source_, sink.contents().size(), &table_); return Table::Open(table_options, source_, sink.contents().size(), &table_);
} }
virtual size_t NumBytes() const { return source_->Size(); }
virtual Iterator* NewIterator() const { virtual Iterator* NewIterator() const {
return table_->NewIterator(ReadOptions()); return table_->NewIterator(ReadOptions());
@ -342,10 +337,6 @@ class MemTableConstructor: public Constructor {
} }
return Status::OK(); return Status::OK();
} }
virtual size_t NumBytes() const {
return memtable_->ApproximateMemoryUsage();
}
virtual Iterator* NewIterator() const { virtual Iterator* NewIterator() const {
return new KeyConvertingIterator(memtable_->NewIterator()); return new KeyConvertingIterator(memtable_->NewIterator());
} }
@ -379,13 +370,6 @@ class DBConstructor: public Constructor {
} }
return Status::OK(); return Status::OK();
} }
virtual size_t NumBytes() const {
Range r("", "\xff\xff");
uint64_t size;
db_->GetApproximateSizes(&r, 1, &size);
return size;
}
virtual Iterator* NewIterator() const { virtual Iterator* NewIterator() const {
return db_->NewIterator(ReadOptions()); return db_->NewIterator(ReadOptions());
} }
@ -809,7 +793,7 @@ TEST(TableTest, ApproximateOffsetOfPlain) {
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000));
} }

95
util/bloom.cc Normal file
View File

@ -0,0 +1,95 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/filter_policy.h"
#include "leveldb/slice.h"
#include "util/hash.h"
namespace leveldb {
namespace {
static uint32_t BloomHash(const Slice& key) {
return Hash(key.data(), key.size(), 0xbc9f1d34);
}
class BloomFilterPolicy : public FilterPolicy {
private:
size_t bits_per_key_;
size_t k_;
public:
explicit BloomFilterPolicy(int bits_per_key)
: bits_per_key_(bits_per_key) {
// We intentionally round down to reduce probing cost a little bit
k_ = static_cast<size_t>(bits_per_key * 0.69); // 0.69 =~ ln(2)
if (k_ < 1) k_ = 1;
if (k_ > 30) k_ = 30;
}
virtual const char* Name() const {
return "leveldb.BuiltinBloomFilter";
}
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
// Compute bloom filter size (in both bits and bytes)
size_t bits = n * bits_per_key_;
// For small n, we can see a very high false positive rate. Fix it
// by enforcing a minimum bloom filter length.
if (bits < 64) bits = 64;
size_t bytes = (bits + 7) / 8;
bits = bytes * 8;
const size_t init_size = dst->size();
dst->resize(init_size + bytes, 0);
dst->push_back(static_cast<char>(k_)); // Remember # of probes in filter
char* array = &(*dst)[init_size];
for (size_t i = 0; i < n; i++) {
// Use double-hashing to generate a sequence of hash values.
// See analysis in [Kirsch,Mitzenmacher 2006].
uint32_t h = BloomHash(keys[i]);
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
for (size_t j = 0; j < k_; j++) {
const uint32_t bitpos = h % bits;
array[bitpos/8] |= (1 << (bitpos % 8));
h += delta;
}
}
}
virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const {
const size_t len = bloom_filter.size();
if (len < 2) return false;
const char* array = bloom_filter.data();
const size_t bits = (len - 1) * 8;
// Use the encoded k so that we can read filters generated by
// bloom filters created using different parameters.
const size_t k = array[len-1];
if (k > 30) {
// Reserved for potentially new encodings for short bloom filters.
// Consider it a match.
return true;
}
uint32_t h = BloomHash(key);
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
for (size_t j = 0; j < k; j++) {
const uint32_t bitpos = h % bits;
if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
h += delta;
}
return true;
}
};
}
const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) {
return new BloomFilterPolicy(bits_per_key);
}
} // namespace leveldb

159
util/bloom_test.cc Normal file
View File

@ -0,0 +1,159 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/filter_policy.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
static const int kVerbose = 1;
static Slice Key(int i, char* buffer) {
memcpy(buffer, &i, sizeof(i));
return Slice(buffer, sizeof(i));
}
class BloomTest {
private:
const FilterPolicy* policy_;
std::string filter_;
std::vector<std::string> keys_;
public:
BloomTest() : policy_(NewBloomFilterPolicy(10)) { }
~BloomTest() {
delete policy_;
}
void Reset() {
keys_.clear();
filter_.clear();
}
void Add(const Slice& s) {
keys_.push_back(s.ToString());
}
void Build() {
std::vector<Slice> key_slices;
for (size_t i = 0; i < keys_.size(); i++) {
key_slices.push_back(Slice(keys_[i]));
}
filter_.clear();
policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_);
keys_.clear();
if (kVerbose >= 2) DumpFilter();
}
size_t FilterSize() const {
return filter_.size();
}
void DumpFilter() {
fprintf(stderr, "F(");
for (size_t i = 0; i+1 < filter_.size(); i++) {
const unsigned int c = static_cast<unsigned int>(filter_[i]);
for (int j = 0; j < 8; j++) {
fprintf(stderr, "%c", (c & (1 <<j)) ? '1' : '.');
}
}
fprintf(stderr, ")\n");
}
bool Matches(const Slice& s) {
if (!keys_.empty()) {
Build();
}
return policy_->KeyMayMatch(s, filter_);
}
double FalsePositiveRate() {
char buffer[sizeof(int)];
int result = 0;
for (int i = 0; i < 10000; i++) {
if (Matches(Key(i + 1000000000, buffer))) {
result++;
}
}
return result / 10000.0;
}
};
TEST(BloomTest, EmptyFilter) {
ASSERT_TRUE(! Matches("hello"));
ASSERT_TRUE(! Matches("world"));
}
TEST(BloomTest, Small) {
Add("hello");
Add("world");
ASSERT_TRUE(Matches("hello"));
ASSERT_TRUE(Matches("world"));
ASSERT_TRUE(! Matches("x"));
ASSERT_TRUE(! Matches("foo"));
}
static int NextLength(int length) {
if (length < 10) {
length += 1;
} else if (length < 100) {
length += 10;
} else if (length < 1000) {
length += 100;
} else {
length += 1000;
}
return length;
}
TEST(BloomTest, VaryingLengths) {
char buffer[sizeof(int)];
// Count number of filters that significantly exceed the false positive rate
int mediocre_filters = 0;
int good_filters = 0;
for (int length = 1; length <= 10000; length = NextLength(length)) {
Reset();
for (int i = 0; i < length; i++) {
Add(Key(i, buffer));
}
Build();
ASSERT_LE(FilterSize(), (length * 10 / 8) + 40) << length;
// All added keys must match
for (int i = 0; i < length; i++) {
ASSERT_TRUE(Matches(Key(i, buffer)))
<< "Length " << length << "; key " << i;
}
// Check false positive rate
double rate = FalsePositiveRate();
if (kVerbose >= 1) {
fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
rate*100.0, length, static_cast<int>(FilterSize()));
}
ASSERT_LE(rate, 0.02); // Must not be over 2%
if (rate > 0.0125) mediocre_filters++; // Allowed, but not too often
else good_filters++;
}
if (kVerbose >= 1) {
fprintf(stderr, "Filters: %d good, %d mediocre\n",
good_filters, mediocre_filters);
}
ASSERT_LE(mediocre_filters, good_filters/5);
}
// Different bits-per-byte
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

11
util/filter_policy.cc Normal file
View File

@ -0,0 +1,11 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/filter_policy.h"
namespace leveldb {
FilterPolicy::~FilterPolicy() { }
} // namespace leveldb

View File

@ -21,7 +21,8 @@ Options::Options()
block_cache(NULL), block_cache(NULL),
block_size(4096), block_size(4096),
block_restart_interval(16), block_restart_interval(16),
compression(kSnappyCompression) { compression(kSnappyCompression),
filter_policy(NULL) {
} }