sync with upstream @ 21409451
Check the NEWS file for details of what changed. git-svn-id: https://leveldb.googlecode.com/svn/trunk@28 62dab493-f737-651d-591e-8d6aee1b9529
This commit is contained in:
parent
3c111335a7
commit
da79909507
17
NEWS
Normal file
17
NEWS
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
Release 1.2 2011-05-16
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
Fixes for larger databases (tested up to one billion 100-byte entries,
|
||||||
|
i.e., ~100GB).
|
||||||
|
|
||||||
|
(1) Place hard limit on number of level-0 files. This fixes errors
|
||||||
|
of the form "too many open files".
|
||||||
|
|
||||||
|
(2) Fixed memtable management. Before the fix, a heavy write burst
|
||||||
|
could cause unbounded memory usage.
|
||||||
|
|
||||||
|
A fix for a logging bug where the reader would incorrectly complain
|
||||||
|
about corruption.
|
||||||
|
|
||||||
|
Allow public access to WriteBatch contents so that users can easily
|
||||||
|
wrap a DB.
|
@ -24,9 +24,10 @@
|
|||||||
// overwrite -- overwrite N values in random key order in async mode
|
// overwrite -- overwrite N values in random key order in async mode
|
||||||
// fillsync -- write N/100 values in random key order in sync mode
|
// fillsync -- write N/100 values in random key order in sync mode
|
||||||
// fill100K -- write N/1000 100K values in random order in async mode
|
// fill100K -- write N/1000 100K values in random order in async mode
|
||||||
// readseq -- read N values sequentially
|
// readseq -- read N times sequentially
|
||||||
// readreverse -- read N values in reverse order
|
// readreverse -- read N times in reverse order
|
||||||
// readrandom -- read N values in random order
|
// readrandom -- read N times in random order
|
||||||
|
// readhot -- read N times in random order from 1% section of DB
|
||||||
// crc32c -- repeated crc32c of 4K of data
|
// crc32c -- repeated crc32c of 4K of data
|
||||||
// Meta operations:
|
// Meta operations:
|
||||||
// compact -- Compact the entire DB
|
// compact -- Compact the entire DB
|
||||||
@ -54,6 +55,9 @@ static const char* FLAGS_benchmarks =
|
|||||||
// Number of key/values to place in database
|
// Number of key/values to place in database
|
||||||
static int FLAGS_num = 1000000;
|
static int FLAGS_num = 1000000;
|
||||||
|
|
||||||
|
// Number of read operations to do. If negative, do FLAGS_num reads.
|
||||||
|
static int FLAGS_reads = -1;
|
||||||
|
|
||||||
// Size of each value
|
// Size of each value
|
||||||
static int FLAGS_value_size = 100;
|
static int FLAGS_value_size = 100;
|
||||||
|
|
||||||
@ -72,6 +76,14 @@ static int FLAGS_write_buffer_size = 0;
|
|||||||
// Negative means use default settings.
|
// Negative means use default settings.
|
||||||
static int FLAGS_cache_size = -1;
|
static int FLAGS_cache_size = -1;
|
||||||
|
|
||||||
|
// Maximum number of files to keep open at the same time (use default if == 0)
|
||||||
|
static int FLAGS_open_files = 0;
|
||||||
|
|
||||||
|
// If true, do not destroy the existing database. If you set this
|
||||||
|
// flag and also specify a benchmark that wants a fresh database, that
|
||||||
|
// benchmark will fail.
|
||||||
|
static bool FLAGS_use_existing_db = false;
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
// Helper for quickly generating random data.
|
// Helper for quickly generating random data.
|
||||||
@ -126,6 +138,7 @@ class Benchmark {
|
|||||||
Cache* cache_;
|
Cache* cache_;
|
||||||
DB* db_;
|
DB* db_;
|
||||||
int num_;
|
int num_;
|
||||||
|
int reads_;
|
||||||
int heap_counter_;
|
int heap_counter_;
|
||||||
double start_;
|
double start_;
|
||||||
double last_op_finish_;
|
double last_op_finish_;
|
||||||
@ -298,6 +311,7 @@ class Benchmark {
|
|||||||
: cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
|
: cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
|
||||||
db_(NULL),
|
db_(NULL),
|
||||||
num_(FLAGS_num),
|
num_(FLAGS_num),
|
||||||
|
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
|
||||||
heap_counter_(0),
|
heap_counter_(0),
|
||||||
bytes_(0),
|
bytes_(0),
|
||||||
rand_(301) {
|
rand_(301) {
|
||||||
@ -308,8 +322,10 @@ class Benchmark {
|
|||||||
Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]);
|
Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (!FLAGS_use_existing_db) {
|
||||||
DestroyDB("/tmp/dbbench", Options());
|
DestroyDB("/tmp/dbbench", Options());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
~Benchmark() {
|
~Benchmark() {
|
||||||
delete db_;
|
delete db_;
|
||||||
@ -355,11 +371,13 @@ class Benchmark {
|
|||||||
ReadReverse();
|
ReadReverse();
|
||||||
} else if (name == Slice("readrandom")) {
|
} else if (name == Slice("readrandom")) {
|
||||||
ReadRandom();
|
ReadRandom();
|
||||||
|
} else if (name == Slice("readhot")) {
|
||||||
|
ReadHot();
|
||||||
} else if (name == Slice("readrandomsmall")) {
|
} else if (name == Slice("readrandomsmall")) {
|
||||||
int n = num_;
|
int n = reads_;
|
||||||
num_ /= 1000;
|
reads_ /= 1000;
|
||||||
ReadRandom();
|
ReadRandom();
|
||||||
num_ = n;
|
reads_ = n;
|
||||||
} else if (name == Slice("compact")) {
|
} else if (name == Slice("compact")) {
|
||||||
Compact();
|
Compact();
|
||||||
} else if (name == Slice("crc32c")) {
|
} else if (name == Slice("crc32c")) {
|
||||||
@ -449,7 +467,7 @@ class Benchmark {
|
|||||||
void Open() {
|
void Open() {
|
||||||
assert(db_ == NULL);
|
assert(db_ == NULL);
|
||||||
Options options;
|
Options options;
|
||||||
options.create_if_missing = true;
|
options.create_if_missing = !FLAGS_use_existing_db;
|
||||||
options.block_cache = cache_;
|
options.block_cache = cache_;
|
||||||
options.write_buffer_size = FLAGS_write_buffer_size;
|
options.write_buffer_size = FLAGS_write_buffer_size;
|
||||||
Status s = DB::Open(options, "/tmp/dbbench", &db_);
|
Status s = DB::Open(options, "/tmp/dbbench", &db_);
|
||||||
@ -462,6 +480,10 @@ class Benchmark {
|
|||||||
void Write(const WriteOptions& options, Order order, DBState state,
|
void Write(const WriteOptions& options, Order order, DBState state,
|
||||||
int num_entries, int value_size, int entries_per_batch) {
|
int num_entries, int value_size, int entries_per_batch) {
|
||||||
if (state == FRESH) {
|
if (state == FRESH) {
|
||||||
|
if (FLAGS_use_existing_db) {
|
||||||
|
message_ = "skipping (--use_existing_db is true)";
|
||||||
|
return;
|
||||||
|
}
|
||||||
delete db_;
|
delete db_;
|
||||||
db_ = NULL;
|
db_ = NULL;
|
||||||
DestroyDB("/tmp/dbbench", Options());
|
DestroyDB("/tmp/dbbench", Options());
|
||||||
@ -499,7 +521,7 @@ class Benchmark {
|
|||||||
void ReadSequential() {
|
void ReadSequential() {
|
||||||
Iterator* iter = db_->NewIterator(ReadOptions());
|
Iterator* iter = db_->NewIterator(ReadOptions());
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) {
|
for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
|
||||||
bytes_ += iter->key().size() + iter->value().size();
|
bytes_ += iter->key().size() + iter->value().size();
|
||||||
FinishedSingleOp();
|
FinishedSingleOp();
|
||||||
++i;
|
++i;
|
||||||
@ -510,7 +532,7 @@ class Benchmark {
|
|||||||
void ReadReverse() {
|
void ReadReverse() {
|
||||||
Iterator* iter = db_->NewIterator(ReadOptions());
|
Iterator* iter = db_->NewIterator(ReadOptions());
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) {
|
for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
|
||||||
bytes_ += iter->key().size() + iter->value().size();
|
bytes_ += iter->key().size() + iter->value().size();
|
||||||
FinishedSingleOp();
|
FinishedSingleOp();
|
||||||
++i;
|
++i;
|
||||||
@ -521,7 +543,7 @@ class Benchmark {
|
|||||||
void ReadRandom() {
|
void ReadRandom() {
|
||||||
ReadOptions options;
|
ReadOptions options;
|
||||||
std::string value;
|
std::string value;
|
||||||
for (int i = 0; i < num_; i++) {
|
for (int i = 0; i < reads_; i++) {
|
||||||
char key[100];
|
char key[100];
|
||||||
const int k = rand_.Next() % FLAGS_num;
|
const int k = rand_.Next() % FLAGS_num;
|
||||||
snprintf(key, sizeof(key), "%016d", k);
|
snprintf(key, sizeof(key), "%016d", k);
|
||||||
@ -530,6 +552,19 @@ class Benchmark {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ReadHot() {
|
||||||
|
ReadOptions options;
|
||||||
|
std::string value;
|
||||||
|
const int range = (FLAGS_num + 99) / 100;
|
||||||
|
for (int i = 0; i < reads_; i++) {
|
||||||
|
char key[100];
|
||||||
|
const int k = rand_.Next() % range;
|
||||||
|
snprintf(key, sizeof(key), "%016d", k);
|
||||||
|
db_->Get(options, key, &value);
|
||||||
|
FinishedSingleOp();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void Compact() {
|
void Compact() {
|
||||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||||
dbi->TEST_CompactMemTable();
|
dbi->TEST_CompactMemTable();
|
||||||
@ -582,6 +617,8 @@ class Benchmark {
|
|||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
|
FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
|
||||||
|
FLAGS_open_files = leveldb::Options().max_open_files;
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
double d;
|
double d;
|
||||||
int n;
|
int n;
|
||||||
@ -593,14 +630,21 @@ int main(int argc, char** argv) {
|
|||||||
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
|
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
|
||||||
(n == 0 || n == 1)) {
|
(n == 0 || n == 1)) {
|
||||||
FLAGS_histogram = n;
|
FLAGS_histogram = n;
|
||||||
|
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
|
||||||
|
(n == 0 || n == 1)) {
|
||||||
|
FLAGS_use_existing_db = n;
|
||||||
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
|
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
|
||||||
FLAGS_num = n;
|
FLAGS_num = n;
|
||||||
|
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
|
||||||
|
FLAGS_reads = n;
|
||||||
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
|
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
|
||||||
FLAGS_value_size = n;
|
FLAGS_value_size = n;
|
||||||
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
|
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
|
||||||
FLAGS_write_buffer_size = n;
|
FLAGS_write_buffer_size = n;
|
||||||
} else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
|
} else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
|
||||||
FLAGS_cache_size = n;
|
FLAGS_cache_size = n;
|
||||||
|
} else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
|
||||||
|
FLAGS_open_files = n;
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
|
fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
|
||||||
exit(1);
|
exit(1);
|
||||||
|
@ -126,6 +126,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
|
|||||||
log_(NULL),
|
log_(NULL),
|
||||||
bg_compaction_scheduled_(false),
|
bg_compaction_scheduled_(false),
|
||||||
compacting_(false) {
|
compacting_(false) {
|
||||||
|
mem_->Ref();
|
||||||
has_imm_.Release_Store(NULL);
|
has_imm_.Release_Store(NULL);
|
||||||
|
|
||||||
// Reserve ten files or so for other uses and give the rest to TableCache.
|
// Reserve ten files or so for other uses and give the rest to TableCache.
|
||||||
@ -152,8 +153,8 @@ DBImpl::~DBImpl() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
delete versions_;
|
delete versions_;
|
||||||
delete mem_;
|
if (mem_ != NULL) mem_->Unref();
|
||||||
delete imm_;
|
if (imm_ != NULL) imm_->Unref();
|
||||||
delete log_;
|
delete log_;
|
||||||
delete logfile_;
|
delete logfile_;
|
||||||
delete table_cache_;
|
delete table_cache_;
|
||||||
@ -344,7 +345,8 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
|
|||||||
// paranoid_checks==false so that corruptions cause entire commits
|
// paranoid_checks==false so that corruptions cause entire commits
|
||||||
// to be skipped instead of propagating bad information (like overly
|
// to be skipped instead of propagating bad information (like overly
|
||||||
// large sequence numbers).
|
// large sequence numbers).
|
||||||
log::Reader reader(file, &reporter, true/*checksum*/);
|
log::Reader reader(file, &reporter, true/*checksum*/,
|
||||||
|
0/*initial_offset*/);
|
||||||
Log(env_, options_.info_log, "Recovering log #%llu",
|
Log(env_, options_.info_log, "Recovering log #%llu",
|
||||||
(unsigned long long) log_number);
|
(unsigned long long) log_number);
|
||||||
|
|
||||||
@ -364,6 +366,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
|
|||||||
|
|
||||||
if (mem == NULL) {
|
if (mem == NULL) {
|
||||||
mem = new MemTable(internal_comparator_);
|
mem = new MemTable(internal_comparator_);
|
||||||
|
mem->Ref();
|
||||||
}
|
}
|
||||||
status = WriteBatchInternal::InsertInto(&batch, mem);
|
status = WriteBatchInternal::InsertInto(&batch, mem);
|
||||||
MaybeIgnoreError(&status);
|
MaybeIgnoreError(&status);
|
||||||
@ -384,7 +387,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
|
|||||||
// file-systems cause the DB::Open() to fail.
|
// file-systems cause the DB::Open() to fail.
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
delete mem;
|
mem->Unref();
|
||||||
mem = NULL;
|
mem = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -395,7 +398,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
|
|||||||
// file-systems cause the DB::Open() to fail.
|
// file-systems cause the DB::Open() to fail.
|
||||||
}
|
}
|
||||||
|
|
||||||
delete mem;
|
if (mem != NULL) mem->Unref();
|
||||||
delete file;
|
delete file;
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
@ -443,11 +446,12 @@ Status DBImpl::CompactMemTable() {
|
|||||||
// Replace immutable memtable with the generated Table
|
// Replace immutable memtable with the generated Table
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
edit.SetPrevLogNumber(0);
|
edit.SetPrevLogNumber(0);
|
||||||
s = versions_->LogAndApply(&edit, imm_);
|
s = versions_->LogAndApply(&edit);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
// Commit to the new state
|
// Commit to the new state
|
||||||
|
imm_->Unref();
|
||||||
imm_ = NULL;
|
imm_ = NULL;
|
||||||
has_imm_.Release_Store(NULL);
|
has_imm_.Release_Store(NULL);
|
||||||
DeleteObsoleteFiles();
|
DeleteObsoleteFiles();
|
||||||
@ -556,7 +560,7 @@ void DBImpl::BackgroundCompaction() {
|
|||||||
c->edit()->DeleteFile(c->level(), f->number);
|
c->edit()->DeleteFile(c->level(), f->number);
|
||||||
c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
|
c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
|
||||||
f->smallest, f->largest);
|
f->smallest, f->largest);
|
||||||
status = versions_->LogAndApply(c->edit(), NULL);
|
status = versions_->LogAndApply(c->edit());
|
||||||
Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n",
|
Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n",
|
||||||
static_cast<unsigned long long>(f->number),
|
static_cast<unsigned long long>(f->number),
|
||||||
c->level() + 1,
|
c->level() + 1,
|
||||||
@ -697,7 +701,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
|
|||||||
}
|
}
|
||||||
compact->outputs.clear();
|
compact->outputs.clear();
|
||||||
|
|
||||||
Status s = versions_->LogAndApply(compact->compaction->edit(), NULL);
|
Status s = versions_->LogAndApply(compact->compaction->edit());
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
compact->compaction->ReleaseInputs();
|
compact->compaction->ReleaseInputs();
|
||||||
DeleteObsoleteFiles();
|
DeleteObsoleteFiles();
|
||||||
@ -754,9 +758,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Slice key = input->key();
|
Slice key = input->key();
|
||||||
InternalKey tmp_internal_key;
|
if (compact->compaction->ShouldStopBefore(key) &&
|
||||||
tmp_internal_key.DecodeFrom(key);
|
|
||||||
if (compact->compaction->ShouldStopBefore(tmp_internal_key) &&
|
|
||||||
compact->builder != NULL) {
|
compact->builder != NULL) {
|
||||||
status = FinishCompactionOutputFile(compact, input);
|
status = FinishCompactionOutputFile(compact, input);
|
||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
@ -867,6 +869,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
|
|||||||
}
|
}
|
||||||
compacting_ = false;
|
compacting_ = false;
|
||||||
compacting_cv_.SignalAll();
|
compacting_cv_.SignalAll();
|
||||||
|
VersionSet::LevelSummaryStorage tmp;
|
||||||
|
Log(env_, options_.info_log,
|
||||||
|
"compacted to: %s", versions_->LevelSummary(&tmp));
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -925,10 +930,11 @@ Status DBImpl::Get(const ReadOptions& options,
|
|||||||
Iterator* DBImpl::NewIterator(const ReadOptions& options) {
|
Iterator* DBImpl::NewIterator(const ReadOptions& options) {
|
||||||
SequenceNumber latest_snapshot;
|
SequenceNumber latest_snapshot;
|
||||||
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
|
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
|
||||||
SequenceNumber sequence =
|
return NewDBIterator(
|
||||||
(options.snapshot ? options.snapshot->number_ : latest_snapshot);
|
&dbname_, env_, user_comparator(), internal_iter,
|
||||||
return NewDBIterator(&dbname_, env_,
|
(options.snapshot != NULL
|
||||||
user_comparator(), internal_iter, sequence);
|
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
|
||||||
|
: latest_snapshot));
|
||||||
}
|
}
|
||||||
|
|
||||||
void DBImpl::Unref(void* arg1, void* arg2) {
|
void DBImpl::Unref(void* arg1, void* arg2) {
|
||||||
@ -945,7 +951,7 @@ const Snapshot* DBImpl::GetSnapshot() {
|
|||||||
|
|
||||||
void DBImpl::ReleaseSnapshot(const Snapshot* s) {
|
void DBImpl::ReleaseSnapshot(const Snapshot* s) {
|
||||||
MutexLock l(&mutex_);
|
MutexLock l(&mutex_);
|
||||||
snapshots_.Delete(s);
|
snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convenience methods
|
// Convenience methods
|
||||||
@ -985,12 +991,26 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
|
|||||||
|
|
||||||
Status DBImpl::MakeRoomForWrite(bool force) {
|
Status DBImpl::MakeRoomForWrite(bool force) {
|
||||||
mutex_.AssertHeld();
|
mutex_.AssertHeld();
|
||||||
|
bool allow_delay = !force;
|
||||||
Status s;
|
Status s;
|
||||||
while (true) {
|
while (true) {
|
||||||
if (!bg_error_.ok()) {
|
if (!bg_error_.ok()) {
|
||||||
// Yield previous error
|
// Yield previous error
|
||||||
s = bg_error_;
|
s = bg_error_;
|
||||||
break;
|
break;
|
||||||
|
} else if (
|
||||||
|
allow_delay &&
|
||||||
|
versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
|
||||||
|
// We are getting close to hitting a hard limit on the number of
|
||||||
|
// L0 files. Rather than delaying a single write by several
|
||||||
|
// seconds when we hit the hard limit, start delaying each
|
||||||
|
// individual write by 1ms to reduce latency variance. Also,
|
||||||
|
// this delay hands over some CPU to the compaction thread in
|
||||||
|
// case it is sharing the same core as the writer.
|
||||||
|
mutex_.Unlock();
|
||||||
|
env_->SleepForMicroseconds(1000);
|
||||||
|
allow_delay = false; // Do not delay a single write more than once
|
||||||
|
mutex_.Lock();
|
||||||
} else if (!force &&
|
} else if (!force &&
|
||||||
(mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
|
(mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
|
||||||
// There is room in current memtable
|
// There is room in current memtable
|
||||||
@ -999,6 +1019,9 @@ Status DBImpl::MakeRoomForWrite(bool force) {
|
|||||||
// We have filled up the current memtable, but the previous
|
// We have filled up the current memtable, but the previous
|
||||||
// one is still being compacted, so we wait.
|
// one is still being compacted, so we wait.
|
||||||
compacting_cv_.Wait();
|
compacting_cv_.Wait();
|
||||||
|
} else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
|
||||||
|
// There are too many level-0 files.
|
||||||
|
compacting_cv_.Wait();
|
||||||
} else {
|
} else {
|
||||||
// Attempt to switch to a new memtable and trigger compaction of old
|
// Attempt to switch to a new memtable and trigger compaction of old
|
||||||
assert(versions_->PrevLogNumber() == 0);
|
assert(versions_->PrevLogNumber() == 0);
|
||||||
@ -1011,7 +1034,7 @@ Status DBImpl::MakeRoomForWrite(bool force) {
|
|||||||
VersionEdit edit;
|
VersionEdit edit;
|
||||||
edit.SetPrevLogNumber(versions_->LogNumber());
|
edit.SetPrevLogNumber(versions_->LogNumber());
|
||||||
edit.SetLogNumber(new_log_number);
|
edit.SetLogNumber(new_log_number);
|
||||||
s = versions_->LogAndApply(&edit, NULL);
|
s = versions_->LogAndApply(&edit);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
delete lfile;
|
delete lfile;
|
||||||
env_->DeleteFile(LogFileName(dbname_, new_log_number));
|
env_->DeleteFile(LogFileName(dbname_, new_log_number));
|
||||||
@ -1024,6 +1047,7 @@ Status DBImpl::MakeRoomForWrite(bool force) {
|
|||||||
imm_ = mem_;
|
imm_ = mem_;
|
||||||
has_imm_.Release_Store(imm_);
|
has_imm_.Release_Store(imm_);
|
||||||
mem_ = new MemTable(internal_comparator_);
|
mem_ = new MemTable(internal_comparator_);
|
||||||
|
mem_->Ref();
|
||||||
force = false; // Do not force another compaction if have room
|
force = false; // Do not force another compaction if have room
|
||||||
MaybeScheduleCompaction();
|
MaybeScheduleCompaction();
|
||||||
}
|
}
|
||||||
@ -1141,10 +1165,11 @@ Status DB::Open(const Options& options, const std::string& dbname,
|
|||||||
edit.SetLogNumber(new_log_number);
|
edit.SetLogNumber(new_log_number);
|
||||||
impl->logfile_ = lfile;
|
impl->logfile_ = lfile;
|
||||||
impl->log_ = new log::Writer(lfile);
|
impl->log_ = new log::Writer(lfile);
|
||||||
s = impl->versions_->LogAndApply(&edit, NULL);
|
s = impl->versions_->LogAndApply(&edit);
|
||||||
}
|
}
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
impl->DeleteObsoleteFiles();
|
impl->DeleteObsoleteFiles();
|
||||||
|
impl->MaybeScheduleCompaction();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl->mutex_.Unlock();
|
impl->mutex_.Unlock();
|
||||||
@ -1156,6 +1181,9 @@ Status DB::Open(const Options& options, const std::string& dbname,
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Snapshot::~Snapshot() {
|
||||||
|
}
|
||||||
|
|
||||||
Status DestroyDB(const std::string& dbname, const Options& options) {
|
Status DestroyDB(const std::string& dbname, const Options& options) {
|
||||||
Env* env = options.env;
|
Env* env = options.env;
|
||||||
std::vector<std::string> filenames;
|
std::vector<std::string> filenames;
|
||||||
|
107
db/db_test.cc
107
db/db_test.cc
@ -3,7 +3,6 @@
|
|||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
#include "leveldb/db.h"
|
#include "leveldb/db.h"
|
||||||
|
|
||||||
#include "db/db_impl.h"
|
#include "db/db_impl.h"
|
||||||
#include "db/filename.h"
|
#include "db/filename.h"
|
||||||
#include "db/version_set.h"
|
#include "db/version_set.h"
|
||||||
@ -802,8 +801,17 @@ TEST(DBTest, DBOpen_Options) {
|
|||||||
db = NULL;
|
db = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
typedef std::map<std::string, std::string> KVMap;
|
||||||
|
}
|
||||||
|
|
||||||
class ModelDB: public DB {
|
class ModelDB: public DB {
|
||||||
public:
|
public:
|
||||||
|
class ModelSnapshot : public Snapshot {
|
||||||
|
public:
|
||||||
|
KVMap map_;
|
||||||
|
};
|
||||||
|
|
||||||
explicit ModelDB(const Options& options): options_(options) { }
|
explicit ModelDB(const Options& options): options_(options) { }
|
||||||
~ModelDB() { }
|
~ModelDB() { }
|
||||||
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
|
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
|
||||||
@ -824,35 +832,34 @@ class ModelDB: public DB {
|
|||||||
return new ModelIter(saved, true);
|
return new ModelIter(saved, true);
|
||||||
} else {
|
} else {
|
||||||
const KVMap* snapshot_state =
|
const KVMap* snapshot_state =
|
||||||
reinterpret_cast<const KVMap*>(options.snapshot->number_);
|
&(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
|
||||||
return new ModelIter(snapshot_state, false);
|
return new ModelIter(snapshot_state, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
virtual const Snapshot* GetSnapshot() {
|
virtual const Snapshot* GetSnapshot() {
|
||||||
KVMap* saved = new KVMap;
|
ModelSnapshot* snapshot = new ModelSnapshot;
|
||||||
*saved = map_;
|
snapshot->map_ = map_;
|
||||||
return snapshots_.New(
|
return snapshot;
|
||||||
reinterpret_cast<SequenceNumber>(saved));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void ReleaseSnapshot(const Snapshot* snapshot) {
|
virtual void ReleaseSnapshot(const Snapshot* snapshot) {
|
||||||
const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_);
|
delete reinterpret_cast<const ModelSnapshot*>(snapshot);
|
||||||
delete saved;
|
|
||||||
snapshots_.Delete(snapshot);
|
|
||||||
}
|
}
|
||||||
virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
|
virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
|
||||||
assert(options.post_write_snapshot == NULL); // Not supported
|
assert(options.post_write_snapshot == NULL); // Not supported
|
||||||
for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) {
|
class Handler : public WriteBatch::Handler {
|
||||||
switch (it.op()) {
|
public:
|
||||||
case kTypeValue:
|
KVMap* map_;
|
||||||
map_[it.key().ToString()] = it.value().ToString();
|
virtual void Put(const Slice& key, const Slice& value) {
|
||||||
break;
|
(*map_)[key.ToString()] = value.ToString();
|
||||||
case kTypeDeletion:
|
|
||||||
map_.erase(it.key().ToString());
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
virtual void Delete(const Slice& key) {
|
||||||
|
map_->erase(key.ToString());
|
||||||
}
|
}
|
||||||
return Status::OK();
|
};
|
||||||
|
Handler handler;
|
||||||
|
handler.map_ = &map_;
|
||||||
|
return batch->Iterate(&handler);
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool GetProperty(const Slice& property, std::string* value) {
|
virtual bool GetProperty(const Slice& property, std::string* value) {
|
||||||
@ -864,7 +871,6 @@ class ModelDB: public DB {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
typedef std::map<std::string, std::string> KVMap;
|
|
||||||
class ModelIter: public Iterator {
|
class ModelIter: public Iterator {
|
||||||
public:
|
public:
|
||||||
ModelIter(const KVMap* map, bool owned)
|
ModelIter(const KVMap* map, bool owned)
|
||||||
@ -897,7 +903,6 @@ class ModelDB: public DB {
|
|||||||
};
|
};
|
||||||
const Options options_;
|
const Options options_;
|
||||||
KVMap map_;
|
KVMap map_;
|
||||||
SnapshotList snapshots_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static std::string RandomKey(Random* rnd) {
|
static std::string RandomKey(Random* rnd) {
|
||||||
@ -1023,8 +1028,70 @@ TEST(DBTest, Randomized) {
|
|||||||
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
|
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string MakeKey(unsigned int num) {
|
||||||
|
char buf[30];
|
||||||
|
snprintf(buf, sizeof(buf), "%016u", num);
|
||||||
|
return std::string(buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
void BM_LogAndApply(int iters, int num_base_files) {
|
||||||
|
std::string dbname = test::TmpDir() + "/leveldb_test_benchmark";
|
||||||
|
DestroyDB(dbname, Options());
|
||||||
|
|
||||||
|
DB* db = NULL;
|
||||||
|
Options opts;
|
||||||
|
opts.create_if_missing = true;
|
||||||
|
Status s = DB::Open(opts, dbname, &db);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
ASSERT_TRUE(db != NULL);
|
||||||
|
|
||||||
|
delete db;
|
||||||
|
db = NULL;
|
||||||
|
|
||||||
|
Env* env = Env::Default();
|
||||||
|
|
||||||
|
InternalKeyComparator cmp(BytewiseComparator());
|
||||||
|
Options options;
|
||||||
|
VersionSet vset(dbname, &options, NULL, &cmp);
|
||||||
|
ASSERT_OK(vset.Recover());
|
||||||
|
VersionEdit vbase;
|
||||||
|
uint64_t fnum = 1;
|
||||||
|
for (int i = 0; i < num_base_files; i++) {
|
||||||
|
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
|
||||||
|
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
|
||||||
|
vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
|
||||||
|
}
|
||||||
|
ASSERT_OK(vset.LogAndApply(&vbase));
|
||||||
|
|
||||||
|
uint64_t start_micros = env->NowMicros();
|
||||||
|
|
||||||
|
for (int i = 0; i < iters; i++) {
|
||||||
|
VersionEdit vedit;
|
||||||
|
vedit.DeleteFile(2, fnum);
|
||||||
|
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
|
||||||
|
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
|
||||||
|
vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
|
||||||
|
vset.LogAndApply(&vedit);
|
||||||
|
}
|
||||||
|
uint64_t stop_micros = env->NowMicros();
|
||||||
|
unsigned int us = stop_micros - start_micros;
|
||||||
|
char buf[16];
|
||||||
|
snprintf(buf, sizeof(buf), "%d", num_base_files);
|
||||||
|
fprintf(stderr,
|
||||||
|
"BM_LogAndApply/%-6s %8d iters : %9u us (%7.0f us / iter)\n",
|
||||||
|
buf, iters, us, ((float)us) / iters);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
if (argc > 1 && std::string(argv[1]) == "--benchmark") {
|
||||||
|
leveldb::BM_LogAndApply(1000, 1);
|
||||||
|
leveldb::BM_LogAndApply(1000, 100);
|
||||||
|
leveldb::BM_LogAndApply(1000, 10000);
|
||||||
|
leveldb::BM_LogAndApply(100, 100000);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
return leveldb::test::RunAllTests();
|
return leveldb::test::RunAllTests();
|
||||||
}
|
}
|
||||||
|
@ -19,6 +19,16 @@ namespace leveldb {
|
|||||||
// parameters set via options.
|
// parameters set via options.
|
||||||
namespace config {
|
namespace config {
|
||||||
static const int kNumLevels = 7;
|
static const int kNumLevels = 7;
|
||||||
|
|
||||||
|
// Level-0 compaction is started when we hit this many files.
|
||||||
|
static const int kL0_CompactionTrigger = 4;
|
||||||
|
|
||||||
|
// Soft limit on number of level-0 files. We slow down writes at this point.
|
||||||
|
static const int kL0_SlowdownWritesTrigger = 8;
|
||||||
|
|
||||||
|
// Maximum number of level-0 files. We stop writes at this point.
|
||||||
|
static const int kL0_StopWritesTrigger = 12;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class InternalKey;
|
class InternalKey;
|
||||||
|
116
db/log_reader.cc
116
db/log_reader.cc
@ -4,7 +4,6 @@
|
|||||||
|
|
||||||
#include "db/log_reader.h"
|
#include "db/log_reader.h"
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
#include "leveldb/env.h"
|
#include "leveldb/env.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
#include "util/crc32c.h"
|
#include "util/crc32c.h"
|
||||||
@ -15,46 +14,104 @@ namespace log {
|
|||||||
Reader::Reporter::~Reporter() {
|
Reader::Reporter::~Reporter() {
|
||||||
}
|
}
|
||||||
|
|
||||||
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum)
|
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
|
||||||
|
uint64_t initial_offset)
|
||||||
: file_(file),
|
: file_(file),
|
||||||
reporter_(reporter),
|
reporter_(reporter),
|
||||||
checksum_(checksum),
|
checksum_(checksum),
|
||||||
backing_store_(new char[kBlockSize]),
|
backing_store_(new char[kBlockSize]),
|
||||||
buffer_(),
|
buffer_(),
|
||||||
eof_(false) {
|
eof_(false),
|
||||||
|
last_record_offset_(0),
|
||||||
|
end_of_buffer_offset_(0),
|
||||||
|
initial_offset_(initial_offset) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Reader::~Reader() {
|
Reader::~Reader() {
|
||||||
delete[] backing_store_;
|
delete[] backing_store_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Reader::SkipToInitialBlock() {
|
||||||
|
size_t offset_in_block = initial_offset_ % kBlockSize;
|
||||||
|
uint64_t block_start_location = initial_offset_ - offset_in_block;
|
||||||
|
|
||||||
|
// Don't search a block if we'd be in the trailer
|
||||||
|
if (offset_in_block > kBlockSize - 6) {
|
||||||
|
offset_in_block = 0;
|
||||||
|
block_start_location += kBlockSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
end_of_buffer_offset_ = block_start_location;
|
||||||
|
|
||||||
|
// Skip to start of first block that can contain the initial record
|
||||||
|
if (block_start_location > 0) {
|
||||||
|
Status skip_status = file_->Skip(block_start_location);
|
||||||
|
if (!skip_status.ok()) {
|
||||||
|
ReportDrop(block_start_location, skip_status);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
|
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
|
||||||
|
if (last_record_offset_ < initial_offset_) {
|
||||||
|
if (!SkipToInitialBlock()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
scratch->clear();
|
scratch->clear();
|
||||||
record->clear();
|
record->clear();
|
||||||
bool in_fragmented_record = false;
|
bool in_fragmented_record = false;
|
||||||
|
// Record offset of the logical record that we're reading
|
||||||
|
// 0 is a dummy value to make compilers happy
|
||||||
|
uint64_t prospective_record_offset = 0;
|
||||||
|
|
||||||
Slice fragment;
|
Slice fragment;
|
||||||
while (true) {
|
while (true) {
|
||||||
|
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
|
||||||
switch (ReadPhysicalRecord(&fragment)) {
|
switch (ReadPhysicalRecord(&fragment)) {
|
||||||
case kFullType:
|
case kFullType:
|
||||||
if (in_fragmented_record) {
|
if (in_fragmented_record) {
|
||||||
ReportDrop(scratch->size(), "partial record without end");
|
// Handle bug in earlier versions of log::Writer where
|
||||||
|
// it could emit an empty kFirstType record at the tail end
|
||||||
|
// of a block followed by a kFullType or kFirstType record
|
||||||
|
// at the beginning of the next block.
|
||||||
|
if (scratch->empty()) {
|
||||||
|
in_fragmented_record = false;
|
||||||
|
} else {
|
||||||
|
ReportCorruption(scratch->size(), "partial record without end(1)");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
prospective_record_offset = physical_record_offset;
|
||||||
scratch->clear();
|
scratch->clear();
|
||||||
*record = fragment;
|
*record = fragment;
|
||||||
|
last_record_offset_ = prospective_record_offset;
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
case kFirstType:
|
case kFirstType:
|
||||||
if (in_fragmented_record) {
|
if (in_fragmented_record) {
|
||||||
ReportDrop(scratch->size(), "partial record without end");
|
// Handle bug in earlier versions of log::Writer where
|
||||||
|
// it could emit an empty kFirstType record at the tail end
|
||||||
|
// of a block followed by a kFullType or kFirstType record
|
||||||
|
// at the beginning of the next block.
|
||||||
|
if (scratch->empty()) {
|
||||||
|
in_fragmented_record = false;
|
||||||
|
} else {
|
||||||
|
ReportCorruption(scratch->size(), "partial record without end(2)");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
prospective_record_offset = physical_record_offset;
|
||||||
scratch->assign(fragment.data(), fragment.size());
|
scratch->assign(fragment.data(), fragment.size());
|
||||||
in_fragmented_record = true;
|
in_fragmented_record = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case kMiddleType:
|
case kMiddleType:
|
||||||
if (!in_fragmented_record) {
|
if (!in_fragmented_record) {
|
||||||
ReportDrop(fragment.size(), "missing start of fragmented record");
|
ReportCorruption(fragment.size(),
|
||||||
|
"missing start of fragmented record(1)");
|
||||||
} else {
|
} else {
|
||||||
scratch->append(fragment.data(), fragment.size());
|
scratch->append(fragment.data(), fragment.size());
|
||||||
}
|
}
|
||||||
@ -62,31 +119,33 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
|
|||||||
|
|
||||||
case kLastType:
|
case kLastType:
|
||||||
if (!in_fragmented_record) {
|
if (!in_fragmented_record) {
|
||||||
ReportDrop(fragment.size(), "missing start of fragmented record");
|
ReportCorruption(fragment.size(),
|
||||||
|
"missing start of fragmented record(2)");
|
||||||
} else {
|
} else {
|
||||||
scratch->append(fragment.data(), fragment.size());
|
scratch->append(fragment.data(), fragment.size());
|
||||||
*record = Slice(*scratch);
|
*record = Slice(*scratch);
|
||||||
|
last_record_offset_ = prospective_record_offset;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case kEof:
|
case kEof:
|
||||||
if (in_fragmented_record) {
|
if (in_fragmented_record) {
|
||||||
ReportDrop(scratch->size(), "partial record without end");
|
ReportCorruption(scratch->size(), "partial record without end(3)");
|
||||||
scratch->clear();
|
scratch->clear();
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
case kBadRecord:
|
case kBadRecord:
|
||||||
if (in_fragmented_record) {
|
if (in_fragmented_record) {
|
||||||
ReportDrop(scratch->size(), "error in middle of record");
|
ReportCorruption(scratch->size(), "error in middle of record");
|
||||||
in_fragmented_record = false;
|
in_fragmented_record = false;
|
||||||
scratch->clear();
|
scratch->clear();
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
ReportDrop(
|
ReportCorruption(
|
||||||
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
|
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
|
||||||
"unknown record type");
|
"unknown record type");
|
||||||
in_fragmented_record = false;
|
in_fragmented_record = false;
|
||||||
@ -97,9 +156,18 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Reader::ReportDrop(size_t bytes, const char* reason) {
|
uint64_t Reader::LastRecordOffset() {
|
||||||
if (reporter_ != NULL) {
|
return last_record_offset_;
|
||||||
reporter_->Corruption(bytes, Status::Corruption(reason));
|
}
|
||||||
|
|
||||||
|
void Reader::ReportCorruption(size_t bytes, const char* reason) {
|
||||||
|
ReportDrop(bytes, Status::Corruption(reason));
|
||||||
|
}
|
||||||
|
|
||||||
|
void Reader::ReportDrop(size_t bytes, const Status& reason) {
|
||||||
|
if (reporter_ != NULL &&
|
||||||
|
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
|
||||||
|
reporter_->Corruption(bytes, reason);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -110,11 +178,10 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
|||||||
// Last read was a full read, so this is a trailer to skip
|
// Last read was a full read, so this is a trailer to skip
|
||||||
buffer_.clear();
|
buffer_.clear();
|
||||||
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
|
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
|
||||||
|
end_of_buffer_offset_ += buffer_.size();
|
||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
if (reporter_ != NULL) {
|
|
||||||
reporter_->Corruption(kBlockSize, status);
|
|
||||||
}
|
|
||||||
buffer_.clear();
|
buffer_.clear();
|
||||||
|
ReportDrop(kBlockSize, status);
|
||||||
eof_ = true;
|
eof_ = true;
|
||||||
return kEof;
|
return kEof;
|
||||||
} else if (buffer_.size() < kBlockSize) {
|
} else if (buffer_.size() < kBlockSize) {
|
||||||
@ -125,8 +192,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
|||||||
// End of file
|
// End of file
|
||||||
return kEof;
|
return kEof;
|
||||||
} else {
|
} else {
|
||||||
ReportDrop(buffer_.size(), "truncated record at end of file");
|
size_t drop_size = buffer_.size();
|
||||||
buffer_.clear();
|
buffer_.clear();
|
||||||
|
ReportCorruption(drop_size, "truncated record at end of file");
|
||||||
return kEof;
|
return kEof;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -138,8 +206,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
|||||||
const unsigned int type = header[6];
|
const unsigned int type = header[6];
|
||||||
const uint32_t length = a | (b << 8);
|
const uint32_t length = a | (b << 8);
|
||||||
if (kHeaderSize + length > buffer_.size()) {
|
if (kHeaderSize + length > buffer_.size()) {
|
||||||
ReportDrop(buffer_.size(), "bad record length");
|
size_t drop_size = buffer_.size();
|
||||||
buffer_.clear();
|
buffer_.clear();
|
||||||
|
ReportCorruption(drop_size, "bad record length");
|
||||||
return kBadRecord;
|
return kBadRecord;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -160,13 +229,22 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
|||||||
// been corrupted and if we trust it, we could find some
|
// been corrupted and if we trust it, we could find some
|
||||||
// fragment of a real log record that just happens to look
|
// fragment of a real log record that just happens to look
|
||||||
// like a valid log record.
|
// like a valid log record.
|
||||||
ReportDrop(buffer_.size(), "checksum mismatch");
|
size_t drop_size = buffer_.size();
|
||||||
buffer_.clear();
|
buffer_.clear();
|
||||||
|
ReportCorruption(drop_size, "checksum mismatch");
|
||||||
return kBadRecord;
|
return kBadRecord;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
buffer_.remove_prefix(kHeaderSize + length);
|
buffer_.remove_prefix(kHeaderSize + length);
|
||||||
|
|
||||||
|
// Skip physical record that started before initial_offset_
|
||||||
|
if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
|
||||||
|
initial_offset_) {
|
||||||
|
result->clear();
|
||||||
|
return kBadRecord;
|
||||||
|
}
|
||||||
|
|
||||||
*result = Slice(header + kHeaderSize, length);
|
*result = Slice(header + kHeaderSize, length);
|
||||||
return type;
|
return type;
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,8 @@
|
|||||||
#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
|
#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
|
||||||
#define STORAGE_LEVELDB_DB_LOG_READER_H_
|
#define STORAGE_LEVELDB_DB_LOG_READER_H_
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
#include "db/log_format.h"
|
#include "db/log_format.h"
|
||||||
#include "leveldb/slice.h"
|
#include "leveldb/slice.h"
|
||||||
#include "leveldb/status.h"
|
#include "leveldb/status.h"
|
||||||
@ -35,7 +37,11 @@ class Reader {
|
|||||||
// live while this Reader is in use.
|
// live while this Reader is in use.
|
||||||
//
|
//
|
||||||
// If "checksum" is true, verify checksums if available.
|
// If "checksum" is true, verify checksums if available.
|
||||||
Reader(SequentialFile* file, Reporter* reporter, bool checksum);
|
//
|
||||||
|
// The Reader will start reading at the first record located at physical
|
||||||
|
// position >= initial_offset within the file.
|
||||||
|
Reader(SequentialFile* file, Reporter* reporter, bool checksum,
|
||||||
|
uint64_t initial_offset);
|
||||||
|
|
||||||
~Reader();
|
~Reader();
|
||||||
|
|
||||||
@ -46,6 +52,11 @@ class Reader {
|
|||||||
// reader or the next mutation to *scratch.
|
// reader or the next mutation to *scratch.
|
||||||
bool ReadRecord(Slice* record, std::string* scratch);
|
bool ReadRecord(Slice* record, std::string* scratch);
|
||||||
|
|
||||||
|
// Returns the physical offset of the last record returned by ReadRecord.
|
||||||
|
//
|
||||||
|
// Undefined before the first call to ReadRecord.
|
||||||
|
uint64_t LastRecordOffset();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
SequentialFile* const file_;
|
SequentialFile* const file_;
|
||||||
Reporter* const reporter_;
|
Reporter* const reporter_;
|
||||||
@ -54,15 +65,37 @@ class Reader {
|
|||||||
Slice buffer_;
|
Slice buffer_;
|
||||||
bool eof_; // Last Read() indicated EOF by returning < kBlockSize
|
bool eof_; // Last Read() indicated EOF by returning < kBlockSize
|
||||||
|
|
||||||
|
// Offset of the last record returned by ReadRecord.
|
||||||
|
uint64_t last_record_offset_;
|
||||||
|
// Offset of the first location past the end of buffer_.
|
||||||
|
uint64_t end_of_buffer_offset_;
|
||||||
|
|
||||||
|
// Offset at which to start looking for the first record to return
|
||||||
|
uint64_t const initial_offset_;
|
||||||
|
|
||||||
// Extend record types with the following special values
|
// Extend record types with the following special values
|
||||||
enum {
|
enum {
|
||||||
kEof = kMaxRecordType + 1,
|
kEof = kMaxRecordType + 1,
|
||||||
|
// Returned whenever we find an invalid physical record.
|
||||||
|
// Currently there are three situations in which this happens:
|
||||||
|
// * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
|
||||||
|
// * The record is a 0-length record (No drop is reported)
|
||||||
|
// * The record is below constructor's initial_offset (No drop is reported)
|
||||||
kBadRecord = kMaxRecordType + 2
|
kBadRecord = kMaxRecordType + 2
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Skips all blocks that are completely before "initial_offset_".
|
||||||
|
//
|
||||||
|
// Returns true on success. Handles reporting.
|
||||||
|
bool SkipToInitialBlock();
|
||||||
|
|
||||||
// Return type, or one of the preceding special values
|
// Return type, or one of the preceding special values
|
||||||
unsigned int ReadPhysicalRecord(Slice* result);
|
unsigned int ReadPhysicalRecord(Slice* result);
|
||||||
void ReportDrop(size_t bytes, const char* reason);
|
|
||||||
|
// Reports dropped bytes to the reporter.
|
||||||
|
// buffer_ must be updated to remove the dropped bytes prior to invocation.
|
||||||
|
void ReportCorruption(size_t bytes, const char* reason);
|
||||||
|
void ReportDrop(size_t bytes, const Status& reason);
|
||||||
|
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
Reader(const Reader&);
|
Reader(const Reader&);
|
||||||
|
143
db/log_test.cc
143
db/log_test.cc
@ -60,7 +60,6 @@ class LogTest {
|
|||||||
|
|
||||||
virtual Status Read(size_t n, Slice* result, char* scratch) {
|
virtual Status Read(size_t n, Slice* result, char* scratch) {
|
||||||
ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
|
ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
|
||||||
ASSERT_EQ(kBlockSize, n);
|
|
||||||
|
|
||||||
if (force_error_) {
|
if (force_error_) {
|
||||||
force_error_ = false;
|
force_error_ = false;
|
||||||
@ -76,6 +75,17 @@ class LogTest {
|
|||||||
contents_.remove_prefix(n);
|
contents_.remove_prefix(n);
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Advances the in-memory file past its next n bytes.
//
// If fewer than n bytes remain, the remaining contents are discarded and a
// NotFound status is returned; otherwise returns OK.
virtual Status Skip(size_t n) {
  if (n > contents_.size()) {
    contents_.clear();
    return Status::NotFound("in-memory file skipped past end");
  }

  contents_.remove_prefix(n);

  return Status::OK();
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class ReportCollector : public Reader::Reporter {
|
class ReportCollector : public Reader::Reporter {
|
||||||
@ -97,10 +107,15 @@ class LogTest {
|
|||||||
Writer writer_;
|
Writer writer_;
|
||||||
Reader reader_;
|
Reader reader_;
|
||||||
|
|
||||||
|
// Record metadata for testing initial offset functionality
|
||||||
|
static size_t initial_offset_record_sizes_[];
|
||||||
|
static uint64_t initial_offset_last_record_offsets_[];
|
||||||
|
|
||||||
public:
|
public:
|
||||||
LogTest() : reading_(false),
|
LogTest() : reading_(false),
|
||||||
writer_(&dest_),
|
writer_(&dest_),
|
||||||
reader_(&source_, &report_, true/*checksum*/) {
|
reader_(&source_, &report_, true/*checksum*/,
|
||||||
|
0/*initial_offset*/) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Write(const std::string& msg) {
|
void Write(const std::string& msg) {
|
||||||
@ -153,6 +168,10 @@ class LogTest {
|
|||||||
return report_.dropped_bytes_;
|
return report_.dropped_bytes_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string ReportMessage() const {
|
||||||
|
return report_.message_;
|
||||||
|
}
|
||||||
|
|
||||||
// Returns OK iff recorded error message contains "msg"
|
// Returns OK iff recorded error message contains "msg"
|
||||||
std::string MatchError(const std::string& msg) const {
|
std::string MatchError(const std::string& msg) const {
|
||||||
if (report_.message_.find(msg) == std::string::npos) {
|
if (report_.message_.find(msg) == std::string::npos) {
|
||||||
@ -161,8 +180,61 @@ class LogTest {
|
|||||||
return "OK";
|
return "OK";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void WriteInitialOffsetLog() {
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
std::string record(initial_offset_record_sizes_[i],
|
||||||
|
static_cast<char>('a' + i));
|
||||||
|
Write(record);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
|
||||||
|
WriteInitialOffsetLog();
|
||||||
|
reading_ = true;
|
||||||
|
source_.contents_ = Slice(dest_.contents_);
|
||||||
|
Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
|
||||||
|
WrittenBytes() + offset_past_end);
|
||||||
|
Slice record;
|
||||||
|
std::string scratch;
|
||||||
|
ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
|
||||||
|
delete offset_reader;
|
||||||
|
}
|
||||||
|
|
||||||
|
void CheckInitialOffsetRecord(uint64_t initial_offset,
|
||||||
|
int expected_record_offset) {
|
||||||
|
WriteInitialOffsetLog();
|
||||||
|
reading_ = true;
|
||||||
|
source_.contents_ = Slice(dest_.contents_);
|
||||||
|
Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
|
||||||
|
initial_offset);
|
||||||
|
Slice record;
|
||||||
|
std::string scratch;
|
||||||
|
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
|
||||||
|
ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
|
||||||
|
record.size());
|
||||||
|
ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
|
||||||
|
offset_reader->LastRecordOffset());
|
||||||
|
ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
|
||||||
|
delete offset_reader;
|
||||||
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
size_t LogTest::initial_offset_record_sizes_[] =
|
||||||
|
{10000, // Two sizable records in first block
|
||||||
|
10000,
|
||||||
|
2 * log::kBlockSize - 1000, // Span three blocks
|
||||||
|
1};
|
||||||
|
|
||||||
|
uint64_t LogTest::initial_offset_last_record_offsets_[] =
|
||||||
|
{0,
|
||||||
|
kHeaderSize + 10000,
|
||||||
|
2 * (kHeaderSize + 10000),
|
||||||
|
2 * (kHeaderSize + 10000) +
|
||||||
|
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
|
||||||
|
|
||||||
|
|
||||||
TEST(LogTest, Empty) {
|
TEST(LogTest, Empty) {
|
||||||
ASSERT_EQ("EOF", Read());
|
ASSERT_EQ("EOF", Read());
|
||||||
}
|
}
|
||||||
@ -213,6 +285,19 @@ TEST(LogTest, MarginalTrailer) {
|
|||||||
ASSERT_EQ("EOF", Read());
|
ASSERT_EQ("EOF", Read());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(LogTest, MarginalTrailer2) {
  // Make a trailer that is exactly the same length as an empty record.
  const int payload_size = kBlockSize - 2*kHeaderSize;
  const std::string payload = BigString("foo", payload_size);

  Write(payload);
  ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
  Write("bar");

  // Both records must read back intact, with nothing reported as dropped.
  ASSERT_EQ(payload, Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(0, DroppedBytes());
  ASSERT_EQ("", ReportMessage());
}
|
||||||
|
|
||||||
TEST(LogTest, ShortTrailer) {
|
TEST(LogTest, ShortTrailer) {
|
||||||
const int n = kBlockSize - 2*kHeaderSize + 4;
|
const int n = kBlockSize - 2*kHeaderSize + 4;
|
||||||
Write(BigString("foo", n));
|
Write(BigString("foo", n));
|
||||||
@ -353,6 +438,60 @@ TEST(LogTest, ErrorJoinsRecords) {
|
|||||||
ASSERT_GE(dropped, 2*kBlockSize);
|
ASSERT_GE(dropped, 2*kBlockSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Each test below builds a reader with the given initial offset and checks
// which fixture record (index into initial_offset_record_sizes_) comes first.

TEST(LogTest, ReadStart) {
  CheckInitialOffsetRecord(0/*initial_offset*/, 0/*expected_record_offset*/);
}

TEST(LogTest, ReadSecondOneOff) {
  CheckInitialOffsetRecord(1/*initial_offset*/, 1/*expected_record_offset*/);
}

TEST(LogTest, ReadSecondTenThousand) {
  CheckInitialOffsetRecord(10000/*initial_offset*/,
                           1/*expected_record_offset*/);
}

TEST(LogTest, ReadSecondStart) {
  CheckInitialOffsetRecord(10007/*initial_offset*/,
                           1/*expected_record_offset*/);
}

TEST(LogTest, ReadThirdOneOff) {
  CheckInitialOffsetRecord(10008/*initial_offset*/,
                           2/*expected_record_offset*/);
}

TEST(LogTest, ReadThirdStart) {
  CheckInitialOffsetRecord(20014/*initial_offset*/,
                           2/*expected_record_offset*/);
}

TEST(LogTest, ReadFourthOneOff) {
  CheckInitialOffsetRecord(20015/*initial_offset*/,
                           3/*expected_record_offset*/);
}

TEST(LogTest, ReadFourthFirstBlockTrailer) {
  CheckInitialOffsetRecord(log::kBlockSize - 4/*initial_offset*/,
                           3/*expected_record_offset*/);
}

TEST(LogTest, ReadFourthMiddleBlock) {
  CheckInitialOffsetRecord(log::kBlockSize + 1/*initial_offset*/,
                           3/*expected_record_offset*/);
}

TEST(LogTest, ReadFourthLastBlock) {
  CheckInitialOffsetRecord(2 * log::kBlockSize + 1/*initial_offset*/,
                           3/*expected_record_offset*/);
}
|
||||||
|
|
||||||
|
// Starting exactly at the fourth fixture record must return that record.
// The offset is the same expression as initial_offset_last_record_offsets_[3]:
// two 10000-byte records with one header each, then the
// (2 * kBlockSize - 1000)-byte record with its three headers.
TEST(LogTest, ReadFourthStart) {
  CheckInitialOffsetRecord(
      2 * (kHeaderSize + 10000) +
          (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
      3);
}
|
||||||
|
|
||||||
|
// A reader whose initial offset equals the number of bytes written must
// return no records.
TEST(LogTest, ReadEnd) {
  CheckOffsetPastEndReturnsNoRecords(0/*offset_past_end*/);
}

// A reader whose initial offset is 5 bytes past the written data must
// return no records.
TEST(LogTest, ReadPastEnd) {
  CheckOffsetPastEndReturnsNoRecords(5/*offset_past_end*/);
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,6 +32,7 @@ Status Writer::AddRecord(const Slice& slice) {
|
|||||||
// is empty, we still want to iterate once to emit a single
|
// is empty, we still want to iterate once to emit a single
|
||||||
// zero-length record
|
// zero-length record
|
||||||
Status s;
|
Status s;
|
||||||
|
bool begin = true;
|
||||||
do {
|
do {
|
||||||
const int leftover = kBlockSize - block_offset_;
|
const int leftover = kBlockSize - block_offset_;
|
||||||
assert(leftover >= 0);
|
assert(leftover >= 0);
|
||||||
@ -52,7 +53,6 @@ Status Writer::AddRecord(const Slice& slice) {
|
|||||||
const size_t fragment_length = (left < avail) ? left : avail;
|
const size_t fragment_length = (left < avail) ? left : avail;
|
||||||
|
|
||||||
RecordType type;
|
RecordType type;
|
||||||
const bool begin = (ptr == slice.data());
|
|
||||||
const bool end = (left == fragment_length);
|
const bool end = (left == fragment_length);
|
||||||
if (begin && end) {
|
if (begin && end) {
|
||||||
type = kFullType;
|
type = kFullType;
|
||||||
@ -67,6 +67,7 @@ Status Writer::AddRecord(const Slice& slice) {
|
|||||||
s = EmitPhysicalRecord(type, ptr, fragment_length);
|
s = EmitPhysicalRecord(type, ptr, fragment_length);
|
||||||
ptr += fragment_length;
|
ptr += fragment_length;
|
||||||
left -= fragment_length;
|
left -= fragment_length;
|
||||||
|
begin = false;
|
||||||
} while (s.ok() && left > 0);
|
} while (s.ok() && left > 0);
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
@ -20,10 +20,12 @@ static Slice GetLengthPrefixedSlice(const char* data) {
|
|||||||
|
|
||||||
MemTable::MemTable(const InternalKeyComparator& cmp)
|
MemTable::MemTable(const InternalKeyComparator& cmp)
|
||||||
: comparator_(cmp),
|
: comparator_(cmp),
|
||||||
|
refs_(0),
|
||||||
table_(comparator_, &arena_) {
|
table_(comparator_, &arena_) {
|
||||||
}
|
}
|
||||||
|
|
||||||
MemTable::~MemTable() {
|
MemTable::~MemTable() {
|
||||||
|
assert(refs_ == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
|
size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
|
||||||
@ -48,10 +50,15 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) {
|
|||||||
|
|
||||||
class MemTableIterator: public Iterator {
|
class MemTableIterator: public Iterator {
|
||||||
public:
|
public:
|
||||||
explicit MemTableIterator(MemTable::Table* table) {
|
explicit MemTableIterator(MemTable* mem, MemTable::Table* table) {
|
||||||
|
mem_ = mem;
|
||||||
iter_ = new MemTable::Table::Iterator(table);
|
iter_ = new MemTable::Table::Iterator(table);
|
||||||
|
mem->Ref();
|
||||||
|
}
|
||||||
|
virtual ~MemTableIterator() {
|
||||||
|
delete iter_;
|
||||||
|
mem_->Unref();
|
||||||
}
|
}
|
||||||
virtual ~MemTableIterator() { delete iter_; }
|
|
||||||
|
|
||||||
virtual bool Valid() const { return iter_->Valid(); }
|
virtual bool Valid() const { return iter_->Valid(); }
|
||||||
virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
|
virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
|
||||||
@ -68,6 +75,7 @@ class MemTableIterator: public Iterator {
|
|||||||
virtual Status status() const { return Status::OK(); }
|
virtual Status status() const { return Status::OK(); }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
MemTable* mem_;
|
||||||
MemTable::Table::Iterator* iter_;
|
MemTable::Table::Iterator* iter_;
|
||||||
std::string tmp_; // For passing to EncodeKey
|
std::string tmp_; // For passing to EncodeKey
|
||||||
|
|
||||||
@ -77,7 +85,7 @@ class MemTableIterator: public Iterator {
|
|||||||
};
|
};
|
||||||
|
|
||||||
Iterator* MemTable::NewIterator() {
|
Iterator* MemTable::NewIterator() {
|
||||||
return new MemTableIterator(&table_);
|
return new MemTableIterator(this, &table_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void MemTable::Add(SequenceNumber s, ValueType type,
|
void MemTable::Add(SequenceNumber s, ValueType type,
|
||||||
|
@ -19,8 +19,21 @@ class MemTableIterator;
|
|||||||
|
|
||||||
class MemTable {
|
class MemTable {
|
||||||
public:
|
public:
|
||||||
|
// MemTables are reference counted. The initial reference count
|
||||||
|
// is zero and the caller must call Ref() at least once.
|
||||||
explicit MemTable(const InternalKeyComparator& comparator);
|
explicit MemTable(const InternalKeyComparator& comparator);
|
||||||
~MemTable();
|
|
||||||
|
// Increase reference count.
|
||||||
|
void Ref() { ++refs_; }
|
||||||
|
|
||||||
|
// Drop reference count. Delete if no more references exist.
|
||||||
|
void Unref() {
|
||||||
|
--refs_;
|
||||||
|
assert(refs_ >= 0);
|
||||||
|
if (refs_ <= 0) {
|
||||||
|
delete this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Returns an estimate of the number of bytes of data in use by this
|
// Returns an estimate of the number of bytes of data in use by this
|
||||||
// data structure.
|
// data structure.
|
||||||
@ -45,6 +58,8 @@ class MemTable {
|
|||||||
const Slice& value);
|
const Slice& value);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
~MemTable(); // Private since only Unref() should be used to delete it
|
||||||
|
|
||||||
struct KeyComparator {
|
struct KeyComparator {
|
||||||
const InternalKeyComparator comparator;
|
const InternalKeyComparator comparator;
|
||||||
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
|
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
|
||||||
@ -56,6 +71,7 @@ class MemTable {
|
|||||||
typedef SkipList<const char*, KeyComparator> Table;
|
typedef SkipList<const char*, KeyComparator> Table;
|
||||||
|
|
||||||
KeyComparator comparator_;
|
KeyComparator comparator_;
|
||||||
|
int refs_;
|
||||||
Arena arena_;
|
Arena arena_;
|
||||||
Table table_;
|
Table table_;
|
||||||
|
|
||||||
|
12
db/repair.cc
12
db/repair.cc
@ -183,13 +183,15 @@ class Repairer {
|
|||||||
// corruptions cause entire commits to be skipped instead of
|
// corruptions cause entire commits to be skipped instead of
|
||||||
// propagating bad information (like overly large sequence
|
// propagating bad information (like overly large sequence
|
||||||
// numbers).
|
// numbers).
|
||||||
log::Reader reader(lfile, &reporter, false/*do not checksum*/);
|
log::Reader reader(lfile, &reporter, false/*do not checksum*/,
|
||||||
|
0/*initial_offset*/);
|
||||||
|
|
||||||
// Read all the records and add to a memtable
|
// Read all the records and add to a memtable
|
||||||
std::string scratch;
|
std::string scratch;
|
||||||
Slice record;
|
Slice record;
|
||||||
WriteBatch batch;
|
WriteBatch batch;
|
||||||
MemTable mem(icmp_);
|
MemTable* mem = new MemTable(icmp_);
|
||||||
|
mem->Ref();
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
while (reader.ReadRecord(&record, &scratch)) {
|
while (reader.ReadRecord(&record, &scratch)) {
|
||||||
if (record.size() < 12) {
|
if (record.size() < 12) {
|
||||||
@ -198,7 +200,7 @@ class Repairer {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
WriteBatchInternal::SetContents(&batch, record);
|
WriteBatchInternal::SetContents(&batch, record);
|
||||||
status = WriteBatchInternal::InsertInto(&batch, &mem);
|
status = WriteBatchInternal::InsertInto(&batch, mem);
|
||||||
if (status.ok()) {
|
if (status.ok()) {
|
||||||
counter += WriteBatchInternal::Count(&batch);
|
counter += WriteBatchInternal::Count(&batch);
|
||||||
} else {
|
} else {
|
||||||
@ -215,10 +217,12 @@ class Repairer {
|
|||||||
VersionEdit skipped;
|
VersionEdit skipped;
|
||||||
FileMetaData meta;
|
FileMetaData meta;
|
||||||
meta.number = next_file_number_++;
|
meta.number = next_file_number_++;
|
||||||
Iterator* iter = mem.NewIterator();
|
Iterator* iter = mem->NewIterator();
|
||||||
status = BuildTable(dbname_, env_, options_, table_cache_, iter,
|
status = BuildTable(dbname_, env_, options_, table_cache_, iter,
|
||||||
&meta, &skipped);
|
&meta, &skipped);
|
||||||
delete iter;
|
delete iter;
|
||||||
|
mem->Unref();
|
||||||
|
mem = NULL;
|
||||||
if (status.ok()) {
|
if (status.ok()) {
|
||||||
if (meta.file_size > 0) {
|
if (meta.file_size > 0) {
|
||||||
table_numbers_.push_back(meta.number);
|
table_numbers_.push_back(meta.number);
|
||||||
|
@ -12,17 +12,17 @@ namespace leveldb {
|
|||||||
class SnapshotList;
|
class SnapshotList;
|
||||||
|
|
||||||
// Snapshots are kept in a doubly-linked list in the DB.
|
// Snapshots are kept in a doubly-linked list in the DB.
|
||||||
// Each Snapshot corresponds to a particular sequence number.
|
// Each SnapshotImpl corresponds to a particular sequence number.
|
||||||
class Snapshot {
|
class SnapshotImpl : public Snapshot {
|
||||||
public:
|
public:
|
||||||
SequenceNumber number_; // const after creation
|
SequenceNumber number_; // const after creation
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class SnapshotList;
|
friend class SnapshotList;
|
||||||
|
|
||||||
// Snapshot is kept in a doubly-linked circular list
|
// SnapshotImpl is kept in a doubly-linked circular list
|
||||||
Snapshot* prev_;
|
SnapshotImpl* prev_;
|
||||||
Snapshot* next_;
|
SnapshotImpl* next_;
|
||||||
|
|
||||||
SnapshotList* list_; // just for sanity checks
|
SnapshotList* list_; // just for sanity checks
|
||||||
};
|
};
|
||||||
@ -35,11 +35,11 @@ class SnapshotList {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool empty() const { return list_.next_ == &list_; }
|
bool empty() const { return list_.next_ == &list_; }
|
||||||
Snapshot* oldest() const { assert(!empty()); return list_.next_; }
|
SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
|
||||||
Snapshot* newest() const { assert(!empty()); return list_.prev_; }
|
SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
|
||||||
|
|
||||||
const Snapshot* New(SequenceNumber seq) {
|
const SnapshotImpl* New(SequenceNumber seq) {
|
||||||
Snapshot* s = new Snapshot;
|
SnapshotImpl* s = new SnapshotImpl;
|
||||||
s->number_ = seq;
|
s->number_ = seq;
|
||||||
s->list_ = this;
|
s->list_ = this;
|
||||||
s->next_ = &list_;
|
s->next_ = &list_;
|
||||||
@ -49,7 +49,7 @@ class SnapshotList {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Delete(const Snapshot* s) {
|
void Delete(const SnapshotImpl* s) {
|
||||||
assert(s->list_ == this);
|
assert(s->list_ == this);
|
||||||
s->prev_->next_ = s->next_;
|
s->prev_->next_ = s->next_;
|
||||||
s->next_->prev_ = s->prev_;
|
s->next_->prev_ = s->prev_;
|
||||||
@ -58,7 +58,7 @@ class SnapshotList {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
// Dummy head of doubly-linked list of snapshots
|
// Dummy head of doubly-linked list of snapshots
|
||||||
Snapshot list_;
|
SnapshotImpl list_;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -57,17 +57,22 @@ std::string IntSetToString(const std::set<uint64_t>& s) {
|
|||||||
|
|
||||||
Version::~Version() {
|
Version::~Version() {
|
||||||
assert(refs_ == 0);
|
assert(refs_ == 0);
|
||||||
|
|
||||||
|
// Remove from linked list
|
||||||
|
prev_->next_ = next_;
|
||||||
|
next_->prev_ = prev_;
|
||||||
|
|
||||||
|
// Drop references to files
|
||||||
for (int level = 0; level < config::kNumLevels; level++) {
|
for (int level = 0; level < config::kNumLevels; level++) {
|
||||||
for (size_t i = 0; i < files_[level].size(); i++) {
|
for (size_t i = 0; i < files_[level].size(); i++) {
|
||||||
FileMetaData* f = files_[level][i];
|
FileMetaData* f = files_[level][i];
|
||||||
assert(f->refs >= 0);
|
assert(f->refs > 0);
|
||||||
f->refs--;
|
f->refs--;
|
||||||
if (f->refs <= 0) {
|
if (f->refs <= 0) {
|
||||||
delete f;
|
delete f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
delete cleanup_mem_;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// An internal iterator. For a given version/level pair, yields
|
// An internal iterator. For a given version/level pair, yields
|
||||||
@ -77,9 +82,9 @@ Version::~Version() {
|
|||||||
// encoded using EncodeFixed64.
|
// encoded using EncodeFixed64.
|
||||||
class Version::LevelFileNumIterator : public Iterator {
|
class Version::LevelFileNumIterator : public Iterator {
|
||||||
public:
|
public:
|
||||||
LevelFileNumIterator(const Version* version,
|
LevelFileNumIterator(const InternalKeyComparator& icmp,
|
||||||
const std::vector<FileMetaData*>* flist)
|
const std::vector<FileMetaData*>* flist)
|
||||||
: icmp_(version->vset_->icmp_.user_comparator()),
|
: icmp_(icmp),
|
||||||
flist_(flist),
|
flist_(flist),
|
||||||
index_(flist->size()) { // Marks as invalid
|
index_(flist->size()) { // Marks as invalid
|
||||||
}
|
}
|
||||||
@ -157,7 +162,7 @@ static Iterator* GetFileIterator(void* arg,
|
|||||||
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
|
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
|
||||||
int level) const {
|
int level) const {
|
||||||
return NewTwoLevelIterator(
|
return NewTwoLevelIterator(
|
||||||
new LevelFileNumIterator(this, &files_[level]),
|
new LevelFileNumIterator(vset_->icmp_, &files_[level]),
|
||||||
&GetFileIterator, vset_->table_cache_, options);
|
&GetFileIterator, vset_->table_cache_, options);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -185,11 +190,11 @@ void Version::Ref() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void Version::Unref() {
|
void Version::Unref() {
|
||||||
|
assert(this != &vset_->dummy_versions_);
|
||||||
assert(refs_ >= 1);
|
assert(refs_ >= 1);
|
||||||
--refs_;
|
--refs_;
|
||||||
if (refs_ == 0) {
|
if (refs_ == 0) {
|
||||||
vset_->MaybeDeleteOldVersions();
|
delete this;
|
||||||
// TODO: try to delete obsolete files
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -222,37 +227,58 @@ std::string Version::DebugString() const {
|
|||||||
// Versions that contain full copies of the intermediate state.
|
// Versions that contain full copies of the intermediate state.
|
||||||
class VersionSet::Builder {
|
class VersionSet::Builder {
|
||||||
private:
|
private:
|
||||||
typedef std::map<uint64_t, FileMetaData*> FileMap;
|
// Helper to sort by v->files_[file_number].smallest
|
||||||
|
struct BySmallestKey {
|
||||||
|
const InternalKeyComparator* internal_comparator;
|
||||||
|
|
||||||
|
bool operator()(FileMetaData* f1, FileMetaData* f2) const {
|
||||||
|
int r = internal_comparator->Compare(f1->smallest, f2->smallest);
|
||||||
|
if (r != 0) {
|
||||||
|
return (r < 0);
|
||||||
|
} else {
|
||||||
|
// Break ties by file number
|
||||||
|
return (f1->number < f2->number);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef std::set<FileMetaData*, BySmallestKey> FileSet;
|
||||||
|
struct LevelState {
|
||||||
|
std::set<uint64_t> deleted_files;
|
||||||
|
FileSet* added_files;
|
||||||
|
};
|
||||||
|
|
||||||
VersionSet* vset_;
|
VersionSet* vset_;
|
||||||
FileMap files_[config::kNumLevels];
|
Version* base_;
|
||||||
|
LevelState levels_[config::kNumLevels];
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Initialize a builder with the files from *base and other info from *vset
|
// Initialize a builder with the files from *base and other info from *vset
|
||||||
Builder(VersionSet* vset, Version* base)
|
Builder(VersionSet* vset, Version* base)
|
||||||
: vset_(vset) {
|
: vset_(vset),
|
||||||
|
base_(base) {
|
||||||
|
base_->Ref();
|
||||||
|
BySmallestKey cmp;
|
||||||
|
cmp.internal_comparator = &vset_->icmp_;
|
||||||
for (int level = 0; level < config::kNumLevels; level++) {
|
for (int level = 0; level < config::kNumLevels; level++) {
|
||||||
const std::vector<FileMetaData*>& files = base->files_[level];
|
levels_[level].added_files = new FileSet(cmp);
|
||||||
for (size_t i = 0; i < files.size(); i++) {
|
|
||||||
FileMetaData* f = files[i];
|
|
||||||
f->refs++;
|
|
||||||
files_[level].insert(std::make_pair(f->number, f));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
~Builder() {
|
~Builder() {
|
||||||
for (int level = 0; level < config::kNumLevels; level++) {
|
for (int level = 0; level < config::kNumLevels; level++) {
|
||||||
const FileMap& fmap = files_[level];
|
std::vector<FileMetaData*> to_unref(levels_[level].added_files->begin(),
|
||||||
for (FileMap::const_iterator iter = fmap.begin();
|
levels_[level].added_files->end());
|
||||||
iter != fmap.end();
|
delete levels_[level].added_files;
|
||||||
++iter) {
|
for (int i = 0; i < to_unref.size(); i++) {
|
||||||
FileMetaData* f = iter->second;
|
FileMetaData* f = to_unref[i];
|
||||||
f->refs--;
|
f->refs--;
|
||||||
if (f->refs <= 0) {
|
if (f->refs <= 0) {
|
||||||
delete f;
|
delete f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
base_->Unref();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply all of the edits in *edit to the current state.
|
// Apply all of the edits in *edit to the current state.
|
||||||
@ -271,16 +297,7 @@ class VersionSet::Builder {
|
|||||||
++iter) {
|
++iter) {
|
||||||
const int level = iter->first;
|
const int level = iter->first;
|
||||||
const uint64_t number = iter->second;
|
const uint64_t number = iter->second;
|
||||||
FileMap::iterator fiter = files_[level].find(number);
|
levels_[level].deleted_files.insert(number);
|
||||||
assert(fiter != files_[level].end()); // Sanity check for debug mode
|
|
||||||
if (fiter != files_[level].end()) {
|
|
||||||
FileMetaData* f = fiter->second;
|
|
||||||
f->refs--;
|
|
||||||
if (f->refs <= 0) {
|
|
||||||
delete f;
|
|
||||||
}
|
|
||||||
files_[level].erase(fiter);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add new files
|
// Add new files
|
||||||
@ -288,24 +305,68 @@ class VersionSet::Builder {
|
|||||||
const int level = edit->new_files_[i].first;
|
const int level = edit->new_files_[i].first;
|
||||||
FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
|
FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
|
||||||
f->refs = 1;
|
f->refs = 1;
|
||||||
assert(files_[level].count(f->number) == 0);
|
levels_[level].deleted_files.erase(f->number);
|
||||||
files_[level].insert(std::make_pair(f->number, f));
|
levels_[level].added_files->insert(f);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save the current state in *v.
|
// Save the current state in *v.
|
||||||
void SaveTo(Version* v) {
|
void SaveTo(Version* v) {
|
||||||
|
BySmallestKey cmp;
|
||||||
|
cmp.internal_comparator = &vset_->icmp_;
|
||||||
for (int level = 0; level < config::kNumLevels; level++) {
|
for (int level = 0; level < config::kNumLevels; level++) {
|
||||||
const FileMap& fmap = files_[level];
|
// Merge the set of added files with the set of pre-existing files.
|
||||||
for (FileMap::const_iterator iter = fmap.begin();
|
// Drop any deleted files. Store the result in *v.
|
||||||
iter != fmap.end();
|
const std::vector<FileMetaData*>& base_files = base_->files_[level];
|
||||||
++iter) {
|
std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
|
||||||
FileMetaData* f = iter->second;
|
std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
|
||||||
|
const FileSet* added = levels_[level].added_files;
|
||||||
|
v->files_[level].reserve(base_files.size() + added->size());
|
||||||
|
for (FileSet::const_iterator added_iter = added->begin();
|
||||||
|
added_iter != added->end();
|
||||||
|
++added_iter) {
|
||||||
|
// Add all smaller files listed in base_
|
||||||
|
for (std::vector<FileMetaData*>::const_iterator bpos
|
||||||
|
= std::upper_bound(base_iter, base_end, *added_iter, cmp);
|
||||||
|
base_iter != bpos;
|
||||||
|
++base_iter) {
|
||||||
|
MaybeAddFile(v, level, *base_iter);
|
||||||
|
}
|
||||||
|
|
||||||
|
MaybeAddFile(v, level, *added_iter);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add remaining base files
|
||||||
|
for (; base_iter != base_end; ++base_iter) {
|
||||||
|
MaybeAddFile(v, level, *base_iter);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifndef NDEBUG
|
||||||
|
// Make sure there is no overlap in levels > 0
|
||||||
|
if (level > 0) {
|
||||||
|
for (int i = 1; i < v->files_[level].size(); i++) {
|
||||||
|
const InternalKey& prev_end = v->files_[level][i-1]->largest;
|
||||||
|
const InternalKey& this_begin = v->files_[level][i]->smallest;
|
||||||
|
if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
|
||||||
|
fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
|
||||||
|
EscapeString(prev_end.Encode()).c_str(),
|
||||||
|
EscapeString(this_begin.Encode()).c_str());
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void MaybeAddFile(Version* v, int level, FileMetaData* f) {
|
||||||
|
if (levels_[level].deleted_files.count(f->number) > 0) {
|
||||||
|
// File is deleted: do nothing
|
||||||
|
} else {
|
||||||
f->refs++;
|
f->refs++;
|
||||||
v->files_[level].push_back(f);
|
v->files_[level].push_back(f);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
VersionSet::VersionSet(const std::string& dbname,
|
VersionSet::VersionSet(const std::string& dbname,
|
||||||
@ -324,22 +385,36 @@ VersionSet::VersionSet(const std::string& dbname,
|
|||||||
prev_log_number_(0),
|
prev_log_number_(0),
|
||||||
descriptor_file_(NULL),
|
descriptor_file_(NULL),
|
||||||
descriptor_log_(NULL),
|
descriptor_log_(NULL),
|
||||||
current_(new Version(this)),
|
dummy_versions_(this),
|
||||||
oldest_(current_) {
|
current_(NULL) {
|
||||||
|
AppendVersion(new Version(this));
|
||||||
}
|
}
|
||||||
|
|
||||||
VersionSet::~VersionSet() {
|
VersionSet::~VersionSet() {
|
||||||
for (Version* v = oldest_; v != NULL; ) {
|
current_->Unref();
|
||||||
Version* next = v->next_;
|
assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty
|
||||||
assert(v->refs_ == 0);
|
|
||||||
delete v;
|
|
||||||
v = next;
|
|
||||||
}
|
|
||||||
delete descriptor_log_;
|
delete descriptor_log_;
|
||||||
delete descriptor_file_;
|
delete descriptor_file_;
|
||||||
}
|
}
|
||||||
|
|
||||||
Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
|
void VersionSet::AppendVersion(Version* v) {
|
||||||
|
// Make "v" current
|
||||||
|
assert(v->refs_ == 0);
|
||||||
|
assert(v != current_);
|
||||||
|
if (current_ != NULL) {
|
||||||
|
current_->Unref();
|
||||||
|
}
|
||||||
|
current_ = v;
|
||||||
|
v->Ref();
|
||||||
|
|
||||||
|
// Append to linked list
|
||||||
|
v->prev_ = dummy_versions_.prev_;
|
||||||
|
v->next_ = &dummy_versions_;
|
||||||
|
v->prev_->next_ = v;
|
||||||
|
v->next_->prev_ = v;
|
||||||
|
}
|
||||||
|
|
||||||
|
Status VersionSet::LogAndApply(VersionEdit* edit) {
|
||||||
if (edit->has_log_number_) {
|
if (edit->has_log_number_) {
|
||||||
assert(edit->log_number_ >= log_number_);
|
assert(edit->log_number_ >= log_number_);
|
||||||
assert(edit->log_number_ < next_file_number_);
|
assert(edit->log_number_ < next_file_number_);
|
||||||
@ -360,13 +435,12 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
|
|||||||
builder.Apply(edit);
|
builder.Apply(edit);
|
||||||
builder.SaveTo(v);
|
builder.SaveTo(v);
|
||||||
}
|
}
|
||||||
|
Finalize(v);
|
||||||
std::string new_manifest_file;
|
|
||||||
Status s = Finalize(v);
|
|
||||||
|
|
||||||
// Initialize new descriptor log file if necessary by creating
|
// Initialize new descriptor log file if necessary by creating
|
||||||
// a temporary file that contains a snapshot of the current version.
|
// a temporary file that contains a snapshot of the current version.
|
||||||
if (s.ok()) {
|
std::string new_manifest_file;
|
||||||
|
Status s;
|
||||||
if (descriptor_log_ == NULL) {
|
if (descriptor_log_ == NULL) {
|
||||||
assert(descriptor_file_ == NULL);
|
assert(descriptor_file_ == NULL);
|
||||||
new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
|
new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
|
||||||
@ -377,7 +451,6 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
|
|||||||
s = WriteSnapshot(descriptor_log_);
|
s = WriteSnapshot(descriptor_log_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Write new record to MANIFEST log
|
// Write new record to MANIFEST log
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
@ -397,12 +470,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
|
|||||||
|
|
||||||
// Install the new version
|
// Install the new version
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
assert(current_->next_ == NULL);
|
AppendVersion(v);
|
||||||
assert(current_->cleanup_mem_ == NULL);
|
|
||||||
current_->cleanup_mem_ = cleanup_mem;
|
|
||||||
v->next_ = NULL;
|
|
||||||
current_->next_ = v;
|
|
||||||
current_ = v;
|
|
||||||
log_number_ = edit->log_number_;
|
log_number_ = edit->log_number_;
|
||||||
prev_log_number_ = edit->prev_log_number_;
|
prev_log_number_ = edit->prev_log_number_;
|
||||||
} else {
|
} else {
|
||||||
@ -458,7 +526,7 @@ Status VersionSet::Recover() {
|
|||||||
{
|
{
|
||||||
LogReporter reporter;
|
LogReporter reporter;
|
||||||
reporter.status = &s;
|
reporter.status = &s;
|
||||||
log::Reader reader(file, &reporter, true/*checksum*/);
|
log::Reader reader(file, &reporter, true/*checksum*/, 0/*initial_offset*/);
|
||||||
Slice record;
|
Slice record;
|
||||||
std::string scratch;
|
std::string scratch;
|
||||||
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
|
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
|
||||||
@ -518,21 +586,15 @@ Status VersionSet::Recover() {
|
|||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
Version* v = new Version(this);
|
Version* v = new Version(this);
|
||||||
builder.SaveTo(v);
|
builder.SaveTo(v);
|
||||||
s = Finalize(v);
|
|
||||||
if (!s.ok()) {
|
|
||||||
delete v;
|
|
||||||
} else {
|
|
||||||
// Install recovered version
|
// Install recovered version
|
||||||
v->next_ = NULL;
|
Finalize(v);
|
||||||
current_->next_ = v;
|
AppendVersion(v);
|
||||||
current_ = v;
|
|
||||||
manifest_file_number_ = next_file;
|
manifest_file_number_ = next_file;
|
||||||
next_file_number_ = next_file + 1;
|
next_file_number_ = next_file + 1;
|
||||||
last_sequence_ = last_sequence;
|
last_sequence_ = last_sequence;
|
||||||
log_number_ = log_number;
|
log_number_ = log_number;
|
||||||
prev_log_number_ = prev_log_number;
|
prev_log_number_ = prev_log_number;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
@ -545,15 +607,12 @@ static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
|
|||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
Status VersionSet::Finalize(Version* v) {
|
void VersionSet::Finalize(Version* v) {
|
||||||
// Precomputed best level for next compaction
|
// Precomputed best level for next compaction
|
||||||
int best_level = -1;
|
int best_level = -1;
|
||||||
double best_score = -1;
|
double best_score = -1;
|
||||||
|
|
||||||
Status s;
|
for (int level = 0; level < config::kNumLevels-1; level++) {
|
||||||
for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) {
|
|
||||||
s = SortLevel(v, level);
|
|
||||||
|
|
||||||
double score;
|
double score;
|
||||||
if (level == 0) {
|
if (level == 0) {
|
||||||
// We treat level-0 specially by bounding the number of files
|
// We treat level-0 specially by bounding the number of files
|
||||||
@ -567,7 +626,8 @@ Status VersionSet::Finalize(Version* v) {
|
|||||||
// file size is small (perhaps because of a small write-buffer
|
// file size is small (perhaps because of a small write-buffer
|
||||||
// setting, or very high compression ratios, or lots of
|
// setting, or very high compression ratios, or lots of
|
||||||
// overwrites/deletions).
|
// overwrites/deletions).
|
||||||
score = v->files_[level].size() / 4.0;
|
score = v->files_[level].size() /
|
||||||
|
static_cast<double>(config::kL0_CompactionTrigger);
|
||||||
} else {
|
} else {
|
||||||
// Compute the ratio of current size to size limit.
|
// Compute the ratio of current size to size limit.
|
||||||
const uint64_t level_bytes = TotalFileSize(v->files_[level]);
|
const uint64_t level_bytes = TotalFileSize(v->files_[level]);
|
||||||
@ -582,7 +642,6 @@ Status VersionSet::Finalize(Version* v) {
|
|||||||
|
|
||||||
v->compaction_level_ = best_level;
|
v->compaction_level_ = best_level;
|
||||||
v->compaction_score_ = best_score;
|
v->compaction_score_ = best_score;
|
||||||
return s;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Status VersionSet::WriteSnapshot(log::Writer* log) {
|
Status VersionSet::WriteSnapshot(log::Writer* log) {
|
||||||
@ -615,44 +674,27 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
|
|||||||
return log->AddRecord(record);
|
return log->AddRecord(record);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper to sort by tables_[file_number].smallest
|
|
||||||
struct VersionSet::BySmallestKey {
|
|
||||||
const InternalKeyComparator* internal_comparator;
|
|
||||||
|
|
||||||
bool operator()(FileMetaData* f1, FileMetaData* f2) const {
|
|
||||||
return internal_comparator->Compare(f1->smallest, f2->smallest) < 0;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
Status VersionSet::SortLevel(Version* v, uint64_t level) {
|
|
||||||
Status result;
|
|
||||||
BySmallestKey cmp;
|
|
||||||
cmp.internal_comparator = &icmp_;
|
|
||||||
std::sort(v->files_[level].begin(), v->files_[level].end(), cmp);
|
|
||||||
|
|
||||||
if (result.ok() && level > 0) {
|
|
||||||
// There should be no overlap
|
|
||||||
for (size_t i = 1; i < v->files_[level].size(); i++) {
|
|
||||||
const InternalKey& prev_end = v->files_[level][i-1]->largest;
|
|
||||||
const InternalKey& this_begin = v->files_[level][i]->smallest;
|
|
||||||
if (icmp_.Compare(prev_end, this_begin) >= 0) {
|
|
||||||
result = Status::Corruption(
|
|
||||||
"overlapping ranges in same level",
|
|
||||||
(EscapeString(prev_end.Encode()) + " vs. " +
|
|
||||||
EscapeString(this_begin.Encode())));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
int VersionSet::NumLevelFiles(int level) const {
|
int VersionSet::NumLevelFiles(int level) const {
|
||||||
assert(level >= 0);
|
assert(level >= 0);
|
||||||
assert(level < config::kNumLevels);
|
assert(level < config::kNumLevels);
|
||||||
return current_->files_[level].size();
|
return current_->files_[level].size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
|
||||||
|
// Update code if kNumLevels changes
|
||||||
|
assert(config::kNumLevels == 7);
|
||||||
|
snprintf(scratch->buffer, sizeof(scratch->buffer),
|
||||||
|
"files[ %d %d %d %d %d %d %d ]",
|
||||||
|
int(current_->files_[0].size()),
|
||||||
|
int(current_->files_[1].size()),
|
||||||
|
int(current_->files_[2].size()),
|
||||||
|
int(current_->files_[3].size()),
|
||||||
|
int(current_->files_[4].size()),
|
||||||
|
int(current_->files_[5].size()),
|
||||||
|
int(current_->files_[6].size()));
|
||||||
|
return scratch->buffer;
|
||||||
|
}
|
||||||
|
|
||||||
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
|
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
|
||||||
uint64_t result = 0;
|
uint64_t result = 0;
|
||||||
for (int level = 0; level < config::kNumLevels; level++) {
|
for (int level = 0; level < config::kNumLevels; level++) {
|
||||||
@ -685,19 +727,10 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void VersionSet::MaybeDeleteOldVersions() {
|
|
||||||
// Note: it is important to delete versions in order since a newer
|
|
||||||
// version with zero refs may be holding a pointer to a memtable
|
|
||||||
// that is used by somebody who has a ref on an older version.
|
|
||||||
while (oldest_ != current_ && oldest_->refs_ == 0) {
|
|
||||||
Version* next = oldest_->next_;
|
|
||||||
delete oldest_;
|
|
||||||
oldest_ = next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
|
void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
|
||||||
for (Version* v = oldest_; v != NULL; v = v->next_) {
|
for (Version* v = dummy_versions_.next_;
|
||||||
|
v != &dummy_versions_;
|
||||||
|
v = v->next_) {
|
||||||
for (int level = 0; level < config::kNumLevels; level++) {
|
for (int level = 0; level < config::kNumLevels; level++) {
|
||||||
const std::vector<FileMetaData*>& files = v->files_[level];
|
const std::vector<FileMetaData*>& files = v->files_[level];
|
||||||
for (size_t i = 0; i < files.size(); i++) {
|
for (size_t i = 0; i < files.size(); i++) {
|
||||||
@ -809,8 +842,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
|
|||||||
} else {
|
} else {
|
||||||
// Create concatenating iterator for the files from this level
|
// Create concatenating iterator for the files from this level
|
||||||
list[num++] = NewTwoLevelIterator(
|
list[num++] = NewTwoLevelIterator(
|
||||||
new Version::LevelFileNumIterator(
|
new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
|
||||||
c->input_version_, &c->inputs_[which]),
|
|
||||||
&GetFileIterator, table_cache_, options);
|
&GetFileIterator, table_cache_, options);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -996,11 +1028,12 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Compaction::ShouldStopBefore(const InternalKey& key) {
|
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
|
||||||
// Scan to find earliest grandparent file that contains key.
|
// Scan to find earliest grandparent file that contains key.
|
||||||
const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
|
const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
|
||||||
while (grandparent_index_ < grandparents_.size() &&
|
while (grandparent_index_ < grandparents_.size() &&
|
||||||
icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) {
|
icmp->Compare(internal_key,
|
||||||
|
grandparents_[grandparent_index_]->largest.Encode()) > 0) {
|
||||||
if (seen_key_) {
|
if (seen_key_) {
|
||||||
overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
|
overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
|
||||||
}
|
}
|
||||||
|
@ -59,8 +59,8 @@ class Version {
|
|||||||
|
|
||||||
VersionSet* vset_; // VersionSet to which this Version belongs
|
VersionSet* vset_; // VersionSet to which this Version belongs
|
||||||
Version* next_; // Next version in linked list
|
Version* next_; // Next version in linked list
|
||||||
|
Version* prev_; // Previous version in linked list
|
||||||
int refs_; // Number of live refs to this version
|
int refs_; // Number of live refs to this version
|
||||||
MemTable* cleanup_mem_; // NULL, or table to delete when version dropped
|
|
||||||
|
|
||||||
// List of files per level
|
// List of files per level
|
||||||
std::vector<FileMetaData*> files_[config::kNumLevels];
|
std::vector<FileMetaData*> files_[config::kNumLevels];
|
||||||
@ -72,8 +72,7 @@ class Version {
|
|||||||
int compaction_level_;
|
int compaction_level_;
|
||||||
|
|
||||||
explicit Version(VersionSet* vset)
|
explicit Version(VersionSet* vset)
|
||||||
: vset_(vset), next_(NULL), refs_(0),
|
: vset_(vset), next_(this), prev_(this), refs_(0),
|
||||||
cleanup_mem_(NULL),
|
|
||||||
compaction_score_(-1),
|
compaction_score_(-1),
|
||||||
compaction_level_(-1) {
|
compaction_level_(-1) {
|
||||||
}
|
}
|
||||||
@ -95,10 +94,8 @@ class VersionSet {
|
|||||||
|
|
||||||
// Apply *edit to the current version to form a new descriptor that
|
// Apply *edit to the current version to form a new descriptor that
|
||||||
// is both saved to persistent state and installed as the new
|
// is both saved to persistent state and installed as the new
|
||||||
// current version. Iff Apply() returns OK, arrange to delete
|
// current version.
|
||||||
// cleanup_mem (if cleanup_mem != NULL) when it is no longer needed
|
Status LogAndApply(VersionEdit* edit);
|
||||||
// by older versions.
|
|
||||||
Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem);
|
|
||||||
|
|
||||||
// Recover the last saved descriptor from persistent storage.
|
// Recover the last saved descriptor from persistent storage.
|
||||||
Status Recover();
|
Status Recover();
|
||||||
@ -171,19 +168,20 @@ class VersionSet {
|
|||||||
// "key" as of version "v".
|
// "key" as of version "v".
|
||||||
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
|
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
|
||||||
|
|
||||||
|
// Return a human-readable short (single-line) summary of the number
|
||||||
|
// of files per level. Uses *scratch as backing store.
|
||||||
|
struct LevelSummaryStorage {
|
||||||
|
char buffer[100];
|
||||||
|
};
|
||||||
|
const char* LevelSummary(LevelSummaryStorage* scratch) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
class Builder;
|
class Builder;
|
||||||
|
|
||||||
friend class Compaction;
|
friend class Compaction;
|
||||||
friend class Version;
|
friend class Version;
|
||||||
|
|
||||||
Status Finalize(Version* v);
|
void Finalize(Version* v);
|
||||||
|
|
||||||
// Delete any old versions that are no longer needed.
|
|
||||||
void MaybeDeleteOldVersions();
|
|
||||||
|
|
||||||
struct BySmallestKey;
|
|
||||||
Status SortLevel(Version* v, uint64_t level);
|
|
||||||
|
|
||||||
void GetOverlappingInputs(
|
void GetOverlappingInputs(
|
||||||
int level,
|
int level,
|
||||||
@ -202,6 +200,8 @@ class VersionSet {
|
|||||||
|
|
||||||
void SetupOtherInputs(Compaction* c);
|
void SetupOtherInputs(Compaction* c);
|
||||||
|
|
||||||
|
void AppendVersion(Version* v);
|
||||||
|
|
||||||
Env* const env_;
|
Env* const env_;
|
||||||
const std::string dbname_;
|
const std::string dbname_;
|
||||||
const Options* const options_;
|
const Options* const options_;
|
||||||
@ -216,10 +216,8 @@ class VersionSet {
|
|||||||
// Opened lazily
|
// Opened lazily
|
||||||
WritableFile* descriptor_file_;
|
WritableFile* descriptor_file_;
|
||||||
log::Writer* descriptor_log_;
|
log::Writer* descriptor_log_;
|
||||||
|
Version dummy_versions_; // Head of circular doubly-linked list of versions.
|
||||||
// Versions are kept in a singly linked list that is never empty
|
Version* current_; // == dummy_versions_.prev_
|
||||||
Version* current_; // Pointer to the last (newest) list entry
|
|
||||||
Version* oldest_; // Pointer to the first (oldest) list entry
|
|
||||||
|
|
||||||
// Per-level key at which the next compaction at that level should start.
|
// Per-level key at which the next compaction at that level should start.
|
||||||
// Either an empty string, or a valid InternalKey.
|
// Either an empty string, or a valid InternalKey.
|
||||||
@ -265,8 +263,8 @@ class Compaction {
|
|||||||
bool IsBaseLevelForKey(const Slice& user_key);
|
bool IsBaseLevelForKey(const Slice& user_key);
|
||||||
|
|
||||||
// Returns true iff we should stop building the current output
|
// Returns true iff we should stop building the current output
|
||||||
// before processing "key".
|
// before processing "internal_key".
|
||||||
bool ShouldStopBefore(const InternalKey& key);
|
bool ShouldStopBefore(const Slice& internal_key);
|
||||||
|
|
||||||
// Release the input version for the compaction, once the compaction
|
// Release the input version for the compaction, once the compaction
|
||||||
// is successful.
|
// is successful.
|
||||||
|
@ -29,11 +29,53 @@ WriteBatch::WriteBatch() {
|
|||||||
|
|
||||||
WriteBatch::~WriteBatch() { }
|
WriteBatch::~WriteBatch() { }
|
||||||
|
|
||||||
|
WriteBatch::Handler::~Handler() { }
|
||||||
|
|
||||||
void WriteBatch::Clear() {
|
void WriteBatch::Clear() {
|
||||||
rep_.clear();
|
rep_.clear();
|
||||||
rep_.resize(12);
|
rep_.resize(12);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status WriteBatch::Iterate(Handler* handler) const {
|
||||||
|
Slice input(rep_);
|
||||||
|
if (input.size() < 12) {
|
||||||
|
return Status::Corruption("malformed WriteBatch (too small)");
|
||||||
|
}
|
||||||
|
|
||||||
|
input.remove_prefix(12);
|
||||||
|
Slice key, value;
|
||||||
|
int found = 0;
|
||||||
|
while (!input.empty()) {
|
||||||
|
found++;
|
||||||
|
char tag = input[0];
|
||||||
|
input.remove_prefix(1);
|
||||||
|
switch (tag) {
|
||||||
|
case kTypeValue:
|
||||||
|
if (GetLengthPrefixedSlice(&input, &key) &&
|
||||||
|
GetLengthPrefixedSlice(&input, &value)) {
|
||||||
|
handler->Put(key, value);
|
||||||
|
} else {
|
||||||
|
return Status::Corruption("bad WriteBatch Put");
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case kTypeDeletion:
|
||||||
|
if (GetLengthPrefixedSlice(&input, &key)) {
|
||||||
|
handler->Delete(key);
|
||||||
|
} else {
|
||||||
|
return Status::Corruption("bad WriteBatch Delete");
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return Status::Corruption("unknown WriteBatch tag");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (found != WriteBatchInternal::Count(this)) {
|
||||||
|
return Status::Corruption("WriteBatch has wrong count");
|
||||||
|
} else {
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int WriteBatchInternal::Count(const WriteBatch* b) {
|
int WriteBatchInternal::Count(const WriteBatch* b) {
|
||||||
return DecodeFixed32(b->rep_.data() + 8);
|
return DecodeFixed32(b->rep_.data() + 8);
|
||||||
}
|
}
|
||||||
@ -63,28 +105,29 @@ void WriteBatch::Delete(const Slice& key) {
|
|||||||
PutLengthPrefixedSlice(&rep_, key);
|
PutLengthPrefixedSlice(&rep_, key);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
class MemTableInserter : public WriteBatch::Handler {
|
||||||
|
public:
|
||||||
|
SequenceNumber sequence_;
|
||||||
|
MemTable* mem_;
|
||||||
|
|
||||||
|
virtual void Put(const Slice& key, const Slice& value) {
|
||||||
|
mem_->Add(sequence_, kTypeValue, key, value);
|
||||||
|
sequence_++;
|
||||||
|
}
|
||||||
|
virtual void Delete(const Slice& key) {
|
||||||
|
mem_->Add(sequence_, kTypeDeletion, key, Slice());
|
||||||
|
sequence_++;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
|
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
|
||||||
MemTable* memtable) {
|
MemTable* memtable) {
|
||||||
const int count = WriteBatchInternal::Count(b);
|
MemTableInserter inserter;
|
||||||
int found = 0;
|
inserter.sequence_ = WriteBatchInternal::Sequence(b);
|
||||||
Iterator it(*b);
|
inserter.mem_ = memtable;
|
||||||
for (; !it.Done(); it.Next()) {
|
return b->Iterate(&inserter);
|
||||||
switch (it.op()) {
|
|
||||||
case kTypeDeletion:
|
|
||||||
memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice());
|
|
||||||
break;
|
|
||||||
case kTypeValue:
|
|
||||||
memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
found++;
|
|
||||||
}
|
|
||||||
if (!it.status().ok()) {
|
|
||||||
return it.status();
|
|
||||||
} else if (found != count) {
|
|
||||||
return Status::Corruption("wrong count in WriteBatch");
|
|
||||||
}
|
|
||||||
return Status::OK();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
|
void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
|
||||||
@ -92,57 +135,4 @@ void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
|
|||||||
b->rep_.assign(contents.data(), contents.size());
|
b->rep_.assign(contents.data(), contents.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch)
|
|
||||||
: input_(WriteBatchInternal::Contents(&batch)),
|
|
||||||
done_(false) {
|
|
||||||
if (input_.size() < 12) {
|
|
||||||
done_ = true;
|
|
||||||
} else {
|
|
||||||
seq_ = WriteBatchInternal::Sequence(&batch),
|
|
||||||
input_.remove_prefix(12);
|
|
||||||
GetNextEntry();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void WriteBatchInternal::Iterator::Next() {
|
|
||||||
assert(!done_);
|
|
||||||
seq_++;
|
|
||||||
GetNextEntry();
|
|
||||||
}
|
|
||||||
|
|
||||||
void WriteBatchInternal::Iterator::GetNextEntry() {
|
|
||||||
if (input_.empty()) {
|
|
||||||
done_ = true;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
char tag = input_[0];
|
|
||||||
input_.remove_prefix(1);
|
|
||||||
switch (tag) {
|
|
||||||
case kTypeValue:
|
|
||||||
if (GetLengthPrefixedSlice(&input_, &key_) &&
|
|
||||||
GetLengthPrefixedSlice(&input_, &value_)) {
|
|
||||||
op_ = static_cast<ValueType>(tag);
|
|
||||||
} else {
|
|
||||||
status_ = Status::Corruption("bad WriteBatch Put");
|
|
||||||
done_ = true;
|
|
||||||
input_.clear();
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case kTypeDeletion:
|
|
||||||
if (GetLengthPrefixedSlice(&input_, &key_)) {
|
|
||||||
op_ = kTypeDeletion;
|
|
||||||
} else {
|
|
||||||
status_ = Status::Corruption("bad WriteBatch Delete");
|
|
||||||
done_ = true;
|
|
||||||
input_.clear();
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
status_ = Status::Corruption("unknown WriteBatch tag");
|
|
||||||
done_ = true;
|
|
||||||
input_.clear();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -37,30 +37,6 @@ class WriteBatchInternal {
|
|||||||
static void SetContents(WriteBatch* batch, const Slice& contents);
|
static void SetContents(WriteBatch* batch, const Slice& contents);
|
||||||
|
|
||||||
static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
|
static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
|
||||||
|
|
||||||
// Iterate over the contents of a write batch.
|
|
||||||
class Iterator {
|
|
||||||
public:
|
|
||||||
explicit Iterator(const WriteBatch& batch);
|
|
||||||
bool Done() const { return done_; }
|
|
||||||
void Next();
|
|
||||||
ValueType op() const { return op_; }
|
|
||||||
const Slice& key() const { return key_; }
|
|
||||||
const Slice& value() const { return value_; }
|
|
||||||
SequenceNumber sequence_number() const { return seq_; }
|
|
||||||
Status status() const { return status_; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
void GetNextEntry();
|
|
||||||
|
|
||||||
Slice input_;
|
|
||||||
bool done_;
|
|
||||||
ValueType op_;
|
|
||||||
Slice key_;
|
|
||||||
Slice value_;
|
|
||||||
SequenceNumber seq_;
|
|
||||||
Status status_;
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -14,10 +14,11 @@ namespace leveldb {
|
|||||||
|
|
||||||
static std::string PrintContents(WriteBatch* b) {
|
static std::string PrintContents(WriteBatch* b) {
|
||||||
InternalKeyComparator cmp(BytewiseComparator());
|
InternalKeyComparator cmp(BytewiseComparator());
|
||||||
MemTable mem(cmp);
|
MemTable* mem = new MemTable(cmp);
|
||||||
|
mem->Ref();
|
||||||
std::string state;
|
std::string state;
|
||||||
Status s = WriteBatchInternal::InsertInto(b, &mem);
|
Status s = WriteBatchInternal::InsertInto(b, mem);
|
||||||
Iterator* iter = mem.NewIterator();
|
Iterator* iter = mem->NewIterator();
|
||||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||||
ParsedInternalKey ikey;
|
ParsedInternalKey ikey;
|
||||||
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
|
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
|
||||||
@ -42,6 +43,7 @@ static std::string PrintContents(WriteBatch* b) {
|
|||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
state.append("ParseError()");
|
state.append("ParseError()");
|
||||||
}
|
}
|
||||||
|
mem->Unref();
|
||||||
return state;
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,14 +17,14 @@ However the organization of the files that make up the representation
|
|||||||
is somewhat different and is explained below.
|
is somewhat different and is explained below.
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
Each database is represented by a set of file stored in a directory.
|
Each database is represented by a set of files stored in a directory.
|
||||||
There are several different types of files as documented below:
|
There are several different types of files as documented below:
|
||||||
<p>
|
<p>
|
||||||
<h2>Log files</h2>
|
<h2>Log files</h2>
|
||||||
<p>
|
<p>
|
||||||
A log file (*.log) stores a sequence of recent updates. Each update
|
A log file (*.log) stores a sequence of recent updates. Each update
|
||||||
is appended to the current log file. When the log file reaches a
|
is appended to the current log file. When the log file reaches a
|
||||||
pre-determined size (approximately 1MB by default), it is converted
|
pre-determined size (approximately 4MB by default), it is converted
|
||||||
to a sorted table (see below) and a new log file is created for future
|
to a sorted table (see below) and a new log file is created for future
|
||||||
updates.
|
updates.
|
||||||
<p>
|
<p>
|
||||||
@ -83,19 +83,15 @@ Other files used for miscellaneous purposes may also be present
|
|||||||
<h1>Level 0</h1>
|
<h1>Level 0</h1>
|
||||||
When the log file grows above a certain size (1MB by default):
|
When the log file grows above a certain size (4MB by default):
|
||||||
<ul>
|
<ul>
|
||||||
<li>Write the contents of the current memtable to an sstable
|
<li>Create a brand new memtable and log file and direct future updates here
|
||||||
<li>Replace the current memtable by a brand new empty memtable
|
<li>In the background:
|
||||||
<li>Switch to a new log file
|
<ul>
|
||||||
|
<li>Write the contents of the previous memtable to an sstable
|
||||||
|
<li>Discard the memtable
|
||||||
<li>Delete the old log file and the old memtable
|
<li>Delete the old log file and the old memtable
|
||||||
|
<li>Add the new sstable to the young (level-0) level.
|
||||||
|
</ul>
|
||||||
</ul>
|
</ul>
|
||||||
Experimental measurements show that generating an sstable from a 1MB
|
|
||||||
log file takes ~12ms, which seems like an acceptable latency hiccup to
|
|
||||||
add infrequently to a log write.
|
|
||||||
|
|
||||||
<p>
|
|
||||||
The new sstable is added to a special level-0 level. level-0 contains
|
|
||||||
a set of files (up to 4 by default). However unlike other levels,
|
|
||||||
these files do not cover disjoint ranges, but may overlap each other.
|
|
||||||
|
|
||||||
<h1>Compactions</h1>
|
<h1>Compactions</h1>
|
||||||
|
|
||||||
@ -162,8 +158,8 @@ read.
|
|||||||
<p>
|
<p>
|
||||||
Solution 1: To reduce this problem, we might want to increase the log
|
Solution 1: To reduce this problem, we might want to increase the log
|
||||||
switching threshold when the number of level-0 files is large. Though
|
switching threshold when the number of level-0 files is large. Though
|
||||||
the downside is that the larger this threshold, the larger the delay
|
the downside is that the larger this threshold, the more memory we will
|
||||||
that we will add to write latency when a write triggers a log switch.
|
need to hold the corresponding memtable.
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
Solution 2: We might want to decrease write rate artificially when the
|
Solution 2: We might want to decrease write rate artificially when the
|
||||||
|
@ -141,10 +141,18 @@ the batch.
|
|||||||
<p>
|
<p>
|
||||||
<h1>Concurrency</h1>
|
<h1>Concurrency</h1>
|
||||||
<p>
|
<p>
|
||||||
A database may only be opened by one process at a time. The <code>leveldb</code>
|
A database may only be opened by one process at a time.
|
||||||
implementation acquires a lock from the operating system to prevent
|
The <code>leveldb</code> implementation acquires a lock from the
|
||||||
misuse. Within a single process, the same <code>leveldb::DB</code> object may
|
operating system to prevent misuse. Within a single process, the
|
||||||
be safely used by multiple concurrent threads.
|
same <code>leveldb::DB</code> object may be safely shared by multiple
|
||||||
|
concurrent threads. I.e., different threads may write into or fetch
|
||||||
|
iterators or call <code>Get</code> on the same database without any
|
||||||
|
external synchronization (the leveldb implementation will
|
||||||
|
automatically do the required synchronization). However other objects
|
||||||
|
(like Iterator and WriteBatch) may require external synchronization.
|
||||||
|
If two threads share such an object, they must protect access to it
|
||||||
|
using their own locking protocol. More details are available in
|
||||||
|
the public header files.
|
||||||
<p>
|
<p>
|
||||||
<h1>Iteration</h1>
|
<h1>Iteration</h1>
|
||||||
<p>
|
<p>
|
||||||
|
@ -12,7 +12,9 @@ namespace leveldb {
|
|||||||
class Slice;
|
class Slice;
|
||||||
|
|
||||||
// A Comparator object provides a total order across slices that are
|
// A Comparator object provides a total order across slices that are
|
||||||
// used as keys in an sstable or a database.
|
// used as keys in an sstable or a database. A Comparator implementation
|
||||||
|
// must be thread-safe since leveldb may invoke its methods concurrently
|
||||||
|
// from multiple threads.
|
||||||
class Comparator {
|
class Comparator {
|
||||||
public:
|
public:
|
||||||
virtual ~Comparator();
|
virtual ~Comparator();
|
||||||
|
@ -13,26 +13,32 @@
|
|||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
static const int kMajorVersion = 1;
|
static const int kMajorVersion = 1;
|
||||||
static const int kMinorVersion = 1;
|
static const int kMinorVersion = 2;
|
||||||
|
|
||||||
struct Options;
|
struct Options;
|
||||||
struct ReadOptions;
|
struct ReadOptions;
|
||||||
struct WriteOptions;
|
struct WriteOptions;
|
||||||
|
|
||||||
class Snapshot;
|
|
||||||
class WriteBatch;
|
class WriteBatch;
|
||||||
|
|
||||||
// Some internal types. Clients should ignore.
|
// Abstract handle to a particular state of a DB.
|
||||||
class WriteBatchInternal;
|
// A Snapshot is an immutable object and can therefore be safely
|
||||||
|
// accessed from multiple threads without any external synchronization.
|
||||||
|
class Snapshot {
|
||||||
|
protected:
|
||||||
|
virtual ~Snapshot();
|
||||||
|
};
|
||||||
|
|
||||||
|
// A range of keys
|
||||||
struct Range {
|
struct Range {
|
||||||
Slice start;
|
Slice start; // Included in the range
|
||||||
Slice limit;
|
Slice limit; // Not included in the range
|
||||||
|
|
||||||
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
|
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
|
||||||
};
|
};
|
||||||
|
|
||||||
// A DB is a persistent ordered map from keys to values.
|
// A DB is a persistent ordered map from keys to values.
|
||||||
|
// A DB is safe for concurrent access from multiple threads without
|
||||||
|
// any external synchronization.
|
||||||
class DB {
|
class DB {
|
||||||
public:
|
public:
|
||||||
// Open the database with the specified "name".
|
// Open the database with the specified "name".
|
||||||
|
@ -6,6 +6,9 @@
|
|||||||
// operating system functionality like the filesystem etc. Callers
|
// operating system functionality like the filesystem etc. Callers
|
||||||
// may wish to provide a custom Env object when opening a database to
|
// may wish to provide a custom Env object when opening a database to
|
||||||
// get fine-grained control; e.g., to rate limit file system operations.
|
// get fine-grained control; e.g., to rate limit file system operations.
|
||||||
|
//
|
||||||
|
// All Env implementations are safe for concurrent access from
|
||||||
|
// multiple threads without any external synchronization.
|
||||||
|
|
||||||
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
|
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
|
||||||
#define STORAGE_LEVELDB_INCLUDE_ENV_H_
|
#define STORAGE_LEVELDB_INCLUDE_ENV_H_
|
||||||
@ -160,6 +163,15 @@ class SequentialFile {
|
|||||||
//
|
//
|
||||||
// REQUIRES: External synchronization
|
// REQUIRES: External synchronization
|
||||||
virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
|
virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
|
||||||
|
|
||||||
|
// Skip "n" bytes from the file. This is guaranteed to be no
|
||||||
|
// slower than reading the same data, but may be faster.
|
||||||
|
//
|
||||||
|
// If end of file is reached, skipping will stop at the end of the
|
||||||
|
// file, and Skip will return OK.
|
||||||
|
//
|
||||||
|
// REQUIRES: External synchronization
|
||||||
|
virtual Status Skip(uint64_t n) = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
// A file abstraction for randomly reading the contents of a file.
|
// A file abstraction for randomly reading the contents of a file.
|
||||||
|
@ -6,6 +6,11 @@
|
|||||||
// The following class defines the interface. Multiple implementations
|
// The following class defines the interface. Multiple implementations
|
||||||
// are provided by this library. In particular, iterators are provided
|
// are provided by this library. In particular, iterators are provided
|
||||||
// to access the contents of a Table or a DB.
|
// to access the contents of a Table or a DB.
|
||||||
|
//
|
||||||
|
// Multiple threads can invoke const methods on an Iterator without
|
||||||
|
// external synchronization, but if any of the threads may call a
|
||||||
|
// non-const method, all threads accessing the same Iterator must use
|
||||||
|
// external synchronization.
|
||||||
|
|
||||||
#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
|
#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
|
||||||
#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
|
#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
|
||||||
|
@ -6,6 +6,11 @@
|
|||||||
// storage and a size. The user of a Slice must ensure that the slice
|
// storage and a size. The user of a Slice must ensure that the slice
|
||||||
// is not used after the corresponding external storage has been
|
// is not used after the corresponding external storage has been
|
||||||
// deallocated.
|
// deallocated.
|
||||||
|
//
|
||||||
|
// Multiple threads can invoke const methods on a Slice without
|
||||||
|
// external synchronization, but if any of the threads may call a
|
||||||
|
// non-const method, all threads accessing the same Slice must use
|
||||||
|
// external synchronization.
|
||||||
|
|
||||||
#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_
|
#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_
|
||||||
#define STORAGE_LEVELDB_INCLUDE_SLICE_H_
|
#define STORAGE_LEVELDB_INCLUDE_SLICE_H_
|
||||||
|
@ -4,12 +4,16 @@
|
|||||||
//
|
//
|
||||||
// A Status encapsulates the result of an operation. It may indicate success,
|
// A Status encapsulates the result of an operation. It may indicate success,
|
||||||
// or it may indicate an error with an associated error message.
|
// or it may indicate an error with an associated error message.
|
||||||
|
//
|
||||||
|
// Multiple threads can invoke const methods on a Status without
|
||||||
|
// external synchronization, but if any of the threads may call a
|
||||||
|
// non-const method, all threads accessing the same Status must use
|
||||||
|
// external synchronization.
|
||||||
|
|
||||||
#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_
|
#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_
|
||||||
#define STORAGE_LEVELDB_INCLUDE_STATUS_H_
|
#define STORAGE_LEVELDB_INCLUDE_STATUS_H_
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <utility>
|
|
||||||
#include "leveldb/slice.h"
|
#include "leveldb/slice.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
@ -18,7 +22,7 @@ class Status {
|
|||||||
public:
|
public:
|
||||||
// Create a success status.
|
// Create a success status.
|
||||||
Status() : state_(NULL) { }
|
Status() : state_(NULL) { }
|
||||||
~Status() { delete state_; }
|
~Status() { delete[] state_; }
|
||||||
|
|
||||||
// Copy the specified status.
|
// Copy the specified status.
|
||||||
Status(const Status& s);
|
Status(const Status& s);
|
||||||
@ -29,7 +33,7 @@ class Status {
|
|||||||
|
|
||||||
// Return error status of an appropriate type.
|
// Return error status of an appropriate type.
|
||||||
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
|
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
|
||||||
return Status(kNotFound, msg, Slice());
|
return Status(kNotFound, msg, msg2);
|
||||||
}
|
}
|
||||||
static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
|
static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
|
||||||
return Status(kCorruption, msg, msg2);
|
return Status(kCorruption, msg, msg2);
|
||||||
@ -55,6 +59,13 @@ class Status {
|
|||||||
std::string ToString() const;
|
std::string ToString() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
// OK status has a NULL state_. Otherwise, state_ is a new[] array
|
||||||
|
// of the following form:
|
||||||
|
// state_[0..3] == length of message
|
||||||
|
// state_[4] == code
|
||||||
|
// state_[5..] == message
|
||||||
|
const char* state_;
|
||||||
|
|
||||||
enum Code {
|
enum Code {
|
||||||
kOk = 0,
|
kOk = 0,
|
||||||
kNotFound = 1,
|
kNotFound = 1,
|
||||||
@ -63,21 +74,24 @@ class Status {
|
|||||||
kInvalidArgument = 4,
|
kInvalidArgument = 4,
|
||||||
kIOError = 5,
|
kIOError = 5,
|
||||||
};
|
};
|
||||||
Code code() const { return (state_ == NULL) ? kOk : state_->first; }
|
|
||||||
|
Code code() const {
|
||||||
|
return (state_ == NULL) ? kOk : static_cast<Code>(state_[4]);
|
||||||
|
}
|
||||||
|
|
||||||
Status(Code code, const Slice& msg, const Slice& msg2);
|
Status(Code code, const Slice& msg, const Slice& msg2);
|
||||||
|
static const char* CopyState(const char* s);
|
||||||
typedef std::pair<Code, std::string> State;
|
|
||||||
State* state_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
inline Status::Status(const Status& s) {
|
inline Status::Status(const Status& s) {
|
||||||
state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
|
state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_);
|
||||||
}
|
}
|
||||||
inline void Status::operator=(const Status& s) {
|
inline void Status::operator=(const Status& s) {
|
||||||
if (this != &s) {
|
// The following condition catches both aliasing (when this == &s),
|
||||||
delete state_;
|
// and the common case where both s and *this are ok.
|
||||||
state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
|
if (state_ != s.state_) {
|
||||||
|
delete[] state_;
|
||||||
|
state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,7 +17,8 @@ class RandomAccessFile;
|
|||||||
struct ReadOptions;
|
struct ReadOptions;
|
||||||
|
|
||||||
// A Table is a sorted map from strings to strings. Tables are
|
// A Table is a sorted map from strings to strings. Tables are
|
||||||
// immutable and persistent.
|
// immutable and persistent. A Table may be safely accessed from
|
||||||
|
// multiple threads without external synchronization.
|
||||||
class Table {
|
class Table {
|
||||||
public:
|
public:
|
||||||
// Attempt to open the table that is stored in bytes [0..file_size)
|
// Attempt to open the table that is stored in bytes [0..file_size)
|
||||||
|
@ -4,6 +4,11 @@
|
|||||||
//
|
//
|
||||||
// TableBuilder provides the interface used to build a Table
|
// TableBuilder provides the interface used to build a Table
|
||||||
// (an immutable and sorted map from keys to values).
|
// (an immutable and sorted map from keys to values).
|
||||||
|
//
|
||||||
|
// Multiple threads can invoke const methods on a TableBuilder without
|
||||||
|
// external synchronization, but if any of the threads may call a
|
||||||
|
// non-const method, all threads accessing the same TableBuilder must use
|
||||||
|
// external synchronization.
|
||||||
|
|
||||||
#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
|
#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
|
||||||
#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
|
#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
|
||||||
|
@ -12,11 +12,17 @@
|
|||||||
// batch.Delete("key");
|
// batch.Delete("key");
|
||||||
// batch.Put("key", "v2");
|
// batch.Put("key", "v2");
|
||||||
// batch.Put("key", "v3");
|
// batch.Put("key", "v3");
|
||||||
|
//
|
||||||
|
// Multiple threads can invoke const methods on a WriteBatch without
|
||||||
|
// external synchronization, but if any of the threads may call a
|
||||||
|
// non-const method, all threads accessing the same WriteBatch must use
|
||||||
|
// external synchronization.
|
||||||
|
|
||||||
#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
|
#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
|
||||||
#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
|
#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include "leveldb/status.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
@ -36,6 +42,15 @@ class WriteBatch {
|
|||||||
// Clear all updates buffered in this batch.
|
// Clear all updates buffered in this batch.
|
||||||
void Clear();
|
void Clear();
|
||||||
|
|
||||||
|
// Support for iterating over the contents of a batch.
|
||||||
|
class Handler {
|
||||||
|
public:
|
||||||
|
virtual ~Handler();
|
||||||
|
virtual void Put(const Slice& key, const Slice& value) = 0;
|
||||||
|
virtual void Delete(const Slice& key) = 0;
|
||||||
|
};
|
||||||
|
Status Iterate(Handler* handler) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class WriteBatchInternal;
|
friend class WriteBatchInternal;
|
||||||
|
|
||||||
|
@ -80,7 +80,7 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) {
|
|||||||
if (counter_ < options_->block_restart_interval) {
|
if (counter_ < options_->block_restart_interval) {
|
||||||
// See how much sharing to do with previous string
|
// See how much sharing to do with previous string
|
||||||
const size_t min_length = std::min(last_key_piece.size(), key.size());
|
const size_t min_length = std::min(last_key_piece.size(), key.size());
|
||||||
while ((shared < min_length) && (last_key_[shared] == key[shared])) {
|
while ((shared < min_length) && (last_key_piece[shared] == key[shared])) {
|
||||||
shared++;
|
shared++;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -319,13 +319,15 @@ class MemTableConstructor: public Constructor {
|
|||||||
: Constructor(cmp),
|
: Constructor(cmp),
|
||||||
internal_comparator_(cmp) {
|
internal_comparator_(cmp) {
|
||||||
memtable_ = new MemTable(internal_comparator_);
|
memtable_ = new MemTable(internal_comparator_);
|
||||||
|
memtable_->Ref();
|
||||||
}
|
}
|
||||||
~MemTableConstructor() {
|
~MemTableConstructor() {
|
||||||
delete memtable_;
|
memtable_->Unref();
|
||||||
}
|
}
|
||||||
virtual Status FinishImpl(const Options& options, const KVMap& data) {
|
virtual Status FinishImpl(const Options& options, const KVMap& data) {
|
||||||
delete memtable_;
|
memtable_->Unref();
|
||||||
memtable_ = new MemTable(internal_comparator_);
|
memtable_ = new MemTable(internal_comparator_);
|
||||||
|
memtable_->Ref();
|
||||||
int seq = 1;
|
int seq = 1;
|
||||||
for (KVMap::const_iterator it = data.begin();
|
for (KVMap::const_iterator it = data.begin();
|
||||||
it != data.end();
|
it != data.end();
|
||||||
@ -736,16 +738,17 @@ class MemTableTest { };
|
|||||||
|
|
||||||
TEST(MemTableTest, Simple) {
|
TEST(MemTableTest, Simple) {
|
||||||
InternalKeyComparator cmp(BytewiseComparator());
|
InternalKeyComparator cmp(BytewiseComparator());
|
||||||
MemTable memtable(cmp);
|
MemTable* memtable = new MemTable(cmp);
|
||||||
|
memtable->Ref();
|
||||||
WriteBatch batch;
|
WriteBatch batch;
|
||||||
WriteBatchInternal::SetSequence(&batch, 100);
|
WriteBatchInternal::SetSequence(&batch, 100);
|
||||||
batch.Put(std::string("k1"), std::string("v1"));
|
batch.Put(std::string("k1"), std::string("v1"));
|
||||||
batch.Put(std::string("k2"), std::string("v2"));
|
batch.Put(std::string("k2"), std::string("v2"));
|
||||||
batch.Put(std::string("k3"), std::string("v3"));
|
batch.Put(std::string("k3"), std::string("v3"));
|
||||||
batch.Put(std::string("largekey"), std::string("vlarge"));
|
batch.Put(std::string("largekey"), std::string("vlarge"));
|
||||||
ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok());
|
ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable).ok());
|
||||||
|
|
||||||
Iterator* iter = memtable.NewIterator();
|
Iterator* iter = memtable->NewIterator();
|
||||||
iter->SeekToFirst();
|
iter->SeekToFirst();
|
||||||
while (iter->Valid()) {
|
while (iter->Valid()) {
|
||||||
fprintf(stderr, "key: '%s' -> '%s'\n",
|
fprintf(stderr, "key: '%s' -> '%s'\n",
|
||||||
@ -755,6 +758,7 @@ TEST(MemTableTest, Simple) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
delete iter;
|
delete iter;
|
||||||
|
memtable->Unref();
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
|
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
|
||||||
|
@ -141,6 +141,13 @@ class ChromiumSequentialFile: public SequentialFile {
|
|||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual Status Skip(uint64_t n) {
|
||||||
|
if (fseek(file_, n, SEEK_CUR)) {
|
||||||
|
return Status::IOError(filename_, strerror(errno));
|
||||||
|
}
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class ChromiumRandomAccessFile: public RandomAccessFile {
|
class ChromiumRandomAccessFile: public RandomAccessFile {
|
||||||
|
@ -52,6 +52,13 @@ class PosixSequentialFile: public SequentialFile {
|
|||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual Status Skip(uint64_t n) {
|
||||||
|
if (fseek(file_, n, SEEK_CUR)) {
|
||||||
|
return Status::IOError(filename_, strerror(errno));
|
||||||
|
}
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class PosixRandomAccessFile: public RandomAccessFile {
|
class PosixRandomAccessFile: public RandomAccessFile {
|
||||||
|
@ -8,13 +8,29 @@
|
|||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
const char* Status::CopyState(const char* state) {
|
||||||
|
uint32_t size;
|
||||||
|
memcpy(&size, state, sizeof(size));
|
||||||
|
char* result = new char[size + 5];
|
||||||
|
memcpy(result, state, size + 5);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
Status::Status(Code code, const Slice& msg, const Slice& msg2) {
|
Status::Status(Code code, const Slice& msg, const Slice& msg2) {
|
||||||
assert(code != kOk);
|
assert(code != kOk);
|
||||||
state_ = new State(make_pair(code, std::string(msg.data(), msg.size())));
|
const uint32_t len1 = msg.size();
|
||||||
if (!msg2.empty()) {
|
const uint32_t len2 = msg2.size();
|
||||||
state_->second.append(": ");
|
const uint32_t size = len1 + (len2 ? (2 + len2) : 0);
|
||||||
state_->second.append(msg2.data(), msg2.size());
|
char* result = new char[size + 5];
|
||||||
|
memcpy(result, &size, sizeof(size));
|
||||||
|
result[4] = static_cast<char>(code);
|
||||||
|
memcpy(result + 5, msg.data(), len1);
|
||||||
|
if (len2) {
|
||||||
|
result[5 + len1] = ':';
|
||||||
|
result[6 + len1] = ' ';
|
||||||
|
memcpy(result + 7 + len1, msg2.data(), len2);
|
||||||
}
|
}
|
||||||
|
state_ = result;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string Status::ToString() const {
|
std::string Status::ToString() const {
|
||||||
@ -23,12 +39,12 @@ std::string Status::ToString() const {
|
|||||||
} else {
|
} else {
|
||||||
char tmp[30];
|
char tmp[30];
|
||||||
const char* type;
|
const char* type;
|
||||||
switch (state_->first) {
|
switch (code()) {
|
||||||
case kOk:
|
case kOk:
|
||||||
type = "OK";
|
type = "OK";
|
||||||
break;
|
break;
|
||||||
case kNotFound:
|
case kNotFound:
|
||||||
type = "NotFound";
|
type = "NotFound: ";
|
||||||
break;
|
break;
|
||||||
case kCorruption:
|
case kCorruption:
|
||||||
type = "Corruption: ";
|
type = "Corruption: ";
|
||||||
@ -44,14 +60,14 @@ std::string Status::ToString() const {
|
|||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
|
snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
|
||||||
static_cast<int>(state_->first));
|
static_cast<int>(code()));
|
||||||
type = tmp;
|
type = tmp;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
std::string result(type);
|
std::string result(type);
|
||||||
if (!state_->second.empty()) {
|
uint32_t length;
|
||||||
result.append(state_->second);
|
memcpy(&length, state_, sizeof(length));
|
||||||
}
|
result.append(state_ + 5, length);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user