Bugfixes: for Get(), don't hold mutex while writing log.

- Fix bug in Get: when it triggers a compaction, it could sometimes
  mark the compaction with the wrong level (if there was a gap
  in the set of levels examined for the Get).

- Do not hold mutex while writing to the log file or to the
  MANIFEST file.

  Added a new benchmark that runs a writer thread concurrently with
  reader threads.

  Percentiles
  ------------------------------------------------------
  micros/op: avg  median  99   99.9  99.99  99.999  max
  ------------------------------------------------------
  before:    42   38      110  225   32000  42000   48000
  after:     24   20      55   65    130    1100    7000

- Fixed race in optimized Get.  It should have been using the
  pinned memtables, not the current memtables.

git-svn-id: https://leveldb.googlecode.com/svn/trunk@50 62dab493-f737-651d-591e-8d6aee1b9529
2011-09-01 19:08:02 +00:00 · 2011-09-01 19:08:02 +00:00 · 7263023651
commit 7263023651
parent e3584f9c28
6 changed files with 188 additions and 41 deletions
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@ -280,6 +280,7 @@ struct ThreadState {
  int tid;             // 0..n-1 when running in n threads
  Random rand;         // Has different seeds for different threads
  Stats stats;
+  SharedState* shared;

  ThreadState(int index)
      : tid(index),
@ -418,13 +419,14 @@ class Benchmark {

      // Reset parameters that may be overriddden bwlow
      num_ = FLAGS_num;
-      reads_ = num_;
+      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
      value_size_ = FLAGS_value_size;
      entries_per_batch_ = 1;
      write_options_ = WriteOptions();

      void (Benchmark::*method)(ThreadState*) = NULL;
      bool fresh_db = false;
+      int num_threads = FLAGS_threads;

      if (name == Slice("fillseq")) {
        fresh_db = true;
@ -460,6 +462,9 @@ class Benchmark {
      } else if (name == Slice("readrandomsmall")) {
        reads_ /= 1000;
        method = &Benchmark::ReadRandom;
+      } else if (name == Slice("readwhilewriting")) {
+        num_threads++;  // Add extra thread for writing
+        method = &Benchmark::ReadWhileWriting;
      } else if (name == Slice("compact")) {
        method = &Benchmark::Compact;
      } else if (name == Slice("crc32c")) {
@ -494,7 +499,7 @@ class Benchmark {
      }

      if (method != NULL) {
-        RunBenchmark(name, method);
+        RunBenchmark(num_threads, name, method);
      }
    }
  }
@ -535,8 +540,8 @@ class Benchmark {
    }
  }

-  void RunBenchmark(Slice name, void (Benchmark::*method)(ThreadState*)) {
-    const int n = FLAGS_threads;
+  void RunBenchmark(int n, Slice name,
+                    void (Benchmark::*method)(ThreadState*)) {
    SharedState shared;
    shared.total = n;
    shared.num_initialized = 0;
@ -549,6 +554,7 @@ class Benchmark {
      arg[i].method = method;
      arg[i].shared = &shared;
      arg[i].thread = new ThreadState(i);
+      arg[i].thread->shared = &shared;
      Env::Default()->StartThread(ThreadBody, &arg[i]);
    }

@ -688,7 +694,6 @@ class Benchmark {
    RandomGenerator gen;
    WriteBatch batch;
    Status s;
-    std::string val;
    int64_t bytes = 0;
    for (int i = 0; i < num_; i += entries_per_batch_) {
      batch.Clear();
@ -760,6 +765,36 @@ class Benchmark {
    }
  }

+  void ReadWhileWriting(ThreadState* thread) {
+    if (thread->tid > 0) {
+      ReadRandom(thread);
+    } else {
+      // Special thread that keeps writing until other threads are done.
+      RandomGenerator gen;
+      while (true) {
+        {
+          MutexLock l(&thread->shared->mu);
+          if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
+            // Other threads have finished
+            break;
+          }
+        }
+
+        const int k = thread->rand.Next() % FLAGS_num;
+        char key[100];
+        snprintf(key, sizeof(key), "%016d", k);
+        Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
+        if (!s.ok()) {
+          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+      }
+
+      // Do not count any of the preceding work/delay in stats.
+      thread->stats.Start();
+    }
+  }
+
  void Compact(ThreadState* thread) {
    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
    dbi->TEST_CompactMemTable();
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@ -113,6 +113,8 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
      logfile_(NULL),
      logfile_number_(0),
      log_(NULL),
+      logger_(NULL),
+      logger_cv_(&mutex_),
      bg_compaction_scheduled_(false),
      manual_compaction_(NULL) {
  mem_->Ref();
@ -308,6 +310,11 @@ Status DBImpl::Recover(VersionEdit* edit) {
    std::sort(logs.begin(), logs.end());
    for (size_t i = 0; i < logs.size(); i++) {
      s = RecoverLogFile(logs[i], edit, &max_sequence);
+
+      // The previous incarnation may not have written any MANIFEST
+      // records after allocating this log number.  So we manually
+      // update the file number allocation counter in VersionSet.
+      versions_->MarkFileNumberUsed(logs[i]);
    }

    if (s.ok()) {
@ -485,7 +492,7 @@ Status DBImpl::CompactMemTable() {
  if (s.ok()) {
    edit.SetPrevLogNumber(0);
    edit.SetLogNumber(logfile_number_);  // Earlier logs no longer needed
-    s = versions_->LogAndApply(&edit);
+    s = versions_->LogAndApply(&edit, &mutex_);
  }

  if (s.ok()) {
@ -523,7 +530,10 @@ void DBImpl::TEST_CompactRange(

 Status DBImpl::TEST_CompactMemTable() {
  MutexLock l(&mutex_);
+  LoggerId self;
+  AcquireLoggingResponsibility(&self);
  Status s = MakeRoomForWrite(true /* force compaction */);
+  ReleaseLoggingResponsibility(&self);
  if (s.ok()) {
    // Wait until the compaction completes
    while (imm_ != NULL && bg_error_.ok()) {
@ -600,7 +610,7 @@ void DBImpl::BackgroundCompaction() {
    c->edit()->DeleteFile(c->level(), f->number);
    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
                       f->smallest, f->largest);
-    status = versions_->LogAndApply(c->edit());
+    status = versions_->LogAndApply(c->edit(), &mutex_);
    VersionSet::LevelSummaryStorage tmp;
    Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
        static_cast<unsigned long long>(f->number),
@ -748,7 +758,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
  }
  compact->outputs.clear();

-  Status s = versions_->LogAndApply(compact->compaction->edit());
+  Status s = versions_->LogAndApply(compact->compaction->edit(), &mutex_);
  if (s.ok()) {
    compact->compaction->ReleaseInputs();
    DeleteObsoleteFiles();
@ -1004,9 +1014,9 @@ Status DBImpl::Get(const ReadOptions& options,
    mutex_.Unlock();
    // First look in the memtable, then in the immutable memtable (if any).
    LookupKey lkey(key, snapshot);
-    if (mem_->Get(lkey, value, &s)) {
+    if (mem->Get(lkey, value, &s)) {
      // Done
-    } else if (imm_ != NULL && imm_->Get(lkey, value, &s)) {
+    } else if (imm != NULL && imm->Get(lkey, value, &s)) {
      // Done
    } else {
      s = current->Get(options, lkey, value, &stats);
@ -1053,17 +1063,39 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
  return DB::Delete(options, key);
 }

+// There is at most one thread that is the current logger.  This call
+// waits until preceding logger(s) have finished and becomes the
+// current logger.
+void DBImpl::AcquireLoggingResponsibility(LoggerId* self) {
+  while (logger_ != NULL) {
+    logger_cv_.Wait();
+  }
+  logger_ = self;
+}
+
+void DBImpl::ReleaseLoggingResponsibility(LoggerId* self) {
+  assert(logger_ == self);
+  logger_ = NULL;
+  logger_cv_.SignalAll();
+}
+
 Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
  Status status;
  MutexLock l(&mutex_);
+  LoggerId self;
+  AcquireLoggingResponsibility(&self);
  status = MakeRoomForWrite(false);  // May temporarily release lock and wait
  uint64_t last_sequence = versions_->LastSequence();
  if (status.ok()) {
    WriteBatchInternal::SetSequence(updates, last_sequence + 1);
    last_sequence += WriteBatchInternal::Count(updates);
-    versions_->SetLastSequence(last_sequence);

-    // Add to log and apply to memtable
+    // Add to log and apply to memtable.  We can release the lock during
+    // this phase since the "logger_" flag protects against concurrent
+    // loggers and concurrent writes into mem_.
+    {
+      assert(logger_ == &self);
+      mutex_.Unlock();
      status = log_->AddRecord(WriteBatchInternal::Contents(updates));
      if (status.ok() && options.sync) {
        status = logfile_->Sync();
@ -1071,16 +1103,25 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
      if (status.ok()) {
        status = WriteBatchInternal::InsertInto(updates, mem_);
      }
+      mutex_.Lock();
+      assert(logger_ == &self);
+    }
+
+    versions_->SetLastSequence(last_sequence);
  }
  if (options.post_write_snapshot != NULL) {
    *options.post_write_snapshot =
        status.ok() ? snapshots_.New(last_sequence) : NULL;
  }
+  ReleaseLoggingResponsibility(&self);
  return status;
 }

+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is the current logger
 Status DBImpl::MakeRoomForWrite(bool force) {
  mutex_.AssertHeld();
+  assert(logger_ != NULL);
  bool allow_delay = !force;
  Status s;
  while (true) {
@ -1249,7 +1290,7 @@ Status DB::Open(const Options& options, const std::string& dbname,
      impl->logfile_ = lfile;
      impl->logfile_number_ = new_log_number;
      impl->log_ = new log::Writer(lfile);
-      s = impl->versions_->LogAndApply(&edit);
+      s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
    }
    if (s.ok()) {
      impl->DeleteObsoleteFiles();
--- a/db/db_impl.h
+++ b/db/db_impl.h
@ -87,6 +87,11 @@ class DBImpl : public DB {

  Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base);

+  // Only one thread is allowed to log at a time.
+  struct LoggerId { };          // Opaque identifier for logging thread
+  void AcquireLoggingResponsibility(LoggerId* self);
+  void ReleaseLoggingResponsibility(LoggerId* self);
+
  Status MakeRoomForWrite(bool force /* compact even if there is room? */);

  struct CompactionState;
@ -126,6 +131,8 @@ class DBImpl : public DB {
  WritableFile* logfile_;
  uint64_t logfile_number_;
  log::Writer* log_;
+  LoggerId* logger_;            // NULL, or the id of the current logging thread
+  port::CondVar logger_cv_;     // For threads waiting to log
  SnapshotList snapshots_;

  // Set of table files to protect from deletion because they are
--- a/db/db_test.cc
+++ b/db/db_test.cc
@ -10,6 +10,7 @@
 #include "leveldb/env.h"
 #include "leveldb/table.h"
 #include "util/logging.h"
+#include "util/mutexlock.h"
 #include "util/testharness.h"
 #include "util/testutil.h"

@ -345,6 +346,41 @@ TEST(DBTest, GetPicksCorrectFile) {
  ASSERT_EQ("vx", Get("x"));
 }

+TEST(DBTest, GetEncountersEmptyLevel) {
+  // Arrange for the following to happen:
+  //   * sstable A in level 0
+  //   * nothing in level 1
+  //   * sstable B in level 2
+  // Then do enough Get() calls to arrange for an automatic compaction
+  // of sstable A.  A bug would cause the compaction to be marked as
+  // occurring at level 1 (instead of the correct level 0).
+
+  // Step 1: First place sstables in levels 0 and 2
+  int compaction_count = 0;
+  while (NumTableFilesAtLevel(0) == 0 ||
+         NumTableFilesAtLevel(2) == 0) {
+    ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2";
+    compaction_count++;
+    Put("a", "begin");
+    Put("z", "end");
+    dbfull()->TEST_CompactMemTable();
+  }
+
+  // Step 2: clear level 1 if necessary.
+  dbfull()->TEST_CompactRange(1, "a", "z");
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+
+  // Step 3: read until level 0 compaction disappears.
+  int read_count = 0;
+  while (NumTableFilesAtLevel(0) > 0) {
+    ASSERT_LE(read_count, 10000) << "did not trigger level 0 compaction";
+    read_count++;
+    ASSERT_EQ("NOT_FOUND", Get("missing"));
+  }
+}
+
 TEST(DBTest, IterEmpty) {
  Iterator* iter = db_->NewIterator(ReadOptions());

@ -1355,6 +1391,9 @@ void BM_LogAndApply(int iters, int num_base_files) {

  Env* env = Env::Default();

+  port::Mutex mu;
+  MutexLock l(&mu);
+
  InternalKeyComparator cmp(BytewiseComparator());
  Options options;
  VersionSet vset(dbname, &options, NULL, &cmp);
@ -1366,7 +1405,7 @@ void BM_LogAndApply(int iters, int num_base_files) {
    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
    vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
  }
-  ASSERT_OK(vset.LogAndApply(&vbase));
+  ASSERT_OK(vset.LogAndApply(&vbase, &mu));

  uint64_t start_micros = env->NowMicros();

@ -1376,7 +1415,7 @@ void BM_LogAndApply(int iters, int num_base_files) {
    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
    vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
-    vset.LogAndApply(&vedit);
+    vset.LogAndApply(&vedit, &mu);
  }
  uint64_t stop_micros = env->NowMicros();
  unsigned int us = stop_micros - start_micros;
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -250,6 +250,7 @@ Status Version::Get(const ReadOptions& options,
  stats->seek_file = NULL;
  stats->seek_file_level = -1;
  FileMetaData* last_file_read = NULL;
+  int last_file_read_level = -1;

  // We can search level-by-level since entries never hop across
  // levels.  Therefore we are guaranteed that if we find data
@ -301,11 +302,12 @@ Status Version::Get(const ReadOptions& options,
      if (last_file_read != NULL && stats->seek_file == NULL) {
        // We have had more than one seek for this read.  Charge the 1st file.
        stats->seek_file = last_file_read;
-        stats->seek_file_level = (i == 0 ? level - 1 : level);
+        stats->seek_file_level = last_file_read_level;
      }

      FileMetaData* f = files[i];
      last_file_read = f;
+      last_file_read_level = level;

      Iterator* iter = vset_->table_cache_->NewIterator(
          options,
@ -609,7 +611,7 @@ void VersionSet::AppendVersion(Version* v) {
  v->next_->prev_ = v;
 }

-Status VersionSet::LogAndApply(VersionEdit* edit) {
+Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
  if (edit->has_log_number_) {
    assert(edit->log_number_ >= log_number_);
    assert(edit->log_number_ < next_file_number_);
@ -637,6 +639,8 @@ Status VersionSet::LogAndApply(VersionEdit* edit) {
  std::string new_manifest_file;
  Status s;
  if (descriptor_log_ == NULL) {
+    // No reason to unlock *mu here since we only hit this path in the
+    // first call to LogAndApply (when opening the database).
    assert(descriptor_file_ == NULL);
    new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
    edit->SetNextFile(next_file_number_);
@ -647,6 +651,10 @@ Status VersionSet::LogAndApply(VersionEdit* edit) {
    }
  }

+  // Unlock during expensive MANIFEST log write
+  {
+    mu->Unlock();
+
    // Write new record to MANIFEST log
    if (s.ok()) {
      std::string record;
@ -663,6 +671,9 @@ Status VersionSet::LogAndApply(VersionEdit* edit) {
      s = SetCurrentFile(env_, dbname_, manifest_file_number_);
    }

+    mu->Lock();
+  }
+
  // Install the new version
  if (s.ok()) {
    AppendVersion(v);
@ -776,6 +787,9 @@ Status VersionSet::Recover() {
    if (!have_prev_log_number) {
      prev_log_number = 0;
    }
+
+    MarkFileNumberUsed(prev_log_number);
+    MarkFileNumberUsed(log_number);
  }

  if (s.ok()) {
@ -794,6 +808,12 @@ Status VersionSet::Recover() {
  return s;
 }

+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+  if (next_file_number_ <= number) {
+    next_file_number_ = number + 1;
+  }
+}
+
 static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
  int64_t sum = 0;
  for (size_t i = 0; i < files.size(); i++) {
--- a/db/version_set.h
+++ b/db/version_set.h
@ -138,15 +138,14 @@ class VersionSet {

  // Apply *edit to the current version to form a new descriptor that
  // is both saved to persistent state and installed as the new
-  // current version.
-  Status LogAndApply(VersionEdit* edit);
+  // current version.  Will release *mu while actually writing to the file.
+  // REQUIRES: *mu is held on entry.
+  // REQUIRES: no other thread concurrently calls LogAndApply()
+  Status LogAndApply(VersionEdit* edit, port::Mutex* mu);

  // Recover the last saved descriptor from persistent storage.
  Status Recover();

-  // Save current contents to *log
-  Status WriteSnapshot(log::Writer* log);
-
  // Return the current version.
  Version* current() const { return current_; }

@ -171,6 +170,9 @@ class VersionSet {
    last_sequence_ = s;
  }

+  // Mark the specified file number as used.
+  void MarkFileNumberUsed(uint64_t number);
+
  // Return the current log file number.
  uint64_t LogNumber() const { return log_number_; }

@ -247,6 +249,9 @@ class VersionSet {

  void SetupOtherInputs(Compaction* c);

+  // Save current contents to *log
+  Status WriteSnapshot(log::Writer* log);
+
  void AppendVersion(Version* v);

  Env* const env_;