sync with upstream @ 21409451

Check the NEWS file for details of what changed. git-svn-id: https://leveldb.googlecode.com/svn/trunk@28 62dab493-f737-651d-591e-8d6aee1b9529
2011-05-21 02:17:43 +00:00 · 2011-05-21 02:17:43 +00:00 · da79909507
commit da79909507
parent 3c111335a7
34 changed files with 953 additions and 406 deletions
--- a/17
+++ b/17
@ -0,0 +1,17 @@
 Release 1.2 2011-05-16
 ----------------------
 Fixes for larger databases (tested up to one billion 100-byte entries,
 i.e., ~100GB).
 (1) Place hard limit on number of level-0 files.  This fixes errors
 of the form "too many open files".
 (2) Fixed memtable management.  Before the fix, a heavy write burst
 could cause unbounded memory usage.
 A fix for a logging bug where the reader would incorrectly complain
 about corruption.
 Allow public access to WriteBatch contents so that users can easily
 wrap a DB.
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@ -24,9 +24,10 @@
 //      overwrite     -- overwrite N values in random key order in async mode
 //      fillsync      -- write N/100 values in random key order in sync mode
 //      fill100K      -- write N/1000 100K values in random order in async mode
-//      readseq       -- read N values sequentially
+//      readseq       -- read N times sequentially
-//      readreverse   -- read N values in reverse order
+//      readreverse   -- read N times in reverse order
-//      readrandom    -- read N values in random order
+//      readrandom    -- read N times in random order
 //      readhot       -- read N times in random order from 1% section of DB
 //      crc32c        -- repeated crc32c of 4K of data
 //   Meta operations:
 //      compact     -- Compact the entire DB
@ -54,6 +55,9 @@ static const char* FLAGS_benchmarks =
 // Number of key/values to place in database
 static int FLAGS_num = 1000000;
 // Number of read operations to do.  If negative, do FLAGS_num reads.
 static int FLAGS_reads = -1;
 // Size of each value
 static int FLAGS_value_size = 100;
@ -72,6 +76,14 @@ static int FLAGS_write_buffer_size = 0;
 // Negative means use default settings.
 static int FLAGS_cache_size = -1;
 // Maximum number of files to keep open at the same time (use default if == 0)
 static int FLAGS_open_files = 0;
 // If true, do not destroy the existing database.  If you set this
 // flag and also specify a benchmark that wants a fresh database, that
 // benchmark will fail.
 static bool FLAGS_use_existing_db = false;
 namespace leveldb {
 // Helper for quickly generating random data.
@ -126,6 +138,7 @@ class Benchmark {
  Cache* cache_;
  DB* db_;
  int num_;
  int reads_;
  int heap_counter_;
  double start_;
  double last_op_finish_;
@ -298,6 +311,7 @@ class Benchmark {
  : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
    db_(NULL),
    num_(FLAGS_num),
    reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
    heap_counter_(0),
    bytes_(0),
    rand_(301) {
@ -308,8 +322,10 @@ class Benchmark {
        Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]);
      }
    }
    if (!FLAGS_use_existing_db) {
      DestroyDB("/tmp/dbbench", Options());
    }
  }
  ~Benchmark() {
    delete db_;
@ -355,11 +371,13 @@ class Benchmark {
        ReadReverse();
      } else if (name == Slice("readrandom")) {
        ReadRandom();
      } else if (name == Slice("readhot")) {
        ReadHot();
      } else if (name == Slice("readrandomsmall")) {
-        int n = num_;
+        int n = reads_;
-        num_ /= 1000;
+        reads_ /= 1000;
        ReadRandom();
-        num_ = n;
+        reads_ = n;
      } else if (name == Slice("compact")) {
        Compact();
      } else if (name == Slice("crc32c")) {
@ -449,7 +467,7 @@ class Benchmark {
  void Open() {
    assert(db_ == NULL);
    Options options;
-    options.create_if_missing = true;
+    options.create_if_missing = !FLAGS_use_existing_db;
    options.block_cache = cache_;
    options.write_buffer_size = FLAGS_write_buffer_size;
    Status s = DB::Open(options, "/tmp/dbbench", &db_);
@ -462,6 +480,10 @@ class Benchmark {
  void Write(const WriteOptions& options, Order order, DBState state,
             int num_entries, int value_size, int entries_per_batch) {
    if (state == FRESH) {
      if (FLAGS_use_existing_db) {
        message_ = "skipping (--use_existing_db is true)";
        return;
      }
      delete db_;
      db_ = NULL;
      DestroyDB("/tmp/dbbench", Options());
@ -499,7 +521,7 @@ class Benchmark {
  void ReadSequential() {
    Iterator* iter = db_->NewIterator(ReadOptions());
    int i = 0;
-    for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) {
+    for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
      bytes_ += iter->key().size() + iter->value().size();
      FinishedSingleOp();
      ++i;
@ -510,7 +532,7 @@ class Benchmark {
  void ReadReverse() {
    Iterator* iter = db_->NewIterator(ReadOptions());
    int i = 0;
-    for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) {
+    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
      bytes_ += iter->key().size() + iter->value().size();
      FinishedSingleOp();
      ++i;
@ -521,7 +543,7 @@ class Benchmark {
  void ReadRandom() {
    ReadOptions options;
    std::string value;
-    for (int i = 0; i < num_; i++) {
+    for (int i = 0; i < reads_; i++) {
      char key[100];
      const int k = rand_.Next() % FLAGS_num;
      snprintf(key, sizeof(key), "%016d", k);
@ -530,6 +552,19 @@ class Benchmark {
    }
  }
  void ReadHot() {
    ReadOptions options;
    std::string value;
    const int range = (FLAGS_num + 99) / 100;
    for (int i = 0; i < reads_; i++) {
      char key[100];
      const int k = rand_.Next() % range;
      snprintf(key, sizeof(key), "%016d", k);
      db_->Get(options, key, &value);
      FinishedSingleOp();
    }
  }
  void Compact() {
    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
    dbi->TEST_CompactMemTable();
@ -582,6 +617,8 @@ class Benchmark {
 int main(int argc, char** argv) {
  FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
  FLAGS_open_files = leveldb::Options().max_open_files;
  for (int i = 1; i < argc; i++) {
    double d;
    int n;
@ -593,14 +630,21 @@ int main(int argc, char** argv) {
    } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
               (n == 0 || n == 1)) {
      FLAGS_histogram = n;
    } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
               (n == 0 || n == 1)) {
      FLAGS_use_existing_db = n;
    } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
      FLAGS_num = n;
    } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
      FLAGS_reads = n;
    } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
      FLAGS_value_size = n;
    } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
      FLAGS_write_buffer_size = n;
    } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
      FLAGS_cache_size = n;
    } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
      FLAGS_open_files = n;
    } else {
      fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
      exit(1);
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@ -126,6 +126,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
      log_(NULL),
      bg_compaction_scheduled_(false),
      compacting_(false) {
  mem_->Ref();
  has_imm_.Release_Store(NULL);
  // Reserve ten files or so for other uses and give the rest to TableCache.
@ -152,8 +153,8 @@ DBImpl::~DBImpl() {
  }
  delete versions_;
-  delete mem_;
+  if (mem_ != NULL) mem_->Unref();
-  delete imm_;
+  if (imm_ != NULL) imm_->Unref();
  delete log_;
  delete logfile_;
  delete table_cache_;
@ -344,7 +345,8 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
  // paranoid_checks==false so that corruptions cause entire commits
  // to be skipped instead of propagating bad information (like overly
  // large sequence numbers).
-  log::Reader reader(file, &reporter, true/*checksum*/);
+  log::Reader reader(file, &reporter, true/*checksum*/,
                     0/*initial_offset*/);
  Log(env_, options_.info_log, "Recovering log #%llu",
      (unsigned long long) log_number);
@ -364,6 +366,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
    if (mem == NULL) {
      mem = new MemTable(internal_comparator_);
      mem->Ref();
    }
    status = WriteBatchInternal::InsertInto(&batch, mem);
    MaybeIgnoreError(&status);
@ -384,7 +387,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
        // file-systems cause the DB::Open() to fail.
        break;
      }
-      delete mem;
+      mem->Unref();
      mem = NULL;
    }
  }
@ -395,7 +398,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
    // file-systems cause the DB::Open() to fail.
  }
-  delete mem;
+  if (mem != NULL) mem->Unref();
  delete file;
  return status;
 }
@ -443,11 +446,12 @@ Status DBImpl::CompactMemTable() {
  // Replace immutable memtable with the generated Table
  if (s.ok()) {
    edit.SetPrevLogNumber(0);
-    s = versions_->LogAndApply(&edit, imm_);
+    s = versions_->LogAndApply(&edit);
  }
  if (s.ok()) {
    // Commit to the new state
    imm_->Unref();
    imm_ = NULL;
    has_imm_.Release_Store(NULL);
    DeleteObsoleteFiles();
@ -556,7 +560,7 @@ void DBImpl::BackgroundCompaction() {
    c->edit()->DeleteFile(c->level(), f->number);
    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
                       f->smallest, f->largest);
-    status = versions_->LogAndApply(c->edit(), NULL);
+    status = versions_->LogAndApply(c->edit());
    Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n",
        static_cast<unsigned long long>(f->number),
        c->level() + 1,
@ -697,7 +701,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
  }
  compact->outputs.clear();
-  Status s = versions_->LogAndApply(compact->compaction->edit(), NULL);
+  Status s = versions_->LogAndApply(compact->compaction->edit());
  if (s.ok()) {
    compact->compaction->ReleaseInputs();
    DeleteObsoleteFiles();
@ -754,9 +758,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
    }
    Slice key = input->key();
-    InternalKey tmp_internal_key;
+    if (compact->compaction->ShouldStopBefore(key) &&
    tmp_internal_key.DecodeFrom(key);
    if (compact->compaction->ShouldStopBefore(tmp_internal_key) &&
        compact->builder != NULL) {
      status = FinishCompactionOutputFile(compact, input);
      if (!status.ok()) {
@ -867,6 +869,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
  }
  compacting_ = false;
  compacting_cv_.SignalAll();
  VersionSet::LevelSummaryStorage tmp;
  Log(env_, options_.info_log,
      "compacted to: %s", versions_->LevelSummary(&tmp));
  return status;
 }
@ -925,10 +930,11 @@ Status DBImpl::Get(const ReadOptions& options,
 Iterator* DBImpl::NewIterator(const ReadOptions& options) {
  SequenceNumber latest_snapshot;
  Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
-  SequenceNumber sequence =
+  return NewDBIterator(
-      (options.snapshot ? options.snapshot->number_ : latest_snapshot);
+      &dbname_, env_, user_comparator(), internal_iter,
-  return NewDBIterator(&dbname_, env_,
+      (options.snapshot != NULL
-                       user_comparator(), internal_iter, sequence);
+       ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
       : latest_snapshot));
 }
 void DBImpl::Unref(void* arg1, void* arg2) {
@ -945,7 +951,7 @@ const Snapshot* DBImpl::GetSnapshot() {
 void DBImpl::ReleaseSnapshot(const Snapshot* s) {
  MutexLock l(&mutex_);
-  snapshots_.Delete(s);
+  snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
 }
 // Convenience methods
@ -985,12 +991,26 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
 Status DBImpl::MakeRoomForWrite(bool force) {
  mutex_.AssertHeld();
  bool allow_delay = !force;
  Status s;
  while (true) {
    if (!bg_error_.ok()) {
      // Yield previous error
      s = bg_error_;
      break;
    } else if (
        allow_delay &&
        versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
      // We are getting close to hitting a hard limit on the number of
      // L0 files.  Rather than delaying a single write by several
      // seconds when we hit the hard limit, start delaying each
      // individual write by 1ms to reduce latency variance.  Also,
      // this delay hands over some CPU to the compaction thread in
      // case it is sharing the same core as the writer.
      mutex_.Unlock();
      env_->SleepForMicroseconds(1000);
      allow_delay = false;  // Do not delay a single write more than once
      mutex_.Lock();
    } else if (!force &&
               (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
      // There is room in current memtable
@ -999,6 +1019,9 @@ Status DBImpl::MakeRoomForWrite(bool force) {
      // We have filled up the current memtable, but the previous
      // one is still being compacted, so we wait.
      compacting_cv_.Wait();
    } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
      // There are too many level-0 files.
      compacting_cv_.Wait();
    } else {
      // Attempt to switch to a new memtable and trigger compaction of old
      assert(versions_->PrevLogNumber() == 0);
@ -1011,7 +1034,7 @@ Status DBImpl::MakeRoomForWrite(bool force) {
      VersionEdit edit;
      edit.SetPrevLogNumber(versions_->LogNumber());
      edit.SetLogNumber(new_log_number);
-      s = versions_->LogAndApply(&edit, NULL);
+      s = versions_->LogAndApply(&edit);
      if (!s.ok()) {
        delete lfile;
        env_->DeleteFile(LogFileName(dbname_, new_log_number));
@ -1024,6 +1047,7 @@ Status DBImpl::MakeRoomForWrite(bool force) {
      imm_ = mem_;
      has_imm_.Release_Store(imm_);
      mem_ = new MemTable(internal_comparator_);
      mem_->Ref();
      force = false;   // Do not force another compaction if have room
      MaybeScheduleCompaction();
    }
@ -1141,10 +1165,11 @@ Status DB::Open(const Options& options, const std::string& dbname,
      edit.SetLogNumber(new_log_number);
      impl->logfile_ = lfile;
      impl->log_ = new log::Writer(lfile);
-      s = impl->versions_->LogAndApply(&edit, NULL);
+      s = impl->versions_->LogAndApply(&edit);
    }
    if (s.ok()) {
      impl->DeleteObsoleteFiles();
      impl->MaybeScheduleCompaction();
    }
  }
  impl->mutex_.Unlock();
@ -1156,6 +1181,9 @@ Status DB::Open(const Options& options, const std::string& dbname,
  return s;
 }
 // Out-of-line definition of the Snapshot destructor; it has no work to do.
 Snapshot::~Snapshot() {
 }
 Status DestroyDB(const std::string& dbname, const Options& options) {
  Env* env = options.env;
  std::vector<std::string> filenames;
--- a/db/db_test.cc
+++ b/db/db_test.cc
@ -3,7 +3,6 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "leveldb/db.h"
 #include "db/db_impl.h"
 #include "db/filename.h"
 #include "db/version_set.h"
@ -802,8 +801,17 @@ TEST(DBTest, DBOpen_Options) {
  db = NULL;
 }
 namespace {
 typedef std::map<std::string, std::string> KVMap;
 }
 class ModelDB: public DB {
 public:
  class ModelSnapshot : public Snapshot {
   public:
    KVMap map_;
  };
  explicit ModelDB(const Options& options): options_(options) { }
  ~ModelDB() { }
  virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
@ -824,35 +832,34 @@ class ModelDB: public DB {
      return new ModelIter(saved, true);
    } else {
      const KVMap* snapshot_state =
-          reinterpret_cast<const KVMap*>(options.snapshot->number_);
+          &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
      return new ModelIter(snapshot_state, false);
    }
  }
  virtual const Snapshot* GetSnapshot() {
-    KVMap* saved = new KVMap;
+    ModelSnapshot* snapshot = new ModelSnapshot;
-    *saved = map_;
+    snapshot->map_ = map_;
-    return snapshots_.New(
+    return snapshot;
        reinterpret_cast<SequenceNumber>(saved));
  }
  virtual void ReleaseSnapshot(const Snapshot* snapshot) {
-    const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_);
+    delete reinterpret_cast<const ModelSnapshot*>(snapshot);
    delete saved;
    snapshots_.Delete(snapshot);
  }
  virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
    assert(options.post_write_snapshot == NULL);   // Not supported
-    for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) {
+    class Handler : public WriteBatch::Handler {
-      switch (it.op()) {
+     public:
-        case kTypeValue:
+      KVMap* map_;
-          map_[it.key().ToString()] = it.value().ToString();
+      virtual void Put(const Slice& key, const Slice& value) {
-          break;
+        (*map_)[key.ToString()] = value.ToString();
        case kTypeDeletion:
          map_.erase(it.key().ToString());
          break;
      }
      virtual void Delete(const Slice& key) {
        map_->erase(key.ToString());
      }
-    return Status::OK();
+    };
    Handler handler;
    handler.map_ = &map_;
    return batch->Iterate(&handler);
  }
  virtual bool GetProperty(const Slice& property, std::string* value) {
@ -864,7 +871,6 @@ class ModelDB: public DB {
    }
  }
 private:
  typedef std::map<std::string, std::string> KVMap;
  class ModelIter: public Iterator {
   public:
    ModelIter(const KVMap* map, bool owned)
@ -897,7 +903,6 @@ class ModelDB: public DB {
  };
  const Options options_;
  KVMap map_;
  SnapshotList snapshots_;
 };
 static std::string RandomKey(Random* rnd) {
@ -1023,8 +1028,70 @@ TEST(DBTest, Randomized) {
  if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
 }
 // Renders num in decimal, left-padded with zeros to a width of at least
 // 16 characters, and returns it as a std::string.
 std::string MakeKey(unsigned int num) {
  char digits[30];
  snprintf(digits, sizeof(digits), "%016u", num);
  std::string key;
  key.assign(digits);
  return key;
 }
 void BM_LogAndApply(int iters, int num_base_files) {
  std::string dbname = test::TmpDir() + "/leveldb_test_benchmark";
  DestroyDB(dbname, Options());
  DB* db = NULL;
  Options opts;
  opts.create_if_missing = true;
  Status s = DB::Open(opts, dbname, &db);
  ASSERT_OK(s);
  ASSERT_TRUE(db != NULL);
  delete db;
  db = NULL;
  Env* env = Env::Default();
  InternalKeyComparator cmp(BytewiseComparator());
  Options options;
  VersionSet vset(dbname, &options, NULL, &cmp);
  ASSERT_OK(vset.Recover());
  VersionEdit vbase;
  uint64_t fnum = 1;
  for (int i = 0; i < num_base_files; i++) {
    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
    vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
  }
  ASSERT_OK(vset.LogAndApply(&vbase));
  uint64_t start_micros = env->NowMicros();
  for (int i = 0; i < iters; i++) {
    VersionEdit vedit;
    vedit.DeleteFile(2, fnum);
    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
    vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
    vset.LogAndApply(&vedit);
  }
  uint64_t stop_micros = env->NowMicros();
  unsigned int us = stop_micros - start_micros;
  char buf[16];
  snprintf(buf, sizeof(buf), "%d", num_base_files);
  fprintf(stderr,
          "BM_LogAndApply/%-6s   %8d iters : %9u us (%7.0f us / iter)\n",
          buf, iters, us, ((float)us) / iters);
 }
 }
 // Test-binary entry point.  With "--benchmark" as the first argument the
 // LogAndApply benchmark is run at several base-file counts; otherwise
 // all registered tests are run.
 int main(int argc, char** argv) {
  const bool want_benchmark =
      (argc > 1) && (std::string(argv[1]) == "--benchmark");
  if (!want_benchmark) {
    return leveldb::test::RunAllTests();
  }
  // Arguments are (iterations, number of base files).
  leveldb::BM_LogAndApply(1000, 1);
  leveldb::BM_LogAndApply(1000, 100);
  leveldb::BM_LogAndApply(1000, 10000);
  leveldb::BM_LogAndApply(100, 100000);
  return 0;
 }
--- a/db/dbformat.h
+++ b/db/dbformat.h
@ -19,6 +19,16 @@ namespace leveldb {
 // parameters set via options.
 namespace config {
 // NOTE(review): presumably the number of levels in the table hierarchy --
 // confirm against the code that consumes it.
 static const int kNumLevels = 7;
 // The three level-0 thresholds below are ordered
 // compaction (4) < slowdown (8) < stop (12).
 // Level-0 compaction is started when we hit this many files.
 static const int kL0_CompactionTrigger = 4;
 // Soft limit on number of level-0 files.  We slow down writes at this point.
 static const int kL0_SlowdownWritesTrigger = 8;
 // Maximum number of level-0 files.  We stop writes at this point.
 static const int kL0_StopWritesTrigger = 12;
 }
 class InternalKey;
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@ -4,7 +4,6 @@
 #include "db/log_reader.h"
 #include <stdint.h>
 #include "leveldb/env.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
@ -15,46 +14,104 @@ namespace log {
 Reader::Reporter::~Reporter() {
 }
-Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum)
+Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
               uint64_t initial_offset)
    : file_(file),
      reporter_(reporter),
      checksum_(checksum),
      backing_store_(new char[kBlockSize]),
      buffer_(),
-      eof_(false) {
+      eof_(false),
      last_record_offset_(0),
      end_of_buffer_offset_(0),
      initial_offset_(initial_offset) {
 }
 // Frees the block-sized buffer allocated with new[] in the constructor.
 Reader::~Reader() {
  delete[] backing_store_;
 }
 bool Reader::SkipToInitialBlock() {
  size_t offset_in_block = initial_offset_ % kBlockSize;
  uint64_t block_start_location = initial_offset_ - offset_in_block;
  // Don't search a block if we'd be in the trailer
  if (offset_in_block > kBlockSize - 6) {
    offset_in_block = 0;
    block_start_location += kBlockSize;
  }
  end_of_buffer_offset_ = block_start_location;
  // Skip to start of first block that can contain the initial record
  if (block_start_location > 0) {
    Status skip_status = file_->Skip(block_start_location);
    if (!skip_status.ok()) {
      ReportDrop(block_start_location, skip_status);
      return false;
    }
  }
  return true;
 }
 bool Reader::ReadRecord(Slice* record, std::string* scratch) {
  if (last_record_offset_ < initial_offset_) {
    if (!SkipToInitialBlock()) {
      return false;
    }
  }
  scratch->clear();
  record->clear();
  bool in_fragmented_record = false;
  // Record offset of the logical record that we're reading
  // 0 is a dummy value to make compilers happy
  uint64_t prospective_record_offset = 0;
  Slice fragment;
  while (true) {
    uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
    switch (ReadPhysicalRecord(&fragment)) {
      case kFullType:
        if (in_fragmented_record) {
-          ReportDrop(scratch->size(), "partial record without end");
+          // Handle bug in earlier versions of log::Writer where
          // it could emit an empty kFirstType record at the tail end
          // of a block followed by a kFullType or kFirstType record
          // at the beginning of the next block.
          if (scratch->empty()) {
            in_fragmented_record = false;
          } else {
            ReportCorruption(scratch->size(), "partial record without end(1)");
          }
        }
        prospective_record_offset = physical_record_offset;
        scratch->clear();
        *record = fragment;
        last_record_offset_ = prospective_record_offset;
        return true;
      case kFirstType:
        if (in_fragmented_record) {
-          ReportDrop(scratch->size(), "partial record without end");
+          // Handle bug in earlier versions of log::Writer where
          // it could emit an empty kFirstType record at the tail end
          // of a block followed by a kFullType or kFirstType record
          // at the beginning of the next block.
          if (scratch->empty()) {
            in_fragmented_record = false;
          } else {
            ReportCorruption(scratch->size(), "partial record without end(2)");
          }
        }
        prospective_record_offset = physical_record_offset;
        scratch->assign(fragment.data(), fragment.size());
        in_fragmented_record = true;
        break;
      case kMiddleType:
        if (!in_fragmented_record) {
-          ReportDrop(fragment.size(), "missing start of fragmented record");
+          ReportCorruption(fragment.size(),
                           "missing start of fragmented record(1)");
        } else {
          scratch->append(fragment.data(), fragment.size());
        }
@ -62,31 +119,33 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
      case kLastType:
        if (!in_fragmented_record) {
-          ReportDrop(fragment.size(), "missing start of fragmented record");
+          ReportCorruption(fragment.size(),
                           "missing start of fragmented record(2)");
        } else {
          scratch->append(fragment.data(), fragment.size());
          *record = Slice(*scratch);
          last_record_offset_ = prospective_record_offset;
          return true;
        }
        break;
      case kEof:
        if (in_fragmented_record) {
-          ReportDrop(scratch->size(), "partial record without end");
+          ReportCorruption(scratch->size(), "partial record without end(3)");
          scratch->clear();
        }
        return false;
      case kBadRecord:
        if (in_fragmented_record) {
-          ReportDrop(scratch->size(), "error in middle of record");
+          ReportCorruption(scratch->size(), "error in middle of record");
          in_fragmented_record = false;
          scratch->clear();
        }
        break;
      default:
-        ReportDrop(
+        ReportCorruption(
            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
            "unknown record type");
        in_fragmented_record = false;
@ -97,9 +156,18 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
  return false;
 }
-void Reader::ReportDrop(size_t bytes, const char* reason) {
+uint64_t Reader::LastRecordOffset() {
-  if (reporter_ != NULL) {
+  return last_record_offset_;
-    reporter_->Corruption(bytes, Status::Corruption(reason));
+}
 // Convenience wrapper: wraps "reason" in a Status::Corruption and forwards
 // it, together with the dropped byte count, to ReportDrop().
 void Reader::ReportCorruption(size_t bytes, const char* reason) {
  ReportDrop(bytes, Status::Corruption(reason));
 }
 // Passes a drop of "bytes" bytes to the reporter.  Nothing is reported
 // when no reporter was supplied, or when the dropped region (computed
 // from the current buffer position) begins before initial_offset_.
 void Reader::ReportDrop(size_t bytes, const Status& reason) {
  if (reporter_ == NULL) {
    return;
  }
  // Offset at which the dropped bytes started.
  const uint64_t drop_start = end_of_buffer_offset_ - buffer_.size() - bytes;
  if (drop_start >= initial_offset_) {
    reporter_->Corruption(bytes, reason);
  }
 }
@ -110,11 +178,10 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
        // Last read was a full read, so this is a trailer to skip
        buffer_.clear();
        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
        end_of_buffer_offset_ += buffer_.size();
        if (!status.ok()) {
          if (reporter_ != NULL) {
            reporter_->Corruption(kBlockSize, status);
          }
          buffer_.clear();
          ReportDrop(kBlockSize, status);
          eof_ = true;
          return kEof;
        } else if (buffer_.size() < kBlockSize) {
@ -125,8 +192,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
        // End of file
        return kEof;
      } else {
-        ReportDrop(buffer_.size(), "truncated record at end of file");
+        size_t drop_size = buffer_.size();
        buffer_.clear();
        ReportCorruption(drop_size, "truncated record at end of file");
        return kEof;
      }
    }
@ -138,8 +206,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
    const unsigned int type = header[6];
    const uint32_t length = a | (b << 8);
    if (kHeaderSize + length > buffer_.size()) {
-      ReportDrop(buffer_.size(), "bad record length");
+      size_t drop_size = buffer_.size();
      buffer_.clear();
      ReportCorruption(drop_size, "bad record length");
      return kBadRecord;
    }
@ -160,13 +229,22 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
        // been corrupted and if we trust it, we could find some
        // fragment of a real log record that just happens to look
        // like a valid log record.
-        ReportDrop(buffer_.size(), "checksum mismatch");
+        size_t drop_size = buffer_.size();
        buffer_.clear();
        ReportCorruption(drop_size, "checksum mismatch");
        return kBadRecord;
      }
    }
    buffer_.remove_prefix(kHeaderSize + length);
    // Skip physical record that started before initial_offset_
    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
        initial_offset_) {
      result->clear();
      return kBadRecord;
    }
    *result = Slice(header + kHeaderSize, length);
    return type;
  }
--- a/db/log_reader.h
+++ b/db/log_reader.h
@ -5,6 +5,8 @@
 #ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
 #define STORAGE_LEVELDB_DB_LOG_READER_H_
 #include <stdint.h>
 #include "db/log_format.h"
 #include "leveldb/slice.h"
 #include "leveldb/status.h"
@ -35,7 +37,11 @@ class Reader {
  // live while this Reader is in use.
  //
  // If "checksum" is true, verify checksums if available.
-  Reader(SequentialFile* file, Reporter* reporter, bool checksum);
+  //
  // The Reader will start reading at the first record located at physical
  // position >= initial_offset within the file.
  Reader(SequentialFile* file, Reporter* reporter, bool checksum,
         uint64_t initial_offset);
  ~Reader();
@ -46,6 +52,11 @@ class Reader {
  // reader or the next mutation to *scratch.
  bool ReadRecord(Slice* record, std::string* scratch);
  // Returns the physical offset of the last record returned by ReadRecord.
  //
  // Undefined before the first call to ReadRecord.
  uint64_t LastRecordOffset();
 private:
  SequentialFile* const file_;
  Reporter* const reporter_;
@ -54,15 +65,37 @@ class Reader {
  Slice buffer_;
  bool eof_;   // Last Read() indicated EOF by returning < kBlockSize
  // Offset of the last record returned by ReadRecord.
  uint64_t last_record_offset_;
  // Offset of the first location past the end of buffer_.
  uint64_t end_of_buffer_offset_;
  // Offset at which to start looking for the first record to return
  uint64_t const initial_offset_;
  // Extend record types with the following special values
  enum {
    kEof = kMaxRecordType + 1,
    // Returned whenever we find an invalid physical record.
    // Currently there are three situations in which this happens:
    // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
    // * The record is a 0-length record (No drop is reported)
    // * The record is below constructor's initial_offset (No drop is reported)
    kBadRecord = kMaxRecordType + 2
  };
  // Skips all blocks that are completely before "initial_offset_".
  //
  // Returns true on success. Handles reporting.
  bool SkipToInitialBlock();
  // Return type, or one of the preceding special values
  unsigned int ReadPhysicalRecord(Slice* result);
-  void ReportDrop(size_t bytes, const char* reason);
+
  // Reports dropped bytes to the reporter.
  // buffer_ must be updated to remove the dropped bytes prior to invocation.
  void ReportCorruption(size_t bytes, const char* reason);
  void ReportDrop(size_t bytes, const Status& reason);
  // No copying allowed
  Reader(const Reader&);
--- a/db/log_test.cc
+++ b/db/log_test.cc
@ -60,7 +60,6 @@ class LogTest {
    virtual Status Read(size_t n, Slice* result, char* scratch) {
      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
      ASSERT_EQ(kBlockSize, n);
      if (force_error_) {
        force_error_ = false;
@ -76,6 +75,17 @@ class LogTest {
      contents_.remove_prefix(n);
      return Status::OK();
    }
    // Skips the next n bytes of the in-memory contents.
    //
    // If fewer than n bytes remain, the contents are cleared and a
    // NotFound status is returned; otherwise the first n bytes are
    // removed and OK is returned.
    virtual Status Skip(size_t n) {
      if (n > contents_.size()) {
        contents_.clear();
        return Status::NotFound("in-memory file skipped past end");
      }
      contents_.remove_prefix(n);
      return Status::OK();
    }
  };
  class ReportCollector : public Reader::Reporter {
@ -97,10 +107,15 @@ class LogTest {
  Writer writer_;
  Reader reader_;
  // Record metadata for testing initial offset functionality
  static size_t initial_offset_record_sizes_[];
  static uint64_t initial_offset_last_record_offsets_[];
 public:
  LogTest() : reading_(false),
              writer_(&dest_),
-              reader_(&source_, &report_, true/*checksum*/) {
+              reader_(&source_, &report_, true/*checksum*/,
                      0/*initial_offset*/) {
  }
  void Write(const std::string& msg) {
@ -153,6 +168,10 @@ class LogTest {
    return report_.dropped_bytes_;
  }
  // Returns the message currently stored in report_.
  std::string ReportMessage() const {
    return report_.message_;
  }
  // Returns OK iff recorded error message contains "msg"
  std::string MatchError(const std::string& msg) const {
    if (report_.message_.find(msg) == std::string::npos) {
@ -161,8 +180,61 @@ class LogTest {
      return "OK";
    }
  }
  // Writes four records whose sizes come from initial_offset_record_sizes_.
  // Record i consists entirely of the character 'a' + i.
  void WriteInitialOffsetLog() {
    for (int index = 0; index < 4; ++index) {
      const char fill = static_cast<char>('a' + index);
      Write(std::string(initial_offset_record_sizes_[index], fill));
    }
  }
  // Writes the initial-offset log, then constructs a Reader whose initial
  // offset lies "offset_past_end" bytes beyond WrittenBytes() and asserts
  // that ReadRecord() returns false for it.
  void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
    WriteInitialOffsetLog();
    reading_ = true;
    // Feed the reader's source from what was written to dest_.
    source_.contents_ = Slice(dest_.contents_);
    Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
                                       WrittenBytes() + offset_past_end);
    Slice record;
    std::string scratch;
    ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
    delete offset_reader;
  }
  // Writes the initial-offset log, reads one record through a Reader
  // constructed with "initial_offset", and asserts that it is the record at
  // index "expected_record_offset" of the initial-offset tables: its size,
  // the reader's LastRecordOffset() and its first byte must all match.
  void CheckInitialOffsetRecord(uint64_t initial_offset,
                                int expected_record_offset) {
    WriteInitialOffsetLog();
    reading_ = true;
    // Feed the reader's source from what was written to dest_.
    source_.contents_ = Slice(dest_.contents_);
    Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
                                       initial_offset);
    Slice record;
    std::string scratch;
    ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
    ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
              record.size());
    ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
              offset_reader->LastRecordOffset());
    // Record i was filled with 'a' + i by WriteInitialOffsetLog().
    ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
    delete offset_reader;
  }
 };
 // Payload sizes of the four records written by WriteInitialOffsetLog().
 size_t LogTest::initial_offset_record_sizes_[] =
    {10000,  // Two sizable records in first block
     10000,
     2 * log::kBlockSize - 1000,  // Span three blocks
     1};
 // Expected LastRecordOffset() for each of the records above.  Every entry is
 // the sum of the payloads and headers that precede the record; the third
 // record contributes three headers to the last entry.
 uint64_t LogTest::initial_offset_last_record_offsets_[] =
    {0,
     kHeaderSize + 10000,
     2 * (kHeaderSize + 10000),
     2 * (kHeaderSize + 10000) +
         (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
 // With nothing written, the first Read() must report "EOF".
 TEST(LogTest, Empty) {
  ASSERT_EQ("EOF", Read());
 }
@ -213,6 +285,19 @@ TEST(LogTest, MarginalTrailer) {
  ASSERT_EQ("EOF", Read());
 }
 TEST(LogTest, MarginalTrailer2) {
  // Make a trailer that is exactly the same length as an empty record.
  const int n = kBlockSize - 2*kHeaderSize;
  Write(BigString("foo", n));
  // After the first record exactly kHeaderSize bytes are left in the block.
  ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
  Write("bar");
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
  // Both records were read back; nothing may have been dropped or reported.
  ASSERT_EQ(0, DroppedBytes());
  ASSERT_EQ("", ReportMessage());
 }
 TEST(LogTest, ShortTrailer) {
  const int n = kBlockSize - 2*kHeaderSize + 4;
  Write(BigString("foo", n));
@ -353,6 +438,60 @@ TEST(LogTest, ErrorJoinsRecords) {
  ASSERT_GE(dropped, 2*kBlockSize);
 }
 // Initial-offset tests.  Each one passes CheckInitialOffsetRecord() an
 // initial offset and the index (into initial_offset_record_sizes_) of the
 // record that must be returned first.
 //
 // Offset 0: the first record is returned.
 TEST(LogTest, ReadStart) {
  CheckInitialOffsetRecord(0, 0);
 }
 // One byte past the first record's offset: the second record is returned.
 TEST(LogTest, ReadSecondOneOff) {
  CheckInitialOffsetRecord(1, 1);
 }
 // 10000 is still below the second record's offset (kHeaderSize + 10000).
 TEST(LogTest, ReadSecondTenThousand) {
  CheckInitialOffsetRecord(10000, 1);
 }
 // Presumably exactly the second record's offset, kHeaderSize + 10000
 // (assumes kHeaderSize == 7 -- TODO confirm).
 TEST(LogTest, ReadSecondStart) {
  CheckInitialOffsetRecord(10007, 1);
 }
 // One byte past the previous test's offset: the third record is returned.
 TEST(LogTest, ReadThirdOneOff) {
  CheckInitialOffsetRecord(10008, 2);
 }
 // Presumably exactly the third record's offset, 2 * (kHeaderSize + 10000)
 // (assumes kHeaderSize == 7 -- TODO confirm).
 TEST(LogTest, ReadThirdStart) {
  CheckInitialOffsetRecord(20014, 2);
 }
 // One byte past the previous test's offset: the fourth record is returned.
 TEST(LogTest, ReadFourthOneOff) {
  CheckInitialOffsetRecord(20015, 3);
 }
 // Four bytes before the end of the first block.
 TEST(LogTest, ReadFourthFirstBlockTrailer) {
  CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
 }
 // One byte into the second block.
 TEST(LogTest, ReadFourthMiddleBlock) {
  CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
 }
 // One byte into the third block.
 TEST(LogTest, ReadFourthLastBlock) {
  CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
 }
 // Starts reading exactly at the fourth record's offset, i.e. the same
 // expression as initial_offset_last_record_offsets_[3]; that record must be
 // the first one returned.
 TEST(LogTest, ReadFourthStart) {
  CheckInitialOffsetRecord(
      2 * (kHeaderSize + 10000) +
          (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
      3);
 }
 // An initial offset equal to the number of bytes written yields no records.
 TEST(LogTest, ReadEnd) {
  CheckOffsetPastEndReturnsNoRecords(0);
 }
 // An initial offset five bytes beyond the written data yields no records.
 TEST(LogTest, ReadPastEnd) {
  CheckOffsetPastEndReturnsNoRecords(5);
 }
 }
 }
--- a/db/log_writer.cc
+++ b/db/log_writer.cc
@ -32,6 +32,7 @@ Status Writer::AddRecord(const Slice& slice) {
  // is empty, we still want to iterate once to emit a single
  // zero-length record
  Status s;
  bool begin = true;
  do {
    const int leftover = kBlockSize - block_offset_;
    assert(leftover >= 0);
@ -52,7 +53,6 @@ Status Writer::AddRecord(const Slice& slice) {
    const size_t fragment_length = (left < avail) ? left : avail;
    RecordType type;
    const bool begin = (ptr == slice.data());
    const bool end = (left == fragment_length);
    if (begin && end) {
      type = kFullType;
@ -67,6 +67,7 @@ Status Writer::AddRecord(const Slice& slice) {
    s = EmitPhysicalRecord(type, ptr, fragment_length);
    ptr += fragment_length;
    left -= fragment_length;
    begin = false;
  } while (s.ok() && left > 0);
  return s;
 }
--- a/db/memtable.cc
+++ b/db/memtable.cc
@ -20,10 +20,12 @@ static Slice GetLengthPrefixedSlice(const char* data) {
 // Constructs a memtable that orders entries with "cmp".  The reference count
 // starts at zero, and table_ is set up on top of comparator_ and arena_.
 MemTable::MemTable(const InternalKeyComparator& cmp)
    : comparator_(cmp),
      refs_(0),
      table_(comparator_, &arena_) {
 }
 // Destruction is only valid once the reference count is back at zero.
 MemTable::~MemTable() {
  assert(refs_ == 0);
 }
 // Reports the memory usage of the arena owned by this memtable.
 size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
@ -48,10 +50,15 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) {
 class MemTableIterator: public Iterator {
 public:
-  explicit MemTableIterator(MemTable::Table* table) {
+  explicit MemTableIterator(MemTable* mem, MemTable::Table* table) {
    mem_ = mem;
    iter_ = new MemTable::Table::Iterator(table);
    mem->Ref();
  }
  virtual ~MemTableIterator() {
    delete iter_;
    mem_->Unref();
  }
  virtual ~MemTableIterator() { delete iter_; }
  virtual bool Valid() const { return iter_->Valid(); }
  virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
@ -68,6 +75,7 @@ class MemTableIterator: public Iterator {
  virtual Status status() const { return Status::OK(); }
 private:
  MemTable* mem_;
  MemTable::Table::Iterator* iter_;
  std::string tmp_;       // For passing to EncodeKey
@ -77,7 +85,7 @@ class MemTableIterator: public Iterator {
 };
 Iterator* MemTable::NewIterator() {
-  return new MemTableIterator(&table_);
+  return new MemTableIterator(this, &table_);
 }
 void MemTable::Add(SequenceNumber s, ValueType type,
--- a/db/memtable.h
+++ b/db/memtable.h
@ -19,8 +19,21 @@ class MemTableIterator;
 class MemTable {
 public:
  // MemTables are reference counted.  The initial reference count
  // is zero and the caller must call Ref() at least once.
  explicit MemTable(const InternalKeyComparator& comparator);
-  ~MemTable();
+
  // Increase reference count.  Each Ref() is undone by one call to Unref().
  void Ref() { ++refs_; }
  // Drop reference count.  Delete if no more references exist.
  void Unref() {
    --refs_;
    // A negative count means Unref() was called more often than Ref().
    assert(refs_ >= 0);
    if (refs_ <= 0) {
      // Last reference released: the memtable destroys itself.
      delete this;
    }
  }
  // Returns an estimate of the number of bytes of data in use by this
  // data structure.
@ -45,6 +58,8 @@ class MemTable {
           const Slice& value);
 private:
  ~MemTable();  // Private since only Unref() should be used to delete it
  struct KeyComparator {
    const InternalKeyComparator comparator;
    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
@ -56,6 +71,7 @@ class MemTable {
  typedef SkipList<const char*, KeyComparator> Table;
  KeyComparator comparator_;
  int refs_;
  Arena arena_;
  Table table_;
--- a/db/repair.cc
+++ b/db/repair.cc
@ -183,13 +183,15 @@ class Repairer {
    // corruptions cause entire commits to be skipped instead of
    // propagating bad information (like overly large sequence
    // numbers).
-    log::Reader reader(lfile, &reporter, false/*do not checksum*/);
+    log::Reader reader(lfile, &reporter, false/*do not checksum*/,
                       0/*initial_offset*/);
    // Read all the records and add to a memtable
    std::string scratch;
    Slice record;
    WriteBatch batch;
-    MemTable mem(icmp_);
+    MemTable* mem = new MemTable(icmp_);
    mem->Ref();
    int counter = 0;
    while (reader.ReadRecord(&record, &scratch)) {
      if (record.size() < 12) {
@ -198,7 +200,7 @@ class Repairer {
        continue;
      }
      WriteBatchInternal::SetContents(&batch, record);
-      status = WriteBatchInternal::InsertInto(&batch, &mem);
+      status = WriteBatchInternal::InsertInto(&batch, mem);
      if (status.ok()) {
        counter += WriteBatchInternal::Count(&batch);
      } else {
@ -215,10 +217,12 @@ class Repairer {
    VersionEdit skipped;
    FileMetaData meta;
    meta.number = next_file_number_++;
-    Iterator* iter = mem.NewIterator();
+    Iterator* iter = mem->NewIterator();
    status = BuildTable(dbname_, env_, options_, table_cache_, iter,
                        &meta, &skipped);
    delete iter;
    mem->Unref();
    mem = NULL;
    if (status.ok()) {
      if (meta.file_size > 0) {
        table_numbers_.push_back(meta.number);
--- a/db/snapshot.h
+++ b/db/snapshot.h
@ -12,17 +12,17 @@ namespace leveldb {
 class SnapshotList;
 // Snapshots are kept in a doubly-linked list in the DB.
-// Each Snapshot corresponds to a particular sequence number.
+// Each SnapshotImpl corresponds to a particular sequence number.
-class Snapshot {
+class SnapshotImpl : public Snapshot {
 public:
  SequenceNumber number_;  // const after creation
 private:
  friend class SnapshotList;
-  // Snapshot is kept in a doubly-linked circular list
+  // SnapshotImpl is kept in a doubly-linked circular list
-  Snapshot* prev_;
+  SnapshotImpl* prev_;
-  Snapshot* next_;
+  SnapshotImpl* next_;
  SnapshotList* list_;                 // just for sanity checks
 };
@ -35,11 +35,11 @@ class SnapshotList {
  }
  bool empty() const { return list_.next_ == &list_; }
-  Snapshot* oldest() const { assert(!empty()); return list_.next_; }
+  SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
-  Snapshot* newest() const { assert(!empty()); return list_.prev_; }
+  SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
-  const Snapshot* New(SequenceNumber seq) {
+  const SnapshotImpl* New(SequenceNumber seq) {
-    Snapshot* s = new Snapshot;
+    SnapshotImpl* s = new SnapshotImpl;
    s->number_ = seq;
    s->list_ = this;
    s->next_ = &list_;
@ -49,7 +49,7 @@ class SnapshotList {
    return s;
  }
-  void Delete(const Snapshot* s) {
+  void Delete(const SnapshotImpl* s) {
    assert(s->list_ == this);
    s->prev_->next_ = s->next_;
    s->next_->prev_ = s->prev_;
@ -58,7 +58,7 @@ class SnapshotList {
 private:
  // Dummy head of doubly-linked list of snapshots
-  Snapshot list_;
+  SnapshotImpl list_;
 };
 }
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -57,17 +57,22 @@ std::string IntSetToString(const std::set<uint64_t>& s) {
 Version::~Version() {
  assert(refs_ == 0);
  // Remove from linked list
  prev_->next_ = next_;
  next_->prev_ = prev_;
  // Drop references to files
  for (int level = 0; level < config::kNumLevels; level++) {
    for (size_t i = 0; i < files_[level].size(); i++) {
      FileMetaData* f = files_[level][i];
-      assert(f->refs >= 0);
+      assert(f->refs > 0);
      f->refs--;
      if (f->refs <= 0) {
        delete f;
      }
    }
  }
  delete cleanup_mem_;
 }
 // An internal iterator.  For a given version/level pair, yields
@ -77,9 +82,9 @@ Version::~Version() {
 // encoded using EncodeFixed64.
 class Version::LevelFileNumIterator : public Iterator {
 public:
-  LevelFileNumIterator(const Version* version,
+  LevelFileNumIterator(const InternalKeyComparator& icmp,
                       const std::vector<FileMetaData*>* flist)
-      : icmp_(version->vset_->icmp_.user_comparator()),
+      : icmp_(icmp),
        flist_(flist),
        index_(flist->size()) {        // Marks as invalid
  }
@ -157,7 +162,7 @@ static Iterator* GetFileIterator(void* arg,
 Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
                                            int level) const {
  return NewTwoLevelIterator(
-      new LevelFileNumIterator(this, &files_[level]),
+      new LevelFileNumIterator(vset_->icmp_, &files_[level]),
      &GetFileIterator, vset_->table_cache_, options);
 }
@ -185,11 +190,11 @@ void Version::Ref() {
 }
 void Version::Unref() {
  assert(this != &vset_->dummy_versions_);
  assert(refs_ >= 1);
  --refs_;
  if (refs_ == 0) {
-    vset_->MaybeDeleteOldVersions();
+    delete this;
    // TODO: try to delete obsolete files
  }
 }
@ -222,37 +227,58 @@ std::string Version::DebugString() const {
 // Versions that contain full copies of the intermediate state.
 class VersionSet::Builder {
 private:
-  typedef std::map<uint64_t, FileMetaData*> FileMap;
+  // Helper to sort by v->files_[file_number].smallest
  // Strict ordering on files: primarily by smallest internal key, and by
  // ascending file number when the smallest keys compare equal.
  struct BySmallestKey {
    const InternalKeyComparator* internal_comparator;
    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
      const int order =
          internal_comparator->Compare(f1->smallest, f2->smallest);
      if (order == 0) {
        // Break ties by file number
        return f1->number < f2->number;
      }
      return order < 0;
    }
  };
  typedef std::set<FileMetaData*, BySmallestKey> FileSet;
  struct LevelState {
    std::set<uint64_t> deleted_files;
    FileSet* added_files;
  };
  VersionSet* vset_;
-  FileMap files_[config::kNumLevels];
+  Version* base_;
  LevelState levels_[config::kNumLevels];
 public:
  // Initialize a builder with the files from *base and other info from *vset
  Builder(VersionSet* vset, Version* base)
-      : vset_(vset) {
+      : vset_(vset),
        base_(base) {
    base_->Ref();
    BySmallestKey cmp;
    cmp.internal_comparator = &vset_->icmp_;
    for (int level = 0; level < config::kNumLevels; level++) {
-      const std::vector<FileMetaData*>& files = base->files_[level];
+      levels_[level].added_files = new FileSet(cmp);
      for (size_t i = 0; i < files.size(); i++) {
        FileMetaData* f = files[i];
        f->refs++;
        files_[level].insert(std::make_pair(f->number, f));
      }
    }
  }
  ~Builder() {
    for (int level = 0; level < config::kNumLevels; level++) {
-      const FileMap& fmap = files_[level];
+      std::vector<FileMetaData*> to_unref(levels_[level].added_files->begin(),
-      for (FileMap::const_iterator iter = fmap.begin();
+                                          levels_[level].added_files->end());
-           iter != fmap.end();
+      delete levels_[level].added_files;
-           ++iter) {
+      for (int i = 0; i < to_unref.size(); i++) {
-        FileMetaData* f = iter->second;
+        FileMetaData* f = to_unref[i];
        f->refs--;
        if (f->refs <= 0) {
          delete f;
        }
      }
    }
    base_->Unref();
  }
  // Apply all of the edits in *edit to the current state.
@ -271,16 +297,7 @@ class VersionSet::Builder {
         ++iter) {
      const int level = iter->first;
      const uint64_t number = iter->second;
-      FileMap::iterator fiter = files_[level].find(number);
+      levels_[level].deleted_files.insert(number);
      assert(fiter != files_[level].end());  // Sanity check for debug mode
      if (fiter != files_[level].end()) {
        FileMetaData* f = fiter->second;
        f->refs--;
        if (f->refs <= 0) {
          delete f;
        }
        files_[level].erase(fiter);
      }
    }
    // Add new files
@ -288,24 +305,68 @@ class VersionSet::Builder {
      const int level = edit->new_files_[i].first;
      FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
      f->refs = 1;
-      assert(files_[level].count(f->number) == 0);
+      levels_[level].deleted_files.erase(f->number);
-      files_[level].insert(std::make_pair(f->number, f));
+      levels_[level].added_files->insert(f);
    }
  }
  // Save the current state in *v.
  void SaveTo(Version* v) {
    BySmallestKey cmp;
    cmp.internal_comparator = &vset_->icmp_;
    for (int level = 0; level < config::kNumLevels; level++) {
-      const FileMap& fmap = files_[level];
+      // Merge the set of added files with the set of pre-existing files.
-      for (FileMap::const_iterator iter = fmap.begin();
+      // Drop any deleted files.  Store the result in *v.
-           iter != fmap.end();
+      const std::vector<FileMetaData*>& base_files = base_->files_[level];
-           ++iter) {
+      std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
-        FileMetaData* f = iter->second;
+      std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
      const FileSet* added = levels_[level].added_files;
      v->files_[level].reserve(base_files.size() + added->size());
      for (FileSet::const_iterator added_iter = added->begin();
           added_iter != added->end();
           ++added_iter) {
        // Add all smaller files listed in base_
        for (std::vector<FileMetaData*>::const_iterator bpos
                 = std::upper_bound(base_iter, base_end, *added_iter, cmp);
             base_iter != bpos;
             ++base_iter) {
          MaybeAddFile(v, level, *base_iter);
        }
        MaybeAddFile(v, level, *added_iter);
      }
      // Add remaining base files
      for (; base_iter != base_end; ++base_iter) {
        MaybeAddFile(v, level, *base_iter);
      }
 #ifndef NDEBUG
      // Make sure there is no overlap in levels > 0
      if (level > 0) {
        for (int i = 1; i < v->files_[level].size(); i++) {
          const InternalKey& prev_end = v->files_[level][i-1]->largest;
          const InternalKey& this_begin = v->files_[level][i]->smallest;
          if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
            fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
                    EscapeString(prev_end.Encode()).c_str(),
                    EscapeString(this_begin.Encode()).c_str());
            abort();
          }
        }
      }
 #endif
    }
  }
  // Appends "f" to v->files_[level] and takes a reference on it, unless the
  // file's number is recorded in this level's deleted_files set.
  void MaybeAddFile(Version* v, int level, FileMetaData* f) {
    const bool is_deleted = levels_[level].deleted_files.count(f->number) > 0;
    if (is_deleted) {
      // File is deleted: do nothing
      return;
    }
    f->refs++;
    v->files_[level].push_back(f);
  }
  }
 };
 VersionSet::VersionSet(const std::string& dbname,
@ -324,22 +385,36 @@ VersionSet::VersionSet(const std::string& dbname,
      prev_log_number_(0),
      descriptor_file_(NULL),
      descriptor_log_(NULL),
-      current_(new Version(this)),
+      dummy_versions_(this),
-      oldest_(current_) {
+      current_(NULL) {
  AppendVersion(new Version(this));
 }
 VersionSet::~VersionSet() {
-  for (Version* v = oldest_; v != NULL; ) {
+  current_->Unref();
-    Version* next = v->next_;
+  assert(dummy_versions_.next_ == &dummy_versions_);  // List must be empty
    assert(v->refs_ == 0);
    delete v;
    v = next;
  }
  delete descriptor_log_;
  delete descriptor_file_;
 }
-Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
+void VersionSet::AppendVersion(Version* v) {
  // Make "v" current
  assert(v->refs_ == 0);
  assert(v != current_);
  if (current_ != NULL) {
    current_->Unref();
  }
  current_ = v;
  v->Ref();
  // Append to linked list
  v->prev_ = dummy_versions_.prev_;
  v->next_ = &dummy_versions_;
  v->prev_->next_ = v;
  v->next_->prev_ = v;
 }
 Status VersionSet::LogAndApply(VersionEdit* edit) {
  if (edit->has_log_number_) {
    assert(edit->log_number_ >= log_number_);
    assert(edit->log_number_ < next_file_number_);
@ -360,13 +435,12 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
    builder.Apply(edit);
    builder.SaveTo(v);
  }
-
+  Finalize(v);
  std::string new_manifest_file;
  Status s = Finalize(v);
  // Initialize new descriptor log file if necessary by creating
  // a temporary file that contains a snapshot of the current version.
-  if (s.ok()) {
+  std::string new_manifest_file;
  Status s;
  if (descriptor_log_ == NULL) {
    assert(descriptor_file_ == NULL);
    new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
@ -377,7 +451,6 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
      s = WriteSnapshot(descriptor_log_);
    }
  }
  }
  // Write new record to MANIFEST log
  if (s.ok()) {
@ -397,12 +470,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
  // Install the new version
  if (s.ok()) {
-    assert(current_->next_ == NULL);
+    AppendVersion(v);
    assert(current_->cleanup_mem_ == NULL);
    current_->cleanup_mem_ = cleanup_mem;
    v->next_ = NULL;
    current_->next_ = v;
    current_ = v;
    log_number_ = edit->log_number_;
    prev_log_number_ = edit->prev_log_number_;
  } else {
@ -458,7 +526,7 @@ Status VersionSet::Recover() {
  {
    LogReporter reporter;
    reporter.status = &s;
-    log::Reader reader(file, &reporter, true/*checksum*/);
+    log::Reader reader(file, &reporter, true/*checksum*/, 0/*initial_offset*/);
    Slice record;
    std::string scratch;
    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
@ -518,21 +586,15 @@ Status VersionSet::Recover() {
  if (s.ok()) {
    Version* v = new Version(this);
    builder.SaveTo(v);
    s = Finalize(v);
    if (!s.ok()) {
      delete v;
    } else {
    // Install recovered version
-      v->next_ = NULL;
+    Finalize(v);
-      current_->next_ = v;
+    AppendVersion(v);
      current_ = v;
    manifest_file_number_ = next_file;
    next_file_number_ = next_file + 1;
    last_sequence_ = last_sequence;
    log_number_ = log_number;
    prev_log_number_ = prev_log_number;
  }
  }
  return s;
 }
@ -545,15 +607,12 @@ static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
  return sum;
 }
-Status VersionSet::Finalize(Version* v) {
+void VersionSet::Finalize(Version* v) {
  // Precomputed best level for next compaction
  int best_level = -1;
  double best_score = -1;
-  Status s;
+  for (int level = 0; level < config::kNumLevels-1; level++) {
  for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) {
    s = SortLevel(v, level);
    double score;
    if (level == 0) {
      // We treat level-0 specially by bounding the number of files
@ -567,7 +626,8 @@ Status VersionSet::Finalize(Version* v) {
      // file size is small (perhaps because of a small write-buffer
      // setting, or very high compression ratios, or lots of
      // overwrites/deletions).
-      score = v->files_[level].size() / 4.0;
+      score = v->files_[level].size() /
          static_cast<double>(config::kL0_CompactionTrigger);
    } else {
      // Compute the ratio of current size to size limit.
      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
@ -582,7 +642,6 @@ Status VersionSet::Finalize(Version* v) {
  v->compaction_level_ = best_level;
  v->compaction_score_ = best_score;
  return s;
 }
 Status VersionSet::WriteSnapshot(log::Writer* log) {
@ -615,44 +674,27 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
  return log->AddRecord(record);
 }
 // Comparator that orders files by their smallest internal key, using the
 // InternalKeyComparator pointed to by internal_comparator.
 struct VersionSet::BySmallestKey {
  const InternalKeyComparator* internal_comparator;
  bool operator()(FileMetaData* f1, FileMetaData* f2) const {
    const int order = internal_comparator->Compare(f1->smallest, f2->smallest);
    return order < 0;
  }
 };
 // Sorts v->files_[level] by smallest key.  For levels above 0 it then checks
 // each pair of adjacent files and returns a Corruption status describing the
 // first pair whose key ranges overlap.
 Status VersionSet::SortLevel(Version* v, uint64_t level) {
  BySmallestKey by_smallest;
  by_smallest.internal_comparator = &icmp_;
  std::vector<FileMetaData*>& files = v->files_[level];
  std::sort(files.begin(), files.end(), by_smallest);
  Status result;
  if (!result.ok() || level == 0) {
    return result;
  }
  // There should be no overlap
  for (size_t i = 1; i < files.size(); i++) {
    const InternalKey& prev_end = files[i-1]->largest;
    const InternalKey& this_begin = files[i]->smallest;
    if (icmp_.Compare(prev_end, this_begin) < 0) {
      continue;
    }
    result = Status::Corruption(
        "overlapping ranges in same level",
        (EscapeString(prev_end.Encode()) + " vs. " +
         EscapeString(this_begin.Encode())));
    break;
  }
  return result;
 }
 // Returns the number of files at "level" in the current version.
 // "level" must lie in [0, config::kNumLevels).
 int VersionSet::NumLevelFiles(int level) const {
  assert(level >= 0);
  assert(level < config::kNumLevels);
  return current_->files_[level].size();
 }
 // Formats the per-level file counts of the current version into
 // scratch->buffer and returns a pointer to that buffer.
 const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
  // Update code if kNumLevels changes
  assert(config::kNumLevels == 7);
  const std::vector<FileMetaData*>* const level_files = current_->files_;
  snprintf(scratch->buffer, sizeof(scratch->buffer),
           "files[ %d %d %d %d %d %d %d ]",
           static_cast<int>(level_files[0].size()),
           static_cast<int>(level_files[1].size()),
           static_cast<int>(level_files[2].size()),
           static_cast<int>(level_files[3].size()),
           static_cast<int>(level_files[4].size()),
           static_cast<int>(level_files[5].size()),
           static_cast<int>(level_files[6].size()));
  return scratch->buffer;
 }
 uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
  uint64_t result = 0;
  for (int level = 0; level < config::kNumLevels; level++) {
@ -685,19 +727,10 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
  return result;
 }
 void VersionSet::MaybeDeleteOldVersions() {
  // Note: it is important to delete versions in order since a newer
  // version with zero refs may be holding a pointer to a memtable
  // that is used by somebody who has a ref on an older version.
  while (oldest_ != current_ && oldest_->refs_ == 0) {
    Version* next = oldest_->next_;
    delete oldest_;
    oldest_ = next;
  }
 }
 void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
-  for (Version* v = oldest_; v != NULL; v = v->next_) {
+  for (Version* v = dummy_versions_.next_;
       v != &dummy_versions_;
       v = v->next_) {
    for (int level = 0; level < config::kNumLevels; level++) {
      const std::vector<FileMetaData*>& files = v->files_[level];
      for (size_t i = 0; i < files.size(); i++) {
@ -809,8 +842,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
      } else {
        // Create concatenating iterator for the files from this level
        list[num++] = NewTwoLevelIterator(
-            new Version::LevelFileNumIterator(
+            new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
                c->input_version_, &c->inputs_[which]),
            &GetFileIterator, table_cache_, options);
      }
    }
@ -996,11 +1028,12 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
  return true;
 }
-bool Compaction::ShouldStopBefore(const InternalKey& key) {
+bool Compaction::ShouldStopBefore(const Slice& internal_key) {
  // Scan to find earliest grandparent file that contains key.
  const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
  while (grandparent_index_ < grandparents_.size() &&
-      icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) {
+      icmp->Compare(internal_key,
                    grandparents_[grandparent_index_]->largest.Encode()) > 0) {
    if (seen_key_) {
      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
    }
--- a/db/version_set.h
+++ b/db/version_set.h
@ -59,8 +59,8 @@ class Version {
  VersionSet* vset_;            // VersionSet to which this Version belongs
  Version* next_;               // Next version in linked list
  Version* prev_;               // Previous version in linked list
  int refs_;                    // Number of live refs to this version
  MemTable* cleanup_mem_;       // NULL, or table to delete when version dropped
  // List of files per level
  std::vector<FileMetaData*> files_[config::kNumLevels];
@ -72,8 +72,7 @@ class Version {
  int compaction_level_;
  explicit Version(VersionSet* vset)
-      : vset_(vset), next_(NULL), refs_(0),
+      : vset_(vset), next_(this), prev_(this), refs_(0),
        cleanup_mem_(NULL),
        compaction_score_(-1),
        compaction_level_(-1) {
  }
@ -95,10 +94,8 @@ class VersionSet {
  // Apply *edit to the current version to form a new descriptor that
  // is both saved to persistent state and installed as the new
-  // current version.  Iff Apply() returns OK, arrange to delete
+  // current version.
-  // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed
+  Status LogAndApply(VersionEdit* edit);
  // by older versions.
  Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem);
  // Recover the last saved descriptor from persistent storage.
  Status Recover();
@ -171,19 +168,20 @@ class VersionSet {
  // "key" as of version "v".
  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
  // Return a human-readable short (single-line) summary of the number
  // of files per level.  Uses *scratch as backing store.
  struct LevelSummaryStorage {
    char buffer[100];
  };
  const char* LevelSummary(LevelSummaryStorage* scratch) const;
 private:
  class Builder;
  friend class Compaction;
  friend class Version;
-  Status Finalize(Version* v);
+  void Finalize(Version* v);
  // Delete any old versions that are no longer needed.
  void MaybeDeleteOldVersions();
  struct BySmallestKey;
  Status SortLevel(Version* v, uint64_t level);
  void GetOverlappingInputs(
      int level,
@ -202,6 +200,8 @@ class VersionSet {
  void SetupOtherInputs(Compaction* c);
  void AppendVersion(Version* v);
  Env* const env_;
  const std::string dbname_;
  const Options* const options_;
@ -216,10 +216,8 @@ class VersionSet {
  // Opened lazily
  WritableFile* descriptor_file_;
  log::Writer* descriptor_log_;
-
+  Version dummy_versions_;  // Head of circular doubly-linked list of versions.
-  // Versions are kept in a singly linked list that is never empty
+  Version* current_;        // == dummy_versions_.prev_
  Version* current_;    // Pointer to the last (newest) list entry
  Version* oldest_;     // Pointer to the first (oldest) list entry
  // Per-level key at which the next compaction at that level should start.
  // Either an empty string, or a valid InternalKey.
@ -265,8 +263,8 @@ class Compaction {
  bool IsBaseLevelForKey(const Slice& user_key);
  // Returns true iff we should stop building the current output
-  // before processing "key".
+  // before processing "internal_key".
-  bool ShouldStopBefore(const InternalKey& key);
+  bool ShouldStopBefore(const Slice& internal_key);
  // Release the input version for the compaction, once the compaction
  // is successful.
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@ -29,11 +29,53 @@ WriteBatch::WriteBatch() {
 WriteBatch::~WriteBatch() { }
 WriteBatch::Handler::~Handler() { }
 // Resets the batch so that its representation consists of exactly twelve
 // zero bytes.
 void WriteBatch::Clear() {
  rep_.assign(12, '\0');
 }
 // Walks the entries of this batch in order, calling handler->Put() or
 // handler->Delete() for each one.
 //
 // The first 12 bytes of the representation are skipped as a header.  Returns
 // a Corruption status if the representation is shorter than that header, if
 // an entry cannot be decoded, if an unknown tag is met, or if the number of
 // entries visited differs from WriteBatchInternal::Count().  Entries that
 // precede a malformed one have already been handed to the handler.
 Status WriteBatch::Iterate(Handler* handler) const {
  Slice input(rep_);
  if (input.size() < 12) {
    return Status::Corruption("malformed WriteBatch (too small)");
  }
  input.remove_prefix(12);
  int num_entries = 0;
  Slice key, value;
  for (; !input.empty(); ++num_entries) {
    // Every entry starts with a one-byte tag naming the operation.
    const char tag = input[0];
    input.remove_prefix(1);
    if (tag == kTypeValue) {
      if (!GetLengthPrefixedSlice(&input, &key) ||
          !GetLengthPrefixedSlice(&input, &value)) {
        return Status::Corruption("bad WriteBatch Put");
      }
      handler->Put(key, value);
    } else if (tag == kTypeDeletion) {
      if (!GetLengthPrefixedSlice(&input, &key)) {
        return Status::Corruption("bad WriteBatch Delete");
      }
      handler->Delete(key);
    } else {
      return Status::Corruption("unknown WriteBatch tag");
    }
  }
  if (num_entries != WriteBatchInternal::Count(this)) {
    return Status::Corruption("WriteBatch has wrong count");
  }
  return Status::OK();
 }
 // Returns the entry count of "b", decoded as a fixed 32-bit value found at
 // byte offset 8 of the batch representation.
 int WriteBatchInternal::Count(const WriteBatch* b) {
  return DecodeFixed32(b->rep_.data() + 8);
 }
@ -63,28 +105,29 @@ void WriteBatch::Delete(const Slice& key) {
  PutLengthPrefixedSlice(&rep_, key);
 }
 namespace {
 // WriteBatch::Handler that adds every record it is given to a MemTable,
 // giving consecutive records consecutive sequence numbers.  Both fields
 // must be assigned by the caller before the handler is used.
 class MemTableInserter : public WriteBatch::Handler {
 public:
  // Sequence number to use for the next record; incremented after each one.
  SequenceNumber sequence_;
  // Destination of the records.
  MemTable* mem_;
  // Adds a kTypeValue entry for key/value at the current sequence number.
  virtual void Put(const Slice& key, const Slice& value) {
    mem_->Add(sequence_, kTypeValue, key, value);
    sequence_++;
  }
  // Adds a kTypeDeletion entry for key (with an empty value) at the current
  // sequence number.
  virtual void Delete(const Slice& key) {
    mem_->Add(sequence_, kTypeDeletion, key, Slice());
    sequence_++;
  }
 };
 }
 Status WriteBatchInternal::InsertInto(const WriteBatch* b,
                                      MemTable* memtable) {
-  const int count = WriteBatchInternal::Count(b);
+  MemTableInserter inserter;
-  int found = 0;
+  inserter.sequence_ = WriteBatchInternal::Sequence(b);
-  Iterator it(*b);
+  inserter.mem_ = memtable;
-  for (; !it.Done(); it.Next()) {
+  return b->Iterate(&inserter);
    switch (it.op()) {
      case kTypeDeletion:
        memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice());
        break;
      case kTypeValue:
        memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value());
        break;
    }
    found++;
  }
  if (!it.status().ok()) {
    return it.status();
  } else if (found != count) {
    return Status::Corruption("wrong count in WriteBatch");
  }
  return Status::OK();
 }
 void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
@ -92,57 +135,4 @@ void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
  b->rep_.assign(contents.data(), contents.size());
 }
 // Positions the iterator on the first entry of "batch".  Contents shorter
 // than 12 bytes are treated as an already exhausted batch.
 WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch)
    : input_(WriteBatchInternal::Contents(&batch)),
      done_(false) {
  if (input_.size() < 12) {
    done_ = true;
  } else {
    // The first entry carries the sequence number stored in the batch.
    seq_ = WriteBatchInternal::Sequence(&batch);
    // Skip the 12-byte prefix, then decode the first entry.
    input_.remove_prefix(12);
    GetNextEntry();
  }
 }
 // Advances to the following entry, whose sequence number is one higher
 // than the current entry's.
 // REQUIRES: !Done()
 void WriteBatchInternal::Iterator::Next() {
  assert(!done_);
  seq_++;
  GetNextEntry();
 }
 // Decodes the entry at the front of input_ into op_/key_/value_ and
 // consumes it.  Sets done_ when input_ is exhausted.  On a malformed entry
 // or unknown tag, records a Corruption status, sets done_ and drops the
 // rest of input_ so that iteration stops.
 void WriteBatchInternal::Iterator::GetNextEntry() {
  if (input_.empty()) {
    done_ = true;
    return;
  }
  // Each entry starts with a one-byte tag that selects its layout.
  char tag = input_[0];
  input_.remove_prefix(1);
  switch (tag) {
    case kTypeValue:
      // Length-prefixed key followed by a length-prefixed value.
      if (GetLengthPrefixedSlice(&input_, &key_) &&
          GetLengthPrefixedSlice(&input_, &value_)) {
        op_ = static_cast<ValueType>(tag);
      } else {
        status_ = Status::Corruption("bad WriteBatch Put");
        done_ = true;
        input_.clear();
      }
      break;
    case kTypeDeletion:
      // Length-prefixed key only; value_ is left untouched.
      if (GetLengthPrefixedSlice(&input_, &key_)) {
        op_ = kTypeDeletion;
      } else {
        status_ = Status::Corruption("bad WriteBatch Delete");
        done_ = true;
        input_.clear();
      }
      break;
    default:
      status_ = Status::Corruption("unknown WriteBatch tag");
      done_ = true;
      input_.clear();
      break;
  }
 }
 }
--- a/db/write_batch_internal.h
+++ b/db/write_batch_internal.h
@ -37,30 +37,6 @@ class WriteBatchInternal {
  static void SetContents(WriteBatch* batch, const Slice& contents);
  static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
  // Iterate over the contents of a write batch.
  class Iterator {
   public:
    // Starts iteration at the first entry of "batch".
    explicit Iterator(const WriteBatch& batch);
    // True once there are no more entries, or after a decoding error
    // (check status() to tell the two apart).
    bool Done() const { return done_; }
    // Moves to the next entry.  REQUIRES: !Done()
    void Next();
    // Accessors for the current entry.
    ValueType op() const { return op_; }
    const Slice& key() const { return key_; }
    const Slice& value() const { return value_; }
    SequenceNumber sequence_number() const { return seq_; }
    // Error recorded while decoding, if any.
    Status status() const { return status_; }
   private:
    // Decodes the entry at the front of input_ into the fields below.
    void GetNextEntry();
    Slice input_;         // Batch contents that have not been decoded yet
    bool done_;           // See Done()
    ValueType op_;        // Type of the current entry
    Slice key_;           // Key of the current entry
    Slice value_;         // Value of the current entry
    SequenceNumber seq_;  // Sequence number of the current entry
    Status status_;       // See status()
  };
 };
 }
--- a/db/write_batch_test.cc
+++ b/db/write_batch_test.cc
@ -14,10 +14,11 @@ namespace leveldb {
 static std::string PrintContents(WriteBatch* b) {
  InternalKeyComparator cmp(BytewiseComparator());
-  MemTable mem(cmp);
+  MemTable* mem = new MemTable(cmp);
  mem->Ref();
  std::string state;
-  Status s = WriteBatchInternal::InsertInto(b, &mem);
+  Status s = WriteBatchInternal::InsertInto(b, mem);
-  Iterator* iter = mem.NewIterator();
+  Iterator* iter = mem->NewIterator();
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ParsedInternalKey ikey;
    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
@ -42,6 +43,7 @@ static std::string PrintContents(WriteBatch* b) {
  if (!s.ok()) {
    state.append("ParseError()");
  }
  mem->Unref();
  return state;
 }
--- a/doc/impl.html
+++ b/doc/impl.html
@ -17,14 +17,14 @@ However the organization of the files that make up the representation
 is somewhat different and is explained below.
 <p>
-Each database is represented by a set of file stored in a directory.
+Each database is represented by a set of files stored in a directory.
 There are several different types of files as documented below:
 <p>
 <h2>Log files</h2>
 <p>
 A log file (*.log) stores a sequence of recent updates.  Each update
 is appended to the current log file.  When the log file reaches a
-pre-determined size (approximately 1MB by default), it is converted
+pre-determined size (approximately 4MB by default), it is converted
 to a sorted table (see below) and a new log file is created for future
 updates.
 <p>
@ -83,19 +83,15 @@ Other files used for miscellaneous purposes may also be present
 <h1>Level 0</h1>
 When the log file grows above a certain size (4MB by default):
 <ul>
-<li>Write the contents of the current memtable to an sstable
+<li>Create a brand new memtable and log file and direct future updates here
-<li>Replace the current memtable by a brand new empty memtable
+<li>In the background:
-<li>Switch to a new log file
+<ul>
 <li>Write the contents of the previous memtable to an sstable
 <li>Discard the memtable
 <li>Delete the old log file and the old memtable
 <li>Add the new sstable to the young (level-0) level.
 </ul>
 </ul>
 Experimental measurements show that generating an sstable from a 1MB
 log file takes ~12ms, which seems like an acceptable latency hiccup to
 add infrequently to a log write.
 <p>
 The new sstable is added to a special level-0 level.  level-0 contains
 a set of files (up to 4 by default).  However unlike other levels,
 these files do not cover disjoint ranges, but may overlap each other.
 <h1>Compactions</h1>
@ -162,8 +158,8 @@ read.
 <p>
 Solution 1: To reduce this problem, we might want to increase the log
 switching threshold when the number of level-0 files is large.  Though
-the downside is that the larger this threshold, the larger the delay
+the downside is that the larger this threshold, the more memory we will
-that we will add to write latency when a write triggers a log switch.
+need to hold the corresponding memtable.
 <p>
 Solution 2: We might want to decrease write rate artificially when the
--- a/doc/index.html
+++ b/doc/index.html
@ -141,10 +141,18 @@ the batch.
 <p>
 <h1>Concurrency</h1>
 <p>
-A database may only be opened by one process at a time.  The <code>leveldb</code>
+A database may only be opened by one process at a time.
-implementation acquires a lock from the operating system to prevent
+The <code>leveldb</code> implementation acquires a lock from the
-misuse.  Within a single process, the same <code>leveldb::DB</code> object may
+operating system to prevent misuse.  Within a single process, the
-be safely used by multiple concurrent threads.
+same <code>leveldb::DB</code> object may be safely shared by multiple
 concurrent threads.  I.e., different threads may write into or fetch
 iterators or call <code>Get</code> on the same database without any
 external synchronization (the leveldb implementation will
 automatically do the required synchronization).  However other objects
 (like Iterator and WriteBatch) may require external synchronization.
 If two threads share such an object, they must protect access to it
 using their own locking protocol.  More details are available in
 the public header files.
 <p>
 <h1>Iteration</h1>
 <p>
--- a/include/leveldb/comparator.h
+++ b/include/leveldb/comparator.h
@ -12,7 +12,9 @@ namespace leveldb {
 class Slice;
 // A Comparator object provides a total order across slices that are
-// used as keys in an sstable or a database.
+// used as keys in an sstable or a database.  A Comparator implementation
 // must be thread-safe since leveldb may invoke its methods concurrently
 // from multiple threads.
 class Comparator {
 public:
  virtual ~Comparator();
--- a/include/leveldb/db.h
+++ b/include/leveldb/db.h
@ -13,26 +13,32 @@
 namespace leveldb {
 static const int kMajorVersion = 1;
-static const int kMinorVersion = 1;
+static const int kMinorVersion = 2;
 struct Options;
 struct ReadOptions;
 struct WriteOptions;
 class Snapshot;
 class WriteBatch;
-// Some internal types.  Clients should ignore.
+// Abstract handle to particular state of a DB.
-class WriteBatchInternal;
+// A Snapshot is an immutable object and can therefore be safely
 // accessed from multiple threads without any external synchronization.
 class Snapshot {
 protected:
  // Protected, so code outside Snapshot and its subclasses cannot destroy
  // a Snapshot through this type.
  virtual ~Snapshot();
 };
 // A range of keys
 struct Range {
-  Slice start;
+  Slice start;          // Included in the range
-  Slice limit;
+  Slice limit;          // Not included in the range
  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
 };
 // A DB is a persistent ordered map from keys to values.
 // A DB is safe for concurrent access from multiple threads without
 // any external synchronization.
 class DB {
 public:
  // Open the database with the specified "name".
--- a/include/leveldb/env.h
+++ b/include/leveldb/env.h
@ -6,6 +6,9 @@
 // operating system functionality like the filesystem etc.  Callers
 // may wish to provide a custom Env object when opening a database to
 // get fine grain control; e.g., to rate limit file system operations.
 //
 // All Env implementations are safe for concurrent access from
 // multiple threads without any external synchronization.
 #ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
 #define STORAGE_LEVELDB_INCLUDE_ENV_H_
@ -160,6 +163,15 @@ class SequentialFile {
  //
  // REQUIRES: External synchronization
  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
  // Skip "n" bytes from the file. This is guaranteed to be no
  // slower than reading the same data, but may be faster.
  //
  // If end of file is reached, skipping will stop at the end of the
  // file, and Skip will return OK.
  //
  // REQUIRES: External synchronization
  virtual Status Skip(uint64_t n) = 0;
 };
 // A file abstraction for randomly reading the contents of a file.
--- a/include/leveldb/iterator.h
+++ b/include/leveldb/iterator.h
@ -6,6 +6,11 @@
 // The following class defines the interface.  Multiple implementations
 // are provided by this library.  In particular, iterators are provided
 // to access the contents of a Table or a DB.
 //
 // Multiple threads can invoke const methods on an Iterator without
 // external synchronization, but if any of the threads may call a
 // non-const method, all threads accessing the same Iterator must use
 // external synchronization.
 #ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
 #define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
--- a/include/leveldb/slice.h
+++ b/include/leveldb/slice.h
@ -6,6 +6,11 @@
 // storage and a size.  The user of a Slice must ensure that the slice
 // is not used after the corresponding external storage has been
 // deallocated.
 //
 // Multiple threads can invoke const methods on a Slice without
 // external synchronization, but if any of the threads may call a
 // non-const method, all threads accessing the same Slice must use
 // external synchronization.
 #ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_
 #define STORAGE_LEVELDB_INCLUDE_SLICE_H_
--- a/include/leveldb/status.h
+++ b/include/leveldb/status.h
@ -4,12 +4,16 @@
 //
 // A Status encapsulates the result of an operation.  It may indicate success,
 // or it may indicate an error with an associated error message.
 //
 // Multiple threads can invoke const methods on a Status without
 // external synchronization, but if any of the threads may call a
 // non-const method, all threads accessing the same Status must use
 // external synchronization.
 #ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_
 #define STORAGE_LEVELDB_INCLUDE_STATUS_H_
 #include <string>
 #include <utility>
 #include "leveldb/slice.h"
 namespace leveldb {
@ -18,7 +22,7 @@ class Status {
 public:
  // Create a success status.
  Status() : state_(NULL) { }
-  ~Status() { delete state_; }
+  ~Status() { delete[] state_; }
  // Copy the specified status.
  Status(const Status& s);
@ -29,7 +33,7 @@ class Status {
  // Return error status of an appropriate type.
  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kNotFound, msg, Slice());
+    return Status(kNotFound, msg, msg2);
  }
  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
    return Status(kCorruption, msg, msg2);
@ -55,6 +59,13 @@ class Status {
  std::string ToString() const;
 private:
  // OK status has a NULL state_.  Otherwise, state_ is a new[] array
  // of the following form:
  //    state_[0..3] == length of message
  //    state_[4]    == code
  //    state_[5..]  == message
  const char* state_;
  enum Code {
    kOk = 0,
    kNotFound = 1,
@ -63,21 +74,24 @@ class Status {
    kInvalidArgument = 4,
    kIOError = 5,
  };
-  Code code() const { return (state_ == NULL) ? kOk : state_->first; }
+
  // Returns kOk when state_ is NULL; otherwise returns the code kept in
  // state_[4].
  Code code() const {
    return (state_ == NULL) ? kOk : static_cast<Code>(state_[4]);
  }
  Status(Code code, const Slice& msg, const Slice& msg2);
-
+  static const char* CopyState(const char* s);
  typedef std::pair<Code, std::string> State;
  State* state_;
 };
 inline Status::Status(const Status& s) {
-  state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
+  state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_);
 }
 inline void Status::operator=(const Status& s) {
-  if (this != &s) {
+  // The following condition catches both aliasing (when this == &s),
-    delete state_;
+  // and the common case where both s and *this are ok.
-    state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
+  if (state_ != s.state_) {
    delete[] state_;
    state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_);
  }
 }
--- a/include/leveldb/table.h
+++ b/include/leveldb/table.h
@ -17,7 +17,8 @@ class RandomAccessFile;
 struct ReadOptions;
 // A Table is a sorted map from strings to strings.  Tables are
-// immutable and persistent.
+// immutable and persistent.  A Table may be safely accessed from
 // multiple threads without external synchronization.
 class Table {
 public:
  // Attempt to open the table that is stored in bytes [0..file_size)
--- a/include/leveldb/table_builder.h
+++ b/include/leveldb/table_builder.h
@ -4,6 +4,11 @@
 //
 // TableBuilder provides the interface used to build a Table
 // (an immutable and sorted map from keys to values).
 //
 // Multiple threads can invoke const methods on a TableBuilder without
 // external synchronization, but if any of the threads may call a
 // non-const method, all threads accessing the same TableBuilder must use
 // external synchronization.
 #ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
 #define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
--- a/include/leveldb/write_batch.h
+++ b/include/leveldb/write_batch.h
@ -12,11 +12,17 @@
 //    batch.Delete("key");
 //    batch.Put("key", "v2");
 //    batch.Put("key", "v3");
 //
 // Multiple threads can invoke const methods on a WriteBatch without
 // external synchronization, but if any of the threads may call a
 // non-const method, all threads accessing the same WriteBatch must use
 // external synchronization.
 #ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
 #define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
 #include <string>
 #include "leveldb/status.h"
 namespace leveldb {
@ -36,6 +42,15 @@ class WriteBatch {
  // Clear all updates buffered in this batch.
  void Clear();
  // Support for iterating over the contents of a batch.
  class Handler {
   public:
    virtual ~Handler();
    // Called by WriteBatch::Iterate for each value record in the batch.
    virtual void Put(const Slice& key, const Slice& value) = 0;
    // Called by WriteBatch::Iterate for each deletion record in the batch.
    virtual void Delete(const Slice& key) = 0;
  };
  Status Iterate(Handler* handler) const;
 private:
  friend class WriteBatchInternal;
--- a/table/block_builder.cc
+++ b/table/block_builder.cc
@ -80,7 +80,7 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) {
  if (counter_ < options_->block_restart_interval) {
    // See how much sharing to do with previous string
    const size_t min_length = std::min(last_key_piece.size(), key.size());
-    while ((shared < min_length) && (last_key_[shared] == key[shared])) {
+    while ((shared < min_length) && (last_key_piece[shared] == key[shared])) {
      shared++;
    }
  } else {
--- a/table/table_test.cc
+++ b/table/table_test.cc
@ -319,13 +319,15 @@ class MemTableConstructor: public Constructor {
      : Constructor(cmp),
        internal_comparator_(cmp) {
    memtable_ = new MemTable(internal_comparator_);
    memtable_->Ref();
  }
  ~MemTableConstructor() {
-    delete memtable_;
+    memtable_->Unref();
  }
  virtual Status FinishImpl(const Options& options, const KVMap& data) {
-    delete memtable_;
+    memtable_->Unref();
    memtable_ = new MemTable(internal_comparator_);
    memtable_->Ref();
    int seq = 1;
    for (KVMap::const_iterator it = data.begin();
         it != data.end();
@ -736,16 +738,17 @@ class MemTableTest { };
 TEST(MemTableTest, Simple) {
  InternalKeyComparator cmp(BytewiseComparator());
-  MemTable memtable(cmp);
+  MemTable* memtable = new MemTable(cmp);
  memtable->Ref();
  WriteBatch batch;
  WriteBatchInternal::SetSequence(&batch, 100);
  batch.Put(std::string("k1"), std::string("v1"));
  batch.Put(std::string("k2"), std::string("v2"));
  batch.Put(std::string("k3"), std::string("v3"));
  batch.Put(std::string("largekey"), std::string("vlarge"));
-  ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok());
+  ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable).ok());
-  Iterator* iter = memtable.NewIterator();
+  Iterator* iter = memtable->NewIterator();
  iter->SeekToFirst();
  while (iter->Valid()) {
    fprintf(stderr, "key: '%s' -> '%s'\n",
@ -755,6 +758,7 @@ TEST(MemTableTest, Simple) {
  }
  delete iter;
  memtable->Unref();
 }
 static bool Between(uint64_t val, uint64_t low, uint64_t high) {
--- a/util/env_chromium.cc
+++ b/util/env_chromium.cc
@ -141,6 +141,13 @@ class ChromiumSequentialFile: public SequentialFile {
    }
    return s;
  }
  // Moves the position of file_ forward by "n" bytes relative to the
  // current position.  Returns an IOError carrying filename_ and the errno
  // text if fseek reports failure, and OK otherwise.
  virtual Status Skip(uint64_t n) {
    // NOTE(review): fseek takes a long offset, so "n" is converted from
    // uint64_t here; very large values may not survive the conversion.
    if (fseek(file_, n, SEEK_CUR)) {
      return Status::IOError(filename_, strerror(errno));
    }
    return Status::OK();
  }
 };
 class ChromiumRandomAccessFile: public RandomAccessFile {
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@ -52,6 +52,13 @@ class PosixSequentialFile: public SequentialFile {
    }
    return s;
  }
  // Moves the position of file_ forward by "n" bytes relative to the
  // current position.  Returns an IOError carrying filename_ and the errno
  // text if fseek reports failure, and OK otherwise.
  virtual Status Skip(uint64_t n) {
    // NOTE(review): fseek takes a long offset, so "n" is converted from
    // uint64_t here; very large values may not survive the conversion.
    if (fseek(file_, n, SEEK_CUR)) {
      return Status::IOError(filename_, strerror(errno));
    }
    return Status::OK();
  }
 };
 class PosixRandomAccessFile: public RandomAccessFile {
--- a/util/status.cc
+++ b/util/status.cc
@ -8,13 +8,29 @@
 namespace leveldb {
 // Returns a new[]-allocated copy of "state".  "state" must be non-NULL and
 // laid out as a 4-byte message length, a 1-byte code, then the message.
 // The caller owns the result and must release it with delete[].
 const char* Status::CopyState(const char* state) {
  uint32_t size;
  // The first four bytes hold the message length; memcpy reads them without
  // assuming any alignment of "state".
  memcpy(&size, state, sizeof(size));
  // 5 == 4 length bytes + 1 code byte that precede the message.
  char* result = new char[size + 5];
  memcpy(result, state, size + 5);
  return result;
 }
 Status::Status(Code code, const Slice& msg, const Slice& msg2) {
  assert(code != kOk);
-  state_ = new State(make_pair(code, std::string(msg.data(), msg.size())));
+  const uint32_t len1 = msg.size();
-  if (!msg2.empty()) {
+  const uint32_t len2 = msg2.size();
-    state_->second.append(": ");
+  const uint32_t size = len1 + (len2 ? (2 + len2) : 0);
-    state_->second.append(msg2.data(), msg2.size());
+  char* result = new char[size + 5];
  memcpy(result, &size, sizeof(size));
  result[4] = static_cast<char>(code);
  memcpy(result + 5, msg.data(), len1);
  if (len2) {
    result[5 + len1] = ':';
    result[6 + len1] = ' ';
    memcpy(result + 7 + len1, msg2.data(), len2);
  }
  state_ = result;
 }
 std::string Status::ToString() const {
@ -23,12 +39,12 @@ std::string Status::ToString() const {
  } else {
    char tmp[30];
    const char* type;
-    switch (state_->first) {
+    switch (code()) {
      case kOk:
        type = "OK";
        break;
      case kNotFound:
-        type = "NotFound";
+        type = "NotFound: ";
        break;
      case kCorruption:
        type = "Corruption: ";
@ -44,14 +60,14 @@ std::string Status::ToString() const {
        break;
      default:
        snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
-                 static_cast<int>(state_->first));
+                 static_cast<int>(code()));
        type = tmp;
        break;
    }
    std::string result(type);
-    if (!state_->second.empty()) {
+    uint32_t length;
-      result.append(state_->second);
+    memcpy(&length, state_, sizeof(length));
-    }
+    result.append(state_ + 5, length);
    return result;
  }
 }