switched to use Basho LevelDB
This is much more efficient in disk writes. A more careful incorporation of the Basho fork lets it build out of the box (though not yet cleanly).
This commit is contained in:
parent
83319b7f31
commit
1069eb65b5
194 changed files with 23069 additions and 7759 deletions
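For orientation before the diff: a minimal sketch (not part of this commit) of how the Basho-tuned options introduced in src/dbwrapper.cpp below would be used to open a database. NewBloomFilterPolicy2 and total_leveldb_mem are Basho-fork extensions that do not exist in upstream (Google) leveldb, and the database path is a placeholder.

    // Hedged sketch, assuming the Basho leveldb fork vendored by this commit.
    #include <cassert>
    #include "leveldb/db.h"
    #include "leveldb/env.h"
    #include "leveldb/filter_policy.h"

    int main() {
        leveldb::Options options;
        options.create_if_missing = true;
        options.write_buffer_size = 60 * 1024 * 1024;               // 60 MB, as in GetOptions() below
        options.total_leveldb_mem = 2500ULL * 1024ULL * 1024ULL;    // Basho flexcache budget (process-wide)
        options.filter_policy = leveldb::NewBloomFilterPolicy2(16); // Basho bloom-filter variant
        options.compression = leveldb::kNoCompression;
        options.env = leveldb::Env::Default();

        leveldb::DB* db = nullptr;
        leveldb::Status status = leveldb::DB::Open(options, "/tmp/claimtrie", &db); // placeholder path
        assert(status.ok());
        delete db;                      // close the database first...
        delete options.filter_policy;   // ...then release the policy it referenced
        return 0;
    }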
@@ -475,7 +475,6 @@ lbrycrdd_LDADD = \
   $(LIBBITCOIN_CONSENSUS) \
   $(LIBBITCOIN_CRYPTO) \
   $(LIBLEVELDB) \
-  $(LIBLEVELDB_SSE42) \
   $(LIBMEMENV) \
   $(LIBSECP256K1)
@@ -573,7 +572,7 @@ $(top_srcdir)/$(subdir)/config/bitcoin-config.h.in: $(am__configure_deps)
clean-local:
	-$(MAKE) -C secp256k1 clean
	-$(MAKE) -C univalue clean
	-rm -f leveldb/*/*.gcda leveldb/*/*.gcno leveldb/helpers/memenv/*.gcda leveldb/helpers/memenv/*.gcno
	-$(MAKE) -C leveldb clean
	-rm -f config.h
	-rm -rf test/__pycache__
@@ -42,7 +42,6 @@ bench_bench_bitcoin_LDADD = \
   $(LIBBITCOIN_CONSENSUS) \
   $(LIBBITCOIN_CRYPTO) \
   $(LIBLEVELDB) \
-  $(LIBLEVELDB_SSE42) \
   $(LIBMEMENV) \
   $(LIBSECP256K1) \
   $(LIBUNIVALUE)
@@ -2,148 +2,23 @@
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.

SUBDIRS = leveldb

LIBLEVELDB_INT = leveldb/libleveldb.a
LIBMEMENV_INT = leveldb/libmemenv.a
LIBLEVELDB_SSE42_INT = leveldb/libleveldb_sse42.a

EXTRA_LIBRARIES += $(LIBLEVELDB_INT)
EXTRA_LIBRARIES += $(LIBMEMENV_INT)
EXTRA_LIBRARIES += $(LIBLEVELDB_SSE42_INT)

LIBLEVELDB += $(LIBLEVELDB_INT)
LIBMEMENV += $(LIBMEMENV_INT)
LIBLEVELDB_SSE42 = $(LIBLEVELDB_SSE42_INT)

LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/include
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/helpers/memenv
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb

LEVELDB_CPPFLAGS_INT =
LEVELDB_CPPFLAGS_INT += -I$(srcdir)/leveldb
LEVELDB_CPPFLAGS_INT += $(LEVELDB_TARGET_FLAGS)
LEVELDB_CPPFLAGS_INT += -DLEVELDB_ATOMIC_PRESENT
LEVELDB_CPPFLAGS_INT += -D__STDC_LIMIT_MACROS
leveldb/libleveldb.a:
	$(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb

if TARGET_WINDOWS
LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1
else
LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_POSIX
endif

leveldb_libleveldb_a_CPPFLAGS = $(AM_CPPFLAGS) $(LEVELDB_CPPFLAGS_INT) $(LEVELDB_CPPFLAGS)
leveldb_libleveldb_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)

leveldb_libleveldb_a_SOURCES=
leveldb_libleveldb_a_SOURCES += leveldb/port/atomic_pointer.h
leveldb_libleveldb_a_SOURCES += leveldb/port/port_example.h
leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.h
leveldb_libleveldb_a_SOURCES += leveldb/port/win/stdint.h
leveldb_libleveldb_a_SOURCES += leveldb/port/port.h
leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.h
leveldb_libleveldb_a_SOURCES += leveldb/port/thread_annotations.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/db.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/options.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/comparator.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/filter_policy.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/slice.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table_builder.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/env.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/c.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/iterator.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/cache.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/dumpfile.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/write_batch.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/status.h
leveldb_libleveldb_a_SOURCES += leveldb/db/log_format.h
leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.h
leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.h
leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch_internal.h
leveldb_libleveldb_a_SOURCES += leveldb/db/filename.h
leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.h
leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.h
leveldb_libleveldb_a_SOURCES += leveldb/db/builder.h
leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.h
leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.h
leveldb_libleveldb_a_SOURCES += leveldb/db/skiplist.h
leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.h
leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.h
leveldb_libleveldb_a_SOURCES += leveldb/db/snapshot.h
leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.h
leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.h
leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.h
leveldb_libleveldb_a_SOURCES += leveldb/table/block.h
leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.h
leveldb_libleveldb_a_SOURCES += leveldb/table/merger.h
leveldb_libleveldb_a_SOURCES += leveldb/table/format.h
leveldb_libleveldb_a_SOURCES += leveldb/table/iterator_wrapper.h
leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.h
leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix_test_helper.h
leveldb_libleveldb_a_SOURCES += leveldb/util/arena.h
leveldb_libleveldb_a_SOURCES += leveldb/util/random.h
leveldb_libleveldb_a_SOURCES += leveldb/util/posix_logger.h
leveldb_libleveldb_a_SOURCES += leveldb/util/hash.h
leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.h
leveldb_libleveldb_a_SOURCES += leveldb/util/coding.h
leveldb_libleveldb_a_SOURCES += leveldb/util/testutil.h
leveldb_libleveldb_a_SOURCES += leveldb/util/mutexlock.h
leveldb_libleveldb_a_SOURCES += leveldb/util/logging.h
leveldb_libleveldb_a_SOURCES += leveldb/util/testharness.h

leveldb_libleveldb_a_SOURCES += leveldb/db/builder.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/c.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/dumpfile.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/filename.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/repair.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/block.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/format.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/iterator.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/merger.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/table_builder.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/table.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/arena.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/bloom.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/cache.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/coding.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/comparator.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/env.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/filter_policy.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/hash.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/logging.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/options.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/status.cc

if TARGET_WINDOWS
leveldb_libleveldb_a_SOURCES += leveldb/util/env_win.cc
leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.cc
else
leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.cc
endif

leveldb_libmemenv_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS)
leveldb_libmemenv_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS)
leveldb_libmemenv_a_SOURCES = leveldb/helpers/memenv/memenv.cc
leveldb_libmemenv_a_SOURCES += leveldb/helpers/memenv/memenv.h

leveldb_libleveldb_sse42_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS)
leveldb_libleveldb_sse42_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS)
if ENABLE_HWCRC32
leveldb_libleveldb_sse42_a_CPPFLAGS += -DLEVELDB_PLATFORM_POSIX_SSE
leveldb_libleveldb_sse42_a_CXXFLAGS += $(SSE42_CXXFLAGS)
endif
leveldb_libleveldb_sse42_a_SOURCES = leveldb/port/port_posix_sse.cc
leveldb/libmemenv.a: leveldb/libleveldb.a
	$(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb memenv_test
@@ -408,7 +408,7 @@ endif
 if ENABLE_ZMQ
 qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
 endif
-qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \
+qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBMEMENV) \
   $(BOOST_LIBS) $(QT_LIBS) $(QT_DBUS_LIBS) $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
   $(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
 qt_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)
@@ -63,7 +63,7 @@ if ENABLE_ZMQ
 qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
 endif
 qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) \
-  $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
+  $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
   $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
   $(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
 qt_test_test_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)
@@ -122,7 +122,7 @@ test_test_lbrycrd_LDADD += $(LIBBITCOIN_WALLET)
 endif

 test_test_lbrycrd_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \
-  $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
+  $(LIBLEVELDB) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
 test_test_lbrycrd_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)

 test_test_lbrycrd_LDADD += $(LIBBITCOIN_CONSENSUS) $(BDB_LIBS) $(CRYPTO_LIBS) $(ICU_LIBS) $(MINIUPNPC_LIBS)
@@ -597,7 +597,7 @@ bool CClaimTrieCacheBase::flush()
     base->nNextHeight = nNextHeight;
     if (!nodesToAddOrUpdate.empty() && (LogAcceptCategory(BCLog::CLAIMS) || LogAcceptCategory(BCLog::BENCH))) {
         LogPrintf("TrieCache size: %zu nodes on block %d, batch writes %zu bytes.\n",
-            nodesToAddOrUpdate.height(), nNextHeight, batch.SizeEstimate());
+            nodesToAddOrUpdate.height(), nNextHeight, batch.SizeEstimate(), base->db->DynamicMemoryUsage());
     }
     auto ret = base->db->WriteBatch(batch);
     clear();
@@ -97,11 +97,45 @@ static void SetMaxOpenFiles(leveldb::Options *options) {
              options->max_open_files, default_open_files);
}

class CappedLenCache : public leveldb::Cache {
    leveldb::Cache* inner;
    std::size_t maxKeyLen;
public:
    CappedLenCache(std::size_t capacity, std::size_t maxKeyLen)
        : inner(leveldb::NewLRUCache(capacity)), maxKeyLen(maxKeyLen) {}

    ~CappedLenCache() override { delete inner; }

    Handle* Insert(const leveldb::Slice& key, void* value, size_t charge,
                   void (*deleter)(const leveldb::Slice& key, void* value)) override {
        if (key.size() <= maxKeyLen)
            return inner->Insert(key, value, charge, deleter);
        deleter(key, value);
        return nullptr;
    }

    Handle* Lookup(const leveldb::Slice& key) override { return inner->Lookup(key); }
    void Release(Handle* handle) override { return inner->Release(handle); }
    void* Value(Handle* handle) override { return inner->Value(handle); }
    void Erase(const leveldb::Slice& key) override { return inner->Erase(key); }
    uint64_t NewId() override { return inner->NewId(); }
};

static leveldb::Options GetOptions(size_t nCacheSize)
{
    leveldb::Options options;
    auto write_cache = std::min(nCacheSize / 4, size_t(16) << 20U); // cap write_cache at 16MB (4x default)

    options.filter_policy = leveldb::NewBloomFilterPolicy2(16);
    options.write_buffer_size = 60 * 1024 * 1024;
    options.total_leveldb_mem = 2500ULL * 1024ULL * 1024ULL;
    options.env = leveldb::Env::Default();
    options.compression = leveldb::kNoCompression;
    options.info_log = new CBitcoinLevelDBLogger();
    return options;

    auto write_cache = std::min(nCacheSize / 4, size_t(4 * 1024 * 1024)); // cap write_cache at 4MB (default)
    options.block_cache = leveldb::NewLRUCache(nCacheSize - write_cache * 2);
    // options.block_cache = new CappedLenCache(nCacheSize - write_cache * 2, 6);
    options.write_buffer_size = write_cache; // up to two write buffers may be held in memory simultaneously
    options.filter_policy = leveldb::NewBloomFilterPolicy(10);
    options.compression = leveldb::kNoCompression;
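A hedged usage sketch for the CappedLenCache wrapper above (the values are hypothetical; the commit itself leaves the wrapper disabled via the commented-out block_cache line): keys at or under maxKeyLen go into the inner LRU cache, while longer keys have their deleter invoked immediately and Insert returns nullptr, which callers must be prepared to tolerate.

    leveldb::Cache* cache = new CappedLenCache(8 << 20, /*maxKeyLen=*/6);
    auto deleter = [](const leveldb::Slice&, void* v) { delete static_cast<int*>(v); };
    leveldb::Cache::Handle* hit  = cache->Insert("abcdef", new int(1), 1, deleter);      // 6-byte key: cached
    leveldb::Cache::Handle* miss = cache->Insert("toolongkey", new int(2), 1, deleter);  // 10-byte key: rejected, value freed
    if (hit) cache->Release(hit);
    // miss == nullptr here
    delete cache;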
@@ -112,6 +146,7 @@ static leveldb::Options GetOptions(size_t nCacheSize)
         options.paranoid_checks = true;
     }
     SetMaxOpenFiles(&options);
+    options.max_open_files = 30000;
     return options;
 }
@@ -81,7 +81,7 @@ public:
         ssValue.Xor(dbwrapper_private::GetObfuscateKey(parent));
         leveldb::Slice slValue(ssValue.data(), ssValue.size());

-        batch.Put(slKey, slValue);
+        batch.Put(slKey, slValue, nullptr);
         // LevelDB serializes writes as:
         // - byte: header
         // - varint: key length (1 byte up to 127B, 2 bytes up to 16383B, ...)
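A small sketch of the varint rule quoted in the comment above: each byte carries 7 payload bits plus a continuation bit, so one byte covers lengths up to 127, two bytes up to 16383, and so on (this mirrors the VarintLength helper in leveldb's util/coding.cc).

    static size_t VarintLength(uint64_t v) {
        size_t len = 1;           // every value needs at least one byte
        while (v >= 128) {        // 128 = 2^7: value no longer fits in 7 payload bits
            v >>= 7;
            ++len;
        }
        return len;
    }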
13 src/leveldb/.gitignore (vendored)
@@ -1,13 +0,0 @@
build_config.mk
*.a
*.o
*.dylib*
*.so
*.so.*
*_test
db_bench
leveldbutil
Release
Debug
Benchmark
vs2010.*
@@ -6,7 +6,3 @@ Google Inc.
 # Initial version authors:
 Jeffrey Dean <jeff@google.com>
 Sanjay Ghemawat <sanjay@google.com>
-
-# Partial list of contributors:
-Kevin Regan <kevin.d.regan@gmail.com>
-Johan Bilien <jobi@litl.com>
72 src/leveldb/BASHO_RELEASES (new file)
@@ -0,0 +1,72 @@
github.com tag 2.0.34 - February 15, 2017
-----------------------------------------
mv-hot-backup2: - correct MakeTieredDbname() within db/filename.cc
                  for case where dbname input is blank and fast/slow
                  already populated in options. Corrects issue
                  with hot backup in non-tiered storage situations

github.com tag 2.0.33 - November 21, 2016
-----------------------------------------
mv-bucket-expiry: - partial branch to enable X-Riak-Meta-Expiry-Base-Seconds
                    property within enterprise edition

--- no 2.0.32 tag on leveldb ---

github.com tag 2.0.31 - November 1, 2016
----------------------------------------
- version shipped with Riak 2.2
mv-no-md-expiry: - Riak specific
                 - never convert a key prefix of sext:encoded "{md" to expiry
                 - update sst_scan for dumping Riak formatted keys
mv-tuning8: - rework penalty rules in version_set.cc UpdatePenalty()
            - add unit test framework for UpdatePenalty()

github.com tag 2.0.30 - October 11, 2016
----------------------------------------
mv-delayed-bloom: - when opening an .sst table file, only load the
                    bloom filter on the second Get() operation. Saves time.
                  - correct VersionSet::Finalize() logic for level 1 when
                    level 2 is above desired size
                  - move hot backup to Riak ee build

github.com tag 2.0.29 - September 13, 2016
------------------------------------------
mv-expiry-manifest: only switch to expiry enabled manifest format
                    if expiry function enabled. Eases downgrade
                    during early Riak releases containing expiry

github.com tag 2.0.28 - September 6, 2016
-----------------------------------------
mv-hot-backup: add externally triggered hot backup feature

github.com tag 2.0.27 - August 22, 2016
---------------------------------------
mv-mem-fences: fix iterator double delete bug in eleveldb and
build better memory fenced operations for reference counted objects.

github.com tag 2.0.26 - August 21, 2016
---------------------------------------
mv-expiry-iter-bug: DBImpl::NewIterator() was not setting the new expiry parameter.

github.com tag 2.0.25 - August 10, 2016
---------------------------------------
Make LZ4 the default compression instead of Snappy.

github.com tag 2.0.24 - August 2, 2016
--------------------------------------
mv-expiry: open source expiry. Supports one expiry policy for all databases.

github.com tag 2.0.23 - July 20, 2016
-------------------------------------
mv-no-semaphore: remove semaphore controlled thread in hot_threads.cc. Instead
use the mutex of thread 0 (only one thread's mutex) to address a known race condition.

github.com tag 2.0.22 - June 22, 2016
-------------------------------------
no change: iterator fix in eleveldb

github.com tag 2.0.21 - June 16, 2016
-------------------------------------
branch mv-iterator-hot-threads: correct condition where eleveldb MoveTask
could hang an iterator. (https://github.com/basho/leveldb/wiki/mv-iterator-hot-threads)
@@ -1,36 +0,0 @@
# Contributing

We'd love to accept your code patches! However, before we can take them, we
have to jump a couple of legal hurdles.

## Contributor License Agreements

Please fill out either the individual or corporate Contributor License
Agreement as appropriate.

* If you are an individual writing original source code and you're sure you
  own the intellectual property, then sign an [individual CLA](https://developers.google.com/open-source/cla/individual).
* If you work for a company that wants to allow you to contribute your work,
  then sign a [corporate CLA](https://developers.google.com/open-source/cla/corporate).

Follow either of the two links above to access the appropriate CLA and
instructions for how to sign and return it.

## Submitting a Patch

1. Sign the contributors license agreement above.
2. Decide which code you want to submit. A submission should be a set of changes
   that addresses one issue in the [issue tracker](https://github.com/google/leveldb/issues).
   Please don't mix more than one logical change per submission, because it makes
   the history hard to follow. If you want to make a change
   (e.g. add a sample or feature) that doesn't have a corresponding issue in the
   issue tracker, please create one.
3. **Submitting**: When you are ready to submit, send us a Pull Request. Be
   sure to include the issue number you fixed and the name you used to sign
   the CLA.

## Writing Code ##

If your contribution contains code, please make sure that it follows
[the style guide](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml).
Otherwise we will have to ask you to make changes, and that's no fun for anyone.
@@ -2,423 +2,219 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. See the AUTHORS file for names of contributors.

# Inherit some settings from environment variables, if available
INSTALL_PATH ?= $(CURDIR)

#-----------------------------------------------
# Uncomment exactly one of the lines labelled (A), (B), and (C) below
# to switch between compilation modes.
# NOTE: targets "debug" and "prof" provide same functionality
# NOTE 2: -DNDEBUG disables assert() statements within C code,
#         i.e. no assert()s in production code

# (A) Production use (optimized mode)
OPT ?= -O2 -DNDEBUG
# (B) Debug mode, w/ full line-level debugging symbols
# OPT ?= -g2
# (C) Profiling mode: opt, but w/debugging symbols
# OPT ?= -O2 -g2 -DNDEBUG
OPT ?= -O2 -g -DNDEBUG # (A) Production use (optimized mode)
# OPT ?= -g2 # (B) Debug mode, w/ full line-level debugging symbols
# OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
#-----------------------------------------------

# detect what platform we're building on
$(shell CC="$(CC)" CXX="$(CXX)" TARGET_OS="$(TARGET_OS)" \
./build_detect_platform build_config.mk ./)
ifeq ($(wildcard build_config.mk),)
$(shell ./build_detect_platform build_config.mk)
endif
# this file is generated by the previous line to set build flags and sources
include build_config.mk

TESTS = \
db/autocompact_test \
db/c_test \
db/corruption_test \
db/db_test \
db/dbformat_test \
db/fault_injection_test \
db/filename_test \
db/log_test \
db/recovery_test \
db/skiplist_test \
db/version_edit_test \
db/version_set_test \
db/write_batch_test \
helpers/memenv/memenv_test \
issues/issue178_test \
issues/issue200_test \
table/filter_block_test \
table/table_test \
util/arena_test \
util/bloom_test \
util/cache_test \
util/coding_test \
util/crc32c_test \
util/env_posix_test \
util/env_test \
util/hash_test

UTILS = \
db/db_bench \
db/leveldbutil

# Put the object files in a subdirectory, but the application at the top of the object dir.
PROGNAMES := $(notdir $(TESTS) $(UTILS))

# On Linux may need libkyotocabinet-dev for dependency.
BENCHMARKS = \
doc/bench/db_bench_sqlite3 \
doc/bench/db_bench_tree_db

CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT)

LDFLAGS += $(PLATFORM_LDFLAGS)
LIBS += $(PLATFORM_LIBS)

SIMULATOR_OUTDIR=out-ios-x86
DEVICE_OUTDIR=out-ios-arm
LIBOBJECTS := $(SOURCES:.cc=.o)
LIBOBJECTS += util/lz4.o
MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
DEPEND := $(SOURCES:.cc=.d)

ifeq ($(PLATFORM), IOS)
# Note: iOS should probably be using libtool, not ar.
AR=xcrun ar
SIMULATORSDK=$(shell xcrun -sdk iphonesimulator --show-sdk-path)
DEVICESDK=$(shell xcrun -sdk iphoneos --show-sdk-path)
DEVICE_CFLAGS = -isysroot "$(DEVICESDK)" -arch armv6 -arch armv7 -arch armv7s -arch arm64
SIMULATOR_CFLAGS = -isysroot "$(SIMULATORSDK)" -arch i686 -arch x86_64
STATIC_OUTDIR=out-ios-universal
TESTUTIL = ./util/testutil.o
TESTHARNESS = ./util/testharness.o $(TESTUTIL)

TESTS := $(sort $(notdir $(basename $(TEST_SOURCES))))

TOOLS = \
leveldb_repair \
perf_dump \
sst_rewrite \
sst_scan

PROGRAMS = db_bench $(TESTS) $(TOOLS)
BENCHMARKS = db_bench_sqlite3 db_bench_tree_db

LIBRARY = libleveldb.a
MEMENVLIBRARY = libmemenv.a

#
# static link leveldb to tools to simplify platform usage (if Linux)
#
ifeq ($(PLATFORM),OS_LINUX)
LEVEL_LDFLAGS := -L . -Wl,-non_shared -lleveldb -Wl,-call_shared
else
STATIC_OUTDIR=out-static
SHARED_OUTDIR=out-shared
STATIC_PROGRAMS := $(addprefix $(STATIC_OUTDIR)/, $(PROGNAMES))
SHARED_PROGRAMS := $(addprefix $(SHARED_OUTDIR)/, db_bench)
LEVEL_LDFLAGS := -L . -lleveldb
endif

STATIC_LIBOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(SOURCES:.cc=.o))
STATIC_MEMENVOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))

DEVICE_LIBOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(SOURCES:.cc=.o))
DEVICE_MEMENVOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))

SIMULATOR_LIBOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(SOURCES:.cc=.o))
SIMULATOR_MEMENVOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))

SHARED_LIBOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(SOURCES:.cc=.o))
SHARED_MEMENVOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))

TESTUTIL := $(STATIC_OUTDIR)/util/testutil.o
TESTHARNESS := $(STATIC_OUTDIR)/util/testharness.o $(TESTUTIL)

STATIC_TESTOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(TESTS)))
STATIC_UTILOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(UTILS)))
STATIC_ALLOBJS := $(STATIC_LIBOBJECTS) $(STATIC_MEMENVOBJECTS) $(STATIC_TESTOBJS) $(STATIC_UTILOBJS) $(TESTHARNESS)
DEVICE_ALLOBJS := $(DEVICE_LIBOBJECTS) $(DEVICE_MEMENVOBJECTS)
SIMULATOR_ALLOBJS := $(SIMULATOR_LIBOBJECTS) $(SIMULATOR_MEMENVOBJECTS)

default: all

# Should we build shared libraries?
ifneq ($(PLATFORM_SHARED_EXT),)

# Many leveldb test apps use non-exported API's. Only build a subset for testing.
SHARED_ALLOBJS := $(SHARED_LIBOBJECTS) $(SHARED_MEMENVOBJECTS) $(TESTHARNESS)

ifneq ($(PLATFORM_SHARED_VERSIONED),true)
SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED_LIB2 = $(SHARED_LIB1)
SHARED_LIB3 = $(SHARED_LIB1)
SHARED_LIBS = $(SHARED_LIB1)
SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1)
SHARED3 = $(SHARED1)
SHARED = $(SHARED1)
else
# Update db.h if you change these.
SHARED_VERSION_MAJOR = 1
SHARED_VERSION_MINOR = 20
SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED_LIB2 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR)
SHARED_LIB3 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR).$(SHARED_VERSION_MINOR)
SHARED_LIBS = $(SHARED_OUTDIR)/$(SHARED_LIB1) $(SHARED_OUTDIR)/$(SHARED_LIB2) $(SHARED_OUTDIR)/$(SHARED_LIB3)
$(SHARED_OUTDIR)/$(SHARED_LIB1): $(SHARED_OUTDIR)/$(SHARED_LIB3)
	ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB1)
$(SHARED_OUTDIR)/$(SHARED_LIB2): $(SHARED_OUTDIR)/$(SHARED_LIB3)
	ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB2)
SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a
SHARED_MAJOR = 1
SHARED_MINOR = 9
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
$(SHARED1): $(SHARED3)
	ln -fs $(SHARED3) $(SHARED1)
$(SHARED2): $(SHARED3)
	ln -fs $(SHARED3) $(SHARED2)
endif

$(SHARED_OUTDIR)/$(SHARED_LIB3): $(SHARED_LIBOBJECTS)
	$(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED_LIB2) $(SHARED_LIBOBJECTS) -o $(SHARED_OUTDIR)/$(SHARED_LIB3) $(LIBS)
$(SHARED3): $(LIBOBJECTS)
	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LIBOBJECTS) -o $(SHARED3) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2)

endif # PLATFORM_SHARED_EXT

all: $(SHARED_LIBS) $(SHARED_PROGRAMS) $(STATIC_OUTDIR)/libleveldb.a $(STATIC_OUTDIR)/libmemenv.a $(STATIC_PROGRAMS)
all: $(SHARED) $(LIBRARY)

check: $(STATIC_PROGRAMS)
	for t in $(notdir $(TESTS)); do echo "***** Running $$t"; $(STATIC_OUTDIR)/$$t || exit 1; done
test check: all $(PROGRAMS) $(TESTS)
	for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done

clean:
	-rm -rf out-static out-shared out-ios-x86 out-ios-arm out-ios-universal
	-rm -f build_config.mk
	-rm -rf ios-x86 ios-arm
tools: all $(TOOLS)

$(STATIC_OUTDIR):
	mkdir $@

$(STATIC_OUTDIR)/db: | $(STATIC_OUTDIR)
	mkdir $@

$(STATIC_OUTDIR)/helpers/memenv: | $(STATIC_OUTDIR)
	mkdir -p $@

$(STATIC_OUTDIR)/port: | $(STATIC_OUTDIR)
	mkdir $@

$(STATIC_OUTDIR)/table: | $(STATIC_OUTDIR)
	mkdir $@

$(STATIC_OUTDIR)/util: | $(STATIC_OUTDIR)
	mkdir $@

.PHONY: STATIC_OBJDIRS
STATIC_OBJDIRS: \
	$(STATIC_OUTDIR)/db \
	$(STATIC_OUTDIR)/port \
	$(STATIC_OUTDIR)/table \
	$(STATIC_OUTDIR)/util \
	$(STATIC_OUTDIR)/helpers/memenv

$(SHARED_OUTDIR):
	mkdir $@

$(SHARED_OUTDIR)/db: | $(SHARED_OUTDIR)
	mkdir $@

$(SHARED_OUTDIR)/helpers/memenv: | $(SHARED_OUTDIR)
	mkdir -p $@

$(SHARED_OUTDIR)/port: | $(SHARED_OUTDIR)
	mkdir $@

$(SHARED_OUTDIR)/table: | $(SHARED_OUTDIR)
	mkdir $@

$(SHARED_OUTDIR)/util: | $(SHARED_OUTDIR)
	mkdir $@

.PHONY: SHARED_OBJDIRS
SHARED_OBJDIRS: \
	$(SHARED_OUTDIR)/db \
	$(SHARED_OUTDIR)/port \
	$(SHARED_OUTDIR)/table \
	$(SHARED_OUTDIR)/util \
	$(SHARED_OUTDIR)/helpers/memenv

$(DEVICE_OUTDIR):
	mkdir $@

$(DEVICE_OUTDIR)/db: | $(DEVICE_OUTDIR)
	mkdir $@

$(DEVICE_OUTDIR)/helpers/memenv: | $(DEVICE_OUTDIR)
	mkdir -p $@

$(DEVICE_OUTDIR)/port: | $(DEVICE_OUTDIR)
	mkdir $@

$(DEVICE_OUTDIR)/table: | $(DEVICE_OUTDIR)
	mkdir $@

$(DEVICE_OUTDIR)/util: | $(DEVICE_OUTDIR)
	mkdir $@

.PHONY: DEVICE_OBJDIRS
DEVICE_OBJDIRS: \
	$(DEVICE_OUTDIR)/db \
	$(DEVICE_OUTDIR)/port \
	$(DEVICE_OUTDIR)/table \
	$(DEVICE_OUTDIR)/util \
	$(DEVICE_OUTDIR)/helpers/memenv

$(SIMULATOR_OUTDIR):
	mkdir $@

$(SIMULATOR_OUTDIR)/db: | $(SIMULATOR_OUTDIR)
	mkdir $@

$(SIMULATOR_OUTDIR)/helpers/memenv: | $(SIMULATOR_OUTDIR)
	mkdir -p $@

$(SIMULATOR_OUTDIR)/port: | $(SIMULATOR_OUTDIR)
	mkdir $@

$(SIMULATOR_OUTDIR)/table: | $(SIMULATOR_OUTDIR)
	mkdir $@

$(SIMULATOR_OUTDIR)/util: | $(SIMULATOR_OUTDIR)
	mkdir $@

.PHONY: SIMULATOR_OBJDIRS
SIMULATOR_OBJDIRS: \
	$(SIMULATOR_OUTDIR)/db \
	$(SIMULATOR_OUTDIR)/port \
	$(SIMULATOR_OUTDIR)/table \
	$(SIMULATOR_OUTDIR)/util \
	$(SIMULATOR_OUTDIR)/helpers/memenv

$(STATIC_ALLOBJS): | STATIC_OBJDIRS
$(DEVICE_ALLOBJS): | DEVICE_OBJDIRS
$(SIMULATOR_ALLOBJS): | SIMULATOR_OBJDIRS
$(SHARED_ALLOBJS): | SHARED_OBJDIRS

ifeq ($(PLATFORM), IOS)
$(DEVICE_OUTDIR)/libleveldb.a: $(DEVICE_LIBOBJECTS)
	rm -f $@
	$(AR) -rs $@ $(DEVICE_LIBOBJECTS)

$(SIMULATOR_OUTDIR)/libleveldb.a: $(SIMULATOR_LIBOBJECTS)
	rm -f $@
	$(AR) -rs $@ $(SIMULATOR_LIBOBJECTS)

$(DEVICE_OUTDIR)/libmemenv.a: $(DEVICE_MEMENVOBJECTS)
	rm -f $@
	$(AR) -rs $@ $(DEVICE_MEMENVOBJECTS)

$(SIMULATOR_OUTDIR)/libmemenv.a: $(SIMULATOR_MEMENVOBJECTS)
	rm -f $@
	$(AR) -rs $@ $(SIMULATOR_MEMENVOBJECTS)

# For iOS, create universal object libraries to be used on both the simulator and
# a device.
$(STATIC_OUTDIR)/libleveldb.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a
	lipo -create $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a -output $@

$(STATIC_OUTDIR)/libmemenv.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a
	lipo -create $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a -output $@
else
$(STATIC_OUTDIR)/libleveldb.a:$(STATIC_LIBOBJECTS)
	rm -f $@
	$(AR) -rs $@ $(STATIC_LIBOBJECTS)

$(STATIC_OUTDIR)/libmemenv.a:$(STATIC_MEMENVOBJECTS)
	rm -f $@
	$(AR) -rs $@ $(STATIC_MEMENVOBJECTS)
#
# command line targets: debug and prof
# just like
ifneq ($(filter debug,$(MAKECMDGOALS)),)
OPT := -g2 # (B) Debug mode, w/ full line-level debugging symbols
debug: all
endif

$(SHARED_MEMENVLIB):$(SHARED_MEMENVOBJECTS)
ifneq ($(filter prof,$(MAKECMDGOALS)),)
OPT := -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
prof: all
endif

clean:
	-rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) */*.o */*/*.o */*.d */*/*.d ios-x86/*/*.o ios-arm/*/*.o build_config.mk include/leveldb/ldb_config.h
	-rm -rf ios-x86/* ios-arm/* *.dSYM

$(LIBRARY): $(LIBOBJECTS)
	rm -f $@
	$(AR) -rs $@ $(SHARED_MEMENVOBJECTS)
	$(AR) -rs $@ $(LIBOBJECTS)

$(STATIC_OUTDIR)/db_bench:db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ $(LIBS)
#
# all tools, programs, and tests depend upon the static library
$(TESTS) $(PROGRAMS) $(TOOLS) : $(LIBRARY)

$(STATIC_OUTDIR)/db_bench_sqlite3:doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lsqlite3 $(LIBS)
#
# all tests depend upon the test harness
$(TESTS) : $(TESTHARNESS)

$(STATIC_OUTDIR)/db_bench_tree_db:doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lkyotocabinet $(LIBS)
#
# tools, programs, and tests will compile to the root directory
# but their .cc source file will be in one of the following subdirectories
vpath %.cc db:table:util:leveldb_ee:leveldb_os

$(STATIC_OUTDIR)/leveldbutil:db/leveldbutil.cc $(STATIC_LIBOBJECTS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/leveldbutil.cc $(STATIC_LIBOBJECTS) -o $@ $(LIBS)
# special case for c_test
vpath %.c db

$(STATIC_OUTDIR)/arena_test:util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
db_bench: db/db_bench.o $(LIBRARY) $(TESTUTIL)
	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTUTIL) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)

$(STATIC_OUTDIR)/autocompact_test:db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBRARY) $(TESTUTIL)

$(STATIC_OUTDIR)/bloom_test:util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBRARY) $(TESTUTIL)

$(STATIC_OUTDIR)/c_test:$(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/cache_test:util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
#
# build line taken from lz4 makefile
#
util/lz4.o: util/lz4.c util/lz4.h
	$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -O3 -std=c99 -Wall -Wextra -Wundef -Wshadow -Wcast-qual -Wcast-align -Wstrict-prototypes -pedantic -DLZ4_VERSION=\"r130\" -c util/lz4.c -o util/lz4.o

$(STATIC_OUTDIR)/coding_test:util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
#
# memory env
#
$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
	rm -f $@
	$(AR) -rs $@ $(MEMENVOBJECTS)

$(STATIC_OUTDIR)/corruption_test:db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
	$(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS)

$(STATIC_OUTDIR)/crc32c_test:util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
#
# IOS build
#
ifeq ($(PLATFORM), IOS)
# For iOS, create universal object files to be used on both the simulator and
# a device.
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)

$(STATIC_OUTDIR)/db_test:db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
.cc.o:
	mkdir -p ios-x86/$(dir $@)
	$(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
	mkdir -p ios-arm/$(dir $@)
	$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
	lipo ios-x86/$@ ios-arm/$@ -create -output $@

$(STATIC_OUTDIR)/dbformat_test:db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
.c.o:
	mkdir -p ios-x86/$(dir $@)
	$(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
	mkdir -p ios-arm/$(dir $@)
	$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
	lipo ios-x86/$@ ios-arm/$@ -create -output $@

$(STATIC_OUTDIR)/env_posix_test:util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/env_test:util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/fault_injection_test:db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/filename_test:db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/filter_block_test:table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/hash_test:util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/issue178_test:issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/issue200_test:issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/log_test:db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/recovery_test:db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/table_test:table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/skiplist_test:db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/version_edit_test:db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/version_set_test:db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/write_batch_test:db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

$(STATIC_OUTDIR)/memenv_test:$(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS)
	$(XCRUN) $(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS) -o $@ $(LIBS)

$(SHARED_OUTDIR)/db_bench:$(SHARED_OUTDIR)/db/db_bench.o $(SHARED_LIBS) $(TESTUTIL)
	$(XCRUN) $(CXX) $(LDFLAGS) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SHARED_OUTDIR)/db/db_bench.o $(TESTUTIL) $(SHARED_OUTDIR)/$(SHARED_LIB3) -o $@ $(LIBS)

.PHONY: run-shared
run-shared: $(SHARED_OUTDIR)/db_bench
	LD_LIBRARY_PATH=$(SHARED_OUTDIR) $(SHARED_OUTDIR)/db_bench

$(SIMULATOR_OUTDIR)/%.o: %.cc
	xcrun -sdk iphonesimulator $(CXX) $(CXXFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@

$(DEVICE_OUTDIR)/%.o: %.cc
	xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) $(DEVICE_CFLAGS) -c $< -o $@

$(SIMULATOR_OUTDIR)/%.o: %.c
	xcrun -sdk iphonesimulator $(CC) $(CFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@

$(DEVICE_OUTDIR)/%.o: %.c
	xcrun -sdk iphoneos $(CC) $(CFLAGS) $(DEVICE_CFLAGS) -c $< -o $@

$(STATIC_OUTDIR)/%.o: %.cc
	$(CXX) $(CXXFLAGS) -c $< -o $@

$(STATIC_OUTDIR)/%.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@

$(SHARED_OUTDIR)/%.o: %.cc
else
#
# build for everything NOT IOS
#
.cc.o:
	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@

$(SHARED_OUTDIR)/%.o: %.c
.c.o:
	$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@

$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
	$(CXX) $(CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
## @echo -- Creating dependency file for $<
%.d: %.cc
	$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM -E -MT $(basename $@).d -MT $(basename $@).o -MF $@ $<
	@echo $(basename $@).o: $(basename $@).d >>$@

$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
# generic build for command line tests
%: %.cc
	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)

%: db/%.c
	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)

# for tools, omits test harness
%: tools/%.cc
	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)

endif

#
# load dependency files
#
ifeq ($(filter tar clean allclean distclean,$(MAKECMDGOALS)),)
-include $(DEPEND)
endif
83 src/leveldb/README (new file)
@@ -0,0 +1,83 @@
leveldb: A key-value store
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)

The original Google README is now README.GOOGLE.

** Introduction

This repository contains the Google source code as modified to benefit
the Riak environment. The typical Riak environment has two attributes
that necessitate leveldb adjustments, both in options and code:

- production servers: Riak often runs in heavy Internet environments:
  servers with many CPU cores, lots of memory, and 24x7 disk activity.
  Basho's leveldb takes advantage of the environment by adding
  hardware CRC calculation, increasing Bloom filter accuracy, and
  defaulting to integrity checking enabled.

- multiple databases open: Riak opens 8 to 128 databases
  simultaneously. Google's leveldb supports this, but its background
  compaction thread can fall behind. leveldb will "stall" new user
  writes whenever the compaction thread gets too far behind. Basho's
  leveldb modifications include multiple thread blocks that each
  contain prioritized threads for specific compaction activities.

Details for Basho's customizations exist in the leveldb wiki:

    http://github.com/basho/leveldb/wiki


** Branch pattern

This repository follows the Basho standard for branch management
as of November 28, 2013. The standard is found here:

    https://github.com/basho/riak/wiki/Basho-repository-management

In summary, the "develop" branch contains the most recently reviewed
engineering work. The "master" branch contains the most recently
released work, i.e. distributed as part of a Riak release.


** Basic options needed

Those wishing to truly savor the benefits of Basho's modifications
need to initialize a new leveldb::Options structure similar to the
following before each call to leveldb::DB::Open:

    leveldb::Options * options;

    options = new leveldb::Options;

    options->filter_policy = leveldb::NewBloomFilterPolicy2(16);
    options->write_buffer_size = 62914560;       // 60Mbytes
    options->total_leveldb_mem = 2684354560;     // 2.5Gbytes (details below)
    options->env = leveldb::Env::Default();


** Memory plan

Basho's leveldb dramatically departed from Google's original internal
memory allotment plan with Riak 2.0. Basho's leveldb uses a methodology
called flexcache. The technical details are here:

    https://github.com/basho/leveldb/wiki/mv-flexcache

The key points are:

- options.total_leveldb_mem is an allocation for the entire process,
  not a single database

- giving different values to options.total_leveldb_mem on subsequent Open
  calls causes memory to rearrange to the current value across all databases

- the recommended minimum for Basho's leveldb is 340Mbytes per database.

- performance improves rapidly from 340Mbytes to 2.5Gbytes per database (3.0Gbytes
  if using Riak's active anti-entropy). Even more is nice, but not as helpful.

- never assign more than 75% of available RAM to total_leveldb_mem. There is
  too much unaccounted memory overhead (worse if you use the tcmalloc library).

- options.max_open_files and options.block_cache should not be used.
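A short sketch of the flexcache behavior described above (Basho fork only; the paths are placeholders): the budget is process-wide, so a later Open with a different total_leveldb_mem rebalances memory across every database already open.

    leveldb::Options opts;
    opts.total_leveldb_mem = 2684354560;             // 2.5 GB shared by the whole process
    leveldb::DB *db1 = nullptr, *db2 = nullptr;
    leveldb::DB::Open(opts, "/var/data/db1", &db1);
    leveldb::DB::Open(opts, "/var/data/db2", &db2);  // same budget, now split across both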
51 src/leveldb/README.GOOGLE (new file)
@@ -0,0 +1,51 @@
leveldb: A key-value store
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)

The code under this directory implements a system for maintaining a
persistent key/value store.

See doc/index.html for more explanation.
See doc/impl.html for a brief overview of the implementation.

The public interface is in include/*.h. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.

Guide to header files:

include/db.h
    Main interface to the DB: Start here

include/options.h
    Control over the behavior of an entire database, and also
    control over the behavior of individual reads and writes.

include/comparator.h
    Abstraction for user-specified comparison function. If you want
    just bytewise comparison of keys, you can use the default comparator,
    but clients can write their own comparator implementations if they
    want custom ordering (e.g. to handle different character
    encodings, etc.)

include/iterator.h
    Interface for iterating over data. You can get an iterator
    from a DB object.

include/write_batch.h
    Interface for atomically applying multiple updates to a database.

include/slice.h
    A simple module for maintaining a pointer and a length into some
    other byte array.

include/status.h
    Status is returned from many of the public interfaces and is used
    to report success and various kinds of errors.

include/env.h
    Abstraction of the OS environment. A posix implementation of
    this interface is in util/env_posix.cc

include/table.h
include/table_builder.h
    Lower-level modules that most clients probably won't use directly
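To make the write_batch.h entry above concrete, a minimal sketch using the standard leveldb API (the MoveValue helper name is hypothetical): both updates in the batch are applied atomically or not at all.

    #include "leveldb/db.h"
    #include "leveldb/write_batch.h"

    void MoveValue(leveldb::DB* db, const std::string& from, const std::string& to) {
        std::string value;
        if (db->Get(leveldb::ReadOptions(), from, &value).ok()) {
            leveldb::WriteBatch batch;
            batch.Delete(from);    // both operations commit together...
            batch.Put(to, value);  // ...or neither does
            db->Write(leveldb::WriteOptions(), &batch);
        }
    }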
@ -1,174 +0,0 @@
|
|||
**LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.**
|
||||
|
||||
[![Build Status](https://travis-ci.org/google/leveldb.svg?branch=master)](https://travis-ci.org/google/leveldb)
|
||||
|
||||
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
|
||||
|
||||
# Features
|
||||
* Keys and values are arbitrary byte arrays.
|
||||
* Data is stored sorted by key.
|
||||
* Callers can provide a custom comparison function to override the sort order.
|
||||
* The basic operations are `Put(key,value)`, `Get(key)`, `Delete(key)`.
|
||||
* Multiple changes can be made in one atomic batch.
|
||||
* Users can create a transient snapshot to get a consistent view of data.
|
||||
* Forward and backward iteration is supported over the data.
|
||||
* Data is automatically compressed using the [Snappy compression library](http://google.github.io/snappy/).
|
||||
* External activity (file system operations etc.) is relayed through a virtual interface so users can customize the operating system interactions.
|
||||
|
||||
# Documentation
|
||||
[LevelDB library documentation](https://github.com/google/leveldb/blob/master/doc/index.md) is online and bundled with the source code.
|
||||
|
||||
|
||||
# Limitations
|
||||
* This is not a SQL database. It does not have a relational data model, it does not support SQL queries, and it has no support for indexes.
|
||||
* Only a single process (possibly multi-threaded) can access a particular database at a time.
|
||||
* There is no client-server support builtin to the library. An application that needs such support will have to wrap their own server around the library.
|
||||
|
||||
# Contributing to the leveldb Project
|
||||
The leveldb project welcomes contributions. leveldb's primary goal is to be
|
||||
a reliable and fast key/value store. Changes that are in line with the
|
||||
features/limitations outlined above, and meet the requirements below,
|
||||
will be considered.
|
||||
|
||||
Contribution requirements:
|
||||
|
||||
1. **POSIX only**. We _generally_ will only accept changes that are both
|
||||
compiled, and tested on a POSIX platform - usually Linux. Very small
|
||||
changes will sometimes be accepted, but consider that more of an
|
||||
exception than the rule.
|
||||
|
||||
2. **Stable API**. We strive very hard to maintain a stable API. Changes that
|
||||
require changes for projects using leveldb _might_ be rejected without
|
||||
sufficient benefit to the project.
|
||||
|
||||
3. **Tests**: All changes must be accompanied by a new (or changed) test, or
|
||||
a sufficient explanation as to why a new (or changed) test is not required.
|
||||
|
||||
## Submitting a Pull Request
|
||||
Before any pull request will be accepted the author must first sign a
|
||||
Contributor License Agreement (CLA) at https://cla.developers.google.com/.
|
||||
|
||||
In order to keep the commit timeline linear
|
||||
[squash](https://git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Squashing-Commits)
|
||||
your changes down to a single commit and [rebase](https://git-scm.com/docs/git-rebase)
|
||||
on google/leveldb/master. This keeps the commit timeline linear and more easily sync'ed
|
||||
with the internal repository at Google. More information at GitHub's
|
||||
[About Git rebase](https://help.github.com/articles/about-git-rebase/) page.
|
||||
|
||||
# Performance
|
||||
|
||||
Here is a performance report (with explanations) from the run of the
|
||||
included db_bench program. The results are somewhat noisy, but should
|
||||
be enough to get a ballpark performance estimate.
|
||||
|
||||
## Setup
|
||||
|
||||
We use a database with a million entries. Each entry has a 16 byte
|
||||
key, and a 100 byte value. Values used by the benchmark compress to
|
||||
about half their original size.
|
||||
|
||||
    LevelDB:    version 1.1
    Date:       Sun May 1 12:11:26 2011
    CPU:        4 x Intel(R) Core(TM)2 Quad CPU Q6600 @ 2.40GHz
    CPUCache:   4096 KB
    Keys:       16 bytes each
    Values:     100 bytes each (50 bytes after compression)
    Entries:    1000000
    Raw Size:   110.6 MB (estimated)
    File Size:  62.9 MB (estimated)

## Write performance

The "fill" benchmarks create a brand new database, in either
sequential, or random order. The "fillsync" benchmark flushes data
from the operating system to the disk after every operation; the other
write operations leave the data sitting in the operating system buffer
cache for a while. The "overwrite" benchmark does random writes that
update existing keys in the database.

    fillseq      :  1.765 micros/op;  62.7 MB/s
    fillsync     :  268.409 micros/op;  0.4 MB/s (10000 ops)
    fillrandom   :  2.460 micros/op;  45.0 MB/s
    overwrite    :  2.380 micros/op;  46.5 MB/s

Each "op" above corresponds to a write of a single key/value pair.
I.e., a random write benchmark goes at approximately 400,000 writes per
second (one op every 2.460 microseconds).

Each "fillsync" operation costs much less (0.3 millisecond)
than a disk seek (typically 10 milliseconds). We suspect that this is
because the hard disk itself is buffering the update in its memory and
responding before the data has been written to the platter. This may
or may not be safe based on whether or not the hard disk has enough
power to save its memory in the event of a power failure.

## Read performance

We list the performance of reading sequentially in both the forward
and reverse direction, and also the performance of a random lookup.
Note that the database created by the benchmark is quite small.
Therefore the report characterizes the performance of leveldb when the
working set fits in memory. The cost of reading a piece of data that
is not present in the operating system buffer cache will be dominated
by the one or two disk seeks needed to fetch the data from disk.
Write performance will be mostly unaffected by whether or not the
working set fits in memory.

    readrandom   : 16.677 micros/op;  (approximately 60,000 reads per second)
    readseq      :  0.476 micros/op;  232.3 MB/s
    readreverse  :  0.724 micros/op;  152.9 MB/s

LevelDB compacts its underlying storage data in the background to
improve read performance. The results listed above were done
immediately after a lot of random writes. The results after
compactions (which are usually triggered automatically) are better.

    readrandom   : 11.602 micros/op;  (approximately 85,000 reads per second)
    readseq      :  0.423 micros/op;  261.8 MB/s
    readreverse  :  0.663 micros/op;  166.9 MB/s

Some of the high cost of reads comes from repeated decompression of blocks
read from disk. If we supply enough cache to leveldb so that it can hold the
uncompressed blocks in memory, the read performance improves again:

    readrandom   : 9.775 micros/op;  (approximately 100,000 reads per second before compaction)
    readrandom   : 5.215 micros/op;  (approximately 190,000 reads per second after compaction)
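
For example, a larger block cache can be supplied when opening the database; here is a minimal sketch, with the cache size chosen only for illustration:

    #include <cassert>
    #include "leveldb/cache.h"
    #include "leveldb/db.h"

    int main() {
      leveldb::Options options;
      options.create_if_missing = true;
      // Hold roughly 500 MB of uncompressed blocks in memory (illustrative size).
      options.block_cache = leveldb::NewLRUCache(500 * 1048576);

      leveldb::DB* db;
      assert(leveldb::DB::Open(options, "/tmp/testdb", &db).ok());
      // ... reads now hit the larger cache ...
      delete db;
      delete options.block_cache;
      return 0;
    }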
## Repository contents

See [doc/index.md](doc/index.md) for more explanation. See
[doc/impl.md](doc/impl.md) for a brief overview of the implementation.

The public interface is in include/*.h. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.

Guide to header files:

* **include/db.h**: Main interface to the DB: Start here

* **include/options.h**: Control over the behavior of an entire database,
and also control over the behavior of individual reads and writes.

* **include/comparator.h**: Abstraction for user-specified comparison function.
If you want just bytewise comparison of keys, you can use the default
comparator, but clients can write their own comparator implementations if they
want custom ordering (e.g. to handle different character encodings, etc.)

* **include/iterator.h**: Interface for iterating over data. You can get
an iterator from a DB object.

* **include/write_batch.h**: Interface for atomically applying multiple
updates to a database.

* **include/slice.h**: A simple module for maintaining a pointer and a
length into some other byte array.

* **include/status.h**: Status is returned from many of the public interfaces
and is used to report success and various kinds of errors.

* **include/env.h**:
Abstraction of the OS environment. A posix implementation of this interface is
in util/env_posix.cc

* **include/table.h, include/table_builder.h**: Lower-level modules that most
clients probably won't use directly
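
As a quick orientation to db.h, options.h, status.h and slice.h, here is a minimal sketch of a single put/get round trip (the path and key are illustrative):

    #include <cassert>
    #include <cstdio>
    #include <string>
    #include "leveldb/db.h"

    int main() {
      leveldb::DB* db;
      leveldb::Options options;
      options.create_if_missing = true;
      leveldb::Status s = leveldb::DB::Open(options, "/tmp/guidedb", &db);
      assert(s.ok());

      // Put and Get both report their outcome through leveldb::Status.
      s = db->Put(leveldb::WriteOptions(), "name", "leveldb");
      std::string value;
      if (s.ok()) s = db->Get(leveldb::ReadOptions(), "name", &value);
      if (!s.ok()) std::fprintf(stderr, "error: %s\n", s.ToString().c_str());

      delete db;
      return 0;
    }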

@ -7,7 +7,6 @@ db
  within [start_key..end_key]?  For Chrome, deletion of obsolete
  object stores, etc. can be done in the background anyway, so
  probably not that important.
- There have been requests for MultiGet.

After a range is completely deleted, what gets rid of the
corresponding files if we do no future changes to that range.  Make

@ -1,39 +0,0 @@
# Building LevelDB On Windows

## Prereqs

Install the [Windows Software Development Kit version 7.1](http://www.microsoft.com/downloads/dlx/en-us/listdetailsview.aspx?FamilyID=6b6c21d2-2006-4afa-9702-529fa782d63b).

Download and extract the [Snappy source distribution](http://snappy.googlecode.com/files/snappy-1.0.5.tar.gz)

1. Open the "Windows SDK 7.1 Command Prompt":
   Start Menu -> "Microsoft Windows SDK v7.1" > "Windows SDK 7.1 Command Prompt"
2. Change the directory to the leveldb project

## Building the Static lib

* 32 bit Version

        setenv /x86
        msbuild.exe /p:Configuration=Release /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5

* 64 bit Version

        setenv /x64
        msbuild.exe /p:Configuration=Release /p:Platform=x64 /p:Snappy=..\snappy-1.0.5

## Building and Running the Benchmark app

* 32 bit Version

        setenv /x86
        msbuild.exe /p:Configuration=Benchmark /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5
        Benchmark\leveldb.exe

* 64 bit Version

        setenv /x64
        msbuild.exe /p:Configuration=Benchmark /p:Platform=x64 /p:Snappy=..\snappy-1.0.5
        x64\Benchmark\leveldb.exe

@ -7,11 +7,8 @@
#    CC                      C Compiler path
#    CXX                     C++ Compiler path
#    PLATFORM_LDFLAGS        Linker flags
#    PLATFORM_LIBS           Libraries flags
#    PLATFORM_SHARED_EXT     Extension for shared libraries
#    PLATFORM_SHARED_LDFLAGS Flags for building shared library
#                            This flag is embedded just before the name
#                            of the shared library without intervening spaces
#    PLATFORM_SHARED_CFLAGS  Flags for compiling objects for shared library
#    PLATFORM_CCFLAGS        C compiler flags
#    PLATFORM_CXXFLAGS       C++ compiler flags. Will contain:
@ -20,15 +17,14 @@
#
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
#
#       -DLEVELDB_ATOMIC_PRESENT     if <atomic> is present
#       -DLEVELDB_CSTDATOMIC_PRESENT if <cstdatomic> is present
#       -DLEVELDB_PLATFORM_POSIX     for Posix-based platforms
#       -DSNAPPY                     if the Snappy library is present
#

OUTPUT=$1
PREFIX=$2
if test -z "$OUTPUT" || test -z "$PREFIX"; then
  echo "usage: $0 <output-filename> <directory_prefix>" >&2
if test -z "$OUTPUT"; then
  echo "usage: $0 <output-filename>" >&2
  exit 1
fi
@ -44,10 +40,6 @@ if test -z "$CXX"; then
    CXX=g++
fi

if test -z "$TMPDIR"; then
    TMPDIR=/tmp
fi

# Detect OS
if test -z "$TARGET_OS"; then
    TARGET_OS=`uname -s`
@ -58,119 +50,77 @@ CROSS_COMPILE=
PLATFORM_CCFLAGS=
PLATFORM_CXXFLAGS=
PLATFORM_LDFLAGS=
PLATFORM_LIBS=
PLATFORM_SHARED_EXT="so"
PLATFORM_SHARED_EXT=
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=true
PLATFORM_SSEFLAGS=

MEMCMP_FLAG=
if [ "$CXX" = "g++" ]; then
    # Use libc's memcmp instead of GCC's memcmp. This results in ~40%
    # performance improvement on readrandom under gcc 4.4.3 on Linux/x86.
    MEMCMP_FLAG="-fno-builtin-memcmp"
if test -n "$LEVELDB_VSN"; then
    VERSION_FLAGS="$VERSION_FLAGS -DLEVELDB_VSN=\"$LEVELDB_VSN\""
fi

# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
case "$TARGET_OS" in
    CYGWIN_*)
        PLATFORM=OS_LINUX
        COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN"
        PLATFORM_LDFLAGS="-lpthread"
        PORT_FILE=port/port_posix.cc
        PORT_SSE_FILE=port/port_posix_sse.cc
        ;;
    Darwin)
        PLATFORM=OS_MACOSX
        COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
        PLATFORM_SHARED_EXT=dylib
        [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/"
        oIFS="$IFS"; IFS=.
        set `uname -r`
        IFS="$oIFS"
        if [ "$1" -ge 13 ]; then
            # assume clang compiler
            COMMON_FLAGS="-mmacosx-version-min=10.8 -DOS_MACOSX -stdlib=libc++"
            PLATFORM_LDFLAGS="-mmacosx-version-min=10.8"
        else
            COMMON_FLAGS="-fno-builtin-memcmp -DOS_MACOSX"
        fi
        PLATFORM_SHARED_EXT=
        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
        PORT_FILE=port/port_posix.cc
        PORT_SSE_FILE=port/port_posix_sse.cc
        ;;
    Linux)
        PLATFORM=OS_LINUX
        COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX"
        PLATFORM_LDFLAGS="-pthread"
        COMMON_FLAGS="-fno-builtin-memcmp -pthread -DOS_LINUX"
        PLATFORM_LDFLAGS="-pthread -lrt"
        PORT_FILE=port/port_posix.cc
        PORT_SSE_FILE=port/port_posix_sse.cc
        ;;
    SunOS)
        PLATFORM=OS_SOLARIS
        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS"
        PLATFORM_LIBS="-lpthread -lrt"
        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS -m64"
        PLATFORM_LDFLAGS="-lpthread -lrt"
        PLATFORM_SHARED_EXT=
        PORT_FILE=port/port_posix.cc
        PORT_SSE_FILE=port/port_posix_sse.cc
        ;;
    FreeBSD)
        CC=cc
        CXX=c++
        PLATFORM=OS_FREEBSD
        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD"
        PLATFORM_LIBS="-lpthread"
        PORT_FILE=port/port_posix.cc
        PORT_SSE_FILE=port/port_posix_sse.cc
        ;;
    GNU/kFreeBSD)
        PLATFORM=OS_KFREEBSD
        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_KFREEBSD"
        PLATFORM_LIBS="-lpthread"
        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
        PLATFORM_LDFLAGS="-lpthread"
        PORT_FILE=port/port_posix.cc
        ;;
    NetBSD)
        PLATFORM=OS_NETBSD
        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"
        PLATFORM_LIBS="-lpthread -lgcc_s"
        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
        PLATFORM_LDFLAGS="-lpthread -lgcc_s"
        PORT_FILE=port/port_posix.cc
        PORT_SSE_FILE=port/port_posix_sse.cc
        ;;
    OpenBSD)
        PLATFORM=OS_OPENBSD
        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD"
        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
        PLATFORM_LDFLAGS="-pthread"
        PORT_FILE=port/port_posix.cc
        PORT_SSE_FILE=port/port_posix_sse.cc
        ;;
    DragonFly)
        PLATFORM=OS_DRAGONFLYBSD
        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD"
        PLATFORM_LIBS="-lpthread"
        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
        PLATFORM_LDFLAGS="-lpthread"
        PORT_FILE=port/port_posix.cc
        PORT_SSE_FILE=port/port_posix_sse.cc
        ;;
    OS_ANDROID_CROSSCOMPILE)
        PLATFORM=OS_ANDROID
        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
        PLATFORM_LDFLAGS=""  # All pthread features are in the Android C library
        PORT_FILE=port/port_posix.cc
        PORT_SSE_FILE=port/port_posix_sse.cc
        CROSS_COMPILE=true
        ;;
    HP-UX)
        PLATFORM=OS_HPUX
        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
        PLATFORM_LDFLAGS="-pthread"
        PORT_FILE=port/port_posix.cc
        PORT_SSE_FILE=port/port_posix_sse.cc
        # man ld: +h internal_name
        PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
        ;;
    IOS)
        PLATFORM=IOS
        COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
        [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
        PORT_FILE=port/port_posix.cc
        PORT_SSE_FILE=port/port_posix_sse.cc
        PLATFORM_SHARED_EXT=
        PLATFORM_SHARED_LDFLAGS=
        PLATFORM_SHARED_CFLAGS=
        PLATFORM_SHARED_VERSIONED=
        ;;
    OS_WINDOWS_CROSSCOMPILE | NATIVE_WINDOWS)
        PLATFORM=OS_WINDOWS
        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_WINDOWS -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1"
        PLATFORM_SOURCES="util/env_win.cc"
        PLATFORM_LIBS="-lshlwapi"
        PORT_FILE=port/port_win.cc
        CROSS_COMPILE=true
        ;;
    *)
@ -182,78 +132,106 @@ esac
# except for the test and benchmark files. By default, find will output a list
# of all files matching either rule, so we need to append -print to make the
# prune take effect.
DIRS="$PREFIX/db $PREFIX/util $PREFIX/table"

if [ -f leveldb_ee/README.md ]; then
    DIRS="util db table leveldb_ee"
else
    DIRS="util db table leveldb_os"
fi
set -f # temporarily disable globbing so that our patterns aren't expanded
PRUNE_TEST="-name *test*.cc -prune"
PRUNE_BENCH="-name *_bench.cc -prune"
PRUNE_TOOL="-name leveldbutil.cc -prune"
PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_TOOL -o -name '*.cc' -print | sort | sed "s,^$PREFIX/,," | tr "\n" " "`

PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
TESTS=`find $DIRS -name '*_test.c*' -print | sort | tr "\n" " "`
set +f # re-enable globbing

# The sources consist of the portable files, plus the platform-specific port
# file.
echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT
echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
echo "TEST_SOURCES=$TESTS" >>$OUTPUT

if [ "$CROSS_COMPILE" = "true" ]; then
    # Cross-compiling; do not try any compilation tests.
    true
else
    CXXOUTPUT="${TMPDIR}/leveldb_build_detect_platform-cxx.$$"

    # If -std=c++0x works, use <atomic> as fallback for when memory barriers
    # are not available.
    $CXX $CXXFLAGS -std=c++0x -x c++ - -o $CXXOUTPUT 2>/dev/null <<EOF
      #include <atomic>
    # If -std=c++0x works, use <cstdatomic>. Otherwise use port_posix.h.
    $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF
      #include <cstdatomic>
      int main() {}
EOF
    if [ "$?" = 0 ]; then
        COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_ATOMIC_PRESENT"
        COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT"
        PLATFORM_CXXFLAGS="-std=c++0x"
    else
        COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
    fi

    # Test whether Snappy library is installed
    # http://code.google.com/p/snappy/
    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
      #include <snappy.h>
      int main() {}
EOF
    if [ "$?" = 0 ]; then
        COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
        if [ "$PLATFORM" = "OS_LINUX" ]; then
            # Basho: switching to static snappy library to make tools more portable
            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -Wl,-non_shared -lsnappy -Wl,-call_shared"
        else
            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
        fi
    fi

    # Test whether tcmalloc is available
    $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -ltcmalloc 2>/dev/null <<EOF
    $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
      int main() {}
EOF
    if [ "$?" = 0 ]; then
        PLATFORM_LIBS="$PLATFORM_LIBS -ltcmalloc"
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
    fi
fi

rm -f $CXXOUTPUT 2>/dev/null

# Test if gcc SSE 4.2 is supported
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null <<EOF
  int main() {}
EOF
if [ "$?" = 0 ]; then
    PLATFORM_SSEFLAGS="-msse4.2"
fi

rm -f $CXXOUTPUT 2>/dev/null
fi

# Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them.
if [ -n "$PLATFORM_SSEFLAGS" ]; then
    PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE"
fi

PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS $VERSION_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS $VERSION_FLAGS"

echo "CC=$CC" >> $OUTPUT
echo "CXX=$CXX" >> $OUTPUT
echo "PLATFORM=$PLATFORM" >> $OUTPUT
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT

#
# Basho extension to place -D variable in include/leveldb/ldb_config.h
#

LDB_CONFIG="include/leveldb/ldb_config.h"

# Delete existing output, if it exists
rm -f $LDB_CONFIG

write_config_h()
{
    for param in $@
    do
        prefix=$(expr -- $param : "\(..\)")
        if [ X$prefix = "X-D" ]
        then
            echo "" >>$LDB_CONFIG
            echo "#ifndef $(expr -- $param : '..\(.*\)')" >>$LDB_CONFIG
            echo " #define $(expr -- $param : '..\(.*\)')" >>$LDB_CONFIG
            echo "#endif" >>$LDB_CONFIG
        fi
    done
}

echo "/** This file is generated by build_detect_platform." >$LDB_CONFIG
echo " * It saves the state of compile flags. This benefits the reuse" >>$LDB_CONFIG
echo " * of internal include files outside of a leveldb build." >>$LDB_CONFIG
echo " */" >>$LDB_CONFIG

write_config_h $COMMON_FLAGS
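
For reference, a run on Linux with Snappy installed would typically leave an include/leveldb/ldb_config.h along these lines (the exact guards depend on the detected COMMON_FLAGS; this is only an illustration):

    /** This file is generated by build_detect_platform.
     * It saves the state of compile flags. This benefits the reuse
     * of internal include files outside of a leveldb build.
     */

    #ifndef OS_LINUX
     #define OS_LINUX
    #endif

    #ifndef LEVELDB_PLATFORM_POSIX
     #define LEVELDB_PLATFORM_POSIX
    #endif

    #ifndef SNAPPY
     #define SNAPPY
    #endif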
@ -1,118 +0,0 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "leveldb/db.h"
#include "db/db_impl.h"
#include "leveldb/cache.h"
#include "util/testharness.h"
#include "util/testutil.h"

namespace leveldb {

class AutoCompactTest {
 public:
  std::string dbname_;
  Cache* tiny_cache_;
  Options options_;
  DB* db_;

  AutoCompactTest() {
    dbname_ = test::TmpDir() + "/autocompact_test";
    tiny_cache_ = NewLRUCache(100);
    options_.block_cache = tiny_cache_;
    DestroyDB(dbname_, options_);
    options_.create_if_missing = true;
    options_.compression = kNoCompression;
    ASSERT_OK(DB::Open(options_, dbname_, &db_));
  }

  ~AutoCompactTest() {
    delete db_;
    DestroyDB(dbname_, Options());
    delete tiny_cache_;
  }

  std::string Key(int i) {
    char buf[100];
    snprintf(buf, sizeof(buf), "key%06d", i);
    return std::string(buf);
  }

  uint64_t Size(const Slice& start, const Slice& limit) {
    Range r(start, limit);
    uint64_t size;
    db_->GetApproximateSizes(&r, 1, &size);
    return size;
  }

  void DoReads(int n);
};

static const int kValueSize = 200 * 1024;
static const int kTotalSize = 100 * 1024 * 1024;
static const int kCount = kTotalSize / kValueSize;

// Read through the first n keys repeatedly and check that they get
// compacted (verified by checking the size of the key space).
void AutoCompactTest::DoReads(int n) {
  std::string value(kValueSize, 'x');
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);

  // Fill database
  for (int i = 0; i < kCount; i++) {
    ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
  }
  ASSERT_OK(dbi->TEST_CompactMemTable());

  // Delete everything
  for (int i = 0; i < kCount; i++) {
    ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
  }
  ASSERT_OK(dbi->TEST_CompactMemTable());

  // Get initial measurement of the space we will be reading.
  const int64_t initial_size = Size(Key(0), Key(n));
  const int64_t initial_other_size = Size(Key(n), Key(kCount));

  // Read until size drops significantly.
  std::string limit_key = Key(n);
  for (int read = 0; true; read++) {
    ASSERT_LT(read, 100) << "Taking too long to compact";
    Iterator* iter = db_->NewIterator(ReadOptions());
    for (iter->SeekToFirst();
         iter->Valid() && iter->key().ToString() < limit_key;
         iter->Next()) {
      // Drop data
    }
    delete iter;
    // Wait a little bit to allow any triggered compactions to complete.
    Env::Default()->SleepForMicroseconds(1000000);
    uint64_t size = Size(Key(0), Key(n));
    fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n",
            read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0);
    if (size <= initial_size/10) {
      break;
    }
  }

  // Verify that the size of the key space not touched by the reads
  // is pretty much unchanged.
  const int64_t final_other_size = Size(Key(n), Key(kCount));
  ASSERT_LE(final_other_size, initial_other_size + 1048576);
  ASSERT_GE(final_other_size, initial_other_size/5 - 1048576);
}

TEST(AutoCompactTest, ReadAll) {
  DoReads(kCount);
}

TEST(AutoCompactTest, ReadHalf) {
  DoReads(kCount/2);
}

}  // namespace leveldb

int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}

@ -2,12 +2,16 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#define __STDC_FORMAT_MACROS
#include <inttypes.h>

#include "db/builder.h"

#include "db/filename.h"
#include "db/dbformat.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/version_set.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/iterator.h"
@ -17,27 +21,51 @@ namespace leveldb {
Status BuildTable(const std::string& dbname,
                  Env* env,
                  const Options& options,
                  const Comparator * user_comparator,
                  TableCache* table_cache,
                  Iterator* iter,
                  FileMetaData* meta) {
                  FileMetaData* meta,
                  SequenceNumber smallest_snapshot) {
  Status s;
  size_t keys_seen, keys_retired;

  keys_seen=0;
  keys_retired=0;

  meta->file_size = 0;
  iter->SeekToFirst();

  std::string fname = TableFileName(dbname, meta->number);
  KeyRetirement retire(user_comparator, smallest_snapshot, &options);

  std::string fname = TableFileName(options, meta->number, meta->level);
  if (iter->Valid()) {
    WritableFile* file;
    s = env->NewWritableFile(fname, &file);

    s = env->NewWritableFile(fname, &file,
                             env->RecoveryMmapSize(&options));
    if (!s.ok()) {
      return s;
    }

    // tune fadvise to keep all of this lower level file in page cache
    //  (compaction of unsorted files causes severe cache misses)
    file->SetMetadataOffset(1);

    TableBuilder* builder = new TableBuilder(options, file);
    meta->smallest.DecodeFrom(iter->key());
    for (; iter->Valid(); iter->Next()) {
      ++keys_seen;
      Slice key = iter->key();
      if (!retire(key))
      {
        meta->largest.DecodeFrom(key);
        builder->Add(key, iter->value());
        ++meta->num_entries;
      } // if
      else
      {
        ++keys_retired;
      } // else
    }

    // Finish and check for builder errors
@ -45,6 +73,9 @@ Status BuildTable(const std::string& dbname,
    s = builder->Finish();
    if (s.ok()) {
      meta->file_size = builder->FileSize();
      meta->exp_write_low = builder->GetExpiryWriteLow();
      meta->exp_write_high = builder->GetExpiryWriteHigh();
      meta->exp_explicit_high = builder->GetExpiryExplicitHigh();
      assert(meta->file_size > 0);
    }
  } else {
@ -64,10 +95,20 @@ Status BuildTable(const std::string& dbname,

  if (s.ok()) {
    // Verify that the table is usable
    Table * table_ptr;
    Iterator* it = table_cache->NewIterator(ReadOptions(),
                                            meta->number,
                                            meta->file_size);
                                            meta->file_size,
                                            meta->level,
                                            &table_ptr);
    s = it->status();

    // Riak specific: bloom filter is no longer read by default,
    //  force read on highly used overlapped table files
    if (s.ok() && VersionSet::IsLevelOverlapped(meta->level))
      table_ptr->ReadFilter();

    // table_ptr is owned by it and therefore invalidated by this delete
    delete it;
  }
}
@ -79,6 +120,11 @@ Status BuildTable(const std::string& dbname,

  if (s.ok() && meta->file_size > 0) {
    // Keep it
    if (0!=keys_retired)
    {
      Log(options.info_log, "Level-0 table #%" PRIu64 ": %zd keys seen, %zd keys retired, %zd keys expired",
          meta->number, keys_seen, retire.GetDroppedCount(), retire.GetExpiredCount());
    } // if
  } else {
    env->DeleteFile(fname);
  }

@ -6,6 +6,7 @@
#define STORAGE_LEVELDB_DB_BUILDER_H_

#include "leveldb/status.h"
#include "db/dbformat.h"

namespace leveldb {

@ -25,9 +26,11 @@ class VersionEdit;
extern Status BuildTable(const std::string& dbname,
                         Env* env,
                         const Options& options,
                         const Comparator * user_comparator,
                         TableCache* table_cache,
                         Iterator* iter,
                         FileMetaData* meta);
                         FileMetaData* meta,
                         SequenceNumber smallest_snapshot);

}  // namespace leveldb

@ -6,6 +6,7 @@

#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include "leveldb/cache.h"
#include "leveldb/comparator.h"
#include "leveldb/db.h"

@ -40,6 +41,8 @@ using leveldb::Status;
using leveldb::WritableFile;
using leveldb::WriteBatch;
using leveldb::WriteOptions;
using leveldb::KeyMetaData;
using leveldb::ValueType;

extern "C" {

@ -49,6 +52,7 @@ struct leveldb_writebatch_t { WriteBatch rep; };
struct leveldb_snapshot_t { const Snapshot* rep; };
struct leveldb_readoptions_t { ReadOptions rep; };
struct leveldb_writeoptions_t { WriteOptions rep; };
struct leveldb_keymetadata_t { KeyMetaData rep; };
struct leveldb_options_t { Options rep; };
struct leveldb_cache_t { Cache* rep; };
struct leveldb_seqfile_t { SequentialFile* rep; };
@ -173,8 +177,19 @@ void leveldb_put(
    const char* key, size_t keylen,
    const char* val, size_t vallen,
    char** errptr) {
  return(leveldb_put2(db, options, key, keylen, val, vallen, errptr, NULL));
}

void leveldb_put2(
    leveldb_t* db,
    const leveldb_writeoptions_t* options,
    const char* key, size_t keylen,
    const char* val, size_t vallen,
    char** errptr,
    const leveldb_keymetadata_t * metadata) {
  SaveError(errptr,
            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen),
                         (NULL==metadata ? NULL : &metadata->rep)));
}

void leveldb_delete(
@ -200,9 +215,21 @@ char* leveldb_get(
    const char* key, size_t keylen,
    size_t* vallen,
    char** errptr) {

  return(leveldb_get2(db, options, key, keylen, vallen, errptr, NULL));
}

char* leveldb_get2(
    leveldb_t* db,
    const leveldb_readoptions_t* options,
    const char* key, size_t keylen,
    size_t* vallen,
    char** errptr,
    leveldb_keymetadata_t * metadata) {
  char* result = NULL;
  std::string tmp;
  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp,
                          (NULL==metadata ? NULL : &metadata->rep));
  if (s.ok()) {
    *vallen = tmp.size();
    result = CopyString(tmp);
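
Taken together, the widened C entry points can be exercised as in this minimal sketch (error handling trimmed; the key and expiry values are illustrative, and the caller supplies the struct body exactly as the bundled c_test.c does):

    #include <string.h>
    #include "leveldb/c.h"
    #include "leveldb/options.h"

    struct leveldb_keymetadata_t { leveldb::KeyMetaData rep; };

    void example_put_get(leveldb_t* db,
                         const leveldb_writeoptions_t* wo,
                         const leveldb_readoptions_t* ro) {
      char* err = NULL;

      // Write a key whose metadata carries an explicit expiry.
      leveldb_keymetadata_t meta;
      meta.rep.m_Type = leveldb::kTypeValueExplicitExpiry;
      meta.rep.m_Expiry = 22446688;  // illustrative expiry value
      leveldb_put2(db, wo, "key", 3, "value", 5, &err, &meta);
      leveldb_free(err); err = NULL;

      // Read it back; leveldb_get2 also fills in the stored metadata.
      size_t vlen = 0;
      leveldb_keymetadata_t read_meta;
      char* val = leveldb_get2(db, ro, "key", 3, &vlen, &err, &read_meta);
      leveldb_free(val);
      leveldb_free(err);
    }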
@ -330,6 +357,15 @@ const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
  return s.data();
}

const void leveldb_iter_keymetadata(const leveldb_iterator_t* iter,
                                    leveldb_keymetadata_t * meta)
{
  if (NULL!=iter && NULL!=meta)
  {
    meta->rep=iter->rep->keymetadata();
  } // if
}

void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
  SaveError(errptr, iter->rep->status());
}
@ -350,7 +386,16 @@ void leveldb_writebatch_put(
    leveldb_writebatch_t* b,
    const char* key, size_t klen,
    const char* val, size_t vlen) {
  b->rep.Put(Slice(key, klen), Slice(val, vlen));
  leveldb_writebatch_put2(b, key, klen, val, vlen, NULL);
}

void leveldb_writebatch_put2(
    leveldb_writebatch_t* b,
    const char* key, size_t klen,
    const char* val, size_t vlen,
    const leveldb_keymetadata_t * metadata) {
  b->rep.Put(Slice(key, klen), Slice(val, vlen),
             (NULL==metadata ? NULL : &metadata->rep));
}

void leveldb_writebatch_delete(
@ -362,15 +407,20 @@ void leveldb_writebatch_delete(
void leveldb_writebatch_iterate(
    leveldb_writebatch_t* b,
    void* state,
    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen,
                const int & type, const uint64_t & expiry),
    void (*deleted)(void*, const char* k, size_t klen)) {
  class H : public WriteBatch::Handler {
   public:
    void* state_;
    void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
    void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen,
                 const int & type, const uint64_t & expiry);
    void (*deleted_)(void*, const char* k, size_t klen);
    virtual void Put(const Slice& key, const Slice& value) {
      (*put_)(state_, key.data(), key.size(), value.data(), value.size());
    virtual void Put(const Slice& key, const Slice& value,
                     const leveldb::ValueType & type,
                     const leveldb::ExpiryTimeMicros & expiry)
    {
      (*put_)(state_, key.data(), key.size(), value.data(), value.size(), (int)type, (uint64_t)expiry);
    }
    virtual void Delete(const Slice& key) {
      (*deleted_)(state_, key.data(), key.size());
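
On the caller side, a pair of callbacks matching the widened iterate signature might look like this sketch (names are illustrative):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    static void OnPut(void* state, const char* k, size_t klen,
                      const char* v, size_t vlen,
                      const int& type, const uint64_t& expiry) {
      // type/expiry arrive from the WriteBatch::Handler::Put shown above.
      fprintf(stderr, "put %.*s (type=%d expiry=%llu)\n",
              (int)klen, k, type, (unsigned long long)expiry);
    }

    static void OnDeleted(void* state, const char* k, size_t klen) {
      fprintf(stderr, "del %.*s\n", (int)klen, k);
    }

    // usage: leveldb_writebatch_iterate(batch, NULL, OnPut, OnDeleted);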
@ -418,6 +468,11 @@ void leveldb_options_set_paranoid_checks(
  opt->rep.paranoid_checks = v;
}

void leveldb_options_set_verify_compactions(
    leveldb_options_t* opt, unsigned char v) {
  opt->rep.verify_compactions = v;
}

void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
  opt->rep.env = (env ? env->rep : NULL);
}
@ -450,6 +505,10 @@ void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
  opt->rep.compression = static_cast<CompressionType>(t);
}

void leveldb_options_set_total_leveldb_mem(leveldb_options_t* opt, size_t s) {
  opt->rep.total_leveldb_mem = s;
}

leveldb_comparator_t* leveldb_comparator_create(
    void* state,
    void (*destructor)(void*),
@ -580,7 +639,17 @@ void leveldb_env_destroy(leveldb_env_t* env) {
  delete env;
}

void leveldb_env_shutdown() {
  Env::Shutdown();
}

/**
 * CAUTION: this call is only for char * objects returned by
 *  functions like leveldb_get and leveldb_property_value.
 *  Also used to release errptr strings.
 */
void leveldb_free(void* ptr) {
  if (NULL!=ptr)
    free(ptr);
}

@ -3,6 +3,8 @@
   found in the LICENSE file. See the AUTHORS file for names of contributors. */

#include "leveldb/c.h"
#include "leveldb/options.h"
#include "port/port.h"

#include <stddef.h>
#include <stdio.h>
@ -11,8 +13,13 @@
#include <sys/types.h>
#include <unistd.h>

using leveldb::ValueType;

struct leveldb_keymetadata_t { leveldb::KeyMetaData rep; };

const char* phase = "";
static char dbname[200];
static leveldb::ExpiryTimeMicros gStartTime;

static void StartPhase(const char* name) {
  fprintf(stderr, "=== Test %s\n", name);
@ -49,7 +56,7 @@ static void CheckEqual(const char* expected, const char* v, size_t n) {
    fprintf(stderr, "%s: expected '%s', got '%s'\n",
            phase,
            (expected ? expected : "(null)"),
            (v ? v : "(null"));
            (v ? v : "(null)"));
    abort();
  }
}
@ -112,6 +119,117 @@ static void CheckDel(void* ptr, const char* k, size_t klen) {
  (*state)++;
}

// (expiry enabled)
static void CheckGet2(
    leveldb_t* db,
    const leveldb_readoptions_t* options,
    const char* key,
    const char* expected,
    ValueType type,
    uint64_t expiry) {
  char* err = NULL;
  size_t val_len;
  char* val;
  leveldb_keymetadata_t meta;

  val = leveldb_get2(db, options, key, strlen(key), &val_len, &err, &meta);
  CheckNoError(err);
  CheckEqual(expected, val, val_len);
  CheckCondition(type==meta.rep.m_Type);
  if (0==expiry && leveldb::kTypeValueWriteTime==type)
  {
    leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros();
    CheckCondition(gStartTime<=meta.rep.m_Expiry && meta.rep.m_Expiry<=now);
  } // if
  else
  {CheckCondition(expiry==meta.rep.m_Expiry);}

  Free(&val);
}

// (expiry enabled)
static void CheckIter2(leveldb_iterator_t* iter,
                       const char* key, const char* val,
                       const leveldb::KeyMetaData & meta) {
  size_t len;
  const char* str;
  leveldb_keymetadata_t it_meta;

  str = leveldb_iter_key(iter, &len);
  CheckEqual(key, str, len);
  str = leveldb_iter_value(iter, &len);
  CheckEqual(val, str, len);

  leveldb_iter_keymetadata(iter, &it_meta);
  CheckCondition(meta.m_Type==it_meta.rep.m_Type);
  if (0==meta.m_Expiry && leveldb::kTypeValueWriteTime==meta.m_Type)
  {
    leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros();
    CheckCondition(gStartTime<=it_meta.rep.m_Expiry && it_meta.rep.m_Expiry<=now);
  } // if
  else
  {CheckCondition(meta.m_Expiry==it_meta.rep.m_Expiry);}

}

// Callback from leveldb_writebatch_iterate()
// (expiry enabled)
struct CheckPut2Data
{
  const char * m_Key;
  const char * m_Value;
  ValueType m_Type;
  uint64_t m_Expiry;
};

static struct CheckPut2Data gCheckPut2Data[]=
{
  {"foo","hello_put2",leveldb::kTypeValue,0},
  {"box","c_put2",leveldb::kTypeValue,0},
  {"disney","cartoon_put2",leveldb::kTypeValueWriteTime, 0},
  {"money","lotsof_put2",leveldb::kTypeValueWriteTime, 9988776655},
  {"time","ismoney_put2",leveldb::kTypeValueExplicitExpiry, 221199887766}
};

static struct CheckPut2Data gCheckPut2ItrData[]=
{
  {"bar","b",leveldb::kTypeValue,0},
  {"box","c",leveldb::kTypeValue,0},
  {"bar","",leveldb::kTypeDeletion,0},
  {"mom","texas",leveldb::kTypeValueWriteTime,0},
  {"dad","poland",leveldb::kTypeValueExplicitExpiry,22446688}
};

static void CheckPut2(void* ptr,
                      const char* k, size_t klen,
                      const char* v, size_t vlen,
                      const int & type_int,
                      const uint64_t & expiry) {
  int* state = (int*) ptr;
  CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0])));
  struct CheckPut2Data * test;

  test=&gCheckPut2ItrData[*state];
  CheckEqual(test->m_Key, k, klen);
  CheckEqual(test->m_Value, v, vlen);
  CheckCondition((int)test->m_Type==type_int);
  if (leveldb::kTypeValueWriteTime!=test->m_Type)
  {CheckCondition((uint64_t)test->m_Expiry==expiry);}
  (*state)++;
}

// Callback from leveldb_writebatch_iterate()
// (expiry enabled)
static void CheckDel2(void* ptr, const char* k, size_t klen) {
  int* state = (int*) ptr;
  CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0])));
  struct CheckPut2Data * test;

  test=&gCheckPut2ItrData[*state];
  CheckEqual(test->m_Key, k, klen);
  (*state)++;
}

static void CmpDestroy(void* arg) { }

static int CmpCompare(void* arg, const char* a, size_t alen,
@ -141,7 +259,7 @@ static char* FilterCreate(
    int num_keys,
    size_t* filter_length) {
  *filter_length = 4;
  char* result = malloc(4);
  char* result = (char*)malloc(4);
  memcpy(result, "fake", 4);
  return result;
}
@ -167,6 +285,7 @@ int main(int argc, char** argv) {

  CheckCondition(leveldb_major_version() >= 1);
  CheckCondition(leveldb_minor_version() >= 1);
  gStartTime=leveldb::port::TimeMicros();

  snprintf(dbname, sizeof(dbname),
           "%s/leveldb_c_test-%d",
@ -207,12 +326,6 @@ int main(int argc, char** argv) {
  CheckCondition(err != NULL);
  Free(&err);

  StartPhase("leveldb_free");
  db = leveldb_open(options, dbname, &err);
  CheckCondition(err != NULL);
  leveldb_free(err);
  err = NULL;

  StartPhase("open");
  leveldb_options_set_create_if_missing(options, 1);
  db = leveldb_open(options, dbname, &err);
@ -234,42 +347,74 @@ int main(int argc, char** argv) {

  StartPhase("writebatch");
  {
    leveldb_keymetadata_t meta;
    leveldb_writebatch_t* wb = leveldb_writebatch_create();
    leveldb_writebatch_put(wb, "foo", 3, "a", 1);
    leveldb_writebatch_clear(wb);
    leveldb_writebatch_put(wb, "bar", 3, "b", 1);
    leveldb_writebatch_put(wb, "box", 3, "c", 1);
    leveldb_writebatch_delete(wb, "bar", 3);
    meta.rep.m_Type=leveldb::kTypeValueWriteTime;
    meta.rep.m_Expiry=0;
    leveldb_writebatch_put2(wb, "mom", 3, "texas", 5, &meta);
    meta.rep.m_Type=leveldb::kTypeValueExplicitExpiry;
    meta.rep.m_Expiry=22446688;
    leveldb_writebatch_put2(wb, "dad", 3, "poland", 6, &meta);
    leveldb_write(db, woptions, wb, &err);
    CheckNoError(err);
    CheckGet(db, roptions, "foo", "hello");
    CheckGet(db, roptions, "bar", NULL);
    CheckGet(db, roptions, "box", "c");
    CheckGet2(db, roptions, "dad", "poland", leveldb::kTypeValueExplicitExpiry, 22446688);
    CheckGet2(db, roptions, "mom", "texas", leveldb::kTypeValueWriteTime, 0);
    int pos = 0;
    leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
    CheckCondition(pos == 3);
    leveldb_writebatch_iterate(wb, &pos, CheckPut2, CheckDel2);
    CheckCondition(pos == 5);
    leveldb_writebatch_destroy(wb);
  }

  // reminder: keymetadata not supported on backward iteration
  StartPhase("iter");
  {
    leveldb::KeyMetaData meta;
    leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
    CheckCondition(!leveldb_iter_valid(iter));
    leveldb_iter_seek_to_first(iter);
    CheckCondition(leveldb_iter_valid(iter));
    CheckIter(iter, "box", "c");
    meta.m_Type=leveldb::kTypeValue;
    meta.m_Expiry=0;
    CheckIter2(iter, "box", "c", meta);

    meta.m_Type=leveldb::kTypeValueExplicitExpiry;
    meta.m_Expiry=22446688;
    leveldb_iter_next(iter);
    CheckIter2(iter, "dad", "poland", meta);
    leveldb_iter_next(iter);
    CheckIter(iter, "foo", "hello");
    leveldb_iter_prev(iter);
    CheckIter(iter, "dad", "poland");
    leveldb_iter_prev(iter);
    CheckIter(iter, "box", "c");
    leveldb_iter_prev(iter);
    CheckCondition(!leveldb_iter_valid(iter));
    leveldb_iter_seek_to_last(iter);
    CheckIter(iter, "foo", "hello");
    CheckIter(iter, "mom", "texas");
    leveldb_iter_seek(iter, "b", 1);
    CheckIter(iter, "box", "c");
    leveldb_iter_get_error(iter, &err);
    CheckNoError(err);

    meta.m_Type=leveldb::kTypeValue;
    meta.m_Expiry=0;
    CheckIter2(iter, "box", "c", meta);
    leveldb_iter_seek(iter, "m", 1);
    meta.m_Type=leveldb::kTypeValueWriteTime;
    meta.m_Expiry=0;
    CheckIter2(iter, "mom", "texas", meta);
    leveldb_iter_get_error(iter, &err);
    CheckNoError(err);

    leveldb_iter_destroy(iter);
  }

|
@ -335,6 +480,70 @@ int main(int argc, char** argv) {
|
|||
leveldb_options_set_error_if_exists(options, 1);
|
||||
}
|
||||
|
||||
StartPhase("put expiry");
|
||||
{
|
||||
leveldb_keymetadata_t meta;
|
||||
int loop, count;
|
||||
|
||||
count = sizeof(gCheckPut2Data) / sizeof(gCheckPut2Data[0]);
|
||||
|
||||
for (loop=0; loop<count; ++loop)
|
||||
{
|
||||
size_t klen, vlen;
|
||||
leveldb_keymetadata_t meta;
|
||||
struct CheckPut2Data * test;
|
||||
|
||||
test=&gCheckPut2Data[loop];
|
||||
klen=strlen(test->m_Key);
|
||||
vlen=strlen(test->m_Value);
|
||||
meta.rep.m_Type=test->m_Type;
|
||||
meta.rep.m_Expiry=test->m_Expiry;
|
||||
|
||||
leveldb_put2(db, woptions, test->m_Key, klen,
|
||||
test->m_Value, vlen, &err,
|
||||
&meta);
|
||||
CheckNoError(err);
|
||||
} // for
|
||||
|
||||
// testing memtable right now
|
||||
for (loop=0; loop<count; ++loop)
|
||||
{
|
||||
size_t klen, vlen;
|
||||
leveldb_keymetadata_t meta;
|
||||
struct CheckPut2Data * test;
|
||||
|
||||
test=&gCheckPut2Data[loop];
|
||||
klen=strlen(test->m_Key);
|
||||
vlen=strlen(test->m_Value);
|
||||
|
||||
CheckGet2(db, roptions, test->m_Key, test->m_Value,
|
||||
test->m_Type, test->m_Expiry);
|
||||
} // for
|
||||
|
||||
// close and open to force memory table into .sst upon open
|
||||
leveldb_close(db);
|
||||
leveldb_options_set_error_if_exists(options, 0);
|
||||
db = leveldb_open(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
|
||||
// now testing get from a level-0 .sst file
|
||||
for (loop=0; loop<count; ++loop)
|
||||
{
|
||||
size_t klen, vlen;
|
||||
leveldb_keymetadata_t meta;
|
||||
struct CheckPut2Data * test;
|
||||
|
||||
test=&gCheckPut2Data[loop];
|
||||
klen=strlen(test->m_Key);
|
||||
vlen=strlen(test->m_Value);
|
||||
|
||||
CheckGet2(db, roptions, test->m_Key, test->m_Value,
|
||||
test->m_Type, test->m_Expiry);
|
||||
} // for
|
||||
}
|
||||
|
||||
//
|
||||
// This screws up "options" for real database work. execute last.
|
||||
StartPhase("filter");
|
||||
for (run = 0; run < 2; run++) {
|
||||
// First run uses custom filter, second run uses bloom filter
|
||||
|
@ -376,6 +585,8 @@ int main(int argc, char** argv) {
    leveldb_filterpolicy_destroy(policy);
  }

  StartPhase("cleanup");
  leveldb_close(db);
  leveldb_options_destroy(options);
@ -386,5 +597,7 @@ int main(int argc, char** argv) {
  leveldb_env_destroy(env);

  fprintf(stderr, "PASS\n");

  leveldb_env_shutdown();
  return 0;
}

@ -35,8 +35,8 @@ class CorruptionTest {
  CorruptionTest() {
    tiny_cache_ = NewLRUCache(100);
    options_.env = &env_;
    options_.block_cache = tiny_cache_;
    dbname_ = test::TmpDir() + "/corruption_test";
    dbname_ = test::TmpDir() + "/db_test";
    dbname_ = MakeTieredDbname(dbname_, options_);
    DestroyDB(dbname_, options_);

    db_ = NULL;
@ -51,14 +51,17 @@ class CorruptionTest {
    delete tiny_cache_;
  }

  Status TryReopen() {
  Status TryReopen(Options* options = NULL) {
    delete db_;
    db_ = NULL;
    return DB::Open(options_, dbname_, &db_);
    Options opt = (options ? *options : options_);
    opt.env = &env_;
    opt.block_cache = tiny_cache_;
    return DB::Open(opt, dbname_, &db_);
  }

  void Reopen() {
    ASSERT_OK(TryReopen());
  void Reopen(Options* options = NULL) {
    ASSERT_OK(TryReopen(options));
  }

  void RepairDB() {
@ -75,13 +78,7 @@ class CorruptionTest {
      Slice key = Key(i, &key_space);
      batch.Clear();
      batch.Put(key, Value(i, &value_space));
      WriteOptions options;
      // Corrupt() doesn't work without this sync on windows; stat reports 0 for
      // the file size.
      if (i == n - 1) {
        options.sync = true;
      }
      ASSERT_OK(db_->Write(options, &batch));
      ASSERT_OK(db_->Write(WriteOptions(), &batch));
    }
  }

@ -96,10 +93,6 @@ class CorruptionTest {
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      uint64_t key;
      Slice in(iter->key());
      if (in == "" || in == "~") {
        // Ignore boundary keys.
        continue;
      }
      if (!ConsumeDecimalNumber(&in, &key) ||
          !in.empty() ||
          key < next_expected) {
@ -123,19 +116,26 @@ class CorruptionTest {
    ASSERT_GE(max_expected, correct);
  }

  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt, int level=0) {
    // Pick file to corrupt
    std::vector<std::string> filenames;
    ASSERT_OK(env_.GetChildren(dbname_, &filenames));
    std::string dirname;
    if (leveldb::kTableFile!=filetype)
      dirname=dbname_;
    else
      dirname=MakeDirName2(options_, level, "sst");

    ASSERT_OK(env_.GetChildren(dirname, &filenames));

    uint64_t number;
    FileType type;
    std::string fname;
    int picked_number = -1;
    for (size_t i = 0; i < filenames.size(); i++) {
    for (int i = 0; i < filenames.size(); i++) {
      if (ParseFileName(filenames[i], &number, &type) &&
          type == filetype &&
          int(number) > picked_number) {  // Pick latest file
        fname = dbname_ + "/" + filenames[i];
        fname = dirname + "/" + filenames[i];
        picked_number = number;
      }
    }
@ -222,12 +222,14 @@ TEST(CorruptionTest, NewFileErrorDuringWrite) {
  const int num = 3 + (Options().write_buffer_size / kValueSize);
  std::string value_storage;
  Status s;
  for (int i = 0; s.ok() && i < num; i++) {
  for (int i = 0;
       s.ok() && i < num && 0==env_.num_writable_file_errors_;
       i++) {
    WriteBatch batch;
    batch.Put("a", Value(100, &value_storage));
    s = db_->Write(WriteOptions(), &batch);
  }
  ASSERT_TRUE(!s.ok());
  // ASSERT_TRUE(!s.ok());  Background write thread will never report this
  ASSERT_GE(env_.num_writable_file_errors_, 1);
  env_.writable_file_error_ = false;
  Reopen();
@ -240,34 +242,18 @@ TEST(CorruptionTest, TableFile) {
  dbi->TEST_CompactRange(0, NULL, NULL);
  dbi->TEST_CompactRange(1, NULL, NULL);

  Corrupt(kTableFile, 100, 1);
  Check(90, 99);
}

TEST(CorruptionTest, TableFileRepair) {
  options_.block_size = 2 * kValueSize;  // Limit scope of corruption
  options_.paranoid_checks = true;
  Reopen();
  Build(100);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_CompactMemTable();
  dbi->TEST_CompactRange(0, NULL, NULL);
  dbi->TEST_CompactRange(1, NULL, NULL);

  Corrupt(kTableFile, 100, 1);
  RepairDB();
  Reopen();
  Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);
  Check(95, 99);
}

TEST(CorruptionTest, TableFileIndexData) {
  Build(10000);  // Enough to build multiple Tables
  Build(100000);  // Enough to build multiple Tables
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_CompactMemTable();

  Corrupt(kTableFile, -2000, 500);
  Corrupt(kTableFile, -2000, 500, config::kMaxMemCompactLevel);
  Reopen();
  Check(5000, 9999);
  Check(50000, 99999);
}

TEST(CorruptionTest, MissingDescriptor) {
@ -319,10 +305,10 @@ TEST(CorruptionTest, CompactionInputError) {
  Build(10);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_CompactMemTable();
  const int last = config::kMaxMemCompactLevel;
  const int last = config::kMaxMemCompactLevel;  // Riak does not "move" files
  ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));

  Corrupt(kTableFile, 100, 1);
  Corrupt(kTableFile, 100, 1, last);
  Check(5, 9);

  // Force compactions by writing lots of values
@ -331,23 +317,42 @@ TEST(CorruptionTest, CompactionInputError) {
}

TEST(CorruptionTest, CompactionInputErrorParanoid) {
  options_.paranoid_checks = true;
  options_.write_buffer_size = 512 << 10;
  Reopen();
  Options options;
  options.paranoid_checks = true;
  options.write_buffer_size = 1048576;
  Reopen(&options);

  int current_corruption=Property("leveldb.ReadBlockError");
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);

  // Make multiple inputs so we need to compact.
  for (int i = 0; i < 2; i++) {
  // Fill levels >= 1 so memtable compaction outputs to level 1
  // matthewv 1/10/14 - what does "levels" have to do with this,
  //  switching to compaction trigger.
  // 7/10/14 - compaction starts between 4 and 6 files ... assume 4 and 1 move
  //  (will make a new, descriptive constant for 4)
  for (int level = Property("leveldb.num-files-at-level0")+1;
       level < config::kL0_GroomingTrigger; level++) {
    dbi->Put(WriteOptions(), "", "begin");
    dbi->Put(WriteOptions(), "~", "end");
    dbi->TEST_CompactMemTable();
  }

  Build(10);
  dbi->TEST_CompactMemTable();
  Corrupt(kTableFile, 100, 1);
  env_.SleepForMicroseconds(100000);
  }
  dbi->CompactRange(NULL, NULL);
  ASSERT_TRUE(1 < Property("leveldb.num-files-at-level0"));

  // Write must fail because of corrupted table
  Corrupt(kTableFile, 100, 1, 0);
  Check(5, 9);

  // Write must eventually fail because of corrupted table
  Status s;
  std::string tmp1, tmp2;
  Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2));
  for (int i = 0; i < 10000 && s.ok(); i++) {
    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
  }
  if (s.ok())
    ASSERT_NE(current_corruption, Property("leveldb.ReadBlockError")) << "no ReadBlockError seen";
  else
    ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
}

@ -355,7 +360,7 @@ TEST(CorruptionTest, UnrelatedKeys) {
  Build(10);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_CompactMemTable();
  Corrupt(kTableFile, 100, 1);
  Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);

  std::string tmp1, tmp2;
  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));

|
|
@ -33,7 +33,6 @@
|
|||
// readmissing -- read N missing keys in random order
|
||||
// readhot -- read N times in random order from 1% section of DB
|
||||
// seekrandom -- N random seeks
|
||||
// open -- cost of opening a DB
|
||||
// crc32c -- repeated crc32c of 4K of data
|
||||
// acquireload -- load N*1000 times
|
||||
// Meta operations:
|
||||
|
@ -84,14 +83,6 @@ static bool FLAGS_histogram = false;
// (initialized to default value by "main")
static int FLAGS_write_buffer_size = 0;

// Number of bytes written to each file.
// (initialized to default value by "main")
static int FLAGS_max_file_size = 0;

// Approximate size of user data packed per block (before compression.
// (initialized to default value by "main")
static int FLAGS_block_size = 0;

// Number of bytes to use as a cache of uncompressed data.
// Negative means use default settings.
static int FLAGS_cache_size = -1;
@ -103,21 +94,26 @@ static int FLAGS_open_files = 0;
// Negative means use default settings.
static int FLAGS_bloom_bits = -1;

// Riak bloom adaptation
static int FLAGS_bloom2_bits = -1;

// Riak param for total memory allocation (flex_cache)
static uint64_t FLAGS_leveldb_memory = -1;

// Riak param for compression setting
static int FLAGS_compression = 2;

// If true, do not destroy the existing database. If you set this
// flag and also specify a benchmark that wants a fresh database, that
// benchmark will fail.
static bool FLAGS_use_existing_db = false;

// If true, reuse existing log/MANIFEST files when re-opening a database.
static bool FLAGS_reuse_logs = false;

// Use the db with the following name.
static const char* FLAGS_db = NULL;

namespace leveldb {

namespace {
leveldb::Env* g_env = NULL;

// Helper for quickly generating random data.
class RandomGenerator {
@ -141,7 +137,7 @@ class RandomGenerator {
    pos_ = 0;
  }

  Slice Generate(size_t len) {
  Slice Generate(int len) {
    if (pos_ + len > data_.size()) {
      pos_ = 0;
      assert(len < data_.size());
@ -151,19 +147,17 @@ class RandomGenerator {
  }
};

#if defined(__linux)
static Slice TrimSpace(Slice s) {
  size_t start = 0;
  int start = 0;
  while (start < s.size() && isspace(s[start])) {
    start++;
  }
  size_t limit = s.size();
  int limit = s.size();
  while (limit > start && isspace(s[limit-1])) {
    limit--;
  }
  return Slice(s.data() + start, limit - start);
}
#endif

static void AppendWithSpace(std::string* str, Slice msg) {
  if (msg.empty()) return;
@ -195,7 +189,7 @@ class Stats {
    done_ = 0;
    bytes_ = 0;
    seconds_ = 0;
    start_ = g_env->NowMicros();
    start_ = Env::Default()->NowMicros();
    finish_ = start_;
    message_.clear();
  }
@@ -213,7 +207,7 @@ class Stats {
   }

   void Stop() {
-    finish_ = g_env->NowMicros();
+    finish_ = Env::Default()->NowMicros();
     seconds_ = (finish_ - start_) * 1e-6;
   }

@@ -223,7 +217,7 @@ class Stats {

   void FinishedSingleOp() {
     if (FLAGS_histogram) {
-      double now = g_env->NowMicros();
+      double now = Env::Default()->NowMicros();
       double micros = now - last_op_finish_;
       hist_.Add(micros);
       if (micros > 20000) {
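
Note: the repeated change above replaces the benchmark's cached `g_env` pointer with direct calls to `Env::Default()`, leveldb's process-wide environment singleton. The sketch below (editor's illustration, not part of the commit) shows the timing idiom the new code relies on; the function name is hypothetical.

    #include "leveldb/env.h"

    // Measure one operation in wall-clock microseconds, as db_bench's
    // Stats class does after this change.
    uint64_t TimeOperationMicros() {
      leveldb::Env* env = leveldb::Env::Default();  // process-wide singleton
      uint64_t start = env->NowMicros();
      // ... operation under test ...
      return env->NowMicros() - start;
    }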
@@ -405,7 +399,7 @@ class Benchmark {
   : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
     filter_policy_(FLAGS_bloom_bits >= 0
                    ? NewBloomFilterPolicy(FLAGS_bloom_bits)
-                   : NULL),
+                   : (FLAGS_bloom2_bits >=0 ? NewBloomFilterPolicy2(FLAGS_bloom2_bits) : NULL)),
     db_(NULL),
     num_(FLAGS_num),
     value_size_(FLAGS_value_size),
@@ -413,10 +407,10 @@ class Benchmark {
     reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
     heap_counter_(0) {
     std::vector<std::string> files;
-    g_env->GetChildren(FLAGS_db, &files);
-    for (size_t i = 0; i < files.size(); i++) {
+    Env::Default()->GetChildren(FLAGS_db, &files);
+    for (int i = 0; i < files.size(); i++) {
       if (Slice(files[i]).starts_with("heap-")) {
-        g_env->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
+        Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
       }
     }
     if (!FLAGS_use_existing_db) {
@@ -446,7 +440,7 @@ class Benchmark {
       benchmarks = sep + 1;
     }

-    // Reset parameters that may be overridden below
+    // Reset parameters that may be overriddden bwlow
     num_ = FLAGS_num;
     reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
     value_size_ = FLAGS_value_size;
@@ -457,11 +451,7 @@ class Benchmark {
     bool fresh_db = false;
     int num_threads = FLAGS_threads;

-    if (name == Slice("open")) {
-      method = &Benchmark::OpenBench;
-      num_ /= 10000;
-      if (num_ < 1) num_ = 1;
-    } else if (name == Slice("fillseq")) {
+    if (name == Slice("fillseq")) {
       fresh_db = true;
       method = &Benchmark::WriteSeq;
     } else if (name == Slice("fillbatch")) {
@@ -553,6 +543,7 @@ class Benchmark {
     SharedState* shared;
     ThreadState* thread;
     void (Benchmark::*method)(ThreadState*);
+    pthread_t thread_id;
   };

   static void ThreadBody(void* v) {
@@ -598,7 +589,8 @@ class Benchmark {
       arg[i].shared = &shared;
       arg[i].thread = new ThreadState(i);
       arg[i].thread->shared = &shared;
-      g_env->StartThread(ThreadBody, &arg[i]);
+      arg[i].thread_id=Env::Default()->StartThread(ThreadBody, &arg[i]);
+      pthread_detach(arg[i].thread_id);
     }

     shared.mu.Lock();
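
Note: upstream's `Env::StartThread` returns void; this hunk suggests the basho fork returns the `pthread_t` so callers can detach rather than leak joinable thread state. A minimal sketch of that pattern, assuming the return type shown in this diff (names here are illustrative):

    #include <pthread.h>
    #include "leveldb/env.h"

    struct Work { int id; };

    void Body(void* v) { /* ... benchmark thread body ... */ }

    void Launch(leveldb::Env* env, Work* w) {
      // assumption from this diff: StartThread returns pthread_t in this fork
      pthread_t tid = env->StartThread(Body, w);
      pthread_detach(tid);  // thread resources reclaimed on exit, no join needed
    }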
@@ -709,15 +701,12 @@ class Benchmark {
   void Open() {
     assert(db_ == NULL);
     Options options;
-    options.env = g_env;
     options.create_if_missing = !FLAGS_use_existing_db;
     options.block_cache = cache_;
     options.write_buffer_size = FLAGS_write_buffer_size;
-    options.max_file_size = FLAGS_max_file_size;
-    options.block_size = FLAGS_block_size;
     options.max_open_files = FLAGS_open_files;
     options.filter_policy = filter_policy_;
-    options.reuse_logs = FLAGS_reuse_logs;
+    options.compression = (leveldb::CompressionType)FLAGS_compression;
+    options.total_leveldb_mem = FLAGS_leveldb_memory;
     Status s = DB::Open(options, FLAGS_db, &db_);
     if (!s.ok()) {
       fprintf(stderr, "open error: %s\n", s.ToString().c_str());
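
Note: a sketch of opening a database with the fork-specific option fields visible in this hunk (`total_leveldb_mem`, an integer-coded `compression`). This is an editor's illustration based only on the assignments above; the path and the byte budget are placeholders.

    #include <stdio.h>
    #include "leveldb/db.h"

    void OpenExample() {
      leveldb::Options options;
      options.create_if_missing = true;
      // per this diff: compression is set by casting an integer flag
      options.compression = (leveldb::CompressionType)2;
      // per this diff: one total memory budget (bytes) for the "flex_cache"
      options.total_leveldb_mem = 2ULL * 1024 * 1024 * 1024;
      leveldb::DB* db = NULL;
      leveldb::Status s = leveldb::DB::Open(options, "/tmp/dbbench", &db);
      if (!s.ok()) fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      delete db;
    }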
@@ -725,14 +714,6 @@ class Benchmark {
     }
   }

-  void OpenBench(ThreadState* thread) {
-    for (int i = 0; i < num_; i++) {
-      delete db_;
-      Open();
-      thread->stats.FinishedSingleOp();
-    }
-  }
-
   void WriteSeq(ThreadState* thread) {
     DoWrite(thread, true);
   }
@@ -842,6 +823,7 @@ class Benchmark {

   void SeekRandom(ThreadState* thread) {
     ReadOptions options;
+    std::string value;
     int found = 0;
     for (int i = 0; i < reads_; i++) {
       Iterator* iter = db_->NewIterator(options);
@@ -937,7 +919,7 @@ class Benchmark {
     char fname[100];
     snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_);
     WritableFile* file;
-    Status s = g_env->NewWritableFile(fname, &file);
+    Status s = Env::Default()->NewWritableFile(fname, &file, 2<<20);
     if (!s.ok()) {
       fprintf(stderr, "%s\n", s.ToString().c_str());
       return;
@@ -946,7 +928,7 @@ class Benchmark {
     delete file;
     if (!ok) {
       fprintf(stderr, "heap profiling not supported\n");
-      g_env->DeleteFile(fname);
+      Env::Default()->DeleteFile(fname);
     }
   }
 };
@@ -955,14 +937,14 @@ class Benchmark {

 int main(int argc, char** argv) {
   FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
-  FLAGS_max_file_size = leveldb::Options().max_file_size;
-  FLAGS_block_size = leveldb::Options().block_size;
   FLAGS_open_files = leveldb::Options().max_open_files;
+  FLAGS_leveldb_memory = 25000000000LL;
   std::string default_db_path;

   for (int i = 1; i < argc; i++) {
     double d;
     int n;
+    uint64_t u;
     char junk;
     if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
       FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
@@ -974,9 +956,6 @@ int main(int argc, char** argv) {
   } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
              (n == 0 || n == 1)) {
     FLAGS_use_existing_db = n;
-  } else if (sscanf(argv[i], "--reuse_logs=%d%c", &n, &junk) == 1 &&
-             (n == 0 || n == 1)) {
-    FLAGS_reuse_logs = n;
   } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
     FLAGS_num = n;
   } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
@@ -987,16 +966,18 @@ int main(int argc, char** argv) {
     FLAGS_value_size = n;
   } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
     FLAGS_write_buffer_size = n;
-  } else if (sscanf(argv[i], "--max_file_size=%d%c", &n, &junk) == 1) {
-    FLAGS_max_file_size = n;
-  } else if (sscanf(argv[i], "--block_size=%d%c", &n, &junk) == 1) {
-    FLAGS_block_size = n;
   } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
     FLAGS_cache_size = n;
   } else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
     FLAGS_bloom_bits = n;
+  } else if (sscanf(argv[i], "--bloom_bits2=%d%c", &n, &junk) == 1) {
+    FLAGS_bloom2_bits = n;
+  } else if (sscanf(argv[i], "--leveldb_memory=%d%c", &n, &junk) == 1) {
+    FLAGS_leveldb_memory = n * 1024 * 1024LL;
   } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
     FLAGS_open_files = n;
+  } else if (sscanf(argv[i], "--compression=%d%c", &n, &junk) == 1) {
+    FLAGS_compression = n;
   } else if (strncmp(argv[i], "--db=", 5) == 0) {
     FLAGS_db = argv[i] + 5;
   } else {
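
Note: with the parsing above, a run of the benchmark might look like `./db_bench --benchmarks=fillrandom,readrandom --bloom_bits2=16 --leveldb_memory=2048 --compression=2`. Observe that `--leveldb_memory` is given in megabytes (the parser multiplies `n` by `1024 * 1024LL`), while the `FLAGS_leveldb_memory` default set in `main` is a raw byte count; the flag names are exactly those registered in this hunk, everything else in the example is illustrative.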
@@ -1005,16 +986,20 @@ int main(int argc, char** argv) {
     }
   }

-  leveldb::g_env = leveldb::Env::Default();
-
   // Choose a location for the test database if none given with --db=<path>
   if (FLAGS_db == NULL) {
-    leveldb::g_env->GetTestDirectory(&default_db_path);
+    leveldb::Env::Default()->GetTestDirectory(&default_db_path);
     default_db_path += "/dbbench";
     FLAGS_db = default_db_path.c_str();
   }

+  // benchmark class needs to destruct before Shutdown call
+  {
     leveldb::Benchmark benchmark;
     benchmark.Run();
+  }
+
+  leveldb::Env::Shutdown();
+
   return 0;
 }
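
Note: the extra scope block above matters because `Benchmark` owns an open `DB`, a cache, and a filter policy, and closing a DB still uses the environment's background machinery. A generic sketch of the ordering constraint this diff introduces (editor's illustration; `Env::Shutdown` is the fork-specific global teardown seen above):

    {
      leveldb::DB* db = NULL;
      leveldb::Options opts;
      opts.create_if_missing = true;
      leveldb::Status s = leveldb::DB::Open(opts, "/tmp/demo", &db);
      if (s.ok()) delete db;   // close while the Env is still alive
    }
    leveldb::Env::Shutdown();  // only after every DB object is destroyed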
[File diff suppressed because it is too large]
@@ -13,7 +13,7 @@
 #include "leveldb/db.h"
 #include "leveldb/env.h"
 #include "port/port.h"
-#include "port/thread_annotations.h"
+#include "util/cache2.h"

 namespace leveldb {
@@ -29,26 +29,37 @@ class DBImpl : public DB {
   virtual ~DBImpl();

   // Implementations of the DB interface
-  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
+  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value, const KeyMetaData * meta=NULL);
   virtual Status Delete(const WriteOptions&, const Slice& key);
   virtual Status Write(const WriteOptions& options, WriteBatch* updates);
   virtual Status Get(const ReadOptions& options,
                      const Slice& key,
-                     std::string* value);
+                     std::string* value,
+                     KeyMetaData * meta=NULL);
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     Value* value,
+                     KeyMetaData * meta=NULL);
   virtual Iterator* NewIterator(const ReadOptions&);
   virtual const Snapshot* GetSnapshot();
   virtual void ReleaseSnapshot(const Snapshot* snapshot);
   virtual bool GetProperty(const Slice& property, std::string* value);
   virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
   virtual void CompactRange(const Slice* begin, const Slice* end);
+  virtual Status VerifyLevels();
+  virtual void CheckAvailableCompactions();
+  virtual Logger* GetLogger() const { return options_.info_log; }

   // Extra methods (for testing) that are not in the public DB interface

+  const Options & GetOptions() const { return options_; };
+
   // Compact any files in the named level that overlap [*begin,*end]
   void TEST_CompactRange(int level, const Slice* begin, const Slice* end);

-  // Force current memtable contents to be compacted.
-  Status TEST_CompactMemTable();
+  // Force current memtable contents to be compacted, waits for completion
+  Status CompactMemTableSynchronous();
+  Status TEST_CompactMemTable();   // wraps CompactMemTableSynchronous (historical)

   // Return an internal iterator over the current state of the database.
   // The keys of this iterator are internal keys (see format.h).
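
Note: the fork-widened `Get`/`Put` signatures above accept an optional `KeyMetaData` out-parameter. A hedged usage sketch, assuming from this diff only that `KeyMetaData` exposes `m_Type`, `m_Sequence`, and `m_Expiry` (seen later in the iterator code):

    #include "leveldb/db.h"

    void GetWithMeta(leveldb::DB* db, const leveldb::Slice& key) {
      std::string value;
      leveldb::KeyMetaData meta;
      leveldb::Status s = db->Get(leveldb::ReadOptions(), key, &value, &meta);
      if (s.ok()) {
        // meta.m_Type distinguishes plain values from expiry-stamped ones;
        // meta.m_Expiry carries the stamp for explicit-expiry keys.
      }
    }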
@@ -59,64 +70,82 @@ class DBImpl : public DB {
   // file at a level >= 1.
   int64_t TEST_MaxNextLevelOverlappingBytes();

-  // Record a sample of bytes read at the specified internal key.
-  // Samples are taken approximately once every config::kReadBytesPeriod
-  // bytes.
-  void RecordReadSample(Slice key);
+  // These are routines that DBListImpl calls across all open databases
+  void ResizeCaches() {double_cache.ResizeCaches();};
+  size_t GetCacheCapacity() {return(double_cache.GetCapacity(false));}
+  void PurgeExpiredFileCache() {double_cache.PurgeExpiredFiles();};

- private:
+ private:
+  // in util/hot_backup.cc
+  void HotBackup();
+  bool PurgeWriteBuffer();
+  bool WriteBackupManifest();
+  bool CreateBackupLinks(Version * Version, Options & BackupOptions);
+  bool CopyLOGSegment(long FileEnd);
+  void HotBackupComplete();
+
+  void BackgroundCall2(Compaction * Compact);
+  void BackgroundImmCompactCall();
+  bool IsCompactionScheduled();
+  uint32_t RunningCompactionCount() {mutex_.AssertHeld(); return(running_compactions_);};
+
+ protected:
   friend class DB;
   struct CompactionState;
   struct Writer;

   Iterator* NewInternalIterator(const ReadOptions&,
-                                SequenceNumber* latest_snapshot,
-                                uint32_t* seed);
+                                SequenceNumber* latest_snapshot);

   Status NewDB();

   // Recover the descriptor from persistent storage.  May do a significant
   // amount of work to recover recently logged updates.  Any changes to
   // be made to the descriptor are added to *edit.
-  Status Recover(VersionEdit* edit, bool* save_manifest)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status Recover(VersionEdit* edit);
+
+  // Riak routine:  pause DB::Open if too many compactions
+  //  stacked up immediately.  Happens in some repairs and
+  //  some Riak upgrades
+  void CheckCompactionState();

   void MaybeIgnoreError(Status* s) const;

   // Delete any unneeded files and stale in-memory entries.
   void DeleteObsoleteFiles();
+  void KeepOrDelete(const std::string & Filename, int level, const std::set<uint64_t> & Live);

   // Compact the in-memory write buffer to disk.  Switches to a new
   // log-file/memtable and writes a new descriptor iff successful.
   // Errors are recorded in bg_error_.
-  void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status CompactMemTable();

-  Status RecoverLogFile(uint64_t log_number, bool last_log, bool* save_manifest,
-                        VersionEdit* edit, SequenceNumber* max_sequence)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status RecoverLogFile(uint64_t log_number,
+                        VersionEdit* edit,
+                        SequenceNumber* max_sequence);

-  Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status WriteLevel0Table(volatile MemTable* mem, VersionEdit* edit, Version* base);

-  Status MakeRoomForWrite(bool force /* compact even if there is room? */)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status MakeRoomForWrite(bool force /* TRUE forces memtable rotation to disk (for testing) */);
+  Status NewRecoveryLog(uint64_t NewLogNumber);
+
   WriteBatch* BuildBatchGroup(Writer** last_writer);

-  void RecordBackgroundError(const Status& s);
-  void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  void MaybeScheduleCompaction();
   static void BGWork(void* db);
   void BackgroundCall();
-  void BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-  void CleanupCompaction(CompactionState* compact)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-  Status DoCompactionWork(CompactionState* compact)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status BackgroundCompaction(Compaction * Compact=NULL);
+  Status BackgroundExpiry(Compaction * Compact=NULL);

-  Status OpenCompactionOutputFile(CompactionState* compact);
+  void CleanupCompaction(CompactionState* compact);
+  Status DoCompactionWork(CompactionState* compact);
+  int64_t PrioritizeWork(bool IsLevel0);
+
+  Status OpenCompactionOutputFile(CompactionState* compact, size_t sample_value_size);
+  bool Send2PageCache(CompactionState * compact);
+  size_t MaybeRaiseBlockSize(Compaction & CompactionStuff, size_t SampleValueSize);
   Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
-  Status InstallCompactionResults(CompactionState* compact)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status InstallCompactionResults(CompactionState* compact);
+
+  // initialized before options so its block_cache is available
+  class DoubleCache double_cache;

   // Constant after construction
   Env* const env_;
@@ -130,20 +159,22 @@ class DBImpl : public DB {
   // table_cache_ provides its own synchronization
   TableCache* table_cache_;

   // Lock over the persistent DB state.  Non-NULL iff successfully acquired.
   FileLock* db_lock_;

   // State below is protected by mutex_
   port::Mutex mutex_;
+  port::Mutex throttle_mutex_;   // used by write throttle to force sequential waits on callers
   port::AtomicPointer shutting_down_;
   port::CondVar bg_cv_;          // Signalled when background work finishes
   MemTable* mem_;
-  MemTable* imm_;                // Memtable being compacted
+  volatile MemTable* imm_;       // Memtable being compacted
   port::AtomicPointer has_imm_;  // So bg thread can detect non-NULL imm_
   WritableFile* logfile_;
   uint64_t logfile_number_;
   log::Writer* log_;
-  uint32_t seed_;                // For sampling.

   // Queue of writers.
   std::deque<Writer*> writers_;
@@ -155,9 +186,6 @@ class DBImpl : public DB {
   // part of ongoing compactions.
   std::set<uint64_t> pending_outputs_;

-  // Has a background compaction been scheduled or is running?
-  bool bg_compaction_scheduled_;
-
   // Information for a manual compaction
   struct ManualCompaction {
     int level;
@@ -166,7 +194,7 @@ class DBImpl : public DB {
     const InternalKey* end;   // NULL means end of key range
     InternalKey tmp_storage;  // Used to keep track of compaction progress
   };
-  ManualCompaction* manual_compaction_;
+  volatile ManualCompaction* manual_compaction_;

   VersionSet* versions_;

@@ -190,6 +218,18 @@ class DBImpl : public DB {
   };
   CompactionStats stats_[config::kNumLevels];

+  volatile uint64_t throttle_end;
+  volatile uint32_t running_compactions_;
+  volatile size_t current_block_size_;    // last dynamic block size computed
+  volatile uint64_t block_size_changed_;  // NowMicros() when block size computed
+  volatile uint64_t last_low_mem_;        // NowMicros() when low memory last seen
+
+  // accessor to new, dynamic block_cache
+  Cache * block_cache() {return(double_cache.GetBlockCache());};
+  Cache * file_cache() {return(double_cache.GetFileCache());};
+
+  volatile bool hotbackup_pending_;
+
   // No copying allowed
   DBImpl(const DBImpl&);
   void operator=(const DBImpl&);
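
Note: the `double_cache` member and the two accessors above route both of leveldb's caches (uncompressed data blocks, and open table files) through one object, so a single memory budget can be rebalanced between them. An editor's sketch of the shape of that idea; the class name and accessors come from this diff, the implementation is assumed:

    // Minimal sketch: one owner, two caches, one budget.
    class DoubleCacheSketch {
     public:
      leveldb::Cache* GetBlockCache() { return block_cache_; }  // data blocks
      leveldb::Cache* GetFileCache()  { return file_cache_; }   // table files
     private:
      leveldb::Cache* block_cache_;
      leveldb::Cache* file_cache_;
    };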
@@ -204,7 +244,8 @@ class DBImpl : public DB {
 extern Options SanitizeOptions(const std::string& db,
                                const InternalKeyComparator* icmp,
                                const InternalFilterPolicy* ipolicy,
-                               const Options& src);
+                               const Options& src,
+                               Cache * block_cache);

 }  // namespace leveldb

@@ -5,14 +5,14 @@
 #include "db/db_iter.h"

-#include "db/filename.h"
-#include "db/db_impl.h"
 #include "db/dbformat.h"
 #include "leveldb/env.h"
+#include "leveldb/expiry.h"
 #include "leveldb/iterator.h"
+#include "leveldb/perf_count.h"
 #include "port/port.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
-#include "util/random.h"

 namespace leveldb {

@@ -48,18 +48,20 @@ class DBIter: public Iterator {
     kReverse
   };

-  DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s,
-         uint32_t seed)
-      : db_(db),
+  DBIter(const std::string* dbname, Env* env,
+         const Comparator* cmp, Iterator* iter, SequenceNumber s,
+         const ExpiryModule * expiry)
+      : dbname_(dbname),
+        env_(env),
         user_comparator_(cmp),
         iter_(iter),
         sequence_(s),
         direction_(kForward),
         valid_(false),
-        rnd_(seed),
-        bytes_counter_(RandomPeriod()) {
+        expiry_(expiry) {
   }
   virtual ~DBIter() {
+    gPerfCounters->Inc(ePerfIterDelete);
     delete iter_;
   }
   virtual bool Valid() const { return valid_; }
@@ -71,6 +73,26 @@ class DBIter: public Iterator {
     assert(valid_);
     return (direction_ == kForward) ? iter_->value() : saved_value_;
   }
+  // Riak specific:  if a database iterator, returns key meta data
+  //  REQUIRES: Valid() and forward iteration
+  //  (reverse iteration is possible, just needs code)
+  virtual KeyMetaData & keymetadata() const
+  {
+    assert(valid_ && kForward==direction_);
+    if (kForward==direction_)
+    {
+      ParsedInternalKey parsed;
+      // this initialization clears a warning.  ParsedInternalKey says
+      //  it is not initializing for performance reasons ... oh well
+      parsed.type=kTypeValue; parsed.sequence=0; parsed.expiry=0;
+      ParseInternalKey(iter_->key(), &parsed);
+      keymetadata_.m_Type=parsed.type;
+      keymetadata_.m_Sequence=parsed.sequence;
+      keymetadata_.m_Expiry=parsed.expiry;
+    }
+    return(keymetadata_);
+  }

   virtual Status status() const {
     if (status_.ok()) {
       return iter_->status();
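
Note: a hedged usage sketch for the `keymetadata()` accessor added above. Per the assertion in the diff, it is only valid on a positioned iterator moving forward; the field names are the ones the diff itself populates.

    leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
      // forward iteration only, per the REQUIRES comment in the diff
      const leveldb::KeyMetaData& meta = it->keymetadata();
      // meta.m_Expiry is nonzero for expiry-stamped keys
      (void)meta;
    }
    delete it;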
@@ -103,12 +125,8 @@ class DBIter: public Iterator {
     }
   }

-  // Pick next gap with average value of config::kReadBytesPeriod.
-  ssize_t RandomPeriod() {
-    return rnd_.Uniform(2*config::kReadBytesPeriod);
-  }
-
-  DBImpl* db_;
+  const std::string* const dbname_;
+  Env* const env_;
   const Comparator* const user_comparator_;
   Iterator* const iter_;
   SequenceNumber const sequence_;
@@ -118,9 +136,7 @@ class DBIter: public Iterator {
   std::string saved_value_;   // == current raw value when direction_==kReverse
   Direction direction_;
   bool valid_;
-
-  Random rnd_;
-  ssize_t bytes_counter_;
+  const ExpiryModule * expiry_;

   // No copying allowed
   DBIter(const DBIter&);
@@ -128,14 +144,7 @@ class DBIter: public Iterator {
 };

 inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
-  Slice k = iter_->key();
-  ssize_t n = k.size() + iter_->value().size();
-  bytes_counter_ -= n;
-  while (bytes_counter_ < 0) {
-    bytes_counter_ += RandomPeriod();
-    db_->RecordReadSample(k);
-  }
-  if (!ParseInternalKey(k, ikey)) {
+  if (!ParseInternalKey(iter_->key(), ikey)) {
     status_ = Status::Corruption("corrupted internal key in DBIter");
     return false;
   } else {
@@ -146,6 +155,7 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {

 void DBIter::Next() {
   assert(valid_);
+  gPerfCounters->Inc(ePerfIterNext);

   if (direction_ == kReverse) {  // Switch directions?
     direction_ = kForward;
     // iter_ is pointing just before the entries for this->key(),
@@ -161,13 +171,12 @@ void DBIter::Next() {
       saved_key_.clear();
       return;
     }
-    // saved_key_ already contains the key to skip past.
-  } else {
-    // Store in saved_key_ the current key so we skip it below.
-    SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
   }

-  FindNextUserEntry(true, &saved_key_);
+  // Temporarily use saved_key_ as storage for key to skip.
+  std::string* skip = &saved_key_;
+  SaveKey(ExtractUserKey(iter_->key()), skip);
+  FindNextUserEntry(true, skip);
 }

 void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
|
|||
do {
|
||||
ParsedInternalKey ikey;
|
||||
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
|
||||
if (IsExpiryKey(ikey.type) && NULL!=expiry_
|
||||
&& expiry_->KeyRetirementCallback(ikey))
|
||||
ikey.type=kTypeDeletion;
|
||||
switch (ikey.type) {
|
||||
case kTypeDeletion:
|
||||
// Arrange to skip all upcoming entries for this key since
|
||||
|
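
Note: the hook above lets an expiry module veto a key during iteration by rewriting its type to a deletion. The sketch below is an editor's guess at a minimal module satisfying the callback contract visible here; the full `ExpiryModule` interface, its const-ness, and the base-class spelling are assumptions, only `KeyRetirementCallback(ParsedInternalKey)` returning true-to-retire is taken from the diff.

    // Hypothetical TTL module: retire explicit-expiry keys whose stamp
    // is in the past.  "now_" is captured at construction.
    class SimpleTTL : public leveldb::ExpiryModule {
     public:
      explicit SimpleTTL(uint64_t now_micros) : now_(now_micros) {}
      virtual bool KeyRetirementCallback(
          const leveldb::ParsedInternalKey& ikey) const {
        return ikey.expiry != 0 && ikey.expiry < now_;
      }
     private:
      uint64_t now_;
    };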
@@ -184,6 +196,9 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
           SaveKey(ikey.user_key, skip);
           skipping = true;
           break;
+
+        case kTypeValueWriteTime:
+        case kTypeValueExplicitExpiry:
         case kTypeValue:
           if (skipping &&
               user_comparator_->Compare(ikey.user_key, *skip) <= 0) {
@@ -205,6 +220,7 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {

 void DBIter::Prev() {
   assert(valid_);
+  gPerfCounters->Inc(ePerfIterPrev);

   if (direction_ == kForward) {  // Switch directions?
     // iter_ is pointing at the current entry.  Scan backwards until
     // the key changes so we can use the normal reverse scanning code.
@@ -242,6 +258,10 @@ void DBIter::FindPrevUserEntry() {
         // We encountered a non-deleted value in entries for previous keys,
         break;
       }
+      if (IsExpiryKey(ikey.type) && NULL!=expiry_
+          && expiry_->KeyRetirementCallback(ikey))
+        ikey.type=kTypeDeletion;
+
       value_type = ikey.type;
       if (value_type == kTypeDeletion) {
         saved_key_.clear();
@@ -272,11 +292,12 @@ void DBIter::FindPrevUserEntry() {
 }

 void DBIter::Seek(const Slice& target) {
+  gPerfCounters->Inc(ePerfIterSeek);
   direction_ = kForward;
   ClearSavedValue();
   saved_key_.clear();
   AppendInternalKey(
-      &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
+      &saved_key_, ParsedInternalKey(target, 0, sequence_, kValueTypeForSeek));
   iter_->Seek(saved_key_);
   if (iter_->Valid()) {
     FindNextUserEntry(false, &saved_key_ /* temporary storage */);
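
Note: the extra `0` argument above indicates this fork's internal keys carry an expiry slot in addition to upstream's (user_key, sequence, type) packing. A sketch of building a seek key under that assumption; the argument order is inferred from the single call site in this hunk.

    // Assumed fork constructor: ParsedInternalKey(user_key, expiry, sequence, type)
    leveldb::ParsedInternalKey ikey(target, /*expiry=*/0, sequence,
                                    leveldb::kValueTypeForSeek);
    std::string encoded;
    leveldb::AppendInternalKey(&encoded, ikey);  // encoded is a full internal key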
@@ -286,6 +307,7 @@ void DBIter::Seek(const Slice& target) {
 }

 void DBIter::SeekToFirst() {
+  gPerfCounters->Inc(ePerfIterSeekFirst);
   direction_ = kForward;
   ClearSavedValue();
   iter_->SeekToFirst();
@@ -297,6 +319,7 @@ void DBIter::SeekToFirst() {
 }

 void DBIter::SeekToLast() {
+  gPerfCounters->Inc(ePerfIterSeekLast);
   direction_ = kReverse;
   ClearSavedValue();
   iter_->SeekToLast();
@@ -306,12 +329,13 @@ void DBIter::SeekToLast() {
 }  // anonymous namespace

 Iterator* NewDBIterator(
-    DBImpl* db,
+    const std::string* dbname,
+    Env* env,
     const Comparator* user_key_comparator,
     Iterator* internal_iter,
-    SequenceNumber sequence,
-    uint32_t seed) {
-  return new DBIter(db, user_key_comparator, internal_iter, sequence, seed);
+    const SequenceNumber& sequence,
+    const ExpiryModule * expiry) {
+  return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence, expiry);
 }

 }  // namespace leveldb
|
@ -7,21 +7,21 @@
|
|||
|
||||
#include <stdint.h>
|
||||
#include "leveldb/db.h"
|
||||
#include "leveldb/expiry.h"
|
||||
#include "db/dbformat.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
class DBImpl;
|
||||
|
||||
// Return a new iterator that converts internal keys (yielded by
|
||||
// "*internal_iter") that were live at the specified "sequence" number
|
||||
// into appropriate user keys.
|
||||
extern Iterator* NewDBIterator(
|
||||
DBImpl* db,
|
||||
const std::string* dbname,
|
||||
Env* env,
|
||||
const Comparator* user_key_comparator,
|
||||
Iterator* internal_iter,
|
||||
SequenceNumber sequence,
|
||||
uint32_t seed);
|
||||
const SequenceNumber& sequence,
|
||||
const ExpiryModule * expiry=NULL);
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
|
|
|
@@ -33,11 +33,8 @@ class AtomicCounter {
  public:
   AtomicCounter() : count_(0) { }
   void Increment() {
-    IncrementBy(1);
-  }
-  void IncrementBy(int count) {
     MutexLock l(&mu_);
-    count_ += count;
+    count_++;
   }
   int Read() {
     MutexLock l(&mu_);
@@ -48,20 +45,13 @@ class AtomicCounter {
     count_ = 0;
   }
 };
-
-void DelayMilliseconds(int millis) {
-  Env::Default()->SleepForMicroseconds(millis * 1000);
-}
 }

 // Special Env used to delay background operations
 class SpecialEnv : public EnvWrapper {
  public:
-  // sstable/log Sync() calls are blocked while this pointer is non-NULL.
-  port::AtomicPointer delay_data_sync_;
-
-  // sstable/log Sync() calls return an error.
-  port::AtomicPointer data_sync_error_;
+  // sstable Sync() calls are blocked while this pointer is non-NULL.
+  port::AtomicPointer delay_sstable_sync_;

   // Simulate no-space errors while this pointer is non-NULL.
   port::AtomicPointer no_space_;
@@ -69,37 +59,30 @@ class SpecialEnv : public EnvWrapper {
   // Simulate non-writable file system while this pointer is non-NULL
   port::AtomicPointer non_writable_;

-  // Force sync of manifest files to fail while this pointer is non-NULL
-  port::AtomicPointer manifest_sync_error_;
-
-  // Force write to manifest files to fail while this pointer is non-NULL
-  port::AtomicPointer manifest_write_error_;
-
   bool count_random_reads_;
   AtomicCounter random_read_counter_;

+  AtomicCounter sleep_counter_;
+
   explicit SpecialEnv(Env* base) : EnvWrapper(base) {
-    delay_data_sync_.Release_Store(NULL);
-    data_sync_error_.Release_Store(NULL);
+    delay_sstable_sync_.Release_Store(NULL);
     no_space_.Release_Store(NULL);
     non_writable_.Release_Store(NULL);
     count_random_reads_ = false;
-    manifest_sync_error_.Release_Store(NULL);
-    manifest_write_error_.Release_Store(NULL);
   }

-  Status NewWritableFile(const std::string& f, WritableFile** r) {
-    class DataFile : public WritableFile {
+  Status NewWritableFile(const std::string& f, WritableFile** r, size_t map_size) {
+    class SSTableFile : public WritableFile {
      private:
       SpecialEnv* env_;
       WritableFile* base_;

      public:
-      DataFile(SpecialEnv* env, WritableFile* base)
+      SSTableFile(SpecialEnv* env, WritableFile* base)
           : env_(env),
             base_(base) {
       }
-      ~DataFile() { delete base_; }
+      ~SSTableFile() { delete base_; }
       Status Append(const Slice& data) {
         if (env_->no_space_.Acquire_Load() != NULL) {
           // Drop writes on the floor
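
Note: the fork's `Env::NewWritableFile` takes a third `map_size` parameter (the `2<<20` seen elsewhere in this diff, apparently a memory-map chunk size). Wrapping such an Env therefore means overriding the widened signature. A hedged sketch; the class is hypothetical, and only the signature shape comes from the diff.

    #include "leveldb/env.h"

    class CountingEnv : public leveldb::EnvWrapper {
     public:
      explicit CountingEnv(leveldb::Env* base)
          : leveldb::EnvWrapper(base), writes_(0) {}
      // forward the extra map_size argument to the wrapped Env
      virtual leveldb::Status NewWritableFile(const std::string& f,
                                              leveldb::WritableFile** r,
                                              size_t map_size) {
        ++writes_;  // count file creations, then delegate
        return target()->NewWritableFile(f, r, map_size);
      }
     private:
      int writes_;
    };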
@@ -111,51 +94,21 @@ class SpecialEnv : public EnvWrapper {
       Status Close() { return base_->Close(); }
       Status Flush() { return base_->Flush(); }
       Status Sync() {
-        if (env_->data_sync_error_.Acquire_Load() != NULL) {
-          return Status::IOError("simulated data sync error");
-        }
-        while (env_->delay_data_sync_.Acquire_Load() != NULL) {
-          DelayMilliseconds(100);
+        while (env_->delay_sstable_sync_.Acquire_Load() != NULL) {
+          env_->SleepForMicroseconds(100000);
         }
         return base_->Sync();
       }
     };
-    class ManifestFile : public WritableFile {
-     private:
-      SpecialEnv* env_;
-      WritableFile* base_;
-     public:
-      ManifestFile(SpecialEnv* env, WritableFile* b) : env_(env), base_(b) { }
-      ~ManifestFile() { delete base_; }
-      Status Append(const Slice& data) {
-        if (env_->manifest_write_error_.Acquire_Load() != NULL) {
-          return Status::IOError("simulated writer error");
-        } else {
-          return base_->Append(data);
-        }
-      }
-      Status Close() { return base_->Close(); }
-      Status Flush() { return base_->Flush(); }
-      Status Sync() {
-        if (env_->manifest_sync_error_.Acquire_Load() != NULL) {
-          return Status::IOError("simulated sync error");
-        } else {
-          return base_->Sync();
-        }
-      }
-    };
-
     if (non_writable_.Acquire_Load() != NULL) {
       return Status::IOError("simulated write error");
     }

-    Status s = target()->NewWritableFile(f, r);
+    Status s = target()->NewWritableFile(f, r, 2<<20);
     if (s.ok()) {
-      if (strstr(f.c_str(), ".ldb") != NULL ||
-          strstr(f.c_str(), ".log") != NULL) {
-        *r = new DataFile(this, *r);
-      } else if (strstr(f.c_str(), "MANIFEST") != NULL) {
-        *r = new ManifestFile(this, *r);
+      if (strstr(f.c_str(), ".sst") != NULL) {
+        *r = new SSTableFile(this, *r);
       }
     }
     return s;
@@ -184,6 +137,11 @@ class SpecialEnv : public EnvWrapper {
     }
     return s;
   }
+
+  virtual void SleepForMicroseconds(int micros) {
+    sleep_counter_.Increment();
+    target()->SleepForMicroseconds(micros);
+  }
 };

 class DBTest {
@@ -193,7 +151,6 @@ class DBTest {
   // Sequence of option configurations to try
   enum OptionConfig {
     kDefault,
-    kReuse,
     kFilter,
     kUncompressed,
     kEnd
@@ -209,7 +166,7 @@ class DBTest {

   DBTest() : option_config_(kDefault),
              env_(new SpecialEnv(Env::Default())) {
-    filter_policy_ = NewBloomFilterPolicy(10);
+    filter_policy_ = NewBloomFilterPolicy2(16);
     dbname_ = test::TmpDir() + "/db_test";
     DestroyDB(dbname_, Options());
     db_ = NULL;
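
Note: `NewBloomFilterPolicy2` is the fork's alternate bloom filter (seen in this diff at 16 bits per key versus upstream's usual 10). Ownership follows the same rule as the stock policy: the caller deletes it after the DB is closed. A minimal hedged sketch:

    const leveldb::FilterPolicy* fp = leveldb::NewBloomFilterPolicy2(16);
    leveldb::Options opts;
    opts.filter_policy = fp;   // DB does not take ownership
    // ... open the DB with opts, use it, close it ...
    delete fp;                 // only after the DB is closed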
@@ -238,11 +195,7 @@ class DBTest {
   // Return the current option configuration.
   Options CurrentOptions() {
     Options options;
-    options.reuse_logs = false;
     switch (option_config_) {
-      case kReuse:
-        options.reuse_logs = true;
-        break;
       case kFilter:
         options.filter_policy = filter_policy_;
         break;
@@ -290,6 +243,23 @@ class DBTest {
     return DB::Open(opts, dbname_, &db_);
   }

+  Status DoubleOpen(Options* options = NULL) {
+    DB * db_fail;
+    delete db_;
+    db_ = NULL;
+    Options opts, opts2;
+    if (options != NULL) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    DB::Open(opts, dbname_, &db_);
+    return DB::Open(opts2, dbname_, &db_fail);
+  }
+
   Status Put(const std::string& k, const std::string& v) {
     return db_->Put(WriteOptions(), k, v);
   }
@@ -311,6 +281,20 @@ class DBTest {
     return result;
   }

+  std::string GetNoCache(const std::string& k, const Snapshot* snapshot = NULL) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    options.fill_cache=false;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
   // Return a string that contains all key,value pairs in order,
   // formatted like "(k1->v1)(k2->v2)".
   std::string Contents() {
@@ -326,7 +310,7 @@ class DBTest {
     }

     // Check reverse iteration results are the reverse of forward results
-    size_t matched = 0;
+    int matched = 0;
     for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
       ASSERT_LT(matched, forward.size());
       ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
@@ -340,7 +324,7 @@ class DBTest {

   std::string AllEntriesFor(const Slice& user_key) {
     Iterator* iter = dbfull()->TEST_NewInternalIterator();
-    InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+    InternalKey target(user_key, 0, kMaxSequenceNumber, kTypeValue);
     iter->Seek(target.Encode());
     std::string result;
     if (!iter->status().ok()) {
@@ -361,6 +345,8 @@ class DBTest {
         }
         first = false;
         switch (ikey.type) {
+          case kTypeValueWriteTime:
+          case kTypeValueExplicitExpiry:
           case kTypeValue:
             result += iter->value().ToString();
             break;
@@ -474,38 +460,6 @@ class DBTest {
     }
     return result;
   }
-
-  bool DeleteAnSSTFile() {
-    std::vector<std::string> filenames;
-    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
-    uint64_t number;
-    FileType type;
-    for (size_t i = 0; i < filenames.size(); i++) {
-      if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
-        ASSERT_OK(env_->DeleteFile(TableFileName(dbname_, number)));
-        return true;
-      }
-    }
-    return false;
-  }
-
-  // Returns number of files renamed.
-  int RenameLDBToSST() {
-    std::vector<std::string> filenames;
-    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
-    uint64_t number;
-    FileType type;
-    int files_renamed = 0;
-    for (size_t i = 0; i < filenames.size(); i++) {
-      if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
-        const std::string from = TableFileName(dbname_, number);
-        const std::string to = SSTTableFileName(dbname_, number);
-        ASSERT_OK(env_->RenameFile(from, to));
-        files_renamed++;
-      }
-    }
-    return files_renamed;
-  }
 };

 TEST(DBTest, Empty) {
@@ -515,6 +469,11 @@ TEST(DBTest, Empty) {
   } while (ChangeOptions());
 }

+TEST(DBTest, DoubleOpen)
+{
+  ASSERT_NOTOK(DoubleOpen());
+}
+
 TEST(DBTest, ReadWrite) {
   do {
     ASSERT_OK(Put("foo", "v1"));
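
Note: the DoubleOpen test (replacing upstream's Locking test, removed later in this diff) exercises the same invariant: a leveldb directory holds a LOCK file, so a second `DB::Open` on the same path must fail while the first handle is live. A hedged standalone sketch of that behavior; paths are placeholders.

    #include <cassert>
    #include "leveldb/db.h"

    void LockDemo() {
      leveldb::DB* first = NULL;
      leveldb::DB* second = NULL;
      leveldb::Options opts;
      opts.create_if_missing = true;
      leveldb::Status s1 = leveldb::DB::Open(opts, "/tmp/demo", &first);   // takes <dir>/LOCK
      leveldb::Status s2 = leveldb::DB::Open(opts, "/tmp/demo", &second);  // lock held: fails
      assert(s1.ok() && !s2.ok());
      delete first;  // releases the lock
    }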
@@ -547,11 +506,11 @@ TEST(DBTest, GetFromImmutableLayer) {
     ASSERT_OK(Put("foo", "v1"));
     ASSERT_EQ("v1", Get("foo"));

-    env_->delay_data_sync_.Release_Store(env_);      // Block sync calls
+    env_->delay_sstable_sync_.Release_Store(env_);   // Block sync calls
     Put("k1", std::string(100000, 'x'));             // Fill memtable
     Put("k2", std::string(100000, 'y'));             // Trigger compaction
     ASSERT_EQ("v1", Get("foo"));
-    env_->delay_data_sync_.Release_Store(NULL);      // Release sync calls
+    env_->delay_sstable_sync_.Release_Store(NULL);   // Release sync calls
   } while (ChangeOptions());
 }

@@ -563,17 +522,6 @@ TEST(DBTest, GetFromVersions) {
   } while (ChangeOptions());
 }

-TEST(DBTest, GetMemUsage) {
-  do {
-    ASSERT_OK(Put("foo", "v1"));
-    std::string val;
-    ASSERT_TRUE(db_->GetProperty("leveldb.approximate-memory-usage", &val));
-    int mem_usage = atoi(val.c_str());
-    ASSERT_GT(mem_usage, 0);
-    ASSERT_LT(mem_usage, 5*1024*1024);
-  } while (ChangeOptions());
-}
-
 TEST(DBTest, GetSnapshot) {
   do {
     // Try with both a short key and a long key
@@ -634,6 +582,9 @@ TEST(DBTest, GetPicksCorrectFile) {
   } while (ChangeOptions());
 }

+#if 0
+// riak does not execute compaction due to reads
+
 TEST(DBTest, GetEncountersEmptyLevel) {
   do {
     // Arrange for the following to happen:
@@ -642,7 +593,7 @@ TEST(DBTest, GetEncountersEmptyLevel) {
     //   * sstable B in level 2
     // Then do enough Get() calls to arrange for an automatic compaction
     // of sstable A.  A bug would cause the compaction to be marked as
-    // occurring at level 1 (instead of the correct level 0).
+    // occuring at level 1 (instead of the correct level 0).

     // Step 1: First place sstables in levels 0 and 2
     int compaction_count = 0;
@@ -667,11 +618,12 @@ TEST(DBTest, GetEncountersEmptyLevel) {
   }

   // Step 4: Wait for compaction to finish
-  DelayMilliseconds(1000);
+  env_->SleepForMicroseconds(1000000);

   ASSERT_EQ(NumTableFilesAtLevel(0), 0);
   } while (ChangeOptions());
 }
+#endif

 TEST(DBTest, IterEmpty) {
   Iterator* iter = db_->NewIterator(ReadOptions());
@@ -996,7 +948,8 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
   dbfull()->TEST_CompactRange(0, NULL, NULL);

   ASSERT_EQ(NumTableFilesAtLevel(0), 0);
-  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+  // not riak  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);  // yes riak
   for (int i = 0; i < 80; i++) {
     ASSERT_EQ(Get(Key(i)), values[i]);
   }
@@ -1010,7 +963,8 @@ TEST(DBTest, RepeatedWritesToSameKey) {

   // We must have at most one file per level except for level-0,
   // which may have up to kL0_StopWritesTrigger files.
-  const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger;
+  // ... basho adds *2 since level-1 is now overlapped too
+  const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger*2;

   Random rnd(301);
   std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
@@ -1054,11 +1008,13 @@ TEST(DBTest, SparseMerge) {

   // Compactions should not cause us to create a situation where
   // a file overlaps too much data at the next level.
-  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+  // 07/10/14 matthewv - we overlap first two levels.  sparse test not appropriate there,
+  //  and we set overlaps into 100s of megabytes as "normal"
+  // ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
   dbfull()->TEST_CompactRange(0, NULL, NULL);
-  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+  // ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
   dbfull()->TEST_CompactRange(1, NULL, NULL);
-  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+  // ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
 }

 static bool Between(uint64_t val, uint64_t low, uint64_t high) {
@@ -1096,14 +1052,6 @@ TEST(DBTest, ApproximateSizes) {
     // 0 because GetApproximateSizes() does not account for memtable space
     ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));

-    if (options.reuse_logs) {
-      // Recovery will reuse memtable, and GetApproximateSizes() does not
-      // account for memtable usage;
-      Reopen(&options);
-      ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
-      continue;
-    }
-
     // Check sizes across recovery by reopening a few times
     for (int run = 0; run < 3; run++) {
       Reopen(&options);
@@ -1147,11 +1095,6 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
     ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
     ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));

-    if (options.reuse_logs) {
-      // Need to force a memtable compaction since recovery does not do so.
-      ASSERT_OK(dbfull()->TEST_CompactMemTable());
-    }
-
     // Check sizes across recovery by reopening a few times
     for (int run = 0; run < 3; run++) {
       Reopen(&options);
@@ -1223,7 +1166,7 @@ TEST(DBTest, Snapshot) {
     ASSERT_EQ("v4", Get("foo"));
   } while (ChangeOptions());
 }
-
+#if 0  // trouble under Riak due to assumed file sizes
 TEST(DBTest, HiddenValuesAreRemoved) {
   do {
     Random rnd(301);
@@ -1254,7 +1197,7 @@ TEST(DBTest, HiddenValuesAreRemoved) {
     ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
   } while (ChangeOptions());
 }
-
+#endif
 TEST(DBTest, DeletionMarkers1) {
   Put("foo", "v1");
   ASSERT_OK(dbfull()->TEST_CompactMemTable());
@ -1271,13 +1214,14 @@ TEST(DBTest, DeletionMarkers1) {
|
|||
Delete("foo");
|
||||
Put("foo", "v2");
|
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
|
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
|
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
|
||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // stays at level 0
|
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // riak 1.3, DEL merged out by BuildTable
|
||||
Slice z("z");
|
||||
dbfull()->TEST_CompactRange(last-2, NULL, &z);
|
||||
dbfull()->TEST_CompactRange(0, NULL, &z);
|
||||
dbfull()->TEST_CompactRange(1, NULL, &z);
|
||||
// DEL eliminated, but v1 remains because we aren't compacting that level
|
||||
// (DEL can be eliminated because v2 hides v1).
|
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
|
||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // Riak 1.4 has merged to level 1
|
||||
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
|
||||
// Merging last-1 w/ last, so we are the base level for "foo", so
|
||||
// DEL is removed. (as is v1).
|
||||
|
@@ -1289,39 +1233,47 @@ TEST(DBTest, DeletionMarkers2) {
   ASSERT_OK(dbfull()->TEST_CompactMemTable());
   const int last = config::kMaxMemCompactLevel;
-  ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo => v1 is now in last level
+  dbfull()->TEST_CompactRange(0, NULL, NULL);
+  ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo => v1 is now in last level
+  ASSERT_EQ(NumTableFilesAtLevel(last-1), 0);

   // Place a table at level last-1 to prevent merging with preceding mutation
   Put("a", "begin");
   Put("z", "end");
-  dbfull()->TEST_CompactMemTable();
-  ASSERT_EQ(NumTableFilesAtLevel(last), 1);
+  dbfull()->TEST_CompactMemTable();   // goes to last-1
   ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);

   Delete("foo");
   ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
-  ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level last-2
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level 0
   ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
-  dbfull()->TEST_CompactRange(last-2, NULL, NULL);
+  dbfull()->TEST_CompactRange(0, NULL, NULL);   // Riak overlaps level 1
   // DEL kept: "last" file overlaps
   ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
-  dbfull()->TEST_CompactRange(last-1, NULL, NULL);
-  // Merging last-1 w/ last, so we are the base level for "foo", so
-  // DEL is removed.  (as is v1).
+  dbfull()->TEST_CompactRange(1, NULL, NULL);
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+
+  dbfull()->TEST_CompactRange(2, NULL, NULL);
   ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
 }

 TEST(DBTest, OverlapInLevel0) {
   do {
-    ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
+    ASSERT_EQ(config::kMaxMemCompactLevel, 3) << "Fix test to match config";

     // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
     ASSERT_OK(Put("100", "v100"));
     ASSERT_OK(Put("999", "v999"));
     dbfull()->TEST_CompactMemTable();
+    dbfull()->TEST_CompactRange(0, NULL, NULL);
+    dbfull()->TEST_CompactRange(1, NULL, NULL);
     ASSERT_OK(Delete("100"));
     ASSERT_OK(Delete("999"));
     dbfull()->TEST_CompactMemTable();
-    ASSERT_EQ("0,1,1", FilesPerLevel());
+    dbfull()->TEST_CompactRange(0, NULL, NULL);
+    ASSERT_EQ("0,0,1,1", FilesPerLevel());

     // Make files spanning the following ranges in level-0:
     //  files[0]  200 .. 900
@@ -1334,7 +1286,7 @@ TEST(DBTest, OverlapInLevel0) {
     ASSERT_OK(Put("600", "v600"));
     ASSERT_OK(Put("900", "v900"));
     dbfull()->TEST_CompactMemTable();
-    ASSERT_EQ("2,1,1", FilesPerLevel());
+    ASSERT_EQ("2,0,1,1", FilesPerLevel());

     // Compact away the placeholder files we created initially
     dbfull()->TEST_CompactRange(1, NULL, NULL);
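
Note: the repeated `FilesPerLevel()` rewrites in these tests all trace back to one constant: the fork raises `config::kMaxMemCompactLevel` from 2 to 3, so a fresh memtable dump can land one level deeper and the expected per-level file-count strings gain a slot. An editor's annotation of the pattern:

    // FilesPerLevel() returns comma-separated file counts, level 0 first.
    // kMaxMemCompactLevel = 2 (upstream):   "2,1,1"    -> files in L0, L1, L2
    // kMaxMemCompactLevel = 3 (this fork):  "2,0,1,1"  -> same files, pushed to L0, L2, L3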
@@ -1364,7 +1316,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_a) {
   Reopen();
   Reopen();
   ASSERT_EQ("(a->v)", Contents());
-  DelayMilliseconds(1000);  // Wait for compaction to finish
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
   ASSERT_EQ("(a->v)", Contents());
 }

@@ -1380,7 +1332,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
   Put("","");
   Reopen();
   Put("","");
-  DelayMilliseconds(1000);  // Wait for compaction to finish
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
   Reopen();
   Put("d","dv");
   Reopen();
@@ -1390,7 +1342,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
   Delete("b");
   Reopen();
   ASSERT_EQ("(->)(c->cv)", Contents());
-  DelayMilliseconds(1000);  // Wait for compaction to finish
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
   ASSERT_EQ("(->)(c->cv)", Contents());
 }

@@ -1473,37 +1425,37 @@ TEST(DBTest, CustomComparator) {
 }

 TEST(DBTest, ManualCompaction) {
-  ASSERT_EQ(config::kMaxMemCompactLevel, 2)
+  ASSERT_EQ(config::kMaxMemCompactLevel, 3)
       << "Need to update this test to match kMaxMemCompactLevel";

   MakeTables(3, "p", "q");
-  ASSERT_EQ("1,1,1", FilesPerLevel());
+  ASSERT_EQ("1,0,1,1", FilesPerLevel());

   // Compaction range falls before files
   Compact("", "c");
-  ASSERT_EQ("1,1,1", FilesPerLevel());
+  ASSERT_EQ("0,1,1,1", FilesPerLevel());

   // Compaction range falls after files
   Compact("r", "z");
-  ASSERT_EQ("1,1,1", FilesPerLevel());
+  ASSERT_EQ("0,1,1,1", FilesPerLevel());

   // Compaction range overlaps files
   Compact("p1", "p9");
-  ASSERT_EQ("0,0,1", FilesPerLevel());
+  ASSERT_EQ("0,0,0,1", FilesPerLevel());

   // Populate a different range
   MakeTables(3, "c", "e");
-  ASSERT_EQ("1,1,2", FilesPerLevel());
+  ASSERT_EQ("1,0,1,2", FilesPerLevel());

   // Compact just the new range
   Compact("b", "f");
-  ASSERT_EQ("0,0,2", FilesPerLevel());
+  ASSERT_EQ("0,0,0,2", FilesPerLevel());

   // Compact all
   MakeTables(1, "a", "z");
-  ASSERT_EQ("0,1,2", FilesPerLevel());
+  ASSERT_EQ("0,0,1,2", FilesPerLevel());
   db_->CompactRange(NULL, NULL);
-  ASSERT_EQ("0,0,1", FilesPerLevel());
+  ASSERT_EQ("0,0,0,1", FilesPerLevel());
 }

 TEST(DBTest, DBOpen_Options) {
@@ -1545,12 +1497,6 @@ TEST(DBTest, DBOpen_Options) {
   db = NULL;
 }

-TEST(DBTest, Locking) {
-  DB* db2 = NULL;
-  Status s = DB::Open(CurrentOptions(), dbname_, &db2);
-  ASSERT_TRUE(!s.ok()) << "Locking did not prevent re-opening db";
-}
-
 // Check that number of files does not grow when we are out of space
 TEST(DBTest, NoSpace) {
   Options options = CurrentOptions();
@@ -1562,15 +1508,19 @@ TEST(DBTest, NoSpace) {
   Compact("a", "z");
   const int num_files = CountFiles();
   env_->no_space_.Release_Store(env_);  // Force out-of-space errors
-  for (int i = 0; i < 10; i++) {
+  env_->sleep_counter_.Reset();
+  for (int i = 0; i < 5; i++) {
     for (int level = 0; level < config::kNumLevels-1; level++) {
       dbfull()->TEST_CompactRange(level, NULL, NULL);
     }
   }
   env_->no_space_.Release_Store(NULL);
   ASSERT_LT(CountFiles(), num_files + 3);
+
+  // Check that compaction attempts slept after errors
+  ASSERT_GE(env_->sleep_counter_.Read(), 5);
 }
+#if 0
 TEST(DBTest, NonWritableFileSystem) {
   Options options = CurrentOptions();
   options.write_buffer_size = 1000;
@@ -1584,119 +1534,13 @@ TEST(DBTest, NonWritableFileSystem) {
     fprintf(stderr, "iter %d; errors %d\n", i, errors);
     if (!Put("foo", big).ok()) {
       errors++;
-      DelayMilliseconds(100);
+      env_->SleepForMicroseconds(100000);
     }
   }
   ASSERT_GT(errors, 0);
   env_->non_writable_.Release_Store(NULL);
 }

-TEST(DBTest, WriteSyncError) {
-  // Check that log sync errors cause the DB to disallow future writes.
-
-  // (a) Cause log sync calls to fail
-  Options options = CurrentOptions();
-  options.env = env_;
-  Reopen(&options);
-  env_->data_sync_error_.Release_Store(env_);
-
-  // (b) Normal write should succeed
-  WriteOptions w;
-  ASSERT_OK(db_->Put(w, "k1", "v1"));
-  ASSERT_EQ("v1", Get("k1"));
-
-  // (c) Do a sync write; should fail
-  w.sync = true;
-  ASSERT_TRUE(!db_->Put(w, "k2", "v2").ok());
-  ASSERT_EQ("v1", Get("k1"));
-  ASSERT_EQ("NOT_FOUND", Get("k2"));
-
-  // (d) make sync behave normally
-  env_->data_sync_error_.Release_Store(NULL);
-
-  // (e) Do a non-sync write; should fail
-  w.sync = false;
-  ASSERT_TRUE(!db_->Put(w, "k3", "v3").ok());
-  ASSERT_EQ("v1", Get("k1"));
-  ASSERT_EQ("NOT_FOUND", Get("k2"));
-  ASSERT_EQ("NOT_FOUND", Get("k3"));
-}
-
-TEST(DBTest, ManifestWriteError) {
-  // Test for the following problem:
-  // (a) Compaction produces file F
-  // (b) Log record containing F is written to MANIFEST file, but Sync() fails
-  // (c) GC deletes F
-  // (d) After reopening DB, reads fail since deleted F is named in log record
-
-  // We iterate twice.  In the second iteration, everything is the
-  // same except the log record never makes it to the MANIFEST file.
-  for (int iter = 0; iter < 2; iter++) {
-    port::AtomicPointer* error_type = (iter == 0)
-        ? &env_->manifest_sync_error_
-        : &env_->manifest_write_error_;
-
-    // Insert foo=>bar mapping
-    Options options = CurrentOptions();
-    options.env = env_;
-    options.create_if_missing = true;
-    options.error_if_exists = false;
-    DestroyAndReopen(&options);
-    ASSERT_OK(Put("foo", "bar"));
-    ASSERT_EQ("bar", Get("foo"));
-
-    // Memtable compaction (will succeed)
-    dbfull()->TEST_CompactMemTable();
-    ASSERT_EQ("bar", Get("foo"));
-    const int last = config::kMaxMemCompactLevel;
-    ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo=>bar is now in last level
-
-    // Merging compaction (will fail)
-    error_type->Release_Store(env_);
-    dbfull()->TEST_CompactRange(last, NULL, NULL);  // Should fail
-    ASSERT_EQ("bar", Get("foo"));
-
-    // Recovery: should not lose data
-    error_type->Release_Store(NULL);
-    Reopen(&options);
-    ASSERT_EQ("bar", Get("foo"));
-  }
-}
-
-TEST(DBTest, MissingSSTFile) {
-  ASSERT_OK(Put("foo", "bar"));
-  ASSERT_EQ("bar", Get("foo"));
-
-  // Dump the memtable to disk.
-  dbfull()->TEST_CompactMemTable();
-  ASSERT_EQ("bar", Get("foo"));
-
-  Close();
-  ASSERT_TRUE(DeleteAnSSTFile());
-  Options options = CurrentOptions();
-  options.paranoid_checks = true;
-  Status s = TryReopen(&options);
-  ASSERT_TRUE(!s.ok());
-  ASSERT_TRUE(s.ToString().find("issing") != std::string::npos)
-      << s.ToString();
-}
-
-TEST(DBTest, StillReadSST) {
-  ASSERT_OK(Put("foo", "bar"));
-  ASSERT_EQ("bar", Get("foo"));
-
-  // Dump the memtable to disk.
-  dbfull()->TEST_CompactMemTable();
-  ASSERT_EQ("bar", Get("foo"));
-  Close();
-  ASSERT_GT(RenameLDBToSST(), 0);
-  Options options = CurrentOptions();
-  options.paranoid_checks = true;
-  Status s = TryReopen(&options);
-  ASSERT_TRUE(s.ok());
-  ASSERT_EQ("bar", Get("foo"));
-}
-
+#endif
 TEST(DBTest, FilesDeletedAfterCompaction) {
   ASSERT_OK(Put("foo", "v2"));
   Compact("a", "z");
@@ -1713,7 +1557,7 @@ TEST(DBTest, BloomFilter) {
   Options options = CurrentOptions();
   options.env = env_;
   options.block_cache = NewLRUCache(0);  // Prevent cache hits
-  options.filter_policy = NewBloomFilterPolicy(10);
+  options.filter_policy = NewBloomFilterPolicy2(16);
   Reopen(&options);

   // Populate multiple layers
@@ -1728,12 +1572,12 @@ TEST(DBTest, BloomFilter) {
   dbfull()->TEST_CompactMemTable();

   // Prevent auto compactions triggered by seeks
-  env_->delay_data_sync_.Release_Store(env_);
+  env_->delay_sstable_sync_.Release_Store(env_);

   // Lookup present keys.  Should rarely read from small sstable.
   env_->random_read_counter_.Reset();
   for (int i = 0; i < N; i++) {
-    ASSERT_EQ(Key(i), Get(Key(i)));
+    ASSERT_EQ(Key(i), GetNoCache(Key(i)));
   }
   int reads = env_->random_read_counter_.Read();
   fprintf(stderr, "%d present => %d reads\n", N, reads);
@@ -1743,13 +1587,13 @@ TEST(DBTest, BloomFilter) {
   // Lookup present keys.  Should rarely read from either sstable.
   env_->random_read_counter_.Reset();
   for (int i = 0; i < N; i++) {
-    ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing"));
+    ASSERT_EQ("NOT_FOUND", GetNoCache(Key(i) + ".missing"));
   }
   reads = env_->random_read_counter_.Read();
   fprintf(stderr, "%d missing => %d reads\n", N, reads);
   ASSERT_LE(reads, 3*N/100);

-  env_->delay_data_sync_.Release_Store(NULL);
+  env_->delay_sstable_sync_.Release_Store(NULL);
   Close();
   delete options.block_cache;
   delete options.filter_policy;
@@ -1809,7 +1653,7 @@ static void MTThreadBody(void* arg) {
       ASSERT_EQ(k, key);
       ASSERT_GE(w, 0);
       ASSERT_LT(w, kNumThreads);
-      ASSERT_LE(static_cast<uintptr_t>(c), reinterpret_cast<uintptr_t>(
+      ASSERT_LE(c, reinterpret_cast<uintptr_t>(
           t->state->counter[w].Acquire_Load()));
     }
   }
@ -1834,27 +1678,35 @@ TEST(DBTest, MultiThreaded) {
|
|||
|
||||
// Start threads
|
||||
MTThread thread[kNumThreads];
|
||||
pthread_t tid;
|
||||
for (int id = 0; id < kNumThreads; id++) {
|
||||
thread[id].state = &mt;
|
||||
thread[id].id = id;
|
||||
env_->StartThread(MTThreadBody, &thread[id]);
|
||||
tid=env_->StartThread(MTThreadBody, &thread[id]);
|
||||
pthread_detach(tid);
|
||||
}
|
||||
|
||||
// Let them run for a while
|
||||
DelayMilliseconds(kTestSeconds * 1000);
|
||||
env_->SleepForMicroseconds(kTestSeconds * 1000000);
|
||||
|
||||
// Stop the threads and wait for them to finish
|
||||
mt.stop.Release_Store(&mt);
|
||||
for (int id = 0; id < kNumThreads; id++) {
|
||||
while (mt.thread_done[id].Acquire_Load() == NULL) {
|
||||
DelayMilliseconds(100);
|
||||
env_->SleepForMicroseconds(100000);
|
||||
}
|
||||
}
|
||||
} while (ChangeOptions());
|
||||
}
|
||||
|
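The MultiThreaded hunk above implies a fork-specific Env::StartThread() that returns the pthread_t of the spawned thread so callers can detach it; the return type here is inferred from the diff alone. A sketch of that pattern:

#include <pthread.h>
#include "leveldb/env.h"

// Sketch under the assumption (taken from the test above) that this fork's
// Env::StartThread() returns the pthread_t it spawned. Detaching lets the
// OS reclaim finished test threads that will never be joined.
void StartDetached(leveldb::Env* env, void (*body)(void*), void* arg) {
  pthread_t tid = env->StartThread(body, arg);
  pthread_detach(tid);  // resources released automatically at thread exit
}
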
namespace {
typedef std::map<std::string, std::string> KVMap;
struct KVEntry
{
std::string m_Value;
KeyMetaData m_Meta;
};

typedef std::map<std::string, KVEntry> KVMap;
}

class ModelDB: public DB {

@@ -1866,14 +1718,21 @@ class ModelDB: public DB {

explicit ModelDB(const Options& options): options_(options) { }
~ModelDB() { }
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
return DB::Put(o, k, v);
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v, const KeyMetaData * meta=NULL) {
return DB::Put(o, k, v, meta);
}
virtual Status Delete(const WriteOptions& o, const Slice& key) {
return DB::Delete(o, key);
}
virtual Status Get(const ReadOptions& options,
const Slice& key, std::string* value) {
const Slice& key, std::string* value,
KeyMetaData * meta = NULL) {
assert(false);      // Not implemented
return Status::NotFound(key);
}
virtual Status Get(const ReadOptions& options,
const Slice& key, Value* value,
KeyMetaData * meta = NULL) {
assert(false);      // Not implemented
return Status::NotFound(key);
}

@@ -1901,8 +1760,13 @@ class ModelDB: public DB {
class Handler : public WriteBatch::Handler {
public:
KVMap* map_;
virtual void Put(const Slice& key, const Slice& value) {
(*map_)[key.ToString()] = value.ToString();
virtual void Put(const Slice& key, const Slice& value,
const ValueType & type, const ExpiryTimeMicros & expiry) {
KVEntry ent;
ent.m_Value=value.ToString();
ent.m_Meta.m_Type=type;
ent.m_Meta.m_Expiry=expiry;
(*map_)[key.ToString()] = ent;
}
virtual void Delete(const Slice& key) {
map_->erase(key.ToString());

@@ -1948,7 +1812,7 @@ class ModelDB: public DB {
virtual void Next() { ++iter_; }
virtual void Prev() { --iter_; }
virtual Slice key() const { return iter_->first; }
virtual Slice value() const { return iter_->second; }
virtual Slice value() const { return iter_->second.m_Value; }
virtual Status status() const { return Status::OK(); }
private:
const KVMap* const map_;

@@ -2085,6 +1949,44 @@ TEST(DBTest, Randomized) {
} while (ChangeOptions());
}


class SimpleBugs
{
// need a class for the test harness
};


TEST(SimpleBugs, TieredRecoveryLog)
{
// DB::Open created first recovery log directly
//  which led to it NOT being in tiered storage location.
// nope std::string dbname = test::TmpDir() + "/leveldb_nontiered";
std::string dbname = "leveldb";
std::string fastname = test::TmpDir() + "/leveldb_fast";
std::string slowname = test::TmpDir() + "/leveldb_slow";
std::string combined;

DB* db = NULL;
Options opts;

opts.tiered_slow_level = 4;
opts.tiered_fast_prefix = fastname;
opts.tiered_slow_prefix = slowname;
opts.create_if_missing = true;

Env::Default()->CreateDir(fastname);
Env::Default()->CreateDir(slowname);

Status s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != NULL);

delete db;
DestroyDB(dbname, opts);

}   // TieredRecoveryLog


std::string MakeKey(unsigned int num) {
char buf[30];
snprintf(buf, sizeof(buf), "%016u", num);

@@ -2113,14 +2015,13 @@ void BM_LogAndApply(int iters, int num_base_files) {
InternalKeyComparator cmp(BytewiseComparator());
Options options;
VersionSet vset(dbname, &options, NULL, &cmp);
bool save_manifest;
ASSERT_OK(vset.Recover(&save_manifest));
ASSERT_OK(vset.Recover());
VersionEdit vbase;
uint64_t fnum = 1;
for (int i = 0; i < num_base_files; i++) {
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion);
vbase.AddFile2(2, fnum++, 1 /* file size */, start, limit, 0,0,0);
}
ASSERT_OK(vset.LogAndApply(&vbase, &mu));

@@ -2129,9 +2030,9 @@ void BM_LogAndApply(int iters, int num_base_files) {
for (int i = 0; i < iters; i++) {
VersionEdit vedit;
vedit.DeleteFile(2, fnum);
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion);
vedit.AddFile2(2, fnum++, 1 /* file size */, start, limit, 0,0,0);
vset.LogAndApply(&vedit, &mu);
}
uint64_t stop_micros = env->NowMicros();

@@ -3,7 +3,9 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <stdio.h>
//#include "leveldb/expiry.h"
#include "db/dbformat.h"
#include "db/version_set.h"
#include "port/port.h"
#include "util/coding.h"

@@ -11,26 +13,66 @@ namespace leveldb {

static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
assert(seq <= kMaxSequenceNumber);
assert(t <= kValueTypeForSeek);
//  assert(t <= kValueTypeForSeek);  requires revisit once expiry live
assert(t <= kTypeValueExplicitExpiry);  // temp replacement for above
return (seq << 8) | t;
}

void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
result->append(key.user_key.data(), key.user_key.size());
if (IsExpiryKey(key.type))
PutFixed64(result, key.expiry);
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}

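AppendInternalKey() above fixes the fork's key layout: user_key bytes, then an optional fixed64 expiry (expiry types only), then the packed (sequence << 8 | type) tag. A round-trip sketch using only constructors and functions visible in this commit:

#include <assert.h>
#include "db/dbformat.h"

// Sketch: encode an explicit-expiry internal key and parse it back.
// Layout per AppendInternalKey() above:
//   | user_key bytes | expiry fixed64 (expiry types only) | seq<<8|type fixed64 |
void ExpiryKeyRoundTrip() {
  using namespace leveldb;
  ParsedInternalKey in(Slice("user_key"), /*expiry micros*/ 1469000000000000ULL,
                       /*sequence*/ 42, kTypeValueExplicitExpiry);
  std::string encoded;
  AppendInternalKey(&encoded, in);   // 8 key + 8 expiry + 8 tag = 24 bytes here
  ParsedInternalKey out;
  bool ok = ParseInternalKey(Slice(encoded), &out);
  assert(ok && out.expiry == in.expiry && out.sequence == 42);
}
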
std::string ParsedInternalKey::DebugString() const {
char buf[50];
if (IsExpiryKey(type))
snprintf(buf, sizeof(buf), "' @ %llu %llu : %d",
(unsigned long long) expiry,
(unsigned long long) sequence,
int(type));
else
snprintf(buf, sizeof(buf), "' @ %llu : %d",
(unsigned long long) sequence,
int(type));
std::string result = "'";
result += EscapeString(user_key.ToString());
result += HexString(user_key.ToString());
result += buf;
return result;
}

std::string ParsedInternalKey::DebugStringHex() const {
char buf[50];
if (IsExpiryKey(type))
snprintf(buf, sizeof(buf), "' @ %llu %llu : %d",
(unsigned long long) expiry,
(unsigned long long) sequence,
int(type));
else
snprintf(buf, sizeof(buf), "' @ %llu : %d",
(unsigned long long) sequence,
int(type));
std::string result = "'";
result += HexString(user_key);
result += buf;
return result;
}


const char * KeyTypeString(ValueType val_type) {
const char * ret_ptr;
switch(val_type)
{
case kTypeDeletion: ret_ptr="kTypeDelete"; break;
case kTypeValue: ret_ptr="kTypeValue"; break;
case kTypeValueWriteTime: ret_ptr="kTypeValueWriteTime"; break;
case kTypeValueExplicitExpiry: ret_ptr="kTypeValueExplicitExpiry"; break;
default: ret_ptr="(unknown ValueType)"; break;
}   // switch
return(ret_ptr);
}

std::string InternalKey::DebugString() const {
std::string result;
ParsedInternalKey parsed;

@@ -54,8 +96,10 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
if (r == 0) {
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
if (IsExpiryKey((ValueType)*(unsigned char *)&anum)) *(unsigned char*)&anum=(unsigned char)kTypeValue;
if (IsExpiryKey((ValueType)*(unsigned char *)&bnum)) *(unsigned char*)&bnum=(unsigned char)kTypeValue;
if (anum > bnum) {
r = -1;
} else if (anum < bnum) {

|
|||
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
|
||||
}
|
||||
|
||||
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
|
||||
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s, KeyMetaData * meta) {
|
||||
meta_=meta;
|
||||
size_t usize = user_key.size();
|
||||
size_t needed = usize + 13; // A conservative estimate
|
||||
char* dst;
|
||||
|
@ -137,4 +182,109 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
|
|||
end_ = dst;
|
||||
}
|
||||
|
||||
|
||||
KeyRetirement::KeyRetirement(
|
||||
const Comparator * Comparator,
|
||||
SequenceNumber SmallestSnapshot,
|
||||
const Options * Opts,
|
||||
Compaction * const Compaction)
|
||||
: has_current_user_key(false), last_sequence_for_key(kMaxSequenceNumber),
|
||||
user_comparator(Comparator), smallest_snapshot(SmallestSnapshot),
|
||||
options(Opts), compaction(Compaction),
|
||||
valid(false), dropped(0), expired(0)
|
||||
{
|
||||
// NULL is ok for compaction
|
||||
valid=(NULL!=user_comparator);
|
||||
|
||||
return;
|
||||
} // KeyRetirement::KeyRetirement
|
||||
|
||||
|
||||
KeyRetirement::~KeyRetirement()
|
||||
{
|
||||
if (0!=expired)
|
||||
gPerfCounters->Add(ePerfExpiredKeys, expired);
|
||||
} // KeyRetirement::~KeyRetirement
|
||||
|
||||
|
||||
bool
|
||||
KeyRetirement::operator()(
|
||||
Slice & key)
|
||||
{
|
||||
ParsedInternalKey ikey;
|
||||
bool drop = false, expire_flag;
|
||||
|
||||
if (valid)
|
||||
{
|
||||
if (!ParseInternalKey(key, &ikey))
|
||||
{
|
||||
// Do not hide error keys
|
||||
current_user_key.clear();
|
||||
has_current_user_key = false;
|
||||
last_sequence_for_key = kMaxSequenceNumber;
|
||||
} // else
|
||||
else
|
||||
{
|
||||
if (!has_current_user_key ||
|
||||
user_comparator->Compare(ikey.user_key,
|
||||
Slice(current_user_key)) != 0)
|
||||
{
|
||||
// First occurrence of this user key
|
||||
current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
|
||||
has_current_user_key = true;
|
||||
last_sequence_for_key = kMaxSequenceNumber;
|
||||
} // if
|
||||
|
||||
if (last_sequence_for_key <= smallest_snapshot)
|
||||
{
|
||||
// Hidden by an newer entry for same user key
|
||||
drop = true; // (A)
|
||||
} // if
|
||||
|
||||
else
|
||||
{
|
||||
expire_flag=false;
|
||||
if (NULL!=options && options->ExpiryActivated())
|
||||
expire_flag=options->expiry_module->KeyRetirementCallback(ikey);
|
||||
|
||||
if ((ikey.type == kTypeDeletion || expire_flag)
|
||||
&& ikey.sequence <= smallest_snapshot
|
||||
&& NULL!=compaction // mem to level0 ignores this test
|
||||
&& compaction->IsBaseLevelForKey(ikey.user_key))
|
||||
{
|
||||
// For this user key:
|
||||
// (1) there is no data in higher levels
|
||||
// (2) data in lower levels will have larger sequence numbers
|
||||
// (3) data in layers that are being compacted here and have
|
||||
// smaller sequence numbers will be dropped in the next
|
||||
// few iterations of this loop (by rule (A) above).
|
||||
// Therefore this deletion marker is obsolete and can be dropped.
|
||||
drop = true;
|
||||
|
||||
if (expire_flag)
|
||||
++expired;
|
||||
else
|
||||
++dropped;
|
||||
} // if
|
||||
} // else
|
||||
|
||||
last_sequence_for_key = ikey.sequence;
|
||||
} // else
|
||||
} // if
|
||||
|
||||
#if 0
|
||||
// needs clean up to be used again
|
||||
Log(options_.info_log,
|
||||
" Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
|
||||
"%d smallest_snapshot: %d",
|
||||
ikey.user_key.ToString().c_str(),
|
||||
(int)ikey.sequence, ikey.type, kTypeValue, drop,
|
||||
compact->compaction->IsBaseLevelForKey(ikey.user_key),
|
||||
(int)last_sequence_for_key, (int)compact->smallest_snapshot);
|
||||
#endif
|
||||
return(drop);
|
||||
|
||||
} // KeyRetirement::operator(Slice & )
|
||||
|
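KeyRetirement bundles the drop/expire decision so DoCompactionWork and BuildTable can share one policy, per the comment in dbformat.h below. A sketch of how a caller might drive it over an ordered key stream; the wiring here is assumed, not the commit's exact call site:

#include "db/dbformat.h"
#include "leveldb/iterator.h"

// Hypothetical wiring: walk a merged, ordered iterator and keep only keys
// that survive retirement. user_cmp/opts/compaction come from the open DB;
// KeyRetirement itself is the class declared in this commit.
void FilterLiveKeys(leveldb::Iterator* input,
                    const leveldb::Comparator* user_cmp,
                    leveldb::SequenceNumber smallest_snapshot,
                    const leveldb::Options* opts,
                    leveldb::Compaction* compaction) {
  leveldb::KeyRetirement retire(user_cmp, smallest_snapshot, opts, compaction);
  for (input->SeekToFirst(); input->Valid(); input->Next()) {
    leveldb::Slice key = input->key();
    if (retire(key))
      continue;              // tombstone / expired / shadowed: drop it
    // ... emit key/value to the output table ...
  }
}
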

}  // namespace leveldb

@@ -2,13 +2,14 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_LEVELDB_DB_DBFORMAT_H_
#define STORAGE_LEVELDB_DB_DBFORMAT_H_
#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
#define STORAGE_LEVELDB_DB_FORMAT_H_

#include <stdio.h>
#include "leveldb/comparator.h"
#include "leveldb/db.h"
#include "leveldb/filter_policy.h"
#include "leveldb/options.h"
#include "leveldb/slice.h"
#include "leveldb/table_builder.h"
#include "util/coding.h"

@@ -16,19 +17,33 @@

namespace leveldb {

class Compaction;

// Grouping of constants.  We may want to make some of these
// parameters set via options.
namespace config {
static const int kNumLevels = 7;
static const int kNumOverlapLevels = 2;

// Level-0 compaction is started when we hit this many files.
static const int kL0_CompactionTrigger = 4;
// Google: static const size_t kL0_CompactionTrigger = 4;
static const size_t kL0_CompactionTrigger = 6;

// Level-0 (any overlapped level) number of files where a grooming
//  compaction could start
static const size_t kL0_GroomingTrigger = 4;
static const size_t kL0_GroomingTrigger10min = 2;
static const size_t kL0_GroomingTrigger20min = 1;

// ... time limits in microseconds
static const size_t kL0_Grooming10minMicros = 10 * 60 * 1000000;
static const size_t kL0_Grooming20minMicros = 20 * 60 * 1000000;

// Soft limit on number of level-0 files.  We slow down writes at this point.
static const int kL0_SlowdownWritesTrigger = 8;
static const size_t kL0_SlowdownWritesTrigger = 8;

// Maximum number of level-0 files.  We stop writes at this point.
static const int kL0_StopWritesTrigger = 12;
static const size_t kL0_StopWritesTrigger = 12;

// Maximum level to which a new compacted memtable is pushed if it
// does not create overlap.  We try to push to level 2 to avoid the

@@ -36,31 +51,28 @@ static const int kL0_StopWritesTrigger = 12;
// expensive manifest file operations.  We do not push all the way to
// the largest level since that can generate a lot of wasted disk
// space if the same key space is being repeatedly overwritten.
static const int kMaxMemCompactLevel = 2;

// Approximate gap in bytes between samples of data read during iteration.
static const int kReadBytesPeriod = 1048576;
// Basho: push to kNumOverlapLevels +1 ... beyond "landing level"
static const unsigned kMaxMemCompactLevel = kNumOverlapLevels+1;

}  // namespace config

class InternalKey;

// Value types encoded as the last component of internal keys.
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
// data structures.
enum ValueType {
kTypeDeletion = 0x0,
kTypeValue = 0x1
};
// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
// and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest).
// Riak note:  kValueTypeForSeek is placed within temporary keys
//  for comparisons.  Using kTypeValueExplicitExpiry would
//  force more code changes to increase internal key size.
//  But ValueTypeForSeek is redundant to sequence number for
//  disambiguation.  Therefore going for easiest path and NOT changing.
static const ValueType kValueTypeForSeek = kTypeValue;

typedef uint64_t SequenceNumber;
typedef uint64_t ExpiryTimeMicros;

// We leave eight bits empty at the bottom so a type and sequence#
// can be packed together into 64-bits.

@@ -69,20 +81,17 @@ static const SequenceNumber kMaxSequenceNumber =

struct ParsedInternalKey {
Slice user_key;
ExpiryTimeMicros expiry;
SequenceNumber sequence;
ValueType type;

ParsedInternalKey() { }  // Intentionally left uninitialized (for speed)
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
: user_key(u), sequence(seq), type(t) { }
ParsedInternalKey(const Slice& u, const ExpiryTimeMicros & exp, const SequenceNumber& seq, ValueType t)
: user_key(u), expiry(exp), sequence(seq), type(t) { }
std::string DebugString() const;
std::string DebugStringHex() const;
};

// Return the length of the encoding of "key".
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
return key.user_key.size() + 8;
}

// Append the serialization of "key" to *result.
extern void AppendInternalKey(std::string* result,
const ParsedInternalKey& key);

@@ -94,20 +103,76 @@ extern void AppendInternalKey(std::string* result,
extern bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result);

// Returns the user key portion of an internal key.
inline Slice ExtractUserKey(const Slice& internal_key) {
assert(internal_key.size() >= 8);
return Slice(internal_key.data(), internal_key.size() - 8);
}

inline ValueType ExtractValueType(const Slice& internal_key) {
assert(internal_key.size() >= 8);
const size_t n = internal_key.size();
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
unsigned char c = num & 0xff;
unsigned char c = DecodeLeastFixed64(internal_key.data() + n - sizeof(SequenceNumber));
return static_cast<ValueType>(c);
}

inline size_t KeySuffixSize(ValueType val_type) {
size_t ret_val;
switch(val_type)
{
case kTypeDeletion:
case kTypeValue:
ret_val=sizeof(SequenceNumber);
break;

case kTypeValueWriteTime:
case kTypeValueExplicitExpiry:
ret_val=sizeof(SequenceNumber) + sizeof(ExpiryTimeMicros);
break;

default:
// assert(0);  cannot use because bloom filter block's name is passed as internal key
ret_val=sizeof(SequenceNumber);
break;
}   // switch
return(ret_val);
}

const char * KeyTypeString(ValueType val_type);

inline size_t KeySuffixSize(const Slice & internal_key) {
return(KeySuffixSize(ExtractValueType(internal_key)));
}

// Returns the user key portion of an internal key.
inline Slice ExtractUserKey(const Slice& internal_key) {
assert(internal_key.size() >= 8);
return Slice(internal_key.data(), internal_key.size() - KeySuffixSize(internal_key));
}

// Returns the sequence number with ValueType removed
inline SequenceNumber ExtractSequenceNumber(const Slice& internal_key) {
assert(internal_key.size() >= 8);
return(DecodeFixed64(internal_key.data() + internal_key.size() - 8)>>8);
}

// Return the length of the encoding of "key".
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
return key.user_key.size() + KeySuffixSize(key.type);
}

// Riak: is this an expiry key and therefore contain extra ExpiryTime field
inline bool IsExpiryKey(ValueType val_type) {
return(kTypeValueWriteTime==val_type || kTypeValueExplicitExpiry==val_type);
}

// Riak: is this an expiry key and therefore contain extra ExpiryTime field
inline bool IsExpiryKey(const Slice & internal_key) {
return(internal_key.size()>=KeySuffixSize(kTypeValueWriteTime)
&& IsExpiryKey(ExtractValueType(internal_key)));
}

// Riak: extracts expiry value
inline ExpiryTimeMicros ExtractExpiry(const Slice& internal_key) {
assert(internal_key.size() >= KeySuffixSize(kTypeValueWriteTime));
assert(IsExpiryKey(internal_key));
return(DecodeFixed64(internal_key.data() + internal_key.size() - KeySuffixSize(kTypeValueWriteTime)));
}

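A quick check of the arithmetic these helpers encode: plain and deletion keys carry an 8-byte suffix (the tag), expiry keys a 16-byte suffix (expiry plus tag), and every Extract*() above peels from the key's tail accordingly:

#include <assert.h>
#include "db/dbformat.h"

// Sketch: suffix sizing per the KeySuffixSize() overloads above.
void SuffixSizes() {
  using namespace leveldb;
  assert(KeySuffixSize(kTypeValue) == 8);                 // tag only
  assert(KeySuffixSize(kTypeDeletion) == 8);              // tag only
  assert(KeySuffixSize(kTypeValueExplicitExpiry) == 16);  // expiry + tag
  // So a 5-byte user key encodes to 13 or 21 bytes total, matching
  // InternalKeyEncodingLength() above.
}
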
// A comparator for internal keys that uses a specified comparator for
// the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator : public Comparator {

@@ -129,7 +194,7 @@ class InternalKeyComparator : public Comparator {

// Filter policy wrapper that converts from internal keys to user keys
class InternalFilterPolicy : public FilterPolicy {
private:
protected:
const FilterPolicy* const user_policy_;
public:
explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }

@@ -138,6 +203,12 @@ class InternalFilterPolicy : public FilterPolicy {
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
};

class InternalFilterPolicy2 : public InternalFilterPolicy {
public:
explicit InternalFilterPolicy2(const FilterPolicy* p) : InternalFilterPolicy(p) { }
virtual ~InternalFilterPolicy2() {delete user_policy_;};
};

// Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator.

@@ -146,8 +217,8 @@ class InternalKey {
std::string rep_;
public:
InternalKey() { }   // Leave rep_ as empty to indicate it is invalid
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
InternalKey(const Slice& user_key, ExpiryTimeMicros exp, SequenceNumber s, ValueType t) {
AppendInternalKey(&rep_, ParsedInternalKey(user_key, exp, s, t));
}

void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }

@@ -157,6 +228,7 @@ class InternalKey {
}

Slice user_key() const { return ExtractUserKey(rep_); }
Slice internal_key() const { return Slice(rep_); }

void SetFrom(const ParsedInternalKey& p) {
rep_.clear();

@@ -181,8 +253,12 @@ inline bool ParseInternalKey(const Slice& internal_key,
unsigned char c = num & 0xff;
result->sequence = num >> 8;
result->type = static_cast<ValueType>(c);
result->user_key = Slice(internal_key.data(), n - 8);
return (c <= static_cast<unsigned char>(kTypeValue));
if (IsExpiryKey((ValueType)c))
result->expiry=DecodeFixed64(internal_key.data() + n - KeySuffixSize((ValueType)c));
else
result->expiry=0;
result->user_key = Slice(internal_key.data(), n - KeySuffixSize((ValueType)c));
return (c <= static_cast<unsigned char>(kTypeValueExplicitExpiry));
}

// A helper class useful for DBImpl::Get()

@@ -190,7 +266,7 @@ class LookupKey {
public:
// Initialize *this for looking up user_key at a snapshot with
// the specified sequence number.
LookupKey(const Slice& user_key, SequenceNumber sequence);
LookupKey(const Slice& user_key, SequenceNumber sequence, KeyMetaData * meta=NULL);

~LookupKey();

@@ -201,12 +277,38 @@ class LookupKey {
Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }

// Return the user key
Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
Slice user_key() const
{ return Slice(kstart_, end_ - kstart_ - KeySuffixSize(internal_key())); }

// did requestor have metadata object?
bool WantsKeyMetaData() const {return(NULL!=meta_);};

void SetKeyMetaData(ValueType type, SequenceNumber seq, ExpiryTimeMicros expiry) const
{if (NULL!=meta_)
{
meta_->m_Type=type;
meta_->m_Sequence=seq;
meta_->m_Expiry=expiry;
}   // if
};

void SetKeyMetaData(const ParsedInternalKey & pi_key) const
{if (NULL!=meta_)
{
meta_->m_Type=pi_key.type;
meta_->m_Sequence=pi_key.sequence;
meta_->m_Expiry=pi_key.expiry;
}   // if
};

void SetKeyMetaData(const KeyMetaData & meta) const
{if (NULL!=meta_) *meta_=meta;};

private:
// We construct a char array of the form:
//    klength  varint32               <-- start_
//    userkey  char[klength]          <-- kstart_
//    optional uint64
//    tag      uint64
//                                     <-- end_
// The array is a suitable MemTable key.

@@ -216,6 +318,9 @@ class LookupKey {
const char* end_;
char space_[200];      // Avoid allocation for short keys

// allow code that finds the key to place metadata here, even if 'const'
mutable KeyMetaData * meta_;

// No copying allowed
LookupKey(const LookupKey&);
void operator=(const LookupKey&);

@@ -223,8 +328,47 @@ class LookupKey {

inline LookupKey::~LookupKey() {
if (start_ != space_) delete[] start_;
}
};

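The metadata plumbing above works in one direction: a caller supplies a KeyMetaData, LookupKey carries the pointer through the read path, and whichever layer finds the entry fills it via SetKeyMetaData(). A sketch of that flow; everything outside this header is an assumption:

#include "db/dbformat.h"

// Sketch (assumed wiring): resolve a user key at a snapshot and surface the
// matched entry's type/sequence/expiry through the mutable meta_ above.
void LookupWithMeta(const leveldb::Slice& user_key,
                    leveldb::SequenceNumber snapshot) {
  leveldb::KeyMetaData meta;
  leveldb::LookupKey lkey(user_key, snapshot, &meta);
  // ... memtable / sstable search using lkey.internal_key() et al. ...
  if (lkey.WantsKeyMetaData()) {
    // The finder would call this with the ParsedInternalKey it matched:
    // lkey.SetKeyMetaData(parsed_key);
  }
}
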
|
||||
// this class was constructed from code with DBImpl::DoCompactionWork (db_impl.cc)
|
||||
// so it could be shared within BuildTable (and thus reduce Level 0 bloating)
|
||||
class KeyRetirement
|
||||
{
|
||||
protected:
|
||||
// "state" from previous key reviewed
|
||||
std::string current_user_key;
|
||||
bool has_current_user_key;
|
||||
SequenceNumber last_sequence_for_key;
|
||||
|
||||
// database values needed for processing
|
||||
const Comparator * user_comparator;
|
||||
SequenceNumber smallest_snapshot;
|
||||
const Options * options;
|
||||
Compaction * const compaction;
|
||||
|
||||
bool valid;
|
||||
size_t dropped; // tombstone or old version dropped
|
||||
size_t expired; // expired dropped
|
||||
|
||||
public:
|
||||
KeyRetirement(const Comparator * UserComparator, SequenceNumber SmallestSnapshot,
|
||||
const Options * Opts, Compaction * const Compaction=NULL);
|
||||
|
||||
virtual ~KeyRetirement();
|
||||
|
||||
bool operator()(Slice & key);
|
||||
|
||||
size_t GetDroppedCount() const {return(dropped);};
|
||||
size_t GetExpiredCount() const {return(expired);};
|
||||
|
||||
private:
|
||||
KeyRetirement();
|
||||
KeyRetirement(const KeyRetirement &);
|
||||
const KeyRetirement & operator=(const KeyRetirement &);
|
||||
|
||||
}; // class KeyRetirement
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_DB_DBFORMAT_H_
|
||||
#endif // STORAGE_LEVELDB_DB_FORMAT_H_
|
||||
|
|
|
@ -9,10 +9,11 @@
|
|||
namespace leveldb {
|
||||
|
||||
static std::string IKey(const std::string& user_key,
|
||||
ExpiryTimeMicros exp,
|
||||
uint64_t seq,
|
||||
ValueType vt) {
|
||||
std::string encoded;
|
||||
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
|
||||
AppendInternalKey(&encoded, ParsedInternalKey(user_key, exp, seq, vt));
|
||||
return encoded;
|
||||
}
|
||||
|
||||
|
@ -29,12 +30,13 @@ static std::string ShortSuccessor(const std::string& s) {
|
|||
}
|
||||
|
||||
static void TestKey(const std::string& key,
|
||||
ExpiryTimeMicros exp,
|
||||
uint64_t seq,
|
||||
ValueType vt) {
|
||||
std::string encoded = IKey(key, seq, vt);
|
||||
std::string encoded = IKey(key, exp, seq, vt);
|
||||
|
||||
Slice in(encoded);
|
||||
ParsedInternalKey decoded("", 0, kTypeValue);
|
||||
ParsedInternalKey decoded("", 0, 0, kTypeValue);
|
||||
|
||||
ASSERT_TRUE(ParseInternalKey(in, &decoded));
|
||||
ASSERT_EQ(key, decoded.user_key.ToString());
|
||||
|
@ -56,53 +58,53 @@ TEST(FormatTest, InternalKey_EncodeDecode) {
|
|||
};
|
||||
for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
|
||||
for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
|
||||
TestKey(keys[k], seq[s], kTypeValue);
|
||||
TestKey("hello", 1, kTypeDeletion);
|
||||
TestKey(keys[k], 0, seq[s], kTypeValue);
|
||||
TestKey("hello", 0, 1, kTypeDeletion);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(FormatTest, InternalKeyShortSeparator) {
|
||||
// When user keys are same
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 99, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 101, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 100, kTypeDeletion)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("foo", 0, 99, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("foo", 0, 101, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("foo", 0, 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("foo", 0, 100, kTypeDeletion)));
|
||||
|
||||
// When user keys are misordered
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("bar", 99, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("bar", 0, 99, kTypeValue)));
|
||||
|
||||
// When user keys are different, but correctly ordered
|
||||
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("hello", 200, kTypeValue)));
|
||||
ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("hello", 0, 200, kTypeValue)));
|
||||
|
||||
// When start user key is prefix of limit user key
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foobar", 200, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("foobar", 0, 200, kTypeValue)));
|
||||
|
||||
// When limit user key is prefix of start user key
|
||||
ASSERT_EQ(IKey("foobar", 100, kTypeValue),
|
||||
Shorten(IKey("foobar", 100, kTypeValue),
|
||||
IKey("foo", 200, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foobar", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foobar", 0, 100, kTypeValue),
|
||||
IKey("foo", 0, 200, kTypeValue)));
|
||||
}
|
||||
|
||||
TEST(FormatTest, InternalKeyShortestSuccessor) {
|
||||
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
|
||||
ShortSuccessor(IKey("foo", 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
|
||||
ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek),
|
||||
ShortSuccessor(IKey("foo", 0, 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("\xff\xff", 0, 100, kTypeValue),
|
||||
ShortSuccessor(IKey("\xff\xff", 0, 100, kTypeValue)));
|
||||
}
|
||||
|
||||
} // namespace leveldb
|
||||
|
|
|
@@ -1,554 +0,0 @@
// Copyright 2014 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

// This test uses a custom Env to keep track of the state of a filesystem as of
// the last "sync". It then checks for data loss errors by purposely dropping
// file data (or entire files) not protected by a "sync".

#include "leveldb/db.h"

#include <map>
#include <set>
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/log_format.h"
#include "db/version_set.h"
#include "leveldb/cache.h"
#include "leveldb/env.h"
#include "leveldb/table.h"
#include "leveldb/write_batch.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/testharness.h"
#include "util/testutil.h"

namespace leveldb {

static const int kValueSize = 1000;
static const int kMaxNumValues = 2000;
static const size_t kNumIterations = 3;

class FaultInjectionTestEnv;

namespace {

// Assume a filename, and not a directory name like "/foo/bar/"
static std::string GetDirName(const std::string filename) {
size_t found = filename.find_last_of("/\\");
if (found == std::string::npos) {
return "";
} else {
return filename.substr(0, found);
}
}

Status SyncDir(const std::string& dir) {
// As this is a test it isn't required to *actually* sync this directory.
return Status::OK();
}

// A basic file truncation function suitable for this test.
Status Truncate(const std::string& filename, uint64_t length) {
leveldb::Env* env = leveldb::Env::Default();

SequentialFile* orig_file;
Status s = env->NewSequentialFile(filename, &orig_file);
if (!s.ok())
return s;

char* scratch = new char[length];
leveldb::Slice result;
s = orig_file->Read(length, &result, scratch);
delete orig_file;
if (s.ok()) {
std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
WritableFile* tmp_file;
s = env->NewWritableFile(tmp_name, &tmp_file);
if (s.ok()) {
s = tmp_file->Append(result);
delete tmp_file;
if (s.ok()) {
s = env->RenameFile(tmp_name, filename);
} else {
env->DeleteFile(tmp_name);
}
}
}

delete[] scratch;

return s;
}

struct FileState {
std::string filename_;
ssize_t pos_;
ssize_t pos_at_last_sync_;
ssize_t pos_at_last_flush_;

FileState(const std::string& filename)
: filename_(filename),
pos_(-1),
pos_at_last_sync_(-1),
pos_at_last_flush_(-1) { }

FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}

bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }

Status DropUnsyncedData() const;
};

}  // anonymous namespace

// A wrapper around WritableFile which informs another Env whenever this file
// is written to or sync'ed.
class TestWritableFile : public WritableFile {
public:
TestWritableFile(const FileState& state,
WritableFile* f,
FaultInjectionTestEnv* env);
virtual ~TestWritableFile();
virtual Status Append(const Slice& data);
virtual Status Close();
virtual Status Flush();
virtual Status Sync();

private:
FileState state_;
WritableFile* target_;
bool writable_file_opened_;
FaultInjectionTestEnv* env_;

Status SyncParent();
};

class FaultInjectionTestEnv : public EnvWrapper {
public:
FaultInjectionTestEnv() : EnvWrapper(Env::Default()), filesystem_active_(true) {}
virtual ~FaultInjectionTestEnv() { }
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result);
virtual Status NewAppendableFile(const std::string& fname,
WritableFile** result);
virtual Status DeleteFile(const std::string& f);
virtual Status RenameFile(const std::string& s, const std::string& t);

void WritableFileClosed(const FileState& state);
Status DropUnsyncedFileData();
Status DeleteFilesCreatedAfterLastDirSync();
void DirWasSynced();
bool IsFileCreatedSinceLastDirSync(const std::string& filename);
void ResetState();
void UntrackFile(const std::string& f);
// Setting the filesystem to inactive is the test equivalent to simulating a
// system reset. Setting to inactive will freeze our saved filesystem state so
// that it will stop being recorded. It can then be reset back to the state at
// the time of the reset.
bool IsFilesystemActive() const { return filesystem_active_; }
void SetFilesystemActive(bool active) { filesystem_active_ = active; }

private:
port::Mutex mutex_;
std::map<std::string, FileState> db_file_state_;
std::set<std::string> new_files_since_last_dir_sync_;
bool filesystem_active_;  // Record flushes, syncs, writes
};

TestWritableFile::TestWritableFile(const FileState& state,
WritableFile* f,
FaultInjectionTestEnv* env)
: state_(state),
target_(f),
writable_file_opened_(true),
env_(env) {
assert(f != NULL);
}

TestWritableFile::~TestWritableFile() {
if (writable_file_opened_) {
Close();
}
delete target_;
}

Status TestWritableFile::Append(const Slice& data) {
Status s = target_->Append(data);
if (s.ok() && env_->IsFilesystemActive()) {
state_.pos_ += data.size();
}
return s;
}

Status TestWritableFile::Close() {
writable_file_opened_ = false;
Status s = target_->Close();
if (s.ok()) {
env_->WritableFileClosed(state_);
}
return s;
}

Status TestWritableFile::Flush() {
Status s = target_->Flush();
if (s.ok() && env_->IsFilesystemActive()) {
state_.pos_at_last_flush_ = state_.pos_;
}
return s;
}

Status TestWritableFile::SyncParent() {
Status s = SyncDir(GetDirName(state_.filename_));
if (s.ok()) {
env_->DirWasSynced();
}
return s;
}

Status TestWritableFile::Sync() {
if (!env_->IsFilesystemActive()) {
return Status::OK();
}
// Ensure new files referred to by the manifest are in the filesystem.
Status s = target_->Sync();
if (s.ok()) {
state_.pos_at_last_sync_ = state_.pos_;
}
if (env_->IsFileCreatedSinceLastDirSync(state_.filename_)) {
Status ps = SyncParent();
if (s.ok() && !ps.ok()) {
s = ps;
}
}
return s;
}

Status FaultInjectionTestEnv::NewWritableFile(const std::string& fname,
WritableFile** result) {
WritableFile* actual_writable_file;
Status s = target()->NewWritableFile(fname, &actual_writable_file);
if (s.ok()) {
FileState state(fname);
state.pos_ = 0;
*result = new TestWritableFile(state, actual_writable_file, this);
// NewWritableFile doesn't append to files, so if the same file is
// opened again then it will be truncated - so forget our saved
// state.
UntrackFile(fname);
MutexLock l(&mutex_);
new_files_since_last_dir_sync_.insert(fname);
}
return s;
}

Status FaultInjectionTestEnv::NewAppendableFile(const std::string& fname,
WritableFile** result) {
WritableFile* actual_writable_file;
Status s = target()->NewAppendableFile(fname, &actual_writable_file);
if (s.ok()) {
FileState state(fname);
state.pos_ = 0;
{
MutexLock l(&mutex_);
if (db_file_state_.count(fname) == 0) {
new_files_since_last_dir_sync_.insert(fname);
} else {
state = db_file_state_[fname];
}
}
*result = new TestWritableFile(state, actual_writable_file, this);
}
return s;
}

Status FaultInjectionTestEnv::DropUnsyncedFileData() {
Status s;
MutexLock l(&mutex_);
for (std::map<std::string, FileState>::const_iterator it =
db_file_state_.begin();
s.ok() && it != db_file_state_.end(); ++it) {
const FileState& state = it->second;
if (!state.IsFullySynced()) {
s = state.DropUnsyncedData();
}
}
return s;
}

void FaultInjectionTestEnv::DirWasSynced() {
MutexLock l(&mutex_);
new_files_since_last_dir_sync_.clear();
}

bool FaultInjectionTestEnv::IsFileCreatedSinceLastDirSync(
const std::string& filename) {
MutexLock l(&mutex_);
return new_files_since_last_dir_sync_.find(filename) !=
new_files_since_last_dir_sync_.end();
}

void FaultInjectionTestEnv::UntrackFile(const std::string& f) {
MutexLock l(&mutex_);
db_file_state_.erase(f);
new_files_since_last_dir_sync_.erase(f);
}

Status FaultInjectionTestEnv::DeleteFile(const std::string& f) {
Status s = EnvWrapper::DeleteFile(f);
ASSERT_OK(s);
if (s.ok()) {
UntrackFile(f);
}
return s;
}

Status FaultInjectionTestEnv::RenameFile(const std::string& s,
const std::string& t) {
Status ret = EnvWrapper::RenameFile(s, t);

if (ret.ok()) {
MutexLock l(&mutex_);
if (db_file_state_.find(s) != db_file_state_.end()) {
db_file_state_[t] = db_file_state_[s];
db_file_state_.erase(s);
}

if (new_files_since_last_dir_sync_.erase(s) != 0) {
assert(new_files_since_last_dir_sync_.find(t) ==
new_files_since_last_dir_sync_.end());
new_files_since_last_dir_sync_.insert(t);
}
}

return ret;
}

void FaultInjectionTestEnv::ResetState() {
// Since we are not destroying the database, the existing files
// should keep their recorded synced/flushed state. Therefore
// we do not reset db_file_state_ and new_files_since_last_dir_sync_.
MutexLock l(&mutex_);
SetFilesystemActive(true);
}

Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() {
// Because DeleteFile access this container make a copy to avoid deadlock
mutex_.Lock();
std::set<std::string> new_files(new_files_since_last_dir_sync_.begin(),
new_files_since_last_dir_sync_.end());
mutex_.Unlock();
Status s;
std::set<std::string>::const_iterator it;
for (it = new_files.begin(); s.ok() && it != new_files.end(); ++it) {
s = DeleteFile(*it);
}
return s;
}

void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) {
MutexLock l(&mutex_);
db_file_state_[state.filename_] = state;
}

Status FileState::DropUnsyncedData() const {
ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
return Truncate(filename_, sync_pos);
}

class FaultInjectionTest {
public:
enum ExpectedVerifResult { VAL_EXPECT_NO_ERROR, VAL_EXPECT_ERROR };
enum ResetMethod { RESET_DROP_UNSYNCED_DATA, RESET_DELETE_UNSYNCED_FILES };

FaultInjectionTestEnv* env_;
std::string dbname_;
Cache* tiny_cache_;
Options options_;
DB* db_;

FaultInjectionTest()
: env_(new FaultInjectionTestEnv),
tiny_cache_(NewLRUCache(100)),
db_(NULL) {
dbname_ = test::TmpDir() + "/fault_test";
DestroyDB(dbname_, Options());  // Destroy any db from earlier run
options_.reuse_logs = true;
options_.env = env_;
options_.paranoid_checks = true;
options_.block_cache = tiny_cache_;
options_.create_if_missing = true;
}

~FaultInjectionTest() {
CloseDB();
DestroyDB(dbname_, Options());
delete tiny_cache_;
delete env_;
}

void ReuseLogs(bool reuse) {
options_.reuse_logs = reuse;
}

void Build(int start_idx, int num_vals) {
std::string key_space, value_space;
WriteBatch batch;
for (int i = start_idx; i < start_idx + num_vals; i++) {
Slice key = Key(i, &key_space);
batch.Clear();
batch.Put(key, Value(i, &value_space));
WriteOptions options;
ASSERT_OK(db_->Write(options, &batch));
}
}

Status ReadValue(int i, std::string* val) const {
std::string key_space, value_space;
Slice key = Key(i, &key_space);
Value(i, &value_space);
ReadOptions options;
return db_->Get(options, key, val);
}

Status Verify(int start_idx, int num_vals,
ExpectedVerifResult expected) const {
std::string val;
std::string value_space;
Status s;
for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
Value(i, &value_space);
s = ReadValue(i, &val);
if (expected == VAL_EXPECT_NO_ERROR) {
if (s.ok()) {
ASSERT_EQ(value_space, val);
}
} else if (s.ok()) {
fprintf(stderr, "Expected an error at %d, but was OK\n", i);
s = Status::IOError(dbname_, "Expected value error:");
} else {
s = Status::OK();  // An expected error
}
}
return s;
}

// Return the ith key
Slice Key(int i, std::string* storage) const {
char buf[100];
snprintf(buf, sizeof(buf), "%016d", i);
storage->assign(buf, strlen(buf));
return Slice(*storage);
}

// Return the value to associate with the specified key
Slice Value(int k, std::string* storage) const {
Random r(k);
return test::RandomString(&r, kValueSize, storage);
}

Status OpenDB() {
delete db_;
db_ = NULL;
env_->ResetState();
return DB::Open(options_, dbname_, &db_);
}

void CloseDB() {
delete db_;
db_ = NULL;
}

void DeleteAllData() {
Iterator* iter = db_->NewIterator(ReadOptions());
WriteOptions options;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
}

delete iter;
}

void ResetDBState(ResetMethod reset_method) {
switch (reset_method) {
case RESET_DROP_UNSYNCED_DATA:
ASSERT_OK(env_->DropUnsyncedFileData());
break;
case RESET_DELETE_UNSYNCED_FILES:
ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
break;
default:
assert(false);
}
}

void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
DeleteAllData();
Build(0, num_pre_sync);
db_->CompactRange(NULL, NULL);
Build(num_pre_sync, num_post_sync);
}

void PartialCompactTestReopenWithFault(ResetMethod reset_method,
int num_pre_sync,
int num_post_sync) {
env_->SetFilesystemActive(false);
CloseDB();
ResetDBState(reset_method);
ASSERT_OK(OpenDB());
ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::VAL_EXPECT_NO_ERROR));
ASSERT_OK(Verify(num_pre_sync, num_post_sync, FaultInjectionTest::VAL_EXPECT_ERROR));
}

void NoWriteTestPreFault() {
}

void NoWriteTestReopenWithFault(ResetMethod reset_method) {
CloseDB();
ResetDBState(reset_method);
ASSERT_OK(OpenDB());
}

void DoTest() {
Random rnd(0);
ASSERT_OK(OpenDB());
for (size_t idx = 0; idx < kNumIterations; idx++) {
int num_pre_sync = rnd.Uniform(kMaxNumValues);
int num_post_sync = rnd.Uniform(kMaxNumValues);

PartialCompactTestPreFault(num_pre_sync, num_post_sync);
PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA,
num_pre_sync,
num_post_sync);

NoWriteTestPreFault();
NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA);

PartialCompactTestPreFault(num_pre_sync, num_post_sync);
// No new files created so we expect all values since no files will be
// dropped.
PartialCompactTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES,
num_pre_sync + num_post_sync,
0);

NoWriteTestPreFault();
NoWriteTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES);
}
}
};

TEST(FaultInjectionTest, FaultTestNoLogReuse) {
ReuseLogs(false);
DoTest();
}

TEST(FaultInjectionTest, FaultTestWithLogReuse) {
ReuseLogs(true);
DoTest();
}

}  // namespace leveldb

int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

@@ -4,9 +4,14 @@

#include <ctype.h>
#include <stdio.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "db/filename.h"
#include "db/dbformat.h"
#include "db/version_set.h"
#include "leveldb/env.h"
#include "leveldb/status.h"
#include "util/logging.h"

namespace leveldb {

@@ -24,19 +29,50 @@ static std::string MakeFileName(const std::string& name, uint64_t number,
return name + buf;
}

static std::string MakeFileName2(const Options & options, uint64_t number,
int level, const char* suffix) {
char buf[100];
if (0<=level)
snprintf(buf, sizeof(buf), "/%s_%-d/%06llu.%s",
suffix, level,
static_cast<unsigned long long>(number),
suffix);
else if (-1==level)
snprintf(buf, sizeof(buf), "/%s/%06llu.%s",
suffix,
static_cast<unsigned long long>(number),
suffix);
else if (-2==level)
snprintf(buf, sizeof(buf), "/%06llu.%s",
static_cast<unsigned long long>(number),
suffix);

return((level<(int)options.tiered_slow_level ?
options.tiered_fast_prefix : options.tiered_slow_prefix) + buf);
}

std::string MakeDirName2(const Options & options,
int level, const char* suffix) {
char buf[100];
if (-1!=level)
snprintf(buf, sizeof(buf), "/%s_%-d",
suffix, level);
else
snprintf(buf, sizeof(buf), "/%s",
suffix);

return((level<(int)options.tiered_slow_level ?
options.tiered_fast_prefix : options.tiered_slow_prefix) + buf);
}

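A worked example of the name builders above, under hypothetical prefixes fast="/fast/db", slow="/slow/db" and tiered_slow_level=4 (TableFileName() forwards to MakeFileName2(), as shown just below):

#include "db/filename.h"
#include "leveldb/options.h"

// Sketch: what TableFileName() resolves to under assumed prefixes.
void TieredNameExamples() {
  leveldb::Options opt;
  opt.tiered_slow_level = 4;             // levels 4+ live on the slow tier
  opt.tiered_fast_prefix = "/fast/db";   // hypothetical mount points
  opt.tiered_slow_prefix = "/slow/db";
  std::string a = leveldb::TableFileName(opt, 123, 1);  // "/fast/db/sst_1/000123.sst"
  std::string b = leveldb::TableFileName(opt, 123, 5);  // "/slow/db/sst_5/000123.sst"
}
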
std::string LogFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "log");
}

std::string TableFileName(const std::string& name, uint64_t number) {
std::string TableFileName(const Options & options, uint64_t number, int level) {
assert(number > 0);
return MakeFileName(name, number, "ldb");
}

std::string SSTTableFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "sst");
return MakeFileName2(options, number, level, "sst");
}

std::string DescriptorFileName(const std::string& dbname, uint64_t number) {

@@ -69,6 +105,36 @@ std::string OldInfoLogFileName(const std::string& dbname) {
return dbname + "/LOG.old";
}

//
std::string CowFileName(const std::string& dbname) {
return dbname + "/COW";
}


// Append appropriate "backup" string to input path
std::string BackupPath(const std::string& dbname, int backup_num) {
std::string dirname;

char buf[100];
if (0 != backup_num)
snprintf(buf, sizeof(buf), "/backup.%-d", backup_num);
else
snprintf(buf, sizeof(buf), "/backup");

return(dbname + buf);
}


// update tiered_fast_prefix and tiered_slow_prefix members of
//  given Options object to point to desired backup path
bool SetBackupPaths(Options & options, int backup_num) {

options.tiered_fast_prefix = BackupPath(options.tiered_fast_prefix, backup_num);
options.tiered_slow_prefix = BackupPath(options.tiered_slow_prefix, backup_num);

return(true);
}

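BackupPath() simply suffixes a path, and SetBackupPaths() applies it to both tier prefixes, so every subsequently generated filename lands under the backup tree. Expected values, as a sketch (assuming the declaration lives in db/filename.h, as the other helpers here do):

#include <assert.h>
#include "db/filename.h"

// Sketch: where backups land, per BackupPath()/SetBackupPaths() above
// ("/fast/db" is a hypothetical prefix).
void BackupPathExamples() {
  assert(leveldb::BackupPath("/fast/db", 0) == "/fast/db/backup");
  assert(leveldb::BackupPath("/fast/db", 2) == "/fast/db/backup.2");
}
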
||||
|
||||
// Owned filenames have the form:
|
||||
// dbname/CURRENT
|
||||
|
@ -76,7 +142,8 @@ std::string OldInfoLogFileName(const std::string& dbname) {
|
|||
// dbname/LOG
|
||||
// dbname/LOG.old
|
||||
// dbname/MANIFEST-[0-9]+
|
||||
// dbname/[0-9]+.(log|sst|ldb)
|
||||
// dbname/[0-9]+.(log|sst)
|
||||
// dbname/COW
|
||||
bool ParseFileName(const std::string& fname,
|
||||
uint64_t* number,
|
||||
FileType* type) {
|
||||
|
@ -84,6 +151,9 @@ bool ParseFileName(const std::string& fname,
|
|||
if (rest == "CURRENT") {
|
||||
*number = 0;
|
||||
*type = kCurrentFile;
|
||||
} else if (rest == "COW") {
|
||||
*number = 0;
|
||||
*type = kCacheWarming;
|
||||
} else if (rest == "LOCK") {
|
||||
*number = 0;
|
||||
*type = kDBLockFile;
|
||||
|
@ -111,7 +181,7 @@ bool ParseFileName(const std::string& fname,
|
|||
Slice suffix = rest;
|
||||
if (suffix == Slice(".log")) {
|
||||
*type = kLogFile;
|
||||
} else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) {
|
||||
} else if (suffix == Slice(".sst")) {
|
||||
*type = kTableFile;
|
||||
} else if (suffix == Slice(".dbtmp")) {
|
||||
*type = kTempFile;
|
||||
|
@ -141,4 +211,99 @@ Status SetCurrentFile(Env* env, const std::string& dbname,
|
|||
return s;
|
||||
}
|
||||
|
||||
|
||||
Status
|
||||
MakeLevelDirectories(Env * env, const Options & options)
|
||||
{
|
||||
Status ret_stat;
|
||||
int level;
|
||||
std::string dirname;
|
||||
|
||||
for (level=0; level<config::kNumLevels && ret_stat.ok(); ++level)
|
||||
{
|
||||
dirname=MakeDirName2(options, level, "sst");
|
||||
|
||||
// ignoring error since no way to tell if "bad" error, or "already exists" error
|
||||
env->CreateDir(dirname.c_str());
|
||||
} // for
|
||||
|
||||
return(ret_stat);
|
||||
|
||||
} // MakeLevelDirectories
|
||||
|
||||
|
||||
bool
|
||||
TestForLevelDirectories(
|
||||
Env * env,
|
||||
const Options & options,
|
||||
Version * version)
|
||||
{
|
||||
bool ret_flag, again;
|
||||
int level;
|
||||
std::string dirname;
|
||||
|
||||
ret_flag=true;
|
||||
again=true;
|
||||
|
||||
// walk backwards, fault will be in higher levels if partial conversion
|
||||
for (level=config::kNumLevels-1; 0<=level && again; --level)
|
||||
{
|
||||
again=false;
|
||||
|
||||
// does directory exist
|
||||
dirname=MakeDirName2(options, level, "sst");
|
||||
ret_flag=env->FileExists(dirname.c_str());
|
||||
|
||||
// do all files exist in level
|
||||
if (ret_flag)
|
||||
{
|
||||
const std::vector<FileMetaData*> & level_files(version->GetFileList(level));
|
||||
std::vector<FileMetaData*>::const_iterator it;
|
||||
std::string table_name;
|
||||
Status s;
|
||||
|
||||
for (it=level_files.begin(); level_files.end()!=it && ret_flag; ++it)
|
||||
{
|
||||
table_name=TableFileName(options, (*it)->number, level);
|
||||
ret_flag=env->FileExists(table_name.c_str());
|
||||
} // for
|
||||
|
||||
again=ret_flag && 0==level_files.size();
|
||||
} // if
|
||||
} // for
|
||||
|
||||
return(ret_flag);
|
||||
|
||||
} // TestForLevelDirectories
|
||||
|
||||
std::string // replacement dbname ... potentially tiered
|
||||
MakeTieredDbname(
|
||||
const std::string & dbname, // input ... original dbname from DBImpl constructor
|
||||
Options & options) // input/output ... writable Options, tiered values changed
|
||||
{
|
||||
// case for "", used with internal calls to DestroyDB
|
||||
if (0==dbname.size() && 0!=options.tiered_fast_prefix.size())
|
||||
{
|
||||
// do NOTHING ... options already initialized
|
||||
} // if
|
||||
else if (0<(int)options.tiered_slow_level && (int)options.tiered_slow_level<config::kNumLevels
|
||||
&& 0!=options.tiered_fast_prefix.size() && 0!=options.tiered_slow_prefix.size())
|
||||
{
|
||||
options.tiered_fast_prefix.append("/");
|
||||
options.tiered_fast_prefix.append(dbname);
|
||||
|
||||
options.tiered_slow_prefix.append("/");
|
||||
options.tiered_slow_prefix.append(dbname);
|
||||
} // else if
|
||||
else
|
||||
{
|
||||
options.tiered_slow_level=0;
|
||||
options.tiered_fast_prefix=dbname; // duplicate as is
|
||||
options.tiered_slow_prefix=dbname;
|
||||
} // else
|
||||
|
||||
return(options.tiered_fast_prefix);
|
||||
|
||||
} // MakeTieredDbname
} // namespace leveldb
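
To make the MakeTieredDbname() rules above concrete, a short sketch. It is illustrative only; the paths mirror the FileNameTest cases later in this diff.

// Illustrative sketch: tiered prefixes get "/" + dbname appended when tiering
// is active; an empty dbname (the DestroyDB case) leaves options untouched.
#include <cassert>
#include <string>
#include "db/filename.h"
#include "leveldb/options.h"

void TieredDbnameSketch() {
  using namespace leveldb;
  Options options;
  options.tiered_slow_level = 4;            // levels 4+ live on the slow tier
  options.tiered_fast_prefix = "//mnt/fast";
  options.tiered_slow_prefix = "//mnt/slow";

  std::string dbname = MakeTieredDbname("riak/data/leveldb", options);
  assert(dbname == "//mnt/fast/riak/data/leveldb");
  assert(options.tiered_slow_prefix == "//mnt/slow/riak/data/leveldb");

  // Special case: empty dbname returns the already-initialized fast prefix.
  assert(MakeTieredDbname("", options) == "//mnt/fast/riak/data/leveldb");
}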

@@ -9,6 +9,7 @@

#include <stdint.h>
#include <string>
#include "leveldb/options.h"
#include "leveldb/slice.h"
#include "leveldb/status.h"
#include "port/port.h"

@@ -16,6 +17,7 @@
namespace leveldb {

class Env;
class Version;

enum FileType {
kLogFile,

@@ -24,9 +26,24 @@ enum FileType {
kDescriptorFile,
kCurrentFile,
kTempFile,
kInfoLogFile // Either the current one, or an old one
kInfoLogFile, // Either the current one, or an old one
kCacheWarming
};

// Riak specific routine to help create sst_? subdirectory names
std::string MakeDirName2(const Options & options,
int level, const char* suffix);

// Riak specific routine to help create sst_? subdirectories
Status MakeLevelDirectories(Env * env, const Options & options);

// Riak specific routine to test if sst_? subdirectories exist
bool TestForLevelDirectories(Env * env, const Options & options, class Version *);

// Riak specific routine to standardize conversion of dbname and
// Options' tiered directories (options parameter is MODIFIED)
std::string MakeTieredDbname(const std::string &dbname, Options & options_rw);

// Return the name of the log file with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".

@@ -35,12 +52,8 @@ extern std::string LogFileName(const std::string& dbname, uint64_t number);
// Return the name of the sstable with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string TableFileName(const std::string& dbname, uint64_t number);

// Return the legacy file name for an sstable with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string SSTTableFileName(const std::string& dbname, uint64_t number);
extern std::string TableFileName(const Options & options, uint64_t number,
int level);

// Return the name of the descriptor file for the db named by
// "dbname" and the specified incarnation number. The result will be

@@ -67,10 +80,21 @@ extern std::string InfoLogFileName(const std::string& dbname);
// Return the name of the old info log file for "dbname".
extern std::string OldInfoLogFileName(const std::string& dbname);

// Return the name of the cache object file for the db named by
// "dbname". The result will be prefixed with "dbname".
extern std::string CowFileName(const std::string& dbname);

// Append appropriate "backup" string to input path
extern std::string BackupPath(const std::string& dbname, int backup_num);

// update tiered_fast_prefix and tiered_slow_prefix members of
// given Options object to point to backup path
extern bool SetBackupPaths(Options & options, int backup_num);

// If filename is a leveldb file, store the type of the file in *type.
// The number encoded in the filename is stored in *number. If the
// filename was successfully parsed, returns true. Else return false.
extern bool ParseFileName(const std::string& filename,
extern bool ParseFileName(const std::string& tiered_filename,
uint64_t* number,
FileType* type);
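
The level-aware TableFileName() overload above places each table in an sst_<level> subdirectory of the appropriate tier. A hedged sketch follows; it is illustrative only, the tests below pin just the directory prefixes, and the six-digit file-number format is the conventional leveldb one, assumed here.

// Illustrative sketch: level-aware table paths under tiered prefixes.
#include <cassert>
#include <string>
#include "db/filename.h"
#include "leveldb/options.h"

void TableFileNameSketch() {
  using namespace leveldb;
  Options options;
  options.tiered_fast_prefix = "bar";
  options.tiered_slow_prefix = "bar";

  // Level 1, table number 200: "bar/sst_1/..." (e.g. bar/sst_1/000200.sst
  // under the usual six-digit numbering).
  std::string fname = TableFileName(options, 200, 1);
  assert(fname.compare(0, 10, "bar/sst_1/") == 0);

  // With tiered_slow_level = 4, level 4 and above use the slow prefix.
  options.tiered_slow_level = 4;
  options.tiered_fast_prefix = "fast";
  options.tiered_slow_prefix = "slow";
  assert(TableFileName(options, 600, 4).compare(0, 11, "slow/sst_4/") == 0);
}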

@@ -27,7 +27,6 @@ TEST(FileNameTest, Parse) {
{ "100.log", 100, kLogFile },
{ "0.log", 0, kLogFile },
{ "0.sst", 0, kTableFile },
{ "0.ldb", 0, kTableFile },
{ "CURRENT", 0, kCurrentFile },
{ "LOCK", 0, kDBLockFile },
{ "MANIFEST-2", 2, kDescriptorFile },

@@ -71,13 +70,14 @@ TEST(FileNameTest, Parse) {
for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
std::string f = errors[i];
ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
}
};
}

TEST(FileNameTest, Construction) {
uint64_t number;
FileType type;
std::string fname;
Options options;

fname = CurrentFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));

@@ -97,12 +97,40 @@ TEST(FileNameTest, Construction) {
ASSERT_EQ(192, number);
ASSERT_EQ(kLogFile, type);

fname = TableFileName("bar", 200);
options.tiered_fast_prefix="bar";
options.tiered_slow_prefix="bar";
fname = TableFileName(options, 200, 1);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ("sst_1/", std::string(fname.substr(4,6)));
ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type));
ASSERT_EQ(200, number);
ASSERT_EQ(kTableFile, type);

fname = TableFileName(options, 400, 4);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_EQ("sst_4/", std::string(fname.substr(4,6)));
ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type));
ASSERT_EQ(400, number);
ASSERT_EQ(kTableFile, type);

options.tiered_slow_level=4;
options.tiered_fast_prefix="fast";
options.tiered_slow_prefix="slow";
fname = TableFileName(options, 500, 3);
ASSERT_EQ("fast/", std::string(fname.data(), 5));
ASSERT_EQ("sst_3/", std::string(fname.substr(5,6)));
ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type));
ASSERT_EQ(500, number);
ASSERT_EQ(kTableFile, type);

fname = TableFileName(options, 600, 4);
ASSERT_EQ("slow/", std::string(fname.data(), 5));
ASSERT_EQ("sst_4/", std::string(fname.substr(5,6)));
ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type));
ASSERT_EQ(600, number);
ASSERT_EQ(kTableFile, type);

fname = DescriptorFileName("bar", 100);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));

@@ -114,6 +142,48 @@ TEST(FileNameTest, Construction) {
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(999, number);
ASSERT_EQ(kTempFile, type);

fname = CowFileName("/what/goes/moo");
ASSERT_EQ("/what/goes/moo/COW", fname);

fname = BackupPath("/var/db/riak/data/leveldb/0",0);
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup", fname);

fname = BackupPath("/var/db/riak/data/leveldb/0",1);
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.1", fname);

fname = BackupPath("/var/db/riak/data/leveldb/0",5);
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.5", fname);

options.tiered_slow_level=4;
options.tiered_fast_prefix="fast";
options.tiered_slow_prefix="slow";
fname = SetBackupPaths(options,0);
ASSERT_EQ("fast/backup", options.tiered_fast_prefix);
ASSERT_EQ("slow/backup", options.tiered_slow_prefix);

options.tiered_slow_level=4;
options.tiered_fast_prefix="fast";
options.tiered_slow_prefix="slow";
fname = SetBackupPaths(options,3);
ASSERT_EQ("fast/backup.3", options.tiered_fast_prefix);
ASSERT_EQ("slow/backup.3", options.tiered_slow_prefix);

options.tiered_slow_level=4;
options.tiered_fast_prefix="//mnt/fast";
options.tiered_slow_prefix="//mnt/slow";
fname=MakeTieredDbname("riak/data/leveldb", options);
ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname);
ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix);
ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix);

// special case with no dbname given, should have no changes
fname=MakeTieredDbname("", options);
ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname);
ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix);
ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix);

}

} // namespace leveldb

@@ -1,65 +0,0 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <stdio.h>
#include "leveldb/dumpfile.h"
#include "leveldb/env.h"
#include "leveldb/status.h"

namespace leveldb {
namespace {

class StdoutPrinter : public WritableFile {
public:
virtual Status Append(const Slice& data) {
fwrite(data.data(), 1, data.size(), stdout);
return Status::OK();
}
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
virtual std::string GetName() const { return "[stdout]"; }
};

bool HandleDumpCommand(Env* env, char** files, int num) {
StdoutPrinter printer;
bool ok = true;
for (int i = 0; i < num; i++) {
Status s = DumpFile(env, files[i], &printer);
if (!s.ok()) {
fprintf(stderr, "%s\n", s.ToString().c_str());
ok = false;
}
}
return ok;
}

} // namespace
} // namespace leveldb

static void Usage() {
fprintf(
stderr,
"Usage: leveldbutil command...\n"
" dump files... -- dump contents of specified files\n"
);
}

int main(int argc, char** argv) {
leveldb::Env* env = leveldb::Env::Default();
bool ok = true;
if (argc < 2) {
Usage();
ok = false;
} else {
std::string command = argv[1];
if (command == "dump") {
ok = leveldb::HandleDumpCommand(env, argv+2, argc-2);
} else {
Usage();
ok = false;
}
}
return (ok ? 0 : 1);
}
@@ -3,7 +3,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Log format information shared by reader and writer.
// See ../doc/log_format.md for more detail.
// See ../doc/log_format.txt for more detail.

#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_

@@ -26,8 +26,8 @@ static const int kMaxRecordType = kLastType;

static const int kBlockSize = 32768;

// Header is checksum (4 bytes), length (2 bytes), type (1 byte).
static const int kHeaderSize = 4 + 2 + 1;
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
static const int kHeaderSize = 4 + 1 + 2;

} // namespace log
} // namespace leveldb
@@ -25,8 +25,7 @@ Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
eof_(false),
last_record_offset_(0),
end_of_buffer_offset_(0),
initial_offset_(initial_offset),
resyncing_(initial_offset > 0) {
initial_offset_(initial_offset) {
}

Reader::~Reader() {

@@ -73,25 +72,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {

Slice fragment;
while (true) {
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
const unsigned int record_type = ReadPhysicalRecord(&fragment);

// ReadPhysicalRecord may have only had an empty trailer remaining in its
// internal buffer. Calculate the offset of the next physical record now
// that it has returned, properly accounting for its header size.
uint64_t physical_record_offset =
end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size();

if (resyncing_) {
if (record_type == kMiddleType) {
continue;
} else if (record_type == kLastType) {
resyncing_ = false;
continue;
} else {
resyncing_ = false;
}
}

switch (record_type) {
case kFullType:
if (in_fragmented_record) {

@@ -151,9 +133,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {

case kEof:
if (in_fragmented_record) {
// This can be caused by the writer dying immediately after
// writing a physical record but before completing the next; don't
// treat it as a corruption, just ignore the entire logical record.
ReportCorruption(scratch->size(), "partial record without end(3)");
scratch->clear();
}
return false;

@@ -185,20 +165,20 @@ uint64_t Reader::LastRecordOffset() {
return last_record_offset_;
}

void Reader::ReportCorruption(uint64_t bytes, const char* reason) {
ReportDrop(bytes, Status::Corruption(reason, file_->GetName()));
void Reader::ReportCorruption(size_t bytes, const char* reason) {
ReportDrop(bytes, Status::Corruption(reason));
}

void Reader::ReportDrop(uint64_t bytes, const Status& reason) {
void Reader::ReportDrop(size_t bytes, const Status& reason) {
if (reporter_ != NULL &&
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
reporter_->Corruption(static_cast<size_t>(bytes), reason);
reporter_->Corruption(bytes, reason);
}
}

unsigned int Reader::ReadPhysicalRecord(Slice* result) {
while (true) {
if (buffer_.size() < kHeaderSize) {
if (buffer_.size() < (size_t)kHeaderSize) {
if (!eof_) {
// Last read was a full read, so this is a trailer to skip
buffer_.clear();

@@ -209,16 +189,17 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
ReportDrop(kBlockSize, status);
eof_ = true;
return kEof;
} else if (buffer_.size() < kBlockSize) {
} else if (buffer_.size() < (size_t)kBlockSize) {
eof_ = true;
}
continue;
} else if (buffer_.size() == 0) {
// End of file
return kEof;
} else {
// Note that if buffer_ is non-empty, we have a truncated header at the
// end of the file, which can be caused by the writer crashing in the
// middle of writing the header. Instead of considering this an error,
// just report EOF.
size_t drop_size = buffer_.size();
buffer_.clear();
ReportCorruption(drop_size, "truncated record at end of file");
return kEof;
}
}

@@ -232,15 +213,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
if (kHeaderSize + length > buffer_.size()) {
size_t drop_size = buffer_.size();
buffer_.clear();
if (!eof_) {
ReportCorruption(drop_size, "bad record length");
return kBadRecord;
}
// If the end of the file has been reached without reading |length| bytes
// of payload, assume the writer died in the middle of writing the record.
// Don't report a corruption.
return kEof;
}

if (type == kZeroType && length == 0) {
// Skip zero length record without reporting any drops since
@@ -73,11 +73,6 @@ class Reader {
// Offset at which to start looking for the first record to return
uint64_t const initial_offset_;

// True if we are resynchronizing after a seek (initial_offset_ > 0). In
// particular, a run of kMiddleType and kLastType records can be silently
// skipped in this mode
bool resyncing_;

// Extend record types with the following special values
enum {
kEof = kMaxRecordType + 1,

@@ -99,8 +94,8 @@ class Reader {

// Reports dropped bytes to the reporter.
// buffer_ must be updated to remove the dropped bytes prior to invocation.
void ReportCorruption(uint64_t bytes, const char* reason);
void ReportDrop(uint64_t bytes, const Status& reason);
void ReportCorruption(size_t bytes, const char* reason);
void ReportDrop(size_t bytes, const Status& reason);

// No copying allowed
Reader(const Reader&);
@@ -79,7 +79,7 @@ class LogTest {
virtual Status Skip(uint64_t n) {
if (n > contents_.size()) {
contents_.clear();
return Status::NotFound("in-memory file skipped past end");
}

contents_.remove_prefix(n);
@@ -104,34 +104,23 @@ class LogTest {
StringSource source_;
ReportCollector report_;
bool reading_;
Writer* writer_;
Reader* reader_;
Writer writer_;
Reader reader_;

// Record metadata for testing initial offset functionality
static size_t initial_offset_record_sizes_[];
static uint64_t initial_offset_last_record_offsets_[];
static int num_initial_offset_records_;

public:
LogTest() : reading_(false),
writer_(new Writer(&dest_)),
reader_(new Reader(&source_, &report_, true/*checksum*/,
0/*initial_offset*/)) {
}

~LogTest() {
delete writer_;
delete reader_;
}

void ReopenForAppend() {
delete writer_;
writer_ = new Writer(&dest_, dest_.contents_.size());
writer_(&dest_),
reader_(&source_, &report_, true/*checksum*/,
0/*initial_offset*/) {
}

void Write(const std::string& msg) {
ASSERT_TRUE(!reading_) << "Write() after starting to read";
writer_->AddRecord(Slice(msg));
writer_.AddRecord(Slice(msg));
}

size_t WrittenBytes() const {

@@ -145,7 +134,7 @@ class LogTest {
}
std::string scratch;
Slice record;
if (reader_->ReadRecord(&record, &scratch)) {
if (reader_.ReadRecord(&record, &scratch)) {
return record.ToString();
} else {
return "EOF";

@@ -193,18 +182,13 @@ class LogTest {
}

void WriteInitialOffsetLog() {
for (int i = 0; i < num_initial_offset_records_; i++) {
for (int i = 0; i < 4; i++) {
std::string record(initial_offset_record_sizes_[i],
static_cast<char>('a' + i));
Write(record);
}
}

void StartReadingAt(uint64_t initial_offset) {
delete reader_;
reader_ = new Reader(&source_, &report_, true/*checksum*/, initial_offset);
}

void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
WriteInitialOffsetLog();
reading_ = true;

@@ -224,11 +208,6 @@ class LogTest {
source_.contents_ = Slice(dest_.contents_);
Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
initial_offset);

// Read all records from expected_record_offset through the last one.
ASSERT_LT(expected_record_offset, num_initial_offset_records_);
for (; expected_record_offset < num_initial_offset_records_;
++expected_record_offset) {
Slice record;
std::string scratch;
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));

@@ -237,35 +216,24 @@ class LogTest {
ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
offset_reader->LastRecordOffset());
ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
}
delete offset_reader;
}

};

size_t LogTest::initial_offset_record_sizes_[] =
{10000, // Two sizable records in first block
10000,
2 * log::kBlockSize - 1000, // Span three blocks
1,
13716, // Consume all but two bytes of block 3.
log::kBlockSize - kHeaderSize, // Consume the entirety of block 4.
};
1};

uint64_t LogTest::initial_offset_last_record_offsets_[] =
{0,
kHeaderSize + 10000,
2 * (kHeaderSize + 10000),
2 * (kHeaderSize + 10000) +
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
2 * (kHeaderSize + 10000) +
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize
+ kHeaderSize + 1,
3 * log::kBlockSize,
};
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize};

// LogTest::initial_offset_last_record_offsets_ must be defined before this.
int LogTest::num_initial_offset_records_ =
sizeof(LogTest::initial_offset_last_record_offsets_)/sizeof(uint64_t);
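
To make the trimmed offset table concrete: with kHeaderSize = 7 (both field orderings sum to 7) and kBlockSize = 32768, the record sizes {10000, 10000, 2 * 32768 - 1000, 1} put the record starts at 0, then 7 + 10000 = 10007, then 2 * 10007 = 20014, and finally 20014 + 64536 + 3 * 7 = 84571; the third record pays three headers because it spans three blocks.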

TEST(LogTest, Empty) {
ASSERT_EQ("EOF", Read());

@@ -350,15 +318,6 @@ TEST(LogTest, AlignedEof) {
ASSERT_EQ("EOF", Read());
}

TEST(LogTest, OpenForAppend) {
Write("hello");
ReopenForAppend();
Write("world");
ASSERT_EQ("hello", Read());
ASSERT_EQ("world", Read());
ASSERT_EQ("EOF", Read());
}

TEST(LogTest, RandomRead) {
const int N = 500;
Random write_rnd(301);

@@ -392,32 +351,20 @@ TEST(LogTest, BadRecordType) {
ASSERT_EQ("OK", MatchError("unknown record type"));
}

TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
TEST(LogTest, TruncatedTrailingRecord) {
Write("foo");
ShrinkSize(4); // Drop all payload as well as a header byte
ASSERT_EQ("EOF", Read());
// Truncated last record is ignored, not treated as an error.
ASSERT_EQ(0, DroppedBytes());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
ASSERT_EQ("OK", MatchError("truncated record at end of file"));
}

TEST(LogTest, BadLength) {
const int kPayloadSize = kBlockSize - kHeaderSize;
Write(BigString("bar", kPayloadSize));
Write("foo");
// Least significant size byte is stored in header[4].
IncrementByte(4, 1);
ASSERT_EQ("foo", Read());
ASSERT_EQ(kBlockSize, DroppedBytes());
ASSERT_EQ("OK", MatchError("bad record length"));
}

TEST(LogTest, BadLengthAtEndIsIgnored) {
Write("foo");
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(0, DroppedBytes());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
ASSERT_EQ("OK", MatchError("bad record length"));
}

TEST(LogTest, ChecksumMismatch) {

@@ -468,40 +415,6 @@ TEST(LogTest, UnexpectedFirstType) {
ASSERT_EQ("OK", MatchError("partial record without end"));
}

TEST(LogTest, MissingLastIsIgnored) {
Write(BigString("bar", kBlockSize));
// Remove the LAST block, including header.
ShrinkSize(14);
ASSERT_EQ("EOF", Read());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(0, DroppedBytes());
}

TEST(LogTest, PartialLastIsIgnored) {
Write(BigString("bar", kBlockSize));
// Cause a bad record length in the LAST block.
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(0, DroppedBytes());
}

TEST(LogTest, SkipIntoMultiRecord) {
// Consider a fragmented record:
// first(R1), middle(R1), last(R1), first(R2)
// If initial_offset points to a record after first(R1) but before first(R2)
// incomplete fragment errors are not actual errors, and must be suppressed
// until a new first or full record is encountered.
Write(BigString("foo", 3*kBlockSize));
Write("correct");
StartReadingAt(kBlockSize);

ASSERT_EQ("correct", Read());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(0, DroppedBytes());
ASSERT_EQ("EOF", Read());
}

TEST(LogTest, ErrorJoinsRecords) {
// Consider two fragmented records:
// first(R1) last(R1) first(R2) last(R2)

@@ -520,7 +433,7 @@ TEST(LogTest, ErrorJoinsRecords) {

ASSERT_EQ("correct", Read());
ASSERT_EQ("EOF", Read());
const size_t dropped = DroppedBytes();
const int dropped = DroppedBytes();
ASSERT_LE(dropped, 2*kBlockSize + 100);
ASSERT_GE(dropped, 2*kBlockSize);
}

@@ -571,10 +484,6 @@ TEST(LogTest, ReadFourthStart) {
3);
}

TEST(LogTest, ReadInitialOffsetIntoBlockPadding) {
CheckInitialOffsetRecord(3 * log::kBlockSize - 3, 5);
}

TEST(LogTest, ReadEnd) {
CheckOffsetPastEndReturnsNoRecords(0);
}

@@ -12,22 +12,13 @@
namespace leveldb {
namespace log {

static void InitTypeCrc(uint32_t* type_crc) {
for (int i = 0; i <= kMaxRecordType; i++) {
char t = static_cast<char>(i);
type_crc[i] = crc32c::Value(&t, 1);
}
}

Writer::Writer(WritableFile* dest)
: dest_(dest),
block_offset_(0) {
InitTypeCrc(type_crc_);
for (int i = 0; i <= kMaxRecordType; i++) {
char t = static_cast<char>(i);
type_crc_[i] = crc32c::Value(&t, 1);
}

Writer::Writer(WritableFile* dest, uint64_t dest_length)
: dest_(dest), block_offset_(dest_length % kBlockSize) {
InitTypeCrc(type_crc_);
}

Writer::~Writer() {

@@ -83,7 +74,7 @@ Status Writer::AddRecord(const Slice& slice) {

Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
assert(n <= 0xffff); // Must fit in two bytes
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
assert(block_offset_ + kHeaderSize + (int)n <= kBlockSize);

// Format the header
char buf[kHeaderSize];

@@ -9,11 +9,10 @@
#include "db/log_format.h"
#include "leveldb/slice.h"
#include "leveldb/status.h"
#include "leveldb/env.h"

namespace leveldb {

class WritableFile;

namespace log {

class Writer {

@@ -22,16 +21,12 @@ class Writer {
// "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use.
explicit Writer(WritableFile* dest);

// Create a writer that will append data to "*dest".
// "*dest" must have initial length "dest_length".
// "*dest" must remain live while this Writer is in use.
Writer(WritableFile* dest, uint64_t dest_length);

~Writer();

Status AddRecord(const Slice& slice);

void Close() {delete dest_; dest_=NULL;};

private:
WritableFile* dest_;
int block_offset_; // Current offset in block

@@ -6,6 +6,7 @@
#include "db/dbformat.h"
#include "leveldb/comparator.h"
#include "leveldb/env.h"
#include "leveldb/expiry.h"
#include "leveldb/iterator.h"
#include "util/coding.h"

@@ -63,6 +64,8 @@ class MemTableIterator: public Iterator {
Slice key_slice = GetLengthPrefixedSlice(iter_.key());
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
}
virtual KeyMetaData & keymetadata() const
{MemTable::DecodeKeyMetaData(iter_.key(), keymetadata_); return(keymetadata_);};

virtual Status status() const { return Status::OK(); }

@@ -81,7 +84,8 @@ Iterator* MemTable::NewIterator() {

void MemTable::Add(SequenceNumber s, ValueType type,
const Slice& key,
const Slice& value) {
const Slice& value,
const ExpiryTimeMicros & expiry) {
// Format of an entry is concatenation of:
// key_size : varint32 of internal_key.size()
// key bytes : char[internal_key.size()]

@@ -89,7 +93,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
// value bytes : char[value.size()]
size_t key_size = key.size();
size_t val_size = value.size();
size_t internal_key_size = key_size + 8;
size_t internal_key_size = key_size + KeySuffixSize(type);
const size_t encoded_len =
VarintLength(internal_key_size) + internal_key_size +
VarintLength(val_size) + val_size;

@@ -97,15 +101,22 @@ void MemTable::Add(SequenceNumber s, ValueType type,
char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size);
p += key_size;
if (IsExpiryKey(type))
{
EncodeFixed64(p, expiry);
p+=8;
}
EncodeFixed64(p, (s << 8) | type);
p += 8;
p = EncodeVarint32(p, val_size);
memcpy(p, value.data(), val_size);
assert(p + val_size == buf + encoded_len);
assert((size_t)((p + val_size) - buf) == encoded_len);
table_.Insert(buf);
}
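
A self-contained sketch of the sizing arithmetic above. It is illustrative only: VarintLength is re-implemented locally rather than taken from util/coding, and the 8-versus-16-byte suffix mirrors the optional-expiry layout in the format comment (tag only, or expiry plus tag).

// Illustrative sketch: size of a memtable entry under the new layout.
//   varint32(internal_key_size) | user key | [expiry uint64, expiry keys only]
//   | tag uint64 ((seq << 8) | type) | varint32(value_size) | value
#include <cstddef>
#include <cstdint>

static size_t SketchVarintLength(uint64_t v) {
  size_t len = 1;
  while (v >= 128) { v >>= 7; ++len; }
  return len;
}

size_t SketchEncodedEntryLength(size_t key_size, size_t value_size,
                                bool is_expiry_key) {
  const size_t suffix = is_expiry_key ? 16 : 8;  // (expiry +) tag
  const size_t internal_key_size = key_size + suffix;
  return SketchVarintLength(internal_key_size) + internal_key_size +
         SketchVarintLength(value_size) + value_size;
}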

bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
bool MemTable::Get(const LookupKey& key, Value* value, Status* s,
const Options * options) {
bool ret_flag(false);
Slice memkey = key.memtable_key();
Table::Iterator iter(&table_);
iter.Seek(memkey.data());

@@ -113,6 +124,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
// entry format is:
// klength varint32
// userkey char[klength]
// optional uint64
// tag uint64
// vlength varint32
// value char[vlength]

@@ -122,24 +134,66 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
const char* entry = iter.key();
uint32_t key_length;
const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length);
Slice internal_key(key_ptr, key_length);
if (comparator_.comparator.user_comparator()->Compare(
Slice(key_ptr, key_length - 8),
ExtractUserKey(internal_key),
key.user_key()) == 0) {
// Correct user key
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
switch (static_cast<ValueType>(tag & 0xff)) {
case kTypeValue: {
KeyMetaData meta;
DecodeKeyMetaData(entry, meta);

switch (meta.m_Type) {
case kTypeValueWriteTime:
case kTypeValueExplicitExpiry:
{
bool expired=false;
if (NULL!=options && options->ExpiryActivated())
expired=options->expiry_module->MemTableCallback(internal_key);
if (expired)
{
// like kTypeDeletion
*s = Status::NotFound(Slice());
ret_flag=true;
break;
} // if
//otherwise fall into kTypeValue code
} // case

case kTypeValue:
{
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
value->assign(v.data(), v.size());
return true;
ret_flag=true;
break;
}
case kTypeDeletion:
*s = Status::NotFound(Slice());
return true;
ret_flag=true;
break;
} // switch

// only unpack metadata if requested
if (key.WantsKeyMetaData())
key.SetKeyMetaData(meta);
}
}
}
return false;
return ret_flag;
}

// this is a static function
void MemTable::DecodeKeyMetaData(
const char * key,
KeyMetaData & meta)
{
Slice key_slice = GetLengthPrefixedSlice(key);

meta.m_Type=ExtractValueType(key_slice);
meta.m_Sequence=ExtractSequenceNumber(key_slice);
if (IsExpiryKey(meta.m_Type))
meta.m_Expiry=ExtractExpiry(key_slice);
else
meta.m_Expiry=0;

} // DecodeKeyMetaData

} // namespace leveldb

@@ -24,10 +24,10 @@ class MemTable {
explicit MemTable(const InternalKeyComparator& comparator);

// Increase reference count.
void Ref() { ++refs_; }
void Ref() volatile { ++refs_; }

// Drop reference count. Delete if no more references exist.
void Unref() {
void Unref() volatile {
--refs_;
assert(refs_ >= 0);
if (refs_ <= 0) {

@@ -36,7 +36,10 @@ class MemTable {
}

// Returns an estimate of the number of bytes of data in use by this
// data structure. It is safe to call when MemTable is being modified.
// data structure.
//
// REQUIRES: external synchronization to prevent simultaneous
// operations on the same MemTable.
size_t ApproximateMemoryUsage();

// Return an iterator that yields the contents of the memtable.

@@ -52,13 +55,17 @@ class MemTable {
// Typically value will be empty if type==kTypeDeletion.
void Add(SequenceNumber seq, ValueType type,
const Slice& key,
const Slice& value);
const Slice& value,
const ExpiryTimeMicros& expiry=0);

// If memtable contains a value for key, store it in *value and return true.
// If memtable contains a deletion for key, store a NotFound() error
// in *status and return true.
// Else, return false.
bool Get(const LookupKey& key, std::string* value, Status* s);
bool Get(const LookupKey& key, Value* value, Status* s, const Options * options);

// parse keymetadata from skiplist key string
static void DecodeKeyMetaData(const char * key, KeyMetaData & meta);

private:
~MemTable(); // Private since only Unref() should be used to delete it

@@ -69,7 +76,7 @@ class MemTable {
int operator()(const char* a, const char* b) const;
};
friend class MemTableIterator;
friend class MemTableBackwardIterator;
friend class MemTableBackwardIterator; // does not exist

typedef SkipList<const char*, KeyComparator> Table;

src/leveldb/db/penalty_test.cc (new file, 248 lines)

@@ -0,0 +1,248 @@
// -------------------------------------------------------------------
//
// penalty_test.cc
//
// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// -------------------------------------------------------------------

#include "util/testharness.h"
#include "util/testutil.h"

#include "leveldb/comparator.h"

#include "db/version_set.h"

/**
 * Execution routine
 */
int main(int argc, char** argv)
{
return leveldb::test::RunAllTests();
}

namespace leveldb {

class TestVersion : public Version
{
public:
TestVersion()
: Version(NULL)
{
int loop;

for (loop=0; loop<config::kNumLevels; ++loop)
{
m_FalseFile[loop].file_size=0;
m_LevelFileCount[loop]=0;
} // for
};

virtual size_t NumFiles(int level) const {return(m_LevelFileCount[level]);};

virtual const std::vector<FileMetaData*> & GetFileList(int level) const
{
m_FalseVector.clear();
m_FalseVector.push_back(&m_FalseFile[level]);
return(m_FalseVector);
};

mutable std::vector<FileMetaData*> m_FalseVector;
mutable FileMetaData m_FalseFile[config::kNumLevels];

size_t m_LevelFileCount[config::kNumLevels];

}; // class TestVersion

/**
 * Wrapper class for tests. Holds working variables
 * and helper functions.
 */
class PenaltyTester : public VersionSet
{
public:
PenaltyTester()
: m_IntCompare(m_Options.comparator), VersionSet("", &m_Options, NULL, &m_IntCompare)
{
};

~PenaltyTester()
{
};

Options m_Options;
InternalKeyComparator m_IntCompare;

}; // class PenaltyTester

/*******************
 * Form note:
 * using ASSERT_TRUE(0==version.WritePenalty());
 * instead of ASSERT_EQ / ASSERT_NE because WritePenalty
 * returns a volatile int, which older compilers believe is
 * not an equivalent type to a constant. RedHat 5, Solaris,
 * and SmartOS were giving grief.
 *******************/
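
A hypothetical two-liner showing the pattern the note describes (illustrative only; the function name is made up, not in the tree):

// Illustrative only: comparing through operator== yields a plain bool, which
// ASSERT_TRUE() accepts even when the operand is a volatile int that older
// compilers refuse to template-match inside ASSERT_EQ.
inline bool PenaltyIsZero(volatile int write_penalty) {
  return 0 == write_penalty;
}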

/**
 * Debug 1
 */
#if 0
TEST(PenaltyTester, Debug1)
{
TestVersion version;
int penalty;

m_Options.write_buffer_size=46416847;

version.m_FalseFile[2].file_size=1075676398;
version.m_LevelFileCount[1]=1;

UpdatePenalty(&version);

ASSERT_TRUE(0==version.WritePenalty());

} // test Debug1
#endif

/**
 * No penalty scenarios
 */
TEST(PenaltyTester, NoPenalty)
{
TestVersion version;
int level;

m_Options.write_buffer_size=46416847;

// nothing
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());

/**
 * Level 0
 * (overlapped level, penalty is count based)
 */
// no penalty
version.m_LevelFileCount[0]=config::kL0_CompactionTrigger;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());

version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());

#if 0 // needs rewrite to be time based
// threshold reached ... some penalty
version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger+1;
UpdatePenalty(&version);
ASSERT_TRUE(0!=version.WritePenalty());

// clean up
version.m_LevelFileCount[0]=0;

/**
 * Level 1
 * (overlapped level, penalty is count based)
 */
// no penalty
version.m_LevelFileCount[1]=config::kL0_CompactionTrigger;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());

version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());

// threshold reached ... some penalty
version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger+1;
UpdatePenalty(&version);
ASSERT_TRUE(0!=version.WritePenalty());

// clean up
version.m_LevelFileCount[1]=0;

/**
 * Level 2
 * (landing level, penalty size based)
 */
// no penalty
version.m_FalseFile[2].file_size=0;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());

version.m_FalseFile[2].file_size=VersionSet::DesiredBytesForLevel(2);
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());

version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());

version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2);
UpdatePenalty(&version);
ASSERT_TRUE(0!=version.WritePenalty());

// interaction rule with level 1
version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1;
version.m_LevelFileCount[1]=config::kL0_CompactionTrigger/2;
UpdatePenalty(&version);
ASSERT_TRUE(0!=version.WritePenalty());

// clean up
version.m_LevelFileCount[1]=0;
version.m_FalseFile[2].file_size=0;

/**
 * Level 3+
 * (landing level, penalty size based)
 */
for (level=3; level<config::kNumLevels; ++level)
{
// no penalty
version.m_FalseFile[level].file_size=0;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());

version.m_FalseFile[level].file_size=VersionSet::DesiredBytesForLevel(level);
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());

version.m_FalseFile[level].file_size=VersionSet::MaxBytesForLevel(level)-1;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());

version.m_FalseFile[level].file_size=VersionSet::MaxBytesForLevel(level);
UpdatePenalty(&version);
if ((config::kNumLevels-1)!=level)
ASSERT_TRUE(0!=version.WritePenalty());
else
ASSERT_TRUE(0==version.WritePenalty());

// clean up
version.m_FalseFile[level].file_size=0;
} // for
#endif
} // test NoPenalty

} // namespace leveldb

@@ -1,324 +0,0 @@
// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/write_batch.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"

namespace leveldb {

class RecoveryTest {
public:
RecoveryTest() : env_(Env::Default()), db_(NULL) {
dbname_ = test::TmpDir() + "/recovery_test";
DestroyDB(dbname_, Options());
Open();
}

~RecoveryTest() {
Close();
DestroyDB(dbname_, Options());
}

DBImpl* dbfull() const { return reinterpret_cast<DBImpl*>(db_); }
Env* env() const { return env_; }

bool CanAppend() {
WritableFile* tmp;
Status s = env_->NewAppendableFile(CurrentFileName(dbname_), &tmp);
delete tmp;
if (s.IsNotSupportedError()) {
return false;
} else {
return true;
}
}

void Close() {
delete db_;
db_ = NULL;
}

void Open(Options* options = NULL) {
Close();
Options opts;
if (options != NULL) {
opts = *options;
} else {
opts.reuse_logs = true; // TODO(sanjay): test both ways
opts.create_if_missing = true;
}
if (opts.env == NULL) {
opts.env = env_;
}
ASSERT_OK(DB::Open(opts, dbname_, &db_));
ASSERT_EQ(1, NumLogs());
}

Status Put(const std::string& k, const std::string& v) {
return db_->Put(WriteOptions(), k, v);
}

std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
std::string result;
Status s = db_->Get(ReadOptions(), k, &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}

std::string ManifestFileName() {
std::string current;
ASSERT_OK(ReadFileToString(env_, CurrentFileName(dbname_), &current));
size_t len = current.size();
if (len > 0 && current[len-1] == '\n') {
current.resize(len - 1);
}
return dbname_ + "/" + current;
}

std::string LogName(uint64_t number) {
return LogFileName(dbname_, number);
}

size_t DeleteLogFiles() {
std::vector<uint64_t> logs = GetFiles(kLogFile);
for (size_t i = 0; i < logs.size(); i++) {
ASSERT_OK(env_->DeleteFile(LogName(logs[i]))) << LogName(logs[i]);
}
return logs.size();
}

uint64_t FirstLogFile() {
return GetFiles(kLogFile)[0];
}

std::vector<uint64_t> GetFiles(FileType t) {
std::vector<std::string> filenames;
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
std::vector<uint64_t> result;
for (size_t i = 0; i < filenames.size(); i++) {
uint64_t number;
FileType type;
if (ParseFileName(filenames[i], &number, &type) && type == t) {
result.push_back(number);
}
}
return result;
}

int NumLogs() {
return GetFiles(kLogFile).size();
}

int NumTables() {
return GetFiles(kTableFile).size();
}

uint64_t FileSize(const std::string& fname) {
uint64_t result;
ASSERT_OK(env_->GetFileSize(fname, &result)) << fname;
return result;
}

void CompactMemTable() {
dbfull()->TEST_CompactMemTable();
}

// Directly construct a log file that sets key to val.
void MakeLogFile(uint64_t lognum, SequenceNumber seq, Slice key, Slice val) {
std::string fname = LogFileName(dbname_, lognum);
WritableFile* file;
ASSERT_OK(env_->NewWritableFile(fname, &file));
log::Writer writer(file);
WriteBatch batch;
batch.Put(key, val);
WriteBatchInternal::SetSequence(&batch, seq);
ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch)));
ASSERT_OK(file->Flush());
delete file;
}

private:
std::string dbname_;
Env* env_;
DB* db_;
};

TEST(RecoveryTest, ManifestReused) {
if (!CanAppend()) {
fprintf(stderr, "skipping test because env does not support appending\n");
return;
}
ASSERT_OK(Put("foo", "bar"));
Close();
std::string old_manifest = ManifestFileName();
Open();
ASSERT_EQ(old_manifest, ManifestFileName());
ASSERT_EQ("bar", Get("foo"));
Open();
ASSERT_EQ(old_manifest, ManifestFileName());
ASSERT_EQ("bar", Get("foo"));
}

TEST(RecoveryTest, LargeManifestCompacted) {
if (!CanAppend()) {
fprintf(stderr, "skipping test because env does not support appending\n");
return;
}
ASSERT_OK(Put("foo", "bar"));
Close();
std::string old_manifest = ManifestFileName();

// Pad with zeroes to make manifest file very big.
{
uint64_t len = FileSize(old_manifest);
WritableFile* file;
ASSERT_OK(env()->NewAppendableFile(old_manifest, &file));
std::string zeroes(3*1048576 - static_cast<size_t>(len), 0);
ASSERT_OK(file->Append(zeroes));
ASSERT_OK(file->Flush());
delete file;
}

Open();
std::string new_manifest = ManifestFileName();
ASSERT_NE(old_manifest, new_manifest);
ASSERT_GT(10000, FileSize(new_manifest));
ASSERT_EQ("bar", Get("foo"));

Open();
ASSERT_EQ(new_manifest, ManifestFileName());
ASSERT_EQ("bar", Get("foo"));
}

TEST(RecoveryTest, NoLogFiles) {
ASSERT_OK(Put("foo", "bar"));
ASSERT_EQ(1, DeleteLogFiles());
Open();
ASSERT_EQ("NOT_FOUND", Get("foo"));
Open();
ASSERT_EQ("NOT_FOUND", Get("foo"));
}

TEST(RecoveryTest, LogFileReuse) {
if (!CanAppend()) {
fprintf(stderr, "skipping test because env does not support appending\n");
return;
}
for (int i = 0; i < 2; i++) {
ASSERT_OK(Put("foo", "bar"));
if (i == 0) {
// Compact to ensure current log is empty
CompactMemTable();
}
Close();
ASSERT_EQ(1, NumLogs());
uint64_t number = FirstLogFile();
if (i == 0) {
ASSERT_EQ(0, FileSize(LogName(number)));
} else {
ASSERT_LT(0, FileSize(LogName(number)));
}
Open();
ASSERT_EQ(1, NumLogs());
ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
ASSERT_EQ("bar", Get("foo"));
Open();
ASSERT_EQ(1, NumLogs());
ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
ASSERT_EQ("bar", Get("foo"));
}
}

TEST(RecoveryTest, MultipleMemTables) {
// Make a large log.
const int kNum = 1000;
for (int i = 0; i < kNum; i++) {
char buf[100];
snprintf(buf, sizeof(buf), "%050d", i);
ASSERT_OK(Put(buf, buf));
}
ASSERT_EQ(0, NumTables());
Close();
ASSERT_EQ(0, NumTables());
ASSERT_EQ(1, NumLogs());
uint64_t old_log_file = FirstLogFile();

// Force creation of multiple memtables by reducing the write buffer size.
Options opt;
opt.reuse_logs = true;
opt.write_buffer_size = (kNum*100) / 2;
Open(&opt);
ASSERT_LE(2, NumTables());
ASSERT_EQ(1, NumLogs());
ASSERT_NE(old_log_file, FirstLogFile()) << "must not reuse log";
for (int i = 0; i < kNum; i++) {
char buf[100];
snprintf(buf, sizeof(buf), "%050d", i);
ASSERT_EQ(buf, Get(buf));
}
}

TEST(RecoveryTest, MultipleLogFiles) {
ASSERT_OK(Put("foo", "bar"));
Close();
ASSERT_EQ(1, NumLogs());

// Make a bunch of uncompacted log files.
uint64_t old_log = FirstLogFile();
MakeLogFile(old_log+1, 1000, "hello", "world");
MakeLogFile(old_log+2, 1001, "hi", "there");
MakeLogFile(old_log+3, 1002, "foo", "bar2");

// Recover and check that all log files were processed.
Open();
ASSERT_LE(1, NumTables());
ASSERT_EQ(1, NumLogs());
uint64_t new_log = FirstLogFile();
ASSERT_LE(old_log+3, new_log);
ASSERT_EQ("bar2", Get("foo"));
ASSERT_EQ("world", Get("hello"));
ASSERT_EQ("there", Get("hi"));

// Test that previous recovery produced recoverable state.
Open();
ASSERT_LE(1, NumTables());
ASSERT_EQ(1, NumLogs());
if (CanAppend()) {
ASSERT_EQ(new_log, FirstLogFile());
}
ASSERT_EQ("bar2", Get("foo"));
ASSERT_EQ("world", Get("hello"));
ASSERT_EQ("there", Get("hi"));

// Check that introducing an older log file does not cause it to be re-read.
Close();
MakeLogFile(old_log+1, 2000, "hello", "stale write");
Open();
ASSERT_LE(1, NumTables());
ASSERT_EQ(1, NumLogs());
if (CanAppend()) {
ASSERT_EQ(new_log, FirstLogFile());
}
ASSERT_EQ("bar2", Get("foo"));
ASSERT_EQ("world", Get("hello"));
ASSERT_EQ("there", Get("hi"));
}

} // namespace leveldb

int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}
|

@@ -45,30 +45,56 @@ namespace {
class Repairer {
public:
Repairer(const std::string& dbname, const Options& options)
: dbname_(dbname),
: double_cache_(options),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options, double_cache_.GetBlockCache())),
org_options_(options),
dbname_(options_.tiered_fast_prefix),
org_dbname_(dbname),
env_(options.env),
icmp_(options.comparator),
ipolicy_(options.filter_policy),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
owns_info_log_(options_.info_log != options.info_log),
owns_cache_(options_.block_cache != options.block_cache),
next_file_number_(1) {
db_lock_(NULL),
next_file_number_(1)
{
// TableCache can be small since we expect each table to be opened once.
table_cache_ = new TableCache(dbname_, &options_, 10);
table_cache_ = new TableCache(dbname_, &options_, double_cache_.GetFileCache(), double_cache_);

}

~Repairer() {
delete table_cache_;
if (owns_info_log_) {
delete options_.info_log;
}
if (owns_cache_) {
delete options_.block_cache;
}
// if (owns_cache_) {
// delete options_.block_cache;
// }

// must remove second ref counter that keeps overlapped files locked
// table cache
bool is_overlap;
for (int level = 0; level < config::kNumLevels; level++) {
{
is_overlap=(level < leveldb::config::kNumOverlapLevels);
for (size_t i = 0; i < table_numbers_[level].size(); i++) {
table_cache_->Evict(table_numbers_[level][i], is_overlap);
} // for
} // if
} // for

delete table_cache_;
}

Status Run() {
Status status = FindFiles();
Status status;

status = env_->LockFile(LockFileName(dbname_), &db_lock_);

if (status.ok())
status = MakeLevelDirectories(env_, options_);

if (status.ok()) {
status = FindFiles();
if (status.ok()) {
ConvertLogFilesToTables();
ExtractMetaData();

@@ -76,18 +102,56 @@ class Repairer {
}
if (status.ok()) {
unsigned long long bytes = 0;
for (size_t i = 0; i < tables_.size(); i++) {
bytes += tables_[i].meta.file_size;
unsigned long long files = 0;

// calculate size for log information
for (int level=0; level<config::kNumLevels;++level)
{
std::vector<TableInfo> * table_ptr;
std::vector<TableInfo>::const_iterator i;

table_ptr=&tables_[level];
files+=table_ptr->size();

for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
bytes += i->meta.file_size;
}
} // for

Log(options_.info_log,
"**** Repaired leveldb %s; "
"recovered %d files; %llu bytes. "
"Some data may have been lost. "
"****",
dbname_.c_str(),
static_cast<int>(tables_.size()),
static_cast<int>(files),
bytes);
}
if (db_lock_ != NULL) {
env_->UnlockFile(db_lock_);
}
}

// perform Riak specific scan for overlapping .sst files
// within a level
if (status.ok())
{
leveldb::DB * db_ptr;
Options options;

db_ptr=NULL;
options=org_options_;
// options.block_cache=NULL; // not reusing for fear of edge cases
options.is_repair=true;
options.error_if_exists=false;
status=leveldb::DB::Open(options, org_dbname_, &db_ptr);

if (status.ok())
status=db_ptr->VerifyLevels();

delete db_ptr;

} // if
return status;
}
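
Run() above threads a single Status through each recovery stage and bails out at the first failure. A minimal standalone sketch of that early-exit chaining, with a stand-in Status type and hypothetical step names (none of them this tree's actual API):

#include <cstdio>
#include <string>

// Stand-in for leveldb::Status: ok() gates each later step.
struct Status {
    std::string msg;
    bool ok() const { return msg.empty(); }
    static Status OK() { return Status{}; }
    static Status Error(const std::string& m) { return Status{m}; }
};

// Hypothetical steps mirroring Run(): lock, make level dirs, find files, verify.
Status LockDatabase()         { return Status::OK(); }
Status MakeLevelDirectories() { return Status::OK(); }
Status FindFiles()            { return Status::OK(); }
Status VerifyLevels()         { return Status::OK(); }  // Riak-style post-repair scan

int main() {
    Status status = LockDatabase();
    if (status.ok()) status = MakeLevelDirectories();
    if (status.ok()) status = FindFiles();
    if (status.ok()) status = VerifyLevels();
    std::printf("repair %s\n", status.ok() ? "ok" : status.msg.c_str());
    return status.ok() ? 0 : 1;
}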

@@ -97,34 +161,36 @@ class Repairer {
SequenceNumber max_sequence;
};

std::string const dbname_;
DoubleCache double_cache_;
Options const options_, org_options_;
std::string const dbname_, org_dbname_;
Env* const env_;
InternalKeyComparator const icmp_;
InternalFilterPolicy const ipolicy_;
Options const options_;
bool owns_info_log_;
bool owns_cache_;
FileLock* db_lock_;
TableCache* table_cache_;
VersionEdit edit_;

std::vector<std::string> manifests_;
std::vector<uint64_t> table_numbers_;
std::vector<uint64_t> table_numbers_[config::kNumLevels];
std::vector<uint64_t> logs_;
std::vector<TableInfo> tables_;
std::vector<TableInfo> tables_[config::kNumLevels];
uint64_t next_file_number_;

Status FindFiles() {
Status FindFiles()
{
std::vector<std::string> filenames;
uint64_t number;
FileType type;
int level;

// base directory
Status status = env_->GetChildren(dbname_, &filenames);
if (!status.ok()) {
return status;
}
if (filenames.empty()) {
return Status::IOError(dbname_, "repair found no files");
}

uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
if (type == kDescriptorFile) {

@@ -136,13 +202,38 @@ class Repairer {
if (type == kLogFile) {
logs_.push_back(number);
} else if (type == kTableFile) {
table_numbers_.push_back(number);
table_numbers_[0].push_back(number);
} else {
// Ignore other files
} // else
} // else
} // if
} // for

for (level=0; level < config::kNumLevels; ++level)
{
std::string dirname;

filenames.clear();
dirname=MakeDirName2(options_, level, "sst");
Status status = env_->GetChildren(dirname, &filenames);
if (!status.ok()) {
return status;
}

for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
if (number + 1 > next_file_number_) {
next_file_number_ = number + 1;
}

if (type == kTableFile) {
table_numbers_[level].push_back(number);
}
}
} // if
} // for
} // for

return status;
}
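
FindFiles() now scans one sorted-table directory per level in addition to the base directory, folding every parsed file number into next_file_number_. A self-contained sketch of that two-phase scan; the "sst_<level>" layout and ".sst" suffix are assumptions for illustration, not the exact MakeDirName2()/ParseFileName() behavior:

#include <cstdint>
#include <cstdio>
#include <filesystem>
#include <string>
#include <vector>

namespace fs = std::filesystem;

int main() {
    const int kNumLevels = 7;  // assumption: mirrors config::kNumLevels
    std::vector<std::vector<uint64_t>> table_numbers(kNumLevels);
    unsigned long long next_file_number = 1;

    for (int level = 0; level < kNumLevels; ++level) {
        // assumption: one sorted-table directory per level, e.g. "db/sst_0"
        fs::path dir = fs::path("db") / ("sst_" + std::to_string(level));
        if (!fs::is_directory(dir)) continue;  // repair tolerates missing levels
        for (const auto& entry : fs::directory_iterator(dir)) {
            unsigned long long number = 0;
            // loosely parse "<number>.sst" the way ParseFileName() would
            if (std::sscanf(entry.path().filename().string().c_str(),
                            "%llu.sst", &number) == 1) {
                if (number + 1 > next_file_number) next_file_number = number + 1;
                table_numbers[level].push_back(number);
            }
        }
    }
    std::printf("next file number: %llu\n", next_file_number);
    return 0;
}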

@@ -186,7 +277,7 @@ class Repairer {
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.lognum = log;
// We intentionally make log::Reader do checksumming so that
// We intentially make log::Reader do checksumming so that
// corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence
// numbers).

@@ -203,11 +294,11 @@ class Repairer {
while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < 12) {
reporter.Corruption(
record.size(), Status::Corruption("log record too small", logname));
record.size(), Status::Corruption("log record too small"));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, mem);
status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
if (status.ok()) {
counter += WriteBatchInternal::Count(&batch);
} else {

@@ -223,14 +314,15 @@ class Repairer {
// since ExtractMetaData() will also generate edits.
FileMetaData meta;
meta.number = next_file_number_++;
meta.level = 0;
Iterator* iter = mem->NewIterator();
status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
status = BuildTable(dbname_, env_, options_, icmp_.user_comparator(), table_cache_, iter, &meta, 0);
delete iter;
mem->Unref();
mem = NULL;
if (status.ok()) {
if (meta.file_size > 0) {
table_numbers_.push_back(meta.number);
table_numbers_[0].push_back(meta.number);
}
}
Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
@@ -242,52 +334,48 @@ class Repairer {
}

void ExtractMetaData() {
for (size_t i = 0; i < table_numbers_.size(); i++) {
ScanTable(table_numbers_[i]);
}
}
for (int level=0; level < config::kNumLevels; ++level)
{
std::vector<uint64_t> * number_ptr;
std::vector<uint64_t>::const_iterator i;

Iterator* NewTableIterator(const FileMetaData& meta) {
// Same as compaction iterators: if paranoid_checks are on, turn
// on checksum verification.
ReadOptions r;
r.verify_checksums = options_.paranoid_checks;
return table_cache_->NewIterator(r, meta.number, meta.file_size);
}

void ScanTable(uint64_t number) {
number_ptr=&table_numbers_[level];
for (i = number_ptr->begin(); number_ptr->end()!= i; ++i) {
TableInfo t;
t.meta.number = number;
std::string fname = TableFileName(dbname_, number);
Status status = env_->GetFileSize(fname, &t.meta.file_size);
if (!status.ok()) {
// Try alternate file name.
fname = SSTTableFileName(dbname_, number);
Status s2 = env_->GetFileSize(fname, &t.meta.file_size);
if (s2.ok()) {
status = Status::OK();
}
}
if (!status.ok()) {
ArchiveFile(TableFileName(dbname_, number));
ArchiveFile(SSTTableFileName(dbname_, number));
Log(options_.info_log, "Table #%llu: dropped: %s",
t.meta.number = *i;
t.meta.level = level;
Status status = ScanTable(&t);
if (!status.ok())
{
std::string fname = TableFileName(options_, t.meta.number, t.meta.level);
Log(options_.info_log, "Table #%llu: ignoring %s",
(unsigned long long) t.meta.number,
status.ToString().c_str());
return;
ArchiveFile(fname, true);
} else {
tables_[level].push_back(t);
}
}
}
}

// Extract metadata by scanning through table.
Status ScanTable(TableInfo* t) {
Table * table_ptr;
SstCounters counters;
std::string fname = TableFileName(options_, t->meta.number, t->meta.level);
int counter = 0;
Iterator* iter = NewTableIterator(t.meta);
Status status = env_->GetFileSize(fname, &t->meta.file_size);
if (status.ok()) {
Iterator* iter = table_cache_->NewIterator(
ReadOptions(), t->meta.number, t->meta.file_size, t->meta.level, &table_ptr);
bool empty = true;
ParsedInternalKey parsed;
t.max_sequence = 0;
t->max_sequence = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) {
Log(options_.info_log, "Table #%llu: unparsable key %s",
(unsigned long long) t.meta.number,
(unsigned long long) t->meta.number,
EscapeString(key).c_str());
continue;
}

@@ -295,115 +383,79 @@ class Repairer {
counter++;
if (empty) {
empty = false;
t.meta.smallest.DecodeFrom(key);
t->meta.smallest.DecodeFrom(key);
}
t.meta.largest.DecodeFrom(key);
if (parsed.sequence > t.max_sequence) {
t.max_sequence = parsed.sequence;
t->meta.largest.DecodeFrom(key);
if (parsed.sequence > t->max_sequence) {
t->max_sequence = parsed.sequence;
}
}
if (!iter->status().ok()) {
status = iter->status();
}
else {
counters=table_ptr->GetSstCounters();
t->meta.exp_write_low=counters.Value(eSstCountExpiry1);
t->meta.exp_write_high=counters.Value(eSstCountExpiry2);
t->meta.exp_explicit_high=counters.Value(eSstCountExpiry3);
}
delete iter;
}
Log(options_.info_log, "Table #%llu: %d entries %s",
(unsigned long long) t.meta.number,
(unsigned long long) t->meta.number,
counter,
status.ToString().c_str());

if (status.ok()) {
tables_.push_back(t);
} else {
RepairTable(fname, t); // RepairTable archives input file.
}
}

void RepairTable(const std::string& src, TableInfo t) {
// We will copy src contents to a new table and then rename the
// new table over the source.

// Create builder.
std::string copy = TableFileName(dbname_, next_file_number_++);
WritableFile* file;
Status s = env_->NewWritableFile(copy, &file);
if (!s.ok()) {
return;
}
TableBuilder* builder = new TableBuilder(options_, file);

// Copy data.
Iterator* iter = NewTableIterator(t.meta);
int counter = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
builder->Add(iter->key(), iter->value());
counter++;
}
delete iter;

ArchiveFile(src);
if (counter == 0) {
builder->Abandon(); // Nothing to save
} else {
s = builder->Finish();
if (s.ok()) {
t.meta.file_size = builder->FileSize();
}
}
delete builder;
builder = NULL;

if (s.ok()) {
s = file->Close();
}
delete file;
file = NULL;

if (counter > 0 && s.ok()) {
std::string orig = TableFileName(dbname_, t.meta.number);
s = env_->RenameFile(copy, orig);
if (s.ok()) {
Log(options_.info_log, "Table #%llu: %d entries repaired",
(unsigned long long) t.meta.number, counter);
tables_.push_back(t);
}
}
if (!s.ok()) {
env_->DeleteFile(copy);
}
return status;
}

Status WriteDescriptor() {
std::string tmp = TempFileName(dbname_, 1);
WritableFile* file;
Status status = env_->NewWritableFile(tmp, &file);
Status status = env_->NewWritableFile(tmp, &file, 4096);
if (!status.ok()) {
return status;
}

SequenceNumber max_sequence = 0;
for (size_t i = 0; i < tables_.size(); i++) {
if (max_sequence < tables_[i].max_sequence) {
max_sequence = tables_[i].max_sequence;
}
for (int level=0; level<config::kNumLevels;++level)
{
std::vector<TableInfo> * table_ptr;
std::vector<TableInfo>::const_iterator i;

table_ptr=&tables_[level];

for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
if (max_sequence < i->max_sequence) {
max_sequence = i->max_sequence;
}
} // for
} // for

edit_.SetComparatorName(icmp_.user_comparator()->Name());
edit_.SetLogNumber(0);
edit_.SetNextFile(next_file_number_);
edit_.SetLastSequence(max_sequence);

for (size_t i = 0; i < tables_.size(); i++) {
// TODO(opt): separate out into multiple levels
const TableInfo& t = tables_[i];
edit_.AddFile(0, t.meta.number, t.meta.file_size,
t.meta.smallest, t.meta.largest);
}
for (int level=0; level<config::kNumLevels;++level)
{
std::vector<TableInfo> * table_ptr;
std::vector<TableInfo>::const_iterator i;

table_ptr=&tables_[level];

for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
edit_.AddFile2(level, i->meta.number, i->meta.file_size,
i->meta.smallest, i->meta.largest,
i->meta.exp_write_low, i->meta.exp_write_high, i->meta.exp_explicit_high);

} // for
} // for

//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
{
log::Writer log(file);
std::string record;
edit_.EncodeTo(&record);
edit_.EncodeTo(&record); // manifest format is default for release, options_ often incomplete
status = log.AddRecord(record);
}
if (status.ok()) {

@@ -431,21 +483,33 @@ class Repairer {
return status;
}

void ArchiveFile(const std::string& fname) {
void ArchiveFile(const std::string& fname, bool two_levels=false) {
// Move into another directory. E.g., for
// dir/foo
// rename to
// dir/lost/foo
const char* slash = strrchr(fname.c_str(), '/');
std::string::size_type slash, slash2;

slash=fname.rfind('/');
if (two_levels && std::string::npos!=slash && 0<slash)
{
slash2=fname.rfind('/',slash-1);
if (std::string::npos==slash2)
slash2=slash;
} // if
else
slash2=slash;

std::string new_dir;
if (slash != NULL) {
new_dir.assign(fname.data(), slash - fname.data());
}

if (std::string::npos != slash2 && 0<slash2)
new_dir.append(fname,0,slash2);

new_dir.append("/lost");
env_->CreateDir(new_dir); // Ignore error
std::string new_file = new_dir;
new_file.append("/");
new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
new_file.append((std::string::npos!=slash) ? fname.substr(slash+1) : fname);
Status s = env_->RenameFile(fname, new_file);
Log(options_.info_log, "Archiving %s: %s\n",
fname.c_str(), s.ToString().c_str());
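
ArchiveFile()'s new two_levels flag backs up one extra path component before appending "/lost", so a tiered file such as dir/sst_3/000123.sst is archived as dir/lost/000123.sst rather than dir/sst_3/lost/000123.sst. The string arithmetic in isolation (paths invented for the example):

#include <iostream>
#include <string>

// Mirrors the rfind() logic above: with two_levels, back up one extra path
// component before appending "/lost".
std::string LostName(const std::string& fname, bool two_levels) {
    std::string::size_type slash = fname.rfind('/');
    std::string::size_type slash2 = slash;
    if (two_levels && slash != std::string::npos && 0 < slash) {
        slash2 = fname.rfind('/', slash - 1);
        if (slash2 == std::string::npos) slash2 = slash;
    }
    std::string new_dir;
    if (slash2 != std::string::npos && 0 < slash2)
        new_dir.append(fname, 0, slash2);
    new_dir.append("/lost");
    return new_dir + "/" +
           (slash != std::string::npos ? fname.substr(slash + 1) : fname);
}

int main() {
    std::cout << LostName("db/sst_3/000123.sst", true)  << "\n";  // db/lost/000123.sst
    std::cout << LostName("db/sst_3/000123.sst", false) << "\n";  // db/sst_3/lost/000123.sst
    return 0;
}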

@@ -1,10 +1,7 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_
#define STORAGE_LEVELDB_DB_SKIPLIST_H_

//
// Thread safety
// -------------
//

@@ -55,6 +52,12 @@ class SkipList {
// Returns true iff an entry that compares equal to key is in the list.
bool Contains(const Key& key) const;

// Returns true if all inserts have been sequentially increasing;
// else this SkipList has had keys inserted in non-sequential order
bool InSequentialInsertMode() const {
return sequentialInsertMode_;
}

// Iteration over the contents of a skip list
class Iterator {
public:

@@ -94,8 +97,22 @@ class SkipList {
// Intentionally copyable
};

protected:
// Checks the structure of this SkipList object, ensuring the keys are
// properly ordered
//
// This is protected since it is intended for use by unit tests; if a lock
// is used to protect Insert(), then it should be used to protect this
// method as well
bool Valid() const;

// Disables the sequential insert optimizations (used in performance testing)
void DisableSequentialInsertMode() {
sequentialInsertMode_ = false;
}

private:
enum { kMaxHeight = 12 };
enum { kMaxHeight = 17 };

// Immutable after construction
Comparator const compare_;

@@ -115,6 +132,18 @@ class SkipList {
// Read/written only by Insert().
Random rnd_;

// Points to the last node in the list; modified only by Insert()
Node* tail_;

// Pointers to the nodes previous to the tail node; have max_height_ entries
Node* tailPrev_[kMaxHeight];

// The height of the tail_ node
int tailHeight_;

// We track the tail node until we have a non-sequential insert
bool sequentialInsertMode_;

Node* NewNode(const Key& key, int height);
int RandomHeight();
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

@@ -129,6 +158,11 @@ class SkipList {
// node at "level" for every level in [0..max_height_-1].
Node* FindGreaterOrEqual(const Key& key, Node** prev) const;

// Similar to FindGreaterOrEqual() except it uses the barrier-free
// variant of Next(); this is used only by Insert() and it
// checks the tail_ pointer in case we're doing a sequential insert
Node* NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const;

// Return the latest node with a key < key.
// Return head_ if there is no such node.
Node* FindLessThan(const Key& key) const;

@@ -280,6 +314,54 @@ typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOr
}
}

template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const {
int level = GetMaxHeight() - 1;

// If we have only seen sequential inserts up to this point, we can use
// the tail_ node
if ( sequentialInsertMode_ ) {
if (tail_ == NULL) {
// The list is currently empty, so the node being inserted
// will be the new tail_
assert(level == 0);
if (prev != NULL) prev[0] = head_;
return NULL;
}
else if (KeyIsAfterNode(key, tail_)) {
// The new key must be inserted after the current tail_ node
if (prev != NULL) {
int i;
for (i = 0; i < tailHeight_; ++i) {
prev[i] = tail_;
}
for (/*continue with i*/; i <= level; ++i) {
prev[i] = tailPrev_[i];
}
}
return NULL;
}
}

Node* x = head_;
while (true) {
Node* next = x->NoBarrier_Next(level);
if (KeyIsAfterNode(key, next)) {
// Keep searching in this list
x = next;
} else {
if (prev != NULL) prev[level] = x;
if (level == 0) {
return next;
} else {
// Switch to next list
level--;
}
}
}
}

template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::FindLessThan(const Key& key) const {

@@ -327,25 +409,41 @@ SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
arena_(arena),
head_(NewNode(0 /* any key will do */, kMaxHeight)),
max_height_(reinterpret_cast<void*>(1)),
rnd_(0xdeadbeef) {
rnd_(0xdeadbeef),
tail_(NULL),
tailHeight_(0),
sequentialInsertMode_(true) {
for (int i = 0; i < kMaxHeight; i++) {
head_->SetNext(i, NULL);
tailPrev_[i] = NULL;
}
}

template<typename Key, class Comparator>
void SkipList<Key,Comparator>::Insert(const Key& key) {
// TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
// We use a barrier-free variant of FindGreaterOrEqual()
// here since Insert() is externally synchronized.
Node* prev[kMaxHeight];
Node* x = FindGreaterOrEqual(key, prev);
Node* x = NoBarrier_FindGreaterOrEqual(key, prev);

// If we're still in sequential-insert mode, check if the new node is being
// inserted at the end of the list, which is indicated by x being NULL
if (sequentialInsertMode_) {
if (x != NULL) {
// we have a non-sequential (AKA random) insert, so stop maintaining
// the tail bookkeeping overhead
sequentialInsertMode_ = false;
}
}

// Our data structure does not allow duplicate insertion
assert(x == NULL || !Equal(key, x->key));

int height = RandomHeight();
int i, height = RandomHeight();
if (height > GetMaxHeight()) {
for (int i = GetMaxHeight(); i < height; i++) {
// We are extending max_height_ which means we need to fill in the blanks
// in prev[] that were not filled in by NoBarrier_FindGreaterOrEqual()
for (i = GetMaxHeight(); i < height; ++i) {
prev[i] = head_;
}
//fprintf(stderr, "Change height from %d to %d\n", max_height_, height);

@@ -361,12 +459,37 @@ void SkipList<Key,Comparator>::Insert(const Key& key) {
}

x = NewNode(key, height);
for (int i = 0; i < height; i++) {
for (i = 0; i < height; ++i) {
// NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
prev[i]->SetNext(i, x);
}

// Do we need to update our tail_ pointer?
if (sequentialInsertMode_) {
Node* prevTail = tail_;
int prevTailHeight = tailHeight_;

tail_ = x;
tailHeight_ = height;

// We also need to update our tailPrev_ pointers; first we capture
// the nodes already pointing to the new tail_
for (i = 0; i < height; ++i) {
tailPrev_[i] = prev[i];
}

// If the previous tail node was taller than the new tail node, then
// the prev pointers above the current tail node's height (up to the
// height of the previous tail node) are simply the previous tail node
for (/*continue with i*/; i < prevTailHeight; ++i) {
tailPrev_[i] = prevTail;
}

// NOTE: any prev pointers above prevTailHeight (up to max_height_) were
// already set in tailPrev_ by previous calls to this method
}
}
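
The tail_/tailPrev_ bookkeeping above is what turns an in-order Insert() into an O(1) append instead of a full skip-list descent. The same idea in miniature on a plain sorted singly-linked list, a sketch rather than the real structure:

#include <cassert>
#include <cstdio>

// Sorted singly-linked list with the same trick: keep a tail pointer while
// inserts stay sequential, fall back to a scan on the first out-of-order key.
struct List {
    struct Node { int key; Node* next; };
    Node* head = nullptr;
    Node* tail = nullptr;
    bool sequential = true;

    void Insert(int key) {
        if (sequential && tail != nullptr && tail->key < key) {
            tail = tail->next = new Node{key, nullptr};  // O(1) append
            return;
        }
        if (tail != nullptr && tail->key >= key) sequential = false;
        Node** p = &head;                                // O(n) fallback search
        while (*p != nullptr && (*p)->key < key) p = &(*p)->next;
        Node* n = new Node{key, *p};
        *p = n;
        if (n->next == nullptr) tail = n;
    }
};

int main() {
    List l;
    for (int i = 0; i < 5; ++i) l.Insert(i);  // stays on the fast path
    assert(l.sequential);
    l.Insert(-1);                             // first non-sequential insert
    assert(!l.sequential);
    for (List::Node* n = l.head; n; n = n->next) std::printf("%d ", n->key);
    std::printf("\n");                        // -1 0 1 2 3 4
    return 0;
}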

template<typename Key, class Comparator>

@@ -379,6 +502,115 @@ bool SkipList<Key,Comparator>::Contains(const Key& key) const {
}
}

} // namespace leveldb
template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::Valid() const
{
// Note that we can use barrier-free overloads in this method since it is
// protected by the same lock as Insert().

#endif // STORAGE_LEVELDB_DB_SKIPLIST_H_
// Ensure that the list is properly sorted; use an iterator for this check
const Key* pPrevKey = NULL;
typename SkipList<Key, Comparator>::Iterator iter(this);
for ( iter.SeekToFirst(); iter.Valid(); iter.Next() ) {
if ( pPrevKey != NULL ) {
if ( compare_( *pPrevKey, iter.key() ) >= 0 ) {
return false;
}
}
pPrevKey = &iter.key();
}

// Now walk the linked list at each level and ensure it's sorted. Also track
// how many nodes we see at each level; the number of nodes in the linked
// list at level n must not be larger than the number of nodes at level n-1.
std::vector<int> nodeCounts( GetMaxHeight() );
int level;
for ( level = GetMaxHeight() - 1; level >= 0; --level ) {
int nodeCount = 0;
pPrevKey = NULL;
for ( Node* pNode = head_->NoBarrier_Next( level );
pNode != NULL;
pNode = pNode->NoBarrier_Next( level ) ) {
++nodeCount;
if ( pPrevKey != NULL ) {
if ( compare_( *pPrevKey, pNode->key ) >= 0 ) {
return false;
}
}
pPrevKey = &pNode->key;
}
nodeCounts[ level ] = nodeCount;
}

// Ensure the node counts do not increase as we move up the levels
int prevNodeCount = nodeCounts[0];
for ( level = 1; level < GetMaxHeight(); ++level ) {
int currentNodeCount = nodeCounts[ level ];
if ( currentNodeCount > prevNodeCount ) {
return false;
}
prevNodeCount = currentNodeCount;
}

// Ensure that tail_ points to the last node
if ( sequentialInsertMode_ ) {
if ( tail_ == NULL ) {
// tail_ is not set, so the list must be empty
if ( tailPrev_[0] != NULL || head_->NoBarrier_Next(0) != NULL ) {
return false;
}
}
else {
// we have a tail_ node; first ensure that its prev pointer actually
// points to it
if ( tailPrev_[0] == NULL || tailPrev_[0]->NoBarrier_Next(0) != tail_ ) {
return false;
}
if ( compare_( tailPrev_[0]->key, tail_->key ) >= 0 ) {
return false;
}

// now check the rest of the pointers in tailPrev_; up to tailHeight_,
// the next pointer of the node in tailPrev_ should point to tail_; after
// that, the next pointer should be NULL
for ( level = 1; level < GetMaxHeight(); ++level ) {
Node* tailPrev = tailPrev_[ level ];
if ( tailPrev == NULL ) {
return false;
}
if ( level < tailHeight_ ) {
if ( tailPrev->NoBarrier_Next( level ) != tail_ ) {
return false;
}
if ( compare_( tailPrev->key, tail_->key ) >= 0 ) {
return false;
}
}
else {
if ( tailPrev->NoBarrier_Next( level ) != NULL ) {
return false;
}
}
}

// the remainder of the tailPrev_ pointers (above max_height_)
// should be NULL
for ( /*continue with level*/; level < kMaxHeight; ++level ) {
if ( tailPrev_[ level ] != NULL ) {
return false;
}
}

// now ensure that FindLast() returns tail_
Node* lastNode = FindLast();
if ( lastNode != tail_ ) {
return false;
}
}
}

// if we get here, all is good
return true;
}

} // namespace leveldb

@@ -2,11 +2,15 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#define __STDC_FORMAT_MACROS
#include <inttypes.h>

#include "db/skiplist.h"
#include <set>
#include "leveldb/env.h"
#include "util/arena.h"
#include "util/hash.h"
#include "util/mutexlock.h"
#include "util/random.h"
#include "util/testharness.h"

@@ -26,15 +30,29 @@ struct Comparator {
}
};

template<typename Key, class Comparator>
class SkipListTest : public SkipList<Key, Comparator>
{
public:
SkipListTest(Comparator cmp, Arena* arena) : SkipList<Key, Comparator>(cmp, arena) {}

// check the validity of this SkipList object by calling the Valid() method
// in the base class
bool Valid() const { return SkipList<Key, Comparator>::Valid(); }

void DisableSequentialInsertMode() { SkipList<Key, Comparator>::DisableSequentialInsertMode(); }
};

class SkipTest { };

TEST(SkipTest, Empty) {
Arena arena;
Comparator cmp;
SkipList<Key, Comparator> list(cmp, &arena);
SkipListTest<Key, Comparator> list(cmp, &arena);
ASSERT_TRUE(!list.Contains(10));
ASSERT_TRUE(list.Valid());

SkipList<Key, Comparator>::Iterator iter(&list);
SkipListTest<Key, Comparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());
iter.SeekToFirst();
ASSERT_TRUE(!iter.Valid());

@@ -51,13 +69,14 @@ TEST(SkipTest, InsertAndLookup) {
std::set<Key> keys;
Arena arena;
Comparator cmp;
SkipList<Key, Comparator> list(cmp, &arena);
SkipListTest<Key, Comparator> list(cmp, &arena);
for (int i = 0; i < N; i++) {
Key key = rnd.Next() % R;
if (keys.insert(key).second) {
list.Insert(key);
}
}
ASSERT_TRUE(list.Valid());

for (int i = 0; i < R; i++) {
if (list.Contains(i)) {

@@ -69,7 +88,7 @@ TEST(SkipTest, InsertAndLookup) {

// Simple iterator tests
{
SkipList<Key, Comparator>::Iterator iter(&list);
SkipListTest<Key, Comparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());

iter.Seek(0);

@@ -87,7 +106,7 @@ TEST(SkipTest, InsertAndLookup) {

// Forward iteration test
for (int i = 0; i < R; i++) {
SkipList<Key, Comparator>::Iterator iter(&list);
SkipListTest<Key, Comparator>::Iterator iter(&list);
iter.Seek(i);

// Compare against model iterator

@@ -107,7 +126,7 @@ TEST(SkipTest, InsertAndLookup) {

// Backward iteration test
{
SkipList<Key, Comparator>::Iterator iter(&list);
SkipListTest<Key, Comparator>::Iterator iter(&list);
iter.SeekToLast();

// Compare against model iterator

@@ -250,7 +269,7 @@ class ConcurrentTest {
// Note that generation 0 is never inserted, so it is ok if
// <*,0,*> is missing.
ASSERT_TRUE((gen(pos) == 0) ||
(gen(pos) > static_cast<Key>(initial_state.Get(key(pos))))
(gen(pos) > initial_state.Get(key(pos)))
) << "key: " << key(pos)
<< "; gen: " << gen(pos)
<< "; initgen: "

@@ -313,18 +332,16 @@ class TestState {
state_cv_(&mu_) {}

void Wait(ReaderState s) {
mu_.Lock();
MutexLock lock(&mu_);
while (state_ != s) {
state_cv_.Wait();
}
mu_.Unlock();
}

void Change(ReaderState s) {
mu_.Lock();
MutexLock lock(&mu_);
state_ = s;
state_cv_.Signal();
mu_.Unlock();
}

private:

@@ -371,6 +388,211 @@ TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }

static void
RunSequentialInsert(
const int NumKeys,
bool AcquireLock,
bool ReverseInsert,
bool SequentialInsertModeEnabled )
{
const int loopCount = 5; // repeat the whole process this many times and average the time spent
std::vector<uint64_t> timeSpent;

port::Mutex mutex;
Env* env = Env::Default();

fprintf( stderr,
"Sequentially inserting %d keys in %s order,\n"
" sequential insert mode is initially %sabled,\n"
" %sacquiring a lock for each insert (averaging over %d runs)\n",
NumKeys, ReverseInsert ? "reverse" : "forward",
SequentialInsertModeEnabled ? "en" : "dis",
AcquireLock ? "" : "not ", loopCount );

int k;
for ( k = 0; k < loopCount; ++k ) {
int j;
Arena arena;
Comparator cmp;
SkipListTest<Key, Comparator> list( cmp, &arena );

// initially the SkipList should be in sequential mode
ASSERT_TRUE( list.InSequentialInsertMode() );

// were we instructed to disable sequential insert mode?
if ( !SequentialInsertModeEnabled ) {
list.DisableSequentialInsertMode();
ASSERT_TRUE( !list.InSequentialInsertMode() );
}

uint64_t start = env->NowMicros();
for ( j = 0; j < NumKeys; ++j ) {
Key key = ReverseInsert ? NumKeys - 1 - j : j;

if ( AcquireLock ) mutex.Lock();
list.Insert( key );
if ( AcquireLock ) mutex.Unlock();
}
uint64_t stop = env->NowMicros();
timeSpent.push_back( stop - start );
//fprintf( stderr, " Time for run %d: %llu\n", k, timeSpent[k] );

// if SequentialInsertModeEnabled is true, the SkipList should still be
// in sequential mode iff ReverseInsert is false
if ( SequentialInsertModeEnabled ) {
ASSERT_TRUE( list.InSequentialInsertMode() != ReverseInsert );
}
else {
ASSERT_TRUE( !list.InSequentialInsertMode() );
}

// ensure the SkipList is properly sorted
if ( AcquireLock ) mutex.Lock();
ASSERT_TRUE( list.Valid() );
if ( AcquireLock ) mutex.Unlock();

// ensure the SkipList contains all the keys we inserted
for ( j = 0; j < NumKeys; ++j ) {
ASSERT_TRUE( list.Contains( j ) );
}
}

// throw out the low and high times and average the rest
uint64_t totalTime, lowTime, highTime;
totalTime = lowTime = highTime = timeSpent[0];
for ( k = 1; k < loopCount; ++k ) {
uint64_t currentTime = timeSpent[k];
totalTime += currentTime;
if ( lowTime > currentTime ) lowTime = currentTime;
if ( highTime < currentTime ) highTime = currentTime;
}

totalTime -= (lowTime + highTime);

uint64_t averageTime = (totalTime / (loopCount - 2));
double timePerKey = (double)averageTime / (double)NumKeys;
fprintf( stderr, " Average insertion time: %" PRIu64 " (%f/key)\n", averageTime, timePerKey );
}
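
The reported number is a trimmed mean: the fastest and slowest of the five runs are discarded and the remainder averaged, so one noisy run cannot skew the result. The reduction in isolation:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Drop the single low and high samples, average the remainder --
// matches the reduction at the end of RunSequentialInsert() above.
uint64_t TrimmedAverage(const std::vector<uint64_t>& samples) {
    uint64_t total = samples[0], low = samples[0], high = samples[0];
    for (size_t k = 1; k < samples.size(); ++k) {
        total += samples[k];
        low = std::min(low, samples[k]);
        high = std::max(high, samples[k]);
    }
    total -= low + high;
    return total / (samples.size() - 2);  // requires at least 3 samples
}

int main() {
    std::vector<uint64_t> micros = {900, 1000, 1100, 1050, 5000};  // 5000 is an outlier
    std::printf("average: %llu us\n", (unsigned long long)TrimmedAverage(micros));
    return 0;  // prints 1050: the outlier never skews the mean
}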

TEST(SkipTest, SequentialInsert_NoLock_ForwardInsert)
{
int numKeys = 100000;
bool acquireLock = false;
bool reverseInsert = false;
bool sequentialInsertModeEnabled = true;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );

sequentialInsertModeEnabled = false;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
}

TEST(SkipTest, SequentialInsert_Lock_ForwardInsert)
{
int numKeys = 100000;
bool acquireLock = true;
bool reverseInsert = false;
bool sequentialInsertModeEnabled = true;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );

sequentialInsertModeEnabled = false;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
}

TEST(SkipTest, SequentialInsert_NoLock_ReverseInsert)
{
int numKeys = 100000;
bool acquireLock = false;
bool reverseInsert = true;
bool sequentialInsertModeEnabled = true;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
}

TEST(SkipTest, SequentialInsert_Lock_ReverseInsert)
{
int numKeys = 100000;
bool acquireLock = true;
bool reverseInsert = true;
bool sequentialInsertModeEnabled = true;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
}

TEST(SkipTest, SequentialInsert_IncreasingNumberOfInserts)
{
// test with increasing numbers of keys, with sequential-insert mode both
// enabled and disabled; we're looking to see if per-key insertion times
// trend upward as the number of keys increases
int numKeys = 10000;
bool acquireLock = false;
bool reverseInsert = false;
bool sequentialInsertModeEnabled = true;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );

sequentialInsertModeEnabled = false;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );

numKeys = 100000;
sequentialInsertModeEnabled = true;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );

sequentialInsertModeEnabled = false;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );

numKeys = 1000000;
sequentialInsertModeEnabled = true;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );

sequentialInsertModeEnabled = false;
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
}

TEST(SkipTest, SequentialInsert_MixedInsertionModes)
{
// start inserting sequentially, then switch to non-sequential inserts,
// ensuring all works as intended
int j, numSequentialKeys = 100000, numNonSequentialKeys = 100000;
int totalNumKeys = numSequentialKeys + numNonSequentialKeys;
Arena arena;
Comparator cmp;
SkipListTest<Key, Comparator> list( cmp, &arena );

// initially the SkipList should be in sequential mode
ASSERT_TRUE( list.InSequentialInsertMode() );

// start inserting at key=1; when we insert 0 below, the list should switch
// out of sequential insert mode
for ( j = 1; j < numSequentialKeys; ++j ) {
list.Insert( j );
}

// the SkipList should still be in sequential mode
ASSERT_TRUE( list.InSequentialInsertMode() );
ASSERT_TRUE( list.Valid() );

list.Insert( 0 );
ASSERT_TRUE( !list.InSequentialInsertMode() );
ASSERT_TRUE( list.Valid() );

// now insert the remaining keys in non-sequential order (they're not
// random, but that doesn't matter here; just ensure we switch to
// non-sequential mode and that all continues to work)
for ( j = 0; j < numNonSequentialKeys; j += 2 ) {
int key = totalNumKeys - j - 1;
list.Insert( key );
}
for ( j = 0; j < numNonSequentialKeys; j += 2 ) {
int key = numSequentialKeys + j;
list.Insert( key );
}

ASSERT_TRUE( !list.InSequentialInsertMode() );
ASSERT_TRUE( list.Valid() );

// ensure the SkipList contains all the keys we inserted
for ( j = 0; j < totalNumKeys; ++j ) {
ASSERT_TRUE( list.Contains( j ) );
}
}

} // namespace leveldb

int main(int argc, char** argv) {

@@ -5,7 +5,6 @@
#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
#define STORAGE_LEVELDB_DB_SNAPSHOT_H_

#include "db/dbformat.h"
#include "leveldb/db.h"

namespace leveldb {

@@ -5,22 +5,26 @@
#include "db/table_cache.h"

#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/version_edit.h"
#include "leveldb/env.h"
#include "leveldb/table.h"
#include "util/coding.h"
#include "leveldb/perf_count.h"

namespace leveldb {

struct TableAndFile {
RandomAccessFile* file;
Table* table;
};

static void DeleteEntry(const Slice& key, void* value) {
TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
if (0==dec_and_fetch(&tf->user_count))
{
if (NULL!=tf->doublecache)
tf->doublecache->SubFileSize(tf->table->GetFileSize());
delete tf->table;
delete tf->file;
delete tf;
} // if
}

static void UnrefEntry(void* arg1, void* arg2) {

@@ -31,37 +35,38 @@ static void UnrefEntry(void* arg1, void* arg2) {

TableCache::TableCache(const std::string& dbname,
const Options* options,
int entries)
Cache * file_cache,
DoubleCache & doublecache)
: env_(options->env),
dbname_(dbname),
options_(options),
cache_(NewLRUCache(entries)) {
cache_(file_cache),
doublecache_(doublecache)
{
}

TableCache::~TableCache() {
delete cache_;
}

Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
Cache::Handle** handle) {
Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, int level,
Cache::Handle** handle, bool is_compaction,
bool for_iterator) {
Status s;
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);
Slice key(buf, sizeof(buf));
*handle = cache_->Lookup(key);
if (*handle == NULL) {
std::string fname = TableFileName(dbname_, file_number);
std::string fname = TableFileName(*options_, file_number, level);
RandomAccessFile* file = NULL;
Table* table = NULL;
s = env_->NewRandomAccessFile(fname, &file);
if (!s.ok()) {
std::string old_fname = SSTTableFileName(dbname_, file_number);
if (env_->NewRandomAccessFile(old_fname, &file).ok()) {
s = Status::OK();
}
}
if (s.ok()) {
s = Table::Open(*options_, file, file_size, &table);

// Riak: support opportunity to manage Linux page cache
if (is_compaction)
file->SetForCompaction(file_size);
}

if (!s.ok()) {

@@ -73,22 +78,74 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
TableAndFile* tf = new TableAndFile;
tf->file = file;
tf->table = table;
*handle = cache_->Insert(key, tf, 1, &DeleteEntry);
tf->doublecache = &doublecache_;
tf->file_number = file_number;
tf->level = level;

*handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry);
gPerfCounters->Inc(ePerfTableOpened);
doublecache_.AddFileSize(table->GetFileSize());

// temporary hardcoding to match number of levels defined as
// overlapped in version_set.cc
if (level<config::kNumOverlapLevels)
cache_->Addref(*handle);
}
}
else
{
Table *table = reinterpret_cast<TableAndFile*>(cache_->Value(*handle))->table;

// this is NOT first access, see if bloom filter can load now
if (!for_iterator && table->ReadFilter())
{
// TableAndFile now going to be present in two cache entries
// 1. retrieve old entry within file cache
TableAndFile* tf = reinterpret_cast<TableAndFile*>(cache_->Value(*handle));
inc_and_fetch(&tf->user_count);

// 2. must clean file size, do not want double count
if (NULL!=tf->doublecache)
tf->doublecache->SubFileSize(tf->table->GetFileSize());

// 3. release current reference (and possible special overlap reference)
cache_->Release(*handle);
if (tf->level<config::kNumOverlapLevels)
cache_->Release(*handle);

// 4. create second table cache entry using TableObjectSize that now includes
// bloom filter size
*handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry);

// 5. set double reference if an overlapped file (prevents from being flushed)
if (level<config::kNumOverlapLevels)
cache_->Addref(*handle);
} // if

// for Linux, let fadvise start precaching
if (is_compaction)
{
RandomAccessFile *file = reinterpret_cast<TableAndFile*>(cache_->Value(*handle))->file;
file->SetForCompaction(file_size);
} // if

gPerfCounters->Inc(ePerfTableCached);
} // else
return s;
}

Iterator* TableCache::NewIterator(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
int level,
Table** tableptr) {
if (tableptr != NULL) {
*tableptr = NULL;
}

Cache::Handle* handle = NULL;
Status s = FindTable(file_number, file_size, &handle);
Status s = FindTable(file_number, file_size, level, &handle, options.IsCompaction(), true);

if (!s.ok()) {
return NewErrorIterator(s);
}

@@ -105,11 +162,13 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
Status TableCache::Get(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
int level,
const Slice& k,
void* arg,
void (*saver)(void*, const Slice&, const Slice&)) {
bool (*saver)(void*, const Slice&, const Slice&)) {
Cache::Handle* handle = NULL;
Status s = FindTable(file_number, file_size, &handle);
Status s = FindTable(file_number, file_size, level, &handle);

if (s.ok()) {
Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
s = t->InternalGet(options, k, arg, saver);

@@ -118,10 +177,60 @@ Status TableCache::Get(const ReadOptions& options,
return s;
}

void TableCache::Evict(uint64_t file_number) {
void TableCache::Evict(uint64_t file_number, bool is_overlapped) {
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);

// overlapped files have extra reference to prevent their purge,
// release that reference now
if (is_overlapped)
{
Cache::Handle *handle;

// the Lookup call adds a reference too, back out both
handle=cache_->Lookup(Slice(buf, sizeof(buf)));

// with multiple background threads, file might already be
// evicted
if (NULL!=handle)
{
cache_->Release(handle); // release for Lookup() call just made
cache_->Release(handle); // release for extra reference
} // if
} // if

cache_->Erase(Slice(buf, sizeof(buf)));
}
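
Evict() must pair off the extra reference FindTable() takes on files in overlapped levels, and the Lookup() it performs adds one more, hence the double Release(). A toy refcount model of that pin/unpin pairing (the cache API here is reduced to bare counters, not the real leveldb::Cache):

#include <cassert>
#include <cstdint>
#include <map>

// Minimal stand-in: an entry survives until its refcount drops to zero.
struct ToyCache {
    std::map<uint64_t, int> refs;
    void Insert(uint64_t key) { refs[key] = 1; }
    void Addref(uint64_t key) { ++refs[key]; }  // pin (overlapped level)
    bool Release(uint64_t key) {                // returns true once freed
        if (--refs[key] == 0) { refs.erase(key); return true; }
        return false;
    }
};

int main() {
    ToyCache cache;
    const uint64_t file = 12345;
    cache.Insert(file);            // FindTable(): first open, refcount 1
    cache.Addref(file);            // level < kNumOverlapLevels: pinned at 2
    assert(!cache.Release(file));  // LRU pressure alone cannot free it
    assert(cache.Release(file));   // Evict(is_overlapped=true): drop the pin
    return 0;
}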

/**
 * Riak specific routine to return table statistic ONLY if table metadata
 * already within cache ... otherwise return 0.
 */
uint64_t
TableCache::GetStatisticValue(
uint64_t file_number,
unsigned Index)
{
uint64_t ret_val;
char buf[sizeof(file_number)];
Cache::Handle *handle;

ret_val=0;
EncodeFixed64(buf, file_number);
Slice key(buf, sizeof(buf));
handle = cache_->Lookup(key);

if (NULL != handle)
{
TableAndFile * tf;

tf=reinterpret_cast<TableAndFile*>(cache_->Value(handle));
ret_val=tf->table->GetSstCounters().Value(Index);
cache_->Release(handle);
} // if

return(ret_val);

} // TableCache::GetStatisticValue

} // namespace leveldb

@@ -13,6 +13,7 @@
#include "leveldb/cache.h"
#include "leveldb/table.h"
#include "port/port.h"
#include "util/cache2.h"

namespace leveldb {

@@ -20,8 +21,10 @@ class Env;

class TableCache {
public:
TableCache(const std::string& dbname, const Options* options, int entries);
~TableCache();
// clean up note: file_cache is redundant to GetFileCache available from doublecache
TableCache(const std::string& dbname, const Options* options, Cache * file_cache,
DoubleCache & doublecache);
virtual ~TableCache();

// Return an iterator for the specified file number (the corresponding
// file length must be exactly "file_size" bytes). If "tableptr" is

@@ -33,6 +36,7 @@ class TableCache {
Iterator* NewIterator(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
int level,
Table** tableptr = NULL);

// If a seek to internal key "k" in specified file finds an entry,

@@ -40,22 +44,65 @@ class TableCache {
Status Get(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
int level,
const Slice& k,
void* arg,
void (*handle_result)(void*, const Slice&, const Slice&));
bool (*handle_result)(void*, const Slice&, const Slice&));

// Evict any entry for the specified file number
void Evict(uint64_t file_number);
void Evict(uint64_t file_number, bool is_overlapped);

private:
// Riak specific: return table statistic ONLY if table in cache, otherwise zero
uint64_t GetStatisticValue(uint64_t file_number, unsigned Index);

// access for testing tools, not for public access
Status TEST_FindTable(uint64_t file_number, uint64_t file_size, int level, Cache::Handle** handle)
{return( FindTable(file_number, file_size, level, handle));};

Cache* TEST_GetInternalCache() {return(cache_);};

void Release(Cache::Handle * handle) {cache_->Release(handle);};

// routine called if Options::cache_object_warming is true.
// Writes list of all file names currently in file cache to disk.
Status SaveOpenFileList();

// routine called if Options::cache_object_warming is true.
// Reads file created by SaveOpenFileList() and attempts to open
// every file.
Status PreloadTableCache();

// was private, now protected to allow easy unit test overrides
protected:
Env* const env_;
const std::string dbname_;
const Options* options_;
Cache * cache_;
DoubleCache & doublecache_;

Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
// virtual to enable unit test overrides
virtual Status FindTable(uint64_t file_number, uint64_t file_size, int level,
Cache::Handle**, bool is_compaction=false,
bool for_iterator=false);
};

struct TableAndFile {
RandomAccessFile* file;
Table* table;
DoubleCache * doublecache;
uint64_t file_number; // saved for cache object warming
int level; // saved for cache object warming
volatile uint32_t user_count;

TableAndFile()
: file(NULL), table(NULL), doublecache(NULL),
file_number(0), level(0), user_count(1)
{};
};

} // namespace leveldb

#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_

@@ -9,20 +9,6 @@
namespace leveldb {

// Tag numbers for serialized VersionEdit. These numbers are written to
// disk and should not be changed.
enum Tag {
kComparator = 1,
kLogNumber = 2,
kNextFileNumber = 3,
kLastSequence = 4,
kCompactPointer = 5,
kDeletedFile = 6,
kNewFile = 7,
// 8 was used for large value refs
kPrevLogNumber = 9
};

void VersionEdit::Clear() {
comparator_.clear();
log_number_ = 0;

@@ -34,11 +20,21 @@ void VersionEdit::Clear() {
has_prev_log_number_ = false;
has_next_file_number_ = false;
has_last_sequence_ = false;
has_f1_files_ = false;
has_f2_files_ = false;

deleted_files_.clear();
new_files_.clear();
}

void VersionEdit::EncodeTo(std::string* dst) const {
/**
 * EncodeTo serializes the VersionEdit object
 * to the "dst" string parameter. "format2" flag
 * indicates whether serialization should use original
 * Google format for file objects (false) or Basho's updated
 * file2 format for expiry enabled file objects (true)
 */
void VersionEdit::EncodeTo(std::string* dst, bool format2) const {
if (has_comparator_) {
PutVarint32(dst, kComparator);
PutLengthPrefixedSlice(dst, comparator_);

@@ -76,12 +72,21 @@ void VersionEdit::EncodeTo(std::string* dst) const {

for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
if (format2)
PutVarint32(dst, kNewFile2);
else
PutVarint32(dst, kNewFile);
PutVarint32(dst, new_files_[i].first); // level
PutVarint64(dst, f.number);
PutVarint64(dst, f.file_size);
PutLengthPrefixedSlice(dst, f.smallest.Encode());
PutLengthPrefixedSlice(dst, f.largest.Encode());
if (format2)
{
PutVarint64(dst, f.exp_write_low);
PutVarint64(dst, f.exp_write_high);
PutVarint64(dst, f.exp_explicit_high);
}
}
}
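
Each file record is a tag followed by varint-packed fields, and format2 simply writes the new kNewFile2 tag and appends three expiry varints; readers that only know kNewFile reject the record as an unknown tag. A self-contained sketch of that wire layout (field values invented, internal keys omitted for brevity):

#include <cstdint>
#include <cstdio>
#include <string>

// Base-128 varint, same scheme as leveldb's PutVarint64().
void PutVarint64(std::string* dst, uint64_t v) {
    while (v >= 0x80) { dst->push_back((char)(v | 0x80)); v >>= 7; }
    dst->push_back((char)v);
}

int main() {
    const uint64_t kNewFile = 7, kNewFile2 = 11;
    std::string f1, f2;

    // format 1: tag, level, number, size
    PutVarint64(&f1, kNewFile);
    PutVarint64(&f1, 0);        // level
    PutVarint64(&f1, 123);      // file number
    PutVarint64(&f1, 4 << 20);  // file size

    // format 2: identical prefix plus three expiry fields
    PutVarint64(&f2, kNewFile2);
    PutVarint64(&f2, 0);
    PutVarint64(&f2, 123);
    PutVarint64(&f2, 4 << 20);
    PutVarint64(&f2, 0);        // exp_write_low
    PutVarint64(&f2, 0);        // exp_write_high
    PutVarint64(&f2, 0);        // exp_explicit_high

    std::printf("format1=%zu bytes, format2=%zu bytes\n", f1.size(), f2.size());
    return 0;
}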

@@ -98,7 +103,7 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) {
static bool GetLevel(Slice* input, int* level) {
uint32_t v;
if (GetVarint32(input, &v) &&
v < config::kNumLevels) {
v < (unsigned)config::kNumLevels) {
*level = v;
return true;
} else {

@@ -185,13 +190,34 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
GetVarint64(&input, &f.number) &&
GetVarint64(&input, &f.file_size) &&
GetInternalKey(&input, &f.smallest) &&
GetInternalKey(&input, &f.largest)) {
GetInternalKey(&input, &f.largest))
{
has_f1_files_ = true;
f.level=level;
new_files_.push_back(std::make_pair(level, f));
} else {
msg = "new-file entry";
}
break;

case kNewFile2:
if (GetLevel(&input, &level) &&
GetVarint64(&input, &f.number) &&
GetVarint64(&input, &f.file_size) &&
GetInternalKey(&input, &f.smallest) &&
GetInternalKey(&input, &f.largest) &&
GetVarint64(&input, &f.exp_write_low) &&
GetVarint64(&input, &f.exp_write_high) &&
GetVarint64(&input, &f.exp_explicit_high))
{
has_f2_files_ = true;
f.level=level;
new_files_.push_back(std::make_pair(level, f));
} else {
msg = "new-file2 entry";
}
break;

default:
msg = "unknown tag";
break;

@@ -258,6 +284,12 @@ std::string VersionEdit::DebugString() const {
r.append(f.smallest.DebugString());
r.append(" .. ");
r.append(f.largest.DebugString());
r.append(" ");
AppendNumberTo(&r, f.exp_write_low);
r.append(" ");
AppendNumberTo(&r, f.exp_write_high);
r.append(" ");
AppendNumberTo(&r, f.exp_explicit_high);
}
r.append("\n}\n");
return r;

@@ -16,15 +16,41 @@ class VersionSet;

struct FileMetaData {
int refs;
int allowed_seeks; // Seeks allowed until compaction
// int allowed_seeks; // Seeks allowed until compaction
uint64_t number;
uint64_t file_size; // File size in bytes
uint64_t num_entries; // count of values in .sst file, only valid during table build
InternalKey smallest; // Smallest internal key served by table
InternalKey largest; // Largest internal key served by table
int level;
ExpiryTimeMicros exp_write_low; // oldest write time in file:
// 0 - non-expiry keys exist too
// ULLONG_MAX - no write time expiry & no plain keys
ExpiryTimeMicros exp_write_high; // most recent write time in file
ExpiryTimeMicros exp_explicit_high; // most recent/furthest into future explicit expiry

FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { }
FileMetaData()
: refs(0), /*allowed_seeks(1 << 30),*/ file_size(0),
num_entries(0), level(-1), exp_write_low(0), exp_write_high(0), exp_explicit_high(0)
{ }
};
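
A plausible reading of the three expiry stamps: once a file's newest write time and furthest explicit expiry are both behind the cutoff, nothing in it is live, so whole-file deletion becomes possible. A hypothetical check along those lines; the predicate is illustrative only, not this fork's actual expiry policy:

#include <cstdint>
#include <cstdio>

typedef uint64_t ExpiryTimeMicros;

struct FileExpiry {
    ExpiryTimeMicros exp_write_low;      // 0 => plain (non-expiry) keys present
    ExpiryTimeMicros exp_write_high;     // newest write time in the file
    ExpiryTimeMicros exp_explicit_high;  // furthest explicit expiry
};

// Hypothetical: true when every key in the file is already expired.
bool WholeFileExpired(const FileExpiry& f, ExpiryTimeMicros now,
                      ExpiryTimeMicros write_ttl) {
    if (f.exp_write_low == 0) return false;  // plain keys never expire
    bool writes_aged = f.exp_write_high + write_ttl <= now;
    bool explicit_done = f.exp_explicit_high <= now;
    return writes_aged && explicit_done;
}

int main() {
    FileExpiry f = {1, 1000, 2000};
    std::printf("%d\n", WholeFileExpired(f, 5000, 100));  // 1: fully expired
    std::printf("%d\n", WholeFileExpired(f, 1500, 100));  // 0: explicit expiry pending
    return 0;
}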
|
||||
|
||||
|
||||
class FileMetaDataPtrCompare
|
||||
{
|
||||
protected:
|
||||
const Comparator * comparator_;
|
||||
|
||||
public:
|
||||
explicit FileMetaDataPtrCompare(const Comparator * Comparer)
|
||||
: comparator_(Comparer) {};
|
||||
|
||||
bool operator() (const FileMetaData * file1, const FileMetaData * file2) const
|
||||
{
|
||||
return(comparator_->Compare(file1->smallest.user_key(), file2->smallest.user_key()) < 0);
|
||||
}
|
||||
}; // class FileMetaDataPtrCompare
|
||||
|
||||
class VersionEdit {
|
||||
public:
|
||||
VersionEdit() { Clear(); }
|
||||
|
@ -59,6 +85,7 @@ class VersionEdit {
  // Add the specified file at the specified number.
  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
#if 0
  void AddFile(int level, uint64_t file,
               uint64_t file_size,
               const InternalKey& smallest,

@ -68,6 +95,27 @@ class VersionEdit {
    f.file_size = file_size;
    f.smallest = smallest;
    f.largest = largest;
    f.level = level;
    new_files_.push_back(std::make_pair(level, f));
  }
#endif

  void AddFile2(int level, uint64_t file,
                uint64_t file_size,
                const InternalKey& smallest,
                const InternalKey& largest,
                uint64_t exp_write_low,
                uint64_t exp_write_high,
                uint64_t exp_explicit_high) {
    FileMetaData f;
    f.number = file;
    f.file_size = file_size;
    f.smallest = smallest;
    f.largest = largest;
    f.level = level;
    f.exp_write_low = exp_write_low;
    f.exp_write_high = exp_write_high;
    f.exp_explicit_high = exp_explicit_high;
    new_files_.push_back(std::make_pair(level, f));
  }
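For reference, this is how the unit tests later in this diff drive AddFile2; a file with no expiry metadata simply passes zeros for the three trailing arguments:

leveldb::VersionEdit edit;
// Record a level-3 table file; the trailing zeros mean "no write-time
// expiry data and no explicit expiry recorded" for this file.
edit.AddFile2(3 /*level*/, 300 /*file number*/, 400 /*file size*/,
              leveldb::InternalKey("foo", 0, 500, leveldb::kTypeValue),
              leveldb::InternalKey("zoo", 0, 600, leveldb::kTypeDeletion),
              0, 0, 0);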
@ -75,16 +123,37 @@ class VersionEdit {
  void DeleteFile(int level, uint64_t file) {
    deleted_files_.insert(std::make_pair(level, file));
  }
  size_t DeletedFileCount() const {return(deleted_files_.size());};

  void EncodeTo(std::string* dst) const;
  void EncodeTo(std::string* dst, bool format2=true) const;
  Status DecodeFrom(const Slice& src);

  // unit test access to validate file entries' format types
  bool HasF1Files() const {return(has_f1_files_);};
  bool HasF2Files() const {return(has_f2_files_);};

  std::string DebugString() const;

  // Tag numbers for serialized VersionEdit.  These numbers are written to
  // disk and should not be changed.
  enum Tag {
    kComparator      = 1,
    kLogNumber       = 2,
    kNextFileNumber  = 3,
    kLastSequence    = 4,
    kCompactPointer  = 5,
    kDeletedFile     = 6,
    kNewFile         = 7,
    // 8 was used for large value refs
    kPrevLogNumber   = 9,
    kFileCacheObject = 10,
    kNewFile2        = 11   // expiry capable file
  };
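Putting the new tag together with the DecodeFrom logic at the top of this section, a kNewFile2 record carries the level, file number, file size, both internal keys, and the three expiry stamps. A sketch of the matching encode side follows; the commit's actual EncodeTo(dst, format2) lives in version_edit.cc (suppressed below) and may differ in detail:

// Sketch of the encode side implied by DecodeFrom above; illustration only.
void EncodeNewFile2(std::string* dst, int level, const leveldb::FileMetaData& f) {
  using namespace leveldb;
  PutVarint32(dst, VersionEdit::kNewFile2);  // record tag (11)
  PutVarint32(dst, level);                   // level
  PutVarint64(dst, f.number);                // file number
  PutVarint64(dst, f.file_size);             // file size in bytes
  PutLengthPrefixedSlice(dst, f.smallest.Encode());
  PutLengthPrefixedSlice(dst, f.largest.Encode());
  PutVarint64(dst, f.exp_write_low);         // expiry triple, one varint64 each
  PutVarint64(dst, f.exp_write_high);
  PutVarint64(dst, f.exp_explicit_high);
}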

 private:
  friend class VersionSet;

  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
  USED_BY_NESTED_FRIEND2(typedef std::set< std::pair<int, uint64_t> > DeletedFileSet)

  std::string comparator_;
  uint64_t log_number_;

@ -96,10 +165,13 @@ class VersionEdit {
  bool has_prev_log_number_;
  bool has_next_file_number_;
  bool has_last_sequence_;
  // following should be mutually exclusive, but tested independently to be sure
  bool has_f1_files_;   // manifest uses format 1 (for unit tests)
  bool has_f2_files_;   // manifest uses format 2 (for unit tests)

  std::vector< std::pair<int, InternalKey> > compact_pointers_;
  DeletedFileSet deleted_files_;
  std::vector< std::pair<int, FileMetaData> > new_files_;
  USED_BY_NESTED_FRIEND2(std::vector< std::pair<int, InternalKey> > compact_pointers_)
  USED_BY_NESTED_FRIEND(DeletedFileSet deleted_files_)
  USED_BY_NESTED_FRIEND2(std::vector< std::pair<int, FileMetaData> > new_files_)
};

}  // namespace leveldb
@ -7,14 +7,22 @@

namespace leveldb {

static void TestEncodeDecode(const VersionEdit& edit) {
static void TestEncodeDecode(
    const VersionEdit& edit,
    bool format2=false) {
  std::string encoded, encoded2;
  edit.EncodeTo(&encoded);
  edit.EncodeTo(&encoded,format2);
  VersionEdit parsed;
  Status s = parsed.DecodeFrom(encoded);
  ASSERT_TRUE(s.ok()) << s.ToString();
  parsed.EncodeTo(&encoded2);
  parsed.EncodeTo(&encoded2,format2);
  ASSERT_EQ(encoded, encoded2);

  if (parsed.HasF1Files() || parsed.HasF2Files())
  {
    ASSERT_EQ(parsed.HasF1Files(), !format2);
    ASSERT_EQ(parsed.HasF2Files(), format2);
  }   // if
}

class VersionEditTest { };

@ -25,11 +33,12 @@ TEST(VersionEditTest, EncodeDecode) {
  VersionEdit edit;
  for (int i = 0; i < 4; i++) {
    TestEncodeDecode(edit);
    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
                 InternalKey("foo", kBig + 500 + i, kTypeValue),
                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
    edit.AddFile2(3, kBig + 300 + i, kBig + 400 + i,
                  InternalKey("foo", 0, kBig + 500 + i, kTypeValue),
                  InternalKey("zoo", 0, kBig + 600 + i, kTypeDeletion),
                  0,0,0);
    edit.DeleteFile(4, kBig + 700 + i);
    edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
    edit.SetCompactPointer(i, InternalKey("x", 0, kBig + 900 + i, kTypeValue));
  }

  edit.SetComparatorName("foo");

@ -39,6 +48,29 @@ TEST(VersionEditTest, EncodeDecode) {
  TestEncodeDecode(edit);
}

TEST(VersionEditTest, EncodeDecodeExpiry) {
  static const uint64_t kBig = 1ull << 25;

  VersionEdit edit;
  for (int i = 0; i < 4; i++) {
    TestEncodeDecode(edit, false);   // only testing for s.ok()
    edit.AddFile2(3, kBig + 300 + i, kBig + 400 + i,
                  InternalKey("foo", 700+i, kBig + 500 + i, kTypeValueExplicitExpiry),
                  InternalKey("zoo", 800+i, kBig + 600 + i, kTypeDeletion),
                  10203040,
                  123456789,
                  987654321);
    edit.DeleteFile(4, kBig + 700 + i);
    edit.SetCompactPointer(i, InternalKey("x", 0, kBig + 900 + i, kTypeValue));
  }

  edit.SetComparatorName("foo");
  edit.SetLogNumber(kBig + 100);
  edit.SetNextFile(kBig + 200);
  edit.SetLastSequence(kBig + 1000);
  TestEncodeDecode(edit, true);
}

}  // namespace leveldb

int main(int argc, char** argv) {
File diff suppressed because it is too large
@ -21,7 +21,9 @@
#include "db/dbformat.h"
#include "db/version_edit.h"
#include "port/port.h"
#include "port/thread_annotations.h"
#include "leveldb/atomics.h"
#include "leveldb/env.h"
#include "util/throttle.h"

namespace leveldb {

@ -70,7 +72,7 @@ class Version {
    FileMetaData* seek_file;
    int seek_file_level;
  };
  Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
  Status Get(const ReadOptions&, const LookupKey& key, Value* val,
             GetStats* stats);

  // Adds "stats" into the current state.  Returns true if a new

@ -78,12 +80,6 @@ class Version {
  // REQUIRES: lock is held
  bool UpdateStats(const GetStats& stats);

  // Record a sample of bytes read at the specified internal key.
  // Samples are taken approximately once every config::kReadBytesPeriod
  // bytes.  Returns true if a new compaction may need to be triggered.
  // REQUIRES: lock is held
  bool RecordReadSample(Slice key);

  // Reference count management (so Versions do not disappear out from
  // under live iterators)
  void Ref();
@ -101,43 +97,47 @@ class Version {
  // largest_user_key==NULL represents a key largest than all keys in the DB.
  bool OverlapInLevel(int level,
                      const Slice* smallest_user_key,
                      const Slice* largest_user_key);
                      const Slice* largest_user_key) const;

  // Return the level at which we should place a new memtable compaction
  // result that covers the range [smallest_user_key,largest_user_key].
  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
                                 const Slice& largest_user_key);
                                 const Slice& largest_user_key,
                                 const int level_limit);

  int NumFiles(int level) const { return files_[level].size(); }
  virtual size_t NumFiles(int level) const { return files_[level].size(); }

  const VersionSet * GetVersionSet() const { return vset_; }

  typedef std::vector<FileMetaData*> FileMetaDataVector_t;

  virtual const std::vector<FileMetaData*> & GetFileList(int level) const {return files_[level];};

  volatile int WritePenalty() const {return write_penalty_; }

  // Riak specific repair routine
  bool VerifyLevels(int & level, InternalKey & begin, InternalKey & end);

  // Return a human readable string that describes this version's contents.
  std::string DebugString() const;

 private:
 protected:
  friend class Compaction;
  friend class VersionSet;

  class LevelFileNumIterator;
  Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;

  // Call func(arg, level, f) for every file that overlaps user_key in
  // order from newest to oldest.  If an invocation of func returns
  // false, makes no more calls.
  //
  // REQUIRES: user portion of internal_key == user_key.
  void ForEachOverlapping(Slice user_key, Slice internal_key,
                          void* arg,
                          bool (*func)(void*, int, FileMetaData*));

  VersionSet* vset_;            // VersionSet to which this Version belongs
  Version* next_;               // Next version in linked list
  Version* prev_;               // Previous version in linked list
  int refs_;                    // Number of live refs to this version

  // List of files per level
  std::vector<FileMetaData*> files_[config::kNumLevels];
  USED_BY_NESTED_FRIEND(std::vector<FileMetaData*> files_[config::kNumLevels];)

  // Next file to compact based on seek stats.
 protected:
  // Next file to compact based on seek stats (or Riak delete test)
  FileMetaData* file_to_compact_;
  int file_to_compact_level_;
@ -146,17 +146,29 @@ class Version {
  // are initialized by Finalize().
  double compaction_score_;
  int compaction_level_;
  bool compaction_grooming_;
  bool compaction_no_move_;
  bool compaction_expirefile_;
  volatile int write_penalty_;

 protected:
  // make the ctor/dtor protected, so that a unit test can subclass
  explicit Version(VersionSet* vset)
      : vset_(vset), next_(this), prev_(this), refs_(0),
        file_to_compact_(NULL),
        file_to_compact_level_(-1),
        compaction_score_(-1),
        compaction_level_(-1) {
        compaction_level_(-1),
        compaction_grooming_(false),
        compaction_no_move_(false),
        compaction_expirefile_(false),
        write_penalty_(0)
  {
  }

  ~Version();
  virtual ~Version();

 private:
  // No copying allowed
  Version(const Version&);
  void operator=(const Version&);
@ -175,11 +187,10 @@ class VersionSet {
  // current version.  Will release *mu while actually writing to the file.
  // REQUIRES: *mu is held on entry.
  // REQUIRES: no other thread concurrently calls LogAndApply()
  Status LogAndApply(VersionEdit* edit, port::Mutex* mu)
      EXCLUSIVE_LOCKS_REQUIRED(mu);
  Status LogAndApply(VersionEdit* edit, port::Mutex* mu);

  // Recover the last saved descriptor from persistent storage.
  Status Recover(bool *save_manifest);
  Status Recover();

  // Return the current version.
  Version* current() const { return current_; }
@ -188,19 +199,29 @@ class VersionSet {
  uint64_t ManifestFileNumber() const { return manifest_file_number_; }

  // Allocate and return a new file number
  uint64_t NewFileNumber() { return next_file_number_++; }
  // (-1 is to "duplicate" old post-increment logic while maintaining
  //  some threading integrity ... next_file_number_ used naked a bunch)
  uint64_t NewFileNumber() { return(inc_and_fetch(&next_file_number_) -1); }

  // Arrange to reuse "file_number" unless a newer file number has
  // already been allocated.
  // REQUIRES: "file_number" was returned by a call to NewFileNumber().
  // (disabled due to threading concerns ... and desire NOT to use mutex, matthewv)
  void ReuseFileNumber(uint64_t file_number) {
    if (next_file_number_ == file_number + 1) {
      next_file_number_ = file_number;
    }
    // if (next_file_number_ == file_number + 1) {
    //   next_file_number_ = file_number;
    // }
  }
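inc_and_fetch comes from the newly included leveldb/atomics.h. Assuming it is a conventional atomic add-then-return (illustration only; the real header may differ), the trailing -1 recovers the old post-increment result without the data race:

// Plausible inc_and_fetch, assuming a GCC-style atomic builtin underneath.
inline uint64_t inc_and_fetch(volatile uint64_t* ptr) {
  return __sync_add_and_fetch(ptr, 1);   // returns the *new* value
}

// With next_file_number_ == 7:
//   inc_and_fetch(&next_file_number_)      -> 8   (counter now 8)
//   inc_and_fetch(&next_file_number_) - 1  -> 7   == old next_file_number_++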

  // Return the number of Table files at the specified level.
  int NumLevelFiles(int level) const;
  size_t NumLevelFiles(int level) const;

  // is the specified level overlapped (or if false->sorted)
  static bool IsLevelOverlapped(int level);

  static uint64_t DesiredBytesForLevel(int level);
  static uint64_t MaxBytesForLevel(int level);
  static uint64_t MaxFileSizeForLevel(int level);

  // Return the combined file size of all files at the specified level.
  int64_t NumLevelBytes(int level) const;
@ -224,11 +245,36 @@ class VersionSet {
  // being compacted, or zero if there is no such log file.
  uint64_t PrevLogNumber() const { return prev_log_number_; }

  int WriteThrottleUsec(bool active_compaction)
  {
    uint64_t penalty, throttle;
    int ret_val;

    penalty=current_->write_penalty_;
    throttle=GetThrottleWriteRate();

    ret_val=0;
    if (0==penalty && 1!=throttle)
      ret_val=(int)throttle;
    else if (0!=penalty)
    {
      if (1==throttle)
        throttle=GetUnadjustedThrottleWriteRate();
      ret_val=(int)penalty * throttle;
    }   // else if

    return(ret_val);
  }
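A few illustrative traces through WriteThrottleUsec (the numbers are made up; a throttle of 1 evidently acts as the "no delay" sentinel):

// penalty == 0, throttle == 250  -> returns 250 usec (pure throttle delay)
// penalty == 0, throttle == 1    -> returns 0        (sentinel: no delay)
// penalty == 4, throttle == 200  -> returns 4 * 200 = 800 usec
// penalty == 4, throttle == 1    -> falls back to GetUnadjustedThrottleWriteRate(),
//                                   e.g. 300 usec, and returns 4 * 300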

  // Pick level and inputs for a new compaction.
  // Returns NULL if there is no compaction to be done.
  // Otherwise returns a pointer to a heap-allocated object that
  // describes the compaction.  Caller should delete the result.
  Compaction* PickCompaction();
  //
  // Riak October 2013:  Pick Compaction now posts work directly
  //  to hot_thread pools
  void PickCompaction(class DBImpl * db_impl);

  // Return a compaction object for compacting the range [begin,end] in
  // the specified level.  Returns NULL if there is nothing in that
@ -267,16 +313,42 @@ class VersionSet {
    char buffer[100];
  };
  const char* LevelSummary(LevelSummaryStorage* scratch) const;
  const char* CompactionSummary(LevelSummaryStorage* scratch) const;

 private:
  TableCache* GetTableCache() {return(table_cache_);};

  const Options * GetOptions() const {return(options_);};

  bool IsCompactionSubmitted(int level)
    {return(m_CompactionStatus[level].m_Submitted);}

  void SetCompactionSubmitted(int level)
    {m_CompactionStatus[level].m_Submitted=true;}

  void SetCompactionRunning(int level)
    {m_CompactionStatus[level].m_Running=true;}

  void SetCompactionDone(int level, uint64_t Now)
  { m_CompactionStatus[level].m_Running=false;
    m_CompactionStatus[level].m_Submitted=false;
    // must set both source and destination.  otherwise
    //  destination might immediately decide it needs a
    //  timed grooming too ... defeating idea to spreadout the groomings
    m_CompactionStatus[level].m_LastCompaction=Now;
    if ((level+1)<config::kNumLevels)
      m_CompactionStatus[level+1].m_LastCompaction=Now;
  }

  bool NeighborCompactionsQuiet(int level);

 protected:
  class Builder;

  friend class Compaction;
  friend class Version;

  bool ReuseManifest(const std::string& dscname, const std::string& dscbase);

  void Finalize(Version* v);
  bool Finalize(Version* v);
  void UpdatePenalty(Version *v);

  void GetRange(const std::vector<FileMetaData*>& inputs,
                InternalKey* smallest,
@ -299,7 +371,7 @@ class VersionSet {
  const Options* const options_;
  TableCache* const table_cache_;
  const InternalKeyComparator icmp_;
  uint64_t next_file_number_;
  volatile uint64_t next_file_number_;
  uint64_t manifest_file_number_;
  uint64_t last_sequence_;
  uint64_t log_number_;
@ -315,11 +387,44 @@ class VersionSet {
  // Either an empty string, or a valid InternalKey.
  std::string compact_pointer_[config::kNumLevels];

  // Riak allows multiple compaction threads, this mutex allows
  //  only one to write to manifest at a time.  Only used in LogAndApply
  port::Mutex manifest_mutex_;

  volatile uint64_t last_penalty_minutes_;
  volatile int prev_write_penalty_;


  struct CompactionStatus_s
  {
    bool m_Submitted;            //!< level submitted to hot thread pool
    bool m_Running;              //!< thread actually running compaction
    uint64_t m_LastCompaction;   //!< NowMicros() when last compaction completed

    CompactionStatus_s()
        : m_Submitted(false), m_Running(false), m_LastCompaction(0)
    {};
  } m_CompactionStatus[config::kNumLevels];
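The expected lifecycle of the per-level flags above, inferred from the setters declared earlier in this header (the driver function is hypothetical, not from this commit, and the setters are in practice only reachable from friend classes):

// Hypothetical driver, for illustration of the submit -> run -> done flow.
void RunLevelCompaction(leveldb::VersionSet& vs, int level, uint64_t now) {
  if (vs.IsCompactionSubmitted(level))
    return;                           // already queued on a hot thread pool
  vs.SetCompactionSubmitted(level);   // queued
  vs.SetCompactionRunning(level);     // a worker picked it up
  // ... perform the compaction ...
  vs.SetCompactionDone(level, now);   // clears both flags and stamps
                                      //  m_LastCompaction on level and level+1
}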

 private:
  // No copying allowed
  VersionSet(const VersionSet&);
  void operator=(const VersionSet&);
};

//
// allows routing of compaction request to
//  diverse processing routines via common
//  BackgroundCall2 thread entry
//
enum CompactionType
{
  kNormalCompaction = 0x0,
  kExpiryFileCompaction = 0x1
};  // CompactionType


// A Compaction encapsulates information about a compaction.
class Compaction {
 public:
@ -329,6 +434,9 @@ class Compaction {
  // and "level+1" will be merged to produce a set of "level+1" files.
  int level() const { return level_; }

  // Return parent Version object
  const Version * version() const { return input_version_; }

  // Return the object that holds the edits to the descriptor done
  // by this compaction.
  VersionEdit* edit() { return &edit_; }

@ -356,32 +464,47 @@ class Compaction {

  // Returns true iff we should stop building the current output
  // before processing "internal_key".
  bool ShouldStopBefore(const Slice& internal_key);
  bool ShouldStopBefore(const Slice& internal_key, size_t key_count);

  // Release the input version for the compaction, once the compaction
  // is successful.
  void ReleaseInputs();

  // Riak specific:  get summary statistics from compaction inputs
  void CalcInputStats(TableCache & tables);
  size_t TotalUserDataSize() const {return(tot_user_data_);};
  size_t TotalIndexKeys() const {return(tot_index_keys_);};
  size_t AverageValueSize() const {return(avg_value_size_);};
  size_t AverageKeySize() const {return(avg_key_size_);};
  size_t AverageBlockSize() const {return(avg_block_size_);};
  bool IsCompressible() const {return(compressible_);};

  // Riak specific:  is move operation ok for compaction?
  bool IsMoveOk() const {return(!no_move_);};

  enum CompactionType GetCompactionType() const {return(compaction_type_);};

 private:
  friend class Version;
  friend class VersionSet;

  Compaction(const Options* options, int level);
  explicit Compaction(int level);

  int level_;
  uint64_t max_output_file_size_;
  Version* input_version_;
  VersionEdit edit_;
  CompactionType compaction_type_;

  // Each compaction reads inputs from "level_" and "level_+1"
  std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs

  // State used to check for number of overlapping grandparent files
  // State used to check for number of of overlapping grandparent files
  // (parent == level_ + 1, grandparent == level_ + 2)
  std::vector<FileMetaData*> grandparents_;
  size_t grandparent_index_;   // Index in grandparent_starts_
  bool seen_key_;              // Some output key has been seen
  int64_t overlapped_bytes_;   // Bytes of overlap between current output
  uint64_t overlapped_bytes_;  // Bytes of overlap between current output
                               // and grandparent files

  // State for implementing IsBaseLevelForKey
@ -391,6 +514,16 @@ class Compaction {
  // higher level than the ones involved in this compaction (i.e. for
  // all L >= level_ + 2).
  size_t level_ptrs_[config::kNumLevels];

  // Riak specific:  output statistics from CalcInputStats
  size_t tot_user_data_;
  size_t tot_index_keys_;
  size_t avg_value_size_;
  size_t avg_key_size_;
  size_t avg_block_size_;
  bool compressible_;
  bool stats_done_;
  bool no_move_;
};

}  // namespace leveldb
@ -27,13 +27,13 @@ class FindFileTest {
               SequenceNumber largest_seq = 100) {
    FileMetaData* f = new FileMetaData;
    f->number = files_.size() + 1;
    f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
    f->largest = InternalKey(largest, largest_seq, kTypeValue);
    f->smallest = InternalKey(smallest, 0, smallest_seq, kTypeValue);
    f->largest = InternalKey(largest, 0, largest_seq, kTypeValue);
    files_.push_back(f);
  }

  int Find(const char* key) {
    InternalKey target(key, 100, kTypeValue);
    InternalKey target(key, 0, 100, kTypeValue);
    InternalKeyComparator cmp(BytewiseComparator());
    return FindFile(cmp, files_, target.Encode());
  }
@ -13,13 +13,17 @@
//    len: varint32
//    data: uint8[len]

#include "leveldb/write_batch.h"
#include <stdint.h>

#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/expiry.h"
#include "leveldb/write_batch.h"
#include "db/dbformat.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "util/coding.h"
#include "util/throttle.h"

namespace leveldb {

@ -47,16 +51,17 @@ Status WriteBatch::Iterate(Handler* handler) const {

  input.remove_prefix(kHeader);
  Slice key, value;
  ExpiryTimeMicros expiry;
  int found = 0;
  while (!input.empty()) {
    found++;
    char tag = input[0];
    ValueType tag = (ValueType)input[0];
    input.remove_prefix(1);
    switch (tag) {
      case kTypeValue:
        if (GetLengthPrefixedSlice(&input, &key) &&
            GetLengthPrefixedSlice(&input, &value)) {
          handler->Put(key, value);
          handler->Put(key, value, kTypeValue, 0);
        } else {
          return Status::Corruption("bad WriteBatch Put");
        }
@ -68,6 +73,16 @@ Status WriteBatch::Iterate(Handler* handler) const {
          return Status::Corruption("bad WriteBatch Delete");
        }
        break;
      case kTypeValueWriteTime:
      case kTypeValueExplicitExpiry:
        if (GetLengthPrefixedSlice(&input, &key) &&
            GetVarint64(&input, &expiry) &&
            GetLengthPrefixedSlice(&input, &value)) {
          handler->Put(key, value, tag, expiry);
        } else {
          return Status::Corruption("bad WriteBatch Expiry");
        }
        break;
      default:
        return Status::Corruption("unknown WriteBatch tag");
    }
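In other words, relative to the plain kTypeValue layout documented at the top of this file, the expiry record types splice a varint64 timestamp between key and value. Record layouts implied by the decode loop (sketch, not normative):

// kTypeValue:                tag(1) | klen varint32 | key | vlen varint32 | value
// kTypeValueWriteTime /
// kTypeValueExplicitExpiry:  tag(1) | klen varint32 | key | expiry varint64
//                            | vlen varint32 | value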
@ -95,10 +110,20 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
  EncodeFixed64(&b->rep_[0], seq);
}

void WriteBatch::Put(const Slice& key, const Slice& value) {
void WriteBatch::Put(const Slice& key, const Slice& value, const KeyMetaData * meta) {
  KeyMetaData local_meta;
  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
  rep_.push_back(static_cast<char>(kTypeValue));
  if (NULL!=meta)
    local_meta=*meta;
  rep_.push_back(static_cast<char>(local_meta.m_Type));
  PutLengthPrefixedSlice(&rep_, key);
  if (kTypeValueExplicitExpiry==local_meta.m_Type
      || kTypeValueWriteTime==local_meta.m_Type)
  {
    if (kTypeValueWriteTime==local_meta.m_Type && 0==local_meta.m_Expiry)
      local_meta.m_Expiry=GetCachedTimeMicros();
    PutVarint64(&rep_, local_meta.m_Expiry);
  }   // if
  PutLengthPrefixedSlice(&rep_, value);
}
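Client side, the optional KeyMetaData argument is what selects an expiry record. This mirrors the MultipleExpiry test later in this diff; the final Tick/Tock pair is illustrative only:

leveldb::WriteBatch batch;
leveldb::KeyMetaData meta;

batch.Put(leveldb::Slice("Mary"), leveldb::Slice("Lamb"));   // plain kTypeValue

meta.m_Type = leveldb::kTypeValueExplicitExpiry;
meta.m_Expiry = 2347;                 // absolute expiry stamp supplied by caller
batch.Put(leveldb::Slice("Adam"), leveldb::Slice("Ant"), &meta);

meta.m_Type = leveldb::kTypeValueWriteTime;
meta.m_Expiry = 0;                    // 0 => Put() fills in GetCachedTimeMicros()
batch.Put(leveldb::Slice("Tick"), leveldb::Slice("Tock"), &meta);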
@ -113,23 +138,33 @@ class MemTableInserter : public WriteBatch::Handler {
 public:
  SequenceNumber sequence_;
  MemTable* mem_;
  const Options * options_;

  virtual void Put(const Slice& key, const Slice& value) {
    mem_->Add(sequence_, kTypeValue, key, value);
  MemTableInserter() : mem_(NULL), options_(NULL) {};

  virtual void Put(const Slice& key, const Slice& value, const ValueType &type, const ExpiryTimeMicros &expiry) {
    ValueType type_use(type);
    ExpiryTimeMicros expiry_use(expiry);

    if (NULL!=options_ && options_->ExpiryActivated())
      options_->expiry_module->MemTableInserterCallback(key, value, type_use, expiry_use);
    mem_->Add(sequence_, (ValueType)type_use, key, value, expiry_use);
    sequence_++;
  }
  virtual void Delete(const Slice& key) {
    mem_->Add(sequence_, kTypeDeletion, key, Slice());
    mem_->Add(sequence_, kTypeDeletion, key, Slice(), 0);
    sequence_++;
  }
};
}  // namespace

Status WriteBatchInternal::InsertInto(const WriteBatch* b,
                                      MemTable* memtable) {
                                      MemTable* memtable,
                                      const Options * options) {
  MemTableInserter inserter;
  inserter.sequence_ = WriteBatchInternal::Sequence(b);
  inserter.mem_ = memtable;
  inserter.options_ = options;
  return b->Iterate(&inserter);
}
@ -5,7 +5,6 @@
#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_

#include "db/dbformat.h"
#include "leveldb/write_batch.h"

namespace leveldb {

@ -22,10 +21,10 @@ class WriteBatchInternal {
  // Set the count for the number of entries in the batch.
  static void SetCount(WriteBatch* batch, int n);

  // Return the sequence number for the start of this batch.
  // Return the seqeunce number for the start of this batch.
  static SequenceNumber Sequence(const WriteBatch* batch);

  // Store the specified number as the sequence number for the start of
  // Store the specified number as the seqeunce number for the start of
  // this batch.
  static void SetSequence(WriteBatch* batch, SequenceNumber seq);

@ -39,7 +38,7 @@ class WriteBatchInternal {

  static void SetContents(WriteBatch* batch, const Slice& contents);

  static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
  static Status InsertInto(const WriteBatch* batch, MemTable* memtable, const Options * options);

  static void Append(WriteBatch* dst, const WriteBatch* src);
};
@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <sstream>
#include "leveldb/db.h"

#include "db/memtable.h"

@ -17,11 +18,12 @@ static std::string PrintContents(WriteBatch* b) {
  MemTable* mem = new MemTable(cmp);
  mem->Ref();
  std::string state;
  Status s = WriteBatchInternal::InsertInto(b, mem);
  Status s = WriteBatchInternal::InsertInto(b, mem, NULL);
  int count = 0;
  Iterator* iter = mem->NewIterator();
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ParsedInternalKey ikey;
    std::stringstream sstr;
    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
    switch (ikey.type) {
      case kTypeValue:

@ -32,6 +34,28 @@ static std::string PrintContents(WriteBatch* b) {
        state.append(")");
        count++;
        break;
      case kTypeValueWriteTime:
        state.append("PutWT(");
        state.append(ikey.user_key.ToString());
        state.append(", ");
        sstr << ikey.expiry;
        state.append(sstr.str());
        state.append(", ");
        state.append(iter->value().ToString());
        state.append(")");
        count++;
        break;
      case kTypeValueExplicitExpiry:
        state.append("PutEE(");
        state.append(ikey.user_key.ToString());
        state.append(", ");
        sstr << ikey.expiry;
        state.append(sstr.str());
        state.append(", ");
        state.append(iter->value().ToString());
        state.append(")");
        count++;
        break;
      case kTypeDeletion:
        state.append("Delete(");
        state.append(ikey.user_key.ToString());
@ -74,6 +98,32 @@ TEST(WriteBatchTest, Multiple) {
            PrintContents(&batch));
}

TEST(WriteBatchTest, MultipleExpiry) {
  WriteBatch batch;
  KeyMetaData meta;
  batch.Put(Slice("Mary"), Slice("Lamb"));
  meta.m_Type=kTypeValueExplicitExpiry;
  meta.m_Expiry=2347;
  batch.Put(Slice("Adam"), Slice("Ant"), &meta);
  //batch.PutExplicitExpiry(Slice("Adam"), Slice("Ant"), 2347);
  batch.Put(Slice("Frosty"), Slice("Snowman"));
  batch.Put(Slice("Tip"), Slice("ONeal"));
  batch.Delete(Slice("Frosty"));
  meta.m_Type=kTypeValueExplicitExpiry;
  meta.m_Expiry=987654321;
  batch.Put(Slice("The"), Slice("Fonz"), &meta);
  WriteBatchInternal::SetSequence(&batch, 200);
  ASSERT_EQ(200, WriteBatchInternal::Sequence(&batch));
  ASSERT_EQ(6, WriteBatchInternal::Count(&batch));
  ASSERT_EQ("PutEE(Adam, 2347, Ant)@201"
            "Delete(Frosty)@204"
            "Put(Frosty, Snowman)@202"
            "Put(Mary, Lamb)@200"
            "PutEE(The, 987654321, Fonz)@205"
            "Put(Tip, ONeal)@203",
            PrintContents(&batch));
}

TEST(WriteBatchTest, Corruption) {
  WriteBatch batch;
  batch.Put(Slice("foo"), Slice("bar"));
@ -618,7 +618,7 @@ class Benchmark {
    ErrorCheck(status);

    // Execute read statement
    while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW) {}
    while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW);
    StepErrorCheck(status);

    // Reset SQLite statement for another use

@ -338,7 +338,7 @@ class Benchmark {
    bool write_sync = false;
    if (name == Slice("fillseq")) {
      Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
      DBSynchronize(db_);

    } else if (name == Slice("fillrandom")) {
      Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1);
      DBSynchronize(db_);
89  src/leveldb/doc/doc.css  Normal file
@ -0,0 +1,89 @@
body {
  margin-left: 0.5in;
  margin-right: 0.5in;
  background: white;
  color: black;
}

h1 {
  margin-left: -0.2in;
  font-size: 14pt;
}
h2 {
  margin-left: -0in;
  font-size: 12pt;
}
h3 {
  margin-left: -0in;
}
h4 {
  margin-left: -0in;
}
hr {
  margin-left: -0in;
}

/* Definition lists: definition term bold */
dt {
  font-weight: bold;
}

address {
  text-align: center;
}
code,samp,var {
  color: blue;
}
kbd {
  color: #600000;
}
div.note p {
  float: right;
  width: 3in;
  margin-right: 0%;
  padding: 1px;
  border: 2px solid #6060a0;
  background-color: #fffff0;
}

ul {
  margin-top: -0em;
  margin-bottom: -0em;
}

ol {
  margin-top: -0em;
  margin-bottom: -0em;
}

UL.nobullets {
  list-style-type: none;
  list-style-image: none;
  margin-left: -1em;
}

p {
  margin: 1em 0 1em 0;
  padding: 0 0 0 0;
}

pre {
  line-height: 1.3em;
  padding: 0.4em 0 0.8em 0;
  margin: 0 0 0 0;
  border: 0 0 0 0;
  color: blue;
}

.datatable {
  margin-left: auto;
  margin-right: auto;
  margin-top: 2em;
  margin-bottom: 2em;
  border: 1px solid;
}

.datatable td,th {
  padding: 0 0.5em 0 0.5em;
  text-align: right;
}
213  src/leveldb/doc/impl.html  Normal file
@ -0,0 +1,213 @@
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" type="text/css" href="doc.css" />
<title>Leveldb file layout and compactions</title>
</head>

<body>

<h1>Files</h1>

The implementation of leveldb is similar in spirit to the
representation of a single
<a href="http://labs.google.com/papers/bigtable.html">
Bigtable tablet (section 5.3)</a>.
However the organization of the files that make up the representation
is somewhat different and is explained below.

<p>
Each database is represented by a set of files stored in a directory.
There are several different types of files as documented below:
<p>
<h2>Log files</h2>
<p>
A log file (*.log) stores a sequence of recent updates.  Each update
is appended to the current log file.  When the log file reaches a
pre-determined size (approximately 4MB by default), it is converted
to a sorted table (see below) and a new log file is created for future
updates.
<p>
A copy of the current log file is kept in an in-memory structure (the
<code>memtable</code>).  This copy is consulted on every read so that read
operations reflect all logged updates.
<p>
<h2>Sorted tables</h2>
<p>
A sorted table (*.sst) stores a sequence of entries sorted by key.
Each entry is either a value for the key, or a deletion marker for the
key.  (Deletion markers are kept around to hide obsolete values
present in older sorted tables).
<p>
The set of sorted tables are organized into a sequence of levels.  The
sorted table generated from a log file is placed in a special <code>young</code>
level (also called level-0).  When the number of young files exceeds a
certain threshold (currently four), all of the young files are merged
together with all of the overlapping level-1 files to produce a
sequence of new level-1 files (we create a new level-1 file for every
2MB of data.)
<p>
Files in the young level may contain overlapping keys.  However files
in other levels have distinct non-overlapping key ranges.  Consider
level number L where L >= 1.  When the combined size of files in
level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2,
...), one file in level-L, and all of the overlapping files in
level-(L+1) are merged to form a set of new files for level-(L+1).
These merges have the effect of gradually migrating new updates from
the young level to the largest level using only bulk reads and writes
(i.e., minimizing expensive seeks).

<h2>Manifest</h2>
<p>
A MANIFEST file lists the set of sorted tables that make up each
level, the corresponding key ranges, and other important metadata.
A new MANIFEST file (with a new number embedded in the file name)
is created whenever the database is reopened.  The MANIFEST file is
formatted as a log, and changes made to the serving state (as files
are added or removed) are appended to this log.
<p>
<h2>Current</h2>
<p>
CURRENT is a simple text file that contains the name of the latest
MANIFEST file.
<p>
<h2>Info logs</h2>
<p>
Informational messages are printed to files named LOG and LOG.old.
<p>
<h2>Others</h2>
<p>
Other files used for miscellaneous purposes may also be present
(LOCK, *.dbtmp).

<h1>Level 0</h1>
When the log file grows above a certain size (1MB by default):
<ul>
<li>Create a brand new memtable and log file and direct future updates here
<li>In the background:
<ul>
<li>Write the contents of the previous memtable to an sstable
<li>Discard the memtable
<li>Delete the old log file and the old memtable
<li>Add the new sstable to the young (level-0) level.
</ul>
</ul>

<h1>Compactions</h1>

<p>
When the size of level L exceeds its limit, we compact it in a
background thread.  The compaction picks a file from level L and all
overlapping files from the next level L+1.  Note that if a level-L
file overlaps only part of a level-(L+1) file, the entire file at
level-(L+1) is used as an input to the compaction and will be
discarded after the compaction.  Aside: because level-0 is special
(files in it may overlap each other), we treat compactions from
level-0 to level-1 specially: a level-0 compaction may pick more than
one level-0 file in case some of these files overlap each other.

<p>
A compaction merges the contents of the picked files to produce a
sequence of level-(L+1) files.  We switch to producing a new
level-(L+1) file after the current output file has reached the target
file size (2MB).  We also switch to a new output file when the key
range of the current output file has grown enough to overlap more then
ten level-(L+2) files.  This last rule ensures that a later compaction
of a level-(L+1) file will not pick up too much data from level-(L+2).

<p>
The old files are discarded and the new files are added to the serving
state.

<p>
Compactions for a particular level rotate through the key space.  In
more detail, for each level L, we remember the ending key of the last
compaction at level L.  The next compaction for level L will pick the
first file that starts after this key (wrapping around to the
beginning of the key space if there is no such file).

<p>
Compactions drop overwritten values.  They also drop deletion markers
if there are no higher numbered levels that contain a file whose range
overlaps the current key.

<h2>Timing</h2>

Level-0 compactions will read up to four 1MB files from level-0, and
at worst all the level-1 files (10MB).  I.e., we will read 14MB and
write 14MB.

<p>
Other than the special level-0 compactions, we will pick one 2MB file
from level L.  In the worst case, this will overlap ~ 12 files from
level L+1 (10 because level-(L+1) is ten times the size of level-L,
and another two at the boundaries since the file ranges at level-L
will usually not be aligned with the file ranges at level-L+1).  The
compaction will therefore read 26MB and write 26MB.  Assuming a disk
IO rate of 100MB/s (ballpark range for modern drives), the worst
compaction cost will be approximately 0.5 second.

<p>
If we throttle the background writing to something small, say 10% of
the full 100MB/s speed, a compaction may take up to 5 seconds.  If the
user is writing at 10MB/s, we might build up lots of level-0 files
(~50 to hold the 5*10MB).  This may signficantly increase the cost of
reads due to the overhead of merging more files together on every
read.

<p>
Solution 1: To reduce this problem, we might want to increase the log
switching threshold when the number of level-0 files is large.  Though
the downside is that the larger this threshold, the more memory we will
need to hold the corresponding memtable.

<p>
Solution 2: We might want to decrease write rate artificially when the
number of level-0 files goes up.

<p>
Solution 3: We work on reducing the cost of very wide merges.
Perhaps most of the level-0 files will have their blocks sitting
uncompressed in the cache and we will only need to worry about the
O(N) complexity in the merging iterator.

<h2>Number of files</h2>

Instead of always making 2MB files, we could make larger files for
larger levels to reduce the total file count, though at the expense of
more bursty compactions.  Alternatively, we could shard the set of
files into multiple directories.

<p>
An experiment on an <code>ext3</code> filesystem on Feb 04, 2011 shows
the following timings to do 100K file opens in directories with
varying number of files:
<table class="datatable">
<tr><th>Files in directory</th><th>Microseconds to open a file</th></tr>
<tr><td>1000</td><td>9</td>
<tr><td>10000</td><td>10</td>
<tr><td>100000</td><td>16</td>
</table>
So maybe even the sharding is not necessary on modern filesystems?

<h1>Recovery</h1>

<ul>
<li> Read CURRENT to find name of the latest committed MANIFEST
<li> Read the named MANIFEST file
<li> Clean up stale files
<li> We could open all sstables here, but it is probably better to be lazy...
<li> Convert log chunk to a new level-0 sstable
<li> Start directing new writes to a new log file with recovered sequence#
</ul>

<h1>Garbage collection of files</h1>

<code>DeleteObsoleteFiles()</code> is called at the end of every
compaction and at the end of recovery.  It finds the names of all
files in the database.  It deletes all log files that are not the
current log file.  It deletes all table files that are not referenced
from some level and are not the output of an active compaction.

</body>
</html>
@ -1,170 +0,0 @@
## Files

The implementation of leveldb is similar in spirit to the representation of a
single [Bigtable tablet (section 5.3)](http://research.google.com/archive/bigtable.html).
However the organization of the files that make up the representation is
somewhat different and is explained below.

Each database is represented by a set of files stored in a directory. There are
several different types of files as documented below:

### Log files

A log file (*.log) stores a sequence of recent updates. Each update is appended
to the current log file. When the log file reaches a pre-determined size
(approximately 4MB by default), it is converted to a sorted table (see below)
and a new log file is created for future updates.

A copy of the current log file is kept in an in-memory structure (the
`memtable`). This copy is consulted on every read so that read operations
reflect all logged updates.

## Sorted tables

A sorted table (*.ldb) stores a sequence of entries sorted by key. Each entry is
either a value for the key, or a deletion marker for the key. (Deletion markers
are kept around to hide obsolete values present in older sorted tables).

The set of sorted tables are organized into a sequence of levels. The sorted
table generated from a log file is placed in a special **young** level (also
called level-0). When the number of young files exceeds a certain threshold
(currently four), all of the young files are merged together with all of the
overlapping level-1 files to produce a sequence of new level-1 files (we create
a new level-1 file for every 2MB of data.)

Files in the young level may contain overlapping keys. However files in other
levels have distinct non-overlapping key ranges. Consider level number L where
L >= 1. When the combined size of files in level-L exceeds (10^L) MB (i.e., 10MB
for level-1, 100MB for level-2, ...), one file in level-L, and all of the
overlapping files in level-(L+1) are merged to form a set of new files for
level-(L+1). These merges have the effect of gradually migrating new updates
from the young level to the largest level using only bulk reads and writes
(i.e., minimizing expensive seeks).

### Manifest

A MANIFEST file lists the set of sorted tables that make up each level, the
corresponding key ranges, and other important metadata. A new MANIFEST file
(with a new number embedded in the file name) is created whenever the database
is reopened. The MANIFEST file is formatted as a log, and changes made to the
serving state (as files are added or removed) are appended to this log.

### Current

CURRENT is a simple text file that contains the name of the latest MANIFEST
file.

### Info logs

Informational messages are printed to files named LOG and LOG.old.

### Others

Other files used for miscellaneous purposes may also be present (LOCK, *.dbtmp).

## Level 0

When the log file grows above a certain size (1MB by default):
Create a brand new memtable and log file and direct future updates here
In the background:
Write the contents of the previous memtable to an sstable
Discard the memtable
Delete the old log file and the old memtable
Add the new sstable to the young (level-0) level.

## Compactions

When the size of level L exceeds its limit, we compact it in a background
thread. The compaction picks a file from level L and all overlapping files from
the next level L+1. Note that if a level-L file overlaps only part of a
level-(L+1) file, the entire file at level-(L+1) is used as an input to the
compaction and will be discarded after the compaction. Aside: because level-0
is special (files in it may overlap each other), we treat compactions from
level-0 to level-1 specially: a level-0 compaction may pick more than one
level-0 file in case some of these files overlap each other.

A compaction merges the contents of the picked files to produce a sequence of
level-(L+1) files. We switch to producing a new level-(L+1) file after the
current output file has reached the target file size (2MB). We also switch to a
new output file when the key range of the current output file has grown enough
to overlap more than ten level-(L+2) files. This last rule ensures that a later
compaction of a level-(L+1) file will not pick up too much data from
level-(L+2).

The old files are discarded and the new files are added to the serving state.

Compactions for a particular level rotate through the key space. In more detail,
for each level L, we remember the ending key of the last compaction at level L.
The next compaction for level L will pick the first file that starts after this
key (wrapping around to the beginning of the key space if there is no such
file).

Compactions drop overwritten values. They also drop deletion markers if there
are no higher numbered levels that contain a file whose range overlaps the
current key.

### Timing

Level-0 compactions will read up to four 1MB files from level-0, and at worst
all the level-1 files (10MB). I.e., we will read 14MB and write 14MB.

Other than the special level-0 compactions, we will pick one 2MB file from level
L. In the worst case, this will overlap ~ 12 files from level L+1 (10 because
level-(L+1) is ten times the size of level-L, and another two at the boundaries
since the file ranges at level-L will usually not be aligned with the file
ranges at level-L+1). The compaction will therefore read 26MB and write 26MB.
Assuming a disk IO rate of 100MB/s (ballpark range for modern drives), the worst
compaction cost will be approximately 0.5 second.

If we throttle the background writing to something small, say 10% of the full
100MB/s speed, a compaction may take up to 5 seconds. If the user is writing at
10MB/s, we might build up lots of level-0 files (~50 to hold the 5*10MB). This
may significantly increase the cost of reads due to the overhead of merging more
files together on every read.

Solution 1: To reduce this problem, we might want to increase the log switching
threshold when the number of level-0 files is large. Though the downside is that
the larger this threshold, the more memory we will need to hold the
corresponding memtable.

Solution 2: We might want to decrease write rate artificially when the number of
level-0 files goes up.

Solution 3: We work on reducing the cost of very wide merges. Perhaps most of
the level-0 files will have their blocks sitting uncompressed in the cache and
we will only need to worry about the O(N) complexity in the merging iterator.

### Number of files

Instead of always making 2MB files, we could make larger files for larger levels
to reduce the total file count, though at the expense of more bursty
compactions. Alternatively, we could shard the set of files into multiple
directories.

An experiment on an ext3 filesystem on Feb 04, 2011 shows the following timings
to do 100K file opens in directories with varying number of files:

| Files in directory | Microseconds to open a file |
|-------------------:|----------------------------:|
|               1000 |                           9 |
|              10000 |                          10 |
|             100000 |                          16 |

So maybe even the sharding is not necessary on modern filesystems?

## Recovery

* Read CURRENT to find name of the latest committed MANIFEST
* Read the named MANIFEST file
* Clean up stale files
* We could open all sstables here, but it is probably better to be lazy...
* Convert log chunk to a new level-0 sstable
* Start directing new writes to a new log file with recovered sequence#

## Garbage collection of files

`DeleteObsoleteFiles()` is called at the end of every compaction and at the end
of recovery. It finds the names of all files in the database. It deletes all log
files that are not the current log file. It deletes all table files that are not
referenced from some level and are not the output of an active compaction.
549  src/leveldb/doc/index.html  Normal file
@ -0,0 +1,549 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<link rel="stylesheet" type="text/css" href="doc.css" />
|
||||
<title>Leveldb</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<h1>Leveldb</h1>
|
||||
<address>Jeff Dean, Sanjay Ghemawat</address>
|
||||
<p>
|
||||
The <code>leveldb</code> library provides a persistent key value store. Keys and
|
||||
values are arbitrary byte arrays. The keys are ordered within the key
|
||||
value store according to a user-specified comparator function.
|
||||
|
||||
<p>
|
||||
<h1>Opening A Database</h1>
|
||||
<p>
|
||||
A <code>leveldb</code> database has a name which corresponds to a file system
|
||||
directory. All of the contents of database are stored in this
|
||||
directory. The following example shows how to open a database,
|
||||
creating it if necessary:
|
||||
<p>
|
||||
<pre>
|
||||
#include <assert>
|
||||
#include "leveldb/db.h"
|
||||
|
||||
leveldb::DB* db;
|
||||
leveldb::Options options;
|
||||
options.create_if_missing = true;
|
||||
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
|
||||
assert(status.ok());
|
||||
...
|
||||
</pre>
|
||||
If you want to raise an error if the database already exists, add
|
||||
the following line before the <code>leveldb::DB::Open</code> call:
|
||||
<pre>
|
||||
options.error_if_exists = true;
|
||||
</pre>
|
||||
<h1>Status</h1>
|
||||
<p>
|
||||
You may have noticed the <code>leveldb::Status</code> type above. Values of this
|
||||
type are returned by most functions in <code>leveldb</code> that may encounter an
|
||||
error. You can check if such a result is ok, and also print an
|
||||
associated error message:
|
||||
<p>
|
||||
<pre>
|
||||
leveldb::Status s = ...;
|
||||
if (!s.ok()) cerr << s.ToString() << endl;
|
||||
</pre>
|
||||
<h1>Closing A Database</h1>
|
||||
<p>
|
||||
When you are done with a database, just delete the database object.
|
||||
Example:
|
||||
<p>
|
||||
<pre>
|
||||
... open the db as described above ...
|
||||
... do something with db ...
|
||||
delete db;
|
||||
</pre>
|
||||
<h1>Reads And Writes</h1>
|
||||
<p>
|
||||
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
|
||||
modify/query the database. For example, the following code
|
||||
moves the value stored under key1 to key2.
|
||||
<pre>
|
||||
std::string value;
|
||||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
|
||||
if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
|
||||
if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
|
||||
</pre>
|
||||
|
||||
<h1>Atomic Updates</h1>
|
||||
<p>
|
||||
Note that if the process dies after the Put of key2 but before the
|
||||
delete of key1, the same value may be left stored under multiple keys.
|
||||
Such problems can be avoided by using the <code>WriteBatch</code> class to
|
||||
atomically apply a set of updates:
|
||||
<p>
|
||||
<pre>
|
||||
#include "leveldb/write_batch.h"
|
||||
...
|
||||
std::string value;
|
||||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
|
||||
if (s.ok()) {
|
||||
leveldb::WriteBatch batch;
|
||||
batch.Delete(key1);
|
||||
batch.Put(key2, value);
|
||||
s = db->Write(leveldb::WriteOptions(), &batch);
|
||||
}
|
||||
</pre>
|
||||
The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
|
||||
and these edits within the batch are applied in order. Note that we
|
||||
called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
|
||||
we do not end up erroneously dropping the value entirely.
|
||||
<p>
|
||||
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
|
||||
speed up bulk updates by placing lots of individual mutations into the
|
||||
same batch.
|
||||
|
||||
<h1>Synchronous Writes</h1>
|
||||
By default, each write to <code>leveldb</code> is asynchronous: it
|
||||
returns after pushing the write from the process into the operating
|
||||
system. The transfer from operating system memory to the underlying
|
||||
persistent storage happens asynchronously. The <code>sync</code> flag
|
||||
can be turned on for a particular write to make the write operation
|
||||
not return until the data being written has been pushed all the way to
|
||||
persistent storage. (On Posix systems, this is implemented by calling
|
||||
either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
|
||||
<code>msync(..., MS_SYNC)</code> before the write operation returns.)
|
||||
<pre>
|
||||
leveldb::WriteOptions write_options;
|
||||
write_options.sync = true;
|
||||
db->Put(write_options, ...);
|
||||
</pre>
|
||||
Asynchronous writes are often more than a thousand times as fast as
|
||||
synchronous writes. The downside of asynchronous writes is that a
|
||||
crash of the machine may cause the last few updates to be lost. Note
|
||||
that a crash of just the writing process (i.e., not a reboot) will not
|
||||
cause any loss since even when <code>sync</code> is false, an update
|
||||
is pushed from the process memory into the operating system before it
|
||||
is considered done.
|
||||
|
||||
<p>
|
||||
Asynchronous writes can often be used safely. For example, when
|
||||
loading a large amount of data into the database you can handle lost
|
||||
updates by restarting the bulk load after a crash. A hybrid scheme is
|
||||
also possible where every Nth write is synchronous, and in the event
|
||||
of a crash, the bulk load is restarted just after the last synchronous
|
||||
write finished by the previous run. (The synchronous write can update
|
||||
a marker that describes where to restart on a crash.)
|
||||

<p>
<code>WriteBatch</code> provides an alternative to asynchronous writes.
Multiple updates may be placed in the same <code>WriteBatch</code> and
applied together using a synchronous write (i.e.,
<code>write_options.sync</code> is set to true). The extra cost of
the synchronous write will be amortized across all of the writes in
the batch.

<p>
<h1>Concurrency</h1>
<p>
A database may only be opened by one process at a time.
The <code>leveldb</code> implementation acquires a lock from the
operating system to prevent misuse. Within a single process, the
same <code>leveldb::DB</code> object may be safely shared by multiple
concurrent threads. I.e., different threads may write into or fetch
iterators or call <code>Get</code> on the same database without any
external synchronization (the leveldb implementation will
automatically do the required synchronization). However other objects
(like Iterator and WriteBatch) may require external synchronization.
If two threads share such an object, they must protect access to it
using their own locking protocol. More details are available in
the public header files.
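<p>
As an illustrative sketch (not from the leveldb sources; it assumes
C++11 <code>std::thread</code>, which is not part of leveldb), two threads
may call <code>Put</code> on one shared <code>leveldb::DB*</code> without
external locking:
<pre>
  void Writer(leveldb::DB* db, const std::string& prefix) {
    for (int i = 0; i < 100; i++) {
      // Safe without external locking: DB methods synchronize internally.
      db->Put(leveldb::WriteOptions(), prefix + std::to_string(i), "v");
    }
  }
  ...
  std::thread t1(Writer, db, "a/");
  std::thread t2(Writer, db, "b/");
  t1.join();
  t2.join();
</pre>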
<p>
<h1>Iteration</h1>
<p>
The following example demonstrates how to print all key,value pairs
in a database.
<p>
<pre>
  leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    cout << it->key().ToString() << ": " << it->value().ToString() << endl;
  }
  assert(it->status().ok());  // Check for any errors found during the scan
  delete it;
</pre>
The following variation shows how to process just the keys in the
range <code>[start,limit)</code>:
<p>
<pre>
  for (it->Seek(start);
       it->Valid() && it->key().ToString() < limit;
       it->Next()) {
    ...
  }
</pre>
You can also process entries in reverse order. (Caveat: reverse
iteration may be somewhat slower than forward iteration.)
<p>
<pre>
  for (it->SeekToLast(); it->Valid(); it->Prev()) {
    ...
  }
</pre>
<h1>Snapshots</h1>
<p>
Snapshots provide consistent read-only views over the entire state of
the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate
that a read should operate on a particular version of the DB state.
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
implicit snapshot of the current state.
<p>
Snapshots are created by the <code>DB::GetSnapshot()</code> method:
<p>
<pre>
  leveldb::ReadOptions options;
  options.snapshot = db->GetSnapshot();
  ... apply some updates to db ...
  leveldb::Iterator* iter = db->NewIterator(options);
  ... read using iter to view the state when the snapshot was created ...
  delete iter;
  db->ReleaseSnapshot(options.snapshot);
</pre>
Note that when a snapshot is no longer needed, it should be released
using the <code>DB::ReleaseSnapshot</code> interface. This allows the
implementation to get rid of state that was being maintained just to
support reading as of that snapshot.
<h1>Slice</h1>
<p>
The return value of the <code>it->key()</code> and <code>it->value()</code> calls above
are instances of the <code>leveldb::Slice</code> type. <code>Slice</code> is a simple
structure that contains a length and a pointer to an external byte
array. Returning a <code>Slice</code> is a cheaper alternative to returning a
<code>std::string</code> since we do not need to copy potentially large keys and
values. In addition, <code>leveldb</code> methods do not return null-terminated
C-style strings since <code>leveldb</code> keys and values are allowed to
contain '\0' bytes.
<p>
C++ strings and null-terminated C-style strings can be easily converted
to a Slice:
<p>
<pre>
  leveldb::Slice s1 = "hello";

  std::string str("world");
  leveldb::Slice s2 = str;
</pre>
A Slice can be easily converted back to a C++ string:
<pre>
  std::string str = s1.ToString();
  assert(str == std::string("hello"));
</pre>
Be careful when using Slices since it is up to the caller to ensure that
the external byte array into which the Slice points remains live while
the Slice is in use. For example, the following is buggy:
<p>
<pre>
  leveldb::Slice slice;
  if (...) {
    std::string str = ...;
    slice = str;
  }
  Use(slice);
</pre>
When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
backing storage for <code>slice</code> will disappear.
<p>
<h1>Comparators</h1>
<p>
The preceding examples used the default ordering function for keys,
which orders bytes lexicographically. You can however supply a custom
comparator when opening a database. For example, suppose each
database key consists of two numbers and we should sort by the first
number, breaking ties by the second number. First, define a proper
subclass of <code>leveldb::Comparator</code> that expresses these rules:
<p>
<pre>
  class TwoPartComparator : public leveldb::Comparator {
   public:
    // Three-way comparison function:
    //   if a < b: negative result
    //   if a > b: positive result
    //   else: zero result
    int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
      int a1, a2, b1, b2;
      ParseKey(a, &a1, &a2);
      ParseKey(b, &b1, &b2);
      if (a1 < b1) return -1;
      if (a1 > b1) return +1;
      if (a2 < b2) return -1;
      if (a2 > b2) return +1;
      return 0;
    }

    // Ignore the following methods for now:
    const char* Name() const { return "TwoPartComparator"; }
    void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
    void FindShortSuccessor(std::string*) const { }
  };
</pre>
Now create a database using this custom comparator:
<p>
<pre>
  TwoPartComparator cmp;
  leveldb::DB* db;
  leveldb::Options options;
  options.create_if_missing = true;
  options.comparator = &cmp;
  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
  ...
</pre>
<h2>Backwards compatibility</h2>
<p>
The result of the comparator's <code>Name</code> method is attached to the
database when it is created, and is checked on every subsequent
database open. If the name changes, the <code>leveldb::DB::Open</code> call will
fail. Therefore, change the name if and only if the new key format
and comparison function are incompatible with existing databases, and
it is ok to discard the contents of all existing databases.
<p>
You can however still gradually evolve your key format over time with
a little bit of pre-planning. For example, you could store a version
number at the end of each key (one byte should suffice for most uses).
When you wish to switch to a new key format (e.g., adding an optional
third part to the keys processed by <code>TwoPartComparator</code>),
(a) keep the same comparator name, (b) increment the version number
for new keys, and (c) change the comparator function so it uses the
version numbers found in the keys to decide how to interpret them.
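<p>
As a sketch of that evolution (the key layout and the
<code>ParseVersionedKey</code> helper are assumptions, not part of leveldb),
the comparator can dispatch on the version byte stored in each key:
<pre>
  int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
    int a1, a2, a3, b1, b2, b3;
    // Reads the trailing version byte itself; for version-1 keys the
    // optional third part is absent and is decoded here as zero.
    ParseVersionedKey(a, &a1, &a2, &a3);
    ParseVersionedKey(b, &b1, &b2, &b3);
    if (a1 != b1) return a1 < b1 ? -1 : +1;
    if (a2 != b2) return a2 < b2 ? -1 : +1;
    if (a3 != b3) return a3 < b3 ? -1 : +1;
    return 0;
  }
</pre>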
<p>
<h1>Performance</h1>
<p>
Performance can be tuned by changing the default values of the
types defined in <code>include/leveldb/options.h</code>.

<p>
<h2>Block size</h2>
<p>
<code>leveldb</code> groups adjacent keys together into the same block and such a
block is the unit of transfer to and from persistent storage. The
default block size is approximately 4096 uncompressed bytes.
Applications that mostly do bulk scans over the contents of the
database may wish to increase this size. Applications that do a lot
of point reads of small values may wish to switch to a smaller block
size if performance measurements indicate an improvement. There isn't
much benefit in using blocks smaller than one kilobyte, or larger than
a few megabytes. Also note that compression will be more effective
with larger block sizes.
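<p>
For example, a scan-heavy application might raise the block size when
opening the database (64KB here is purely illustrative, not a
recommendation):
<pre>
  leveldb::Options options;
  options.block_size = 64 * 1024;  // larger blocks favor bulk scans
  leveldb::DB* db;
  leveldb::DB::Open(options, "/tmp/testdb", &db);
</pre>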
<p>
<h2>Compression</h2>
<p>
Each block is individually compressed before being written to
persistent storage. Compression is on by default since the default
compression method is very fast, and is automatically disabled for
uncompressible data. In rare cases, applications may want to disable
compression entirely, but should only do so if benchmarks show a
performance improvement:
<p>
<pre>
  leveldb::Options options;
  options.compression = leveldb::kNoCompression;
  ... leveldb::DB::Open(options, name, ...) ....
</pre>
<h2>Cache</h2>
<p>
The contents of the database are stored in a set of files in the
filesystem and each file stores a sequence of compressed blocks. If
<code>options.cache</code> is non-NULL, it is used to cache frequently used
uncompressed block contents.
<p>
<pre>
  #include "leveldb/cache.h"

  leveldb::Options options;
  options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
  leveldb::DB* db;
  leveldb::DB::Open(options, name, &db);
  ... use the db ...
  delete db;
  delete options.cache;
</pre>
Note that the cache holds uncompressed data, and therefore it should
be sized according to application level data sizes, without any
reduction from compression. (Caching of compressed blocks is left to
the operating system buffer cache, or any custom <code>Env</code>
implementation provided by the client.)
<p>
When performing a bulk read, the application may wish to disable
caching so that the data processed by the bulk read does not end up
displacing most of the cached contents. A per-iterator option can be
used to achieve this:
<p>
<pre>
  leveldb::ReadOptions options;
  options.fill_cache = false;
  leveldb::Iterator* it = db->NewIterator(options);
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    ...
  }
</pre>
<h2>Key Layout</h2>
<p>
Note that the unit of disk transfer and caching is a block. Adjacent
keys (according to the database sort order) will usually be placed in
the same block. Therefore the application can improve its performance
by placing keys that are accessed together near each other and placing
infrequently used keys in a separate region of the key space.
<p>
For example, suppose we are implementing a simple file system on top
of <code>leveldb</code>. The types of entries we might wish to store are:
<p>
<pre>
  filename -> permission-bits, length, list of file_block_ids
  file_block_id -> data
</pre>
We might want to prefix <code>filename</code> keys with one letter (say '/') and the
<code>file_block_id</code> keys with a different letter (say '0') so that scans
over just the metadata do not force us to fetch and cache bulky file
contents.
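<p>
A sketch of such key construction (these helper names are ours, not part
of leveldb):
<pre>
  // '/'-prefixed metadata keys sort together, away from '0'-prefixed data.
  std::string MetadataKey(const std::string& filename) {
    return "/" + filename;
  }
  std::string BlockKey(uint64_t file_block_id) {
    char buf[9];
    buf[0] = '0';
    // Big-endian encoding makes byte order match numeric order.
    for (int i = 0; i < 8; i++) {
      buf[1 + i] = static_cast<char>(file_block_id >> (56 - 8 * i));
    }
    return std::string(buf, sizeof(buf));
  }
</pre>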
<p>
<h2>Filters</h2>
<p>
Because of the way <code>leveldb</code> data is organized on disk,
a single <code>Get()</code> call may involve multiple reads from disk.
The optional <code>FilterPolicy</code> mechanism can be used to reduce
the number of disk reads substantially.
<pre>
  leveldb::Options options;
  options.filter_policy = NewBloomFilterPolicy(10);
  leveldb::DB* db;
  leveldb::DB::Open(options, "/tmp/testdb", &db);
  ... use the database ...
  delete db;
  delete options.filter_policy;
</pre>
The preceding code associates a
<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
based filtering policy with the database. Bloom filter based
filtering relies on keeping some number of bits of data in memory per
key (in this case 10 bits per key since that is the argument we passed
to <code>NewBloomFilterPolicy</code>). This filter will reduce the number of
unnecessary disk reads needed for <code>Get()</code> calls by a factor of
approximately 100. Increasing the bits per key will lead to a
larger reduction at the cost of more memory usage. We recommend that
applications whose working set does not fit in memory and that do a
lot of random reads set a filter policy.
<p>
If you are using a custom comparator, you should ensure that the filter
policy you are using is compatible with your comparator. For example,
consider a comparator that ignores trailing spaces when comparing keys.
<code>NewBloomFilterPolicy</code> must not be used with such a comparator.
Instead, the application should provide a custom filter policy that
also ignores trailing spaces. For example:
<pre>
  class CustomFilterPolicy : public leveldb::FilterPolicy {
   private:
    FilterPolicy* builtin_policy_;
   public:
    CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) { }
    ~CustomFilterPolicy() { delete builtin_policy_; }

    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }

    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
      // Use builtin bloom filter code after removing trailing spaces
      std::vector<Slice> trimmed(n);
      for (int i = 0; i < n; i++) {
        trimmed[i] = RemoveTrailingSpaces(keys[i]);
      }
      builtin_policy_->CreateFilter(&trimmed[0], n, dst);
    }

    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
      // Use builtin bloom filter code after removing trailing spaces
      return builtin_policy_->KeyMayMatch(RemoveTrailingSpaces(key), filter);
    }
  };
</pre>
<p>
Advanced applications may provide a filter policy that does not use
a bloom filter but uses some other mechanism for summarizing a set
of keys. See <code>leveldb/filter_policy.h</code> for detail.
<p>
<h1>Checksums</h1>
<p>
<code>leveldb</code> associates checksums with all data it stores in the file system.
There are two separate controls provided over how aggressively these
checksums are verified:
<p>
<ul>
<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
checksum verification of all data that is read from the file system on
behalf of a particular read. By default, no such verification is
done.
<p>
<li> <code>Options::paranoid_checks</code> may be set to true before opening a
database to make the database implementation raise an error as soon as
it detects an internal corruption. Depending on which portion of the
database has been corrupted, the error may be raised when the database
is opened, or later by another database operation. By default,
paranoid checking is off so that the database can be used even if
parts of its persistent storage have been corrupted.
<p>
If a database is corrupted (perhaps it cannot be opened when
paranoid checking is turned on), the <code>leveldb::RepairDB</code> function
may be used to recover as much of the data as possible.
<p>
</ul>
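<p>
For example (both controls shown together purely for illustration;
enable them only when the extra checking is worth the cost):
<pre>
  leveldb::Options options;
  options.paranoid_checks = true;  // surface corruption as early as possible
  leveldb::DB* db;
  leveldb::DB::Open(options, "/tmp/testdb", &db);

  leveldb::ReadOptions read_options;
  read_options.verify_checksums = true;  // verify everything this read touches
  std::string value;
  leveldb::Status s = db->Get(read_options, "some-key", &value);
</pre>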
<h1>Approximate Sizes</h1>
<p>
The <code>GetApproximateSizes</code> method can be used to get the approximate
number of bytes of file system space used by one or more key ranges.
<p>
<pre>
  leveldb::Range ranges[2];
  ranges[0] = leveldb::Range("a", "c");
  ranges[1] = leveldb::Range("x", "z");
  uint64_t sizes[2];
  leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
</pre>
The preceding call will set <code>sizes[0]</code> to the approximate number of
bytes of file system space used by the key range <code>[a..c)</code> and
<code>sizes[1]</code> to the approximate number of bytes used by the key range
<code>[x..z)</code>.
<p>
<h1>Environment</h1>
<p>
All file operations (and other operating system calls) issued by the
<code>leveldb</code> implementation are routed through a <code>leveldb::Env</code> object.
Sophisticated clients may wish to provide their own <code>Env</code>
implementation to get better control. For example, an application may
introduce artificial delays in the file IO paths to limit the impact
of <code>leveldb</code> on other activities in the system.
<p>
<pre>
  class SlowEnv : public leveldb::Env {
    ... implementation of the Env interface ...
  };

  SlowEnv env;
  leveldb::Options options;
  options.env = &env;
  Status s = leveldb::DB::Open(options, ...);
</pre>
<h1>Porting</h1>
<p>
<code>leveldb</code> may be ported to a new platform by providing platform
specific implementations of the types/methods/functions exported by
<code>leveldb/port/port.h</code>. See <code>leveldb/port/port_example.h</code> for more
details.
<p>
In addition, the new platform may need a new default <code>leveldb::Env</code>
implementation. See <code>leveldb/util/env_posix.h</code> for an example.

<h1>Other Information</h1>

<p>
Details about the <code>leveldb</code> implementation may be found in
the following documents:
<ul>
<li> <a href="impl.html">Implementation notes</a>
<li> <a href="table_format.txt">Format of an immutable Table file</a>
<li> <a href="log_format.txt">Format of a log file</a>
</ul>

</body>
</html>

@@ -1,523 +0,0 @@
leveldb
=======

_Jeff Dean, Sanjay Ghemawat_

The leveldb library provides a persistent key value store. Keys and values are
arbitrary byte arrays. The keys are ordered within the key value store
according to a user-specified comparator function.

## Opening A Database

A leveldb database has a name which corresponds to a file system directory. All
of the contents of the database are stored in this directory. The following
example shows how to open a database, creating it if necessary:

```c++
#include <cassert>
#include "leveldb/db.h"

leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
assert(status.ok());
...
```

If you want to raise an error if the database already exists, add the following
line before the `leveldb::DB::Open` call:

```c++
options.error_if_exists = true;
```

## Status

You may have noticed the `leveldb::Status` type above. Values of this type are
returned by most functions in leveldb that may encounter an error. You can check
if such a result is ok, and also print an associated error message:

```c++
leveldb::Status s = ...;
if (!s.ok()) cerr << s.ToString() << endl;
```

## Closing A Database

When you are done with a database, just delete the database object. Example:

```c++
... open the db as described above ...
... do something with db ...
delete db;
```

## Reads And Writes

The database provides Put, Delete, and Get methods to modify/query the database.
For example, the following code moves the value stored under key1 to key2.

```c++
std::string value;
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
```

## Atomic Updates

Note that if the process dies after the Put of key2 but before the delete of
key1, the same value may be left stored under multiple keys. Such problems can
be avoided by using the `WriteBatch` class to atomically apply a set of updates:

```c++
#include "leveldb/write_batch.h"
...
std::string value;
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
if (s.ok()) {
  leveldb::WriteBatch batch;
  batch.Delete(key1);
  batch.Put(key2, value);
  s = db->Write(leveldb::WriteOptions(), &batch);
}
```

The `WriteBatch` holds a sequence of edits to be made to the database, and these
edits within the batch are applied in order. Note that we called Delete before
Put so that if key1 is identical to key2, we do not end up erroneously dropping
the value entirely.

Apart from its atomicity benefits, `WriteBatch` may also be used to speed up
bulk updates by placing lots of individual mutations into the same batch.

## Synchronous Writes

By default, each write to leveldb is asynchronous: it returns after pushing the
write from the process into the operating system. The transfer from operating
system memory to the underlying persistent storage happens asynchronously. The
sync flag can be turned on for a particular write to make the write operation
not return until the data being written has been pushed all the way to
persistent storage. (On Posix systems, this is implemented by calling either
`fsync(...)` or `fdatasync(...)` or `msync(..., MS_SYNC)` before the write
operation returns.)

```c++
leveldb::WriteOptions write_options;
write_options.sync = true;
db->Put(write_options, ...);
```

Asynchronous writes are often more than a thousand times as fast as synchronous
writes. The downside of asynchronous writes is that a crash of the machine may
cause the last few updates to be lost. Note that a crash of just the writing
process (i.e., not a reboot) will not cause any loss since even when sync is
false, an update is pushed from the process memory into the operating system
before it is considered done.

Asynchronous writes can often be used safely. For example, when loading a large
amount of data into the database you can handle lost updates by restarting the
bulk load after a crash. A hybrid scheme is also possible where every Nth write
is synchronous, and in the event of a crash, the bulk load is restarted just
after the last synchronous write finished by the previous run. (The synchronous
write can update a marker that describes where to restart on a crash.)

`WriteBatch` provides an alternative to asynchronous writes. Multiple updates
may be placed in the same WriteBatch and applied together using a synchronous
write (i.e., `write_options.sync` is set to true). The extra cost of the
synchronous write will be amortized across all of the writes in the batch.

## Concurrency

A database may only be opened by one process at a time. The leveldb
implementation acquires a lock from the operating system to prevent misuse.
Within a single process, the same `leveldb::DB` object may be safely shared by
multiple concurrent threads. I.e., different threads may write into or fetch
iterators or call Get on the same database without any external synchronization
(the leveldb implementation will automatically do the required synchronization).
However other objects (like Iterator and `WriteBatch`) may require external
synchronization. If two threads share such an object, they must protect access
to it using their own locking protocol. More details are available in the public
header files.

## Iteration

The following example demonstrates how to print all key,value pairs in a
database.

```c++
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
for (it->SeekToFirst(); it->Valid(); it->Next()) {
  cout << it->key().ToString() << ": " << it->value().ToString() << endl;
}
assert(it->status().ok()); // Check for any errors found during the scan
delete it;
```

The following variation shows how to process just the keys in the range
[start,limit):

```c++
for (it->Seek(start);
     it->Valid() && it->key().ToString() < limit;
     it->Next()) {
  ...
}
```

You can also process entries in reverse order. (Caveat: reverse iteration may be
somewhat slower than forward iteration.)

```c++
for (it->SeekToLast(); it->Valid(); it->Prev()) {
  ...
}
```

## Snapshots

Snapshots provide consistent read-only views over the entire state of the
key-value store. `ReadOptions::snapshot` may be non-NULL to indicate that a
read should operate on a particular version of the DB state. If
`ReadOptions::snapshot` is NULL, the read will operate on an implicit snapshot
of the current state.

Snapshots are created by the `DB::GetSnapshot()` method:

```c++
leveldb::ReadOptions options;
options.snapshot = db->GetSnapshot();
... apply some updates to db ...
leveldb::Iterator* iter = db->NewIterator(options);
... read using iter to view the state when the snapshot was created ...
delete iter;
db->ReleaseSnapshot(options.snapshot);
```

Note that when a snapshot is no longer needed, it should be released using the
`DB::ReleaseSnapshot` interface. This allows the implementation to get rid of
state that was being maintained just to support reading as of that snapshot.

## Slice

The return value of the `it->key()` and `it->value()` calls above are instances
of the `leveldb::Slice` type. Slice is a simple structure that contains a length
and a pointer to an external byte array. Returning a Slice is a cheaper
alternative to returning a `std::string` since we do not need to copy
potentially large keys and values. In addition, leveldb methods do not return
null-terminated C-style strings since leveldb keys and values are allowed to
contain `'\0'` bytes.

C++ strings and null-terminated C-style strings can be easily converted to a
Slice:

```c++
leveldb::Slice s1 = "hello";

std::string str("world");
leveldb::Slice s2 = str;
```

A Slice can be easily converted back to a C++ string:

```c++
std::string str = s1.ToString();
assert(str == std::string("hello"));
```

Be careful when using Slices since it is up to the caller to ensure that the
external byte array into which the Slice points remains live while the Slice is
in use. For example, the following is buggy:

```c++
leveldb::Slice slice;
if (...) {
  std::string str = ...;
  slice = str;
}
Use(slice);
```

When the if statement goes out of scope, str will be destroyed and the backing
storage for slice will disappear.

## Comparators

The preceding examples used the default ordering function for keys, which orders
bytes lexicographically. You can however supply a custom comparator when opening
a database. For example, suppose each database key consists of two numbers and
we should sort by the first number, breaking ties by the second number. First,
define a proper subclass of `leveldb::Comparator` that expresses these rules:

```c++
class TwoPartComparator : public leveldb::Comparator {
 public:
  // Three-way comparison function:
  //   if a < b: negative result
  //   if a > b: positive result
  //   else: zero result
  int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
    int a1, a2, b1, b2;
    ParseKey(a, &a1, &a2);
    ParseKey(b, &b1, &b2);
    if (a1 < b1) return -1;
    if (a1 > b1) return +1;
    if (a2 < b2) return -1;
    if (a2 > b2) return +1;
    return 0;
  }

  // Ignore the following methods for now:
  const char* Name() const { return "TwoPartComparator"; }
  void FindShortestSeparator(std::string*, const leveldb::Slice&) const {}
  void FindShortSuccessor(std::string*) const {}
};
```

Now create a database using this custom comparator:

```c++
TwoPartComparator cmp;
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
options.comparator = &cmp;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
...
```

### Backwards compatibility

The result of the comparator's Name method is attached to the database when it
is created, and is checked on every subsequent database open. If the name
changes, the `leveldb::DB::Open` call will fail. Therefore, change the name if
and only if the new key format and comparison function are incompatible with
existing databases, and it is ok to discard the contents of all existing
databases.

You can however still gradually evolve your key format over time with a little
bit of pre-planning. For example, you could store a version number at the end of
each key (one byte should suffice for most uses). When you wish to switch to a
new key format (e.g., adding an optional third part to the keys processed by
`TwoPartComparator`), (a) keep the same comparator name, (b) increment the
version number for new keys, and (c) change the comparator function so it uses
the version numbers found in the keys to decide how to interpret them.

## Performance

Performance can be tuned by changing the default values of the types defined in
`include/leveldb/options.h`.

### Block size

leveldb groups adjacent keys together into the same block and such a block is
the unit of transfer to and from persistent storage. The default block size is
approximately 4096 uncompressed bytes. Applications that mostly do bulk scans
over the contents of the database may wish to increase this size. Applications
that do a lot of point reads of small values may wish to switch to a smaller
block size if performance measurements indicate an improvement. There isn't much
benefit in using blocks smaller than one kilobyte, or larger than a few
megabytes. Also note that compression will be more effective with larger block
sizes.

### Compression

Each block is individually compressed before being written to persistent
storage. Compression is on by default since the default compression method is
very fast, and is automatically disabled for uncompressible data. In rare cases,
applications may want to disable compression entirely, but should only do so if
benchmarks show a performance improvement:

```c++
leveldb::Options options;
options.compression = leveldb::kNoCompression;
... leveldb::DB::Open(options, name, ...) ....
```

### Cache

The contents of the database are stored in a set of files in the filesystem and
each file stores a sequence of compressed blocks. If options.cache is non-NULL,
it is used to cache frequently used uncompressed block contents.

```c++
#include "leveldb/cache.h"

leveldb::Options options;
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache
leveldb::DB* db;
leveldb::DB::Open(options, name, &db);
... use the db ...
delete db;
delete options.cache;
```

Note that the cache holds uncompressed data, and therefore it should be sized
according to application level data sizes, without any reduction from
compression. (Caching of compressed blocks is left to the operating system
buffer cache, or any custom Env implementation provided by the client.)

When performing a bulk read, the application may wish to disable caching so that
the data processed by the bulk read does not end up displacing most of the
cached contents. A per-iterator option can be used to achieve this:

```c++
leveldb::ReadOptions options;
options.fill_cache = false;
leveldb::Iterator* it = db->NewIterator(options);
for (it->SeekToFirst(); it->Valid(); it->Next()) {
  ...
}
```

### Key Layout

Note that the unit of disk transfer and caching is a block. Adjacent keys
(according to the database sort order) will usually be placed in the same block.
Therefore the application can improve its performance by placing keys that are
accessed together near each other and placing infrequently used keys in a
separate region of the key space.

For example, suppose we are implementing a simple file system on top of leveldb.
The types of entries we might wish to store are:

    filename -> permission-bits, length, list of file_block_ids
    file_block_id -> data

We might want to prefix filename keys with one letter (say '/') and the
`file_block_id` keys with a different letter (say '0') so that scans over just
the metadata do not force us to fetch and cache bulky file contents.

### Filters

Because of the way leveldb data is organized on disk, a single `Get()` call may
involve multiple reads from disk. The optional FilterPolicy mechanism can be
used to reduce the number of disk reads substantially.

```c++
leveldb::Options options;
options.filter_policy = NewBloomFilterPolicy(10);
leveldb::DB* db;
leveldb::DB::Open(options, "/tmp/testdb", &db);
... use the database ...
delete db;
delete options.filter_policy;
```

The preceding code associates a Bloom filter based filtering policy with the
database. Bloom filter based filtering relies on keeping some number of bits of
data in memory per key (in this case 10 bits per key since that is the argument
we passed to `NewBloomFilterPolicy`). This filter will reduce the number of
unnecessary disk reads needed for Get() calls by a factor of approximately 100.
Increasing the bits per key will lead to a larger reduction at the cost of more
memory usage. We recommend that applications whose working set does not fit in
memory and that do a lot of random reads set a filter policy.

If you are using a custom comparator, you should ensure that the filter policy
you are using is compatible with your comparator. For example, consider a
comparator that ignores trailing spaces when comparing keys.
`NewBloomFilterPolicy` must not be used with such a comparator. Instead, the
application should provide a custom filter policy that also ignores trailing
spaces. For example:

```c++
class CustomFilterPolicy : public leveldb::FilterPolicy {
 private:
  FilterPolicy* builtin_policy_;

 public:
  CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) {}
  ~CustomFilterPolicy() { delete builtin_policy_; }

  const char* Name() const { return "IgnoreTrailingSpacesFilter"; }

  void CreateFilter(const Slice* keys, int n, std::string* dst) const {
    // Use builtin bloom filter code after removing trailing spaces
    std::vector<Slice> trimmed(n);
    for (int i = 0; i < n; i++) {
      trimmed[i] = RemoveTrailingSpaces(keys[i]);
    }
    builtin_policy_->CreateFilter(&trimmed[0], n, dst);
  }
};
```

Advanced applications may provide a filter policy that does not use a bloom
filter but uses some other mechanism for summarizing a set of keys. See
`leveldb/filter_policy.h` for detail.

## Checksums

leveldb associates checksums with all data it stores in the file system. There
are two separate controls provided over how aggressively these checksums are
verified:

`ReadOptions::verify_checksums` may be set to true to force checksum
verification of all data that is read from the file system on behalf of a
particular read. By default, no such verification is done.

`Options::paranoid_checks` may be set to true before opening a database to make
the database implementation raise an error as soon as it detects an internal
corruption. Depending on which portion of the database has been corrupted, the
error may be raised when the database is opened, or later by another database
operation. By default, paranoid checking is off so that the database can be used
even if parts of its persistent storage have been corrupted.

If a database is corrupted (perhaps it cannot be opened when paranoid checking
is turned on), the `leveldb::RepairDB` function may be used to recover as much
of the data as possible.

## Approximate Sizes

The `GetApproximateSizes` method can be used to get the approximate number of
bytes of file system space used by one or more key ranges.

```c++
leveldb::Range ranges[2];
ranges[0] = leveldb::Range("a", "c");
ranges[1] = leveldb::Range("x", "z");
uint64_t sizes[2];
leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
```

The preceding call will set `sizes[0]` to the approximate number of bytes of
file system space used by the key range `[a..c)` and `sizes[1]` to the
approximate number of bytes used by the key range `[x..z)`.

## Environment

All file operations (and other operating system calls) issued by the leveldb
implementation are routed through a `leveldb::Env` object. Sophisticated clients
may wish to provide their own Env implementation to get better control.
For example, an application may introduce artificial delays in the file IO
paths to limit the impact of leveldb on other activities in the system.

```c++
class SlowEnv : public leveldb::Env {
  ... implementation of the Env interface ...
};

SlowEnv env;
leveldb::Options options;
options.env = &env;
Status s = leveldb::DB::Open(options, ...);
```

## Porting

leveldb may be ported to a new platform by providing platform specific
implementations of the types/methods/functions exported by
`leveldb/port/port.h`. See `leveldb/port/port_example.h` for more details.

In addition, the new platform may need a new default `leveldb::Env`
implementation. See `leveldb/util/env_posix.h` for an example.

## Other Information

Details about the leveldb implementation may be found in the following
documents:

1. [Implementation notes](impl.md)
2. [Format of an immutable Table file](table_format.md)
3. [Format of a log file](log_format.md)

@@ -1,75 +0,0 @@
leveldb Log format
==================
The log file contents are a sequence of 32KB blocks. The only exception is that
the tail of the file may contain a partial block.

Each block consists of a sequence of records:

    block := record* trailer?
    record :=
      checksum: uint32  // crc32c of type and data[] ; little-endian
      length: uint16    // little-endian
      type: uint8       // One of FULL, FIRST, MIDDLE, LAST
      data: uint8[length]

A record never starts within the last six bytes of a block (since it won't fit).
Any leftover bytes here form the trailer, which must consist entirely of zero
bytes and must be skipped by readers.

Aside: if exactly seven bytes are left in the current block, and a new non-zero
length record is added, the writer must emit a FIRST record (which contains zero
bytes of user data) to fill up the trailing seven bytes of the block and then
emit all of the user data in subsequent blocks.

More types may be added in the future. Some Readers may skip record types they
do not understand, others may report that some data was skipped.

    FULL == 1
    FIRST == 2
    MIDDLE == 3
    LAST == 4

The FULL record contains the contents of an entire user record.

FIRST, MIDDLE, LAST are types used for user records that have been split into
multiple fragments (typically because of block boundaries). FIRST is the type
of the first fragment of a user record, LAST is the type of the last fragment of
a user record, and MIDDLE is the type of all interior fragments of a user
record.

Example: consider a sequence of user records:

    A: length 1000
    B: length 97270
    C: length 8000

**A** will be stored as a FULL record in the first block.

**B** will be split into three fragments: first fragment occupies the rest of
the first block, second fragment occupies the entirety of the second block, and
the third fragment occupies a prefix of the third block. This will leave six
bytes free in the third block, which will be left empty as the trailer.

**C** will be stored as a FULL record in the fourth block.

----

## Some benefits over the recordio format:

1. We do not need any heuristics for resyncing - just go to next block boundary
   and scan. If there is a corruption, skip to the next block. As a
   side-benefit, we do not get confused when part of the contents of one log
   file are embedded as a record inside another log file.

2. Splitting at approximate boundaries (e.g., for mapreduce) is simple: find the
   next block boundary and skip records until we hit a FULL or FIRST record.

3. We do not need extra buffering for large records.

## Some downsides compared to recordio format:

1. No packing of tiny records. This could be fixed by adding a new record type,
   so it is a shortcoming of the current implementation, not necessarily the
   format.

2. No compression. Again, this could be fixed by adding new record types.
src/leveldb/doc/log_format.txt (new file, 75 lines)

@@ -0,0 +1,75 @@
The log file contents are a sequence of 32KB blocks. The only
exception is that the tail of the file may contain a partial block.

Each block consists of a sequence of records:
    block := record* trailer?
    record :=
      checksum: uint32  // crc32c of type and data[]
      length: uint16
      type: uint8       // One of FULL, FIRST, MIDDLE, LAST
      data: uint8[length]

A record never starts within the last six bytes of a block (since it
won't fit). Any leftover bytes here form the trailer, which must
consist entirely of zero bytes and must be skipped by readers.

Aside: if exactly seven bytes are left in the current block, and a new
non-zero length record is added, the writer must emit a FIRST record
(which contains zero bytes of user data) to fill up the trailing seven
bytes of the block and then emit all of the user data in subsequent
blocks.

More types may be added in the future. Some Readers may skip record
types they do not understand, others may report that some data was
skipped.

    FULL == 1
    FIRST == 2
    MIDDLE == 3
    LAST == 4

The FULL record contains the contents of an entire user record.

FIRST, MIDDLE, LAST are types used for user records that have been
split into multiple fragments (typically because of block boundaries).
FIRST is the type of the first fragment of a user record, LAST is the
type of the last fragment of a user record, and MIDDLE is the type of
all interior fragments of a user record.
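
As an illustrative sketch (not code from the implementation), a reader
might decode one record header from a block like this, assuming a
little-endian host:

    struct RecordHeader {   // 7 bytes on disk
      uint32_t checksum;    // crc32c of type and data[]
      uint16_t length;
      uint8_t  type;        // FULL, FIRST, MIDDLE, or LAST
    };

    // 'p' points into a 32KB block with at least 7 bytes remaining.
    RecordHeader ReadHeader(const char* p) {
      RecordHeader h;
      memcpy(&h.checksum, p, 4);   // memcpy avoids unaligned loads
      memcpy(&h.length, p + 4, 2);
      h.type = static_cast<uint8_t>(p[6]);
      return h;
    }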

Example: consider a sequence of user records:
    A: length 1000
    B: length 97270
    C: length 8000
A will be stored as a FULL record in the first block.

B will be split into three fragments: first fragment occupies the rest
of the first block, second fragment occupies the entirety of the
second block, and the third fragment occupies a prefix of the third
block. This will leave six bytes free in the third block, which will
be left empty as the trailer.

C will be stored as a FULL record in the fourth block.

===================

Some benefits over the recordio format:

(1) We do not need any heuristics for resyncing - just go to next
block boundary and scan. If there is a corruption, skip to the next
block. As a side-benefit, we do not get confused when part of the
contents of one log file are embedded as a record inside another log
file.

(2) Splitting at approximate boundaries (e.g., for mapreduce) is
simple: find the next block boundary and skip records until we
hit a FULL or FIRST record.

(3) We do not need extra buffering for large records.

Some downsides compared to recordio format:

(1) No packing of tiny records. This could be fixed by adding a new
record type, so it is a shortcoming of the current implementation,
not necessarily the format.

(2) No compression. Again, this could be fixed by adding new record types.

@@ -1,107 +0,0 @@
leveldb File format
===================

    <beginning_of_file>
    [data block 1]
    [data block 2]
    ...
    [data block N]
    [meta block 1]
    ...
    [meta block K]
    [metaindex block]
    [index block]
    [Footer]        (fixed size; starts at file_size - sizeof(Footer))
    <end_of_file>

The file contains internal pointers. Each such pointer is called
a BlockHandle and contains the following information:

    offset: varint64
    size:   varint64

See [varints](https://developers.google.com/protocol-buffers/docs/encoding#varints)
for an explanation of varint64 format.
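
As a sketch of that encoding (leveldb's own helper is `GetVarint64` in
`util/coding.h`), a varint64 stores 7 bits per byte with the high bit
marking continuation:

    // Decodes one varint64 from [p, limit); returns the new position,
    // or nullptr on a truncated/corrupt encoding.
    const char* DecodeVarint64(const char* p, const char* limit, uint64_t* value) {
      uint64_t result = 0;
      for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
        uint8_t byte = static_cast<uint8_t>(*p++);
        result |= static_cast<uint64_t>(byte & 0x7f) << shift;
        if ((byte & 0x80) == 0) {  // high bit clear: last byte
          *value = result;
          return p;
        }
      }
      return nullptr;
    }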

1. The sequence of key/value pairs in the file are stored in sorted
order and partitioned into a sequence of data blocks. These blocks
come one after another at the beginning of the file. Each data block
is formatted according to the code in `block_builder.cc`, and then
optionally compressed.

2. After the data blocks we store a bunch of meta blocks. The
supported meta block types are described below. More meta block types
may be added in the future. Each meta block is again formatted using
`block_builder.cc` and then optionally compressed.

3. A "metaindex" block. It contains one entry for every other meta
block where the key is the name of the meta block and the value is a
BlockHandle pointing to that meta block.

4. An "index" block. This block contains one entry per data block,
where the key is a string >= last key in that data block and before
the first key in the successive data block. The value is the
BlockHandle for the data block.

5. At the very end of the file is a fixed length footer that contains
the BlockHandle of the metaindex and index blocks as well as a magic number.

    metaindex_handle: char[p];       // Block handle for metaindex
    index_handle:     char[q];       // Block handle for index
    padding:          char[40-p-q];  // zeroed bytes to make fixed length
                                     // (40==2*BlockHandle::kMaxEncodedLength)
    magic:            fixed64;       // == 0xdb4775248b80fb57 (little-endian)

## "filter" Meta Block

If a `FilterPolicy` was specified when the database was opened, a
filter block is stored in each table. The "metaindex" block contains
an entry that maps from `filter.<N>` to the BlockHandle for the filter
block where `<N>` is the string returned by the filter policy's
`Name()` method.

The filter block stores a sequence of filters, where filter i contains
the output of `FilterPolicy::CreateFilter()` on all keys that are stored
in a block whose file offset falls within the range

    [ i*base ... (i+1)*base-1 ]

Currently, "base" is 2KB. So for example, if blocks X and Y start in
the range `[ 0KB .. 2KB-1 ]`, all of the keys in X and Y will be
converted to a filter by calling `FilterPolicy::CreateFilter()`, and the
resulting filter will be stored as the first filter in the filter
block.

The filter block is formatted as follows:

    [filter 0]
    [filter 1]
    [filter 2]
    ...
    [filter N-1]

    [offset of filter 0]                  : 4 bytes
    [offset of filter 1]                  : 4 bytes
    [offset of filter 2]                  : 4 bytes
    ...
    [offset of filter N-1]                : 4 bytes

    [offset of beginning of offset array] : 4 bytes
    lg(base)                              : 1 byte

The offset array at the end of the filter block allows efficient
mapping from a data block offset to the corresponding filter.

## "stats" Meta Block

This meta block contains a bunch of stats. The key is the name
of the statistic. The value contains the statistic.

TODO(postrelease): record following stats.

    data size
    index size
    key size (uncompressed)
    value size (uncompressed)
    number of entries
    number of data blocks
src/leveldb/doc/table_format.txt (new file, 102 lines)

@@ -0,0 +1,102 @@
File format
===========

  <beginning_of_file>
  [data block 1]
  [data block 2]
  ...
  [data block N]
  [meta block 1]
  ...
  [meta block K]
  [metaindex block]
  [index block]
  [Footer]        (fixed size; starts at file_size - sizeof(Footer))
  <end_of_file>

The file contains internal pointers. Each such pointer is called
a BlockHandle and contains the following information:
  offset: varint64
  size:   varint64

(1) The sequence of key/value pairs in the file are stored in sorted
order and partitioned into a sequence of data blocks. These blocks
come one after another at the beginning of the file. Each data block
is formatted according to the code in block_builder.cc, and then
optionally compressed.

(2) After the data blocks we store a bunch of meta blocks. The
supported meta block types are described below. More meta block types
may be added in the future. Each meta block is again formatted using
block_builder.cc and then optionally compressed.

(3) A "metaindex" block. It contains one entry for every other meta
block where the key is the name of the meta block and the value is a
BlockHandle pointing to that meta block.

(4) An "index" block. This block contains one entry per data block,
where the key is a string >= last key in that data block and before
the first key in the successive data block. The value is the
BlockHandle for the data block.

(5) At the very end of the file is a fixed length footer that contains
the BlockHandle of the metaindex and index blocks as well as a magic number.
    metaindex_handle: char[p];       // Block handle for metaindex
    index_handle:     char[q];       // Block handle for index
    padding:          char[40-p-q];  // zeroed bytes to make fixed length
                                     // (40==2*BlockHandle::kMaxEncodedLength)
    magic:            fixed64;       // == 0xdb4775248b80fb57

"filter" Meta Block
-------------------

If a "FilterPolicy" was specified when the database was opened, a
filter block is stored in each table. The "metaindex" block contains
an entry that maps from "filter.<N>" to the BlockHandle for the filter
block where "<N>" is the string returned by the filter policy's
"Name()" method.

The filter block stores a sequence of filters, where filter i contains
the output of FilterPolicy::CreateFilter() on all keys that are stored
in a block whose file offset falls within the range

  [ i*base ... (i+1)*base-1 ]

Currently, "base" is 2KB. So for example, if blocks X and Y start in
the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be
converted to a filter by calling FilterPolicy::CreateFilter(), and the
resulting filter will be stored as the first filter in the filter
block.

The filter block is formatted as follows:

  [filter 0]
  [filter 1]
  [filter 2]
  ...
  [filter N-1]

  [offset of filter 0]                  : 4 bytes
  [offset of filter 1]                  : 4 bytes
  [offset of filter 2]                  : 4 bytes
  ...
  [offset of filter N-1]                : 4 bytes

  [offset of beginning of offset array] : 4 bytes
  lg(base)                              : 1 byte

The offset array at the end of the filter block allows efficient
mapping from a data block offset to the corresponding filter.
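
As an illustrative sketch (the names are ours, not the implementation's),
locating the filter for a data block is a constant-time shift:

    // 'lg_base' is the trailing byte of the filter block (11 for base=2KB).
    // Returns the index into the offset array for a data block starting at
    // 'block_offset' in the table file.
    size_t FilterIndex(uint64_t block_offset, uint8_t lg_base) {
      return static_cast<size_t>(block_offset >> lg_base);
    }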

"stats" Meta Block
------------------

This meta block contains a bunch of stats. The key is the name
of the statistic. The value contains the statistic.
TODO(postrelease): record following stats.
  data size
  index size
  key size (uncompressed)
  value size (uncompressed)
  number of entries
  number of data blocks
|
@ -55,15 +55,14 @@ class FileState {
    }
    const uint64_t available = size_ - offset;
    if (n > available) {
      n = static_cast<size_t>(available);
      n = available;
    }
    if (n == 0) {
      *result = Slice();
      return Status::OK();
    }

    assert(offset / kBlockSize <= SIZE_MAX);
    size_t block = static_cast<size_t>(offset / kBlockSize);
    size_t block = offset / kBlockSize;
    size_t block_offset = offset % kBlockSize;

    if (n <= kBlockSize - block_offset) {

@ -168,7 +167,7 @@ class SequentialFileImpl : public SequentialFile {
    if (pos_ > file_->Size()) {
      return Status::IOError("pos_ > file_->Size()");
    }
    const uint64_t available = file_->Size() - pos_;
    const size_t available = file_->Size() - pos_;
    if (n > available) {
      n = available;
    }

@ -176,10 +175,9 @@ class SequentialFileImpl : public SequentialFile {
    return Status::OK();
  }

  virtual std::string GetName() const { return "[memenv]"; }
 private:
  FileState* file_;
  uint64_t pos_;
  size_t pos_;
};

class RandomAccessFileImpl : public RandomAccessFile {

@ -197,7 +195,6 @@ class RandomAccessFileImpl : public RandomAccessFile {
    return file_->Read(offset, n, result, scratch);
  }

  virtual std::string GetName() const { return "[memenv]"; }
 private:
  FileState* file_;
};

@ -220,16 +217,10 @@ class WritableFileImpl : public WritableFile {
  virtual Status Flush() { return Status::OK(); }
  virtual Status Sync() { return Status::OK(); }

  virtual std::string GetName() const { return "[memenv]"; }
 private:
  FileState* file_;
};

class NoOpLogger : public Logger {
 public:
  virtual void Logv(const char* format, va_list ap) { }
};

class InMemoryEnv : public EnvWrapper {
 public:
  explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }

@ -266,7 +257,7 @@ class InMemoryEnv : public EnvWrapper {
  }

  virtual Status NewWritableFile(const std::string& fname,
                                 WritableFile** result) {
                                 WritableFile** result, size_t) {
    MutexLock lock(&mutex_);
    if (file_map_.find(fname) != file_map_.end()) {
      DeleteFileInternal(fname);

@ -280,19 +271,6 @@ class InMemoryEnv : public EnvWrapper {
    return Status::OK();
  }

  virtual Status NewAppendableFile(const std::string& fname,
                                   WritableFile** result) {
    MutexLock lock(&mutex_);
    FileState** sptr = &file_map_[fname];
    FileState* file = *sptr;
    if (file == NULL) {
      file = new FileState();
      file->Ref();
    }
    *result = new WritableFileImpl(file);
    return Status::OK();
  }

  virtual bool FileExists(const std::string& fname) {
    MutexLock lock(&mutex_);
    return file_map_.find(fname) != file_map_.end();

@ -380,11 +358,6 @@ class InMemoryEnv : public EnvWrapper {
    return Status::OK();
  }

  virtual Status NewLogger(const std::string& fname, Logger** result) {
    *result = new NoOpLogger;
    return Status::OK();
  }

 private:
  // Map from filenames to FileState objects, representing a simple file system.
  typedef std::map<std::string, FileState*> FileSystem;
@ -29,68 +29,61 @@ TEST(MemEnvTest, Basics) {
  uint64_t file_size;
  WritableFile* writable_file;
  std::vector<std::string> children;
  std::string dbname;

  ASSERT_OK(env_->CreateDir("/dir"));
  dbname=test::TmpDir();
  ASSERT_OK(env_->CreateDir(dbname.c_str()));

  // Check that the directory is empty.
  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
  ASSERT_OK(env_->GetChildren("/dir", &children));
  ASSERT_TRUE(!env_->FileExists(dbname + "/non_existent"));
  ASSERT_TRUE(!env_->GetFileSize(dbname + "/non_existent", &file_size).ok());
  ASSERT_OK(env_->GetChildren(dbname + "", &children));
  ASSERT_EQ(0, children.size());

  // Create a file.
  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
  ASSERT_EQ(0, file_size);
  ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
  delete writable_file;

  // Check that the file exists.
  ASSERT_TRUE(env_->FileExists("/dir/f"));
  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
  ASSERT_TRUE(env_->FileExists(dbname + "/f"));
  ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size));
  ASSERT_EQ(0, file_size);
  ASSERT_OK(env_->GetChildren("/dir", &children));
  ASSERT_OK(env_->GetChildren(dbname + "", &children));
  ASSERT_EQ(1, children.size());
  ASSERT_EQ("f", children[0]);

  // Write to the file.
  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
  ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
  ASSERT_OK(writable_file->Append("abc"));
  delete writable_file;

  // Check that append works.
  ASSERT_OK(env_->NewAppendableFile("/dir/f", &writable_file));
  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
  ASSERT_EQ(3, file_size);
  ASSERT_OK(writable_file->Append("hello"));
  delete writable_file;

  // Check for expected size.
  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
  ASSERT_EQ(8, file_size);
  ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size));
  ASSERT_EQ(3, file_size);

  // Check that renaming works.
  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
  ASSERT_TRUE(!env_->FileExists("/dir/f"));
  ASSERT_TRUE(env_->FileExists("/dir/g"));
  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
  ASSERT_EQ(8, file_size);
  ASSERT_TRUE(!env_->RenameFile(dbname + "/non_existent", dbname + "/g").ok());
  ASSERT_OK(env_->RenameFile(dbname + "/f", dbname + "/g"));
  ASSERT_TRUE(!env_->FileExists(dbname + "/f"));
  ASSERT_TRUE(env_->FileExists(dbname + "/g"));
  ASSERT_OK(env_->GetFileSize(dbname + "/g", &file_size));
  ASSERT_EQ(3, file_size);

  // Check that opening non-existent file fails.
  SequentialFile* seq_file;
  RandomAccessFile* rand_file;
  ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file).ok());
  ASSERT_TRUE(!env_->NewSequentialFile(dbname + "/non_existent", &seq_file).ok());
  ASSERT_TRUE(!seq_file);
  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file).ok());
  ASSERT_TRUE(!env_->NewRandomAccessFile(dbname + "/non_existent", &rand_file).ok());
  ASSERT_TRUE(!rand_file);

  // Check that deleting works.
  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
  ASSERT_OK(env_->DeleteFile("/dir/g"));
  ASSERT_TRUE(!env_->FileExists("/dir/g"));
  ASSERT_OK(env_->GetChildren("/dir", &children));
  ASSERT_TRUE(!env_->DeleteFile(dbname + "/non_existent").ok());
  ASSERT_OK(env_->DeleteFile(dbname + "/g"));
  ASSERT_TRUE(!env_->FileExists(dbname + "/g"));
  ASSERT_OK(env_->GetChildren(dbname + "", &children));
  ASSERT_EQ(0, children.size());
  ASSERT_OK(env_->DeleteDir("/dir"));
  ASSERT_OK(env_->DeleteDir(dbname + ""));
}

TEST(MemEnvTest, ReadWrite) {

@ -99,16 +92,19 @@ TEST(MemEnvTest, ReadWrite) {
  RandomAccessFile* rand_file;
  Slice result;
  char scratch[100];
  std::string dbname;

  ASSERT_OK(env_->CreateDir("/dir"));
  dbname=test::TmpDir();

  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
  ASSERT_OK(env_->CreateDir(dbname + ""));

  ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
  ASSERT_OK(writable_file->Append("hello "));
  ASSERT_OK(writable_file->Append("world"));
  delete writable_file;

  // Read sequentially.
  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
  ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file));
  ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
  ASSERT_EQ(0, result.compare("hello"));
  ASSERT_OK(seq_file->Skip(1));

@ -122,7 +118,7 @@ TEST(MemEnvTest, ReadWrite) {
  delete seq_file;

  // Random reads.
  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file));
  ASSERT_OK(env_->NewRandomAccessFile(dbname + "/f", &rand_file));
  ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
  ASSERT_EQ(0, result.compare("world"));
  ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".

@ -149,7 +145,7 @@ TEST(MemEnvTest, Misc) {
  ASSERT_TRUE(!test_dir.empty());

  WritableFile* writable_file;
  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file));
  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, 2<<20));

  // These are no-ops, but we test they return success.
  ASSERT_OK(writable_file->Sync());

@ -161,6 +157,9 @@ TEST(MemEnvTest, Misc) {
TEST(MemEnvTest, LargeWrite) {
  const size_t kWriteSize = 300 * 1024;
  char* scratch = new char[kWriteSize * 2];
  std::string dbname;

  dbname=test::TmpDir();

  std::string write_data;
  for (size_t i = 0; i < kWriteSize; ++i) {

@ -168,14 +167,14 @@ TEST(MemEnvTest, LargeWrite) {
  }

  WritableFile* writable_file;
  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
  ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
  ASSERT_OK(writable_file->Append("foo"));
  ASSERT_OK(writable_file->Append(write_data));
  delete writable_file;

  SequentialFile* seq_file;
  Slice result;
  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
  ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file));
  ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
  ASSERT_EQ(0, result.compare("foo"));

@ -190,17 +189,21 @@ TEST(MemEnvTest, LargeWrite) {
  delete seq_file;
  delete [] scratch;
}

#if 0
TEST(MemEnvTest, DBTest) {
  Options options;
  options.create_if_missing = true;
  options.env = env_;
  DB* db;
  std::string dbname;

  dbname=test::TmpDir();
  ASSERT_OK(env_->CreateDir(dbname+ "/db"));

  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};

  ASSERT_OK(DB::Open(options, "/dir/db", &db));
  ASSERT_OK(DB::Open(options, dbname + "/db", &db));
  for (size_t i = 0; i < 3; ++i) {
    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
  }

@ -233,7 +236,7 @@ TEST(MemEnvTest, DBTest) {

  delete db;
}

#endif
} // namespace leveldb

int main(int argc, char** argv) {
227 src/leveldb/include/leveldb/atomics.h Normal file

@ -0,0 +1,227 @@
// -------------------------------------------------------------------
//
// atomics.h: portable atomic operations for leveldb/eleveldb (http://code.google.com/p/leveldb/)
//
// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License.  You may obtain
// a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// -------------------------------------------------------------------

/// Copied from basho/eleveldb/c_src/detail.hpp September 8, 2013

#ifndef LEVELDB_ATOMIC_H
#define LEVELDB_ATOMIC_H 1

#include <stdint.h>
#include <stddef.h>

/* These can be hopefully-replaced with constexpr or compile-time assert later: */
#if defined(OS_SOLARIS) || defined(SOLARIS) || defined(sun)
#define LEVELDB_IS_SOLARIS 1
#else
#undef LEVELDB_IS_SOLARIS
#endif

#ifdef LEVELDB_IS_SOLARIS
#include <atomic.h>
#endif

namespace leveldb {

/**
 * Compare and swap
 */

// primary template
template <typename PtrT, typename ValueT>
inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val);


// uint32 size (needed for solaris)
template <>
inline bool compare_and_swap(volatile uint32_t *ptr, const int& comp_val, const int& exchange_val)
{
#if LEVELDB_IS_SOLARIS
    return ((uint32_t) comp_val==atomic_cas_32(ptr, comp_val, exchange_val));
#else
    return __sync_bool_compare_and_swap(ptr, comp_val, exchange_val);
#endif
}


// generic specification ... for pointers
template <typename PtrT, typename ValueT>
inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val)
{
#if LEVELDB_IS_SOLARIS
    return (comp_val==atomic_cas_ptr(ptr, comp_val, exchange_val));
#else
    return __sync_bool_compare_and_swap(ptr, comp_val, exchange_val);
#endif
}


/**
 * Atomic increment
 */

template <typename ValueT>
inline ValueT inc_and_fetch(volatile ValueT *ptr);

template <>
inline uint64_t inc_and_fetch(volatile uint64_t *ptr)
{
#if LEVELDB_IS_SOLARIS
    return atomic_inc_64_nv(ptr);
#else
    return __sync_add_and_fetch(ptr, 1);
#endif
}

template <>
inline uint32_t inc_and_fetch(volatile uint32_t *ptr)
{
#if LEVELDB_IS_SOLARIS
    return atomic_inc_32_nv(ptr);
#else
    return __sync_add_and_fetch(ptr, 1);
#endif
}

#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
template <>
inline size_t inc_and_fetch(volatile size_t *ptr)
{
    return __sync_add_and_fetch(ptr, 1);
}
#endif


/**
 * atomic decrement
 */

template <typename ValueT>
inline ValueT dec_and_fetch(volatile ValueT *ptr);

template <>
inline uint64_t dec_and_fetch(volatile uint64_t *ptr)
{
#if LEVELDB_IS_SOLARIS
    return atomic_dec_64_nv(ptr);
#else
    return __sync_sub_and_fetch(ptr, 1);
#endif
}

template <>
inline uint32_t dec_and_fetch(volatile uint32_t *ptr)
{
#if LEVELDB_IS_SOLARIS
    return atomic_dec_32_nv(ptr);
#else
    return __sync_sub_and_fetch(ptr, 1);
#endif
}

#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
template <>
inline size_t dec_and_fetch(volatile size_t *ptr)
{
    return __sync_sub_and_fetch(ptr, 1);
}
#endif


/**
 * Atomic add
 */

template <typename ValueT>
inline ValueT add_and_fetch(volatile ValueT *ptr, ValueT val);

template <>
inline uint64_t add_and_fetch(volatile uint64_t *ptr, uint64_t val)
{
#if LEVELDB_IS_SOLARIS
    return atomic_add_64_nv(ptr, val);
#else
    return __sync_add_and_fetch(ptr, val);
#endif
}

template <>
inline uint32_t add_and_fetch(volatile uint32_t *ptr, uint32_t val)
{
#if LEVELDB_IS_SOLARIS
    return atomic_add_32_nv(ptr, val);
#else
    return __sync_add_and_fetch(ptr, val);
#endif
}

#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
template <>
inline size_t add_and_fetch(volatile size_t *ptr, size_t val)
{
    return __sync_add_and_fetch(ptr, val);
}
#endif


/**
 * Atomic subtract
 */

template <typename ValueT>
inline ValueT sub_and_fetch(volatile ValueT *ptr, ValueT val);

template <>
inline uint64_t sub_and_fetch(volatile uint64_t *ptr, uint64_t val)
{
#if LEVELDB_IS_SOLARIS
    uint64_t temp=(~val)+1;  // 2's complement, bypass sign warnings
    return atomic_add_64_nv(ptr, temp);
#else
    return __sync_sub_and_fetch(ptr, val);
#endif
}

template <>
inline uint32_t sub_and_fetch(volatile uint32_t *ptr, uint32_t val)
{
#if LEVELDB_IS_SOLARIS
    uint32_t temp=(~val)+1;  // 2's complement, bypass sign warnings
    return atomic_add_32_nv(ptr, temp);
#else
    return __sync_sub_and_fetch(ptr, val);
#endif
}

#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
template <>
inline size_t sub_and_fetch(volatile size_t *ptr, size_t val)
{
    return __sync_sub_and_fetch(ptr, val);
}
#endif


} // namespace leveldb

#endif
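Outside Solaris every primitive above resolves to a GCC __sync builtin, so callers stay platform-neutral. A usage sketch; the RefFlag type is hypothetical, built only on the templates in this header:

    #include <stdint.h>
    #include "leveldb/atomics.h"

    // Hypothetical thread-safe counter plus one-shot flag.
    struct RefFlag {
        volatile uint32_t count;
        volatile uint32_t claimed;

        RefFlag() : count(0), claimed(0) {}

        uint32_t Ref()   {return leveldb::inc_and_fetch(&count);}  // new value
        uint32_t Unref() {return leveldb::dec_and_fetch(&count);}  // 0 => last ref

        // Succeeds for exactly one caller across threads.
        bool TryClaim()  {return leveldb::compare_and_swap(&claimed, 0, 1);}
    };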
@ -9,6 +9,7 @@
  Does not support:
  . getters for the option types
  . custom comparators that implement key shortening
  . capturing post-write-snapshot
  . custom iter, db, env, cache implementations using just the C bindings

  Some conventions:

@ -27,7 +28,6 @@
  be true on entry:
     *errptr == NULL
     *errptr points to a malloc()ed null-terminated error message
     (On Windows, *errptr must have been malloc()-ed by this library.)
  On success, a leveldb routine leaves *errptr unchanged.
  On failure, leveldb frees the old value of *errptr and
  set *errptr to a malloc()ed error message.
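A short sketch of that contract in practice, assuming the conventional leveldb_open(options, name, errptr) signature whose declaration is truncated below:

    #include <stdio.h>
    #include "leveldb/c.h"

    /* errptr must be NULL on entry; on failure the library replaces it
       with a malloc()ed message released via leveldb_free(), not free(). */
    static leveldb_t* open_or_report(leveldb_options_t* options,
                                     const char* name) {
        char* err = NULL;
        leveldb_t* db = leveldb_open(options, name, &err);
        if (err != NULL) {
            fprintf(stderr, "leveldb_open: %s\n", err);
            leveldb_free(err);
            return NULL;
        }
        return db;
    }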
@ -66,7 +66,7 @@ typedef struct leveldb_snapshot_t leveldb_snapshot_t;
typedef struct leveldb_writablefile_t leveldb_writablefile_t;
typedef struct leveldb_writebatch_t leveldb_writebatch_t;
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;

typedef struct leveldb_keymetadata_t leveldb_keymetadata_t;
/* DB operations */

extern leveldb_t* leveldb_open(

@ -83,6 +83,14 @@ extern void leveldb_put(
    const char* val, size_t vallen,
    char** errptr);

extern void leveldb_put2(
    leveldb_t* db,
    const leveldb_writeoptions_t* options,
    const char* key, size_t keylen,
    const char* val, size_t vallen,
    char** errptr,
    const leveldb_keymetadata_t * metadata);

extern void leveldb_delete(
    leveldb_t* db,
    const leveldb_writeoptions_t* options,

@ -104,6 +112,14 @@ extern char* leveldb_get(
    size_t* vallen,
    char** errptr);

extern char* leveldb_get2(
    leveldb_t* db,
    const leveldb_readoptions_t* options,
    const char* key, size_t keylen,
    size_t* vallen,
    char** errptr,
    leveldb_keymetadata_t * metadata);

extern leveldb_iterator_t* leveldb_create_iterator(
    leveldb_t* db,
    const leveldb_readoptions_t* options);

@ -156,6 +172,7 @@ extern void leveldb_iter_next(leveldb_iterator_t*);
extern void leveldb_iter_prev(leveldb_iterator_t*);
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
extern const void leveldb_iter_keymetadata(const leveldb_iterator_t *, leveldb_keymetadata_t *);
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);

/* Write batch */

@ -167,13 +184,19 @@ extern void leveldb_writebatch_put(
    leveldb_writebatch_t*,
    const char* key, size_t klen,
    const char* val, size_t vlen);
extern void leveldb_writebatch_put2(
    leveldb_writebatch_t*,
    const char* key, size_t klen,
    const char* val, size_t vlen,
    const leveldb_keymetadata_t * meta);
extern void leveldb_writebatch_delete(
    leveldb_writebatch_t*,
    const char* key, size_t klen);
extern void leveldb_writebatch_iterate(
    leveldb_writebatch_t*,
    void* state,
    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen,
                const int & type, const uint64_t & expiry),
    void (*deleted)(void*, const char* k, size_t klen));

/* Options */

@ -192,6 +215,8 @@ extern void leveldb_options_set_error_if_exists(
    leveldb_options_t*, unsigned char);
extern void leveldb_options_set_paranoid_checks(
    leveldb_options_t*, unsigned char);
extern void leveldb_options_set_verify_compactions(
    leveldb_options_t*, unsigned char);
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);

@ -199,6 +224,7 @@ extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
extern void leveldb_options_set_total_leveldb_mem(leveldb_options_t*, size_t);

enum {
    leveldb_no_compression = 0,

@ -267,20 +293,20 @@ extern void leveldb_cache_destroy(leveldb_cache_t* cache);

extern leveldb_env_t* leveldb_create_default_env();
extern void leveldb_env_destroy(leveldb_env_t*);
extern void leveldb_env_shutdown();

/* Utility */
/* Util */

/* Calls free(ptr).
   REQUIRES: ptr was malloc()-ed and returned by one of the routines
   in this file. Note that in certain cases (typically on Windows), you
   may need to call this routine instead of free(ptr) to dispose of
   malloc()-ed memory returned by this library. */
/**
 * CAUTION: this call is only for char * objects returned by
 * functions like leveldb_get and leveldb_property_value.
 * Also used to release errptr strings.
 */
extern void leveldb_free(void* ptr);

/* Return the major version number for this release. */
extern int leveldb_major_version();
/* Version */

/* Return the minor version number for this release. */
extern int leveldb_major_version();
extern int leveldb_minor_version();

#ifdef __cplusplus
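The *2 entry points mirror leveldb_put/leveldb_get but carry a leveldb_keymetadata_t through the binding. A round-trip sketch; passing NULL for the metadata argument is assumed here to behave like the base calls:

    #include <string.h>
    #include "leveldb/c.h"

    static void roundtrip(leveldb_t* db,
                          const leveldb_writeoptions_t* wopts,
                          const leveldb_readoptions_t* ropts) {
        const char* key = "bucket/key";
        const char* val = "value";
        char* err = NULL;
        size_t vallen = 0;

        /* NULL metadata: no type/expiry override on the write. */
        leveldb_put2(db, wopts, key, strlen(key), val, strlen(val), &err, NULL);
        if (err) { leveldb_free(err); return; }

        /* Pass a leveldb_keymetadata_t* instead of NULL to receive the
           key's type, sequence, and expiry alongside the value. */
        char* stored = leveldb_get2(db, ropts, key, strlen(key),
                                    &vallen, &err, NULL);
        if (err) { leveldb_free(err); return; }
        leveldb_free(stored);   /* value buffers are malloc()ed */
    }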
@ -29,6 +29,11 @@ class Cache;
// of Cache uses a least-recently-used eviction policy.
extern Cache* NewLRUCache(size_t capacity);

// Riak customization - just like NewLRUCache except the underlying
//  structure is NOT sharded.  Better for file cache.
extern Cache* NewLRUCache2(size_t capacity);


class Cache {
 public:
  Cache() { }

@ -81,16 +86,17 @@ class Cache {
  // its cache keys.
  virtual uint64_t NewId() = 0;

  // Remove all cache entries that are not actively in use.  Memory-constrained
  // applications may wish to call this method to reduce memory usage.
  // Default implementation of Prune() does nothing.  Subclasses are strongly
  // encouraged to override the default implementation.  A future release of
  // leveldb may change Prune() to a pure abstract method.
  virtual void Prune() {}
  // Return size, if any, of per entry overhead for item placed in cache.
  //  Allows more accurate tracking of "charge" against each cache item.
  virtual size_t EntryOverheadSize() {return(0);};

  // Return an estimate of the combined charges of all elements stored in the
  // cache.
  virtual size_t TotalCharge() const = 0;
  // Riak specific:  Add a reference to cache object to help hold it
  //  in memory
  virtual void Addref(Handle* e) = 0;

  // Riak specific:  walk contents of entire cache, calling functor Acc
  //  with the "value" for each cache entry.  Locks cache throughout call.
  virtual bool WalkCache(class CacheAccumulator & Acc) {return(true);};

 private:
  void LRU_Remove(Handle* e);

@ -107,4 +113,4 @@ class Cache {

} // namespace leveldb

#endif  // STORAGE_LEVELDB_INCLUDE_CACHE_H_
#endif  // STORAGE_LEVELDB_UTIL_CACHE_H_
@ -58,6 +58,10 @@ class Comparator {
// must not be deleted.
extern const Comparator* BytewiseComparator();

// Riak specific: cleans up the default comparitor to make
//  valgrind results clean
extern void ComparatorShutdown();

} // namespace leveldb

#endif  // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
@ -14,7 +14,7 @@ namespace leveldb {

// Update Makefile if you change these
static const int kMajorVersion = 1;
static const int kMinorVersion = 20;
static const int kMinorVersion = 9;

struct Options;
struct ReadOptions;

@ -38,6 +38,17 @@ struct Range {
  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
};

// Abstract holder for a DB value.
// This allows callers to manage their own value buffers and have
// DB values copied directly into those buffers.
class Value {
 public:
  virtual Value& assign(const char* data, size_t size) = 0;

 protected:
  virtual ~Value();
};

// A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.

@ -60,7 +71,8 @@ class DB {
  // Note: consider setting options.sync = true.
  virtual Status Put(const WriteOptions& options,
                     const Slice& key,
                     const Slice& value) = 0;
                     const Slice& value,
                     const KeyMetaData * meta=NULL) = 0;

  // Remove the database entry (if any) for "key".  Returns OK on
  // success, and a non-OK status on error.  It is not an error if "key"

@ -81,7 +93,11 @@ class DB {
  //
  // May return some other Status on an error.
  virtual Status Get(const ReadOptions& options,
                     const Slice& key, std::string* value) = 0;
                     const Slice& key, std::string* value,
                     KeyMetaData * meta=NULL) = 0;
  virtual Status Get(const ReadOptions& options,
                     const Slice& key, Value* value,
                     KeyMetaData * meta=NULL) = 0;

  // Return a heap-allocated iterator over the contents of the database.
  // The result of NewIterator() is initially invalid (caller must

@ -115,8 +131,6 @@ class DB {
  //     about the internal operation of the DB.
  //  "leveldb.sstables" - returns a multi-line string that describes all
  //     of the sstables that make up the db contents.
  //  "leveldb.approximate-memory-usage" - returns the approximate number of
  //     bytes of memory in use by the DB.
  virtual bool GetProperty(const Slice& property, std::string* value) = 0;

  // For each i in [0,n-1], store in "sizes[i]", the approximate

@ -142,6 +156,21 @@ class DB {
  //    db->CompactRange(NULL, NULL);
  virtual void CompactRange(const Slice* begin, const Slice* end) = 0;

  // Riak specific function:  Verify that no .sst files overlap
  //  within the levels that expect non-overlapping files.  Run
  //  compactions as necessary to correct.  Assumes DB opened
  //  with Options.is_repair=true
  virtual Status VerifyLevels();

  // Riak specific function:  Request database check for
  //  available compactions.  This is to stimulate retry of
  //  grooming that might have been offered and rejected previously
  virtual void CheckAvailableCompactions();

  // Riak specific function:  Give external code, namely
  //  eleveldb, access to leveldb's logging routines.
  virtual Logger* GetLogger() const { return NULL; }

 private:
  // No copying allowed
  DB(const DB&);
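Because the meta parameter defaults to NULL, existing callers compile unchanged while new ones can observe a key's type, sequence, and expiry. A minimal sketch using the KeyMetaData struct defined later in options.h (the helper name is illustrative):

    #include <string>
    #include "leveldb/db.h"

    // Fetch a value together with its expiry stamp in one call.
    leveldb::Status GetWithExpiry(leveldb::DB* db, const leveldb::Slice& key,
                                  std::string* value, uint64_t* expiry_micros) {
        leveldb::KeyMetaData meta;   // m_Type / m_Sequence / m_Expiry
        leveldb::Status s = db->Get(leveldb::ReadOptions(), key, value, &meta);
        if (s.ok())
            *expiry_micros = meta.m_Expiry;  // microseconds since Epoch, UTC
        return s;
    }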
@ -1,25 +0,0 @@
// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
#define STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_

#include <string>
#include "leveldb/env.h"
#include "leveldb/status.h"

namespace leveldb {

// Dump the contents of the file named by fname in text format to
// *dst.  Makes a sequence of dst->Append() calls; each call is passed
// the newline-terminated text corresponding to a single item found
// in the file.
//
// Returns a non-OK result if fname does not name a leveldb storage
// file, or if the file cannot be read.
Status DumpFile(Env* env, const std::string& fname, WritableFile* dst);

} // namespace leveldb

#endif  // STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
@ -13,15 +13,19 @@
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
#define STORAGE_LEVELDB_INCLUDE_ENV_H_

#include <cstdarg>
#include <pthread.h>
#include <string>
#include <vector>
#include <stdarg.h>
#include <stdint.h>
#include "leveldb/perf_count.h"
#include "leveldb/status.h"

namespace leveldb {

class AppendableFile;
class FileLock;
struct Options;
class Logger;
class RandomAccessFile;
class SequentialFile;

@ -40,6 +44,11 @@ class Env {
  // The result of Default() belongs to leveldb and must never be deleted.
  static Env* Default();

  // Riak specific:  Shutdown background work threads and other objects
  //  to get clean environment for valgrind memory test.  No restart supported
  //  after this call.  Not thread safe.
  static void Shutdown();

  // Create a brand new sequentially-readable file with the specified name.
  // On success, stores a pointer to the new file in *result and returns OK.
  // On failure stores NULL in *result and returns non-OK.  If the file does

@ -67,22 +76,31 @@ class Env {
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewWritableFile(const std::string& fname,
                                 WritableFile** result) = 0;
                                 WritableFile** result,
                                 size_t map_size) = 0;

  // Create an object that either appends to an existing file, or
  // writes to a new file (if the file does not exist to begin with).
  // On success, stores a pointer to the new file in *result and
  // returns OK.  On failure stores NULL in *result and returns
  // non-OK.
  // Riak specific:
  // Derived from NewWritableFile.  One change: if the file exists,
  // move to the end of the file and continue writing.
  // new file.  On success, stores a pointer to the open file in
  // *result and returns OK.  On failure stores NULL in *result and
  // returns non-OK.
  //
  // The returned file will only be accessed by one thread at a time.
  //
  // May return an IsNotSupportedError error if this Env does
  // not allow appending to an existing file.  Users of Env (including
  // the leveldb implementation) must be prepared to deal with
  // an Env that does not support appending.
  virtual Status NewAppendableFile(const std::string& fname,
                                   WritableFile** result);
                                   WritableFile** result,
                                   size_t map_size) = 0;

  // Riak specific:
  // Allows for virtualized version of NewWritableFile that enables write
  //  and close operations to execute on background threads
  //  (where platform supported).
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewWriteOnlyFile(const std::string& fname,
                                  WritableFile** result,
                                  size_t map_size)
  {return(NewWritableFile(fname, result, map_size));};

  // Returns true iff the named file exists.
  virtual bool FileExists(const std::string& fname) = 0;

@ -142,7 +160,7 @@ class Env {

  // Start a new thread, invoking "function(arg)" within the new thread.
  // When "function(arg)" returns, the thread will be destroyed.
  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
  virtual pthread_t StartThread(void (*function)(void* arg), void* arg) = 0;

  // *path is set to a temporary directory that can be used for testing. It may
  // or many not have just been created. The directory may or may not differ

@ -157,9 +175,16 @@ class Env {
  // useful for computing deltas of time.
  virtual uint64_t NowMicros() = 0;

  // Sleep/delay the thread for the prescribed number of micro-seconds.
  // Sleep/delay the thread for the perscribed number of micro-seconds.
  virtual void SleepForMicroseconds(int micros) = 0;

  // Riak specific:  Get object that is tracking various software counters
  virtual PerformanceCounters * GetPerformanceCounters() {return(gPerfCounters);};

  // Riak specific:  Request size of recovery memory map, potentially using
  //  Options data for the decision.  Default 2Mbyte is Google's original size.
  virtual size_t RecoveryMmapSize(const struct Options *) const {return(2*1024*1024L);};

 private:
  // No copying allowed
  Env(const Env&);

@ -190,14 +215,6 @@ class SequentialFile {
  //
  // REQUIRES: External synchronization
  virtual Status Skip(uint64_t n) = 0;

  // Get a name for the file, only for error reporting
  virtual std::string GetName() const = 0;

 private:
  // No copying allowed
  SequentialFile(const SequentialFile&);
  void operator=(const SequentialFile&);
};

// A file abstraction for randomly reading the contents of a file.

@ -218,13 +235,11 @@ class RandomAccessFile {
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const = 0;

  // Get a name for the file, only for error reporting
  virtual std::string GetName() const = 0;
  // Riak optimization:  allows advising Linux page cache
  virtual void SetForCompaction(uint64_t file_size) {};

 private:
  // No copying allowed
  RandomAccessFile(const RandomAccessFile&);
  void operator=(const RandomAccessFile&);
  // Riak addition:  size of this structure in bytes
  virtual size_t ObjectSize() {return(sizeof(RandomAccessFile));};
};

// A file abstraction for sequential writing.  The implementation

@ -240,8 +255,10 @@ class WritableFile {
  virtual Status Flush() = 0;
  virtual Status Sync() = 0;

  // Get a name for the file, only for error reporting
  virtual std::string GetName() const = 0;
  // Riak specific:
  // Provide hint where key/value data ends and metadata starts
  //  in an .sst table file.
  virtual void SetMetadataOffset(uint64_t) {};

 private:
  // No copying allowed

@ -249,12 +266,30 @@ class WritableFile {
  void operator=(const WritableFile&);
};

// A file abstraction for sequential writing at end of existing file.
class AppendableFile: public WritableFile {
 public:
  AppendableFile() { }
  virtual ~AppendableFile();

 private:
  // No copying allowed
  AppendableFile(const AppendableFile&);
  void operator=(const AppendableFile&);
};

// An interface for writing log messages.
class Logger {
 public:
  Logger() { }
  virtual ~Logger();

  // Riak specific function for hot backup.
  //  hot_backup.cc assumes that it can rotate the LOG file
  //  via standard Env routines if this function returns a
  //  non-zero value.
  virtual long LogSize() {return(0);};

  // Write an entry to the log file with the specified format.
  virtual void Logv(const char* format, va_list ap) = 0;

@ -310,11 +345,14 @@ class EnvWrapper : public Env {
  Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
    return target_->NewRandomAccessFile(f, r);
  }
  Status NewWritableFile(const std::string& f, WritableFile** r) {
    return target_->NewWritableFile(f, r);
  Status NewWritableFile(const std::string& f, WritableFile** r, size_t s=0) {
    return target_->NewWritableFile(f, r, s);
  }
  Status NewAppendableFile(const std::string& f, WritableFile** r) {
    return target_->NewAppendableFile(f, r);
  Status NewAppendableFile(const std::string& f, WritableFile** r, size_t s=0) {
    return target_->NewAppendableFile(f, r, s);
  }
  Status NewWriteOnlyFile(const std::string& f, WritableFile** r, size_t s=0) {
    return target_->NewWriteOnlyFile(f, r, s);
  }
  bool FileExists(const std::string& f) { return target_->FileExists(f); }
  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {

@ -336,7 +374,7 @@ class EnvWrapper : public Env {
  void Schedule(void (*f)(void*), void* a) {
    return target_->Schedule(f, a);
  }
  void StartThread(void (*f)(void*), void* a) {
  pthread_t StartThread(void (*f)(void*), void* a) {
    return target_->StartThread(f, a);
  }
  virtual Status GetTestDirectory(std::string* path) {

@ -355,6 +393,12 @@ class EnvWrapper : public Env {
  Env* target_;
};

// Riak specific hack to allow runtime change
//  of mapping size
extern volatile size_t gMapSize;

extern bool gFadviseWillNeed;

} // namespace leveldb

#endif  // STORAGE_LEVELDB_INCLUDE_ENV_H_
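All three file-creation paths now take a map_size hint sized for the mmap-backed writers. A sketch of the calling convention; the 2MB figure simply mirrors the RecoveryMmapSize() default stated above:

    #include <string>
    #include "leveldb/env.h"

    // Open a background-writable file with an explicit 2MB map hint.
    leveldb::Status OpenForBackgroundWrite(leveldb::Env* env,
                                           const std::string& fname,
                                           leveldb::WritableFile** file) {
        const size_t kMapSize = 2 * 1024 * 1024;  // matches the stated default
        return env->NewWriteOnlyFile(fname, file, kMapSize);
    }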
135 src/leveldb/include/leveldb/expiry.h Normal file

@ -0,0 +1,135 @@
// -------------------------------------------------------------------
//
// expiry.h:  background expiry management for Basho's modified leveldb
//
// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License.  You may obtain
// a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// -------------------------------------------------------------------

#ifndef EXPIRY_H
#define EXPIRY_H

#include <limits.h>
#include <stdint.h>
#include "leveldb/env.h"
#include "leveldb/options.h"
#include "util/refobject_base.h"

namespace leveldb {

class Compaction;
class Logger;
struct ParsedInternalKey;
class Slice;
class SstCounters;
class Version;
class VersionEdit;
struct FileMetaData;


enum EleveldbRouterActions_t
{
    eGetBucketProperties=1
};  // enum EleveldbRouterActions_t


typedef bool (* EleveldbRouter_t)(EleveldbRouterActions_t Action, int ParamCount, const void ** Params);


class ExpiryModule : public RefObjectBase
{
public:
    virtual ~ExpiryModule() {};

    // Print expiry options to LOG file
    virtual void Dump(Logger * log) const
    {Log(log," Expiry: (none)");};

    // Quick test to allow manifest logic and such know if
    //  extra expiry logic should be checked
    virtual bool ExpiryActivated() const {return(false);};

    // db/write_batch.cc MemTableInserter::Put() calls this.
    // returns false on internal error
    virtual bool MemTableInserterCallback(
        const Slice & Key,          // input: user's key about to be written
        const Slice & Value,        // input: user's value object
        ValueType & ValType,        // input/output: key type. call might change
        ExpiryTimeMicros & Expiry) const  // input/output: 0 or specific expiry. call might change
    {return(true);};

    // db/dbformat.cc KeyRetirement::operator() calls this.
    // db/version_set.cc SaveValue() calls this too.
    // returns true if key is expired, returns false if key not expired
    virtual bool KeyRetirementCallback(
        const ParsedInternalKey & Ikey) const
    {return(false);};

    // table/table_builder.cc TableBuilder::Add() calls this.
    // returns false on internal error
    virtual bool TableBuilderCallback(
        const Slice & Key,          // input: internal key
        SstCounters & Counters) const  // input/output: counters for new sst table
    {return(true);};

    // db/memtable.cc MemTable::Get() calls this.
    // returns true if type/expiry is expired, returns false if not expired
    virtual bool MemTableCallback(
        const Slice & Key) const    // input: leveldb internal key
    {return(false);};

    // db/version_set.cc VersionSet::Finalize() calls this if no
    //  other compaction selected for a level
    // returns true if there is an expiry compaction eligible
    virtual bool CompactionFinalizeCallback(
        bool WantAll,               // input: true - examine all expired files
        const Version & Ver,        // input: database state for examination
        int Level,                  // input: level to review for expiry
        VersionEdit * Edit) const   // output: NULL or destination of delete list
    {return(false);};

    // yep, sometimes we want to expiry this expiry module object.
    //  mostly for bucket level properties in Riak EE
    virtual uint64_t ExpiryModuleExpiryMicros() {return(0);};

    // Creates derived ExpiryModule object that matches compile time
    //  switch for open source or Basho enterprise edition features.
    static ExpiryModule * CreateExpiryModule(EleveldbRouter_t Router);

    // Cleans up global objects related to expiry
    //  switch for open source or Basho enterprise edition features.
    static void ShutdownExpiryModule();

    // Riak EE:  stash a user created module with settings
    virtual void NoteUserExpirySettings() {};

protected:
    ExpiryModule() {};

private:
    ExpiryModule(const ExpiryModule &);
    ExpiryModule & operator=(const ExpiryModule &);

};  // ExpiryModule


typedef RefPtr<class ExpiryModule> ExpiryPtr_t;

} // namespace leveldb

#endif // ifndef
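Every hook has a benign default, so a policy overrides only what it needs. A speculative sketch of a module that stamps each write with an absolute one-hour expiry; the class, its constructor, and the exact expiry semantics are assumptions, not part of this patch:

    #include "leveldb/expiry.h"

    // Hypothetical policy: give every incoming write an explicit expiry.
    class FixedTtlExpiry : public leveldb::ExpiryModule {
    public:
        explicit FixedTtlExpiry(leveldb::Env* env) : env_(env) {};

        virtual bool ExpiryActivated() const {return(true);};

        virtual bool MemTableInserterCallback(
            const leveldb::Slice & /*Key*/,
            const leveldb::Slice & /*Value*/,
            leveldb::ValueType & ValType,
            leveldb::ExpiryTimeMicros & Expiry) const
        {
            ValType = leveldb::kTypeValueExplicitExpiry;
            Expiry  = env_->NowMicros() + 3600ULL * 1000000;  // one hour out
            return(true);
        };

    private:
        leveldb::Env* env_;
    };

An instance would be handed to the database through Options::expiry_module, the RefPtr typedef declared above.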
@ -23,9 +23,21 @@ namespace leveldb {
class Slice;

class FilterPolicy {
protected:
    mutable const FilterPolicy * m_Next;   // used by FilterInventory

 public:
    FilterPolicy()
    : m_Next(NULL)
    {};

  virtual ~FilterPolicy();

    // list pointer accessors
    const FilterPolicy * GetNext() const {return(m_Next);};
    void SetNext(const FilterPolicy * Next) const {m_Next=Next;};


  // Return the name of this policy.  Note that if the filter encoding
  // changes in an incompatible way, the name returned by this method
  // must be changed.  Otherwise, old incompatible filters may be

@ -47,6 +59,7 @@ class FilterPolicy {
  // This method may return true or false if the key was not on the
  // list, but it should aim to return false with a high probability.
  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;

};

// Return a new filter policy that uses a bloom filter with approximately

@ -64,7 +77,29 @@ class FilterPolicy {
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
// trailing spaces in keys.
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
extern const FilterPolicy* NewBloomFilterPolicy2(int bits_per_key);


class FilterInventory
{
public:
    // MUST be static variable so that it initializes before any static objects
    //  have their initializers called
    static const FilterPolicy * ListHead;

    // This might be called prior to singleton FilterInventory object
    //  being initialized.  NOT THREAD SAFE.
    static void AddFilterToInventory(const FilterPolicy * Filter)
    {
        if (NULL!=Filter)
        {
            Filter->SetNext(ListHead);
            ListHead=Filter;
        }   // if
        return;
    }
};  // class FilterInventory

} // namespace leveldb

#endif  // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
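The inventory is an intrusive singly linked list threaded through m_Next (hence the const SetNext), letting table readers match a stored "filter.<N>" name against registered policies. A registration sketch using only the declarations above:

    #include "leveldb/filter_policy.h"

    // Run once at startup, before any table files are opened.
    void RegisterFilters() {
        const leveldb::FilterPolicy* bloom = leveldb::NewBloomFilterPolicy2(16);
        leveldb::FilterInventory::AddFilterToInventory(bloom);  // not thread safe
    }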
@ -17,6 +17,7 @@

#include "leveldb/slice.h"
#include "leveldb/status.h"
#include "leveldb/options.h"

namespace leveldb {

@ -37,7 +38,7 @@ class Iterator {
  // Valid() after this call iff the source is not empty.
  virtual void SeekToLast() = 0;

  // Position at the first key in the source that is at or past target.
  // Position at the first key in the source that at or past target
  // The iterator is Valid() after this call iff the source contains
  // an entry that comes at or past target.
  virtual void Seek(const Slice& target) = 0;

@ -61,9 +62,13 @@ class Iterator {
  // Return the value for the current entry.  The underlying storage for
  // the returned slice is valid only until the next modification of
  // the iterator.
  // REQUIRES: Valid()
  // REQUIRES: !AtEnd() && !AtStart()
  virtual Slice value() const = 0;

  // Riak specific:  if a database iterator, returns key meta data
  // REQUIRES: Valid()
  virtual KeyMetaData & keymetadata() const {return(keymetadata_); };

  // If an error has occurred, return it.  Else return an ok status.
  virtual Status status() const = 0;

@ -75,6 +80,10 @@ class Iterator {
  typedef void (*CleanupFunction)(void* arg1, void* arg2);
  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);

 protected:
  // mutable so reusable by derived classes
  mutable KeyMetaData keymetadata_;

 private:
  struct Cleanup {
    CleanupFunction function;
@ -6,15 +6,23 @@
|
|||
#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
class Cache;
|
||||
class Comparator;
|
||||
class Env;
|
||||
class ExpiryModule;
|
||||
class FilterPolicy;
|
||||
class Logger;
|
||||
class Snapshot;
|
||||
namespace log
|
||||
{
|
||||
class Writer;
|
||||
} // namespace log
|
||||
|
||||
// DB contents are stored in a set of blocks, each of which holds a
|
||||
// sequence of key,value pairs. Each block may be compressed before
|
||||
|
@ -24,9 +32,34 @@ enum CompressionType {
|
|||
// NOTE: do not change the values of existing entries, as these are
|
||||
// part of the persistent format on disk.
|
||||
kNoCompression = 0x0,
|
||||
kSnappyCompression = 0x1
|
||||
kSnappyCompression = 0x1,
|
||||
kLZ4Compression = 0x2,
|
||||
kNoCompressionAutomated = 0x3
|
||||
};
|
||||
|
||||
// Originally located in db/dbformat.h. Now available publically.
|
||||
// Value types encoded as the last component of internal keys.
|
||||
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
|
||||
// data structures.
|
||||
enum ValueType {
|
||||
kTypeDeletion = 0x0,
|
||||
kTypeValue = 0x1,
|
||||
kTypeValueWriteTime = 0x2,
|
||||
kTypeValueExplicitExpiry = 0x3
|
||||
};
|
||||
|
||||
// Originally located in db/dbformat.h
|
||||
typedef uint64_t SequenceNumber;
|
||||
typedef uint64_t ExpiryTimeMicros;
|
||||
|
||||
}; // namespace leveldb
|
||||
|
||||
//
|
||||
// must follow ValueType declaration
|
||||
#include "leveldb/expiry.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
// Options to control the behavior of a database (passed to DB::Open)
|
||||
struct Options {
|
||||
// -------------------
|
||||
|
@ -56,6 +89,14 @@ struct Options {
|
|||
// Default: false
|
||||
bool paranoid_checks;
|
||||
|
||||
// Riak specific: this variable replaces paranoid_checks at one
|
||||
// one place in the code. This variable alone controls whether or not
|
||||
// compaction read operations check CRC values. Riak needs
|
||||
// the compaction CRC check, but not other paranoid_checks ... so
|
||||
// this independent control.
|
||||
// Default: true
|
||||
bool verify_compactions;
|
||||
|
||||
// Use the specified object to interact with the environment,
|
||||
// e.g. to read/write files, schedule background work, etc.
|
||||
// Default: Env::Default()
|
||||
|
@ -85,7 +126,7 @@ struct Options {
|
|||
// Number of open files that can be used by the DB. You may need to
|
||||
// increase this if your database has a large working set (budget
|
||||
// one open file per 2MB of working set).
|
||||
//
|
||||
// RIAK: NO LONGER USED
|
||||
// Default: 1000
|
||||
int max_open_files;
|
||||
|
||||
|
@ -105,6 +146,15 @@ struct Options {
|
|||
// Default: 4K
|
||||
size_t block_size;
|
||||
|
||||
// Riak specific: non-zero value activates code to automatically
|
||||
// increase block_size as needed to ensure maximum number of files
|
||||
// are available in the file cache. The value indicates how many
|
||||
// incremental increases to use between the original block_size
|
||||
// and largest, reasonable block_size.
|
||||
//
|
||||
// Default: 16
|
||||
int block_size_steps;
|
||||
|
||||
// Number of keys between restart points for delta encoding of keys.
|
||||
// This parameter can be changed dynamically. Most clients should
|
||||
// leave this parameter alone.
|
||||
|
@ -112,18 +162,6 @@ struct Options {
|
|||
// Default: 16
|
||||
int block_restart_interval;
|
||||
|
||||
// Leveldb will write up to this amount of bytes to a file before
|
||||
// switching to a new one.
|
||||
// Most clients should leave this parameter alone. However if your
|
||||
// filesystem is more efficient with larger files, you could
|
||||
// consider increasing the value. The downside will be longer
|
||||
// compactions and hence longer latency/performance hiccups.
|
||||
// Another reason to increase this parameter might be when you are
|
||||
// initially populating a large database.
|
||||
//
|
||||
// Default: 2MB
|
||||
size_t max_file_size;
|
||||
|
||||
// Compress blocks using the specified compression algorithm. This
|
||||
// parameter can be changed dynamically.
|
||||
//
|
||||
|
@ -140,12 +178,6 @@ struct Options {
|
|||
// efficiently detect that and will switch to uncompressed mode.
|
||||
CompressionType compression;
|
||||
|
||||
// EXPERIMENTAL: If true, append to existing MANIFEST and log files
|
||||
// when a database is opened. This can significantly speed up open.
|
||||
//
|
||||
// Default: currently false, but may become true later.
|
||||
bool reuse_logs;
|
||||
|
||||
// If non-NULL, use the specified filter policy to reduce disk reads.
|
||||
// Many applications will benefit from passing the result of
|
||||
// NewBloomFilterPolicy() here.
|
||||
|
@ -153,8 +185,84 @@ struct Options {
|
|||
// Default: NULL
|
||||
const FilterPolicy* filter_policy;
|
||||
|
||||
// Riak specific flag used to indicate when database is open
|
||||
// as part of a Repair operation. Default is false
|
||||
bool is_repair;
|
||||
|
||||
// Riak specific flag to mark Riak internal database versus
|
||||
// user database. (User database gets larger cache resources.)
|
||||
bool is_internal_db;
|
||||
|
||||
// Riak replacement for max_open_files and block_cache. This is
|
||||
// TOTAL memory to be used by leveldb across ALL DATABASES.
|
||||
// Most recent value seen upon database open, wins. Zero for default.
|
||||
uint64_t total_leveldb_mem;
|
||||
|
||||
// Riak specific option specifying block cache space that cannot
|
||||
// be released for page cache use. The space may still be
|
||||
// released for file cache.
|
||||
uint64_t block_cache_threshold;
|
||||
|
||||
// Riak option to override most memory modeling and create
|
||||
// smaller memory footprint for developers. Helps when
|
||||
// running large number of databases and multiple VMs. Do
|
||||
// NOT use this option if making performance measurements.
|
||||
// Default: false
|
||||
bool limited_developer_mem;
|
||||
|
||||
// The size of each MMAped file, choose 0 for the default (20M)
|
||||
uint64_t mmap_size;
|
||||
|
||||
// Riak option to adjust aggressive delete behavior.
|
||||
// - zero disables aggressive delete
|
||||
// - positive value indicates how many deletes must exist
|
||||
// in a file for it to be compacted due to deletes
|
||||
uint64_t delete_threshold;
|
||||
|
||||
// Riak specific flag used to indicate when fadvise() management
|
||||
// should default to WILLNEED instead of DONTNEED. Default is false
|
||||
bool fadvise_willneed;
|
||||
|
||||
// *****
|
||||
// Riak specific options for establishing two tiers of disk arrays.
|
||||
// All three tier options must be valid for the option to activate.
|
||||
// When active, leveldb directories are constructed using either
|
||||
// the fast or slow prefix followed by the database name given
|
||||
// in the DB::Open call. (a synonym for "prefix" is "mount")
|
||||
// *****
|
||||
|
||||
// Riak specific option setting the level number at which the
|
||||
// "tiered_slow_prefix" should be used. Default is zero which
|
||||
// disables the option. Valid values are 1 to 6. 3 or 4 recommended.
|
||||
unsigned tiered_slow_level;
|
||||
|
||||
// Riak specific option with the path prefix used for "fast" disk
|
||||
// array. levels 0 to tiered_slow_level-1 use this path prefix
|
||||
std::string tiered_fast_prefix;
|
||||
|
||||
// Riak specific option with the path prefix used for "slow" disk
|
||||
// array. levels tiered_slow_level through 6 use this path prefix
|
||||
std::string tiered_slow_prefix;
|
||||
|
||||
// Riak specific option that writes a list of open table files
|
||||
// to disk on close then automatically opens same files again
|
||||
// upon restart.
|
||||
bool cache_object_warming;
|
||||
|
||||
// Riak specific object that defines expiry policy for data
|
||||
// written to leveldb.
|
||||
ExpiryPtr_t expiry_module;
|
||||
|
||||
// Create an Options object with default values for all fields.
|
||||
Options();
|
||||
|
||||
void Dump(Logger * log) const;
|
||||
|
||||
bool ExpiryActivated() const
|
||||
{return(NULL!=expiry_module.get() && expiry_module->ExpiryActivated());};
|
||||
|
||||
private:
|
||||
|
||||
};
|
||||
|
||||
// Options that control read operations
|
||||
|
@ -171,16 +279,57 @@ struct ReadOptions {
// If "snapshot" is non-NULL, read as of the supplied snapshot
// (which must belong to the DB that is being read and which must
// not have been released). If "snapshot" is NULL, use an implicit
// snapshot of the state at the beginning of this read operation.
// Default: NULL
const Snapshot* snapshot;

// Riak specific flag, currently used within the Erlang adaptor
// to enable automatic delete and new of fresh snapshot
// and database iterator objects for long running iterations
// (only supports iterator NEXT operations).
// Default: false
bool iterator_refresh;

ReadOptions()
    : verify_checksums(true),
      fill_cache(true),
      snapshot(NULL),
      iterator_refresh(false),
      is_compaction(false),
      env(NULL),
      info_log(NULL)
{
}

// accessors to the private data
bool IsCompaction() const {return(is_compaction);};

Logger * GetInfoLog() const {return(info_log);};

const std::string & GetDBName() const {return(dbname);};

Env * GetEnv() const {return(env);};

// The items below are internal options, not for external manipulation.
// They are populated by VersionSet::MakeInputIterator only during compaction operations.
private:
friend class VersionSet;

// true when used on background compaction
bool is_compaction;

// Database name for potential creation of bad blocks file
std::string dbname;

// Needed for file operations if creating bad blocks file
Env * env;

// Open log file for error notifications
// Only valid when is_compaction==true
Logger* info_log;

};
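A minimal sketch of a long-running forward scan using the fields above. GetSnapshot, NewIterator, and ReleaseSnapshot are standard leveldb API; iterator_refresh is the Basho addition, and whether the flag has any effect outside the Erlang adaptor mentioned in its comment is an assumption here:

#include <cassert>
#include "leveldb/db.h"

void ScanAll(leveldb::DB* db) {
    leveldb::ReadOptions read_options;
    read_options.snapshot = db->GetSnapshot();

    // Ask for fresh snapshot/iterator objects to be swapped in
    // behind the scenes; forward (Next) iteration only.
    read_options.iterator_refresh = true;

    leveldb::Iterator* it = db->NewIterator(read_options);
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
        // ... consume it->key() / it->value() ...
    }
    assert(it->status().ok());
    delete it;
    db->ReleaseSnapshot(read_options.snapshot);
}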

// Options that control write operations

@@ -208,6 +357,22 @@ struct WriteOptions {
  }
};

// Riak specific object that can return key metadata
// during get or iterate operations
struct KeyMetaData
{
    ValueType m_Type;          // see above
    SequenceNumber m_Sequence; // output only, leveldb internal
    ExpiryTimeMicros m_Expiry; // microseconds since Epoch, UTC

    KeyMetaData()
        : m_Type(kTypeValue), m_Sequence(0), m_Expiry(0)
    {};
}; // struct KeyMetaData
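The Get overload that fills this struct in is not part of this hunk, so the signature below is an assumption inferred from the comment above; verify against db.h in this fork before relying on it:

#include <iostream>
#include <string>
#include "leveldb/db.h"

// Hypothetical sketch: assumes a Basho-added overload along the lines of
//   Status Get(const ReadOptions&, const Slice& key,
//              std::string* value, KeyMetaData* meta);
void PrintExpiry(leveldb::DB* db, const leveldb::Slice& key) {
    std::string value;
    leveldb::KeyMetaData meta;
    leveldb::Status s = db->Get(leveldb::ReadOptions(), key, &value, &meta);
    if (s.ok() && meta.m_Expiry != 0) {
        // m_Expiry is microseconds since Epoch, UTC (per the struct above)
        std::cout << "expires at " << meta.m_Expiry << " us since epoch\n";
    }
}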

const char * CompileOptionsString();

} // namespace leveldb

#endif  // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_

329  src/leveldb/include/leveldb/perf_count.h  Normal file

@@ -0,0 +1,329 @@
// -------------------------------------------------------------------
//
// perf_count.h: performance counters for LevelDB
//
// Copyright (c) 2012-2016 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License.  You may obtain
// a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// -------------------------------------------------------------------

#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
#define STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_

#include <stdint.h>
#include <string>
#include "leveldb/status.h"

namespace leveldb {

enum SstCountEnum
{
    //
    // array index values/names
    //
    eSstCountKeys=0,           //!< how many keys in this sst
    eSstCountBlocks=1,         //!< how many blocks in this sst
    eSstCountCompressAborted=2,//!< how many blocks attempted compression and aborted use
    eSstCountKeySize=3,        //!< byte count of all keys
    eSstCountValueSize=4,      //!< byte count of all values
    eSstCountBlockSize=5,      //!< byte count of all blocks (pre-compression)
    eSstCountBlockWriteSize=6, //!< post-compression size, or BlockSize if no compression
    eSstCountIndexKeys=7,      //!< how many keys in the index block
    eSstCountKeyLargest=8,     //!< largest key in sst
    eSstCountKeySmallest=9,    //!< smallest key in sst
    eSstCountValueLargest=10,  //!< largest value in sst
    eSstCountValueSmallest=11, //!< smallest value in sst
    eSstCountDeleteKey=12,     //!< tombstone count
    eSstCountBlockSizeUsed=13, //!< Options::block_size used with this file
    eSstCountUserDataSize=14,  //!< post-compression size of non-metadata (user keys/values/block overhead)
    eSstCountExpiry1=15,       //!< undocumented expiry counter 1
    eSstCountExpiry2=16,       //!< undocumented expiry counter 2
    eSstCountExpiry3=17,       //!< undocumented expiry counter 3
    eSstCountSequence=18,      //!< highest sequence number in file

    // must follow last index name to represent size of array
    eSstCountEnumSize,         //!< size of the array described by the enum values

    eSstCountVersion=1

};  // enum SstCountEnum


class SstCounters
{
protected:
    bool m_IsReadOnly;         //!< set when data decoded from a file
    uint32_t m_Version;        //!< object revision identification
    uint32_t m_CounterSize;    //!< number of objects in m_Counter

    uint64_t m_Counter[eSstCountEnumSize];

public:
    // constructors / destructor
    SstCounters();

    // Put data into disk form
    void EncodeTo(std::string & Dst) const;

    // Populate member data from prior EncodeTo block
    Status DecodeFrom(const Slice& src);

    // increment the counter
    uint64_t Inc(unsigned Index);

    // add value to the counter
    uint64_t Add(unsigned Index, uint64_t Amount);

    // return value of a counter
    uint64_t Value(unsigned Index) const;

    // set a value
    void Set(unsigned Index, uint64_t);

    // return number of counters
    uint32_t Size() const {return(m_CounterSize);};

    // printf all values
    void Dump() const;

};  // class SstCounters
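A minimal sketch of the EncodeTo/DecodeFrom round trip using only the members declared above. That a default-constructed object starts with zeroed counters is an assumption here:

#include <cassert>
#include <string>
#include "leveldb/perf_count.h"
#include "leveldb/slice.h"

void RoundTrip() {
    leveldb::SstCounters counters;            // assumed zero-initialized
    counters.Inc(leveldb::eSstCountKeys);     // one key seen
    counters.Add(leveldb::eSstCountKeySize, 16);  // 16 bytes of key data

    std::string disk_form;
    counters.EncodeTo(disk_form);             // serialize to disk form

    leveldb::SstCounters decoded;
    leveldb::Status s = decoded.DecodeFrom(leveldb::Slice(disk_form));
    assert(s.ok());
    assert(decoded.Value(leveldb::eSstCountKeys) == 1);
}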

extern struct PerformanceCounters * gPerfCounters;


enum PerformanceCountersEnum
{
    //
    // array index values/names
    // (enum explicitly numbered to allow future edits / moves / inserts)
    //
    ePerfROFileOpen=0,      //!< PosixMmapReadableFile open
    ePerfROFileClose=1,     //!<  closed
    ePerfROFileUnmap=2,     //!<  unmap without close

    ePerfRWFileOpen=3,      //!< PosixMmapFile open
    ePerfRWFileClose=4,     //!<  closed
    ePerfRWFileUnmap=5,     //!<  unmap without close

    ePerfApiOpen=6,         //!< Count of DB::Open completions
    ePerfApiGet=7,          //!< Count of DBImpl::Get completions
    ePerfApiWrite=8,        //!< Count of DBImpl::Write completions

    ePerfWriteSleep=9,      //!< DBImpl::MakeRoomForWrite called sleep
    ePerfWriteWaitImm=10,   //!< DBImpl::MakeRoomForWrite called Wait on Imm compact
    ePerfWriteWaitLevel0=11,//!< DBImpl::MakeRoomForWrite called Wait on Level0 compact
    ePerfWriteNewMem=12,    //!< DBImpl::MakeRoomForWrite created new memory log
    ePerfWriteError=13,     //!< DBImpl::MakeRoomForWrite saw bg_error_
    ePerfWriteNoWait=14,    //!< DBImpl::MakeRoomForWrite took no action

    ePerfGetMem=15,         //!< DBImpl::Get read from memory log
    ePerfGetImm=16,         //!< DBImpl::Get read from previous memory log
    ePerfGetVersion=17,     //!< DBImpl::Get read from Version object

    // code ASSUMES the levels are in numerical order,
    // i.e. based off of ePerfSearchLevel0
    ePerfSearchLevel0=18,   //!< Version::Get read searched one or more files here
    ePerfSearchLevel1=19,   //!< Version::Get read searched one or more files here
    ePerfSearchLevel2=20,   //!< Version::Get read searched one or more files here
    ePerfSearchLevel3=21,   //!< Version::Get read searched one or more files here
    ePerfSearchLevel4=22,   //!< Version::Get read searched one or more files here
    ePerfSearchLevel5=23,   //!< Version::Get read searched one or more files here
    ePerfSearchLevel6=24,   //!< Version::Get read searched one or more files here

    ePerfTableCached=25,    //!< TableCache::FindTable found table in cache
    ePerfTableOpened=26,    //!< TableCache::FindTable had to open table file
    ePerfTableGet=27,       //!< TableCache::Get used to retrieve a key

    ePerfBGCloseUnmap=28,   //!< PosixEnv::BGThread started Unmap/Close job
    ePerfBGCompactImm=29,   //!< PosixEnv::BGThread started compaction of Imm
    ePerfBGNormal=30,       //!< PosixEnv::BGThread started normal compaction job
    ePerfBGCompactLevel0=31,//!< PosixEnv::BGThread started compaction of Level0

    ePerfBlockFiltered=32,  //!< Table::BlockReader search stopped due to filter
    ePerfBlockFilterFalse=33,//!< Table::BlockReader gave a false positive for match
    ePerfBlockCached=34,    //!< Table::BlockReader found block in cache
    ePerfBlockRead=35,      //!< Table::BlockReader read block from disk
    ePerfBlockFilterRead=36,//!< Table::ReadMeta filter loaded from file
    ePerfBlockValidGet=37,  //!< Table::InternalGet has valid iterator

    ePerfDebug0=38,         //!< Developer debug counters, moveable
    ePerfDebug1=39,         //!< Developer debug counters, moveable
    ePerfDebug2=40,         //!< Developer debug counters, moveable
    ePerfDebug3=41,         //!< Developer debug counters, moveable
    ePerfDebug4=42,         //!< Developer debug counters, moveable

    ePerfReadBlockError=43, //!< crc or compression error in ReadBlock (format.cc)

    ePerfIterNew=44,        //!< Count of DBImpl::NewDBIterator calls
    ePerfIterNext=45,       //!< Count of DBIter::Next calls
    ePerfIterPrev=46,       //!< Count of DBIter::Prev calls
    ePerfIterSeek=47,       //!< Count of DBIter::Seek calls
    ePerfIterSeekFirst=48,  //!< Count of DBIter::SeekFirst calls
    ePerfIterSeekLast=49,   //!< Count of DBIter::SeekLast calls
    ePerfIterDelete=50,     //!< Count of DBIter::~DBIter

    ePerfElevelDirect=51,   //!< eleveldb's FindWaitingThread went direct to thread
    ePerfElevelQueued=52,   //!< eleveldb's FindWaitingThread queued work item
    ePerfElevelDequeued=53, //!< eleveldb's worker took item from backlog queue

    ePerfElevelRefCreate=54,//!< eleveldb RefObject constructed
    ePerfElevelRefDelete=55,//!< eleveldb RefObject destructed

    ePerfThrottleGauge=56,  //!< current throttle value
    ePerfThrottleCounter=57,//!< running throttle by seconds

    ePerfThrottleMicros0=58,//!< level 0 micros spent compacting
    ePerfThrottleKeys0=59,  //!< level 0 keys processed
    ePerfThrottleBacklog0=60,//!< backlog at time of posting (level0)
    ePerfThrottleCompacts0=61,//!< number of level 0 compactions

    ePerfThrottleMicros1=62,//!< level 1+ micros spent compacting
    ePerfThrottleKeys1=63,  //!< level 1+ keys processed
    ePerfThrottleBacklog1=64,//!< backlog at time of posting (level1+)
    ePerfThrottleCompacts1=65,//!< number of level 1+ compactions

    ePerfBGWriteError=66,   //!< error in write/close, see syslog

    ePerfThrottleWait=67,   //!< milliseconds of throttle wait
    ePerfThreadError=68,    //!< system error on thread related call, no LOG access

    ePerfBGImmDirect=69,    //!< count Imm compactions happened directly
    ePerfBGImmQueued=70,    //!< count Imm compactions placed on queue
    ePerfBGImmDequeued=71,  //!< count Imm compactions removed from queue
    ePerfBGImmWeighted=72,  //!< total microseconds item spent on queue

    ePerfBGUnmapDirect=73,  //!< count Unmap operations happened directly
    ePerfBGUnmapQueued=74,  //!< count Unmap operations placed on queue
    ePerfBGUnmapDequeued=75,//!< count Unmap operations removed from queue
    ePerfBGUnmapWeighted=76,//!< total microseconds item spent on queue

    ePerfBGLevel0Direct=77, //!< count Level0 compactions happened directly
    ePerfBGLevel0Queued=78, //!< count Level0 compactions placed on queue
    ePerfBGLevel0Dequeued=79,//!< count Level0 compactions removed from queue
    ePerfBGLevel0Weighted=80,//!< total microseconds item spent on queue

    ePerfBGCompactDirect=81, //!< count generic compactions happened directly
    ePerfBGCompactQueued=82, //!< count generic compactions placed on queue
    ePerfBGCompactDequeued=83,//!< count generic compactions removed from queue
    ePerfBGCompactWeighted=84,//!< total microseconds item spent on queue

    ePerfFileCacheInsert=85,  //!< total bytes inserted into file cache
    ePerfFileCacheRemove=86,  //!< total bytes removed from file cache

    ePerfBlockCacheInsert=87, //!< total bytes inserted into block cache
    ePerfBlockCacheRemove=88, //!< total bytes removed from block cache

    ePerfApiDelete=89,        //!< Count of DB::Delete

    ePerfBGMove=90,           //!< compaction was a successful move
    ePerfBGMoveFail=91,       //!< compaction move failed, regular compaction attempted

    ePerfThrottleUnadjusted=92,//!< current unadjusted throttle gauge

    // this one was added to the other ePerfElevelXxx counters above when we
    // backported HotThreadPool to eleveldb
    ePerfElevelWeighted=93,   //!< total microseconds item spent on queue

    ePerfExpiredKeys=94,      //!< key physically removed because it expired
    ePerfExpiredFiles=95,     //!< entire file removed because all keys expired

    ePerfSyslogWrite=96,      //!< logged message to syslog
    ePerfBackupStarted=97,    //!< hot backup initiated
    ePerfBackupError=98,      //!< hot backup had an error

    ePerfPropCacheHit=99,     //!< property cache had data
    ePerfPropCacheMiss=100,   //!< property cache had to look up data
    ePerfPropCacheError=101,  //!< no property cache entry built/located

    // must follow last index name to represent size of array
    // (ASSUMES previous enum is highest value)
    ePerfCountEnumSize,       //!< size of the array described by the enum values

    ePerfVersion=1,           //!< structure versioning
    ePerfKey=41207            //!< random number as shared memory identifier
};


struct PerfCounterAttributes
{
    const char * m_PerfCounterName;    //!< text description
    const bool m_PerfDiscretionary;    //!< true if ok to disable
};  // PerfCounterAttributes


//
// Do NOT use virtual functions. This structure will be aligned at different
// locations in multiple processes. Things can get messy with virtuals.
//
struct PerformanceCounters
{
public:
    static int m_LastError;

protected:
    uint32_t m_Version;        //!< object revision identification
    uint32_t m_CounterSize;    //!< number of objects in m_Counter

    volatile uint64_t m_Counter[ePerfCountEnumSize];

    static const PerfCounterAttributes m_PerfCounterAttr[];
    static int m_PerfSharedId;
    static volatile uint64_t m_BogusCounter;  //!< for out of range GetPtr calls

public:
    // only called for local object, not for shared memory
    PerformanceCounters();

    //!< does executable's idea of version match shared object?
    bool VersionTest()
        {return(ePerfCountEnumSize<=m_CounterSize && ePerfVersion==m_Version);};

    //!< mostly for perf_count_test.cc
    void SetVersion(uint32_t Version, uint32_t CounterSize)
        {m_Version=Version; m_CounterSize=CounterSize;};

    static PerformanceCounters * Init(bool IsReadOnly);
    static int Close(PerformanceCounters * Counts);

    uint64_t Inc(unsigned Index);
    uint64_t Dec(unsigned Index);

    // add value to the counter
    uint64_t Add(unsigned Index, uint64_t Amount);

    // return value of a counter
    uint64_t Value(unsigned Index) const;

    // set a value
    void Set(unsigned Index, uint64_t);

    volatile const uint64_t * GetPtr(unsigned Index) const;

    static const char * GetNamePtr(unsigned Index);

    int LookupCounter(const char * Name);

    void Dump();

};  // struct PerformanceCounters

extern PerformanceCounters * gPerfCounters;

extern volatile bool gPerfCountersDisabled;

} // namespace leveldb

#endif  // STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
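Because the counters live in a structure shared across processes, a monitoring tool can attach and walk the whole array. A sketch using only the members declared above; that Init(true) attaches a read-only view of an existing shared segment is an inference from the IsReadOnly parameter, so verify against perf_count.cc:

#include <cstdio>
#include "leveldb/perf_count.h"

int main() {
    // Assumed: read-only attach to the shared-memory counters.
    leveldb::PerformanceCounters* counts =
        leveldb::PerformanceCounters::Init(true);
    if (counts == NULL)
        return 1;

    // Print every counter name/value pair.
    for (unsigned i = 0; i < leveldb::ePerfCountEnumSize; ++i) {
        printf("%s: %llu\n",
               leveldb::PerformanceCounters::GetNamePtr(i),
               (unsigned long long)counts->Value(i));
    }
    leveldb::PerformanceCounters::Close(counts);
    return 0;
}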

@@ -94,7 +94,7 @@ inline bool operator!=(const Slice& x, const Slice& y) {
}

inline int Slice::compare(const Slice& b) const {
  const int min_len = (size_ < b.size_) ? size_ : b.size_;
  int r = memcmp(data_, b.data_, min_len);
  if (r == 0) {
    if (size_ < b.size_) r = -1;
@@ -60,12 +60,6 @@ class Status {
  // Returns true iff the status indicates an IOError.
  bool IsIOError() const { return code() == kIOError; }

  // Returns true iff the status indicates a NotSupportedError.
  bool IsNotSupportedError() const { return code() == kNotSupported; }

  // Returns true iff the status indicates an InvalidArgument.
  bool IsInvalidArgument() const { return code() == kInvalidArgument; }

  // Return a string representation of this status suitable for printing.
  // Returns the string "OK" for success.
  std::string ToString() const;

@@ -7,6 +7,7 @@
#include <stdint.h>
#include "leveldb/iterator.h"
#include "leveldb/perf_count.h"

namespace leveldb {

@@ -40,7 +41,7 @@ class Table {
  uint64_t file_size,
  Table** table);

  virtual ~Table();

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must

@@ -55,7 +56,29 @@ class Table {
  // be close to the file length.
  uint64_t ApproximateOffsetOf(const Slice& key) const;

  // return a static copy of the table's counters.
  SstCounters GetSstCounters() const;

  // riak routine to retrieve total memory footprint of an open table
  // object in memory
  size_t TableObjectSize();

  // riak routine to retrieve disk size of table file
  // ("virtual" is for unit test activities)
  virtual uint64_t GetFileSize();

  // Riak routine to request bloom filter load on
  // second read operation (not iterator read)
  bool ReadFilter();

  // access routines for testing tools, not for public use
  Block * TEST_GetIndexBlock();
  size_t TEST_TableObjectSize() {return(TableObjectSize());};
  size_t TEST_FilterDataSize();
  static Iterator* TEST_BlockReader(void* Ptr, const ReadOptions& ROptions, const Slice& SliceReturn)
      {return(BlockReader(Ptr, ROptions, SliceReturn));};

 protected:  // was private, made protected for unit tests
  struct Rep;
  Rep* rep_;

@@ -69,11 +92,12 @@ class Table {
  Status InternalGet(
      const ReadOptions&, const Slice& key,
      void* arg,
      bool (*handle_result)(void* arg, const Slice& k, const Slice& v));

  void ReadMeta(const Footer& footer);
  void ReadFilter(class BlockHandle & filter_handle_value, const class FilterPolicy * policy);
  void ReadSstCounters(const Slice& sst_counters_handle_value);

  // No copying allowed
  Table(const Table&);

@@ -74,6 +74,14 @@ class TableBuilder {
  // Finish() call, returns the size of the final generated file.
  uint64_t FileSize() const;

  // Number of delete tombstones so far.
  uint64_t NumDeletes() const;

  // Retrieve expiry control values
  uint64_t GetExpiryWriteLow() const;
  uint64_t GetExpiryWriteHigh() const;
  uint64_t GetExpiryExplicitHigh() const;

 private:
  bool ok() const { return status().ok(); }
  void WriteBlock(BlockBuilder* block, BlockHandle* handle);

@@ -23,6 +23,7 @@
#include <string>
#include "leveldb/status.h"
#include "leveldb/options.h"

namespace leveldb {

@@ -34,7 +35,7 @@ class WriteBatch {
  ~WriteBatch();

  // Store the mapping "key->value" in the database.
  void Put(const Slice& key, const Slice& value, const KeyMetaData * meta=NULL);

  // If the database contains a mapping for "key", erase it. Else do nothing.
  void Delete(const Slice& key);

@@ -46,7 +47,8 @@ class WriteBatch {
  class Handler {
   public:
    virtual ~Handler();
    virtual void Put(const Slice& key, const Slice& value,
                     const ValueType & type, const ExpiryTimeMicros & expiry) = 0;
    virtual void Delete(const Slice& key) = 0;
  };
  Status Iterate(Handler* handler) const;
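The Handler change ripples into any code that walks a batch: the pure-virtual Put now carries the value type and expiry. A minimal sketch of a counting handler under the signature above; the fields counted are arbitrary choices for illustration:

#include <cstdint>
#include "leveldb/write_batch.h"

// Tally puts and deletes in a batch via the new four-argument Put.
class CountingHandler : public leveldb::WriteBatch::Handler {
 public:
  uint64_t puts, deletes;
  CountingHandler() : puts(0), deletes(0) {}

  virtual void Put(const leveldb::Slice& key, const leveldb::Slice& value,
                   const leveldb::ValueType& type,
                   const leveldb::ExpiryTimeMicros& expiry) {
    (void)key; (void)value; (void)type; (void)expiry;
    ++puts;
  }
  virtual void Delete(const leveldb::Slice& key) {
    (void)key;
    ++deletes;
  }
};

// Usage:
//   CountingHandler handler;
//   leveldb::Status s = batch.Iterate(&handler);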

@@ -1,92 +0,0 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

// Test for issue 178: a manual compaction causes deleted data to reappear.
#include <iostream>
#include <sstream>
#include <cstdlib>

#include "leveldb/db.h"
#include "leveldb/write_batch.h"
#include "util/testharness.h"

namespace {

const int kNumKeys = 1100000;

std::string Key1(int i) {
  char buf[100];
  snprintf(buf, sizeof(buf), "my_key_%d", i);
  return buf;
}

std::string Key2(int i) {
  return Key1(i) + "_xxx";
}

class Issue178 { };

TEST(Issue178, Test) {
  // Get rid of any state from an old run.
  std::string dbpath = leveldb::test::TmpDir() + "/leveldb_cbug_test";
  DestroyDB(dbpath, leveldb::Options());

  // Open database. Disable compression since it affects the creation
  // of layers and the code below is trying to test against a very
  // specific scenario.
  leveldb::DB* db;
  leveldb::Options db_options;
  db_options.create_if_missing = true;
  db_options.compression = leveldb::kNoCompression;
  ASSERT_OK(leveldb::DB::Open(db_options, dbpath, &db));

  // create first key range
  leveldb::WriteBatch batch;
  for (size_t i = 0; i < kNumKeys; i++) {
    batch.Put(Key1(i), "value for range 1 key");
  }
  ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));

  // create second key range
  batch.Clear();
  for (size_t i = 0; i < kNumKeys; i++) {
    batch.Put(Key2(i), "value for range 2 key");
  }
  ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));

  // delete second key range
  batch.Clear();
  for (size_t i = 0; i < kNumKeys; i++) {
    batch.Delete(Key2(i));
  }
  ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));

  // compact database
  std::string start_key = Key1(0);
  std::string end_key = Key1(kNumKeys - 1);
  leveldb::Slice least(start_key.data(), start_key.size());
  leveldb::Slice greatest(end_key.data(), end_key.size());

  // commenting out the line below causes the example to work correctly
  db->CompactRange(&least, &greatest);

  // count the keys
  leveldb::Iterator* iter = db->NewIterator(leveldb::ReadOptions());
  size_t num_keys = 0;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    num_keys++;
  }
  delete iter;
  ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";

  // close database
  delete db;
  DestroyDB(dbpath, leveldb::Options());
}

} // anonymous namespace

int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}

@@ -1,59 +0,0 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

// Test for issue 200: when iterator switches direction from backward
// to forward, the current key can be yielded unexpectedly if a new
// mutation has been added just before the current key.

#include "leveldb/db.h"
#include "util/testharness.h"

namespace leveldb {

class Issue200 { };

TEST(Issue200, Test) {
  // Get rid of any state from an old run.
  std::string dbpath = test::TmpDir() + "/leveldb_issue200_test";
  DestroyDB(dbpath, Options());

  DB *db;
  Options options;
  options.create_if_missing = true;
  ASSERT_OK(DB::Open(options, dbpath, &db));

  WriteOptions write_options;
  ASSERT_OK(db->Put(write_options, "1", "b"));
  ASSERT_OK(db->Put(write_options, "2", "c"));
  ASSERT_OK(db->Put(write_options, "3", "d"));
  ASSERT_OK(db->Put(write_options, "4", "e"));
  ASSERT_OK(db->Put(write_options, "5", "f"));

  ReadOptions read_options;
  Iterator *iter = db->NewIterator(read_options);

  // Add an element that should not be reflected in the iterator.
  ASSERT_OK(db->Put(write_options, "25", "cd"));

  iter->Seek("5");
  ASSERT_EQ(iter->key().ToString(), "5");
  iter->Prev();
  ASSERT_EQ(iter->key().ToString(), "4");
  iter->Prev();
  ASSERT_EQ(iter->key().ToString(), "3");
  iter->Next();
  ASSERT_EQ(iter->key().ToString(), "4");
  iter->Next();
  ASSERT_EQ(iter->key().ToString(), "5");

  delete iter;
  delete db;
  DestroyDB(dbpath, options);
}

} // namespace leveldb

int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
}

32  src/leveldb/leveldb_os/compile_opt.cc  Normal file

@@ -0,0 +1,32 @@
// -------------------------------------------------------------------
//
// compile_opt.cc
//
// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License.  You may obtain
// a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// -------------------------------------------------------------------

#include "leveldb/options.h"

namespace leveldb
{
const char * CompileOptionsString()
{
    return("(open source)");
}
} // namespace leveldb
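The function is declared in options.h (shown earlier), so callers can log which build flavor they are running against; a one-line usage sketch:

#include <cstdio>
#include "leveldb/options.h"

int main() {
    // Prints "(open source)" for this stub build.
    printf("leveldb build: %s\n", leveldb::CompileOptionsString());
    return 0;
}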

62  src/leveldb/leveldb_os/expiry_os_stub.cc  Normal file

@@ -0,0 +1,62 @@
// -------------------------------------------------------------------
//
// expiry_os_stub.cc
//
// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License.  You may obtain
// a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// -------------------------------------------------------------------

#include "db/dbformat.h"
#include "leveldb/expiry.h"
#include "util/expiry_os.h"

namespace leveldb {

/**
 * This is the factory function to create
 * an open source version of object expiry
 */
ExpiryModule *
ExpiryModule::CreateExpiryModule(
    EleveldbRouter_t Router)
{
    return(new leveldb::ExpiryModuleOS);
}   // ExpiryModule::CreateExpiryModule()


void
ExpiryModule::ShutdownExpiryModule()
{
    return;
}   // ExpiryModule::ShutdownExpiryModule


uint64_t
CuttlefishDurationMinutes(
    const char * Buffer)
{
    // zero is safe return since it implies "disable write time expiry"
    return(0);
}   // CuttlefishDurationMinutes

} // namespace leveldb
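Tying the stub back to Options::expiry_module from options.h: the factory hands back an ExpiryModuleOS, which the caller parks in the options object. The exact smart-pointer type behind ExpiryPtr_t is not shown in this diff, so the reset() call below is an assumption; check where ExpiryPtr_t is defined before relying on it:

#include "leveldb/expiry.h"
#include "leveldb/options.h"

// Hypothetical wiring sketch; assumes ExpiryPtr_t behaves like a
// reference-counted pointer with reset().
void EnableExpiry(leveldb::Options& options) {
    options.expiry_module.reset(
        leveldb::ExpiryModule::CreateExpiryModule(NULL));  // no router
    // Options::ExpiryActivated() now reflects the module's own policy.
}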

37  src/leveldb/leveldb_os/hot_backup_stub.cc  Normal file

@@ -0,0 +1,37 @@
// -------------------------------------------------------------------
//
// hot_backup_stub.cc
//
// Copyright (c) 2011-2016 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License.  You may obtain
// a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// -------------------------------------------------------------------

#include "util/thread_tasks.h"

namespace leveldb {

/**
 * Called by throttle.cc's thread once a minute. Used to
 * test for the trigger condition.
 */
void
CheckHotBackupTrigger()
{
    return;
}   // CheckHotBackupTrigger

} // namespace leveldb
Some files were not shown because too many files have changed in this diff.