diff --git a/src/Makefile.am b/src/Makefile.am
index e88e21a60..02be7e443 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -475,7 +475,6 @@ lbrycrdd_LDADD = \
   $(LIBBITCOIN_CONSENSUS) \
   $(LIBBITCOIN_CRYPTO) \
   $(LIBLEVELDB) \
-  $(LIBLEVELDB_SSE42) \
   $(LIBMEMENV) \
   $(LIBSECP256K1)
 
@@ -573,7 +572,7 @@ $(top_srcdir)/$(subdir)/config/bitcoin-config.h.in:  $(am__configure_deps)
 clean-local:
 	-$(MAKE) -C secp256k1 clean
 	-$(MAKE) -C univalue clean
-	-rm -f leveldb/*/*.gcda leveldb/*/*.gcno leveldb/helpers/memenv/*.gcda leveldb/helpers/memenv/*.gcno
+	-$(MAKE) -C leveldb clean
 	-rm -f config.h
 	-rm -rf test/__pycache__
 
diff --git a/src/Makefile.bench.include b/src/Makefile.bench.include
index 0462ce04f..68b813675 100644
--- a/src/Makefile.bench.include
+++ b/src/Makefile.bench.include
@@ -42,7 +42,6 @@ bench_bench_bitcoin_LDADD = \
   $(LIBBITCOIN_CONSENSUS) \
   $(LIBBITCOIN_CRYPTO) \
   $(LIBLEVELDB) \
-  $(LIBLEVELDB_SSE42) \
   $(LIBMEMENV) \
   $(LIBSECP256K1) \
   $(LIBUNIVALUE)
diff --git a/src/Makefile.leveldb.include b/src/Makefile.leveldb.include
index 833f3d2a1..25ea1a355 100644
--- a/src/Makefile.leveldb.include
+++ b/src/Makefile.leveldb.include
@@ -2,148 +2,23 @@
 # Distributed under the MIT software license, see the accompanying
 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
 
+SUBDIRS = leveldb
+
 LIBLEVELDB_INT = leveldb/libleveldb.a
 LIBMEMENV_INT  = leveldb/libmemenv.a
-LIBLEVELDB_SSE42_INT  = leveldb/libleveldb_sse42.a
 
 EXTRA_LIBRARIES += $(LIBLEVELDB_INT)
 EXTRA_LIBRARIES += $(LIBMEMENV_INT)
-EXTRA_LIBRARIES += $(LIBLEVELDB_SSE42_INT)
 
 LIBLEVELDB += $(LIBLEVELDB_INT)
 LIBMEMENV += $(LIBMEMENV_INT)
-LIBLEVELDB_SSE42 = $(LIBLEVELDB_SSE42_INT)
 
 LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/include
 LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/helpers/memenv
+LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb
 
-LEVELDB_CPPFLAGS_INT =
-LEVELDB_CPPFLAGS_INT += -I$(srcdir)/leveldb
-LEVELDB_CPPFLAGS_INT += $(LEVELDB_TARGET_FLAGS)
-LEVELDB_CPPFLAGS_INT += -DLEVELDB_ATOMIC_PRESENT
-LEVELDB_CPPFLAGS_INT += -D__STDC_LIMIT_MACROS
+leveldb/libleveldb.a:
+	$(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb
 
-if TARGET_WINDOWS
-LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1
-else
-LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_POSIX
-endif
-
-leveldb_libleveldb_a_CPPFLAGS = $(AM_CPPFLAGS) $(LEVELDB_CPPFLAGS_INT) $(LEVELDB_CPPFLAGS)
-leveldb_libleveldb_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
-
-leveldb_libleveldb_a_SOURCES=
-leveldb_libleveldb_a_SOURCES += leveldb/port/atomic_pointer.h
-leveldb_libleveldb_a_SOURCES += leveldb/port/port_example.h
-leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.h
-leveldb_libleveldb_a_SOURCES += leveldb/port/win/stdint.h
-leveldb_libleveldb_a_SOURCES += leveldb/port/port.h
-leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.h
-leveldb_libleveldb_a_SOURCES += leveldb/port/thread_annotations.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/db.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/options.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/comparator.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/filter_policy.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/slice.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table_builder.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/env.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/c.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/iterator.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/cache.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/dumpfile.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/write_batch.h
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/status.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/log_format.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch_internal.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/filename.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/builder.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/skiplist.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/snapshot.h
-leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.h
-leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.h
-leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.h
-leveldb_libleveldb_a_SOURCES += leveldb/table/block.h
-leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.h
-leveldb_libleveldb_a_SOURCES += leveldb/table/merger.h
-leveldb_libleveldb_a_SOURCES += leveldb/table/format.h
-leveldb_libleveldb_a_SOURCES += leveldb/table/iterator_wrapper.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix_test_helper.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/arena.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/random.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/posix_logger.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/hash.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/coding.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/testutil.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/mutexlock.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/logging.h
-leveldb_libleveldb_a_SOURCES += leveldb/util/testharness.h
-
-leveldb_libleveldb_a_SOURCES += leveldb/db/builder.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/c.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/dumpfile.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/filename.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/repair.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.cc
-leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch.cc
-leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.cc
-leveldb_libleveldb_a_SOURCES += leveldb/table/block.cc
-leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.cc
-leveldb_libleveldb_a_SOURCES += leveldb/table/format.cc
-leveldb_libleveldb_a_SOURCES += leveldb/table/iterator.cc
-leveldb_libleveldb_a_SOURCES += leveldb/table/merger.cc
-leveldb_libleveldb_a_SOURCES += leveldb/table/table_builder.cc
-leveldb_libleveldb_a_SOURCES += leveldb/table/table.cc
-leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/arena.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/bloom.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/cache.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/coding.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/comparator.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/env.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/filter_policy.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/hash.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/logging.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/options.cc
-leveldb_libleveldb_a_SOURCES += leveldb/util/status.cc
-
-if TARGET_WINDOWS
-leveldb_libleveldb_a_SOURCES += leveldb/util/env_win.cc
-leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.cc
-else
-leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.cc
-endif
-
-leveldb_libmemenv_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS)
-leveldb_libmemenv_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS)
-leveldb_libmemenv_a_SOURCES =  leveldb/helpers/memenv/memenv.cc
-leveldb_libmemenv_a_SOURCES += leveldb/helpers/memenv/memenv.h
-
-leveldb_libleveldb_sse42_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS)
-leveldb_libleveldb_sse42_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS)
-if ENABLE_HWCRC32
-leveldb_libleveldb_sse42_a_CPPFLAGS += -DLEVELDB_PLATFORM_POSIX_SSE
-leveldb_libleveldb_sse42_a_CXXFLAGS += $(SSE42_CXXFLAGS)
-endif
-leveldb_libleveldb_sse42_a_SOURCES =  leveldb/port/port_posix_sse.cc
+leveldb/libmemenv.a: leveldb/libleveldb.a
+	$(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb memenv_test
diff --git a/src/Makefile.qt.include b/src/Makefile.qt.include
index 98371539a..74971f6ac 100644
--- a/src/Makefile.qt.include
+++ b/src/Makefile.qt.include
@@ -408,7 +408,7 @@ endif
 if ENABLE_ZMQ
 qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
 endif
-qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \
+qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBMEMENV) \
   $(BOOST_LIBS) $(QT_LIBS) $(QT_DBUS_LIBS) $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
   $(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
 qt_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)
diff --git a/src/Makefile.qttest.include b/src/Makefile.qttest.include
index 450e9faf7..616e44284 100644
--- a/src/Makefile.qttest.include
+++ b/src/Makefile.qttest.include
@@ -63,7 +63,7 @@ if ENABLE_ZMQ
 qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
 endif
 qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) \
-  $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
+  $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
   $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
   $(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
 qt_test_test_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)
diff --git a/src/Makefile.test.include b/src/Makefile.test.include
index 0e18a3ba8..4f0ca5f06 100644
--- a/src/Makefile.test.include
+++ b/src/Makefile.test.include
@@ -122,7 +122,7 @@ test_test_lbrycrd_LDADD += $(LIBBITCOIN_WALLET)
 endif
 
 test_test_lbrycrd_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \
-  $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
+  $(LIBLEVELDB) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
 test_test_lbrycrd_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
 
 test_test_lbrycrd_LDADD += $(LIBBITCOIN_CONSENSUS) $(BDB_LIBS) $(CRYPTO_LIBS) $(ICU_LIBS) $(MINIUPNPC_LIBS)
diff --git a/src/claimtrie.cpp b/src/claimtrie.cpp
index 0ddf7bad5..8fb414376 100644
--- a/src/claimtrie.cpp
+++ b/src/claimtrie.cpp
@@ -597,7 +597,7 @@ bool CClaimTrieCacheBase::flush()
     base->nNextHeight = nNextHeight;
     if (!nodesToAddOrUpdate.empty() && (LogAcceptCategory(BCLog::CLAIMS) || LogAcceptCategory(BCLog::BENCH))) {
         LogPrintf("TrieCache size: %zu nodes on block %d, batch writes %zu bytes.\n",
-                nodesToAddOrUpdate.height(), nNextHeight, batch.SizeEstimate());
+                nodesToAddOrUpdate.height(), nNextHeight, batch.SizeEstimate(), base->db->DynamicMemoryUsage());
     }
     auto ret = base->db->WriteBatch(batch);
     clear();
diff --git a/src/dbwrapper.cpp b/src/dbwrapper.cpp
index dbbf9c877..8b9c5d906 100644
--- a/src/dbwrapper.cpp
+++ b/src/dbwrapper.cpp
@@ -97,11 +97,45 @@ static void SetMaxOpenFiles(leveldb::Options *options) {
              options->max_open_files, default_open_files);
 }
 
+class CappedLenCache: public leveldb::Cache {
+    leveldb::Cache* inner;
+    std::size_t maxKeyLen;
+public:
+    CappedLenCache(std::size_t capacity, std::size_t maxKeyLen)
+        : inner(leveldb::NewLRUCache(capacity)), maxKeyLen(maxKeyLen) {}
+
+    ~CappedLenCache() override { delete inner; }
+
+    Handle* Insert(const leveldb::Slice& key, void* value, size_t charge,
+                           void (*deleter)(const leveldb::Slice& key, void* value)) override {
+        if (key.size() <= maxKeyLen)
+            return inner->Insert(key, value, charge, deleter);
+        deleter(key, value);
+        return nullptr;
+    }
+
+    Handle* Lookup(const leveldb::Slice& key) override { return inner->Lookup(key); }
+    void Release(Handle* handle) override { return inner->Release(handle); }
+    void* Value(Handle* handle) override { return inner->Value(handle); }
+    void Erase(const leveldb::Slice& key) override {return inner->Erase(key); }
+    uint64_t NewId() override { return inner->NewId(); }
+};
+
 static leveldb::Options GetOptions(size_t nCacheSize)
 {
     leveldb::Options options;
-    auto write_cache = std::min(nCacheSize / 4, size_t(16) << 20U); // cap write_cache at 16MB (4x default)
+
+    options.filter_policy=leveldb::NewBloomFilterPolicy2(16);
+    options.write_buffer_size=60 * 1024 * 1024;
+    options.total_leveldb_mem=2500ULL * 1024ULL * 1024ULL;
+    options.env=leveldb::Env::Default();
+    options.compression = leveldb::kNoCompression;
+    options.info_log = new CBitcoinLevelDBLogger();
+    return options;
+
+    auto write_cache = std::min(nCacheSize / 4, size_t(4 * 1024 * 1024)); // cap write_cache at 4MB (default)
     options.block_cache = leveldb::NewLRUCache(nCacheSize - write_cache * 2);
+    // options.block_cache = new CappedLenCache(nCacheSize - write_cache * 2, 6);
     options.write_buffer_size = write_cache; // up to two write buffers may be held in memory simultaneously
     options.filter_policy = leveldb::NewBloomFilterPolicy(10);
     options.compression = leveldb::kNoCompression;
@@ -112,6 +146,7 @@ static leveldb::Options GetOptions(size_t nCacheSize)
         options.paranoid_checks = true;
     }
     SetMaxOpenFiles(&options);
+    options.max_open_files = 30000;
     return options;
 }
 
diff --git a/src/dbwrapper.h b/src/dbwrapper.h
index c20b64bc7..687760c08 100644
--- a/src/dbwrapper.h
+++ b/src/dbwrapper.h
@@ -81,7 +81,7 @@ public:
         ssValue.Xor(dbwrapper_private::GetObfuscateKey(parent));
         leveldb::Slice slValue(ssValue.data(), ssValue.size());
 
-        batch.Put(slKey, slValue);
+        batch.Put(slKey, slValue, nullptr);
         // LevelDB serializes writes as:
         // - byte: header
         // - varint: key length (1 byte up to 127B, 2 bytes up to 16383B, ...)
diff --git a/src/leveldb/.gitignore b/src/leveldb/.gitignore
deleted file mode 100644
index 71d87a4ee..000000000
--- a/src/leveldb/.gitignore
+++ /dev/null
@@ -1,13 +0,0 @@
-build_config.mk
-*.a
-*.o
-*.dylib*
-*.so
-*.so.*
-*_test
-db_bench
-leveldbutil
-Release
-Debug
-Benchmark
-vs2010.*
diff --git a/src/leveldb/AUTHORS b/src/leveldb/AUTHORS
index 2439d7a45..27a9407e5 100644
--- a/src/leveldb/AUTHORS
+++ b/src/leveldb/AUTHORS
@@ -6,7 +6,3 @@ Google Inc.
 # Initial version authors:
 Jeffrey Dean <jeff@google.com>
 Sanjay Ghemawat <sanjay@google.com>
-
-# Partial list of contributors:
-Kevin Regan <kevin.d.regan@gmail.com>
-Johan Bilien <jobi@litl.com>
diff --git a/src/leveldb/BASHO_RELEASES b/src/leveldb/BASHO_RELEASES
new file mode 100644
index 000000000..56726135d
--- /dev/null
+++ b/src/leveldb/BASHO_RELEASES
@@ -0,0 +1,72 @@
+github.com tag 2.0.34 - February 15, 2017
+-----------------------------------------
+mv-hot-backup2:  - correct MakeTieredDbname() within db/filename.cc
+                   for case where dbname input is blank and fast/slow
+                   already populated in options.  Corrects issue
+                   with hot backup in non-tiered storage situations
+
+github.com tag 2.0.33 - November 21, 2016
+-----------------------------------------
+mv-bucket-expiry:  - partial branch to enable X-Riak-Meta-Expiry-Base-Seconds
+                     property within enterprise edition
+
+--- no 2.0.32 tag on leveldb ---
+
+github.com tag 2.0.31 - November 1, 2016
+----------------------------------------
+ - version shipped with Riak 2.2
+mv-no-md-expiry: - Riak specific
+                 - never convert a key prefix of sext:encoded "{md" to expiry
+                 - update sst_scan for dumping Riak formated keys
+mv-tuning8:      - rework penalty rules in version_set.cc UpdatePenalty()
+                 - add unit test framework for UpdatePenalty()
+
+github.com tag 2.0.30 - October 11, 2016
+----------------------------------------
+mv-delayed-bloom: - when opening an .sst table file, only load
+                    bloom filter on second Get() operation.  Saves time.
+                  - correct VersionSet::Finalize() logic for level 1 when
+                    when level 2 is above desired size
+                  - move hot backup to Riak ee build
+
+github.com tag 2.0.29 - September 13, 2016
+------------------------------------------
+mv-expiry-manifest:  only switch to expiry enabled manifest format
+                     if expiry function enabled.  Eases downgrade
+                     during early Riak releases containing expiry
+
+github.com tag 2.0.28 - September 6, 2016
+-----------------------------------------
+mv-hot-backup:  add externally triggered hot backup feature
+
+github.com tag 2.0.27 - August 22, 2016
+---------------------------------------
+mv-mem-fences:  fix iterator double delete bug in eleveldb and
+                build better memory fenced operations for referenced count objects.
+
+github.com tag 2.0.26 - August 21, 2016
+---------------------------------------
+mv-expiry-iter-bug:  DBImpl::NewIterator() was not setting the new expiry parameter.
+
+github.com tag 2.0.25 - August 10, 2016
+---------------------------------------
+Make LZ4 the default compression instead of Snappy.
+
+github.com tag 2.0.24 - August 2, 2016
+--------------------------------------
+mv-expiry:  open source expiry.  Supports one expiry policy for all databases.
+
+github.com tag 2.0.23 - July 20, 2016
+-------------------------------------
+mv-no-semaphore:  remove semaphore controlled thread in hot_threads.cc.  Instead use
+ use mutex of thread 0 (only one thread's mutex) to address know race condition.
+
+github.com tag 2.0.22 - June 22, 2016
+-------------------------------------
+no change: iterator fix in eleveldb
+
+github.com tag 2.0.21 - June 16, 2016
+-------------------------------------
+branch mv-iterator-hot-threads:  correct condition where eleveldb MoveTask
+ could hang an iterator. (https://github.com/basho/leveldb/wiki/mv-iterator-hot-threads)
+
diff --git a/src/leveldb/CONTRIBUTING.md b/src/leveldb/CONTRIBUTING.md
deleted file mode 100644
index cd600ff46..000000000
--- a/src/leveldb/CONTRIBUTING.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Contributing
-
-We'd love to accept your code patches! However, before we can take them, we
-have to jump a couple of legal hurdles.
-
-## Contributor License Agreements
-
-Please fill out either the individual or corporate Contributor License
-Agreement as appropriate.
-
-* If you are an individual writing original source code and you're sure you
-own the intellectual property, then sign an [individual CLA](https://developers.google.com/open-source/cla/individual).
-* If you work for a company that wants to allow you to contribute your work,
-then sign a [corporate CLA](https://developers.google.com/open-source/cla/corporate).
-
-Follow either of the two links above to access the appropriate CLA and
-instructions for how to sign and return it.
-
-## Submitting a Patch
-
-1. Sign the contributors license agreement above.
-2. Decide which code you want to submit. A submission should be a set of changes
-that addresses one issue in the [issue tracker](https://github.com/google/leveldb/issues).
-Please don't mix more than one logical change per submission, because it makes
-the history hard to follow. If you want to make a change
-(e.g. add a sample or feature) that doesn't have a corresponding issue in the
-issue tracker, please create one.
-3. **Submitting**: When you are ready to submit, send us a Pull Request. Be
-sure to include the issue number you fixed and the name you used to sign
-the CLA.
-
-## Writing Code ##
-
-If your contribution contains code, please make sure that it follows 
-[the style guide](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml).
-Otherwise we will have to ask you to make changes, and that's no fun for anyone.
diff --git a/src/leveldb/Makefile b/src/leveldb/Makefile
index f7cc7d736..dbe1d7bf3 100644
--- a/src/leveldb/Makefile
+++ b/src/leveldb/Makefile
@@ -2,423 +2,219 @@
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+# Inherit some settings from environment variables, if available
+INSTALL_PATH ?= $(CURDIR)
+
 #-----------------------------------------------
 # Uncomment exactly one of the lines labelled (A), (B), and (C) below
 # to switch between compilation modes.
+#  NOTE: targets "debug" and "prof" provide same functionality
+#  NOTE 2: -DNDEBUG disables assert() statements within C code,
+#            i.e. no assert()s in production code
 
-# (A) Production use (optimized mode)
-OPT ?= -O2 -DNDEBUG
-# (B) Debug mode, w/ full line-level debugging symbols
-# OPT ?= -g2
-# (C) Profiling mode: opt, but w/debugging symbols
-# OPT ?= -O2 -g2 -DNDEBUG
+OPT ?= -O2 -g -DNDEBUG    # (A) Production use (optimized mode)
+# OPT ?= -g2              # (B) Debug mode, w/ full line-level debugging symbols
+# OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
 #-----------------------------------------------
 
 # detect what platform we're building on
-$(shell CC="$(CC)" CXX="$(CXX)" TARGET_OS="$(TARGET_OS)" \
-    ./build_detect_platform build_config.mk ./)
+ifeq ($(wildcard build_config.mk),)
+$(shell ./build_detect_platform build_config.mk)
+endif
 # this file is generated by the previous line to set build flags and sources
 include build_config.mk
 
-TESTS = \
-	db/autocompact_test \
-	db/c_test \
-	db/corruption_test \
-	db/db_test \
-	db/dbformat_test \
-	db/fault_injection_test \
-	db/filename_test \
-	db/log_test \
-	db/recovery_test \
-	db/skiplist_test \
-	db/version_edit_test \
-	db/version_set_test \
-	db/write_batch_test \
-	helpers/memenv/memenv_test \
-	issues/issue178_test \
-	issues/issue200_test \
-	table/filter_block_test \
-	table/table_test \
-	util/arena_test \
-	util/bloom_test \
-	util/cache_test \
-	util/coding_test \
-	util/crc32c_test \
-	util/env_posix_test \
-	util/env_test \
-	util/hash_test
-
-UTILS = \
-	db/db_bench \
-	db/leveldbutil
-
-# Put the object files in a subdirectory, but the application at the top of the object dir.
-PROGNAMES := $(notdir $(TESTS) $(UTILS))
-
-# On Linux may need libkyotocabinet-dev for dependency.
-BENCHMARKS = \
-	doc/bench/db_bench_sqlite3 \
-	doc/bench/db_bench_tree_db
-
 CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
 CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT)
 
 LDFLAGS += $(PLATFORM_LDFLAGS)
-LIBS += $(PLATFORM_LIBS)
 
-SIMULATOR_OUTDIR=out-ios-x86
-DEVICE_OUTDIR=out-ios-arm
+LIBOBJECTS := $(SOURCES:.cc=.o)
+LIBOBJECTS += util/lz4.o
+MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
+DEPEND := $(SOURCES:.cc=.d)
 
-ifeq ($(PLATFORM), IOS)
-# Note: iOS should probably be using libtool, not ar.
-AR=xcrun ar
-SIMULATORSDK=$(shell xcrun -sdk iphonesimulator --show-sdk-path)
-DEVICESDK=$(shell xcrun -sdk iphoneos --show-sdk-path)
-DEVICE_CFLAGS = -isysroot "$(DEVICESDK)" -arch armv6 -arch armv7 -arch armv7s -arch arm64
-SIMULATOR_CFLAGS = -isysroot "$(SIMULATORSDK)" -arch i686 -arch x86_64
-STATIC_OUTDIR=out-ios-universal
+TESTUTIL = ./util/testutil.o
+TESTHARNESS = ./util/testharness.o $(TESTUTIL)
+
+TESTS := $(sort $(notdir $(basename $(TEST_SOURCES))))
+
+TOOLS = \
+	leveldb_repair \
+	perf_dump \
+	sst_rewrite \
+	sst_scan
+
+PROGRAMS = db_bench $(TESTS) $(TOOLS)
+BENCHMARKS = db_bench_sqlite3 db_bench_tree_db
+
+LIBRARY = libleveldb.a
+MEMENVLIBRARY = libmemenv.a
+
+#
+# static link leveldb to tools to simplify platform usage (if Linux)
+#
+ifeq ($(PLATFORM),OS_LINUX)
+LEVEL_LDFLAGS := -L . -Wl,-non_shared -lleveldb -Wl,-call_shared
 else
-STATIC_OUTDIR=out-static
-SHARED_OUTDIR=out-shared
-STATIC_PROGRAMS := $(addprefix $(STATIC_OUTDIR)/, $(PROGNAMES))
-SHARED_PROGRAMS := $(addprefix $(SHARED_OUTDIR)/, db_bench)
+LEVEL_LDFLAGS := -L . -lleveldb
 endif
 
-STATIC_LIBOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(SOURCES:.cc=.o))
-STATIC_MEMENVOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
-
-DEVICE_LIBOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(SOURCES:.cc=.o))
-DEVICE_MEMENVOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
-
-SIMULATOR_LIBOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(SOURCES:.cc=.o))
-SIMULATOR_MEMENVOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
-
-SHARED_LIBOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(SOURCES:.cc=.o))
-SHARED_MEMENVOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
-
-TESTUTIL := $(STATIC_OUTDIR)/util/testutil.o
-TESTHARNESS := $(STATIC_OUTDIR)/util/testharness.o $(TESTUTIL)
-
-STATIC_TESTOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(TESTS)))
-STATIC_UTILOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(UTILS)))
-STATIC_ALLOBJS := $(STATIC_LIBOBJECTS) $(STATIC_MEMENVOBJECTS) $(STATIC_TESTOBJS) $(STATIC_UTILOBJS) $(TESTHARNESS)
-DEVICE_ALLOBJS := $(DEVICE_LIBOBJECTS) $(DEVICE_MEMENVOBJECTS)
-SIMULATOR_ALLOBJS := $(SIMULATOR_LIBOBJECTS) $(SIMULATOR_MEMENVOBJECTS)
-
 default: all
 
 # Should we build shared libraries?
 ifneq ($(PLATFORM_SHARED_EXT),)
 
-# Many leveldb test apps use non-exported API's. Only build a subset for testing.
-SHARED_ALLOBJS := $(SHARED_LIBOBJECTS) $(SHARED_MEMENVOBJECTS) $(TESTHARNESS)
-
 ifneq ($(PLATFORM_SHARED_VERSIONED),true)
-SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT)
-SHARED_LIB2 = $(SHARED_LIB1)
-SHARED_LIB3 = $(SHARED_LIB1)
-SHARED_LIBS = $(SHARED_LIB1)
-SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a
+SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1)
+SHARED3 = $(SHARED1)
+SHARED = $(SHARED1)
 else
 # Update db.h if you change these.
-SHARED_VERSION_MAJOR = 1
-SHARED_VERSION_MINOR = 20
-SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT)
-SHARED_LIB2 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR)
-SHARED_LIB3 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR).$(SHARED_VERSION_MINOR)
-SHARED_LIBS = $(SHARED_OUTDIR)/$(SHARED_LIB1) $(SHARED_OUTDIR)/$(SHARED_LIB2) $(SHARED_OUTDIR)/$(SHARED_LIB3)
-$(SHARED_OUTDIR)/$(SHARED_LIB1): $(SHARED_OUTDIR)/$(SHARED_LIB3)
-	ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB1)
-$(SHARED_OUTDIR)/$(SHARED_LIB2): $(SHARED_OUTDIR)/$(SHARED_LIB3)
-	ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB2)
-SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a
+SHARED_MAJOR = 1
+SHARED_MINOR = 9
+SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1).$(SHARED_MAJOR)
+SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
+SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
+$(SHARED1): $(SHARED3)
+	ln -fs $(SHARED3) $(SHARED1)
+$(SHARED2): $(SHARED3)
+	ln -fs $(SHARED3) $(SHARED2)
 endif
 
-$(SHARED_OUTDIR)/$(SHARED_LIB3): $(SHARED_LIBOBJECTS)
-	$(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED_LIB2) $(SHARED_LIBOBJECTS) -o $(SHARED_OUTDIR)/$(SHARED_LIB3) $(LIBS)
+$(SHARED3): $(LIBOBJECTS)
+	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LIBOBJECTS) -o $(SHARED3) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2)
 
 endif  # PLATFORM_SHARED_EXT
 
-all: $(SHARED_LIBS) $(SHARED_PROGRAMS) $(STATIC_OUTDIR)/libleveldb.a $(STATIC_OUTDIR)/libmemenv.a $(STATIC_PROGRAMS)
+all: $(SHARED) $(LIBRARY)
 
-check: $(STATIC_PROGRAMS)
-	for t in $(notdir $(TESTS)); do echo "***** Running $$t"; $(STATIC_OUTDIR)/$$t || exit 1; done
+test check: all $(PROGRAMS) $(TESTS)
+	for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
 
-clean:
-	-rm -rf out-static out-shared out-ios-x86 out-ios-arm out-ios-universal
-	-rm -f build_config.mk
-	-rm -rf ios-x86 ios-arm
+tools: all $(TOOLS)
 
-$(STATIC_OUTDIR):
-	mkdir $@
-
-$(STATIC_OUTDIR)/db: | $(STATIC_OUTDIR)
-	mkdir $@
-
-$(STATIC_OUTDIR)/helpers/memenv: | $(STATIC_OUTDIR)
-	mkdir -p $@
-
-$(STATIC_OUTDIR)/port: | $(STATIC_OUTDIR)
-	mkdir $@
-
-$(STATIC_OUTDIR)/table: | $(STATIC_OUTDIR)
-	mkdir $@
-
-$(STATIC_OUTDIR)/util: | $(STATIC_OUTDIR)
-	mkdir $@
-
-.PHONY: STATIC_OBJDIRS
-STATIC_OBJDIRS: \
-	$(STATIC_OUTDIR)/db \
-	$(STATIC_OUTDIR)/port \
-	$(STATIC_OUTDIR)/table \
-	$(STATIC_OUTDIR)/util \
-	$(STATIC_OUTDIR)/helpers/memenv
-
-$(SHARED_OUTDIR):
-	mkdir $@
-
-$(SHARED_OUTDIR)/db: | $(SHARED_OUTDIR)
-	mkdir $@
-
-$(SHARED_OUTDIR)/helpers/memenv: | $(SHARED_OUTDIR)
-	mkdir -p $@
-
-$(SHARED_OUTDIR)/port: | $(SHARED_OUTDIR)
-	mkdir $@
-
-$(SHARED_OUTDIR)/table: | $(SHARED_OUTDIR)
-	mkdir $@
-
-$(SHARED_OUTDIR)/util: | $(SHARED_OUTDIR)
-	mkdir $@
-
-.PHONY: SHARED_OBJDIRS
-SHARED_OBJDIRS: \
-	$(SHARED_OUTDIR)/db \
-	$(SHARED_OUTDIR)/port \
-	$(SHARED_OUTDIR)/table \
-	$(SHARED_OUTDIR)/util \
-	$(SHARED_OUTDIR)/helpers/memenv
-
-$(DEVICE_OUTDIR):
-	mkdir $@
-
-$(DEVICE_OUTDIR)/db: | $(DEVICE_OUTDIR)
-	mkdir $@
-
-$(DEVICE_OUTDIR)/helpers/memenv: | $(DEVICE_OUTDIR)
-	mkdir -p $@
-
-$(DEVICE_OUTDIR)/port: | $(DEVICE_OUTDIR)
-	mkdir $@
-
-$(DEVICE_OUTDIR)/table: | $(DEVICE_OUTDIR)
-	mkdir $@
-
-$(DEVICE_OUTDIR)/util: | $(DEVICE_OUTDIR)
-	mkdir $@
-
-.PHONY: DEVICE_OBJDIRS
-DEVICE_OBJDIRS: \
-	$(DEVICE_OUTDIR)/db \
-	$(DEVICE_OUTDIR)/port \
-	$(DEVICE_OUTDIR)/table \
-	$(DEVICE_OUTDIR)/util \
-	$(DEVICE_OUTDIR)/helpers/memenv
-
-$(SIMULATOR_OUTDIR):
-	mkdir $@
-
-$(SIMULATOR_OUTDIR)/db: | $(SIMULATOR_OUTDIR)
-	mkdir $@
-
-$(SIMULATOR_OUTDIR)/helpers/memenv: | $(SIMULATOR_OUTDIR)
-	mkdir -p $@
-
-$(SIMULATOR_OUTDIR)/port: | $(SIMULATOR_OUTDIR)
-	mkdir $@
-
-$(SIMULATOR_OUTDIR)/table: | $(SIMULATOR_OUTDIR)
-	mkdir $@
-
-$(SIMULATOR_OUTDIR)/util: | $(SIMULATOR_OUTDIR)
-	mkdir $@
-
-.PHONY: SIMULATOR_OBJDIRS
-SIMULATOR_OBJDIRS: \
-	$(SIMULATOR_OUTDIR)/db \
-	$(SIMULATOR_OUTDIR)/port \
-	$(SIMULATOR_OUTDIR)/table \
-	$(SIMULATOR_OUTDIR)/util \
-	$(SIMULATOR_OUTDIR)/helpers/memenv
-
-$(STATIC_ALLOBJS): | STATIC_OBJDIRS
-$(DEVICE_ALLOBJS): | DEVICE_OBJDIRS
-$(SIMULATOR_ALLOBJS): | SIMULATOR_OBJDIRS
-$(SHARED_ALLOBJS): | SHARED_OBJDIRS
-
-ifeq ($(PLATFORM), IOS)
-$(DEVICE_OUTDIR)/libleveldb.a: $(DEVICE_LIBOBJECTS)
-	rm -f $@
-	$(AR) -rs $@ $(DEVICE_LIBOBJECTS)
-
-$(SIMULATOR_OUTDIR)/libleveldb.a: $(SIMULATOR_LIBOBJECTS)
-	rm -f $@
-	$(AR) -rs $@ $(SIMULATOR_LIBOBJECTS)
-
-$(DEVICE_OUTDIR)/libmemenv.a: $(DEVICE_MEMENVOBJECTS)
-	rm -f $@
-	$(AR) -rs $@ $(DEVICE_MEMENVOBJECTS)
-
-$(SIMULATOR_OUTDIR)/libmemenv.a: $(SIMULATOR_MEMENVOBJECTS)
-	rm -f $@
-	$(AR) -rs $@ $(SIMULATOR_MEMENVOBJECTS)
-
-# For iOS, create universal object libraries to be used on both the simulator and
-# a device.
-$(STATIC_OUTDIR)/libleveldb.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a
-	lipo -create $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a -output $@
-
-$(STATIC_OUTDIR)/libmemenv.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a
-	lipo -create $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a -output $@
-else
-$(STATIC_OUTDIR)/libleveldb.a:$(STATIC_LIBOBJECTS)
-	rm -f $@
-	$(AR) -rs $@ $(STATIC_LIBOBJECTS)
-
-$(STATIC_OUTDIR)/libmemenv.a:$(STATIC_MEMENVOBJECTS)
-	rm -f $@
-	$(AR) -rs $@ $(STATIC_MEMENVOBJECTS)
+#
+# command line targets:  debug and prof
+#  just like
+ifneq ($(filter debug,$(MAKECMDGOALS)),)
+OPT := -g2              # (B) Debug mode, w/ full line-level debugging symbols
+debug: all
 endif
 
-$(SHARED_MEMENVLIB):$(SHARED_MEMENVOBJECTS)
+ifneq ($(filter prof,$(MAKECMDGOALS)),)
+OPT := -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
+prof: all
+endif
+
+
+clean:
+	-rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) */*.o */*/*.o */*.d */*/*.d ios-x86/*/*.o ios-arm/*/*.o build_config.mk include/leveldb/ldb_config.h
+	-rm -rf ios-x86/* ios-arm/* *.dSYM
+
+
+$(LIBRARY): $(LIBOBJECTS)
 	rm -f $@
-	$(AR) -rs $@ $(SHARED_MEMENVOBJECTS)
+	$(AR) -rs $@ $(LIBOBJECTS)
 
-$(STATIC_OUTDIR)/db_bench:db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ $(LIBS)
+#
+# all tools, programs, and tests depend upon the static library
+$(TESTS) $(PROGRAMS) $(TOOLS) : $(LIBRARY)
 
-$(STATIC_OUTDIR)/db_bench_sqlite3:doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lsqlite3 $(LIBS)
+#
+# all tests depend upon the test harness
+$(TESTS) : $(TESTHARNESS)
 
-$(STATIC_OUTDIR)/db_bench_tree_db:doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lkyotocabinet $(LIBS)
+#
+# tools, programs, and tests will compile to the root directory
+#  but their .cc source file will be in one of the following subdirectories
+vpath %.cc db:table:util:leveldb_ee:leveldb_os
 
-$(STATIC_OUTDIR)/leveldbutil:db/leveldbutil.cc $(STATIC_LIBOBJECTS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/leveldbutil.cc $(STATIC_LIBOBJECTS) -o $@ $(LIBS)
+# special case for c_test
+vpath %.c db
 
-$(STATIC_OUTDIR)/arena_test:util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+db_bench: db/db_bench.o $(LIBRARY) $(TESTUTIL)
+	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTUTIL) -o $@  $(LEVEL_LDFLAGS) $(LDFLAGS)
 
-$(STATIC_OUTDIR)/autocompact_test:db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBRARY) $(TESTUTIL)
 
-$(STATIC_OUTDIR)/bloom_test:util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBRARY) $(TESTUTIL)
 
-$(STATIC_OUTDIR)/c_test:$(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
 
-$(STATIC_OUTDIR)/cache_test:util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+#
+# build line taken from lz4 makefile
+#
+util/lz4.o: util/lz4.c util/lz4.h
+	$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -O3 -std=c99 -Wall -Wextra -Wundef -Wshadow -Wcast-qual -Wcast-align -Wstrict-prototypes -pedantic -DLZ4_VERSION=\"r130\"  -c util/lz4.c -o util/lz4.o
 
-$(STATIC_OUTDIR)/coding_test:util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+#
+# memory env
+#
+$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
+	rm -f $@
+	$(AR) -rs $@ $(MEMENVOBJECTS)
 
-$(STATIC_OUTDIR)/corruption_test:db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
+	$(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS)
 
-$(STATIC_OUTDIR)/crc32c_test:util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+#
+# IOS build
+#
+ifeq ($(PLATFORM), IOS)
+# For iOS, create universal object files to be used on both the simulator and
+# a device.
+PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
+SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
+DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
+IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
 
-$(STATIC_OUTDIR)/db_test:db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+.cc.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
+	mkdir -p ios-arm/$(dir $@)
+	$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
 
-$(STATIC_OUTDIR)/dbformat_test:db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+.c.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
+	mkdir -p ios-arm/$(dir $@)
+	$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
 
-$(STATIC_OUTDIR)/env_posix_test:util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/env_test:util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/fault_injection_test:db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/filename_test:db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/filter_block_test:table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/hash_test:util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/issue178_test:issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/issue200_test:issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/log_test:db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/recovery_test:db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/table_test:table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/skiplist_test:db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/version_edit_test:db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/version_set_test:db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/write_batch_test:db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
-
-$(STATIC_OUTDIR)/memenv_test:$(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS)
-	$(XCRUN) $(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS) -o $@ $(LIBS)
-
-$(SHARED_OUTDIR)/db_bench:$(SHARED_OUTDIR)/db/db_bench.o $(SHARED_LIBS) $(TESTUTIL)
-	$(XCRUN) $(CXX) $(LDFLAGS) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SHARED_OUTDIR)/db/db_bench.o $(TESTUTIL) $(SHARED_OUTDIR)/$(SHARED_LIB3) -o $@ $(LIBS)
-
-.PHONY: run-shared
-run-shared: $(SHARED_OUTDIR)/db_bench
-	LD_LIBRARY_PATH=$(SHARED_OUTDIR) $(SHARED_OUTDIR)/db_bench
-
-$(SIMULATOR_OUTDIR)/%.o: %.cc
-	xcrun -sdk iphonesimulator $(CXX) $(CXXFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@
-
-$(DEVICE_OUTDIR)/%.o: %.cc
-	xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) $(DEVICE_CFLAGS) -c $< -o $@
-
-$(SIMULATOR_OUTDIR)/%.o: %.c
-	xcrun -sdk iphonesimulator $(CC) $(CFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@
-
-$(DEVICE_OUTDIR)/%.o: %.c
-	xcrun -sdk iphoneos $(CC) $(CFLAGS) $(DEVICE_CFLAGS) -c $< -o $@
-
-$(STATIC_OUTDIR)/%.o: %.cc
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-$(STATIC_OUTDIR)/%.o: %.c
-	$(CC) $(CFLAGS) -c $< -o $@
-
-$(SHARED_OUTDIR)/%.o: %.cc
+else
+#
+# build for everything NOT IOS
+#
+.cc.o:
 	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
 
-$(SHARED_OUTDIR)/%.o: %.c
+.c.o:
 	$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
 
-$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
-	$(CXX) $(CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
+## 	@echo -- Creating dependency file for $<
+%.d: %.cc
+	$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM -E -MT $(basename $@).d -MT $(basename $@).o -MF $@ $<
+	@echo $(basename $@).o: $(basename $@).d >>$@
 
-$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
-	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
+# generic build for command line tests
+%: %.cc
+	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
+
+%: db/%.c
+	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
+
+# for tools, omits test harness
+%: tools/%.cc
+	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
+
+endif
+
+#
+# load dependency files
+#
+ifeq ($(filter tar clean allclean distclean,$(MAKECMDGOALS)),)
+-include $(DEPEND)
+endif
diff --git a/src/leveldb/README b/src/leveldb/README
new file mode 100644
index 000000000..6a3677406
--- /dev/null
+++ b/src/leveldb/README
@@ -0,0 +1,83 @@
+leveldb: A key-value store
+Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
+
+The original Google README is now README.GOOGLE.
+
+** Introduction
+
+This repository contains the Google source code as modified to benefit
+the Riak environment.  The typical Riak environment has two attributes
+that necessitate leveldb adjustments, both in options and code:
+
+- production servers: Riak often runs in heavy Internet environments:
+  servers with many CPU cores, lots of memory, and 24x7 disk activity.
+  Basho's leveldb takes advantage of the environment by adding
+  hardware CRC calculation, increasing Bloom filter accuracy, and
+  defaulting to integrity checking enabled.
+
+- multiple databases open: Riak opens 8 to 128 databases
+  simultaneously.  Google's leveldb supports this, but its background
+  compaction thread can fall behind.  leveldb will "stall" new user
+  writes whenever the compaction thread gets too far behind.  Basho's
+  leveldb modification include multiple thread blocks that each
+  contain prioritized threads for specific compaction activities.
+
+Details for Basho's customizations exist in the leveldb wiki:
+
+  http://github.com/basho/leveldb/wiki
+
+
+** Branch pattern
+
+This repository follows the Basho standard for branch management 
+as of November 28, 2013.  The standard is found here:
+
+https://github.com/basho/riak/wiki/Basho-repository-management
+
+In summary, the "develop" branch contains the most recently reviewed
+engineering work.  The "master" branch contains the most recently
+released work, i.e. distributed as part of a Riak release.
+
+
+** Basic options needed
+
+Those wishing to truly savor the benefits of Basho's modifications
+need to initialize a new leveldb::Options structure similar to the
+following before each call to leveldb::DB::Open:
+
+    leveldb::Options * options;
+
+    options=new Leveldb::Options;
+
+    options.filter_policy=leveldb::NewBloomFilterPolicy2(16);
+    options.write_buffer_size=62914560;  // 60Mbytes
+    options.total_leveldb_mem=2684354560; // 2.5Gbytes (details below)
+    options.env=leveldb::Env::Default();
+
+
+** Memory plan
+
+Basho's leveldb dramatically departed from Google's original internal
+memory allotment plan with Riak 2.0.  Basho's leveldb uses a methodology
+called flexcache.  The technical details are here:
+
+   https://github.com/basho/leveldb/wiki/mv-flexcache
+
+The key points are:
+
+- options.total_leveldb_mem is an allocation for the entire process,
+  not a single database
+
+- giving different values to options.total_leveldb_mem on subsequent Open
+  calls causes memory to rearrange to current value across all databases
+
+- recommended minimum for Basho's leveldb is 340Mbytes per database.  
+
+- performance improves rapidly from 340Mbytes to 2.5Gbytes per database (3.0Gbytes
+  if using Riak's active anti-entropy).  Even more is nice, but not as helpful.
+
+- never assign more than 75% of available RAM to total_leveldb_mem.  There is
+  too much unaccounted memory overhead (worse if you use tcmalloc library).
+
+- options.max_open_files and options.block_cache should not be used.
+  
diff --git a/src/leveldb/README.GOOGLE b/src/leveldb/README.GOOGLE
new file mode 100644
index 000000000..3618adeee
--- /dev/null
+++ b/src/leveldb/README.GOOGLE
@@ -0,0 +1,51 @@
+leveldb: A key-value store
+Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
+
+The code under this directory implements a system for maintaining a
+persistent key/value store.
+
+See doc/index.html for more explanation.
+See doc/impl.html for a brief overview of the implementation.
+
+The public interface is in include/*.h.  Callers should not include or
+rely on the details of any other header files in this package.  Those
+internal APIs may be changed without warning.
+
+Guide to header files:
+
+include/db.h
+    Main interface to the DB: Start here
+
+include/options.h
+    Control over the behavior of an entire database, and also
+    control over the behavior of individual reads and writes.
+
+include/comparator.h
+    Abstraction for user-specified comparison function.  If you want
+    just bytewise comparison of keys, you can use the default comparator,
+    but clients can write their own comparator implementations if they
+    want custom ordering (e.g. to handle different character
+    encodings, etc.)
+
+include/iterator.h
+    Interface for iterating over data. You can get an iterator
+    from a DB object.
+
+include/write_batch.h
+    Interface for atomically applying multiple updates to a database.
+
+include/slice.h
+    A simple module for maintaining a pointer and a length into some
+    other byte array.
+
+include/status.h
+    Status is returned from many of the public interfaces and is used
+    to report success and various kinds of errors.
+
+include/env.h
+    Abstraction of the OS environment.  A posix implementation of
+    this interface is in util/env_posix.cc
+
+include/table.h
+include/table_builder.h
+    Lower-level modules that most clients probably won't use directly
diff --git a/src/leveldb/README.md b/src/leveldb/README.md
deleted file mode 100644
index a010c5085..000000000
--- a/src/leveldb/README.md
+++ /dev/null
@@ -1,174 +0,0 @@
-**LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.**
-
-[![Build Status](https://travis-ci.org/google/leveldb.svg?branch=master)](https://travis-ci.org/google/leveldb)
-
-Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
-
-# Features
-  * Keys and values are arbitrary byte arrays.
-  * Data is stored sorted by key.
-  * Callers can provide a custom comparison function to override the sort order.
-  * The basic operations are `Put(key,value)`, `Get(key)`, `Delete(key)`.
-  * Multiple changes can be made in one atomic batch.
-  * Users can create a transient snapshot to get a consistent view of data.
-  * Forward and backward iteration is supported over the data.
-  * Data is automatically compressed using the [Snappy compression library](http://google.github.io/snappy/).
-  * External activity (file system operations etc.) is relayed through a virtual interface so users can customize the operating system interactions.
-
-# Documentation
-  [LevelDB library documentation](https://github.com/google/leveldb/blob/master/doc/index.md) is online and bundled with the source code.
-
-
-# Limitations
-  * This is not a SQL database.  It does not have a relational data model, it does not support SQL queries, and it has no support for indexes.
-  * Only a single process (possibly multi-threaded) can access a particular database at a time.
-  * There is no client-server support builtin to the library.  An application that needs such support will have to wrap their own server around the library.
-
-# Contributing to the leveldb Project
-The leveldb project welcomes contributions. leveldb's primary goal is to be
-a reliable and fast key/value store. Changes that are in line with the
-features/limitations outlined above, and meet the requirements below,
-will be considered.
-
-Contribution requirements:
-
-1. **POSIX only**. We _generally_ will only accept changes that are both
-   compiled, and tested on a POSIX platform - usually Linux. Very small
-   changes will sometimes be accepted, but consider that more of an
-   exception than the rule.
-
-2. **Stable API**. We strive very hard to maintain a stable API. Changes that
-   require changes for projects using leveldb _might_ be rejected without
-   sufficient benefit to the project.
-
-3. **Tests**: All changes must be accompanied by a new (or changed) test, or
-   a sufficient explanation as to why a new (or changed) test is not required.
-
-## Submitting a Pull Request
-Before any pull request will be accepted the author must first sign a
-Contributor License Agreement (CLA) at https://cla.developers.google.com/.
-
-In order to keep the commit timeline linear
-[squash](https://git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Squashing-Commits)
-your changes down to a single commit and [rebase](https://git-scm.com/docs/git-rebase)
-on google/leveldb/master. This keeps the commit timeline linear and more easily sync'ed
-with the internal repository at Google. More information at GitHub's
-[About Git rebase](https://help.github.com/articles/about-git-rebase/) page.
-
-# Performance
-
-Here is a performance report (with explanations) from the run of the
-included db_bench program.  The results are somewhat noisy, but should
-be enough to get a ballpark performance estimate.
-
-## Setup
-
-We use a database with a million entries.  Each entry has a 16 byte
-key, and a 100 byte value.  Values used by the benchmark compress to
-about half their original size.
-
-    LevelDB:    version 1.1
-    Date:       Sun May  1 12:11:26 2011
-    CPU:        4 x Intel(R) Core(TM)2 Quad CPU    Q6600  @ 2.40GHz
-    CPUCache:   4096 KB
-    Keys:       16 bytes each
-    Values:     100 bytes each (50 bytes after compression)
-    Entries:    1000000
-    Raw Size:   110.6 MB (estimated)
-    File Size:  62.9 MB (estimated)
-
-## Write performance
-
-The "fill" benchmarks create a brand new database, in either
-sequential, or random order.  The "fillsync" benchmark flushes data
-from the operating system to the disk after every operation; the other
-write operations leave the data sitting in the operating system buffer
-cache for a while.  The "overwrite" benchmark does random writes that
-update existing keys in the database.
-
-    fillseq      :       1.765 micros/op;   62.7 MB/s
-    fillsync     :     268.409 micros/op;    0.4 MB/s (10000 ops)
-    fillrandom   :       2.460 micros/op;   45.0 MB/s
-    overwrite    :       2.380 micros/op;   46.5 MB/s
-
-Each "op" above corresponds to a write of a single key/value pair.
-I.e., a random write benchmark goes at approximately 400,000 writes per second.
-
-Each "fillsync" operation costs much less (0.3 millisecond)
-than a disk seek (typically 10 milliseconds).  We suspect that this is
-because the hard disk itself is buffering the update in its memory and
-responding before the data has been written to the platter.  This may
-or may not be safe based on whether or not the hard disk has enough
-power to save its memory in the event of a power failure.
-
-## Read performance
-
-We list the performance of reading sequentially in both the forward
-and reverse direction, and also the performance of a random lookup.
-Note that the database created by the benchmark is quite small.
-Therefore the report characterizes the performance of leveldb when the
-working set fits in memory.  The cost of reading a piece of data that
-is not present in the operating system buffer cache will be dominated
-by the one or two disk seeks needed to fetch the data from disk.
-Write performance will be mostly unaffected by whether or not the
-working set fits in memory.
-
-    readrandom  : 16.677 micros/op;  (approximately 60,000 reads per second)
-    readseq     :  0.476 micros/op;  232.3 MB/s
-    readreverse :  0.724 micros/op;  152.9 MB/s
-
-LevelDB compacts its underlying storage data in the background to
-improve read performance.  The results listed above were done
-immediately after a lot of random writes.  The results after
-compactions (which are usually triggered automatically) are better.
-
-    readrandom  : 11.602 micros/op;  (approximately 85,000 reads per second)
-    readseq     :  0.423 micros/op;  261.8 MB/s
-    readreverse :  0.663 micros/op;  166.9 MB/s
-
-Some of the high cost of reads comes from repeated decompression of blocks
-read from disk.  If we supply enough cache to the leveldb so it can hold the
-uncompressed blocks in memory, the read performance improves again:
-
-    readrandom  : 9.775 micros/op;  (approximately 100,000 reads per second before compaction)
-    readrandom  : 5.215 micros/op;  (approximately 190,000 reads per second after compaction)
-
-## Repository contents
-
-See [doc/index.md](doc/index.md) for more explanation. See
-[doc/impl.md](doc/impl.md) for a brief overview of the implementation.
-
-The public interface is in include/*.h.  Callers should not include or
-rely on the details of any other header files in this package.  Those
-internal APIs may be changed without warning.
-
-Guide to header files:
-
-* **include/db.h**: Main interface to the DB: Start here
-
-* **include/options.h**: Control over the behavior of an entire database,
-and also control over the behavior of individual reads and writes.
-
-* **include/comparator.h**: Abstraction for user-specified comparison function.
-If you want just bytewise comparison of keys, you can use the default
-comparator, but clients can write their own comparator implementations if they
-want custom ordering (e.g. to handle different character encodings, etc.)
-
-* **include/iterator.h**: Interface for iterating over data. You can get
-an iterator from a DB object.
-
-* **include/write_batch.h**: Interface for atomically applying multiple
-updates to a database.
-
-* **include/slice.h**: A simple module for maintaining a pointer and a
-length into some other byte array.
-
-* **include/status.h**: Status is returned from many of the public interfaces
-and is used to report success and various kinds of errors.
-
-* **include/env.h**:
-Abstraction of the OS environment.  A posix implementation of this interface is
-in util/env_posix.cc
-
-* **include/table.h, include/table_builder.h**: Lower-level modules that most
-clients probably won't use directly
diff --git a/src/leveldb/TODO b/src/leveldb/TODO
index e603c0713..9130b6a9f 100644
--- a/src/leveldb/TODO
+++ b/src/leveldb/TODO
@@ -7,7 +7,6 @@ db
   within [start_key..end_key]?  For Chrome, deletion of obsolete
   object stores, etc. can be done in the background anyway, so
   probably not that important.
-- There have been requests for MultiGet.
 
 After a range is completely deleted, what gets rid of the
 corresponding files if we do no future changes to that range.  Make
diff --git a/src/leveldb/WINDOWS.md b/src/leveldb/WINDOWS.md
deleted file mode 100644
index 5b76c2448..000000000
--- a/src/leveldb/WINDOWS.md
+++ /dev/null
@@ -1,39 +0,0 @@
-# Building LevelDB On Windows
-
-## Prereqs 
-
-Install the [Windows Software Development Kit version 7.1](http://www.microsoft.com/downloads/dlx/en-us/listdetailsview.aspx?FamilyID=6b6c21d2-2006-4afa-9702-529fa782d63b).
-
-Download and extract the [Snappy source distribution](http://snappy.googlecode.com/files/snappy-1.0.5.tar.gz)
-
-1. Open the "Windows SDK 7.1 Command Prompt" :
-   Start Menu -> "Microsoft Windows SDK v7.1" > "Windows SDK 7.1 Command Prompt"
-2. Change the directory to the leveldb project
-
-## Building the Static lib 
-
-* 32 bit Version 
-
-        setenv /x86
-        msbuild.exe /p:Configuration=Release /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5
-
-* 64 bit Version 
-
-        setenv /x64
-        msbuild.exe /p:Configuration=Release /p:Platform=x64 /p:Snappy=..\snappy-1.0.5
-
-
-## Building and Running the Benchmark app
-
-* 32 bit Version 
-
-	    setenv /x86
-	    msbuild.exe /p:Configuration=Benchmark /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5
-		Benchmark\leveldb.exe
-
-* 64 bit Version 
-
-	    setenv /x64
-	    msbuild.exe /p:Configuration=Benchmark /p:Platform=x64 /p:Snappy=..\snappy-1.0.5
-	    x64\Benchmark\leveldb.exe
-
diff --git a/src/leveldb/build_detect_platform b/src/leveldb/build_detect_platform
index 4a9471590..0f231fc1d 100755
--- a/src/leveldb/build_detect_platform
+++ b/src/leveldb/build_detect_platform
@@ -7,11 +7,8 @@
 #   CC                          C Compiler path
 #   CXX                         C++ Compiler path
 #   PLATFORM_LDFLAGS            Linker flags
-#   PLATFORM_LIBS               Libraries flags
 #   PLATFORM_SHARED_EXT         Extension for shared libraries
 #   PLATFORM_SHARED_LDFLAGS     Flags for building shared library
-#                               This flag is embedded just before the name
-#                               of the shared library without intervening spaces
 #   PLATFORM_SHARED_CFLAGS      Flags for compiling objects for shared library
 #   PLATFORM_CCFLAGS            C compiler flags
 #   PLATFORM_CXXFLAGS           C++ compiler flags.  Will contain:
@@ -20,15 +17,14 @@
 #
 # The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
 #
-#       -DLEVELDB_ATOMIC_PRESENT     if <atomic> is present
+#       -DLEVELDB_CSTDATOMIC_PRESENT if <cstdatomic> is present
 #       -DLEVELDB_PLATFORM_POSIX     for Posix-based platforms
 #       -DSNAPPY                     if the Snappy library is present
 #
 
 OUTPUT=$1
-PREFIX=$2
-if test -z "$OUTPUT" || test -z "$PREFIX"; then
-  echo "usage: $0 <output-filename> <directory_prefix>" >&2
+if test -z "$OUTPUT"; then
+  echo "usage: $0 <output-filename>" >&2
   exit 1
 fi
 
@@ -44,10 +40,6 @@ if test -z "$CXX"; then
     CXX=g++
 fi
 
-if test -z "$TMPDIR"; then
-    TMPDIR=/tmp
-fi
-
 # Detect OS
 if test -z "$TARGET_OS"; then
     TARGET_OS=`uname -s`
@@ -58,119 +50,77 @@ CROSS_COMPILE=
 PLATFORM_CCFLAGS=
 PLATFORM_CXXFLAGS=
 PLATFORM_LDFLAGS=
-PLATFORM_LIBS=
-PLATFORM_SHARED_EXT="so"
+PLATFORM_SHARED_EXT=
 PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
 PLATFORM_SHARED_CFLAGS="-fPIC"
 PLATFORM_SHARED_VERSIONED=true
-PLATFORM_SSEFLAGS=
 
-MEMCMP_FLAG=
-if [ "$CXX" = "g++" ]; then
-    # Use libc's memcmp instead of GCC's memcmp.  This results in ~40%
-    # performance improvement on readrandom under gcc 4.4.3 on Linux/x86.
-    MEMCMP_FLAG="-fno-builtin-memcmp"
+if test -n "$LEVELDB_VSN"; then
+    VERSION_FLAGS="$VERSION_FLAGS -DLEVELDB_VSN=\"$LEVELDB_VSN\""
 fi
 
+# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
 case "$TARGET_OS" in
-    CYGWIN_*)
-        PLATFORM=OS_LINUX
-        COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN"
-        PLATFORM_LDFLAGS="-lpthread"
-        PORT_FILE=port/port_posix.cc
-        PORT_SSE_FILE=port/port_posix_sse.cc
-        ;;
     Darwin)
         PLATFORM=OS_MACOSX
-        COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
-        PLATFORM_SHARED_EXT=dylib
-        [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
-        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/"
+        oIFS="$IFS"; IFS=.
+        set `uname -r`
+        IFS="$oIFS"
+        if [ "$1" -ge 13 ]; then
+            # assume clang compiler
+            COMMON_FLAGS="-mmacosx-version-min=10.8 -DOS_MACOSX -stdlib=libc++"
+            PLATFORM_LDFLAGS="-mmacosx-version-min=10.8"
+        else
+            COMMON_FLAGS="-fno-builtin-memcmp -DOS_MACOSX"
+        fi
+        PLATFORM_SHARED_EXT=
+        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
         PORT_FILE=port/port_posix.cc
-        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     Linux)
         PLATFORM=OS_LINUX
-        COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX"
-        PLATFORM_LDFLAGS="-pthread"
+        COMMON_FLAGS="-fno-builtin-memcmp -pthread -DOS_LINUX"
+        PLATFORM_LDFLAGS="-pthread -lrt"
         PORT_FILE=port/port_posix.cc
-        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     SunOS)
         PLATFORM=OS_SOLARIS
-        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS"
-        PLATFORM_LIBS="-lpthread -lrt"
+        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS -m64"
+        PLATFORM_LDFLAGS="-lpthread -lrt"
+        PLATFORM_SHARED_EXT=
         PORT_FILE=port/port_posix.cc
-        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     FreeBSD)
+        CC=cc
+        CXX=c++
         PLATFORM=OS_FREEBSD
-        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD"
-        PLATFORM_LIBS="-lpthread"
-        PORT_FILE=port/port_posix.cc
-        PORT_SSE_FILE=port/port_posix_sse.cc
-        ;;
-    GNU/kFreeBSD)
-        PLATFORM=OS_KFREEBSD
-        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_KFREEBSD"
-        PLATFORM_LIBS="-lpthread"
+        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
+        PLATFORM_LDFLAGS="-lpthread"
         PORT_FILE=port/port_posix.cc
         ;;
     NetBSD)
         PLATFORM=OS_NETBSD
-        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"
-        PLATFORM_LIBS="-lpthread -lgcc_s"
+        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
+        PLATFORM_LDFLAGS="-lpthread -lgcc_s"
         PORT_FILE=port/port_posix.cc
-        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     OpenBSD)
         PLATFORM=OS_OPENBSD
-        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD"
+        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
         PLATFORM_LDFLAGS="-pthread"
         PORT_FILE=port/port_posix.cc
-        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     DragonFly)
         PLATFORM=OS_DRAGONFLYBSD
-        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD"
-        PLATFORM_LIBS="-lpthread"
+        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
+        PLATFORM_LDFLAGS="-lpthread"
         PORT_FILE=port/port_posix.cc
-        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     OS_ANDROID_CROSSCOMPILE)
         PLATFORM=OS_ANDROID
-        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
+        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
         PLATFORM_LDFLAGS=""  # All pthread features are in the Android C library
         PORT_FILE=port/port_posix.cc
-        PORT_SSE_FILE=port/port_posix_sse.cc
-        CROSS_COMPILE=true
-        ;;
-    HP-UX)
-        PLATFORM=OS_HPUX
-        COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
-        PLATFORM_LDFLAGS="-pthread"
-        PORT_FILE=port/port_posix.cc
-        PORT_SSE_FILE=port/port_posix_sse.cc
-        # man ld: +h internal_name
-        PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
-        ;;
-    IOS)
-        PLATFORM=IOS
-        COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
-        [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
-        PORT_FILE=port/port_posix.cc
-        PORT_SSE_FILE=port/port_posix_sse.cc
-        PLATFORM_SHARED_EXT=
-        PLATFORM_SHARED_LDFLAGS=
-        PLATFORM_SHARED_CFLAGS=
-        PLATFORM_SHARED_VERSIONED=
-        ;;
-    OS_WINDOWS_CROSSCOMPILE | NATIVE_WINDOWS)
-        PLATFORM=OS_WINDOWS
-        COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_WINDOWS -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1"
-        PLATFORM_SOURCES="util/env_win.cc"
-        PLATFORM_LIBS="-lshlwapi"
-        PORT_FILE=port/port_win.cc
         CROSS_COMPILE=true
         ;;
     *)
@@ -182,78 +132,106 @@ esac
 # except for the test and benchmark files. By default, find will output a list
 # of all files matching either rule, so we need to append -print to make the
 # prune take effect.
-DIRS="$PREFIX/db $PREFIX/util $PREFIX/table"
-
+if [ -f leveldb_ee/README.md ]; then
+DIRS="util db table leveldb_ee"
+else
+DIRS="util db table leveldb_os"
+fi
 set -f # temporarily disable globbing so that our patterns aren't expanded
 PRUNE_TEST="-name *test*.cc -prune"
 PRUNE_BENCH="-name *_bench.cc -prune"
-PRUNE_TOOL="-name leveldbutil.cc -prune"
-PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_TOOL -o -name '*.cc' -print | sort | sed "s,^$PREFIX/,," | tr "\n" " "`
-
+PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
+TESTS=`find $DIRS -name '*_test.c*' -print | sort | tr "\n" " "`
 set +f # re-enable globbing
 
 # The sources consist of the portable files, plus the platform-specific port
 # file.
-echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT
+echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
 echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
+echo "TEST_SOURCES=$TESTS" >>$OUTPUT
 
 if [ "$CROSS_COMPILE" = "true" ]; then
     # Cross-compiling; do not try any compilation tests.
     true
 else
-    CXXOUTPUT="${TMPDIR}/leveldb_build_detect_platform-cxx.$$"
-
-    # If -std=c++0x works, use <atomic> as fallback for when memory barriers
-    # are not available.
-    $CXX $CXXFLAGS -std=c++0x -x c++ - -o $CXXOUTPUT 2>/dev/null  <<EOF
-      #include <atomic>
+    # If -std=c++0x works, use <cstdatomic>.  Otherwise use port_posix.h.
+    $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <cstdatomic>
       int main() {}
 EOF
     if [ "$?" = 0 ]; then
-        COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_ATOMIC_PRESENT"
+        COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT"
         PLATFORM_CXXFLAGS="-std=c++0x"
     else
         COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
     fi
 
+    # Test whether Snappy library is installed
+    # http://code.google.com/p/snappy/
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <snappy.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
+        if [ "$PLATFORM" = "OS_LINUX" ]; then
+            # Basho: switching to static snappy library to make tools more portable
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -Wl,-non_shared -lsnappy -Wl,-call_shared"
+        else
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
+        fi
+    fi
+
     # Test whether tcmalloc is available
-    $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -ltcmalloc 2>/dev/null  <<EOF
+    $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null  <<EOF
       int main() {}
 EOF
     if [ "$?" = 0 ]; then
-        PLATFORM_LIBS="$PLATFORM_LIBS -ltcmalloc"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
     fi
-
-    rm -f $CXXOUTPUT 2>/dev/null
-
-    # Test if gcc SSE 4.2 is supported
-    $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null  <<EOF
-      int main() {}
-EOF
-    if [ "$?" = 0 ]; then
-        PLATFORM_SSEFLAGS="-msse4.2"
-    fi
-
-    rm -f $CXXOUTPUT 2>/dev/null
 fi
 
-# Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them.
-if [ -n "$PLATFORM_SSEFLAGS" ]; then
-    PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE"
-fi
-
-PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
-PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
+PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS $VERSION_FLAGS"
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS $VERSION_FLAGS"
 
 echo "CC=$CC" >> $OUTPUT
 echo "CXX=$CXX" >> $OUTPUT
 echo "PLATFORM=$PLATFORM" >> $OUTPUT
 echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
-echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
 echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
 echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
-echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT
 echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
 echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
 echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
 echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
+
+#
+# Basho extension to place -D variable in include/leveldb/ldb_config.h
+#
+
+LDB_CONFIG="include/leveldb/ldb_config.h"
+
+# Delete existing output, if it exists
+rm -f $LDB_CONFIG
+
+write_config_h()
+{
+    for param in $@
+    do
+        prefix=$(expr -- $param : "\(..\)")
+        if [ X$prefix = "X-D" ]
+        then
+            echo "" >>$LDB_CONFIG
+            echo "#ifndef $(expr -- $param : '..\(.*\)')" >>$LDB_CONFIG
+            echo "    #define $(expr -- $param : '..\(.*\)')" >>$LDB_CONFIG
+            echo "#endif" >>$LDB_CONFIG
+        fi
+    done
+}
+
+echo "/** This file is generated by build_detect_platform." >$LDB_CONFIG
+echo " *   It saves the state of compile flags.  This benefits the reuse" >>$LDB_CONFIG
+echo " *   of internal include files outside of a leveldb build." >>$LDB_CONFIG
+echo " */" >>$LDB_CONFIG
+
+write_config_h $COMMON_FLAGS
diff --git a/src/leveldb/db/autocompact_test.cc b/src/leveldb/db/autocompact_test.cc
deleted file mode 100644
index d20a2362c..000000000
--- a/src/leveldb/db/autocompact_test.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "leveldb/db.h"
-#include "db/db_impl.h"
-#include "leveldb/cache.h"
-#include "util/testharness.h"
-#include "util/testutil.h"
-
-namespace leveldb {
-
-class AutoCompactTest {
- public:
-  std::string dbname_;
-  Cache* tiny_cache_;
-  Options options_;
-  DB* db_;
-
-  AutoCompactTest() {
-    dbname_ = test::TmpDir() + "/autocompact_test";
-    tiny_cache_ = NewLRUCache(100);
-    options_.block_cache = tiny_cache_;
-    DestroyDB(dbname_, options_);
-    options_.create_if_missing = true;
-    options_.compression = kNoCompression;
-    ASSERT_OK(DB::Open(options_, dbname_, &db_));
-  }
-
-  ~AutoCompactTest() {
-    delete db_;
-    DestroyDB(dbname_, Options());
-    delete tiny_cache_;
-  }
-
-  std::string Key(int i) {
-    char buf[100];
-    snprintf(buf, sizeof(buf), "key%06d", i);
-    return std::string(buf);
-  }
-
-  uint64_t Size(const Slice& start, const Slice& limit) {
-    Range r(start, limit);
-    uint64_t size;
-    db_->GetApproximateSizes(&r, 1, &size);
-    return size;
-  }
-
-  void DoReads(int n);
-};
-
-static const int kValueSize = 200 * 1024;
-static const int kTotalSize = 100 * 1024 * 1024;
-static const int kCount = kTotalSize / kValueSize;
-
-// Read through the first n keys repeatedly and check that they get
-// compacted (verified by checking the size of the key space).
-void AutoCompactTest::DoReads(int n) {
-  std::string value(kValueSize, 'x');
-  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
-
-  // Fill database
-  for (int i = 0; i < kCount; i++) {
-    ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
-  }
-  ASSERT_OK(dbi->TEST_CompactMemTable());
-
-  // Delete everything
-  for (int i = 0; i < kCount; i++) {
-    ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
-  }
-  ASSERT_OK(dbi->TEST_CompactMemTable());
-
-  // Get initial measurement of the space we will be reading.
-  const int64_t initial_size = Size(Key(0), Key(n));
-  const int64_t initial_other_size = Size(Key(n), Key(kCount));
-
-  // Read until size drops significantly.
-  std::string limit_key = Key(n);
-  for (int read = 0; true; read++) {
-    ASSERT_LT(read, 100) << "Taking too long to compact";
-    Iterator* iter = db_->NewIterator(ReadOptions());
-    for (iter->SeekToFirst();
-         iter->Valid() && iter->key().ToString() < limit_key;
-         iter->Next()) {
-      // Drop data
-    }
-    delete iter;
-    // Wait a little bit to allow any triggered compactions to complete.
-    Env::Default()->SleepForMicroseconds(1000000);
-    uint64_t size = Size(Key(0), Key(n));
-    fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n",
-            read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0);
-    if (size <= initial_size/10) {
-      break;
-    }
-  }
-
-  // Verify that the size of the key space not touched by the reads
-  // is pretty much unchanged.
-  const int64_t final_other_size = Size(Key(n), Key(kCount));
-  ASSERT_LE(final_other_size, initial_other_size + 1048576);
-  ASSERT_GE(final_other_size, initial_other_size/5 - 1048576);
-}
-
-TEST(AutoCompactTest, ReadAll) {
-  DoReads(kCount);
-}
-
-TEST(AutoCompactTest, ReadHalf) {
-  DoReads(kCount/2);
-}
-
-}  // namespace leveldb
-
-int main(int argc, char** argv) {
-  return leveldb::test::RunAllTests();
-}
diff --git a/src/leveldb/db/builder.cc b/src/leveldb/db/builder.cc
index f41988219..4ac60f488 100644
--- a/src/leveldb/db/builder.cc
+++ b/src/leveldb/db/builder.cc
@@ -2,12 +2,16 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
 #include "db/builder.h"
 
 #include "db/filename.h"
 #include "db/dbformat.h"
 #include "db/table_cache.h"
 #include "db/version_edit.h"
+#include "db/version_set.h"
 #include "leveldb/db.h"
 #include "leveldb/env.h"
 #include "leveldb/iterator.h"
@@ -17,27 +21,51 @@ namespace leveldb {
 Status BuildTable(const std::string& dbname,
                   Env* env,
                   const Options& options,
+                  const Comparator * user_comparator,
                   TableCache* table_cache,
                   Iterator* iter,
-                  FileMetaData* meta) {
+                  FileMetaData* meta,
+                  SequenceNumber smallest_snapshot) {
   Status s;
+  size_t keys_seen, keys_retired;
+
+  keys_seen=0;
+  keys_retired=0;
+
   meta->file_size = 0;
   iter->SeekToFirst();
 
-  std::string fname = TableFileName(dbname, meta->number);
+  KeyRetirement retire(user_comparator, smallest_snapshot, &options);
+
+  std::string fname = TableFileName(options, meta->number, meta->level);
   if (iter->Valid()) {
     WritableFile* file;
-    s = env->NewWritableFile(fname, &file);
+
+    s = env->NewWritableFile(fname, &file,
+                                 env->RecoveryMmapSize(&options));
     if (!s.ok()) {
       return s;
     }
 
+    // tune fadvise to keep all of this lower level file in page cache
+    //  (compaction of unsorted files causes severe cache misses)
+    file->SetMetadataOffset(1);
+
     TableBuilder* builder = new TableBuilder(options, file);
     meta->smallest.DecodeFrom(iter->key());
     for (; iter->Valid(); iter->Next()) {
+      ++keys_seen;
       Slice key = iter->key();
-      meta->largest.DecodeFrom(key);
-      builder->Add(key, iter->value());
+      if (!retire(key))
+      {
+          meta->largest.DecodeFrom(key);
+          builder->Add(key, iter->value());
+          ++meta->num_entries;
+      }   // if
+      else
+      {
+          ++keys_retired;
+      }   // else
     }
 
     // Finish and check for builder errors
@@ -45,6 +73,9 @@ Status BuildTable(const std::string& dbname,
       s = builder->Finish();
       if (s.ok()) {
         meta->file_size = builder->FileSize();
+        meta->exp_write_low = builder->GetExpiryWriteLow();
+        meta->exp_write_high = builder->GetExpiryWriteHigh();
+        meta->exp_explicit_high = builder->GetExpiryExplicitHigh();
         assert(meta->file_size > 0);
       }
     } else {
@@ -64,10 +95,20 @@ Status BuildTable(const std::string& dbname,
 
     if (s.ok()) {
       // Verify that the table is usable
+      Table * table_ptr;
       Iterator* it = table_cache->NewIterator(ReadOptions(),
                                               meta->number,
-                                              meta->file_size);
+                                              meta->file_size,
+                                              meta->level,
+                                              &table_ptr);
       s = it->status();
+
+      // Riak specific: bloom filter is no longer read by default,
+      //  force read on highly used overlapped table files
+      if (s.ok() && VersionSet::IsLevelOverlapped(meta->level))
+          table_ptr->ReadFilter();
+
+      // table_ptr is owned by it and therefore invalidated by this delete
       delete it;
     }
   }
@@ -79,6 +120,11 @@ Status BuildTable(const std::string& dbname,
 
   if (s.ok() && meta->file_size > 0) {
     // Keep it
+      if (0!=keys_retired)
+      {
+          Log(options.info_log, "Level-0 table #%" PRIu64 ": %zd keys seen, %zd keys retired, %zd keys expired",
+              meta->number, keys_seen, retire.GetDroppedCount(), retire.GetExpiredCount());
+      }   // if
   } else {
     env->DeleteFile(fname);
   }
diff --git a/src/leveldb/db/builder.h b/src/leveldb/db/builder.h
index 62431fcf4..712924f8b 100644
--- a/src/leveldb/db/builder.h
+++ b/src/leveldb/db/builder.h
@@ -6,6 +6,7 @@
 #define STORAGE_LEVELDB_DB_BUILDER_H_
 
 #include "leveldb/status.h"
+#include "db/dbformat.h"
 
 namespace leveldb {
 
@@ -25,9 +26,11 @@ class VersionEdit;
 extern Status BuildTable(const std::string& dbname,
                          Env* env,
                          const Options& options,
+                         const Comparator * user_comparator,
                          TableCache* table_cache,
                          Iterator* iter,
-                         FileMetaData* meta);
+                         FileMetaData* meta,
+                         SequenceNumber smallest_snapshot);
 
 }  // namespace leveldb
 
diff --git a/src/leveldb/db/c.cc b/src/leveldb/db/c.cc
index 08ff0ad90..36066ffe0 100644
--- a/src/leveldb/db/c.cc
+++ b/src/leveldb/db/c.cc
@@ -6,6 +6,7 @@
 
 #include <stdlib.h>
 #include <unistd.h>
+#include <stdint.h>
 #include "leveldb/cache.h"
 #include "leveldb/comparator.h"
 #include "leveldb/db.h"
@@ -40,6 +41,8 @@ using leveldb::Status;
 using leveldb::WritableFile;
 using leveldb::WriteBatch;
 using leveldb::WriteOptions;
+using leveldb::KeyMetaData;
+using leveldb::ValueType;
 
 extern "C" {
 
@@ -49,6 +52,7 @@ struct leveldb_writebatch_t   { WriteBatch        rep; };
 struct leveldb_snapshot_t     { const Snapshot*   rep; };
 struct leveldb_readoptions_t  { ReadOptions       rep; };
 struct leveldb_writeoptions_t { WriteOptions      rep; };
+struct leveldb_keymetadata_t  { KeyMetaData       rep; };
 struct leveldb_options_t      { Options           rep; };
 struct leveldb_cache_t        { Cache*            rep; };
 struct leveldb_seqfile_t      { SequentialFile*   rep; };
@@ -173,8 +177,19 @@ void leveldb_put(
     const char* key, size_t keylen,
     const char* val, size_t vallen,
     char** errptr) {
+    return(leveldb_put2(db, options, key, keylen, val, vallen, errptr, NULL));
+}
+
+void leveldb_put2(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr,
+    const leveldb_keymetadata_t * metadata) {
   SaveError(errptr,
-            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen),
+                         (NULL==metadata ? NULL : &metadata->rep)));
 }
 
 void leveldb_delete(
@@ -200,9 +215,21 @@ char* leveldb_get(
     const char* key, size_t keylen,
     size_t* vallen,
     char** errptr) {
+
+ return(leveldb_get2(db, options, key, keylen, vallen, errptr, NULL));
+}
+
+char* leveldb_get2(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr,
+    leveldb_keymetadata_t * metadata) {
   char* result = NULL;
   std::string tmp;
-  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp,
+                          (NULL==metadata ? NULL : &metadata->rep));
   if (s.ok()) {
     *vallen = tmp.size();
     result = CopyString(tmp);
@@ -330,6 +357,15 @@ const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
   return s.data();
 }
 
+const void leveldb_iter_keymetadata(const leveldb_iterator_t* iter,
+                                    leveldb_keymetadata_t * meta)
+{
+  if (NULL!=iter && NULL!=meta)
+  {
+    meta->rep=iter->rep->keymetadata();
+  } // if
+}
+
 void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
   SaveError(errptr, iter->rep->status());
 }
@@ -350,7 +386,16 @@ void leveldb_writebatch_put(
     leveldb_writebatch_t* b,
     const char* key, size_t klen,
     const char* val, size_t vlen) {
-  b->rep.Put(Slice(key, klen), Slice(val, vlen));
+    leveldb_writebatch_put2(b, key, klen, val, vlen,NULL);
+}
+
+void leveldb_writebatch_put2(
+    leveldb_writebatch_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen,
+    const leveldb_keymetadata_t * metadata) {
+    b->rep.Put(Slice(key, klen), Slice(val, vlen),
+                         (NULL==metadata ? NULL : &metadata->rep));
 }
 
 void leveldb_writebatch_delete(
@@ -362,15 +407,20 @@ void leveldb_writebatch_delete(
 void leveldb_writebatch_iterate(
     leveldb_writebatch_t* b,
     void* state,
-    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen,
+                const int & type, const uint64_t & expiry),
     void (*deleted)(void*, const char* k, size_t klen)) {
   class H : public WriteBatch::Handler {
    public:
     void* state_;
-    void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+    void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen,
+                 const int & type, const uint64_t & expiry);
     void (*deleted_)(void*, const char* k, size_t klen);
-    virtual void Put(const Slice& key, const Slice& value) {
-      (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+    virtual void Put(const Slice& key, const Slice& value,
+                     const leveldb::ValueType & type,
+                     const leveldb::ExpiryTimeMicros & expiry)
+    {
+        (*put_)(state_, key.data(), key.size(), value.data(), value.size(), (int)type, (uint64_t)expiry);
     }
     virtual void Delete(const Slice& key) {
       (*deleted_)(state_, key.data(), key.size());
@@ -418,6 +468,11 @@ void leveldb_options_set_paranoid_checks(
   opt->rep.paranoid_checks = v;
 }
 
+void leveldb_options_set_verify_compactions(
+    leveldb_options_t* opt, unsigned char v) {
+  opt->rep.verify_compactions = v;
+}
+
 void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
   opt->rep.env = (env ? env->rep : NULL);
 }
@@ -450,6 +505,10 @@ void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
   opt->rep.compression = static_cast<CompressionType>(t);
 }
 
+void leveldb_options_set_total_leveldb_mem(leveldb_options_t* opt, size_t s) {
+  opt->rep.total_leveldb_mem = s;
+}
+
 leveldb_comparator_t* leveldb_comparator_create(
     void* state,
     void (*destructor)(void*),
@@ -580,8 +639,18 @@ void leveldb_env_destroy(leveldb_env_t* env) {
   delete env;
 }
 
+void leveldb_env_shutdown() {
+  Env::Shutdown();
+}
+
+/**
+ * CAUTION:  this call is only for char * objects returned by
+ *           functions like leveldb_get and leveldb_property_value.
+ *           Also used to release errptr strings.
+ */
 void leveldb_free(void* ptr) {
-  free(ptr);
+  if (NULL!=ptr)
+    free(ptr);
 }
 
 int leveldb_major_version() {
diff --git a/src/leveldb/db/c_test.c b/src/leveldb/db/c_test.c
index 7cd5ee020..637ba9311 100644
--- a/src/leveldb/db/c_test.c
+++ b/src/leveldb/db/c_test.c
@@ -3,6 +3,8 @@
    found in the LICENSE file. See the AUTHORS file for names of contributors. */
 
 #include "leveldb/c.h"
+#include "leveldb/options.h"
+#include "port/port.h"
 
 #include <stddef.h>
 #include <stdio.h>
@@ -11,8 +13,13 @@
 #include <sys/types.h>
 #include <unistd.h>
 
+using leveldb::ValueType;
+
+struct leveldb_keymetadata_t  { leveldb::KeyMetaData       rep; };
+
 const char* phase = "";
 static char dbname[200];
+static leveldb::ExpiryTimeMicros gStartTime;
 
 static void StartPhase(const char* name) {
   fprintf(stderr, "=== Test %s\n", name);
@@ -33,7 +40,7 @@ static const char* GetTempDir(void) {
   }
 
 #define CheckCondition(cond)                                            \
-  if (!(cond)) {                                                        \
+  if (!(cond)) {                                                     \
     fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
     abort();                                                            \
   }
@@ -49,7 +56,7 @@ static void CheckEqual(const char* expected, const char* v, size_t n) {
     fprintf(stderr, "%s: expected '%s', got '%s'\n",
             phase,
             (expected ? expected : "(null)"),
-            (v ? v : "(null"));
+            (v ? v : "(null)"));
     abort();
   }
 }
@@ -112,6 +119,117 @@ static void CheckDel(void* ptr, const char* k, size_t klen) {
   (*state)++;
 }
 
+//  (expiry enabled)
+static void CheckGet2(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options,
+    const char* key,
+    const char* expected,
+    ValueType type,
+    uint64_t expiry) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  leveldb_keymetadata_t meta;
+
+  val = leveldb_get2(db, options, key, strlen(key), &val_len, &err, &meta);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  CheckCondition(type==meta.rep.m_Type);
+  if (0==expiry && leveldb::kTypeValueWriteTime==type)
+  {
+    leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros();
+    CheckCondition(gStartTime<=meta.rep.m_Expiry && meta.rep.m_Expiry<=now);
+  }   // if
+  else
+    {CheckCondition(expiry==meta.rep.m_Expiry);}
+
+  Free(&val);
+}
+
+//  (expiry enabled)
+static void CheckIter2(leveldb_iterator_t* iter,
+                       const char* key, const char* val,
+                       const leveldb::KeyMetaData & meta) {
+  size_t len;
+  const char* str;
+  leveldb_keymetadata_t it_meta;
+
+  str = leveldb_iter_key(iter, &len);
+  CheckEqual(key, str, len);
+  str = leveldb_iter_value(iter, &len);
+  CheckEqual(val, str, len);
+
+  leveldb_iter_keymetadata(iter, &it_meta);
+  CheckCondition(meta.m_Type==it_meta.rep.m_Type);
+  if (0==meta.m_Expiry && leveldb::kTypeValueWriteTime==meta.m_Type)
+  {
+    leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros();
+    CheckCondition(gStartTime<=it_meta.rep.m_Expiry && it_meta.rep.m_Expiry<=now);
+  }   // if
+  else
+    {CheckCondition(meta.m_Expiry==it_meta.rep.m_Expiry);}
+
+}
+
+// Callback from leveldb_writebatch_iterate()
+//  (expiry enabled)
+struct CheckPut2Data
+{
+    const char * m_Key;
+    const char * m_Value;
+    ValueType m_Type;
+    uint64_t m_Expiry;
+};
+
+static struct CheckPut2Data gCheckPut2Data[]=
+{
+    {"foo","hello_put2",leveldb::kTypeValue,0},
+    {"box","c_put2",leveldb::kTypeValue,0},
+    {"disney","cartoon_put2",leveldb::kTypeValueWriteTime, 0},
+    {"money","lotsof_put2",leveldb::kTypeValueWriteTime, 9988776655},
+    {"time","ismoney_put2",leveldb::kTypeValueExplicitExpiry, 221199887766}
+};
+
+static struct CheckPut2Data gCheckPut2ItrData[]=
+{
+    {"bar","b",leveldb::kTypeValue,0},
+    {"box","c",leveldb::kTypeValue,0},
+    {"bar","",leveldb::kTypeDeletion,0},
+    {"mom","texas",leveldb::kTypeValueWriteTime,0},
+    {"dad","poland",leveldb::kTypeValueExplicitExpiry,22446688}
+  };
+
+static void CheckPut2(void* ptr,
+                      const char* k, size_t klen,
+                      const char* v, size_t vlen,
+                      const int & type_int,
+                      const uint64_t & expiry) {
+  int* state = (int*) ptr;
+  CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0])));
+  struct CheckPut2Data * test;
+
+  test=&gCheckPut2ItrData[*state];
+  CheckEqual(test->m_Key, k, klen);
+  CheckEqual(test->m_Value, v, vlen);
+  CheckCondition((int)test->m_Type==type_int);
+  if (leveldb::kTypeValueWriteTime!=test->m_Type)
+    {CheckCondition((uint64_t)test->m_Expiry==expiry);}
+  (*state)++;
+}
+
+// Callback from leveldb_writebatch_iterate()
+//  (expiry enabled)
+static void CheckDel2(void* ptr, const char* k, size_t klen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0])));
+  struct CheckPut2Data * test;
+
+  test=&gCheckPut2ItrData[*state];
+  CheckEqual(test->m_Key, k, klen);
+  (*state)++;
+}
+
 static void CmpDestroy(void* arg) { }
 
 static int CmpCompare(void* arg, const char* a, size_t alen,
@@ -141,7 +259,7 @@ static char* FilterCreate(
     int num_keys,
     size_t* filter_length) {
   *filter_length = 4;
-  char* result = malloc(4);
+  char* result = (char*)malloc(4);
   memcpy(result, "fake", 4);
   return result;
 }
@@ -167,6 +285,7 @@ int main(int argc, char** argv) {
 
   CheckCondition(leveldb_major_version() >= 1);
   CheckCondition(leveldb_minor_version() >= 1);
+  gStartTime=leveldb::port::TimeMicros();
 
   snprintf(dbname, sizeof(dbname),
            "%s/leveldb_c_test-%d",
@@ -207,12 +326,6 @@ int main(int argc, char** argv) {
   CheckCondition(err != NULL);
   Free(&err);
 
-  StartPhase("leveldb_free");
-  db = leveldb_open(options, dbname, &err);
-  CheckCondition(err != NULL);
-  leveldb_free(err);
-  err = NULL;
-
   StartPhase("open");
   leveldb_options_set_create_if_missing(options, 1);
   db = leveldb_open(options, dbname, &err);
@@ -234,42 +347,74 @@ int main(int argc, char** argv) {
 
   StartPhase("writebatch");
   {
+    leveldb_keymetadata_t meta;
     leveldb_writebatch_t* wb = leveldb_writebatch_create();
     leveldb_writebatch_put(wb, "foo", 3, "a", 1);
     leveldb_writebatch_clear(wb);
     leveldb_writebatch_put(wb, "bar", 3, "b", 1);
     leveldb_writebatch_put(wb, "box", 3, "c", 1);
     leveldb_writebatch_delete(wb, "bar", 3);
+    meta.rep.m_Type=leveldb::kTypeValueWriteTime;
+    meta.rep.m_Expiry=0;
+    leveldb_writebatch_put2(wb, "mom", 3, "texas", 5, &meta);
+    meta.rep.m_Type=leveldb::kTypeValueExplicitExpiry;
+    meta.rep.m_Expiry=22446688;
+    leveldb_writebatch_put2(wb, "dad", 3, "poland", 6, &meta);
     leveldb_write(db, woptions, wb, &err);
     CheckNoError(err);
     CheckGet(db, roptions, "foo", "hello");
     CheckGet(db, roptions, "bar", NULL);
     CheckGet(db, roptions, "box", "c");
+    CheckGet2(db, roptions, "dad", "poland", leveldb::kTypeValueExplicitExpiry, 22446688);
+    CheckGet2(db, roptions, "mom", "texas", leveldb::kTypeValueWriteTime, 0);
     int pos = 0;
-    leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
-    CheckCondition(pos == 3);
+    leveldb_writebatch_iterate(wb, &pos, CheckPut2, CheckDel2);
+    CheckCondition(pos == 5);
     leveldb_writebatch_destroy(wb);
   }
 
+  // reminder:  keymetadata not supported on backward iteration
   StartPhase("iter");
   {
+    leveldb::KeyMetaData meta;
     leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
     CheckCondition(!leveldb_iter_valid(iter));
     leveldb_iter_seek_to_first(iter);
     CheckCondition(leveldb_iter_valid(iter));
     CheckIter(iter, "box", "c");
+    meta.m_Type=leveldb::kTypeValue;
+    meta.m_Expiry=0;
+    CheckIter2(iter, "box", "c", meta);
+
+    meta.m_Type=leveldb::kTypeValueExplicitExpiry;
+    meta.m_Expiry=22446688;
+    leveldb_iter_next(iter);
+    CheckIter2(iter, "dad", "poland", meta);
     leveldb_iter_next(iter);
     CheckIter(iter, "foo", "hello");
     leveldb_iter_prev(iter);
+    CheckIter(iter, "dad", "poland");
+    leveldb_iter_prev(iter);
     CheckIter(iter, "box", "c");
     leveldb_iter_prev(iter);
     CheckCondition(!leveldb_iter_valid(iter));
     leveldb_iter_seek_to_last(iter);
-    CheckIter(iter, "foo", "hello");
+    CheckIter(iter, "mom", "texas");
     leveldb_iter_seek(iter, "b", 1);
     CheckIter(iter, "box", "c");
     leveldb_iter_get_error(iter, &err);
     CheckNoError(err);
+
+    meta.m_Type=leveldb::kTypeValue;
+    meta.m_Expiry=0;
+    CheckIter2(iter, "box", "c", meta);
+    leveldb_iter_seek(iter, "m", 1);
+    meta.m_Type=leveldb::kTypeValueWriteTime;
+    meta.m_Expiry=0;
+    CheckIter2(iter, "mom", "texas", meta);
+    leveldb_iter_get_error(iter, &err);
+    CheckNoError(err);
+
     leveldb_iter_destroy(iter);
   }
 
@@ -335,6 +480,70 @@ int main(int argc, char** argv) {
     leveldb_options_set_error_if_exists(options, 1);
   }
 
+  StartPhase("put expiry");
+  {
+      leveldb_keymetadata_t meta;
+      int loop, count;
+
+      count = sizeof(gCheckPut2Data) / sizeof(gCheckPut2Data[0]);
+
+      for (loop=0; loop<count; ++loop)
+      {
+          size_t klen, vlen;
+          leveldb_keymetadata_t meta;
+          struct CheckPut2Data * test;
+
+          test=&gCheckPut2Data[loop];
+          klen=strlen(test->m_Key);
+          vlen=strlen(test->m_Value);
+          meta.rep.m_Type=test->m_Type;
+          meta.rep.m_Expiry=test->m_Expiry;
+
+          leveldb_put2(db, woptions, test->m_Key, klen,
+                       test->m_Value, vlen, &err,
+                       &meta);
+          CheckNoError(err);
+      }   // for
+
+      // testing memtable right now
+      for (loop=0; loop<count; ++loop)
+      {
+          size_t klen, vlen;
+          leveldb_keymetadata_t meta;
+          struct CheckPut2Data * test;
+
+          test=&gCheckPut2Data[loop];
+          klen=strlen(test->m_Key);
+          vlen=strlen(test->m_Value);
+
+          CheckGet2(db, roptions, test->m_Key, test->m_Value,
+                    test->m_Type, test->m_Expiry);
+      }   // for
+
+      // close and open to force memory table into .sst upon open
+      leveldb_close(db);
+      leveldb_options_set_error_if_exists(options, 0);
+      db = leveldb_open(options, dbname, &err);
+      CheckNoError(err);
+
+      // now testing get from a level-0 .sst file
+      for (loop=0; loop<count; ++loop)
+      {
+          size_t klen, vlen;
+          leveldb_keymetadata_t meta;
+          struct CheckPut2Data * test;
+
+          test=&gCheckPut2Data[loop];
+          klen=strlen(test->m_Key);
+          vlen=strlen(test->m_Value);
+
+          CheckGet2(db, roptions, test->m_Key, test->m_Value,
+                    test->m_Type, test->m_Expiry);
+      }   // for
+  }
+
+  //
+  // This screws up "options" for real database work.  execute last.
   StartPhase("filter");
   for (run = 0; run < 2; run++) {
     // First run uses custom filter, second run uses bloom filter
@@ -376,6 +585,8 @@ int main(int argc, char** argv) {
     leveldb_filterpolicy_destroy(policy);
   }
 
+
+
   StartPhase("cleanup");
   leveldb_close(db);
   leveldb_options_destroy(options);
@@ -386,5 +597,7 @@ int main(int argc, char** argv) {
   leveldb_env_destroy(env);
 
   fprintf(stderr, "PASS\n");
+
+  leveldb_env_shutdown();
   return 0;
 }
diff --git a/src/leveldb/db/corruption_test.cc b/src/leveldb/db/corruption_test.cc
index 37a484d25..3b40b1c96 100644
--- a/src/leveldb/db/corruption_test.cc
+++ b/src/leveldb/db/corruption_test.cc
@@ -35,8 +35,8 @@ class CorruptionTest {
   CorruptionTest() {
     tiny_cache_ = NewLRUCache(100);
     options_.env = &env_;
-    options_.block_cache = tiny_cache_;
-    dbname_ = test::TmpDir() + "/corruption_test";
+    dbname_ = test::TmpDir() + "/db_test";
+    dbname_ = MakeTieredDbname(dbname_, options_);
     DestroyDB(dbname_, options_);
 
     db_ = NULL;
@@ -51,14 +51,17 @@ class CorruptionTest {
      delete tiny_cache_;
   }
 
-  Status TryReopen() {
+  Status TryReopen(Options* options = NULL) {
     delete db_;
     db_ = NULL;
-    return DB::Open(options_, dbname_, &db_);
+    Options opt = (options ? *options : options_);
+    opt.env = &env_;
+    opt.block_cache = tiny_cache_;
+    return DB::Open(opt, dbname_, &db_);
   }
 
-  void Reopen() {
-    ASSERT_OK(TryReopen());
+  void Reopen(Options* options = NULL) {
+    ASSERT_OK(TryReopen(options));
   }
 
   void RepairDB() {
@@ -75,13 +78,7 @@ class CorruptionTest {
       Slice key = Key(i, &key_space);
       batch.Clear();
       batch.Put(key, Value(i, &value_space));
-      WriteOptions options;
-      // Corrupt() doesn't work without this sync on windows; stat reports 0 for
-      // the file size.
-      if (i == n - 1) {
-        options.sync = true;
-      }
-      ASSERT_OK(db_->Write(options, &batch));
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
     }
   }
 
@@ -96,10 +93,6 @@ class CorruptionTest {
     for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
       uint64_t key;
       Slice in(iter->key());
-      if (in == "" || in == "~") {
-        // Ignore boundary keys.
-        continue;
-      }
       if (!ConsumeDecimalNumber(&in, &key) ||
           !in.empty() ||
           key < next_expected) {
@@ -123,19 +116,26 @@ class CorruptionTest {
     ASSERT_GE(max_expected, correct);
   }
 
-  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt, int level=0) {
     // Pick file to corrupt
     std::vector<std::string> filenames;
-    ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+    std::string dirname;
+    if (leveldb::kTableFile!=filetype)
+        dirname=dbname_;
+    else
+        dirname=MakeDirName2(options_, level, "sst");
+
+    ASSERT_OK(env_.GetChildren(dirname, &filenames));
+
     uint64_t number;
     FileType type;
     std::string fname;
     int picked_number = -1;
-    for (size_t i = 0; i < filenames.size(); i++) {
+    for (int i = 0; i < filenames.size(); i++) {
       if (ParseFileName(filenames[i], &number, &type) &&
           type == filetype &&
           int(number) > picked_number) {  // Pick latest file
-        fname = dbname_ + "/" + filenames[i];
+        fname = dirname + "/" + filenames[i];
         picked_number = number;
       }
     }
@@ -222,12 +222,14 @@ TEST(CorruptionTest, NewFileErrorDuringWrite) {
   const int num = 3 + (Options().write_buffer_size / kValueSize);
   std::string value_storage;
   Status s;
-  for (int i = 0; s.ok() && i < num; i++) {
+  for (int i = 0;
+       s.ok() && i < num && 0==env_.num_writable_file_errors_;
+       i++) {
     WriteBatch batch;
     batch.Put("a", Value(100, &value_storage));
     s = db_->Write(WriteOptions(), &batch);
   }
-  ASSERT_TRUE(!s.ok());
+//  ASSERT_TRUE(!s.ok());  Background write thread will never report this
   ASSERT_GE(env_.num_writable_file_errors_, 1);
   env_.writable_file_error_ = false;
   Reopen();
@@ -240,34 +242,18 @@ TEST(CorruptionTest, TableFile) {
   dbi->TEST_CompactRange(0, NULL, NULL);
   dbi->TEST_CompactRange(1, NULL, NULL);
 
-  Corrupt(kTableFile, 100, 1);
-  Check(90, 99);
-}
-
-TEST(CorruptionTest, TableFileRepair) {
-  options_.block_size = 2 * kValueSize;  // Limit scope of corruption
-  options_.paranoid_checks = true;
-  Reopen();
-  Build(100);
-  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
-  dbi->TEST_CompactMemTable();
-  dbi->TEST_CompactRange(0, NULL, NULL);
-  dbi->TEST_CompactRange(1, NULL, NULL);
-
-  Corrupt(kTableFile, 100, 1);
-  RepairDB();
-  Reopen();
+  Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);
   Check(95, 99);
 }
 
 TEST(CorruptionTest, TableFileIndexData) {
-  Build(10000);  // Enough to build multiple Tables
+  Build(100000);  // Enough to build multiple Tables
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
   dbi->TEST_CompactMemTable();
 
-  Corrupt(kTableFile, -2000, 500);
+  Corrupt(kTableFile, -2000, 500, config::kMaxMemCompactLevel);
   Reopen();
-  Check(5000, 9999);
+  Check(50000, 99999);
 }
 
 TEST(CorruptionTest, MissingDescriptor) {
@@ -319,10 +305,10 @@ TEST(CorruptionTest, CompactionInputError) {
   Build(10);
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
   dbi->TEST_CompactMemTable();
-  const int last = config::kMaxMemCompactLevel;
+  const int last = config::kMaxMemCompactLevel; // Riak does not "move" files
   ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
 
-  Corrupt(kTableFile, 100, 1);
+  Corrupt(kTableFile, 100, 1, last);
   Check(5, 9);
 
   // Force compactions by writing lots of values
@@ -331,31 +317,50 @@ TEST(CorruptionTest, CompactionInputError) {
 }
 
 TEST(CorruptionTest, CompactionInputErrorParanoid) {
-  options_.paranoid_checks = true;
-  options_.write_buffer_size = 512 << 10;
-  Reopen();
+  Options options;
+  options.paranoid_checks = true;
+  options.write_buffer_size = 1048576;
+  Reopen(&options);
+
+  int current_corruption=Property("leveldb.ReadBlockError");
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
 
-  // Make multiple inputs so we need to compact.
-  for (int i = 0; i < 2; i++) {
-    Build(10);
+  // Fill levels >= 1 so memtable compaction outputs to level 1
+  //  matthewv 1/10/14 - what does "levels" have to do with this,
+  //  switching to compaction trigger.
+  // 7/10/14 - compaction starts between 4 and 6 files ... assume 4 and 1 move
+  //  (will make a new, descriptive constant for 4)
+  for (int level = Property("leveldb.num-files-at-level0")+1;
+       level < config::kL0_GroomingTrigger; level++) {
+    dbi->Put(WriteOptions(), "", "begin");
+    dbi->Put(WriteOptions(), "~", "end");
     dbi->TEST_CompactMemTable();
-    Corrupt(kTableFile, 100, 1);
-    env_.SleepForMicroseconds(100000);
   }
-  dbi->CompactRange(NULL, NULL);
 
-  // Write must fail because of corrupted table
+  Build(10);
+  dbi->TEST_CompactMemTable();
+  ASSERT_TRUE(1 < Property("leveldb.num-files-at-level0"));
+
+  Corrupt(kTableFile, 100, 1, 0);
+  Check(5, 9);
+
+  // Write must eventually fail because of corrupted table
+  Status s;
   std::string tmp1, tmp2;
-  Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2));
-  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+  for (int i = 0; i < 10000 && s.ok(); i++) {
+    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+  }
+  if (s.ok())
+      ASSERT_NE(current_corruption, Property("leveldb.ReadBlockError")) << "no ReadBlockError seen";
+  else
+      ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
 }
 
 TEST(CorruptionTest, UnrelatedKeys) {
   Build(10);
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
   dbi->TEST_CompactMemTable();
-  Corrupt(kTableFile, 100, 1);
+  Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);
 
   std::string tmp1, tmp2;
   ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
diff --git a/src/leveldb/db/db_bench.cc b/src/leveldb/db/db_bench.cc
index 3ad19a512..644cf479c 100644
--- a/src/leveldb/db/db_bench.cc
+++ b/src/leveldb/db/db_bench.cc
@@ -33,7 +33,6 @@
 //      readmissing   -- read N missing keys in random order
 //      readhot       -- read N times in random order from 1% section of DB
 //      seekrandom    -- N random seeks
-//      open          -- cost of opening a DB
 //      crc32c        -- repeated crc32c of 4K of data
 //      acquireload   -- load N*1000 times
 //   Meta operations:
@@ -84,14 +83,6 @@ static bool FLAGS_histogram = false;
 // (initialized to default value by "main")
 static int FLAGS_write_buffer_size = 0;
 
-// Number of bytes written to each file.
-// (initialized to default value by "main")
-static int FLAGS_max_file_size = 0;
-
-// Approximate size of user data packed per block (before compression.
-// (initialized to default value by "main")
-static int FLAGS_block_size = 0;
-
 // Number of bytes to use as a cache of uncompressed data.
 // Negative means use default settings.
 static int FLAGS_cache_size = -1;
@@ -103,21 +94,26 @@ static int FLAGS_open_files = 0;
 // Negative means use default settings.
 static int FLAGS_bloom_bits = -1;
 
+// Riak bloom adaptation
+static int FLAGS_bloom2_bits = -1;
+
+// Riak param for total memory allocation (flex_cache)
+static uint64_t FLAGS_leveldb_memory = -1;
+
+// Riak param for compression setting
+static int FLAGS_compression = 2;
+
 // If true, do not destroy the existing database.  If you set this
 // flag and also specify a benchmark that wants a fresh database, that
 // benchmark will fail.
 static bool FLAGS_use_existing_db = false;
 
-// If true, reuse existing log/MANIFEST files when re-opening a database.
-static bool FLAGS_reuse_logs = false;
-
 // Use the db with the following name.
 static const char* FLAGS_db = NULL;
 
 namespace leveldb {
 
 namespace {
-leveldb::Env* g_env = NULL;
 
 // Helper for quickly generating random data.
 class RandomGenerator {
@@ -141,7 +137,7 @@ class RandomGenerator {
     pos_ = 0;
   }
 
-  Slice Generate(size_t len) {
+  Slice Generate(int len) {
     if (pos_ + len > data_.size()) {
       pos_ = 0;
       assert(len < data_.size());
@@ -151,19 +147,17 @@ class RandomGenerator {
   }
 };
 
-#if defined(__linux)
 static Slice TrimSpace(Slice s) {
-  size_t start = 0;
+  int start = 0;
   while (start < s.size() && isspace(s[start])) {
     start++;
   }
-  size_t limit = s.size();
+  int limit = s.size();
   while (limit > start && isspace(s[limit-1])) {
     limit--;
   }
   return Slice(s.data() + start, limit - start);
 }
-#endif
 
 static void AppendWithSpace(std::string* str, Slice msg) {
   if (msg.empty()) return;
@@ -195,7 +189,7 @@ class Stats {
     done_ = 0;
     bytes_ = 0;
     seconds_ = 0;
-    start_ = g_env->NowMicros();
+    start_ = Env::Default()->NowMicros();
     finish_ = start_;
     message_.clear();
   }
@@ -213,7 +207,7 @@ class Stats {
   }
 
   void Stop() {
-    finish_ = g_env->NowMicros();
+    finish_ = Env::Default()->NowMicros();
     seconds_ = (finish_ - start_) * 1e-6;
   }
 
@@ -223,7 +217,7 @@ class Stats {
 
   void FinishedSingleOp() {
     if (FLAGS_histogram) {
-      double now = g_env->NowMicros();
+      double now = Env::Default()->NowMicros();
       double micros = now - last_op_finish_;
       hist_.Add(micros);
       if (micros > 20000) {
@@ -405,7 +399,7 @@ class Benchmark {
   : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
     filter_policy_(FLAGS_bloom_bits >= 0
                    ? NewBloomFilterPolicy(FLAGS_bloom_bits)
-                   : NULL),
+                   : (FLAGS_bloom2_bits >=0 ? NewBloomFilterPolicy2(FLAGS_bloom2_bits) : NULL)),
     db_(NULL),
     num_(FLAGS_num),
     value_size_(FLAGS_value_size),
@@ -413,10 +407,10 @@ class Benchmark {
     reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
     heap_counter_(0) {
     std::vector<std::string> files;
-    g_env->GetChildren(FLAGS_db, &files);
-    for (size_t i = 0; i < files.size(); i++) {
+    Env::Default()->GetChildren(FLAGS_db, &files);
+    for (int i = 0; i < files.size(); i++) {
       if (Slice(files[i]).starts_with("heap-")) {
-        g_env->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
+        Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
       }
     }
     if (!FLAGS_use_existing_db) {
@@ -446,7 +440,7 @@ class Benchmark {
         benchmarks = sep + 1;
       }
 
-      // Reset parameters that may be overridden below
+      // Reset parameters that may be overriddden bwlow
       num_ = FLAGS_num;
       reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
       value_size_ = FLAGS_value_size;
@@ -457,11 +451,7 @@ class Benchmark {
       bool fresh_db = false;
       int num_threads = FLAGS_threads;
 
-      if (name == Slice("open")) {
-        method = &Benchmark::OpenBench;
-        num_ /= 10000;
-        if (num_ < 1) num_ = 1;
-      } else if (name == Slice("fillseq")) {
+      if (name == Slice("fillseq")) {
         fresh_db = true;
         method = &Benchmark::WriteSeq;
       } else if (name == Slice("fillbatch")) {
@@ -553,6 +543,7 @@ class Benchmark {
     SharedState* shared;
     ThreadState* thread;
     void (Benchmark::*method)(ThreadState*);
+    pthread_t thread_id;
   };
 
   static void ThreadBody(void* v) {
@@ -598,7 +589,8 @@ class Benchmark {
       arg[i].shared = &shared;
       arg[i].thread = new ThreadState(i);
       arg[i].thread->shared = &shared;
-      g_env->StartThread(ThreadBody, &arg[i]);
+      arg[i].thread_id=Env::Default()->StartThread(ThreadBody, &arg[i]);
+      pthread_detach(arg[i].thread_id);
     }
 
     shared.mu.Lock();
@@ -709,15 +701,12 @@ class Benchmark {
   void Open() {
     assert(db_ == NULL);
     Options options;
-    options.env = g_env;
     options.create_if_missing = !FLAGS_use_existing_db;
     options.block_cache = cache_;
     options.write_buffer_size = FLAGS_write_buffer_size;
-    options.max_file_size = FLAGS_max_file_size;
-    options.block_size = FLAGS_block_size;
-    options.max_open_files = FLAGS_open_files;
     options.filter_policy = filter_policy_;
-    options.reuse_logs = FLAGS_reuse_logs;
+    options.compression = (leveldb::CompressionType)FLAGS_compression;
+    options.total_leveldb_mem = FLAGS_leveldb_memory;
     Status s = DB::Open(options, FLAGS_db, &db_);
     if (!s.ok()) {
       fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@@ -725,14 +714,6 @@ class Benchmark {
     }
   }
 
-  void OpenBench(ThreadState* thread) {
-    for (int i = 0; i < num_; i++) {
-      delete db_;
-      Open();
-      thread->stats.FinishedSingleOp();
-    }
-  }
-
   void WriteSeq(ThreadState* thread) {
     DoWrite(thread, true);
   }
@@ -842,6 +823,7 @@ class Benchmark {
 
   void SeekRandom(ThreadState* thread) {
     ReadOptions options;
+    std::string value;
     int found = 0;
     for (int i = 0; i < reads_; i++) {
       Iterator* iter = db_->NewIterator(options);
@@ -937,7 +919,7 @@ class Benchmark {
     char fname[100];
     snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_);
     WritableFile* file;
-    Status s = g_env->NewWritableFile(fname, &file);
+    Status s = Env::Default()->NewWritableFile(fname, &file, 2<<20);
     if (!s.ok()) {
       fprintf(stderr, "%s\n", s.ToString().c_str());
       return;
@@ -946,7 +928,7 @@ class Benchmark {
     delete file;
     if (!ok) {
       fprintf(stderr, "heap profiling not supported\n");
-      g_env->DeleteFile(fname);
+      Env::Default()->DeleteFile(fname);
     }
   }
 };
@@ -955,14 +937,14 @@ class Benchmark {
 
 int main(int argc, char** argv) {
   FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
-  FLAGS_max_file_size = leveldb::Options().max_file_size;
-  FLAGS_block_size = leveldb::Options().block_size;
   FLAGS_open_files = leveldb::Options().max_open_files;
+  FLAGS_leveldb_memory = 25000000000LL;
   std::string default_db_path;
 
   for (int i = 1; i < argc; i++) {
     double d;
     int n;
+    uint64_t u;
     char junk;
     if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
       FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
@@ -974,9 +956,6 @@ int main(int argc, char** argv) {
     } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
                (n == 0 || n == 1)) {
       FLAGS_use_existing_db = n;
-    } else if (sscanf(argv[i], "--reuse_logs=%d%c", &n, &junk) == 1 &&
-               (n == 0 || n == 1)) {
-      FLAGS_reuse_logs = n;
     } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
       FLAGS_num = n;
     } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
@@ -987,16 +966,18 @@ int main(int argc, char** argv) {
       FLAGS_value_size = n;
     } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
       FLAGS_write_buffer_size = n;
-    } else if (sscanf(argv[i], "--max_file_size=%d%c", &n, &junk) == 1) {
-      FLAGS_max_file_size = n;
-    } else if (sscanf(argv[i], "--block_size=%d%c", &n, &junk) == 1) {
-      FLAGS_block_size = n;
     } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
       FLAGS_cache_size = n;
     } else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
       FLAGS_bloom_bits = n;
+    } else if (sscanf(argv[i], "--bloom_bits2=%d%c", &n, &junk) == 1) {
+      FLAGS_bloom2_bits = n;
+    } else if (sscanf(argv[i], "--leveldb_memory=%d%c", &n, &junk) == 1) {
+      FLAGS_leveldb_memory = n * 1024 * 1024LL;
     } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
       FLAGS_open_files = n;
+    } else if (sscanf(argv[i], "--compression=%d%c", &n, &junk) == 1) {
+      FLAGS_compression = n;
     } else if (strncmp(argv[i], "--db=", 5) == 0) {
       FLAGS_db = argv[i] + 5;
     } else {
@@ -1005,16 +986,20 @@ int main(int argc, char** argv) {
     }
   }
 
-  leveldb::g_env = leveldb::Env::Default();
-
   // Choose a location for the test database if none given with --db=<path>
   if (FLAGS_db == NULL) {
-      leveldb::g_env->GetTestDirectory(&default_db_path);
+      leveldb::Env::Default()->GetTestDirectory(&default_db_path);
       default_db_path += "/dbbench";
       FLAGS_db = default_db_path.c_str();
   }
 
-  leveldb::Benchmark benchmark;
-  benchmark.Run();
+  // benchmark class needs to destruct before Shutdown call
+  {
+      leveldb::Benchmark benchmark;
+      benchmark.Run();
+  }
+
+  leveldb::Env::Shutdown();
+
   return 0;
 }
diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc
index 3bb58e560..9c0f0b555 100644
--- a/src/leveldb/db/db_impl.cc
+++ b/src/leveldb/db/db_impl.cc
@@ -4,11 +4,16 @@
 
 #include "db/db_impl.h"
 
+#include <time.h>
 #include <algorithm>
+#include <errno.h>
+#include <limits.h>
+#include <math.h>
 #include <set>
 #include <string>
 #include <stdint.h>
 #include <stdio.h>
+#include <unistd.h>
 #include <vector>
 #include "db/builder.h"
 #include "db/db_iter.h"
@@ -29,14 +34,21 @@
 #include "table/block.h"
 #include "table/merger.h"
 #include "table/two_level_iterator.h"
+#include "util/db_list.h"
 #include "util/coding.h"
+#include "util/flexcache.h"
+#include "util/hot_threads.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
+#include "util/thread_tasks.h"
+#include "util/throttle.h"
+#include "leveldb/perf_count.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
 
 namespace leveldb {
 
-const int kNumNonTableCacheFiles = 10;
-
 // Information kept for every waiting writer
 struct DBImpl::Writer {
   Status status;
@@ -62,6 +74,9 @@ struct DBImpl::CompactionState {
     uint64_t number;
     uint64_t file_size;
     InternalKey smallest, largest;
+    uint64_t exp_write_low, exp_write_high, exp_explicit_high;
+
+    Output() : number(0), file_size(0), exp_write_low(ULLONG_MAX), exp_write_high(0), exp_explicit_high(0) {}
   };
   std::vector<Output> outputs;
 
@@ -70,6 +85,7 @@ struct DBImpl::CompactionState {
   TableBuilder* builder;
 
   uint64_t total_bytes;
+  uint64_t num_entries;
 
   Output* current_output() { return &outputs[outputs.size()-1]; }
 
@@ -77,86 +93,150 @@ struct DBImpl::CompactionState {
       : compaction(c),
         outfile(NULL),
         builder(NULL),
-        total_bytes(0) {
+        total_bytes(0),
+        num_entries(0) {
   }
 };
 
+Value::~Value() {}
+
+class StringValue : public Value {
+ public:
+  explicit StringValue(std::string& val) : value_(val) {}
+  ~StringValue() {}
+
+  StringValue& assign(const char* data, size_t size) {
+    value_.assign(data, size);
+    return *this;
+  }
+
+ private:
+  std::string& value_;
+};
+
 // Fix user-supplied options to be reasonable
 template <class T,class V>
 static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
   if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
   if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
 }
+
 Options SanitizeOptions(const std::string& dbname,
                         const InternalKeyComparator* icmp,
                         const InternalFilterPolicy* ipolicy,
-                        const Options& src) {
+                        const Options& src,
+                        Cache * block_cache) {
+  std::string tiered_dbname;
   Options result = src;
   result.comparator = icmp;
   result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
-  ClipToRange(&result.max_open_files,    64 + kNumNonTableCacheFiles, 50000);
-  ClipToRange(&result.write_buffer_size, 64<<10,                      1<<30);
-  ClipToRange(&result.max_file_size,     1<<20,                       1<<30);
-  ClipToRange(&result.block_size,        1<<10,                       4<<20);
+  ClipToRange(&result.max_open_files,            20,     50000);
+  ClipToRange(&result.write_buffer_size,         64<<10, 1<<30);
+  ClipToRange(&result.block_size,                1<<10,  4<<20);
+
+  // alternate means to change gMapSize ... more generic
+  if (0!=src.mmap_size)
+      gMapSize=src.mmap_size;
+
+  // reduce buffer sizes if limited_developer_mem is true
+  if (src.limited_developer_mem)
+  {
+      if (0==src.mmap_size)
+          gMapSize=2*1024*1024L;
+      if (gMapSize < result.write_buffer_size) // let unit tests be smaller
+          result.write_buffer_size=gMapSize;
+  }   // if
+
+  // Validate tiered storage options
+  tiered_dbname=MakeTieredDbname(dbname, result);
+
   if (result.info_log == NULL) {
     // Open a log file in the same directory as the db
-    src.env->CreateDir(dbname);  // In case it does not exist
-    src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
-    Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log);
+    src.env->CreateDir(tiered_dbname);  // In case it does not exist
+    src.env->RenameFile(InfoLogFileName(tiered_dbname), OldInfoLogFileName(tiered_dbname));
+    Status s = src.env->NewLogger(InfoLogFileName(tiered_dbname), &result.info_log);
     if (!s.ok()) {
       // No place suitable for logging
       result.info_log = NULL;
     }
   }
+
   if (result.block_cache == NULL) {
-    result.block_cache = NewLRUCache(8 << 20);
+      result.block_cache = block_cache;
   }
+
+  // remove anything expiry if this is an internal database
+  if (result.is_internal_db)
+      result.expiry_module.reset();
+  else if (NULL!=result.expiry_module.get())
+      result.expiry_module.get()->NoteUserExpirySettings();
+
   return result;
 }
 
-DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
-    : env_(raw_options.env),
-      internal_comparator_(raw_options.comparator),
-      internal_filter_policy_(raw_options.filter_policy),
-      options_(SanitizeOptions(dbname, &internal_comparator_,
-                               &internal_filter_policy_, raw_options)),
-      owns_info_log_(options_.info_log != raw_options.info_log),
-      owns_cache_(options_.block_cache != raw_options.block_cache),
-      dbname_(dbname),
+DBImpl::DBImpl(const Options& options, const std::string& dbname)
+    : double_cache(options),
+      env_(options.env),
+      internal_comparator_(options.comparator),
+      internal_filter_policy_(options.filter_policy),
+      options_(SanitizeOptions(
+          dbname, &internal_comparator_, &internal_filter_policy_,
+          options, block_cache())),
+      owns_info_log_(options_.info_log != options.info_log),
+      owns_cache_(options_.block_cache != options.block_cache),
+      dbname_(options_.tiered_fast_prefix),
       db_lock_(NULL),
       shutting_down_(NULL),
       bg_cv_(&mutex_),
-      mem_(NULL),
+      mem_(new MemTable(internal_comparator_)),
       imm_(NULL),
       logfile_(NULL),
       logfile_number_(0),
       log_(NULL),
-      seed_(0),
       tmp_batch_(new WriteBatch),
-      bg_compaction_scheduled_(false),
-      manual_compaction_(NULL) {
+      manual_compaction_(NULL),
+      throttle_end(0),
+      running_compactions_(0),
+      block_size_changed_(0), last_low_mem_(0),
+      hotbackup_pending_(false)
+{
+  current_block_size_=options_.block_size;
+
+  mem_->Ref();
   has_imm_.Release_Store(NULL);
 
-  // Reserve ten files or so for other uses and give the rest to TableCache.
-  const int table_cache_size = options_.max_open_files - kNumNonTableCacheFiles;
-  table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
+  table_cache_ = new TableCache(dbname_, &options_, file_cache(), double_cache);
 
   versions_ = new VersionSet(dbname_, &options_, table_cache_,
                              &internal_comparator_);
+
+  // switch global for everyone ... tacky implementation for now
+  gFadviseWillNeed=options_.fadvise_willneed;
+
+  // CAUTION: all object initialization must be completed
+  //          before the AddDB and SetTotalMemory calls.
+  DBList()->AddDB(this, options_.is_internal_db);
+  gFlexCache.SetTotalMemory(options_.total_leveldb_mem);
+
+  options_.Dump(options_.info_log);
+  Log(options_.info_log,"               File cache size: %zd", double_cache.GetCapacity(true));
+  Log(options_.info_log,"              Block cache size: %zd", double_cache.GetCapacity(false));
 }
 
 DBImpl::~DBImpl() {
+  DBList()->ReleaseDB(this, options_.is_internal_db);
+
   // Wait for background work to finish
   mutex_.Lock();
   shutting_down_.Release_Store(this);  // Any non-NULL value is ok
-  while (bg_compaction_scheduled_) {
+  while (IsCompactionScheduled()) {
     bg_cv_.Wait();
   }
   mutex_.Unlock();
 
-  if (db_lock_ != NULL) {
-    env_->UnlockFile(db_lock_);
-  }
+  // make sure flex cache knows this db is gone
+  //  (must follow ReleaseDB() call ... see above)
+  gFlexCache.RecalculateAllocations();
 
   delete versions_;
   if (mem_ != NULL) mem_->Unref();
@@ -164,13 +244,17 @@ DBImpl::~DBImpl() {
   delete tmp_batch_;
   delete log_;
   delete logfile_;
+
+  if (options_.cache_object_warming)
+      table_cache_->SaveOpenFileList();
+
   delete table_cache_;
 
   if (owns_info_log_) {
     delete options_.info_log;
   }
-  if (owns_cache_) {
-    delete options_.block_cache;
+  if (db_lock_ != NULL) {
+    env_->UnlockFile(db_lock_);
   }
 }
 
@@ -183,14 +267,14 @@ Status DBImpl::NewDB() {
 
   const std::string manifest = DescriptorFileName(dbname_, 1);
   WritableFile* file;
-  Status s = env_->NewWritableFile(manifest, &file);
+  Status s = env_->NewWritableFile(manifest, &file, 4*1024L);
   if (!s.ok()) {
     return s;
   }
   {
     log::Writer log(file);
     std::string record;
-    new_db.EncodeTo(&record);
+    new_db.EncodeTo(&record, options_.ExpiryActivated());
     s = log.AddRecord(record);
     if (s.ok()) {
       s = file->Close();
@@ -203,6 +287,7 @@ Status DBImpl::NewDB() {
   } else {
     env_->DeleteFile(manifest);
   }
+
   return s;
 }
 
@@ -215,69 +300,120 @@ void DBImpl::MaybeIgnoreError(Status* s) const {
   }
 }
 
-void DBImpl::DeleteObsoleteFiles() {
-  if (!bg_error_.ok()) {
-    // After a background error, we don't know whether a new version may
-    // or may not have been committed, so we cannot safely garbage collect.
-    return;
-  }
+void DBImpl::DeleteObsoleteFiles()
+{
+  // Only run this routine when down to one
+  //  simultaneous compaction
+  if (RunningCompactionCount()<2)
+  {
+      // each caller has mutex, we need to release it
+      //  since this disk activity can take a while
+      mutex_.AssertHeld();
 
-  // Make a set of all of the live files
-  std::set<uint64_t> live = pending_outputs_;
-  versions_->AddLiveFiles(&live);
+      // Make a set of all of the live files
+      std::set<uint64_t> live = pending_outputs_;
+      versions_->AddLiveFiles(&live);
 
-  std::vector<std::string> filenames;
-  env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
-  uint64_t number;
-  FileType type;
-  for (size_t i = 0; i < filenames.size(); i++) {
-    if (ParseFileName(filenames[i], &number, &type)) {
-      bool keep = true;
-      switch (type) {
-        case kLogFile:
-          keep = ((number >= versions_->LogNumber()) ||
-                  (number == versions_->PrevLogNumber()));
-          break;
-        case kDescriptorFile:
-          // Keep my manifest file, and any newer incarnations'
-          // (in case there is a race that allows other incarnations)
-          keep = (number >= versions_->ManifestFileNumber());
-          break;
-        case kTableFile:
-          keep = (live.find(number) != live.end());
-          break;
-        case kTempFile:
-          // Any temp files that are currently being written to must
-          // be recorded in pending_outputs_, which is inserted into "live"
-          keep = (live.find(number) != live.end());
-          break;
-        case kCurrentFile:
-        case kDBLockFile:
-        case kInfoLogFile:
-          keep = true;
-          break;
-      }
+      // prune the database root directory
+      std::vector<std::string> filenames;
+      env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
+      for (size_t i = 0; i < filenames.size(); i++) {
+          KeepOrDelete(filenames[i], -1, live);
+      }   // for
 
-      if (!keep) {
-        if (type == kTableFile) {
-          table_cache_->Evict(number);
-        }
-        Log(options_.info_log, "Delete type=%d #%lld\n",
-            int(type),
-            static_cast<unsigned long long>(number));
-        env_->DeleteFile(dbname_ + "/" + filenames[i]);
-      }
-    }
-  }
+      // prune the table file directories
+      for (int level=0; level<config::kNumLevels; ++level)
+      {
+          std::string dirname;
+
+          filenames.clear();
+          dirname=MakeDirName2(options_, level, "sst");
+          env_->GetChildren(dirname, &filenames); // Ignoring errors on purpose
+          for (size_t i = 0; i < filenames.size(); i++) {
+              KeepOrDelete(filenames[i], level, live);
+          }   // for
+      }   // for
+  }   // if
 }
 
-Status DBImpl::Recover(VersionEdit* edit, bool *save_manifest) {
+void
+DBImpl::KeepOrDelete(
+    const std::string & Filename,
+    int Level,
+    const std::set<uint64_t> & Live)
+{
+  uint64_t number;
+  FileType type;
+  bool keep = true;
+
+  if (ParseFileName(Filename, &number, &type))
+  {
+      switch (type)
+      {
+          case kLogFile:
+              keep = ((number >= versions_->LogNumber()) ||
+                      (number == versions_->PrevLogNumber()));
+              break;
+
+          case kDescriptorFile:
+              // Keep my manifest file, and any newer incarnations'
+              // (in case there is a race that allows other incarnations)
+              keep = (number >= versions_->ManifestFileNumber());
+              break;
+
+          case kTableFile:
+              keep = (Live.find(number) != Live.end());
+              break;
+
+          case kTempFile:
+              // Any temp files that are currently being written to must
+              // be recorded in pending_outputs_, which is inserted into "Live"
+              keep = (Live.find(number) != Live.end());
+          break;
+
+          case kCurrentFile:
+          case kDBLockFile:
+          case kInfoLogFile:
+          case kCacheWarming:
+              keep = true;
+              break;
+      }   // switch
+
+      if (!keep)
+      {
+          if (type == kTableFile) {
+              // temporary hard coding of extra overlapped
+              //  levels
+              table_cache_->Evict(number, (Level<config::kNumOverlapLevels));
+          }
+          Log(options_.info_log, "Delete type=%d #%lld\n",
+              int(type),
+              static_cast<unsigned long long>(number));
+
+          if (-1!=Level)
+          {
+              std::string file;
+
+              file=TableFileName(options_, number, Level);
+              env_->DeleteFile(file);
+          }   // if
+          else
+          {
+              env_->DeleteFile(dbname_ + "/" + Filename);
+          }   // else
+      }   // if
+  }   // if
+} // DBImpl::KeepOrDelete
+
+
+Status DBImpl::Recover(VersionEdit* edit) {
   mutex_.AssertHeld();
 
   // Ignore error from CreateDir since the creation of the DB is
   // committed only when the descriptor is created, and this directory
   // may already exist from a previous failed creation attempt.
-  env_->CreateDir(dbname_);
+  env_->CreateDir(options_.tiered_fast_prefix);
+  env_->CreateDir(options_.tiered_slow_prefix);
   assert(db_lock_ == NULL);
   Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
   if (!s.ok()) {
@@ -301,69 +437,155 @@ Status DBImpl::Recover(VersionEdit* edit, bool *save_manifest) {
     }
   }
 
-  s = versions_->Recover(save_manifest);
-  if (!s.ok()) {
-    return s;
-  }
-  SequenceNumber max_sequence(0);
+  // read manifest
+  s = versions_->Recover();
 
-  // Recover from all newer log files than the ones named in the
-  // descriptor (new log files may have been added by the previous
-  // incarnation without registering them in the descriptor).
-  //
-  // Note that PrevLogNumber() is no longer used, but we pay
-  // attention to it in case we are recovering a database
-  // produced by an older version of leveldb.
-  const uint64_t min_log = versions_->LogNumber();
-  const uint64_t prev_log = versions_->PrevLogNumber();
-  std::vector<std::string> filenames;
-  s = env_->GetChildren(dbname_, &filenames);
-  if (!s.ok()) {
-    return s;
-  }
-  std::set<uint64_t> expected;
-  versions_->AddLiveFiles(&expected);
-  uint64_t number;
-  FileType type;
-  std::vector<uint64_t> logs;
-  for (size_t i = 0; i < filenames.size(); i++) {
-    if (ParseFileName(filenames[i], &number, &type)) {
-      expected.erase(number);
-      if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
-        logs.push_back(number);
-    }
-  }
-  if (!expected.empty()) {
-    char buf[50];
-    snprintf(buf, sizeof(buf), "%d missing files; e.g.",
-             static_cast<int>(expected.size()));
-    return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
-  }
+  // Verify Riak 1.3 directory structure created and ready
+  if (s.ok() && !TestForLevelDirectories(env_, options_, versions_->current()))
+  {
+      int level;
+      std::string old_name, new_name;
 
-  // Recover in the order in which the logs were generated
-  std::sort(logs.begin(), logs.end());
-  for (size_t i = 0; i < logs.size(); i++) {
-    s = RecoverLogFile(logs[i], (i == logs.size() - 1), save_manifest, edit,
-                       &max_sequence);
+      if (options_.create_if_missing)
+      {
+          // move files from old heirarchy to new
+          s=MakeLevelDirectories(env_, options_);
+          if (s.ok())
+          {
+              for (level=0; level<config::kNumLevels && s.ok(); ++level)
+              {
+                  const std::vector<FileMetaData*> & level_files(versions_->current()->GetFileList(level));
+                  std::vector<FileMetaData*>::const_iterator it;
+
+                  for (it=level_files.begin(); level_files.end()!=it && s.ok(); ++it)
+                  {
+                      new_name=TableFileName(options_, (*it)->number, level);
+
+                      // test for partial completion
+                      if (!env_->FileExists(new_name.c_str()))
+                      {
+                          old_name=TableFileName(options_, (*it)->number, -2);
+                          s=env_->RenameFile(old_name, new_name);
+                      }   // if
+                  }   // for
+              }   // for
+          }   // if
+          else
+              return s;
+      }   // if
+      else
+      {
+          return Status::InvalidArgument(
+              dbname_, "level directories do not exist (create_if_missing is false)");
+      }   // else
+  }   // if
+
+
+  if (s.ok()) {
+    SequenceNumber max_sequence(0);
+
+    // Recover from all newer log files than the ones named in the
+    // descriptor (new log files may have been added by the previous
+    // incarnation without registering them in the descriptor).
+    //
+    // Note that PrevLogNumber() is no longer used, but we pay
+    // attention to it in case we are recovering a database
+    // produced by an older version of leveldb.
+    const uint64_t min_log = versions_->LogNumber();
+    const uint64_t prev_log = versions_->PrevLogNumber();
+    std::vector<std::string> filenames;
+    s = env_->GetChildren(dbname_, &filenames);
     if (!s.ok()) {
       return s;
     }
+    uint64_t number;
+    FileType type;
+    std::vector<uint64_t> logs;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type)
+          && type == kLogFile
+          && ((number >= min_log) || (number == prev_log))) {
+        logs.push_back(number);
+      }
+    }
 
-    // The previous incarnation may not have written any MANIFEST
-    // records after allocating this log number.  So we manually
-    // update the file number allocation counter in VersionSet.
-    versions_->MarkFileNumberUsed(logs[i]);
+    // Recover in the order in which the logs were generated
+    std::sort(logs.begin(), logs.end());
+    for (size_t i = 0; i < logs.size() && s.ok(); i++) {
+      s = RecoverLogFile(logs[i], edit, &max_sequence);
+
+      // The previous incarnation may not have written any MANIFEST
+      // records after allocating this log number.  So we manually
+      // update the file number allocation counter in VersionSet.
+      versions_->MarkFileNumberUsed(logs[i]);
+    }
+
+    if (s.ok()) {
+      if (versions_->LastSequence() < max_sequence) {
+        versions_->SetLastSequence(max_sequence);
+      }
+    }
   }
 
-  if (versions_->LastSequence() < max_sequence) {
-    versions_->SetLastSequence(max_sequence);
-  }
-
-  return Status::OK();
+  return s;
 }
 
-Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
-                              bool* save_manifest, VersionEdit* edit,
+
+void DBImpl::CheckCompactionState()
+{
+    mutex_.AssertHeld();
+    bool log_flag, need_compaction;
+
+    // Verify Riak 1.4 level sizing, run compactions to fix as necessary
+    //  (also recompacts hard repair of all files to level 0)
+
+    log_flag=false;
+    need_compaction=false;
+
+    // loop on pending background compactions
+    //  reminder: mutex_ is held
+    do
+    {
+        int level;
+
+        // wait out executing compaction (Wait gives mutex to compactions)
+        if (IsCompactionScheduled())
+            bg_cv_.Wait();
+
+        for (level=0, need_compaction=false;
+             level<config::kNumLevels && !need_compaction;
+             ++level)
+        {
+            if (versions_->IsLevelOverlapped(level)
+                && config::kL0_SlowdownWritesTrigger<=versions_->NumLevelFiles(level))
+            {
+                need_compaction=true;
+                MaybeScheduleCompaction();
+                if (!log_flag)
+                {
+                    log_flag=true;
+                    Log(options_.info_log, "Cleanup compactions started ... DB::Open paused");
+                }   // if
+            }   //if
+        }   // for
+
+    } while(IsCompactionScheduled() && need_compaction);
+
+    if (log_flag)
+        Log(options_.info_log, "Cleanup compactions completed ... DB::Open continuing");
+
+    // prior code only called this function instead of CheckCompactionState
+    //  (duplicates original Google functionality)
+    else
+        MaybeScheduleCompaction();
+
+    return;
+
+}  // DBImpl::CheckCompactionState()
+
+
+Status DBImpl::RecoverLogFile(uint64_t log_number,
+                              VersionEdit* edit,
                               SequenceNumber* max_sequence) {
   struct LogReporter : public log::Reader::Reporter {
     Env* env;
@@ -395,7 +617,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
   reporter.info_log = options_.info_log;
   reporter.fname = fname.c_str();
   reporter.status = (options_.paranoid_checks ? &status : NULL);
-  // We intentionally make log::Reader do checksumming even if
+  // We intentially make log::Reader do checksumming even if
   // paranoid_checks==false so that corruptions cause entire commits
   // to be skipped instead of propagating bad information (like overly
   // large sequence numbers).
@@ -408,13 +630,12 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
   std::string scratch;
   Slice record;
   WriteBatch batch;
-  int compactions = 0;
   MemTable* mem = NULL;
   while (reader.ReadRecord(&record, &scratch) &&
          status.ok()) {
     if (record.size() < 12) {
       reporter.Corruption(
-          record.size(), Status::Corruption("log record too small", fname));
+          record.size(), Status::Corruption("log record too small"));
       continue;
     }
     WriteBatchInternal::SetContents(&batch, record);
@@ -423,7 +644,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
       mem = new MemTable(internal_comparator_);
       mem->Ref();
     }
-    status = WriteBatchInternal::InsertInto(&batch, mem);
+    status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
     MaybeIgnoreError(&status);
     if (!status.ok()) {
       break;
@@ -436,77 +657,68 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
     }
 
     if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
-      compactions++;
-      *save_manifest = true;
       status = WriteLevel0Table(mem, edit, NULL);
-      mem->Unref();
-      mem = NULL;
       if (!status.ok()) {
         // Reflect errors immediately so that conditions like full
         // file-systems cause the DB::Open() to fail.
         break;
       }
+      mem->Unref();
+      mem = NULL;
     }
   }
 
+  if (status.ok() && mem != NULL) {
+    status = WriteLevel0Table(mem, edit, NULL);
+    // Reflect errors immediately so that conditions like full
+    // file-systems cause the DB::Open() to fail.
+  }
+
+  if (mem != NULL) mem->Unref();
   delete file;
-
-  // See if we should keep reusing the last log file.
-  if (status.ok() && options_.reuse_logs && last_log && compactions == 0) {
-    assert(logfile_ == NULL);
-    assert(log_ == NULL);
-    assert(mem_ == NULL);
-    uint64_t lfile_size;
-    if (env_->GetFileSize(fname, &lfile_size).ok() &&
-        env_->NewAppendableFile(fname, &logfile_).ok()) {
-      Log(options_.info_log, "Reusing old log %s \n", fname.c_str());
-      log_ = new log::Writer(logfile_, lfile_size);
-      logfile_number_ = log_number;
-      if (mem != NULL) {
-        mem_ = mem;
-        mem = NULL;
-      } else {
-        // mem can be NULL if lognum exists but was empty.
-        mem_ = new MemTable(internal_comparator_);
-        mem_->Ref();
-      }
-    }
-  }
-
-  if (mem != NULL) {
-    // mem did not get reused; compact it.
-    if (status.ok()) {
-      *save_manifest = true;
-      status = WriteLevel0Table(mem, edit, NULL);
-    }
-    mem->Unref();
-  }
-
   return status;
 }
 
-Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
+Status DBImpl::WriteLevel0Table(volatile MemTable* mem, VersionEdit* edit,
                                 Version* base) {
   mutex_.AssertHeld();
   const uint64_t start_micros = env_->NowMicros();
   FileMetaData meta;
   meta.number = versions_->NewFileNumber();
+  meta.level = 0;
   pending_outputs_.insert(meta.number);
-  Iterator* iter = mem->NewIterator();
-  Log(options_.info_log, "Level-0 table #%llu: started",
-      (unsigned long long) meta.number);
+  Iterator* iter = ((MemTable *)mem)->NewIterator();
+  SequenceNumber smallest_snapshot;
+
+  if (snapshots_.empty()) {
+    smallest_snapshot = versions_->LastSequence();
+  } else {
+    smallest_snapshot = snapshots_.oldest()->number_;
+  }
 
   Status s;
   {
+    Options local_options;
+
     mutex_.Unlock();
-    s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
+    Log(options_.info_log, "Level-0 table #%llu: started",
+        (unsigned long long) meta.number);
+
+    // want the data slammed to disk as fast as possible,
+    //  no compression for level 0.
+    local_options=options_;
+    // matthewv Nov 2, 2016 local_options.compression=kNoCompression;
+    local_options.block_size=current_block_size_;
+    s = BuildTable(dbname_, env_, local_options, user_comparator(), table_cache_, iter, &meta, smallest_snapshot);
+
+    Log(options_.info_log, "Level-0 table #%llu: %llu bytes, %llu keys %s",
+        (unsigned long long) meta.number,
+        (unsigned long long) meta.file_size,
+        (unsigned long long) meta.num_entries,
+      s.ToString().c_str());
     mutex_.Lock();
   }
 
-  Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
-      (unsigned long long) meta.number,
-      (unsigned long long) meta.file_size,
-      s.ToString().c_str());
   delete iter;
   pending_outputs_.erase(meta.number);
 
@@ -518,20 +730,75 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
     const Slice min_user_key = meta.smallest.user_key();
     const Slice max_user_key = meta.largest.user_key();
     if (base != NULL) {
-      level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
+        int level_limit;
+        if (0!=options_.tiered_slow_level && (options_.tiered_slow_level-1)<static_cast<unsigned>(config::kMaxMemCompactLevel))
+            level_limit=options_.tiered_slow_level-1;
+        else
+            level_limit=config::kMaxMemCompactLevel;
+
+        // remember, mutex is held so safe to push file into a non-compacting level
+        level = base->PickLevelForMemTableOutput(min_user_key, max_user_key, level_limit);
+        if (versions_->IsCompactionSubmitted(level) || !versions_->NeighborCompactionsQuiet(level))
+            level=0;
+
+        if (0!=level)
+        {
+            Status move_s;
+            std::string old_name, new_name;
+
+            old_name=TableFileName(options_, meta.number, 0);
+            new_name=TableFileName(options_, meta.number, level);
+            move_s=env_->RenameFile(old_name, new_name);
+
+            if (move_s.ok())
+            {
+                // builder already added file to table_cache with 2 references and
+                //  marked as level 0 (used by cache warming) ... going to remove from cache
+                //  and add again correctly
+                table_cache_->Evict(meta.number, true);
+                meta.level=level;
+
+                // sadly, we must hold the mutex during this file open
+                //  since operating in non-overlapped level
+                Iterator* it=table_cache_->NewIterator(ReadOptions(),
+                                                       meta.number,
+                                                       meta.file_size,
+                                                       meta.level);
+                delete it;
+
+                // argh!  logging while holding mutex ... cannot release
+                Log(options_.info_log, "Level-0 table #%llu:  moved to level %d",
+                    (unsigned long long) meta.number,
+                    level);
+            }   // if
+            else
+            {
+                level=0;
+            }   // else
+        }   // if
     }
-    edit->AddFile(level, meta.number, meta.file_size,
-                  meta.smallest, meta.largest);
+
+    if (s.ok())
+        edit->AddFile2(level, meta.number, meta.file_size,
+                       meta.smallest, meta.largest,
+                       meta.exp_write_low, meta.exp_write_high, meta.exp_explicit_high);
   }
 
   CompactionStats stats;
   stats.micros = env_->NowMicros() - start_micros;
   stats.bytes_written = meta.file_size;
   stats_[level].Add(stats);
+
+  // Riak adds extra reference to file, must remove it
+  //  in this race condition upon close
+  if (s.ok() && shutting_down_.Acquire_Load()) {
+      table_cache_->Evict(meta.number, versions_->IsLevelOverlapped(level));
+  }
+
   return s;
 }
 
-void DBImpl::CompactMemTable() {
+Status DBImpl::CompactMemTable() {
   mutex_.AssertHeld();
   assert(imm_ != NULL);
 
@@ -559,9 +826,9 @@ void DBImpl::CompactMemTable() {
     imm_ = NULL;
     has_imm_.Release_Store(NULL);
     DeleteObsoleteFiles();
-  } else {
-    RecordBackgroundError(s);
   }
+
+  return s;
 }
 
 void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
@@ -575,7 +842,7 @@ void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
       }
     }
   }
-  TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
+  CompactMemTableSynchronous(); // TODO(sanjay): Skip if memtable does not overlap
   for (int level = 0; level < max_level_with_files; level++) {
     TEST_CompactRange(level, begin, end);
   }
@@ -593,32 +860,40 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
   if (begin == NULL) {
     manual.begin = NULL;
   } else {
-    begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
+    begin_storage = InternalKey(*begin, 0, kMaxSequenceNumber, kValueTypeForSeek);
     manual.begin = &begin_storage;
   }
   if (end == NULL) {
     manual.end = NULL;
   } else {
-    end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
+    end_storage = InternalKey(*end, 0, 0, static_cast<ValueType>(0));
     manual.end = &end_storage;
   }
 
   MutexLock l(&mutex_);
-  while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) {
-    if (manual_compaction_ == NULL) {  // Idle
-      manual_compaction_ = &manual;
-      MaybeScheduleCompaction();
-    } else {  // Running either my compaction or another compaction.
+  while (!manual.done) {
+    while (manual_compaction_ != NULL || IsCompactionScheduled()) {
+      bg_cv_.Wait();
+    }
+    manual_compaction_ = &manual;
+    MaybeScheduleCompaction();
+    while (manual_compaction_ == &manual) {
       bg_cv_.Wait();
     }
   }
-  if (manual_compaction_ == &manual) {
-    // Cancel my manual compaction since we aborted early for some reason.
-    manual_compaction_ = NULL;
-  }
 }
 
+/**
+ * This "test" routine was used in one production location,
+ *  then two with addition of hot backup.  Inappropriate for
+ *  TEST_ prefix if used in production.
+ */
 Status DBImpl::TEST_CompactMemTable() {
+    return(CompactMemTableSynchronous());
+}   // TEST_CompactMemTable
+
+
+Status DBImpl::CompactMemTableSynchronous() {
   // NULL batch means just wait for earlier writes to be done
   Status s = Write(WriteOptions(), NULL);
   if (s.ok()) {
@@ -634,68 +909,168 @@ Status DBImpl::TEST_CompactMemTable() {
   return s;
 }
 
-void DBImpl::RecordBackgroundError(const Status& s) {
-  mutex_.AssertHeld();
-  if (bg_error_.ok()) {
-    bg_error_ = s;
-    bg_cv_.SignalAll();
-  }
-}
-
 void DBImpl::MaybeScheduleCompaction() {
   mutex_.AssertHeld();
-  if (bg_compaction_scheduled_) {
-    // Already scheduled
-  } else if (shutting_down_.Acquire_Load()) {
-    // DB is being deleted; no more background compactions
-  } else if (!bg_error_.ok()) {
-    // Already got an error; no more changes
-  } else if (imm_ == NULL &&
-             manual_compaction_ == NULL &&
-             !versions_->NeedsCompaction()) {
-    // No work to be done
-  } else {
-    bg_compaction_scheduled_ = true;
-    env_->Schedule(&DBImpl::BGWork, this);
-  }
+
+  if (!shutting_down_.Acquire_Load())
+  {
+      if (NULL==manual_compaction_)
+      {
+          // ask versions_ to schedule work to hot threads
+          versions_->PickCompaction(this);
+      }   // if
+
+      else if (!versions_->IsCompactionSubmitted(manual_compaction_->level))
+      {
+          // support manual compaction under hot threads
+          versions_->SetCompactionSubmitted(manual_compaction_->level);
+          ThreadTask * task=new CompactionTask(this, NULL);
+          gCompactionThreads->Submit(task, true);
+      }   // else if
+  }   // if
 }
 
-void DBImpl::BGWork(void* db) {
-  reinterpret_cast<DBImpl*>(db)->BackgroundCall();
-}
 
-void DBImpl::BackgroundCall() {
+void DBImpl::BackgroundCall2(
+    Compaction * Compact) {
   MutexLock l(&mutex_);
-  assert(bg_compaction_scheduled_);
-  if (shutting_down_.Acquire_Load()) {
-    // No more background work when shutting down.
-  } else if (!bg_error_.ok()) {
-    // No more background work after a background error.
-  } else {
-    BackgroundCompaction();
-  }
+  int level, type;
+  assert(IsCompactionScheduled());
 
-  bg_compaction_scheduled_ = false;
+  type=kNormalCompaction;
+  ++running_compactions_;
+  if (NULL!=Compact)
+  {
+      level=Compact->level();
+      type=Compact->GetCompactionType();
+  }   // if
+  else if (NULL!=manual_compaction_)
+      level=manual_compaction_->level;
+  else
+      level=0;
+
+  if (0==level)
+      gPerfCounters->Inc(ePerfBGCompactLevel0);
+  else
+      gPerfCounters->Inc(ePerfBGNormal);
+
+  versions_->SetCompactionRunning(level);
+
+  if (!shutting_down_.Acquire_Load()) {
+    Status s;
+
+    switch(type)
+    {
+        case kNormalCompaction:
+            s = BackgroundCompaction(Compact);
+            break;
+
+        case kExpiryFileCompaction:
+            s = BackgroundExpiry(Compact);
+            break;
+
+        default:
+            assert(0);
+            break;
+    }   // switch
+
+    if (!s.ok() && !shutting_down_.Acquire_Load()) {
+      // Wait a little bit before retrying background compaction in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed compactions for the duration of
+      // the problem.
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      Log(options_.info_log, "Waiting after background compaction error: %s",
+          s.ToString().c_str());
+      env_->SleepForMicroseconds(1000000);
+      mutex_.Lock();
+    }
+  }
+  else
+  {
+    delete Compact;
+  }   // else
+
+  --running_compactions_;
+  versions_->SetCompactionDone(level, env_->NowMicros());
 
   // Previous compaction may have produced too many files in a level,
   // so reschedule another compaction if needed.
-  MaybeScheduleCompaction();
+  if (!options_.is_repair)
+      MaybeScheduleCompaction();
+  bg_cv_.SignalAll();
+
+}
+
+
+void
+DBImpl::BackgroundImmCompactCall() {
+  MutexLock l(&mutex_);
+  assert(NULL != imm_);
+  Status s;
+
+  ++running_compactions_;
+  gPerfCounters->Inc(ePerfBGCompactImm);
+
+  if (!shutting_down_.Acquire_Load()) {
+    s = CompactMemTable();
+    if (!s.ok() && !shutting_down_.Acquire_Load()) {
+      // Wait a little bit before retrying background compaction in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed compactions for the duration of
+      // the problem.
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      Log(options_.info_log, "Waiting after background imm compaction error: %s",
+          s.ToString().c_str());
+      env_->SleepForMicroseconds(1000000);
+      mutex_.Lock();
+    }
+  }
+
+  --running_compactions_;
+
+  // Previous compaction may have produced too many files in a level,
+  // so reschedule another compaction if needed.
+  if (!options_.is_repair)
+      MaybeScheduleCompaction();
+
+  // shutdown is waiting for this imm_ to clear
+  if (shutting_down_.Acquire_Load()) {
+
+    // must abandon data in memory ... hope recovery log works
+    if (NULL!=imm_)
+      imm_->Unref();
+    imm_ = NULL;
+    has_imm_.Release_Store(NULL);
+  } // if
+
+  // retry imm compaction if failed and not shutting down
+  else if (!s.ok())
+  {
+      ThreadTask * task=new ImmWriteTask(this);
+      gImmThreads->Submit(task, true);
+  }   // else
+
   bg_cv_.SignalAll();
 }
 
-void DBImpl::BackgroundCompaction() {
+
+Status DBImpl::BackgroundCompaction(
+    Compaction * Compact) {
+  Status status;
+  bool do_compact(true);
+
   mutex_.AssertHeld();
 
-  if (imm_ != NULL) {
-    CompactMemTable();
-    return;
-  }
-
-  Compaction* c;
+  Compaction* c(Compact);
   bool is_manual = (manual_compaction_ != NULL);
   InternalKey manual_end;
-  if (is_manual) {
-    ManualCompaction* m = manual_compaction_;
+  if (NULL!=c) {
+      // do nothing in this work block
+  } else  if (is_manual) {
+    ManualCompaction* m = (ManualCompaction *) manual_compaction_;
     c = versions_->CompactRange(m->level, m->begin, m->end);
     m->done = (c == NULL);
     if (c != NULL) {
@@ -708,36 +1083,58 @@ void DBImpl::BackgroundCompaction() {
         (m->end ? m->end->DebugString().c_str() : "(end)"),
         (m->done ? "(end)" : manual_end.DebugString().c_str()));
   } else {
-    c = versions_->PickCompaction();
+      // c = versions_->PickCompaction();
   }
 
-  Status status;
+
   if (c == NULL) {
     // Nothing to do
-  } else if (!is_manual && c->IsTrivialMove()) {
+    do_compact=false;
+  } else if (!is_manual && c->IsTrivialMove()
+             && (c->level()+1)!=(int)options_.tiered_slow_level) {
     // Move file to next level
     assert(c->num_input_files(0) == 1);
+    std::string old_name, new_name;
     FileMetaData* f = c->input(0, 0);
-    c->edit()->DeleteFile(c->level(), f->number);
-    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
-                       f->smallest, f->largest);
-    status = versions_->LogAndApply(c->edit(), &mutex_);
-    if (!status.ok()) {
-      RecordBackgroundError(status);
-    }
-    VersionSet::LevelSummaryStorage tmp;
-    Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
-        static_cast<unsigned long long>(f->number),
-        c->level() + 1,
-        static_cast<unsigned long long>(f->file_size),
-        status.ToString().c_str(),
-        versions_->LevelSummary(&tmp));
-  } else {
+
+    old_name=TableFileName(options_, f->number, c->level());
+    new_name=TableFileName(options_, f->number, c->level() +1);
+    status=env_->RenameFile(old_name, new_name);
+
+    if (status.ok())
+    {
+        gPerfCounters->Inc(ePerfBGMove);
+        do_compact=false;
+        c->edit()->DeleteFile(c->level(), f->number);
+        c->edit()->AddFile2(c->level() + 1, f->number, f->file_size,
+                            f->smallest, f->largest,
+                            f->exp_write_low, f->exp_write_high, f->exp_explicit_high);
+        status = versions_->LogAndApply(c->edit(), &mutex_);
+        DeleteObsoleteFiles();
+
+        // if LogAndApply fails, should file be renamed back to original spot?
+        VersionSet::LevelSummaryStorage tmp;
+        Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
+            static_cast<unsigned long long>(f->number),
+            c->level() + 1,
+            static_cast<unsigned long long>(f->file_size),
+            status.ToString().c_str(),
+            versions_->LevelSummary(&tmp));
+
+        // no time, no keys ... just make the call so that one compaction
+        //  gets posted against potential backlog ... extremely important
+        //  to write throttle logic.
+        SetThrottleWriteRate(0, 0, (0 == c->level()));
+    }  // if
+    else {
+        // retry as compaction instead of move
+        do_compact=true; // redundant but safe
+        gPerfCounters->Inc(ePerfBGMoveFail);
+    }   // else
+  }
+  if (do_compact) {
     CompactionState* compact = new CompactionState(c);
     status = DoCompactionWork(compact);
-    if (!status.ok()) {
-      RecordBackgroundError(status);
-    }
     CleanupCompaction(compact);
     c->ReleaseInputs();
     DeleteObsoleteFiles();
@@ -751,10 +1148,13 @@ void DBImpl::BackgroundCompaction() {
   } else {
     Log(options_.info_log,
         "Compaction error: %s", status.ToString().c_str());
+    if (options_.paranoid_checks && bg_error_.ok()) {
+      bg_error_ = status;
+    }
   }
 
   if (is_manual) {
-    ManualCompaction* m = manual_compaction_;
+    ManualCompaction* m = (ManualCompaction *)manual_compaction_;
     if (!status.ok()) {
       m->done = true;
     }
@@ -766,6 +1166,8 @@ void DBImpl::BackgroundCompaction() {
     }
     manual_compaction_ = NULL;
   }
+
+  return status;
 }
 
 void DBImpl::CleanupCompaction(CompactionState* compact) {
@@ -785,10 +1187,14 @@ void DBImpl::CleanupCompaction(CompactionState* compact) {
   delete compact;
 }
 
-Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
+Status DBImpl::OpenCompactionOutputFile(
+    CompactionState* compact,
+    size_t sample_value_size) {
   assert(compact != NULL);
   assert(compact->builder == NULL);
   uint64_t file_number;
+  bool pagecache_flag;
+
   {
     mutex_.Lock();
     file_number = versions_->NewFileNumber();
@@ -798,18 +1204,230 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
     out.smallest.Clear();
     out.largest.Clear();
     compact->outputs.push_back(out);
+    pagecache_flag=Send2PageCache(compact);
     mutex_.Unlock();
   }
 
   // Make the output file
-  std::string fname = TableFileName(dbname_, file_number);
-  Status s = env_->NewWritableFile(fname, &compact->outfile);
+  std::string fname = TableFileName(options_, file_number, compact->compaction->level()+1);
+  Status s = env_->NewWritableFile(fname, &compact->outfile, gMapSize);
   if (s.ok()) {
-    compact->builder = new TableBuilder(options_, compact->outfile);
-  }
+      Options options;
+      options=options_;
+      options.block_size=current_block_size_;
+
+      // consider larger block size if option enabled (block_size_steps!=0)
+      //  and low on file cache space
+      if (0!=options.block_size_steps)
+      {
+          uint64_t now;
+
+          now=env_->NowMicros();
+
+          if (!double_cache.GetPlentySpace())
+          {
+              // keep track of last time there was lack of space.
+              //  use info in block below to revert block_size
+              last_low_mem_=now;
+
+              // do not make changes often, a multi file compaction
+              //  could raise more than one step (5 min)
+              if (block_size_changed_+(5*60*1000000L) < now)
+              {
+                  size_t old_size=current_block_size_;
+
+                  options.block_size=MaybeRaiseBlockSize(*compact->compaction, sample_value_size);
+
+                  // did size change?
+                  if (options.block_size!=old_size)
+                  {
+                      block_size_changed_=now;
+                  }   // if
+              }   // if
+
+          }   // if
+
+          // has system's memory been ok for a while now
+          else if (last_low_mem_+double_cache.GetFileTimeout()*1000000L < now)
+          {
+              // reset size to original, data could have been deleted and/or old
+              //  files no longer need cache space
+              current_block_size_=options_.block_size;
+          }   // else if
+
+      }   // if
+
+      // force call to CalcInputState to set IsCompressible
+      compact->compaction->CalcInputStats(*table_cache_);
+
+      // do not attempt compression if data known to not compress
+      if (kSnappyCompression==options.compression && !compact->compaction->IsCompressible())
+      {
+          options.compression=kNoCompressionAutomated;
+          Log(options.info_log, "kNoCompressionAutomated");
+      }   // if
+
+
+      // tune fadvise to keep as much of the file data in RAM as
+      //  reasonably possible
+      if (pagecache_flag)
+          compact->outfile->SetMetadataOffset(1);
+      compact->builder = new TableBuilder(options, compact->outfile);
+  }   // if
+
   return s;
 }
 
+
+bool
+DBImpl::Send2PageCache(
+    CompactionState* compact)
+{
+    bool ret_flag;
+
+    mutex_.AssertHeld();
+
+    // tune fadvise to keep all of the lower level file in page cache
+    //  (compaction of unsorted files causes severe cache misses)
+    if (versions_->IsLevelOverlapped(compact->compaction->level()))
+//    if (0==compact->compaction->level())
+    {
+        ret_flag=true;
+    }   // if
+
+    // look at current RAM availability to decide whether or not to keep
+    //  file data in page cache
+    else
+    {
+        size_t avail_block;
+        int64_t lower_levels;
+        int level;
+
+        // current block cache size without PageCache estimation
+        avail_block=double_cache.GetCapacity(false, false);
+
+        lower_levels=0;
+        for (level=0; level<=compact->compaction->level(); ++level)
+            lower_levels+=versions_->NumLevelBytes(level);
+
+        // does the block cache's unadjusted size exceed higher
+        //  volatility file sizes in lower levels?
+        ret_flag=(lower_levels<=(int64_t)avail_block);
+    }   // else
+
+    return(ret_flag);
+
+}   // DbImpl::Send2PageCache
+
+size_t
+DBImpl::MaybeRaiseBlockSize(
+    Compaction & CompactionStuff,
+    size_t SampleValueSize)
+{
+    size_t new_block_size, tot_user_data, tot_index_keys, avg_value_size,
+        avg_key_size, avg_block_size;
+
+    // start with most recent dynamic sizing
+    new_block_size=current_block_size_;
+
+    //
+    // 1. Get estimates for key values.  Zero implies unable to estimate
+    //    (as the formula is tuned, some of the values become unused ... apologies
+    CompactionStuff.CalcInputStats(*table_cache_);
+    tot_user_data=CompactionStuff.TotalUserDataSize();
+    tot_index_keys=CompactionStuff.TotalIndexKeys();
+    avg_value_size=CompactionStuff.AverageValueSize();
+    avg_key_size=CompactionStuff.AverageKeySize();
+    avg_block_size=CompactionStuff.AverageBlockSize();
+
+    // CalcInputStats does not have second source for avg_value_size.
+    //  Use size of next key.
+    if (0==avg_value_size)
+        avg_value_size=SampleValueSize;
+
+    Log(options_.info_log,
+        "Block stats used %zd user data, %zd index keys, %zd avg value, %zd avg key, %zd avg block",
+        tot_user_data, tot_index_keys, avg_value_size, avg_key_size, avg_block_size);
+
+    //
+    // 2. Define boundaries of block size steps.  Calculate
+    //    "next step"
+    //
+    if (0!=tot_user_data && 0!=tot_index_keys && 0!=avg_value_size
+        && 0!=avg_key_size && 0!=avg_block_size)
+    {
+        size_t high_size, low_size, cur_size, increment, file_data_size, keys_per_file;
+
+        // 2a. Highest block size:
+        //      (sqrt()/sqrt() stuff is from first derivative to minimize
+        //       total read size of one block plus file metadata)
+
+        // limited by keys or filesize? (pretend metadata is zero, i love pretend games)
+        file_data_size=versions_->MaxFileSizeForLevel(CompactionStuff.level());
+        keys_per_file=file_data_size / avg_value_size;
+
+        if (300000 < keys_per_file)
+        {
+            keys_per_file = 300000;
+            file_data_size = avg_value_size * keys_per_file;
+        }   // if
+
+        // cast to double inside sqrt() is required for Solaris 13
+        high_size=(size_t)((double)file_data_size / (sqrt((double)file_data_size)/sqrt((double)avg_key_size)));
+
+        // 2b. Lowest block size: largest of given block size or average value size
+        //      because large values are one block
+        if (avg_value_size < options_.block_size)
+            low_size=options_.block_size;
+        else
+            low_size=avg_value_size;
+
+        // 2c. Current block size: compaction can skew numbers in files
+        //     without counters, use current dynamic block_size in that case
+        if (options_.block_size < avg_block_size)
+            cur_size=avg_block_size;
+        else
+            cur_size=current_block_size_;
+
+        // safety check values to eliminate negatives
+        if (low_size <= high_size)
+        {
+            size_t cur_step;
+
+            increment=(high_size - low_size)/options_.block_size_steps;
+
+            // adjust old, too low stuff
+            if (low_size < cur_size)
+                cur_step=(cur_size - low_size)/increment;
+            else
+                cur_step=0;
+
+            // move to next step, but not over the top step
+            if (cur_step < (size_t)options_.block_size_steps)
+                ++cur_step;
+            else
+                cur_step=options_.block_size_steps;
+
+            //
+            // 3. Set new block size to next higher step
+            //
+            new_block_size=low_size + increment * cur_step;
+
+            Log(options_.info_log,
+                "Block size selected %zd block size, %zd cur, %zd low, %zd high, %zd inc, %zd step",
+                new_block_size, cur_size, low_size, high_size, increment, cur_step);
+
+            // This is not thread safe, but not worthy of mutex either
+            if (current_block_size_ < new_block_size)
+                current_block_size_ = new_block_size;
+        }   // if
+    }   // if
+
+    return(new_block_size);
+
+}   // DBImpl::MaybeRaiseBlockSize
+
+
 Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
                                           Iterator* input) {
   assert(compact != NULL);
@@ -830,6 +1448,10 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
   const uint64_t current_bytes = compact->builder->FileSize();
   compact->current_output()->file_size = current_bytes;
   compact->total_bytes += current_bytes;
+  compact->num_entries += compact->builder->NumEntries();
+  compact->current_output()->exp_write_low = compact->builder->GetExpiryWriteLow();
+  compact->current_output()->exp_write_high = compact->builder->GetExpiryWriteHigh();
+  compact->current_output()->exp_explicit_high = compact->builder->GetExpiryExplicitHigh();
   delete compact->builder;
   compact->builder = NULL;
 
@@ -845,16 +1467,25 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
 
   if (s.ok() && current_entries > 0) {
     // Verify that the table is usable
+    Table * table_ptr;
     Iterator* iter = table_cache_->NewIterator(ReadOptions(),
                                                output_number,
-                                               current_bytes);
+                                               current_bytes,
+                                               compact->compaction->level()+1,
+                                               &table_ptr);
     s = iter->status();
+    // Riak specific: bloom filter is no longer read by default,
+    //  force read on highly used overlapped table files
+    if (s.ok() && VersionSet::IsLevelOverlapped(compact->compaction->level()+1))
+        table_ptr->ReadFilter();
+
+    // table_ptr invalidated by this delete
     delete iter;
+
     if (s.ok()) {
       Log(options_.info_log,
-          "Generated table #%llu@%d: %lld keys, %lld bytes",
+          "Generated table #%llu: %lld keys, %lld bytes",
           (unsigned long long) output_number,
-          compact->compaction->level(),
           (unsigned long long) current_entries,
           (unsigned long long) current_bytes);
     }
@@ -865,34 +1496,31 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
 
 Status DBImpl::InstallCompactionResults(CompactionState* compact) {
   mutex_.AssertHeld();
+
+  mutex_.Unlock();
+  // release lock while writing Log entry, could stall
   Log(options_.info_log,  "Compacted %d@%d + %d@%d files => %lld bytes",
       compact->compaction->num_input_files(0),
       compact->compaction->level(),
       compact->compaction->num_input_files(1),
       compact->compaction->level() + 1,
       static_cast<long long>(compact->total_bytes));
+  mutex_.Lock();
 
   // Add compaction outputs
   compact->compaction->AddInputDeletions(compact->compaction->edit());
   const int level = compact->compaction->level();
   for (size_t i = 0; i < compact->outputs.size(); i++) {
     const CompactionState::Output& out = compact->outputs[i];
-    compact->compaction->edit()->AddFile(
+    compact->compaction->edit()->AddFile2(
         level + 1,
-        out.number, out.file_size, out.smallest, out.largest);
+        out.number, out.file_size, out.smallest, out.largest,
+        out.exp_write_low, out.exp_write_high, out.exp_explicit_high);
   }
   return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
 }
 
 Status DBImpl::DoCompactionWork(CompactionState* compact) {
-  const uint64_t start_micros = env_->NowMicros();
-  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
-
-  Log(options_.info_log,  "Compacting %d@%d + %d@%d files",
-      compact->compaction->num_input_files(0),
-      compact->compaction->level(),
-      compact->compaction->num_input_files(1),
-      compact->compaction->level() + 1);
 
   assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
   assert(compact->builder == NULL);
@@ -906,29 +1534,28 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
   // Release mutex while we're actually doing the compaction work
   mutex_.Unlock();
 
+  Log(options_.info_log,  "Compacting %d@%d + %d@%d files",
+      compact->compaction->num_input_files(0),
+      compact->compaction->level(),
+      compact->compaction->num_input_files(1),
+      compact->compaction->level() + 1);
+
+  bool is_level0_compaction=(0 == compact->compaction->level());
+
+  const uint64_t start_micros = env_->NowMicros();
+
   Iterator* input = versions_->MakeInputIterator(compact->compaction);
   input->SeekToFirst();
   Status status;
-  ParsedInternalKey ikey;
-  std::string current_user_key;
-  bool has_current_user_key = false;
-  SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
-  for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
-    // Prioritize immutable compaction work
-    if (has_imm_.NoBarrier_Load() != NULL) {
-      const uint64_t imm_start = env_->NowMicros();
-      mutex_.Lock();
-      if (imm_ != NULL) {
-        CompactMemTable();
-        bg_cv_.SignalAll();  // Wakeup MakeRoomForWrite() if necessary
-      }
-      mutex_.Unlock();
-      imm_micros += (env_->NowMicros() - imm_start);
-    }
 
+  KeyRetirement retire(user_comparator(), compact->smallest_snapshot, &options_, compact->compaction);
+
+  for (; input->Valid() && !shutting_down_.Acquire_Load(); )
+  {
     Slice key = input->key();
-    if (compact->compaction->ShouldStopBefore(key) &&
-        compact->builder != NULL) {
+    if (compact->builder != NULL
+        && compact->compaction->ShouldStopBefore(key, compact->builder->NumEntries())) {
+
       status = FinishCompactionOutputFile(compact, input);
       if (!status.ok()) {
         break;
@@ -936,54 +1563,12 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
     }
 
     // Handle key/value, add to state, etc.
-    bool drop = false;
-    if (!ParseInternalKey(key, &ikey)) {
-      // Do not hide error keys
-      current_user_key.clear();
-      has_current_user_key = false;
-      last_sequence_for_key = kMaxSequenceNumber;
-    } else {
-      if (!has_current_user_key ||
-          user_comparator()->Compare(ikey.user_key,
-                                     Slice(current_user_key)) != 0) {
-        // First occurrence of this user key
-        current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
-        has_current_user_key = true;
-        last_sequence_for_key = kMaxSequenceNumber;
-      }
-
-      if (last_sequence_for_key <= compact->smallest_snapshot) {
-        // Hidden by an newer entry for same user key
-        drop = true;    // (A)
-      } else if (ikey.type == kTypeDeletion &&
-                 ikey.sequence <= compact->smallest_snapshot &&
-                 compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
-        // For this user key:
-        // (1) there is no data in higher levels
-        // (2) data in lower levels will have larger sequence numbers
-        // (3) data in layers that are being compacted here and have
-        //     smaller sequence numbers will be dropped in the next
-        //     few iterations of this loop (by rule (A) above).
-        // Therefore this deletion marker is obsolete and can be dropped.
-        drop = true;
-      }
-
-      last_sequence_for_key = ikey.sequence;
-    }
-#if 0
-    Log(options_.info_log,
-        "  Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
-        "%d smallest_snapshot: %d",
-        ikey.user_key.ToString().c_str(),
-        (int)ikey.sequence, ikey.type, kTypeValue, drop,
-        compact->compaction->IsBaseLevelForKey(ikey.user_key),
-        (int)last_sequence_for_key, (int)compact->smallest_snapshot);
-#endif
+    bool drop = retire(key);
 
     if (!drop) {
       // Open output file if necessary
       if (compact->builder == NULL) {
-        status = OpenCompactionOutputFile(compact);
+        status = OpenCompactionOutputFile(compact, input->value().size() + key.size());
         if (!status.ok()) {
           break;
         }
@@ -1009,6 +1594,17 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
 
   if (status.ok() && shutting_down_.Acquire_Load()) {
     status = Status::IOError("Deleting DB during compaction");
+#if 0 // validating this block is redundant  (eleveldb issue #110)
+    // cleanup Riak modification that adds extra reference
+    //  to overlap levels files.
+    if (compact->compaction->level() < config::kNumOverlapLevels)
+    {
+        for (size_t i = 0; i < compact->outputs.size(); i++) {
+            const CompactionState::Output& out = compact->outputs[i];
+            versions_->GetTableCache()->Evict(out.number, true);
+        }   // for
+    }   // if
+#endif
   }
   if (status.ok() && compact->builder != NULL) {
     status = FinishCompactionOutputFile(compact, input);
@@ -1020,7 +1616,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
   input = NULL;
 
   CompactionStats stats;
-  stats.micros = env_->NowMicros() - start_micros - imm_micros;
+  stats.micros = env_->NowMicros() - start_micros;
   for (int which = 0; which < 2; which++) {
     for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
       stats.bytes_read += compact->compaction->input(which, i)->file_size;
@@ -1030,27 +1626,31 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
     stats.bytes_written += compact->outputs[i].file_size;
   }
 
+  // write log before taking mutex_
+  VersionSet::LevelSummaryStorage tmp;
+  Log(options_.info_log,
+      "compacted to: %s", versions_->LevelSummary(&tmp));
+
   mutex_.Lock();
   stats_[compact->compaction->level() + 1].Add(stats);
 
   if (status.ok()) {
+    if (0!=compact->num_entries)
+        SetThrottleWriteRate((env_->NowMicros() - start_micros),
+                             compact->num_entries, is_level0_compaction);
     status = InstallCompactionResults(compact);
   }
-  if (!status.ok()) {
-    RecordBackgroundError(status);
-  }
-  VersionSet::LevelSummaryStorage tmp;
-  Log(options_.info_log,
-      "compacted to: %s", versions_->LevelSummary(&tmp));
+
   return status;
 }
 
+
 namespace {
 struct IterState {
   port::Mutex* mu;
   Version* version;
   MemTable* mem;
-  MemTable* imm;
+  volatile MemTable* imm;
 };
 
 static void CleanupIteratorState(void* arg1, void* arg2) {
@@ -1065,8 +1665,7 @@ static void CleanupIteratorState(void* arg1, void* arg2) {
 }  // namespace
 
 Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
-                                      SequenceNumber* latest_snapshot,
-                                      uint32_t* seed) {
+                                      SequenceNumber* latest_snapshot) {
   IterState* cleanup = new IterState;
   mutex_.Lock();
   *latest_snapshot = versions_->LastSequence();
@@ -1076,7 +1675,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
   list.push_back(mem_->NewIterator());
   mem_->Ref();
   if (imm_ != NULL) {
-    list.push_back(imm_->NewIterator());
+     list.push_back(((MemTable *)imm_)->NewIterator());
     imm_->Ref();
   }
   versions_->current()->AddIterators(options, &list);
@@ -1090,15 +1689,13 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
   cleanup->version = versions_->current();
   internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL);
 
-  *seed = ++seed_;
   mutex_.Unlock();
   return internal_iter;
 }
 
 Iterator* DBImpl::TEST_NewInternalIterator() {
   SequenceNumber ignored;
-  uint32_t ignored_seed;
-  return NewInternalIterator(ReadOptions(), &ignored, &ignored_seed);
+  return NewInternalIterator(ReadOptions(), &ignored);
 }
 
 int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
@@ -1108,7 +1705,16 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
 
 Status DBImpl::Get(const ReadOptions& options,
                    const Slice& key,
-                   std::string* value) {
+                   std::string* value,
+                   KeyMetaData * meta) {
+  StringValue stringvalue(*value);
+  return DBImpl::Get(options, key, &stringvalue, meta);
+}
+
+Status DBImpl::Get(const ReadOptions& options,
+                   const Slice& key,
+                   Value* value,
+                   KeyMetaData * meta) {
   Status s;
   MutexLock l(&mutex_);
   SequenceNumber snapshot;
@@ -1119,7 +1725,7 @@ Status DBImpl::Get(const ReadOptions& options,
   }
 
   MemTable* mem = mem_;
-  MemTable* imm = imm_;
+  volatile MemTable* imm = imm_;
   Version* current = versions_->current();
   mem->Ref();
   if (imm != NULL) imm->Ref();
@@ -1132,44 +1738,44 @@ Status DBImpl::Get(const ReadOptions& options,
   {
     mutex_.Unlock();
     // First look in the memtable, then in the immutable memtable (if any).
-    LookupKey lkey(key, snapshot);
-    if (mem->Get(lkey, value, &s)) {
+    LookupKey lkey(key, snapshot, meta);
+    if (mem->Get(lkey, value, &s, &options_)) {
       // Done
-    } else if (imm != NULL && imm->Get(lkey, value, &s)) {
+        gPerfCounters->Inc(ePerfGetMem);
+    } else if (imm != NULL && ((MemTable *)imm)->Get(lkey, value, &s, &options_)) {
       // Done
+        gPerfCounters->Inc(ePerfGetImm);
     } else {
       s = current->Get(options, lkey, value, &stats);
       have_stat_update = true;
+      gPerfCounters->Inc(ePerfGetVersion);
     }
     mutex_.Lock();
   }
 
   if (have_stat_update && current->UpdateStats(stats)) {
-    MaybeScheduleCompaction();
+      // no compactions initiated by reads, takes too long
+      // MaybeScheduleCompaction();
   }
   mem->Unref();
   if (imm != NULL) imm->Unref();
   current->Unref();
+
+  gPerfCounters->Inc(ePerfApiGet);
+
   return s;
 }
 
 Iterator* DBImpl::NewIterator(const ReadOptions& options) {
   SequenceNumber latest_snapshot;
-  uint32_t seed;
-  Iterator* iter = NewInternalIterator(options, &latest_snapshot, &seed);
+  Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
+  gPerfCounters->Inc(ePerfIterNew);
   return NewDBIterator(
-      this, user_comparator(), iter,
+      &dbname_, env_, user_comparator(), internal_iter,
       (options.snapshot != NULL
        ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
        : latest_snapshot),
-      seed);
-}
-
-void DBImpl::RecordReadSample(Slice key) {
-  MutexLock l(&mutex_);
-  if (versions_->current()->RecordReadSample(key)) {
-    MaybeScheduleCompaction();
-  }
+      options_.expiry_module.get());
 }
 
 const Snapshot* DBImpl::GetSnapshot() {
@@ -1183,8 +1789,8 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
 }
 
 // Convenience methods
-Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
-  return DB::Put(o, key, val);
+Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val, const KeyMetaData * meta) {
+  return DB::Put(o, key, val, meta);
 }
 
 Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
@@ -1192,22 +1798,27 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
 }
 
 Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
+  Status status;
+  int throttle(0);
+
   Writer w(&mutex_);
   w.batch = my_batch;
   w.sync = options.sync;
   w.done = false;
 
+  {  // place mutex_ within a block
+     //  not changing tabs to ease compare to Google sources
   MutexLock l(&mutex_);
   writers_.push_back(&w);
   while (!w.done && &w != writers_.front()) {
     w.cv.Wait();
   }
   if (w.done) {
-    return w.status;
+    return w.status;  // skips throttle ... maintenance unfriendly coding, bastards
   }
 
   // May temporarily unlock and wait.
-  Status status = MakeRoomForWrite(my_batch == NULL);
+  status = MakeRoomForWrite(my_batch == NULL);
   uint64_t last_sequence = versions_->LastSequence();
   Writer* last_writer = &w;
   if (status.ok() && my_batch != NULL) {  // NULL batch is for compactions
@@ -1222,23 +1833,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
     {
       mutex_.Unlock();
       status = log_->AddRecord(WriteBatchInternal::Contents(updates));
-      bool sync_error = false;
       if (status.ok() && options.sync) {
         status = logfile_->Sync();
-        if (!status.ok()) {
-          sync_error = true;
-        }
       }
       if (status.ok()) {
-        status = WriteBatchInternal::InsertInto(updates, mem_);
+        status = WriteBatchInternal::InsertInto(updates, mem_, &options_);
       }
       mutex_.Lock();
-      if (sync_error) {
-        // The state of the log file is indeterminate: the log record we
-        // just added may or may not show up when the DB is re-opened.
-        // So we force the DB into a mode where all future writes fail.
-        RecordBackgroundError(status);
-      }
     }
     if (updates == tmp_batch_) tmp_batch_->Clear();
 
@@ -1261,12 +1862,75 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
     writers_.front()->cv.Signal();
   }
 
+  gPerfCounters->Inc(ePerfApiWrite);
+
+  // protect use of versions_ ... still within scope of mutex_ lock
+  throttle=versions_->WriteThrottleUsec(IsCompactionScheduled());
+  }  // release  MutexLock l(&mutex_)
+
+
+  // throttle on exit to reduce possible reordering
+  if (0!=throttle)
+  {
+      uint64_t now, remaining_wait, new_end, batch_wait;
+      int batch_count;
+
+      /// slowing each call down sequentially
+      MutexLock l(&throttle_mutex_);
+
+      // server may have been busy since previous write,
+      //  use only the remaining time as throttle
+      now=env_->NowMicros();
+
+      if (now < throttle_end)
+      {
+
+          remaining_wait=throttle_end - now;
+          env_->SleepForMicroseconds(remaining_wait);
+          new_end=now+remaining_wait+throttle;
+
+          gPerfCounters->Add(ePerfThrottleWait, remaining_wait);
+      }   // if
+      else
+      {
+          remaining_wait=0;
+          new_end=now + throttle;
+      }   // else
+
+      // throttle is per key write, how many in batch?
+      //  (do not use batch count on internal db because of impact to AAE)
+      batch_count=(!options_.is_internal_db && NULL!=my_batch ? WriteBatchInternal::Count(my_batch) : 1);
+      if (0 < batch_count)  // unclear if Count() could return zero
+          --batch_count;
+      batch_wait=throttle * batch_count;
+
+      // only wait on batch if extends beyond potential wait period
+      if (now + remaining_wait < throttle_end + batch_wait)
+      {
+          remaining_wait=throttle_end + batch_wait - (now + remaining_wait);
+          env_->SleepForMicroseconds(remaining_wait);
+          new_end +=remaining_wait;
+
+          gPerfCounters->Add(ePerfThrottleWait, remaining_wait);
+      }   // if
+
+      throttle_end=new_end;
+  }   // if
+
+  // throttle not needed, kill off old wait time
+  else if (0!=throttle_end)
+  {
+      throttle_end=0;
+  }   // else if
+
   return status;
 }
 
 // REQUIRES: Writer list must be non-empty
 // REQUIRES: First writer must have a non-NULL batch
+// REQUIRES: mutex_ is held
 WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
+  mutex_.AssertHeld();
   assert(!writers_.empty());
   Writer* first = writers_.front();
   WriteBatch* result = first->batch;
@@ -1299,7 +1963,7 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
         break;
       }
 
-      // Append to *result
+      // Append to *reuslt
       if (result == first->batch) {
         // Switch to temporary batch instead of disturbing caller's batch
         result = tmp_batch_;
@@ -1320,14 +1984,16 @@ Status DBImpl::MakeRoomForWrite(bool force) {
   assert(!writers_.empty());
   bool allow_delay = !force;
   Status s;
+
   while (true) {
     if (!bg_error_.ok()) {
       // Yield previous error
+        gPerfCounters->Inc(ePerfWriteError);
       s = bg_error_;
       break;
     } else if (
         allow_delay &&
-        versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
+        versions_->NumLevelFiles(0) >= (int)config::kL0_SlowdownWritesTrigger) {
       // We are getting close to hitting a hard limit on the number of
       // L0 files.  Rather than delaying a single write by several
       // seconds when we hit the hard limit, start delaying each
@@ -1335,42 +2001,59 @@ Status DBImpl::MakeRoomForWrite(bool force) {
       // this delay hands over some CPU to the compaction thread in
       // case it is sharing the same core as the writer.
       mutex_.Unlock();
+#if 0   // see if this impacts smoothing or helps (but keep the counts)
+      // (original Google code left for reference)
       env_->SleepForMicroseconds(1000);
+#endif
       allow_delay = false;  // Do not delay a single write more than once
+      gPerfCounters->Inc(ePerfWriteSleep);
       mutex_.Lock();
     } else if (!force &&
                (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
       // There is room in current memtable
+        gPerfCounters->Inc(ePerfWriteNoWait);
       break;
     } else if (imm_ != NULL) {
       // We have filled up the current memtable, but the previous
       // one is still being compacted, so we wait.
-      Log(options_.info_log, "Current memtable full; waiting...\n");
-      bg_cv_.Wait();
+      Log(options_.info_log, "waiting 2...\n");
+      gPerfCounters->Inc(ePerfWriteWaitImm);
+      MaybeScheduleCompaction();
+      if (!shutting_down_.Acquire_Load())
+          bg_cv_.Wait();
+      Log(options_.info_log, "running 2...\n");
     } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
       // There are too many level-0 files.
-      Log(options_.info_log, "Too many L0 files; waiting...\n");
-      bg_cv_.Wait();
+      Log(options_.info_log, "waiting...\n");
+      gPerfCounters->Inc(ePerfWriteWaitLevel0);
+      MaybeScheduleCompaction();
+      if (!shutting_down_.Acquire_Load())
+          bg_cv_.Wait();
+      Log(options_.info_log, "running...\n");
     } else {
       // Attempt to switch to a new memtable and trigger compaction of old
       assert(versions_->PrevLogNumber() == 0);
       uint64_t new_log_number = versions_->NewFileNumber();
-      WritableFile* lfile = NULL;
-      s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
+
+      gPerfCounters->Inc(ePerfWriteNewMem);
+      s = NewRecoveryLog(new_log_number);
+
       if (!s.ok()) {
         // Avoid chewing through file number space in a tight loop.
         versions_->ReuseFileNumber(new_log_number);
         break;
       }
-      delete log_;
-      delete logfile_;
-      logfile_ = lfile;
-      logfile_number_ = new_log_number;
-      log_ = new log::Writer(lfile);
+
       imm_ = mem_;
-      has_imm_.Release_Store(imm_);
+      has_imm_.Release_Store((MemTable*)imm_);
+      if (NULL!=imm_)
+      {
+         ThreadTask * task=new ImmWriteTask(this);
+         gImmThreads->Submit(task, true);
+      }
       mem_ = new MemTable(internal_comparator_);
       mem_->Ref();
+
       force = false;   // Do not force another compaction if have room
       MaybeScheduleCompaction();
     }
@@ -1378,6 +2061,35 @@ Status DBImpl::MakeRoomForWrite(bool force) {
   return s;
 }
 
+
+// the following steps existed in two places, DB::Open() and
+//  DBImpl::MakeRoomForWrite().  This lead to a bug in Basho's
+//  tiered storage feature.  Unifying the code.
+Status DBImpl::NewRecoveryLog(
+    uint64_t NewLogNumber)
+{
+    mutex_.AssertHeld();
+    Status s;
+    WritableFile * lfile(NULL);
+
+    s = env_->NewWriteOnlyFile(LogFileName(dbname_, NewLogNumber), &lfile,
+                               options_.env->RecoveryMmapSize(&options_));
+    if (s.ok())
+    {
+        // close any existing
+        delete log_;
+        delete logfile_;
+
+        logfile_ = lfile;
+        logfile_number_ = NewLogNumber;
+        log_ = new log::Writer(lfile);
+    }   // if
+
+    return(s);
+
+}   // DBImpl::NewRecoveryLog
+
+
 bool DBImpl::GetProperty(const Slice& property, std::string* value) {
   value->clear();
 
@@ -1391,11 +2103,11 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
     in.remove_prefix(strlen("num-files-at-level"));
     uint64_t level;
     bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
-    if (!ok || level >= config::kNumLevels) {
+    if (!ok || level >= (uint64_t)config::kNumLevels) {
       return false;
     } else {
       char buf[100];
-      snprintf(buf, sizeof(buf), "%d",
+      snprintf(buf, sizeof(buf), "%zd",
                versions_->NumLevelFiles(static_cast<int>(level)));
       *value = buf;
       return true;
@@ -1427,19 +2139,34 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
   } else if (in == "sstables") {
     *value = versions_->current()->DebugString();
     return true;
-  } else if (in == "approximate-memory-usage") {
-    size_t total_usage = options_.block_cache->TotalCharge();
-    if (mem_) {
-      total_usage += mem_->ApproximateMemoryUsage();
-    }
-    if (imm_) {
-      total_usage += imm_->ApproximateMemoryUsage();
-    }
+  } else if (in == "total-bytes") {
     char buf[50];
-    snprintf(buf, sizeof(buf), "%llu",
-             static_cast<unsigned long long>(total_usage));
+    uint64_t total = 0;
+    for (int level = 0; level < config::kNumLevels; level++) {
+      total += versions_->NumLevelBytes(level);
+    }
+    snprintf(buf, sizeof(buf), "%" PRIu64, total);
     value->append(buf);
     return true;
+  } else if (in == "file-cache") {
+    char buf[50];
+    snprintf(buf, sizeof(buf), "%zd", double_cache.GetCapacity(true));
+    value->append(buf);
+    return true;
+  } else if (in == "block-cache") {
+    char buf[50];
+    snprintf(buf, sizeof(buf), "%zd", double_cache.GetCapacity(false));
+    value->append(buf);
+    return true;
+  } else if (-1!=gPerfCounters->LookupCounter(in.ToString().c_str())) {
+
+      char buf[66];
+      int index;
+
+      index=gPerfCounters->LookupCounter(in.ToString().c_str());
+      snprintf(buf, sizeof(buf), "%" PRIu64 , gPerfCounters->Value(index));
+      value->append(buf);
+      return(true);
   }
 
   return false;
@@ -1458,8 +2185,8 @@ void DBImpl::GetApproximateSizes(
 
   for (int i = 0; i < n; i++) {
     // Convert user_key into a corresponding internal key.
-    InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
-    InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey k1(range[i].start, 0, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey k2(range[i].limit, 0, kMaxSequenceNumber, kValueTypeForSeek);
     uint64_t start = versions_->ApproximateOffsetOf(v, k1);
     uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
     sizes[i] = (limit >= start ? limit - start : 0);
@@ -1473,15 +2200,21 @@ void DBImpl::GetApproximateSizes(
 
 // Default implementations of convenience methods that subclasses of DB
 // can call if they wish
-Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
+Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value,
+  const KeyMetaData * meta) {
   WriteBatch batch;
-  batch.Put(key, value);
+  batch.Put(key, value, meta);
   return Write(opt, &batch);
 }
 
 Status DB::Delete(const WriteOptions& opt, const Slice& key) {
   WriteBatch batch;
   batch.Delete(key);
+
+  // Negate the count to "ApiWrite"
+  gPerfCounters->Dec(ePerfApiWrite);
+  gPerfCounters->Inc(ePerfApiDelete);
+
   return Write(opt, &batch);
 }
 
@@ -1494,40 +2227,47 @@ Status DB::Open(const Options& options, const std::string& dbname,
   DBImpl* impl = new DBImpl(options, dbname);
   impl->mutex_.Lock();
   VersionEdit edit;
-  // Recover handles create_if_missing, error_if_exists
-  bool save_manifest = false;
-  Status s = impl->Recover(&edit, &save_manifest);
-  if (s.ok() && impl->mem_ == NULL) {
-    // Create new log and a corresponding memtable.
+  Status s;
+
+  // WARNING:  only use impl and impl->options_ from this point.
+  //           Things like tiered storage change the meanings
+
+  // 4 level0 files at 2Mbytes and 2Mbytes of block cache
+  //  (but first level1 file is likely to thrash)
+  //  ... this value is AFTER write_buffer and 40M for recovery log and LOG
+  //if (!options.limited_developer_mem && impl->GetCacheCapacity() < flex::kMinimumDBMemory)
+  //    s=Status::InvalidArgument("Less than 10Mbytes per database/vnode");
+
+  if (s.ok())
+      s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists
+
+  if (s.ok()) {
     uint64_t new_log_number = impl->versions_->NewFileNumber();
-    WritableFile* lfile;
-    s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
-                                     &lfile);
+
+    s = impl->NewRecoveryLog(new_log_number);
+
     if (s.ok()) {
       edit.SetLogNumber(new_log_number);
-      impl->logfile_ = lfile;
-      impl->logfile_number_ = new_log_number;
-      impl->log_ = new log::Writer(lfile);
-      impl->mem_ = new MemTable(impl->internal_comparator_);
-      impl->mem_->Ref();
+      s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
+    }
+    if (s.ok()) {
+      impl->DeleteObsoleteFiles();
+      impl->CheckCompactionState();
     }
   }
-  if (s.ok() && save_manifest) {
-    edit.SetPrevLogNumber(0);  // No older logs needed after recovery.
-    edit.SetLogNumber(impl->logfile_number_);
-    s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
-  }
-  if (s.ok()) {
-    impl->DeleteObsoleteFiles();
-    impl->MaybeScheduleCompaction();
-  }
+
+  if (impl->options_.cache_object_warming)
+      impl->table_cache_->PreloadTableCache();
+
   impl->mutex_.Unlock();
   if (s.ok()) {
-    assert(impl->mem_ != NULL);
     *dbptr = impl;
   } else {
     delete impl;
   }
+
+  gPerfCounters->Inc(ePerfApiOpen);
+
   return s;
 }
 
@@ -1537,22 +2277,50 @@ Snapshot::~Snapshot() {
 Status DestroyDB(const std::string& dbname, const Options& options) {
   Env* env = options.env;
   std::vector<std::string> filenames;
+  Options options_tiered;
+  std::string dbname_tiered;
+
+  options_tiered=options;
+  dbname_tiered=MakeTieredDbname(dbname, options_tiered);
+
   // Ignore error in case directory does not exist
-  env->GetChildren(dbname, &filenames);
+  env->GetChildren(dbname_tiered, &filenames);
   if (filenames.empty()) {
     return Status::OK();
   }
 
   FileLock* lock;
-  const std::string lockname = LockFileName(dbname);
+  const std::string lockname = LockFileName(dbname_tiered);
   Status result = env->LockFile(lockname, &lock);
   if (result.ok()) {
     uint64_t number;
     FileType type;
+
+    // prune the table file directories
+    for (int level=0; level<config::kNumLevels; ++level)
+    {
+        std::string dirname;
+
+        filenames.clear();
+        dirname=MakeDirName2(options_tiered, level, "sst");
+        env->GetChildren(dirname, &filenames); // Ignoring errors on purpose
+        for (size_t i = 0; i < filenames.size(); i++) {
+            if (ParseFileName(filenames[i], &number, &type)) {
+                Status del = env->DeleteFile(dirname + "/" + filenames[i]);
+                if (result.ok() && !del.ok()) {
+                    result = del;
+                }   // if
+            }   // if
+        }   // for
+        env->DeleteDir(dirname);
+    }   // for
+
+    filenames.clear();
+    env->GetChildren(dbname_tiered, &filenames);
     for (size_t i = 0; i < filenames.size(); i++) {
       if (ParseFileName(filenames[i], &number, &type) &&
           type != kDBLockFile) {  // Lock file will be deleted at end
-        Status del = env->DeleteFile(dbname + "/" + filenames[i]);
+        Status del = env->DeleteFile(dbname_tiered + "/" + filenames[i]);
         if (result.ok() && !del.ok()) {
           result = del;
         }
@@ -1560,9 +2328,89 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
     }
     env->UnlockFile(lock);  // Ignore error since state is already gone
     env->DeleteFile(lockname);
-    env->DeleteDir(dbname);  // Ignore error in case dir contains other files
+    env->DeleteDir(options.tiered_fast_prefix);  // Ignore error in case dir contains other files
+    env->DeleteDir(options.tiered_slow_prefix);  // Ignore error in case dir contains other files
   }
   return result;
 }
 
+
+Status DB::VerifyLevels() {return(Status::InvalidArgument("is_repair not set in Options before database opened"));};
+
+// Riak specific repair
+Status
+DBImpl::VerifyLevels()
+{
+    Status result;
+
+    // did they remember to open the db with flag set in options
+    if (options_.is_repair)
+    {
+        InternalKey begin, end;
+        bool overlap_found;
+        int level;
+        Version * ver;
+
+        overlap_found=false;
+        level=0;
+
+        do
+        {
+            // get a copy of current version
+            {
+                MutexLock l(&mutex_);
+                ver = versions_->current();
+                ver->Ref();
+            }
+
+            // level is input and output (acts as cursor to progress)
+            //  begin and end are outputs of function
+            overlap_found=ver->VerifyLevels(level, begin, end);
+            ver->Unref();
+
+            if (overlap_found)
+            {
+                Slice s_begin, s_end;
+
+                s_begin=begin.user_key();
+                s_end=end.user_key();
+                TEST_CompactRange(level, &s_begin, &s_end);
+            }   // if
+
+        } while(overlap_found);
+
+    }   // if
+    else
+    {
+        result=Status::InvalidArgument("is_repair not set in Options before database opened");
+    }   // else
+
+    return(result);
+
+}   // VerifyLevels
+
+void DB::CheckAvailableCompactions() {return;};
+
+// Used internally for inter-database notification
+//  of potential grooming timeslot availability.
+void
+DBImpl::CheckAvailableCompactions()
+{
+    MutexLock l(&mutex_);
+    MaybeScheduleCompaction();
+
+    return;
+}   // CheckAvailableCompactions
+
+
+bool
+DBImpl::IsCompactionScheduled()
+{
+    mutex_.AssertHeld();
+    bool flag(false);
+    for (int level=0; level< config::kNumLevels && !flag; ++level)
+        flag=versions_->IsCompactionSubmitted(level);
+    return(flag || NULL!=imm_ || hotbackup_pending_);
+}   // DBImpl::IsCompactionScheduled
+
 }  // namespace leveldb
diff --git a/src/leveldb/db/db_impl.h b/src/leveldb/db/db_impl.h
index 8ff323e72..5e3976a31 100644
--- a/src/leveldb/db/db_impl.h
+++ b/src/leveldb/db/db_impl.h
@@ -13,7 +13,7 @@
 #include "leveldb/db.h"
 #include "leveldb/env.h"
 #include "port/port.h"
-#include "port/thread_annotations.h"
+#include "util/cache2.h"
 
 namespace leveldb {
 
@@ -29,26 +29,37 @@ class DBImpl : public DB {
   virtual ~DBImpl();
 
   // Implementations of the DB interface
-  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
+  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value, const KeyMetaData * meta=NULL);
   virtual Status Delete(const WriteOptions&, const Slice& key);
   virtual Status Write(const WriteOptions& options, WriteBatch* updates);
   virtual Status Get(const ReadOptions& options,
                      const Slice& key,
-                     std::string* value);
+                     std::string* value,
+                     KeyMetaData * meta=NULL);
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     Value* value,
+                     KeyMetaData * meta=NULL);
   virtual Iterator* NewIterator(const ReadOptions&);
   virtual const Snapshot* GetSnapshot();
   virtual void ReleaseSnapshot(const Snapshot* snapshot);
   virtual bool GetProperty(const Slice& property, std::string* value);
   virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
   virtual void CompactRange(const Slice* begin, const Slice* end);
+  virtual Status VerifyLevels();
+  virtual void CheckAvailableCompactions();
+  virtual Logger* GetLogger() const { return options_.info_log; }
 
   // Extra methods (for testing) that are not in the public DB interface
 
+  const Options & GetOptions() const { return options_; };
+
   // Compact any files in the named level that overlap [*begin,*end]
   void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
 
-  // Force current memtable contents to be compacted.
-  Status TEST_CompactMemTable();
+  // Force current memtable contents to be compacted, waits for completion
+  Status CompactMemTableSynchronous();
+  Status TEST_CompactMemTable();       // wraps CompactMemTableSynchronous (historical)
 
   // Return an internal iterator over the current state of the database.
   // The keys of this iterator are internal keys (see format.h).
@@ -59,64 +70,82 @@ class DBImpl : public DB {
   // file at a level >= 1.
   int64_t TEST_MaxNextLevelOverlappingBytes();
 
-  // Record a sample of bytes read at the specified internal key.
-  // Samples are taken approximately once every config::kReadBytesPeriod
-  // bytes.
-  void RecordReadSample(Slice key);
+  // These are routines that DBListImpl calls across all open databases
+  void ResizeCaches() {double_cache.ResizeCaches();};
+  size_t GetCacheCapacity() {return(double_cache.GetCapacity(false));}
+  void PurgeExpiredFileCache() {double_cache.PurgeExpiredFiles();};
 
- private:
+  // in util/hot_backup.cc
+  void HotBackup();
+  bool PurgeWriteBuffer();
+  bool WriteBackupManifest();
+  bool CreateBackupLinks(Version * Version, Options & BackupOptions);
+  bool CopyLOGSegment(long FileEnd);
+  void HotBackupComplete();
+
+  void BackgroundCall2(Compaction * Compact);
+  void BackgroundImmCompactCall();
+  bool IsCompactionScheduled();
+  uint32_t RunningCompactionCount() {mutex_.AssertHeld(); return(running_compactions_);};
+
+ protected:
   friend class DB;
   struct CompactionState;
   struct Writer;
 
   Iterator* NewInternalIterator(const ReadOptions&,
-                                SequenceNumber* latest_snapshot,
-                                uint32_t* seed);
+                                SequenceNumber* latest_snapshot);
 
   Status NewDB();
 
   // Recover the descriptor from persistent storage.  May do a significant
   // amount of work to recover recently logged updates.  Any changes to
   // be made to the descriptor are added to *edit.
-  Status Recover(VersionEdit* edit, bool* save_manifest)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status Recover(VersionEdit* edit);
+
+  // Riak routine:  pause DB::Open if too many compactions
+  //  stacked up immediately.  Happens in some repairs and
+  //  some Riak upgrades
+  void CheckCompactionState();
 
   void MaybeIgnoreError(Status* s) const;
 
   // Delete any unneeded files and stale in-memory entries.
   void DeleteObsoleteFiles();
+  void KeepOrDelete(const std::string & Filename, int level, const std::set<uint64_t> & Live);
 
   // Compact the in-memory write buffer to disk.  Switches to a new
   // log-file/memtable and writes a new descriptor iff successful.
-  // Errors are recorded in bg_error_.
-  void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status CompactMemTable();
 
-  Status RecoverLogFile(uint64_t log_number, bool last_log, bool* save_manifest,
-                        VersionEdit* edit, SequenceNumber* max_sequence)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status RecoverLogFile(uint64_t log_number,
+                        VersionEdit* edit,
+                        SequenceNumber* max_sequence);
 
-  Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status WriteLevel0Table(volatile MemTable* mem, VersionEdit* edit, Version* base);
+
+  Status MakeRoomForWrite(bool force /* TRUE forces memtable rotation to disk (for testing) */);
+  Status NewRecoveryLog(uint64_t NewLogNumber);
 
-  Status MakeRoomForWrite(bool force /* compact even if there is room? */)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
   WriteBatch* BuildBatchGroup(Writer** last_writer);
 
-  void RecordBackgroundError(const Status& s);
+  void MaybeScheduleCompaction();
 
-  void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-  static void BGWork(void* db);
-  void BackgroundCall();
-  void  BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-  void CleanupCompaction(CompactionState* compact)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-  Status DoCompactionWork(CompactionState* compact)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status BackgroundCompaction(Compaction * Compact=NULL);
+  Status BackgroundExpiry(Compaction * Compact=NULL);
 
-  Status OpenCompactionOutputFile(CompactionState* compact);
+  void CleanupCompaction(CompactionState* compact);
+  Status DoCompactionWork(CompactionState* compact);
+  int64_t PrioritizeWork(bool IsLevel0);
+
+  Status OpenCompactionOutputFile(CompactionState* compact, size_t sample_value_size);
+  bool Send2PageCache(CompactionState * compact);
+  size_t MaybeRaiseBlockSize(Compaction & CompactionStuff, size_t SampleValueSize);
   Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
-  Status InstallCompactionResults(CompactionState* compact)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status InstallCompactionResults(CompactionState* compact);
+
+  // initialized before options so its block_cache is available
+  class DoubleCache double_cache;
 
   // Constant after construction
   Env* const env_;
@@ -130,20 +159,22 @@ class DBImpl : public DB {
   // table_cache_ provides its own synchronization
   TableCache* table_cache_;
 
+
   // Lock over the persistent DB state.  Non-NULL iff successfully acquired.
   FileLock* db_lock_;
 
   // State below is protected by mutex_
   port::Mutex mutex_;
+  port::Mutex throttle_mutex_;   // used by write throttle to force sequential waits on callers
   port::AtomicPointer shutting_down_;
+
   port::CondVar bg_cv_;          // Signalled when background work finishes
   MemTable* mem_;
-  MemTable* imm_;                // Memtable being compacted
+  volatile MemTable* imm_;                // Memtable being compacted
   port::AtomicPointer has_imm_;  // So bg thread can detect non-NULL imm_
   WritableFile* logfile_;
   uint64_t logfile_number_;
   log::Writer* log_;
-  uint32_t seed_;                // For sampling.
 
   // Queue of writers.
   std::deque<Writer*> writers_;
@@ -155,9 +186,6 @@ class DBImpl : public DB {
   // part of ongoing compactions.
   std::set<uint64_t> pending_outputs_;
 
-  // Has a background compaction been scheduled or is running?
-  bool bg_compaction_scheduled_;
-
   // Information for a manual compaction
   struct ManualCompaction {
     int level;
@@ -166,7 +194,7 @@ class DBImpl : public DB {
     const InternalKey* end;     // NULL means end of key range
     InternalKey tmp_storage;    // Used to keep track of compaction progress
   };
-  ManualCompaction* manual_compaction_;
+  volatile ManualCompaction* manual_compaction_;
 
   VersionSet* versions_;
 
@@ -190,6 +218,18 @@ class DBImpl : public DB {
   };
   CompactionStats stats_[config::kNumLevels];
 
+  volatile uint64_t throttle_end;
+  volatile uint32_t running_compactions_;
+  volatile size_t current_block_size_;    // last dynamic block size computed
+  volatile uint64_t block_size_changed_;  // NowMicros() when block size computed
+  volatile uint64_t last_low_mem_;        // NowMicros() when low memory last seen
+
+  // accessor to new, dynamic block_cache
+  Cache * block_cache() {return(double_cache.GetBlockCache());};
+  Cache * file_cache() {return(double_cache.GetFileCache());};
+
+  volatile bool hotbackup_pending_;
+
   // No copying allowed
   DBImpl(const DBImpl&);
   void operator=(const DBImpl&);
@@ -204,7 +244,8 @@ class DBImpl : public DB {
 extern Options SanitizeOptions(const std::string& db,
                                const InternalKeyComparator* icmp,
                                const InternalFilterPolicy* ipolicy,
-                               const Options& src);
+                               const Options& src,
+                               Cache * block_cache);
 
 }  // namespace leveldb
 
diff --git a/src/leveldb/db/db_iter.cc b/src/leveldb/db/db_iter.cc
index 3b2035e9e..3ef3b2b2e 100644
--- a/src/leveldb/db/db_iter.cc
+++ b/src/leveldb/db/db_iter.cc
@@ -5,14 +5,14 @@
 #include "db/db_iter.h"
 
 #include "db/filename.h"
-#include "db/db_impl.h"
 #include "db/dbformat.h"
 #include "leveldb/env.h"
+#include "leveldb/expiry.h"
 #include "leveldb/iterator.h"
+#include "leveldb/perf_count.h"
 #include "port/port.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
-#include "util/random.h"
 
 namespace leveldb {
 
@@ -48,18 +48,20 @@ class DBIter: public Iterator {
     kReverse
   };
 
-  DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s,
-         uint32_t seed)
-      : db_(db),
+  DBIter(const std::string* dbname, Env* env,
+         const Comparator* cmp, Iterator* iter, SequenceNumber s,
+         const ExpiryModule * expiry)
+      : dbname_(dbname),
+        env_(env),
         user_comparator_(cmp),
         iter_(iter),
         sequence_(s),
         direction_(kForward),
         valid_(false),
-        rnd_(seed),
-        bytes_counter_(RandomPeriod()) {
+        expiry_(expiry) {
   }
   virtual ~DBIter() {
+    gPerfCounters->Inc(ePerfIterDelete);
     delete iter_;
   }
   virtual bool Valid() const { return valid_; }
@@ -71,6 +73,26 @@ class DBIter: public Iterator {
     assert(valid_);
     return (direction_ == kForward) ? iter_->value() : saved_value_;
   }
+  // Riak specific:  if a database iterator, returns key meta data
+  // REQUIRES: Valid() and forward iteration
+  //  (reverse iteration is possible, just needs code)
+  virtual KeyMetaData & keymetadata() const
+  {
+    assert(valid_ && kForward==direction_);
+    if (kForward==direction_)
+    {
+      ParsedInternalKey parsed;
+      // this initialization clears a warning.  ParsedInternalKey says
+      //  it is not initializing for performance reasons ... oh well
+      parsed.type=kTypeValue; parsed.sequence=0; parsed.expiry=0;
+      ParseInternalKey(iter_->key(), &parsed);
+      keymetadata_.m_Type=parsed.type;
+      keymetadata_.m_Sequence=parsed.sequence;
+      keymetadata_.m_Expiry=parsed.expiry;
+    }
+    return(keymetadata_);
+  }
+
   virtual Status status() const {
     if (status_.ok()) {
       return iter_->status();
@@ -103,12 +125,8 @@ class DBIter: public Iterator {
     }
   }
 
-  // Pick next gap with average value of config::kReadBytesPeriod.
-  ssize_t RandomPeriod() {
-    return rnd_.Uniform(2*config::kReadBytesPeriod);
-  }
-
-  DBImpl* db_;
+  const std::string* const dbname_;
+  Env* const env_;
   const Comparator* const user_comparator_;
   Iterator* const iter_;
   SequenceNumber const sequence_;
@@ -118,9 +136,7 @@ class DBIter: public Iterator {
   std::string saved_value_;   // == current raw value when direction_==kReverse
   Direction direction_;
   bool valid_;
-
-  Random rnd_;
-  ssize_t bytes_counter_;
+  const ExpiryModule * expiry_;
 
   // No copying allowed
   DBIter(const DBIter&);
@@ -128,14 +144,7 @@ class DBIter: public Iterator {
 };
 
 inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
-  Slice k = iter_->key();
-  ssize_t n = k.size() + iter_->value().size();
-  bytes_counter_ -= n;
-  while (bytes_counter_ < 0) {
-    bytes_counter_ += RandomPeriod();
-    db_->RecordReadSample(k);
-  }
-  if (!ParseInternalKey(k, ikey)) {
+  if (!ParseInternalKey(iter_->key(), ikey)) {
     status_ = Status::Corruption("corrupted internal key in DBIter");
     return false;
   } else {
@@ -146,6 +155,7 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
 void DBIter::Next() {
   assert(valid_);
 
+  gPerfCounters->Inc(ePerfIterNext);
   if (direction_ == kReverse) {  // Switch directions?
     direction_ = kForward;
     // iter_ is pointing just before the entries for this->key(),
@@ -161,13 +171,12 @@ void DBIter::Next() {
       saved_key_.clear();
       return;
     }
-    // saved_key_ already contains the key to skip past.
-  } else {
-    // Store in saved_key_ the current key so we skip it below.
-    SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
   }
 
-  FindNextUserEntry(true, &saved_key_);
+  // Temporarily use saved_key_ as storage for key to skip.
+  std::string* skip = &saved_key_;
+  SaveKey(ExtractUserKey(iter_->key()), skip);
+  FindNextUserEntry(true, skip);
 }
 
 void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
@@ -177,6 +186,9 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
   do {
     ParsedInternalKey ikey;
     if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+      if (IsExpiryKey(ikey.type) && NULL!=expiry_
+          && expiry_->KeyRetirementCallback(ikey))
+        ikey.type=kTypeDeletion;
       switch (ikey.type) {
         case kTypeDeletion:
           // Arrange to skip all upcoming entries for this key since
@@ -184,6 +196,9 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
           SaveKey(ikey.user_key, skip);
           skipping = true;
           break;
+
+        case kTypeValueWriteTime:
+        case kTypeValueExplicitExpiry:
         case kTypeValue:
           if (skipping &&
               user_comparator_->Compare(ikey.user_key, *skip) <= 0) {
@@ -205,6 +220,7 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
 void DBIter::Prev() {
   assert(valid_);
 
+  gPerfCounters->Inc(ePerfIterPrev);
   if (direction_ == kForward) {  // Switch directions?
     // iter_ is pointing at the current entry.  Scan backwards until
     // the key changes so we can use the normal reverse scanning code.
@@ -242,6 +258,10 @@ void DBIter::FindPrevUserEntry() {
           // We encountered a non-deleted value in entries for previous keys,
           break;
         }
+        if (IsExpiryKey(ikey.type) && NULL!=expiry_
+            && expiry_->KeyRetirementCallback(ikey))
+          ikey.type=kTypeDeletion;
+
         value_type = ikey.type;
         if (value_type == kTypeDeletion) {
           saved_key_.clear();
@@ -272,11 +292,12 @@ void DBIter::FindPrevUserEntry() {
 }
 
 void DBIter::Seek(const Slice& target) {
+  gPerfCounters->Inc(ePerfIterSeek);
   direction_ = kForward;
   ClearSavedValue();
   saved_key_.clear();
   AppendInternalKey(
-      &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
+      &saved_key_, ParsedInternalKey(target, 0, sequence_, kValueTypeForSeek));
   iter_->Seek(saved_key_);
   if (iter_->Valid()) {
     FindNextUserEntry(false, &saved_key_ /* temporary storage */);
@@ -286,6 +307,7 @@ void DBIter::Seek(const Slice& target) {
 }
 
 void DBIter::SeekToFirst() {
+  gPerfCounters->Inc(ePerfIterSeekFirst);
   direction_ = kForward;
   ClearSavedValue();
   iter_->SeekToFirst();
@@ -297,6 +319,7 @@ void DBIter::SeekToFirst() {
 }
 
 void DBIter::SeekToLast() {
+  gPerfCounters->Inc(ePerfIterSeekLast);
   direction_ = kReverse;
   ClearSavedValue();
   iter_->SeekToLast();
@@ -306,12 +329,13 @@ void DBIter::SeekToLast() {
 }  // anonymous namespace
 
 Iterator* NewDBIterator(
-    DBImpl* db,
+    const std::string* dbname,
+    Env* env,
     const Comparator* user_key_comparator,
     Iterator* internal_iter,
-    SequenceNumber sequence,
-    uint32_t seed) {
-  return new DBIter(db, user_key_comparator, internal_iter, sequence, seed);
+    const SequenceNumber& sequence,
+    const ExpiryModule * expiry) {
+  return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence, expiry);
 }
 
 }  // namespace leveldb
diff --git a/src/leveldb/db/db_iter.h b/src/leveldb/db/db_iter.h
index 04927e937..c3f40469f 100644
--- a/src/leveldb/db/db_iter.h
+++ b/src/leveldb/db/db_iter.h
@@ -7,21 +7,21 @@
 
 #include <stdint.h>
 #include "leveldb/db.h"
+#include "leveldb/expiry.h"
 #include "db/dbformat.h"
 
 namespace leveldb {
 
-class DBImpl;
-
 // Return a new iterator that converts internal keys (yielded by
 // "*internal_iter") that were live at the specified "sequence" number
 // into appropriate user keys.
 extern Iterator* NewDBIterator(
-    DBImpl* db,
+    const std::string* dbname,
+    Env* env,
     const Comparator* user_key_comparator,
     Iterator* internal_iter,
-    SequenceNumber sequence,
-    uint32_t seed);
+    const SequenceNumber& sequence,
+    const ExpiryModule * expiry=NULL);
 
 }  // namespace leveldb
 
diff --git a/src/leveldb/db/db_test.cc b/src/leveldb/db/db_test.cc
index a0b08bc19..0916673b4 100644
--- a/src/leveldb/db/db_test.cc
+++ b/src/leveldb/db/db_test.cc
@@ -33,11 +33,8 @@ class AtomicCounter {
  public:
   AtomicCounter() : count_(0) { }
   void Increment() {
-    IncrementBy(1);
-  }
-  void IncrementBy(int count) {
     MutexLock l(&mu_);
-    count_ += count;
+    count_++;
   }
   int Read() {
     MutexLock l(&mu_);
@@ -48,20 +45,13 @@ class AtomicCounter {
     count_ = 0;
   }
 };
-
-void DelayMilliseconds(int millis) {
-  Env::Default()->SleepForMicroseconds(millis * 1000);
-}
 }
 
 // Special Env used to delay background operations
 class SpecialEnv : public EnvWrapper {
  public:
-  // sstable/log Sync() calls are blocked while this pointer is non-NULL.
-  port::AtomicPointer delay_data_sync_;
-
-  // sstable/log Sync() calls return an error.
-  port::AtomicPointer data_sync_error_;
+  // sstable Sync() calls are blocked while this pointer is non-NULL.
+  port::AtomicPointer delay_sstable_sync_;
 
   // Simulate no-space errors while this pointer is non-NULL.
   port::AtomicPointer no_space_;
@@ -69,37 +59,30 @@ class SpecialEnv : public EnvWrapper {
   // Simulate non-writable file system while this pointer is non-NULL
   port::AtomicPointer non_writable_;
 
-  // Force sync of manifest files to fail while this pointer is non-NULL
-  port::AtomicPointer manifest_sync_error_;
-
-  // Force write to manifest files to fail while this pointer is non-NULL
-  port::AtomicPointer manifest_write_error_;
-
   bool count_random_reads_;
   AtomicCounter random_read_counter_;
 
+  AtomicCounter sleep_counter_;
+
   explicit SpecialEnv(Env* base) : EnvWrapper(base) {
-    delay_data_sync_.Release_Store(NULL);
-    data_sync_error_.Release_Store(NULL);
+    delay_sstable_sync_.Release_Store(NULL);
     no_space_.Release_Store(NULL);
     non_writable_.Release_Store(NULL);
     count_random_reads_ = false;
-    manifest_sync_error_.Release_Store(NULL);
-    manifest_write_error_.Release_Store(NULL);
   }
 
-  Status NewWritableFile(const std::string& f, WritableFile** r) {
-    class DataFile : public WritableFile {
+  Status NewWritableFile(const std::string& f, WritableFile** r, size_t map_size) {
+    class SSTableFile : public WritableFile {
      private:
       SpecialEnv* env_;
       WritableFile* base_;
 
      public:
-      DataFile(SpecialEnv* env, WritableFile* base)
+      SSTableFile(SpecialEnv* env, WritableFile* base)
           : env_(env),
             base_(base) {
       }
-      ~DataFile() { delete base_; }
+      ~SSTableFile() { delete base_; }
       Status Append(const Slice& data) {
         if (env_->no_space_.Acquire_Load() != NULL) {
           // Drop writes on the floor
@@ -111,51 +94,21 @@ class SpecialEnv : public EnvWrapper {
       Status Close() { return base_->Close(); }
       Status Flush() { return base_->Flush(); }
       Status Sync() {
-        if (env_->data_sync_error_.Acquire_Load() != NULL) {
-          return Status::IOError("simulated data sync error");
-        }
-        while (env_->delay_data_sync_.Acquire_Load() != NULL) {
-          DelayMilliseconds(100);
+        while (env_->delay_sstable_sync_.Acquire_Load() != NULL) {
+          env_->SleepForMicroseconds(100000);
         }
         return base_->Sync();
       }
     };
-    class ManifestFile : public WritableFile {
-     private:
-      SpecialEnv* env_;
-      WritableFile* base_;
-     public:
-      ManifestFile(SpecialEnv* env, WritableFile* b) : env_(env), base_(b) { }
-      ~ManifestFile() { delete base_; }
-      Status Append(const Slice& data) {
-        if (env_->manifest_write_error_.Acquire_Load() != NULL) {
-          return Status::IOError("simulated writer error");
-        } else {
-          return base_->Append(data);
-        }
-      }
-      Status Close() { return base_->Close(); }
-      Status Flush() { return base_->Flush(); }
-      Status Sync() {
-        if (env_->manifest_sync_error_.Acquire_Load() != NULL) {
-          return Status::IOError("simulated sync error");
-        } else {
-          return base_->Sync();
-        }
-      }
-    };
 
     if (non_writable_.Acquire_Load() != NULL) {
       return Status::IOError("simulated write error");
     }
 
-    Status s = target()->NewWritableFile(f, r);
+    Status s = target()->NewWritableFile(f, r, 2<<20);
     if (s.ok()) {
-      if (strstr(f.c_str(), ".ldb") != NULL ||
-          strstr(f.c_str(), ".log") != NULL) {
-        *r = new DataFile(this, *r);
-      } else if (strstr(f.c_str(), "MANIFEST") != NULL) {
-        *r = new ManifestFile(this, *r);
+      if (strstr(f.c_str(), ".sst") != NULL) {
+        *r = new SSTableFile(this, *r);
       }
     }
     return s;
@@ -184,6 +137,11 @@ class SpecialEnv : public EnvWrapper {
     }
     return s;
   }
+
+  virtual void SleepForMicroseconds(int micros) {
+    sleep_counter_.Increment();
+    target()->SleepForMicroseconds(micros);
+  }
 };
 
 class DBTest {
@@ -193,7 +151,6 @@ class DBTest {
   // Sequence of option configurations to try
   enum OptionConfig {
     kDefault,
-    kReuse,
     kFilter,
     kUncompressed,
     kEnd
@@ -209,7 +166,7 @@ class DBTest {
 
   DBTest() : option_config_(kDefault),
              env_(new SpecialEnv(Env::Default())) {
-    filter_policy_ = NewBloomFilterPolicy(10);
+    filter_policy_ = NewBloomFilterPolicy2(16);
     dbname_ = test::TmpDir() + "/db_test";
     DestroyDB(dbname_, Options());
     db_ = NULL;
@@ -238,11 +195,7 @@ class DBTest {
   // Return the current option configuration.
   Options CurrentOptions() {
     Options options;
-    options.reuse_logs = false;
     switch (option_config_) {
-      case kReuse:
-        options.reuse_logs = true;
-        break;
       case kFilter:
         options.filter_policy = filter_policy_;
         break;
@@ -290,6 +243,23 @@ class DBTest {
     return DB::Open(opts, dbname_, &db_);
   }
 
+  Status DoubleOpen(Options* options = NULL) {
+    DB * db_fail;
+    delete db_;
+    db_ = NULL;
+    Options opts, opts2;
+    if (options != NULL) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    DB::Open(opts, dbname_, &db_);
+    return DB::Open(opts2, dbname_, &db_fail);
+  }
+
   Status Put(const std::string& k, const std::string& v) {
     return db_->Put(WriteOptions(), k, v);
   }
@@ -311,6 +281,20 @@ class DBTest {
     return result;
   }
 
+  std::string GetNoCache(const std::string& k, const Snapshot* snapshot = NULL) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    options.fill_cache=false;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
   // Return a string that contains all key,value pairs in order,
   // formatted like "(k1->v1)(k2->v2)".
   std::string Contents() {
@@ -326,7 +310,7 @@ class DBTest {
     }
 
     // Check reverse iteration results are the reverse of forward results
-    size_t matched = 0;
+    int matched = 0;
     for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
       ASSERT_LT(matched, forward.size());
       ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
@@ -340,7 +324,7 @@ class DBTest {
 
   std::string AllEntriesFor(const Slice& user_key) {
     Iterator* iter = dbfull()->TEST_NewInternalIterator();
-    InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+    InternalKey target(user_key, 0, kMaxSequenceNumber, kTypeValue);
     iter->Seek(target.Encode());
     std::string result;
     if (!iter->status().ok()) {
@@ -361,6 +345,8 @@ class DBTest {
           }
           first = false;
           switch (ikey.type) {
+            case kTypeValueWriteTime:
+            case kTypeValueExplicitExpiry:
             case kTypeValue:
               result += iter->value().ToString();
               break;
@@ -474,38 +460,6 @@ class DBTest {
     }
     return result;
   }
-
-  bool DeleteAnSSTFile() {
-    std::vector<std::string> filenames;
-    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
-    uint64_t number;
-    FileType type;
-    for (size_t i = 0; i < filenames.size(); i++) {
-      if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
-        ASSERT_OK(env_->DeleteFile(TableFileName(dbname_, number)));
-        return true;
-      }
-    }
-    return false;
-  }
-
-  // Returns number of files renamed.
-  int RenameLDBToSST() {
-    std::vector<std::string> filenames;
-    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
-    uint64_t number;
-    FileType type;
-    int files_renamed = 0;
-    for (size_t i = 0; i < filenames.size(); i++) {
-      if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
-        const std::string from = TableFileName(dbname_, number);
-        const std::string to = SSTTableFileName(dbname_, number);
-        ASSERT_OK(env_->RenameFile(from, to));
-        files_renamed++;
-      }
-    }
-    return files_renamed;
-  }
 };
 
 TEST(DBTest, Empty) {
@@ -515,6 +469,11 @@ TEST(DBTest, Empty) {
   } while (ChangeOptions());
 }
 
+TEST(DBTest, DoubleOpen)
+{
+    ASSERT_NOTOK(DoubleOpen());
+}
+
 TEST(DBTest, ReadWrite) {
   do {
     ASSERT_OK(Put("foo", "v1"));
@@ -547,11 +506,11 @@ TEST(DBTest, GetFromImmutableLayer) {
     ASSERT_OK(Put("foo", "v1"));
     ASSERT_EQ("v1", Get("foo"));
 
-    env_->delay_data_sync_.Release_Store(env_);      // Block sync calls
+    env_->delay_sstable_sync_.Release_Store(env_);   // Block sync calls
     Put("k1", std::string(100000, 'x'));             // Fill memtable
     Put("k2", std::string(100000, 'y'));             // Trigger compaction
     ASSERT_EQ("v1", Get("foo"));
-    env_->delay_data_sync_.Release_Store(NULL);      // Release sync calls
+    env_->delay_sstable_sync_.Release_Store(NULL);   // Release sync calls
   } while (ChangeOptions());
 }
 
@@ -563,17 +522,6 @@ TEST(DBTest, GetFromVersions) {
   } while (ChangeOptions());
 }
 
-TEST(DBTest, GetMemUsage) {
-  do {
-    ASSERT_OK(Put("foo", "v1"));
-    std::string val;
-    ASSERT_TRUE(db_->GetProperty("leveldb.approximate-memory-usage", &val));
-    int mem_usage = atoi(val.c_str());
-    ASSERT_GT(mem_usage, 0);
-    ASSERT_LT(mem_usage, 5*1024*1024);
-  } while (ChangeOptions());
-}
-
 TEST(DBTest, GetSnapshot) {
   do {
     // Try with both a short key and a long key
@@ -634,6 +582,9 @@ TEST(DBTest, GetPicksCorrectFile) {
   } while (ChangeOptions());
 }
 
+#if 0
+// riak does not execute compaction due to reads
+
 TEST(DBTest, GetEncountersEmptyLevel) {
   do {
     // Arrange for the following to happen:
@@ -642,7 +593,7 @@ TEST(DBTest, GetEncountersEmptyLevel) {
     //   * sstable B in level 2
     // Then do enough Get() calls to arrange for an automatic compaction
     // of sstable A.  A bug would cause the compaction to be marked as
-    // occurring at level 1 (instead of the correct level 0).
+    // occuring at level 1 (instead of the correct level 0).
 
     // Step 1: First place sstables in levels 0 and 2
     int compaction_count = 0;
@@ -667,11 +618,12 @@ TEST(DBTest, GetEncountersEmptyLevel) {
     }
 
     // Step 4: Wait for compaction to finish
-    DelayMilliseconds(1000);
+    env_->SleepForMicroseconds(1000000);
 
     ASSERT_EQ(NumTableFilesAtLevel(0), 0);
   } while (ChangeOptions());
 }
+#endif
 
 TEST(DBTest, IterEmpty) {
   Iterator* iter = db_->NewIterator(ReadOptions());
@@ -996,7 +948,8 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
   dbfull()->TEST_CompactRange(0, NULL, NULL);
 
   ASSERT_EQ(NumTableFilesAtLevel(0), 0);
-  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+// not riak  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);  // yes riak
   for (int i = 0; i < 80; i++) {
     ASSERT_EQ(Get(Key(i)), values[i]);
   }
@@ -1010,7 +963,8 @@ TEST(DBTest, RepeatedWritesToSameKey) {
 
   // We must have at most one file per level except for level-0,
   // which may have up to kL0_StopWritesTrigger files.
-  const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger;
+  //  ... basho adds *2 since level-1 is now overlapped too
+  const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger*2;
 
   Random rnd(301);
   std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
@@ -1054,11 +1008,13 @@ TEST(DBTest, SparseMerge) {
 
   // Compactions should not cause us to create a situation where
   // a file overlaps too much data at the next level.
-  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+  // 07/10/14 matthewv - we overlap first two levels.  sparse test not appropriate there,
+  //                     and we set overlaps into 100s of megabytes as "normal"
+//  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
   dbfull()->TEST_CompactRange(0, NULL, NULL);
-  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+//  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
   dbfull()->TEST_CompactRange(1, NULL, NULL);
-  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+//  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
 }
 
 static bool Between(uint64_t val, uint64_t low, uint64_t high) {
@@ -1096,14 +1052,6 @@ TEST(DBTest, ApproximateSizes) {
     // 0 because GetApproximateSizes() does not account for memtable space
     ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
 
-    if (options.reuse_logs) {
-      // Recovery will reuse memtable, and GetApproximateSizes() does not
-      // account for memtable usage;
-      Reopen(&options);
-      ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
-      continue;
-    }
-
     // Check sizes across recovery by reopening a few times
     for (int run = 0; run < 3; run++) {
       Reopen(&options);
@@ -1147,11 +1095,6 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
     ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
     ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
 
-    if (options.reuse_logs) {
-      // Need to force a memtable compaction since recovery does not do so.
-      ASSERT_OK(dbfull()->TEST_CompactMemTable());
-    }
-
     // Check sizes across recovery by reopening a few times
     for (int run = 0; run < 3; run++) {
       Reopen(&options);
@@ -1223,7 +1166,7 @@ TEST(DBTest, Snapshot) {
     ASSERT_EQ("v4", Get("foo"));
   } while (ChangeOptions());
 }
-
+#if 0 // trouble under Riak due to assumed file sizes
 TEST(DBTest, HiddenValuesAreRemoved) {
   do {
     Random rnd(301);
@@ -1254,7 +1197,7 @@ TEST(DBTest, HiddenValuesAreRemoved) {
     ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
   } while (ChangeOptions());
 }
-
+#endif
 TEST(DBTest, DeletionMarkers1) {
   Put("foo", "v1");
   ASSERT_OK(dbfull()->TEST_CompactMemTable());
@@ -1271,13 +1214,14 @@ TEST(DBTest, DeletionMarkers1) {
   Delete("foo");
   Put("foo", "v2");
   ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
-  ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level last-2
-  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());  // stays at level 0
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // riak 1.3, DEL merged out by BuildTable
   Slice z("z");
-  dbfull()->TEST_CompactRange(last-2, NULL, &z);
+  dbfull()->TEST_CompactRange(0, NULL, &z);
+  dbfull()->TEST_CompactRange(1, NULL, &z);
   // DEL eliminated, but v1 remains because we aren't compacting that level
   // (DEL can be eliminated because v2 hides v1).
-  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // Riak 1.4 has merged to level 1
   dbfull()->TEST_CompactRange(last-1, NULL, NULL);
   // Merging last-1 w/ last, so we are the base level for "foo", so
   // DEL is removed.  (as is v1).
@@ -1289,39 +1233,47 @@ TEST(DBTest, DeletionMarkers2) {
   ASSERT_OK(dbfull()->TEST_CompactMemTable());
   const int last = config::kMaxMemCompactLevel;
   ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo => v1 is now in last level
+  dbfull()->TEST_CompactRange(0, NULL, NULL);
+  ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo => v1 is now in last level
+  ASSERT_EQ(NumTableFilesAtLevel(last-1), 0);
 
   // Place a table at level last-1 to prevent merging with preceding mutation
   Put("a", "begin");
   Put("z", "end");
-  dbfull()->TEST_CompactMemTable();
-  ASSERT_EQ(NumTableFilesAtLevel(last), 1);
+  dbfull()->TEST_CompactMemTable(); // goes to last-1
   ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
 
   Delete("foo");
   ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
-  ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level last-2
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level 0
   ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
-  dbfull()->TEST_CompactRange(last-2, NULL, NULL);
+  dbfull()->TEST_CompactRange(0, NULL, NULL);   // Riak overlaps level 1
   // DEL kept: "last" file overlaps
   ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
-  dbfull()->TEST_CompactRange(last-1, NULL, NULL);
   // Merging last-1 w/ last, so we are the base level for "foo", so
   // DEL is removed.  (as is v1).
+  dbfull()->TEST_CompactRange(1, NULL, NULL);
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+
+  dbfull()->TEST_CompactRange(2, NULL, NULL);
   ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
 }
 
 TEST(DBTest, OverlapInLevel0) {
   do {
-    ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
+    ASSERT_EQ(config::kMaxMemCompactLevel, 3) << "Fix test to match config";
 
     // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
     ASSERT_OK(Put("100", "v100"));
     ASSERT_OK(Put("999", "v999"));
     dbfull()->TEST_CompactMemTable();
+    dbfull()->TEST_CompactRange(0, NULL, NULL);
+    dbfull()->TEST_CompactRange(1, NULL, NULL);
     ASSERT_OK(Delete("100"));
     ASSERT_OK(Delete("999"));
     dbfull()->TEST_CompactMemTable();
-    ASSERT_EQ("0,1,1", FilesPerLevel());
+    dbfull()->TEST_CompactRange(0, NULL, NULL);
+    ASSERT_EQ("0,0,1,1", FilesPerLevel());
 
     // Make files spanning the following ranges in level-0:
     //  files[0]  200 .. 900
@@ -1334,7 +1286,7 @@ TEST(DBTest, OverlapInLevel0) {
     ASSERT_OK(Put("600", "v600"));
     ASSERT_OK(Put("900", "v900"));
     dbfull()->TEST_CompactMemTable();
-    ASSERT_EQ("2,1,1", FilesPerLevel());
+    ASSERT_EQ("2,0,1,1", FilesPerLevel());
 
     // Compact away the placeholder files we created initially
     dbfull()->TEST_CompactRange(1, NULL, NULL);
@@ -1364,7 +1316,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_a) {
   Reopen();
   Reopen();
   ASSERT_EQ("(a->v)", Contents());
-  DelayMilliseconds(1000);  // Wait for compaction to finish
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
   ASSERT_EQ("(a->v)", Contents());
 }
 
@@ -1380,7 +1332,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
   Put("","");
   Reopen();
   Put("","");
-  DelayMilliseconds(1000);  // Wait for compaction to finish
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
   Reopen();
   Put("d","dv");
   Reopen();
@@ -1390,7 +1342,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
   Delete("b");
   Reopen();
   ASSERT_EQ("(->)(c->cv)", Contents());
-  DelayMilliseconds(1000);  // Wait for compaction to finish
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
   ASSERT_EQ("(->)(c->cv)", Contents());
 }
 
@@ -1473,37 +1425,37 @@ TEST(DBTest, CustomComparator) {
 }
 
 TEST(DBTest, ManualCompaction) {
-  ASSERT_EQ(config::kMaxMemCompactLevel, 2)
+  ASSERT_EQ(config::kMaxMemCompactLevel, 3)
       << "Need to update this test to match kMaxMemCompactLevel";
 
   MakeTables(3, "p", "q");
-  ASSERT_EQ("1,1,1", FilesPerLevel());
+  ASSERT_EQ("1,0,1,1", FilesPerLevel());
 
   // Compaction range falls before files
   Compact("", "c");
-  ASSERT_EQ("1,1,1", FilesPerLevel());
+  ASSERT_EQ("0,1,1,1", FilesPerLevel());
 
   // Compaction range falls after files
   Compact("r", "z");
-  ASSERT_EQ("1,1,1", FilesPerLevel());
+  ASSERT_EQ("0,1,1,1", FilesPerLevel());
 
   // Compaction range overlaps files
   Compact("p1", "p9");
-  ASSERT_EQ("0,0,1", FilesPerLevel());
+  ASSERT_EQ("0,0,0,1", FilesPerLevel());
 
   // Populate a different range
   MakeTables(3, "c", "e");
-  ASSERT_EQ("1,1,2", FilesPerLevel());
+  ASSERT_EQ("1,0,1,2", FilesPerLevel());
 
   // Compact just the new range
   Compact("b", "f");
-  ASSERT_EQ("0,0,2", FilesPerLevel());
+  ASSERT_EQ("0,0,0,2", FilesPerLevel());
 
   // Compact all
   MakeTables(1, "a", "z");
-  ASSERT_EQ("0,1,2", FilesPerLevel());
+  ASSERT_EQ("0,0,1,2", FilesPerLevel());
   db_->CompactRange(NULL, NULL);
-  ASSERT_EQ("0,0,1", FilesPerLevel());
+  ASSERT_EQ("0,0,0,1", FilesPerLevel());
 }
 
 TEST(DBTest, DBOpen_Options) {
@@ -1545,12 +1497,6 @@ TEST(DBTest, DBOpen_Options) {
   db = NULL;
 }
 
-TEST(DBTest, Locking) {
-  DB* db2 = NULL;
-  Status s = DB::Open(CurrentOptions(), dbname_, &db2);
-  ASSERT_TRUE(!s.ok()) << "Locking did not prevent re-opening db";
-}
-
 // Check that number of files does not grow when we are out of space
 TEST(DBTest, NoSpace) {
   Options options = CurrentOptions();
@@ -1562,15 +1508,19 @@ TEST(DBTest, NoSpace) {
   Compact("a", "z");
   const int num_files = CountFiles();
   env_->no_space_.Release_Store(env_);   // Force out-of-space errors
-  for (int i = 0; i < 10; i++) {
+  env_->sleep_counter_.Reset();
+  for (int i = 0; i < 5; i++) {
     for (int level = 0; level < config::kNumLevels-1; level++) {
       dbfull()->TEST_CompactRange(level, NULL, NULL);
     }
   }
   env_->no_space_.Release_Store(NULL);
   ASSERT_LT(CountFiles(), num_files + 3);
-}
 
+  // Check that compaction attempts slept after errors
+  ASSERT_GE(env_->sleep_counter_.Read(), 5);
+}
+#if 0
 TEST(DBTest, NonWritableFileSystem) {
   Options options = CurrentOptions();
   options.write_buffer_size = 1000;
@@ -1584,119 +1534,13 @@ TEST(DBTest, NonWritableFileSystem) {
     fprintf(stderr, "iter %d; errors %d\n", i, errors);
     if (!Put("foo", big).ok()) {
       errors++;
-      DelayMilliseconds(100);
+      env_->SleepForMicroseconds(100000);
     }
   }
   ASSERT_GT(errors, 0);
   env_->non_writable_.Release_Store(NULL);
 }
-
-TEST(DBTest, WriteSyncError) {
-  // Check that log sync errors cause the DB to disallow future writes.
-
-  // (a) Cause log sync calls to fail
-  Options options = CurrentOptions();
-  options.env = env_;
-  Reopen(&options);
-  env_->data_sync_error_.Release_Store(env_);
-
-  // (b) Normal write should succeed
-  WriteOptions w;
-  ASSERT_OK(db_->Put(w, "k1", "v1"));
-  ASSERT_EQ("v1", Get("k1"));
-
-  // (c) Do a sync write; should fail
-  w.sync = true;
-  ASSERT_TRUE(!db_->Put(w, "k2", "v2").ok());
-  ASSERT_EQ("v1", Get("k1"));
-  ASSERT_EQ("NOT_FOUND", Get("k2"));
-
-  // (d) make sync behave normally
-  env_->data_sync_error_.Release_Store(NULL);
-
-  // (e) Do a non-sync write; should fail
-  w.sync = false;
-  ASSERT_TRUE(!db_->Put(w, "k3", "v3").ok());
-  ASSERT_EQ("v1", Get("k1"));
-  ASSERT_EQ("NOT_FOUND", Get("k2"));
-  ASSERT_EQ("NOT_FOUND", Get("k3"));
-}
-
-TEST(DBTest, ManifestWriteError) {
-  // Test for the following problem:
-  // (a) Compaction produces file F
-  // (b) Log record containing F is written to MANIFEST file, but Sync() fails
-  // (c) GC deletes F
-  // (d) After reopening DB, reads fail since deleted F is named in log record
-
-  // We iterate twice.  In the second iteration, everything is the
-  // same except the log record never makes it to the MANIFEST file.
-  for (int iter = 0; iter < 2; iter++) {
-    port::AtomicPointer* error_type = (iter == 0)
-        ? &env_->manifest_sync_error_
-        : &env_->manifest_write_error_;
-
-    // Insert foo=>bar mapping
-    Options options = CurrentOptions();
-    options.env = env_;
-    options.create_if_missing = true;
-    options.error_if_exists = false;
-    DestroyAndReopen(&options);
-    ASSERT_OK(Put("foo", "bar"));
-    ASSERT_EQ("bar", Get("foo"));
-
-    // Memtable compaction (will succeed)
-    dbfull()->TEST_CompactMemTable();
-    ASSERT_EQ("bar", Get("foo"));
-    const int last = config::kMaxMemCompactLevel;
-    ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo=>bar is now in last level
-
-    // Merging compaction (will fail)
-    error_type->Release_Store(env_);
-    dbfull()->TEST_CompactRange(last, NULL, NULL);  // Should fail
-    ASSERT_EQ("bar", Get("foo"));
-
-    // Recovery: should not lose data
-    error_type->Release_Store(NULL);
-    Reopen(&options);
-    ASSERT_EQ("bar", Get("foo"));
-  }
-}
-
-TEST(DBTest, MissingSSTFile) {
-  ASSERT_OK(Put("foo", "bar"));
-  ASSERT_EQ("bar", Get("foo"));
-
-  // Dump the memtable to disk.
-  dbfull()->TEST_CompactMemTable();
-  ASSERT_EQ("bar", Get("foo"));
-
-  Close();
-  ASSERT_TRUE(DeleteAnSSTFile());
-  Options options = CurrentOptions();
-  options.paranoid_checks = true;
-  Status s = TryReopen(&options);
-  ASSERT_TRUE(!s.ok());
-  ASSERT_TRUE(s.ToString().find("issing") != std::string::npos)
-      << s.ToString();
-}
-
-TEST(DBTest, StillReadSST) {
-  ASSERT_OK(Put("foo", "bar"));
-  ASSERT_EQ("bar", Get("foo"));
-
-  // Dump the memtable to disk.
-  dbfull()->TEST_CompactMemTable();
-  ASSERT_EQ("bar", Get("foo"));
-  Close();
-  ASSERT_GT(RenameLDBToSST(), 0);
-  Options options = CurrentOptions();
-  options.paranoid_checks = true;
-  Status s = TryReopen(&options);
-  ASSERT_TRUE(s.ok());
-  ASSERT_EQ("bar", Get("foo"));
-}
-
+#endif
 TEST(DBTest, FilesDeletedAfterCompaction) {
   ASSERT_OK(Put("foo", "v2"));
   Compact("a", "z");
@@ -1713,7 +1557,7 @@ TEST(DBTest, BloomFilter) {
   Options options = CurrentOptions();
   options.env = env_;
   options.block_cache = NewLRUCache(0);  // Prevent cache hits
-  options.filter_policy = NewBloomFilterPolicy(10);
+  options.filter_policy = NewBloomFilterPolicy2(16);
   Reopen(&options);
 
   // Populate multiple layers
@@ -1728,12 +1572,12 @@ TEST(DBTest, BloomFilter) {
   dbfull()->TEST_CompactMemTable();
 
   // Prevent auto compactions triggered by seeks
-  env_->delay_data_sync_.Release_Store(env_);
+  env_->delay_sstable_sync_.Release_Store(env_);
 
   // Lookup present keys.  Should rarely read from small sstable.
   env_->random_read_counter_.Reset();
   for (int i = 0; i < N; i++) {
-    ASSERT_EQ(Key(i), Get(Key(i)));
+    ASSERT_EQ(Key(i), GetNoCache(Key(i)));
   }
   int reads = env_->random_read_counter_.Read();
   fprintf(stderr, "%d present => %d reads\n", N, reads);
@@ -1743,13 +1587,13 @@ TEST(DBTest, BloomFilter) {
   // Lookup present keys.  Should rarely read from either sstable.
   env_->random_read_counter_.Reset();
   for (int i = 0; i < N; i++) {
-    ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing"));
+    ASSERT_EQ("NOT_FOUND", GetNoCache(Key(i) + ".missing"));
   }
   reads = env_->random_read_counter_.Read();
   fprintf(stderr, "%d missing => %d reads\n", N, reads);
   ASSERT_LE(reads, 3*N/100);
 
-  env_->delay_data_sync_.Release_Store(NULL);
+  env_->delay_sstable_sync_.Release_Store(NULL);
   Close();
   delete options.block_cache;
   delete options.filter_policy;
@@ -1809,7 +1653,7 @@ static void MTThreadBody(void* arg) {
         ASSERT_EQ(k, key);
         ASSERT_GE(w, 0);
         ASSERT_LT(w, kNumThreads);
-        ASSERT_LE(static_cast<uintptr_t>(c), reinterpret_cast<uintptr_t>(
+        ASSERT_LE(c, reinterpret_cast<uintptr_t>(
             t->state->counter[w].Acquire_Load()));
       }
     }
@@ -1834,27 +1678,35 @@ TEST(DBTest, MultiThreaded) {
 
     // Start threads
     MTThread thread[kNumThreads];
+    pthread_t tid;
     for (int id = 0; id < kNumThreads; id++) {
       thread[id].state = &mt;
       thread[id].id = id;
-      env_->StartThread(MTThreadBody, &thread[id]);
+      tid=env_->StartThread(MTThreadBody, &thread[id]);
+      pthread_detach(tid);
     }
 
     // Let them run for a while
-    DelayMilliseconds(kTestSeconds * 1000);
+    env_->SleepForMicroseconds(kTestSeconds * 1000000);
 
     // Stop the threads and wait for them to finish
     mt.stop.Release_Store(&mt);
     for (int id = 0; id < kNumThreads; id++) {
       while (mt.thread_done[id].Acquire_Load() == NULL) {
-        DelayMilliseconds(100);
+        env_->SleepForMicroseconds(100000);
       }
     }
   } while (ChangeOptions());
 }
 
 namespace {
-typedef std::map<std::string, std::string> KVMap;
+struct KVEntry
+{
+    std::string m_Value;
+    KeyMetaData m_Meta;
+};
+
+typedef std::map<std::string, KVEntry> KVMap;
 }
 
 class ModelDB: public DB {
@@ -1866,14 +1718,21 @@ class ModelDB: public DB {
 
   explicit ModelDB(const Options& options): options_(options) { }
   ~ModelDB() { }
-  virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
-    return DB::Put(o, k, v);
+  virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v, const KeyMetaData * meta=NULL) {
+    return DB::Put(o, k, v, meta);
   }
   virtual Status Delete(const WriteOptions& o, const Slice& key) {
     return DB::Delete(o, key);
   }
   virtual Status Get(const ReadOptions& options,
-                     const Slice& key, std::string* value) {
+                     const Slice& key, std::string* value,
+                     KeyMetaData * meta = NULL) {
+    assert(false);      // Not implemented
+    return Status::NotFound(key);
+  }
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key, Value* value,
+                     KeyMetaData * meta = NULL) {
     assert(false);      // Not implemented
     return Status::NotFound(key);
   }
@@ -1901,8 +1760,13 @@ class ModelDB: public DB {
     class Handler : public WriteBatch::Handler {
      public:
       KVMap* map_;
-      virtual void Put(const Slice& key, const Slice& value) {
-        (*map_)[key.ToString()] = value.ToString();
+      virtual void Put(const Slice& key, const Slice& value,
+                       const ValueType & type, const ExpiryTimeMicros & expiry) {
+        KVEntry ent;
+        ent.m_Value=value.ToString();
+        ent.m_Meta.m_Type=type;
+        ent.m_Meta.m_Expiry=expiry;
+        (*map_)[key.ToString()] = ent;
       }
       virtual void Delete(const Slice& key) {
         map_->erase(key.ToString());
@@ -1948,7 +1812,7 @@ class ModelDB: public DB {
     virtual void Next() { ++iter_; }
     virtual void Prev() { --iter_; }
     virtual Slice key() const { return iter_->first; }
-    virtual Slice value() const { return iter_->second; }
+    virtual Slice value() const { return iter_->second.m_Value; }
     virtual Status status() const { return Status::OK(); }
    private:
     const KVMap* const map_;
@@ -2085,6 +1949,44 @@ TEST(DBTest, Randomized) {
   } while (ChangeOptions());
 }
 
+
+class SimpleBugs
+{
+    // need a class for the test harness
+};
+
+
+TEST(SimpleBugs, TieredRecoveryLog)
+{
+    // DB::Open created first recovery log directly
+    //   which lead to it NOT being in tiered storage location.
+    // nope std::string dbname = test::TmpDir() + "/leveldb_nontiered";
+    std::string dbname = "leveldb";
+    std::string fastname = test::TmpDir() + "/leveldb_fast";
+    std::string slowname = test::TmpDir() + "/leveldb_slow";
+    std::string combined;
+
+    DB* db = NULL;
+    Options opts;
+
+    opts.tiered_slow_level = 4;
+    opts.tiered_fast_prefix = fastname;
+    opts.tiered_slow_prefix = slowname;
+    opts.create_if_missing = true;
+
+    Env::Default()->CreateDir(fastname);
+    Env::Default()->CreateDir(slowname);
+
+    Status s = DB::Open(opts, dbname, &db);
+    ASSERT_OK(s);
+    ASSERT_TRUE(db != NULL);
+
+    delete db;
+    DestroyDB(dbname, opts);
+
+}   // TieredRecoveryLog
+
+
 std::string MakeKey(unsigned int num) {
   char buf[30];
   snprintf(buf, sizeof(buf), "%016u", num);
@@ -2113,14 +2015,13 @@ void BM_LogAndApply(int iters, int num_base_files) {
   InternalKeyComparator cmp(BytewiseComparator());
   Options options;
   VersionSet vset(dbname, &options, NULL, &cmp);
-  bool save_manifest;
-  ASSERT_OK(vset.Recover(&save_manifest));
+  ASSERT_OK(vset.Recover());
   VersionEdit vbase;
   uint64_t fnum = 1;
   for (int i = 0; i < num_base_files; i++) {
-    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
-    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
-    vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
+    InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue);
+    InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion);
+    vbase.AddFile2(2, fnum++, 1 /* file size */, start, limit, 0,0,0);
   }
   ASSERT_OK(vset.LogAndApply(&vbase, &mu));
 
@@ -2129,9 +2030,9 @@ void BM_LogAndApply(int iters, int num_base_files) {
   for (int i = 0; i < iters; i++) {
     VersionEdit vedit;
     vedit.DeleteFile(2, fnum);
-    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
-    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
-    vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
+    InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue);
+    InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion);
+    vedit.AddFile2(2, fnum++, 1 /* file size */, start, limit, 0,0,0);
     vset.LogAndApply(&vedit, &mu);
   }
   uint64_t stop_micros = env->NowMicros();
diff --git a/src/leveldb/db/dbformat.cc b/src/leveldb/db/dbformat.cc
index 20a7ca446..6d44ea114 100644
--- a/src/leveldb/db/dbformat.cc
+++ b/src/leveldb/db/dbformat.cc
@@ -3,7 +3,9 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include <stdio.h>
+//#include "leveldb/expiry.h"
 #include "db/dbformat.h"
+#include "db/version_set.h"
 #include "port/port.h"
 #include "util/coding.h"
 
@@ -11,26 +13,66 @@ namespace leveldb {
 
 static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
   assert(seq <= kMaxSequenceNumber);
-  assert(t <= kValueTypeForSeek);
+  // assert(t <= kValueTypeForSeek);  requires revisit once expiry live
+  assert(t <= kTypeValueExplicitExpiry);  // temp replacement for above
   return (seq << 8) | t;
 }
 
 void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
   result->append(key.user_key.data(), key.user_key.size());
+  if (IsExpiryKey(key.type))
+    PutFixed64(result, key.expiry);
   PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
 }
 
 std::string ParsedInternalKey::DebugString() const {
   char buf[50];
-  snprintf(buf, sizeof(buf), "' @ %llu : %d",
-           (unsigned long long) sequence,
-           int(type));
+  if (IsExpiryKey(type))
+    snprintf(buf, sizeof(buf), "' @ %llu %llu : %d",
+             (unsigned long long) expiry,
+             (unsigned long long) sequence,
+             int(type));
+  else
+    snprintf(buf, sizeof(buf), "' @ %llu : %d",
+             (unsigned long long) sequence,
+             int(type));
   std::string result = "'";
-  result += EscapeString(user_key.ToString());
+  result += HexString(user_key.ToString());
   result += buf;
   return result;
 }
 
+std::string ParsedInternalKey::DebugStringHex() const {
+  char buf[50];
+  if (IsExpiryKey(type))
+    snprintf(buf, sizeof(buf), "' @ %llu %llu : %d",
+             (unsigned long long) expiry,
+             (unsigned long long) sequence,
+             int(type));
+  else
+    snprintf(buf, sizeof(buf), "' @ %llu : %d",
+             (unsigned long long) sequence,
+             int(type));
+  std::string result = "'";
+  result += HexString(user_key);
+  result += buf;
+  return result;
+}
+
+
+const char * KeyTypeString(ValueType val_type) {
+  const char * ret_ptr;
+  switch(val_type)
+  {
+      case kTypeDeletion: ret_ptr="kTypeDelete"; break;
+      case kTypeValue:    ret_ptr="kTypeValue"; break;
+      case kTypeValueWriteTime: ret_ptr="kTypeValueWriteTime"; break;
+      case kTypeValueExplicitExpiry: ret_ptr="kTypeValueExplicitExpiry"; break;
+      default: ret_ptr="(unknown ValueType)"; break;
+  }   // switch
+  return(ret_ptr);
+}
+
 std::string InternalKey::DebugString() const {
   std::string result;
   ParsedInternalKey parsed;
@@ -54,8 +96,10 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
   //    decreasing type (though sequence# should be enough to disambiguate)
   int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
   if (r == 0) {
-    const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
-    const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+    uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
+    uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+    if (IsExpiryKey((ValueType)*(unsigned char *)&anum)) *(unsigned char*)&anum=(unsigned char)kTypeValue;
+    if (IsExpiryKey((ValueType)*(unsigned char *)&bnum)) *(unsigned char*)&bnum=(unsigned char)kTypeValue;
     if (anum > bnum) {
       r = -1;
     } else if (anum < bnum) {
@@ -118,7 +162,8 @@ bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
   return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
 }
 
-LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
+  LookupKey::LookupKey(const Slice& user_key, SequenceNumber s, KeyMetaData * meta) {
+  meta_=meta;
   size_t usize = user_key.size();
   size_t needed = usize + 13;  // A conservative estimate
   char* dst;
@@ -137,4 +182,109 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
   end_ = dst;
 }
 
+
+KeyRetirement::KeyRetirement(
+    const Comparator * Comparator,
+    SequenceNumber SmallestSnapshot,
+    const Options * Opts,
+    Compaction * const Compaction)
+    : has_current_user_key(false), last_sequence_for_key(kMaxSequenceNumber),
+      user_comparator(Comparator), smallest_snapshot(SmallestSnapshot),
+      options(Opts), compaction(Compaction),
+      valid(false), dropped(0), expired(0)
+{
+    // NULL is ok for compaction
+    valid=(NULL!=user_comparator);
+
+    return;
+}   // KeyRetirement::KeyRetirement
+
+
+KeyRetirement::~KeyRetirement()
+{
+    if (0!=expired)
+        gPerfCounters->Add(ePerfExpiredKeys, expired);
+}   // KeyRetirement::~KeyRetirement
+
+
+bool
+KeyRetirement::operator()(
+    Slice & key)
+{
+    ParsedInternalKey ikey;
+    bool drop = false, expire_flag;
+
+    if (valid)
+    {
+        if (!ParseInternalKey(key, &ikey))
+        {
+            // Do not hide error keys
+            current_user_key.clear();
+            has_current_user_key = false;
+            last_sequence_for_key = kMaxSequenceNumber;
+        }   // else
+        else
+        {
+            if (!has_current_user_key ||
+                user_comparator->Compare(ikey.user_key,
+                                         Slice(current_user_key)) != 0)
+            {
+                // First occurrence of this user key
+                current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
+                has_current_user_key = true;
+                last_sequence_for_key = kMaxSequenceNumber;
+            }   // if
+
+            if (last_sequence_for_key <= smallest_snapshot)
+            {
+                // Hidden by an newer entry for same user key
+                drop = true;    // (A)
+            }   // if
+
+            else
+            {
+                expire_flag=false;
+                if (NULL!=options && options->ExpiryActivated())
+                    expire_flag=options->expiry_module->KeyRetirementCallback(ikey);
+
+                if ((ikey.type == kTypeDeletion || expire_flag)
+                    && ikey.sequence <= smallest_snapshot
+                    && NULL!=compaction  // mem to level0 ignores this test
+                    && compaction->IsBaseLevelForKey(ikey.user_key))
+                {
+                    // For this user key:
+                    // (1) there is no data in higher levels
+                    // (2) data in lower levels will have larger sequence numbers
+                    // (3) data in layers that are being compacted here and have
+                    //     smaller sequence numbers will be dropped in the next
+                    //     few iterations of this loop (by rule (A) above).
+                    // Therefore this deletion marker is obsolete and can be dropped.
+                    drop = true;
+
+                    if (expire_flag)
+                        ++expired;
+                    else
+                        ++dropped;
+                }   // if
+            }   // else
+
+            last_sequence_for_key = ikey.sequence;
+        }   // else
+    }   // if
+
+#if 0
+    // needs clean up to be used again
+    Log(options_.info_log,
+        "  Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
+        "%d smallest_snapshot: %d",
+        ikey.user_key.ToString().c_str(),
+        (int)ikey.sequence, ikey.type, kTypeValue, drop,
+        compact->compaction->IsBaseLevelForKey(ikey.user_key),
+        (int)last_sequence_for_key, (int)compact->smallest_snapshot);
+#endif
+    return(drop);
+
+}   // KeyRetirement::operator(Slice & )
+
+
 }  // namespace leveldb
diff --git a/src/leveldb/db/dbformat.h b/src/leveldb/db/dbformat.h
index ea897b13c..ec3c80c98 100644
--- a/src/leveldb/db/dbformat.h
+++ b/src/leveldb/db/dbformat.h
@@ -2,13 +2,14 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#ifndef STORAGE_LEVELDB_DB_DBFORMAT_H_
-#define STORAGE_LEVELDB_DB_DBFORMAT_H_
+#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
+#define STORAGE_LEVELDB_DB_FORMAT_H_
 
 #include <stdio.h>
 #include "leveldb/comparator.h"
 #include "leveldb/db.h"
 #include "leveldb/filter_policy.h"
+#include "leveldb/options.h"
 #include "leveldb/slice.h"
 #include "leveldb/table_builder.h"
 #include "util/coding.h"
@@ -16,19 +17,33 @@
 
 namespace leveldb {
 
+class Compaction;
+
 // Grouping of constants.  We may want to make some of these
 // parameters set via options.
 namespace config {
 static const int kNumLevels = 7;
+static const int kNumOverlapLevels = 2;
 
 // Level-0 compaction is started when we hit this many files.
-static const int kL0_CompactionTrigger = 4;
+// Google:  static const size_t kL0_CompactionTrigger = 4;
+static const size_t kL0_CompactionTrigger = 6;
+
+// Level-0 (any overlapped level) number of files where a grooming
+//     compaction could start
+static const size_t kL0_GroomingTrigger = 4;
+static const size_t kL0_GroomingTrigger10min = 2;
+static const size_t kL0_GroomingTrigger20min = 1;
+
+// ... time limits in microseconds
+static const size_t kL0_Grooming10minMicros = 10 * 60 * 1000000;
+static const size_t kL0_Grooming20minMicros = 20 * 60 * 1000000;
 
 // Soft limit on number of level-0 files.  We slow down writes at this point.
-static const int kL0_SlowdownWritesTrigger = 8;
+static const size_t kL0_SlowdownWritesTrigger = 8;
 
 // Maximum number of level-0 files.  We stop writes at this point.
-static const int kL0_StopWritesTrigger = 12;
+static const size_t kL0_StopWritesTrigger = 12;
 
 // Maximum level to which a new compacted memtable is pushed if it
 // does not create overlap.  We try to push to level 2 to avoid the
@@ -36,31 +51,28 @@ static const int kL0_StopWritesTrigger = 12;
 // expensive manifest file operations.  We do not push all the way to
 // the largest level since that can generate a lot of wasted disk
 // space if the same key space is being repeatedly overwritten.
-static const int kMaxMemCompactLevel = 2;
-
-// Approximate gap in bytes between samples of data read during iteration.
-static const int kReadBytesPeriod = 1048576;
+// Basho: push to kNumOverlapLevels +1 ... beyond "landing level"
+static const unsigned kMaxMemCompactLevel = kNumOverlapLevels+1;
 
 }  // namespace config
 
 class InternalKey;
 
-// Value types encoded as the last component of internal keys.
-// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
-// data structures.
-enum ValueType {
-  kTypeDeletion = 0x0,
-  kTypeValue = 0x1
-};
 // kValueTypeForSeek defines the ValueType that should be passed when
 // constructing a ParsedInternalKey object for seeking to a particular
 // sequence number (since we sort sequence numbers in decreasing order
 // and the value type is embedded as the low 8 bits in the sequence
 // number in internal keys, we need to use the highest-numbered
 // ValueType, not the lowest).
+//  Riak note: kValueTypeForSeek is placed within temporary keys
+//             for comparisons.  Using kTypeValueExplicitExpiry would
+//             force more code changes to increase internal key size.
+//             But ValueTypeForSeek is redundant to sequence number for
+//             disambiguaty. Therefore going for easiest path and NOT changing.
 static const ValueType kValueTypeForSeek = kTypeValue;
 
 typedef uint64_t SequenceNumber;
+typedef uint64_t ExpiryTimeMicros;
 
 // We leave eight bits empty at the bottom so a type and sequence#
 // can be packed together into 64-bits.
@@ -69,20 +81,17 @@ static const SequenceNumber kMaxSequenceNumber =
 
 struct ParsedInternalKey {
   Slice user_key;
+  ExpiryTimeMicros expiry;
   SequenceNumber sequence;
   ValueType type;
 
   ParsedInternalKey() { }  // Intentionally left uninitialized (for speed)
-  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
-      : user_key(u), sequence(seq), type(t) { }
+  ParsedInternalKey(const Slice& u, const ExpiryTimeMicros & exp, const SequenceNumber& seq, ValueType t)
+      : user_key(u), expiry(exp), sequence(seq), type(t) { }
   std::string DebugString() const;
+  std::string DebugStringHex() const;
 };
 
-// Return the length of the encoding of "key".
-inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
-  return key.user_key.size() + 8;
-}
-
 // Append the serialization of "key" to *result.
 extern void AppendInternalKey(std::string* result,
                               const ParsedInternalKey& key);
@@ -94,20 +103,76 @@ extern void AppendInternalKey(std::string* result,
 extern bool ParseInternalKey(const Slice& internal_key,
                              ParsedInternalKey* result);
 
-// Returns the user key portion of an internal key.
-inline Slice ExtractUserKey(const Slice& internal_key) {
-  assert(internal_key.size() >= 8);
-  return Slice(internal_key.data(), internal_key.size() - 8);
-}
-
 inline ValueType ExtractValueType(const Slice& internal_key) {
   assert(internal_key.size() >= 8);
   const size_t n = internal_key.size();
-  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
-  unsigned char c = num & 0xff;
+  unsigned char c = DecodeLeastFixed64(internal_key.data() + n - sizeof(SequenceNumber));
   return static_cast<ValueType>(c);
 }
 
+inline size_t KeySuffixSize(ValueType val_type) {
+  size_t ret_val;
+  switch(val_type)
+  {
+      case kTypeDeletion:
+      case kTypeValue:
+          ret_val=sizeof(SequenceNumber);
+          break;
+
+      case kTypeValueWriteTime:
+      case kTypeValueExplicitExpiry:
+          ret_val=sizeof(SequenceNumber) + sizeof(ExpiryTimeMicros);
+          break;
+
+      default:
+          // assert(0);  cannot use because bloom filter block's name is passed as internal key
+          ret_val=sizeof(SequenceNumber);
+          break;
+  }   // switch
+  return(ret_val);
+}
+
+const char * KeyTypeString(ValueType val_type);
+
+inline size_t KeySuffixSize(const Slice & internal_key) {
+    return(KeySuffixSize(ExtractValueType(internal_key)));
+}
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  return Slice(internal_key.data(), internal_key.size() - KeySuffixSize(internal_key));
+}
+
+// Returns the sequence number with ValueType removed
+inline SequenceNumber ExtractSequenceNumber(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  return(DecodeFixed64(internal_key.data() + internal_key.size() - 8)>>8);
+}
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  return key.user_key.size() + KeySuffixSize(key.type);
+}
+
+// Riak: is this an expiry key and therefore contain extra ExpiryTime field
+inline bool IsExpiryKey(ValueType val_type) {
+  return(kTypeValueWriteTime==val_type || kTypeValueExplicitExpiry==val_type);
+}
+
+// Riak: is this an expiry key and therefore contain extra ExpiryTime field
+inline bool IsExpiryKey(const Slice & internal_key) {
+    return(internal_key.size()>=KeySuffixSize(kTypeValueWriteTime)
+           && IsExpiryKey(ExtractValueType(internal_key)));
+}
+
+// Riak: extracts expiry value
+inline ExpiryTimeMicros ExtractExpiry(const Slice& internal_key) {
+  assert(internal_key.size() >= KeySuffixSize(kTypeValueWriteTime));
+  assert(IsExpiryKey(internal_key));
+  return(DecodeFixed64(internal_key.data() + internal_key.size() - KeySuffixSize(kTypeValueWriteTime)));
+}
+
 // A comparator for internal keys that uses a specified comparator for
 // the user key portion and breaks ties by decreasing sequence number.
 class InternalKeyComparator : public Comparator {
@@ -129,7 +194,7 @@ class InternalKeyComparator : public Comparator {
 
 // Filter policy wrapper that converts from internal keys to user keys
 class InternalFilterPolicy : public FilterPolicy {
- private:
+ protected:
   const FilterPolicy* const user_policy_;
  public:
   explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
@@ -138,6 +203,12 @@ class InternalFilterPolicy : public FilterPolicy {
   virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
 };
 
+class InternalFilterPolicy2 : public InternalFilterPolicy {
+ public:
+  explicit InternalFilterPolicy2(const FilterPolicy* p) : InternalFilterPolicy(p) { }
+  virtual ~InternalFilterPolicy2() {delete user_policy_;};
+};
+
 // Modules in this directory should keep internal keys wrapped inside
 // the following class instead of plain strings so that we do not
 // incorrectly use string comparisons instead of an InternalKeyComparator.
@@ -146,8 +217,8 @@ class InternalKey {
   std::string rep_;
  public:
   InternalKey() { }   // Leave rep_ as empty to indicate it is invalid
-  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
-    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
+  InternalKey(const Slice& user_key, ExpiryTimeMicros exp, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(user_key, exp, s, t));
   }
 
   void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
@@ -157,6 +228,7 @@ class InternalKey {
   }
 
   Slice user_key() const { return ExtractUserKey(rep_); }
+  Slice internal_key() const { return Slice(rep_); }
 
   void SetFrom(const ParsedInternalKey& p) {
     rep_.clear();
@@ -181,8 +253,12 @@ inline bool ParseInternalKey(const Slice& internal_key,
   unsigned char c = num & 0xff;
   result->sequence = num >> 8;
   result->type = static_cast<ValueType>(c);
-  result->user_key = Slice(internal_key.data(), n - 8);
-  return (c <= static_cast<unsigned char>(kTypeValue));
+  if (IsExpiryKey((ValueType)c))
+    result->expiry=DecodeFixed64(internal_key.data() + n - KeySuffixSize((ValueType)c));
+  else
+    result->expiry=0;
+  result->user_key = Slice(internal_key.data(), n - KeySuffixSize((ValueType)c));
+  return (c <= static_cast<unsigned char>(kTypeValueExplicitExpiry));
 }
 
 // A helper class useful for DBImpl::Get()
@@ -190,7 +266,7 @@ class LookupKey {
  public:
   // Initialize *this for looking up user_key at a snapshot with
   // the specified sequence number.
-  LookupKey(const Slice& user_key, SequenceNumber sequence);
+  LookupKey(const Slice& user_key, SequenceNumber sequence, KeyMetaData * meta=NULL);
 
   ~LookupKey();
 
@@ -201,12 +277,38 @@ class LookupKey {
   Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
 
   // Return the user key
-  Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
+  Slice user_key() const
+  { return Slice(kstart_, end_ - kstart_ - KeySuffixSize(internal_key())); }
+
+  // did requestor have metadata object?
+  bool WantsKeyMetaData() const {return(NULL!=meta_);};
+
+  void SetKeyMetaData(ValueType type, SequenceNumber seq, ExpiryTimeMicros expiry) const
+  {if (NULL!=meta_)
+    {
+      meta_->m_Type=type;
+      meta_->m_Sequence=seq;
+      meta_->m_Expiry=expiry;
+    } // if
+  };
+
+  void SetKeyMetaData(const ParsedInternalKey & pi_key) const
+  {if (NULL!=meta_)
+    {
+      meta_->m_Type=pi_key.type;
+      meta_->m_Sequence=pi_key.sequence;
+      meta_->m_Expiry=pi_key.expiry;
+    } // if
+  };
+
+  void SetKeyMetaData(const KeyMetaData & meta) const
+  {if (NULL!=meta_) *meta_=meta;};
 
  private:
   // We construct a char array of the form:
   //    klength  varint32               <-- start_
   //    userkey  char[klength]          <-- kstart_
+  //    optional uint64
   //    tag      uint64
   //                                    <-- end_
   // The array is a suitable MemTable key.
@@ -216,6 +318,9 @@ class LookupKey {
   const char* end_;
   char space_[200];      // Avoid allocation for short keys
 
+  // allow code that finds the key to place metadata here, even if 'const'
+  mutable KeyMetaData * meta_;
+
   // No copying allowed
   LookupKey(const LookupKey&);
   void operator=(const LookupKey&);
@@ -223,8 +328,47 @@ class LookupKey {
 
 inline LookupKey::~LookupKey() {
   if (start_ != space_) delete[] start_;
-}
+};
+
+
+// this class was constructed from code with DBImpl::DoCompactionWork (db_impl.cc)
+//   so it could be shared within BuildTable (and thus reduce Level 0 bloating)
+class KeyRetirement
+{
+protected:
+    // "state" from previous key reviewed
+    std::string current_user_key;
+    bool has_current_user_key;
+    SequenceNumber last_sequence_for_key;
+
+    // database values needed for processing
+    const Comparator * user_comparator;
+    SequenceNumber smallest_snapshot;
+    const Options * options;
+    Compaction * const compaction;
+
+    bool valid;
+    size_t dropped;   // tombstone or old version dropped
+    size_t expired;   // expired dropped
+
+public:
+    KeyRetirement(const Comparator * UserComparator, SequenceNumber SmallestSnapshot,
+                  const Options * Opts, Compaction * const Compaction=NULL);
+
+    virtual ~KeyRetirement();
+
+    bool operator()(Slice & key);
+
+    size_t GetDroppedCount() const {return(dropped);};
+    size_t GetExpiredCount() const {return(expired);};
+
+private:
+    KeyRetirement();
+    KeyRetirement(const KeyRetirement &);
+    const KeyRetirement & operator=(const KeyRetirement &);
+
+};  // class KeyRetirement
 
 }  // namespace leveldb
 
-#endif  // STORAGE_LEVELDB_DB_DBFORMAT_H_
+#endif  // STORAGE_LEVELDB_DB_FORMAT_H_
diff --git a/src/leveldb/db/dbformat_test.cc b/src/leveldb/db/dbformat_test.cc
index 5d82f5d31..3ad1cd647 100644
--- a/src/leveldb/db/dbformat_test.cc
+++ b/src/leveldb/db/dbformat_test.cc
@@ -9,10 +9,11 @@
 namespace leveldb {
 
 static std::string IKey(const std::string& user_key,
+                        ExpiryTimeMicros exp,
                         uint64_t seq,
                         ValueType vt) {
   std::string encoded;
-  AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+  AppendInternalKey(&encoded, ParsedInternalKey(user_key, exp, seq, vt));
   return encoded;
 }
 
@@ -29,12 +30,13 @@ static std::string ShortSuccessor(const std::string& s) {
 }
 
 static void TestKey(const std::string& key,
+                    ExpiryTimeMicros exp,
                     uint64_t seq,
                     ValueType vt) {
-  std::string encoded = IKey(key, seq, vt);
+  std::string encoded = IKey(key, exp, seq, vt);
 
   Slice in(encoded);
-  ParsedInternalKey decoded("", 0, kTypeValue);
+  ParsedInternalKey decoded("", 0, 0, kTypeValue);
 
   ASSERT_TRUE(ParseInternalKey(in, &decoded));
   ASSERT_EQ(key, decoded.user_key.ToString());
@@ -56,53 +58,53 @@ TEST(FormatTest, InternalKey_EncodeDecode) {
   };
   for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
     for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
-      TestKey(keys[k], seq[s], kTypeValue);
-      TestKey("hello", 1, kTypeDeletion);
+      TestKey(keys[k], 0, seq[s], kTypeValue);
+      TestKey("hello", 0, 1, kTypeDeletion);
     }
   }
 }
 
 TEST(FormatTest, InternalKeyShortSeparator) {
   // When user keys are same
-  ASSERT_EQ(IKey("foo", 100, kTypeValue),
-            Shorten(IKey("foo", 100, kTypeValue),
-                    IKey("foo", 99, kTypeValue)));
-  ASSERT_EQ(IKey("foo", 100, kTypeValue),
-            Shorten(IKey("foo", 100, kTypeValue),
-                    IKey("foo", 101, kTypeValue)));
-  ASSERT_EQ(IKey("foo", 100, kTypeValue),
-            Shorten(IKey("foo", 100, kTypeValue),
-                    IKey("foo", 100, kTypeValue)));
-  ASSERT_EQ(IKey("foo", 100, kTypeValue),
-            Shorten(IKey("foo", 100, kTypeValue),
-                    IKey("foo", 100, kTypeDeletion)));
+  ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
+            Shorten(IKey("foo", 0, 100, kTypeValue),
+                    IKey("foo", 0, 99, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
+            Shorten(IKey("foo", 0, 100, kTypeValue),
+                    IKey("foo", 0, 101, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
+            Shorten(IKey("foo", 0, 100, kTypeValue),
+                    IKey("foo", 0, 100, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
+            Shorten(IKey("foo", 0, 100, kTypeValue),
+                    IKey("foo", 0, 100, kTypeDeletion)));
 
   // When user keys are misordered
-  ASSERT_EQ(IKey("foo", 100, kTypeValue),
-            Shorten(IKey("foo", 100, kTypeValue),
-                    IKey("bar", 99, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
+            Shorten(IKey("foo", 0, 100, kTypeValue),
+                    IKey("bar", 0, 99, kTypeValue)));
 
   // When user keys are different, but correctly ordered
-  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
-            Shorten(IKey("foo", 100, kTypeValue),
-                    IKey("hello", 200, kTypeValue)));
+  ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek),
+            Shorten(IKey("foo", 0, 100, kTypeValue),
+                    IKey("hello", 0, 200, kTypeValue)));
 
   // When start user key is prefix of limit user key
-  ASSERT_EQ(IKey("foo", 100, kTypeValue),
-            Shorten(IKey("foo", 100, kTypeValue),
-                    IKey("foobar", 200, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
+            Shorten(IKey("foo", 0, 100, kTypeValue),
+                    IKey("foobar", 0, 200, kTypeValue)));
 
   // When limit user key is prefix of start user key
-  ASSERT_EQ(IKey("foobar", 100, kTypeValue),
-            Shorten(IKey("foobar", 100, kTypeValue),
-                    IKey("foo", 200, kTypeValue)));
+  ASSERT_EQ(IKey("foobar", 0, 100, kTypeValue),
+            Shorten(IKey("foobar", 0, 100, kTypeValue),
+                    IKey("foo", 0, 200, kTypeValue)));
 }
 
 TEST(FormatTest, InternalKeyShortestSuccessor) {
-  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
-            ShortSuccessor(IKey("foo", 100, kTypeValue)));
-  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
-            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+  ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek),
+            ShortSuccessor(IKey("foo", 0, 100, kTypeValue)));
+  ASSERT_EQ(IKey("\xff\xff", 0, 100, kTypeValue),
+            ShortSuccessor(IKey("\xff\xff", 0, 100, kTypeValue)));
 }
 
 }  // namespace leveldb
diff --git a/src/leveldb/db/fault_injection_test.cc b/src/leveldb/db/fault_injection_test.cc
deleted file mode 100644
index 875dfe81e..000000000
--- a/src/leveldb/db/fault_injection_test.cc
+++ /dev/null
@@ -1,554 +0,0 @@
-// Copyright 2014 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-// This test uses a custom Env to keep track of the state of a filesystem as of
-// the last "sync". It then checks for data loss errors by purposely dropping
-// file data (or entire files) not protected by a "sync".
-
-#include "leveldb/db.h"
-
-#include <map>
-#include <set>
-#include "db/db_impl.h"
-#include "db/filename.h"
-#include "db/log_format.h"
-#include "db/version_set.h"
-#include "leveldb/cache.h"
-#include "leveldb/env.h"
-#include "leveldb/table.h"
-#include "leveldb/write_batch.h"
-#include "util/logging.h"
-#include "util/mutexlock.h"
-#include "util/testharness.h"
-#include "util/testutil.h"
-
-namespace leveldb {
-
-static const int kValueSize = 1000;
-static const int kMaxNumValues = 2000;
-static const size_t kNumIterations = 3;
-
-class FaultInjectionTestEnv;
-
-namespace {
-
-// Assume a filename, and not a directory name like "/foo/bar/"
-static std::string GetDirName(const std::string filename) {
-  size_t found = filename.find_last_of("/\\");
-  if (found == std::string::npos) {
-    return "";
-  } else {
-    return filename.substr(0, found);
-  }
-}
-
-Status SyncDir(const std::string& dir) {
-  // As this is a test it isn't required to *actually* sync this directory.
-  return Status::OK();
-}
-
-// A basic file truncation function suitable for this test.
-Status Truncate(const std::string& filename, uint64_t length) {
-  leveldb::Env* env = leveldb::Env::Default();
-
-  SequentialFile* orig_file;
-  Status s = env->NewSequentialFile(filename, &orig_file);
-  if (!s.ok())
-    return s;
-
-  char* scratch = new char[length];
-  leveldb::Slice result;
-  s = orig_file->Read(length, &result, scratch);
-  delete orig_file;
-  if (s.ok()) {
-    std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
-    WritableFile* tmp_file;
-    s = env->NewWritableFile(tmp_name, &tmp_file);
-    if (s.ok()) {
-      s = tmp_file->Append(result);
-      delete tmp_file;
-      if (s.ok()) {
-        s = env->RenameFile(tmp_name, filename);
-      } else {
-        env->DeleteFile(tmp_name);
-      }
-    }
-  }
-
-  delete[] scratch;
-
-  return s;
-}
-
-struct FileState {
-  std::string filename_;
-  ssize_t pos_;
-  ssize_t pos_at_last_sync_;
-  ssize_t pos_at_last_flush_;
-
-  FileState(const std::string& filename)
-      : filename_(filename),
-        pos_(-1),
-        pos_at_last_sync_(-1),
-        pos_at_last_flush_(-1) { }
-
-  FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
-
-  bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
-
-  Status DropUnsyncedData() const;
-};
-
-}  // anonymous namespace
-
-// A wrapper around WritableFile which informs another Env whenever this file
-// is written to or sync'ed.
-class TestWritableFile : public WritableFile {
- public:
-  TestWritableFile(const FileState& state,
-                   WritableFile* f,
-                   FaultInjectionTestEnv* env);
-  virtual ~TestWritableFile();
-  virtual Status Append(const Slice& data);
-  virtual Status Close();
-  virtual Status Flush();
-  virtual Status Sync();
-
- private:
-  FileState state_;
-  WritableFile* target_;
-  bool writable_file_opened_;
-  FaultInjectionTestEnv* env_;
-
-  Status SyncParent();
-};
-
-class FaultInjectionTestEnv : public EnvWrapper {
- public:
-  FaultInjectionTestEnv() : EnvWrapper(Env::Default()), filesystem_active_(true) {}
-  virtual ~FaultInjectionTestEnv() { }
-  virtual Status NewWritableFile(const std::string& fname,
-                                 WritableFile** result);
-  virtual Status NewAppendableFile(const std::string& fname,
-                                   WritableFile** result);
-  virtual Status DeleteFile(const std::string& f);
-  virtual Status RenameFile(const std::string& s, const std::string& t);
-
-  void WritableFileClosed(const FileState& state);
-  Status DropUnsyncedFileData();
-  Status DeleteFilesCreatedAfterLastDirSync();
-  void DirWasSynced();
-  bool IsFileCreatedSinceLastDirSync(const std::string& filename);
-  void ResetState();
-  void UntrackFile(const std::string& f);
-  // Setting the filesystem to inactive is the test equivalent to simulating a
-  // system reset. Setting to inactive will freeze our saved filesystem state so
-  // that it will stop being recorded. It can then be reset back to the state at
-  // the time of the reset.
-  bool IsFilesystemActive() const { return filesystem_active_; }
-  void SetFilesystemActive(bool active) { filesystem_active_ = active; }
-
- private:
-  port::Mutex mutex_;
-  std::map<std::string, FileState> db_file_state_;
-  std::set<std::string> new_files_since_last_dir_sync_;
-  bool filesystem_active_;  // Record flushes, syncs, writes
-};
-
-TestWritableFile::TestWritableFile(const FileState& state,
-                                   WritableFile* f,
-                                   FaultInjectionTestEnv* env)
-    : state_(state),
-      target_(f),
-      writable_file_opened_(true),
-      env_(env) {
-  assert(f != NULL);
-}
-
-TestWritableFile::~TestWritableFile() {
-  if (writable_file_opened_) {
-    Close();
-  }
-  delete target_;
-}
-
-Status TestWritableFile::Append(const Slice& data) {
-  Status s = target_->Append(data);
-  if (s.ok() && env_->IsFilesystemActive()) {
-    state_.pos_ += data.size();
-  }
-  return s;
-}
-
-Status TestWritableFile::Close() {
-  writable_file_opened_ = false;
-  Status s = target_->Close();
-  if (s.ok()) {
-    env_->WritableFileClosed(state_);
-  }
-  return s;
-}
-
-Status TestWritableFile::Flush() {
-  Status s = target_->Flush();
-  if (s.ok() && env_->IsFilesystemActive()) {
-    state_.pos_at_last_flush_ = state_.pos_;
-  }
-  return s;
-}
-
-Status TestWritableFile::SyncParent() {
-  Status s = SyncDir(GetDirName(state_.filename_));
-  if (s.ok()) {
-    env_->DirWasSynced();
-  }
-  return s;
-}
-
-Status TestWritableFile::Sync() {
-  if (!env_->IsFilesystemActive()) {
-    return Status::OK();
-  }
-  // Ensure new files referred to by the manifest are in the filesystem.
-  Status s = target_->Sync();
-  if (s.ok()) {
-    state_.pos_at_last_sync_ = state_.pos_;
-  }
-  if (env_->IsFileCreatedSinceLastDirSync(state_.filename_)) {
-    Status ps = SyncParent();
-    if (s.ok() && !ps.ok()) {
-      s = ps;
-    }
-  }
-  return s;
-}
-
-Status FaultInjectionTestEnv::NewWritableFile(const std::string& fname,
-                                              WritableFile** result) {
-  WritableFile* actual_writable_file;
-  Status s = target()->NewWritableFile(fname, &actual_writable_file);
-  if (s.ok()) {
-    FileState state(fname);
-    state.pos_ = 0;
-    *result = new TestWritableFile(state, actual_writable_file, this);
-    // NewWritableFile doesn't append to files, so if the same file is
-    // opened again then it will be truncated - so forget our saved
-    // state.
-    UntrackFile(fname);
-    MutexLock l(&mutex_);
-    new_files_since_last_dir_sync_.insert(fname);
-  }
-  return s;
-}
-
-Status FaultInjectionTestEnv::NewAppendableFile(const std::string& fname,
-                                                WritableFile** result) {
-  WritableFile* actual_writable_file;
-  Status s = target()->NewAppendableFile(fname, &actual_writable_file);
-  if (s.ok()) {
-    FileState state(fname);
-    state.pos_ = 0;
-    {
-      MutexLock l(&mutex_);
-      if (db_file_state_.count(fname) == 0) {
-        new_files_since_last_dir_sync_.insert(fname);
-      } else {
-        state = db_file_state_[fname];
-      }
-    }
-    *result = new TestWritableFile(state, actual_writable_file, this);
-  }
-  return s;
-}
-
-Status FaultInjectionTestEnv::DropUnsyncedFileData() {
-  Status s;
-  MutexLock l(&mutex_);
-  for (std::map<std::string, FileState>::const_iterator it =
-           db_file_state_.begin();
-       s.ok() && it != db_file_state_.end(); ++it) {
-    const FileState& state = it->second;
-    if (!state.IsFullySynced()) {
-      s = state.DropUnsyncedData();
-    }
-  }
-  return s;
-}
-
-void FaultInjectionTestEnv::DirWasSynced() {
-  MutexLock l(&mutex_);
-  new_files_since_last_dir_sync_.clear();
-}
-
-bool FaultInjectionTestEnv::IsFileCreatedSinceLastDirSync(
-    const std::string& filename) {
-  MutexLock l(&mutex_);
-  return new_files_since_last_dir_sync_.find(filename) !=
-         new_files_since_last_dir_sync_.end();
-}
-
-void FaultInjectionTestEnv::UntrackFile(const std::string& f) {
-  MutexLock l(&mutex_);
-  db_file_state_.erase(f);
-  new_files_since_last_dir_sync_.erase(f);
-}
-
-Status FaultInjectionTestEnv::DeleteFile(const std::string& f) {
-  Status s = EnvWrapper::DeleteFile(f);
-  ASSERT_OK(s);
-  if (s.ok()) {
-    UntrackFile(f);
-  }
-  return s;
-}
-
-Status FaultInjectionTestEnv::RenameFile(const std::string& s,
-                                         const std::string& t) {
-  Status ret = EnvWrapper::RenameFile(s, t);
-
-  if (ret.ok()) {
-    MutexLock l(&mutex_);
-    if (db_file_state_.find(s) != db_file_state_.end()) {
-      db_file_state_[t] = db_file_state_[s];
-      db_file_state_.erase(s);
-    }
-
-    if (new_files_since_last_dir_sync_.erase(s) != 0) {
-      assert(new_files_since_last_dir_sync_.find(t) ==
-             new_files_since_last_dir_sync_.end());
-      new_files_since_last_dir_sync_.insert(t);
-    }
-  }
-
-  return ret;
-}
-
-void FaultInjectionTestEnv::ResetState() {
-  // Since we are not destroying the database, the existing files
-  // should keep their recorded synced/flushed state. Therefore
-  // we do not reset db_file_state_ and new_files_since_last_dir_sync_.
-  MutexLock l(&mutex_);
-  SetFilesystemActive(true);
-}
-
-Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() {
-  // Because DeleteFile access this container make a copy to avoid deadlock
-  mutex_.Lock();
-  std::set<std::string> new_files(new_files_since_last_dir_sync_.begin(),
-                                  new_files_since_last_dir_sync_.end());
-  mutex_.Unlock();
-  Status s;
-  std::set<std::string>::const_iterator it;
-  for (it = new_files.begin(); s.ok() && it != new_files.end(); ++it) {
-    s = DeleteFile(*it);
-  }
-  return s;
-}
-
-void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) {
-  MutexLock l(&mutex_);
-  db_file_state_[state.filename_] = state;
-}
-
-Status FileState::DropUnsyncedData() const {
-  ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
-  return Truncate(filename_, sync_pos);
-}
-
-class FaultInjectionTest {
- public:
-  enum ExpectedVerifResult { VAL_EXPECT_NO_ERROR, VAL_EXPECT_ERROR };
-  enum ResetMethod { RESET_DROP_UNSYNCED_DATA, RESET_DELETE_UNSYNCED_FILES };
-
-  FaultInjectionTestEnv* env_;
-  std::string dbname_;
-  Cache* tiny_cache_;
-  Options options_;
-  DB* db_;
-
-  FaultInjectionTest()
-      : env_(new FaultInjectionTestEnv),
-        tiny_cache_(NewLRUCache(100)),
-        db_(NULL) {
-    dbname_ = test::TmpDir() + "/fault_test";
-    DestroyDB(dbname_, Options());  // Destroy any db from earlier run
-    options_.reuse_logs = true;
-    options_.env = env_;
-    options_.paranoid_checks = true;
-    options_.block_cache = tiny_cache_;
-    options_.create_if_missing = true;
-  }
-
-  ~FaultInjectionTest() {
-    CloseDB();
-    DestroyDB(dbname_, Options());
-    delete tiny_cache_;
-    delete env_;
-  }
-
-  void ReuseLogs(bool reuse) {
-    options_.reuse_logs = reuse;
-  }
-
-  void Build(int start_idx, int num_vals) {
-    std::string key_space, value_space;
-    WriteBatch batch;
-    for (int i = start_idx; i < start_idx + num_vals; i++) {
-      Slice key = Key(i, &key_space);
-      batch.Clear();
-      batch.Put(key, Value(i, &value_space));
-      WriteOptions options;
-      ASSERT_OK(db_->Write(options, &batch));
-    }
-  }
-
-  Status ReadValue(int i, std::string* val) const {
-    std::string key_space, value_space;
-    Slice key = Key(i, &key_space);
-    Value(i, &value_space);
-    ReadOptions options;
-    return db_->Get(options, key, val);
-  }
-
-  Status Verify(int start_idx, int num_vals,
-                ExpectedVerifResult expected) const {
-    std::string val;
-    std::string value_space;
-    Status s;
-    for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
-      Value(i, &value_space);
-      s = ReadValue(i, &val);
-      if (expected == VAL_EXPECT_NO_ERROR) {
-        if (s.ok()) {
-          ASSERT_EQ(value_space, val);
-        }
-      } else if (s.ok()) {
-        fprintf(stderr, "Expected an error at %d, but was OK\n", i);
-        s = Status::IOError(dbname_, "Expected value error:");
-      } else {
-        s = Status::OK();  // An expected error
-      }
-    }
-    return s;
-  }
-
-  // Return the ith key
-  Slice Key(int i, std::string* storage) const {
-    char buf[100];
-    snprintf(buf, sizeof(buf), "%016d", i);
-    storage->assign(buf, strlen(buf));
-    return Slice(*storage);
-  }
-
-  // Return the value to associate with the specified key
-  Slice Value(int k, std::string* storage) const {
-    Random r(k);
-    return test::RandomString(&r, kValueSize, storage);
-  }
-
-  Status OpenDB() {
-    delete db_;
-    db_ = NULL;
-    env_->ResetState();
-    return DB::Open(options_, dbname_, &db_);
-  }
-
-  void CloseDB() {
-    delete db_;
-    db_ = NULL;
-  }
-
-  void DeleteAllData() {
-    Iterator* iter = db_->NewIterator(ReadOptions());
-    WriteOptions options;
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
-    }
-
-    delete iter;
-  }
-
-  void ResetDBState(ResetMethod reset_method) {
-    switch (reset_method) {
-      case RESET_DROP_UNSYNCED_DATA:
-        ASSERT_OK(env_->DropUnsyncedFileData());
-        break;
-      case RESET_DELETE_UNSYNCED_FILES:
-        ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
-        break;
-      default:
-        assert(false);
-    }
-  }
-
-  void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
-    DeleteAllData();
-    Build(0, num_pre_sync);
-    db_->CompactRange(NULL, NULL);
-    Build(num_pre_sync, num_post_sync);
-  }
-
-  void PartialCompactTestReopenWithFault(ResetMethod reset_method,
-                                         int num_pre_sync,
-                                         int num_post_sync) {
-    env_->SetFilesystemActive(false);
-    CloseDB();
-    ResetDBState(reset_method);
-    ASSERT_OK(OpenDB());
-    ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::VAL_EXPECT_NO_ERROR));
-    ASSERT_OK(Verify(num_pre_sync, num_post_sync, FaultInjectionTest::VAL_EXPECT_ERROR));
-  }
-
-  void NoWriteTestPreFault() {
-  }
-
-  void NoWriteTestReopenWithFault(ResetMethod reset_method) {
-    CloseDB();
-    ResetDBState(reset_method);
-    ASSERT_OK(OpenDB());
-  }
-
-  void DoTest() {
-    Random rnd(0);
-    ASSERT_OK(OpenDB());
-    for (size_t idx = 0; idx < kNumIterations; idx++) {
-      int num_pre_sync = rnd.Uniform(kMaxNumValues);
-      int num_post_sync = rnd.Uniform(kMaxNumValues);
-
-      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
-      PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA,
-                                        num_pre_sync,
-                                        num_post_sync);
-
-      NoWriteTestPreFault();
-      NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA);
-
-      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
-      // No new files created so we expect all values since no files will be
-      // dropped.
-      PartialCompactTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES,
-                                        num_pre_sync + num_post_sync,
-                                        0);
-
-      NoWriteTestPreFault();
-      NoWriteTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES);
-    }
-  }
-};
-
-TEST(FaultInjectionTest, FaultTestNoLogReuse) {
-  ReuseLogs(false);
-  DoTest();
-}
-
-TEST(FaultInjectionTest, FaultTestWithLogReuse) {
-  ReuseLogs(true);
-  DoTest();
-}
-
-}  // namespace leveldb
-
-int main(int argc, char** argv) {
-  return leveldb::test::RunAllTests();
-}
diff --git a/src/leveldb/db/filename.cc b/src/leveldb/db/filename.cc
index da32946d9..bb90e6560 100644
--- a/src/leveldb/db/filename.cc
+++ b/src/leveldb/db/filename.cc
@@ -4,9 +4,14 @@
 
 #include <ctype.h>
 #include <stdio.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 #include "db/filename.h"
 #include "db/dbformat.h"
+#include "db/version_set.h"
 #include "leveldb/env.h"
+#include "leveldb/status.h"
 #include "util/logging.h"
 
 namespace leveldb {
@@ -24,19 +29,50 @@ static std::string MakeFileName(const std::string& name, uint64_t number,
   return name + buf;
 }
 
+static std::string MakeFileName2(const Options & options, uint64_t number,
+                                 int level, const char* suffix) {
+  char buf[100];
+  if (0<=level)
+      snprintf(buf, sizeof(buf), "/%s_%-d/%06llu.%s",
+               suffix, level,
+               static_cast<unsigned long long>(number),
+               suffix);
+  else if (-1==level)
+      snprintf(buf, sizeof(buf), "/%s/%06llu.%s",
+               suffix,
+               static_cast<unsigned long long>(number),
+               suffix);
+  else if (-2==level)
+      snprintf(buf, sizeof(buf), "/%06llu.%s",
+               static_cast<unsigned long long>(number),
+               suffix);
+
+  return((level<(int)options.tiered_slow_level ?
+          options.tiered_fast_prefix : options.tiered_slow_prefix) + buf);
+}
+
+std::string MakeDirName2(const Options & options,
+                         int level, const char* suffix) {
+  char buf[100];
+  if (-1!=level)
+      snprintf(buf, sizeof(buf), "/%s_%-d",
+               suffix, level);
+  else
+      snprintf(buf, sizeof(buf), "/%s",
+               suffix);
+
+  return((level<(int)options.tiered_slow_level ?
+          options.tiered_fast_prefix : options.tiered_slow_prefix) + buf);
+}
+
 std::string LogFileName(const std::string& name, uint64_t number) {
   assert(number > 0);
   return MakeFileName(name, number, "log");
 }
 
-std::string TableFileName(const std::string& name, uint64_t number) {
+std::string TableFileName(const Options & options, uint64_t number, int level) {
   assert(number > 0);
-  return MakeFileName(name, number, "ldb");
-}
-
-std::string SSTTableFileName(const std::string& name, uint64_t number) {
-  assert(number > 0);
-  return MakeFileName(name, number, "sst");
+  return MakeFileName2(options, number, level, "sst");
 }
 
 std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
@@ -69,6 +105,36 @@ std::string OldInfoLogFileName(const std::string& dbname) {
   return dbname + "/LOG.old";
 }
 
+//
+std::string CowFileName(const std::string& dbname) {
+  return dbname + "/COW";
+}
+
+
+// Append appropriate "backup" string to input path
+std::string BackupPath(const std::string& dbname, int backup_num) {
+    std::string dirname;
+
+  char buf[100];
+  if (0 != backup_num)
+      snprintf(buf, sizeof(buf), "/backup.%-d", backup_num);
+  else
+      snprintf(buf, sizeof(buf), "/backup");
+
+  return(dbname + buf);
+}
+
+
+// update tiered_fast_prefix and tiered_slow_prefix members of
+//  given Options object to point to desired backup path
+bool SetBackupPaths(Options & options, int backup_num) {
+
+    options.tiered_fast_prefix = BackupPath(options.tiered_fast_prefix, backup_num);
+    options.tiered_slow_prefix = BackupPath(options.tiered_slow_prefix, backup_num);
+
+    return(true);
+}
+
 
 // Owned filenames have the form:
 //    dbname/CURRENT
@@ -76,7 +142,8 @@ std::string OldInfoLogFileName(const std::string& dbname) {
 //    dbname/LOG
 //    dbname/LOG.old
 //    dbname/MANIFEST-[0-9]+
-//    dbname/[0-9]+.(log|sst|ldb)
+//    dbname/[0-9]+.(log|sst)
+//    dbname/COW
 bool ParseFileName(const std::string& fname,
                    uint64_t* number,
                    FileType* type) {
@@ -84,6 +151,9 @@ bool ParseFileName(const std::string& fname,
   if (rest == "CURRENT") {
     *number = 0;
     *type = kCurrentFile;
+  } else if (rest == "COW") {
+    *number = 0;
+    *type = kCacheWarming;
   } else if (rest == "LOCK") {
     *number = 0;
     *type = kDBLockFile;
@@ -111,7 +181,7 @@ bool ParseFileName(const std::string& fname,
     Slice suffix = rest;
     if (suffix == Slice(".log")) {
       *type = kLogFile;
-    } else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) {
+    } else if (suffix == Slice(".sst")) {
       *type = kTableFile;
     } else if (suffix == Slice(".dbtmp")) {
       *type = kTempFile;
@@ -141,4 +211,99 @@ Status SetCurrentFile(Env* env, const std::string& dbname,
   return s;
 }
 
+
+Status
+MakeLevelDirectories(Env * env, const Options & options)
+{
+    Status ret_stat;
+    int level;
+    std::string dirname;
+
+    for (level=0; level<config::kNumLevels && ret_stat.ok(); ++level)
+    {
+        dirname=MakeDirName2(options, level, "sst");
+
+        // ignoring error since no way to tell if "bad" error, or "already exists" error
+        env->CreateDir(dirname.c_str());
+    }   // for
+
+    return(ret_stat);
+
+}  // MakeLevelDirectories
+
+
+bool
+TestForLevelDirectories(
+    Env * env,
+    const Options & options,
+    Version * version)
+{
+    bool ret_flag, again;
+    int level;
+    std::string dirname;
+
+    ret_flag=true;
+    again=true;
+
+    // walk backwards, fault will be in higher levels if partial conversion
+    for (level=config::kNumLevels-1; 0<=level && again; --level)
+    {
+        again=false;
+
+        // does directory exist
+        dirname=MakeDirName2(options, level, "sst");
+        ret_flag=env->FileExists(dirname.c_str());
+
+        // do all files exist in level
+        if (ret_flag)
+        {
+            const std::vector<FileMetaData*> & level_files(version->GetFileList(level));
+            std::vector<FileMetaData*>::const_iterator it;
+            std::string table_name;
+            Status s;
+
+            for (it=level_files.begin(); level_files.end()!=it && ret_flag; ++it)
+            {
+                table_name=TableFileName(options, (*it)->number, level);
+                ret_flag=env->FileExists(table_name.c_str());
+            }   // for
+
+            again=ret_flag && 0==level_files.size();
+        }   // if
+    }   // for
+
+    return(ret_flag);
+
+}   // TestForLevelDirectories
+
+std::string       // replacement dbname ... potentially tiered
+MakeTieredDbname(
+    const std::string & dbname,    // input ... original dbname from DBImpl constructor
+    Options & options)             // input/output ... writable Options, tiered values changed
+{
+    // case for "", used with internal calls to DestroyDB
+    if (0==dbname.size() && 0!=options.tiered_fast_prefix.size())
+    {
+        // do NOTHING ... options already initialized
+    }   // if
+    else if (0<(int)options.tiered_slow_level && (int)options.tiered_slow_level<config::kNumLevels
+        && 0!=options.tiered_fast_prefix.size() && 0!=options.tiered_slow_prefix.size())
+    {
+        options.tiered_fast_prefix.append("/");
+        options.tiered_fast_prefix.append(dbname);
+
+        options.tiered_slow_prefix.append("/");
+        options.tiered_slow_prefix.append(dbname);
+    }   // else if
+    else
+    {
+        options.tiered_slow_level=0;
+        options.tiered_fast_prefix=dbname; // duplicate as is
+        options.tiered_slow_prefix=dbname;
+    }   // else
+
+    return(options.tiered_fast_prefix);
+
+}   // MakeTieredDbname
+
 }  // namespace leveldb
diff --git a/src/leveldb/db/filename.h b/src/leveldb/db/filename.h
index 87a752605..15050c9c2 100644
--- a/src/leveldb/db/filename.h
+++ b/src/leveldb/db/filename.h
@@ -9,6 +9,7 @@
 
 #include <stdint.h>
 #include <string>
+#include "leveldb/options.h"
 #include "leveldb/slice.h"
 #include "leveldb/status.h"
 #include "port/port.h"
@@ -16,6 +17,7 @@
 namespace leveldb {
 
 class Env;
+class Version;
 
 enum FileType {
   kLogFile,
@@ -24,9 +26,24 @@ enum FileType {
   kDescriptorFile,
   kCurrentFile,
   kTempFile,
-  kInfoLogFile  // Either the current one, or an old one
+  kInfoLogFile,  // Either the current one, or an old one
+  kCacheWarming
 };
 
+// Riak specific routine to help create sst_? subdirectory names
+std::string MakeDirName2(const Options & options,
+                         int level, const char* suffix);
+
+// Riak specific routine to help create sst_? subdirectories
+Status MakeLevelDirectories(Env * env, const Options & options);
+
+// Riak specific routine to test if sst_? subdirectories exist
+bool TestForLevelDirectories(Env * env, const Options & options, class Version *);
+
+// Riak specific routine to standardize conversion of dbname and
+//  Options' tiered directories (options parameter is MODIFIED)
+std::string MakeTieredDbname(const std::string &dbname, Options & options_rw);
+
 // Return the name of the log file with the specified number
 // in the db named by "dbname".  The result will be prefixed with
 // "dbname".
@@ -35,12 +52,8 @@ extern std::string LogFileName(const std::string& dbname, uint64_t number);
 // Return the name of the sstable with the specified number
 // in the db named by "dbname".  The result will be prefixed with
 // "dbname".
-extern std::string TableFileName(const std::string& dbname, uint64_t number);
-
-// Return the legacy file name for an sstable with the specified number
-// in the db named by "dbname". The result will be prefixed with
-// "dbname".
-extern std::string SSTTableFileName(const std::string& dbname, uint64_t number);
+extern std::string TableFileName(const Options & options, uint64_t number,
+                                 int level);
 
 // Return the name of the descriptor file for the db named by
 // "dbname" and the specified incarnation number.  The result will be
@@ -67,10 +80,21 @@ extern std::string InfoLogFileName(const std::string& dbname);
 // Return the name of the old info log file for "dbname".
 extern std::string OldInfoLogFileName(const std::string& dbname);
 
+// Return the name of the cache object file for the db named by
+// "dbname".  The result will be prefixed with "dbname".
+extern std::string CowFileName(const std::string& dbname);
+
+// Append appropriate "backup" string to input path
+extern std::string BackupPath(const std::string& dbname, int backup_num);
+
+// update tiered_fast_prefix and tiered_slow_prefix members of
+//  given Options object to point to backup path
+extern bool SetBackupPaths(Options & options, int backup_num);
+
 // If filename is a leveldb file, store the type of the file in *type.
 // The number encoded in the filename is stored in *number.  If the
 // filename was successfully parsed, returns true.  Else return false.
-extern bool ParseFileName(const std::string& filename,
+extern bool ParseFileName(const std::string& tiered_filename,
                           uint64_t* number,
                           FileType* type);
 
diff --git a/src/leveldb/db/filename_test.cc b/src/leveldb/db/filename_test.cc
index a32556dea..a075f9b71 100644
--- a/src/leveldb/db/filename_test.cc
+++ b/src/leveldb/db/filename_test.cc
@@ -27,7 +27,6 @@ TEST(FileNameTest, Parse) {
     { "100.log",            100,   kLogFile },
     { "0.log",              0,     kLogFile },
     { "0.sst",              0,     kTableFile },
-    { "0.ldb",              0,     kTableFile },
     { "CURRENT",            0,     kCurrentFile },
     { "LOCK",               0,     kDBLockFile },
     { "MANIFEST-2",         2,     kDescriptorFile },
@@ -71,13 +70,14 @@ TEST(FileNameTest, Parse) {
   for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
     std::string f = errors[i];
     ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
-  }
+  };
 }
 
 TEST(FileNameTest, Construction) {
   uint64_t number;
   FileType type;
   std::string fname;
+  Options options;
 
   fname = CurrentFileName("foo");
   ASSERT_EQ("foo/", std::string(fname.data(), 4));
@@ -97,12 +97,40 @@ TEST(FileNameTest, Construction) {
   ASSERT_EQ(192, number);
   ASSERT_EQ(kLogFile, type);
 
-  fname = TableFileName("bar", 200);
+  options.tiered_fast_prefix="bar";
+  options.tiered_slow_prefix="bar";
+  fname = TableFileName(options, 200, 1);
   ASSERT_EQ("bar/", std::string(fname.data(), 4));
-  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ("sst_1/", std::string(fname.substr(4,6)));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type));
   ASSERT_EQ(200, number);
   ASSERT_EQ(kTableFile, type);
 
+  fname = TableFileName(options, 400, 4);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_EQ("sst_4/", std::string(fname.substr(4,6)));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type));
+  ASSERT_EQ(400, number);
+  ASSERT_EQ(kTableFile, type);
+
+  options.tiered_slow_level=4;
+  options.tiered_fast_prefix="fast";
+  options.tiered_slow_prefix="slow";
+  fname = TableFileName(options, 500, 3);
+  ASSERT_EQ("fast/", std::string(fname.data(), 5));
+  ASSERT_EQ("sst_3/", std::string(fname.substr(5,6)));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type));
+  ASSERT_EQ(500, number);
+  ASSERT_EQ(kTableFile, type);
+
+  fname = TableFileName(options, 600, 4);
+  ASSERT_EQ("slow/", std::string(fname.data(), 5));
+  ASSERT_EQ("sst_4/", std::string(fname.substr(5,6)));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type));
+  ASSERT_EQ(600, number);
+  ASSERT_EQ(kTableFile, type);
+
+
   fname = DescriptorFileName("bar", 100);
   ASSERT_EQ("bar/", std::string(fname.data(), 4));
   ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
@@ -114,6 +142,48 @@ TEST(FileNameTest, Construction) {
   ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
   ASSERT_EQ(999, number);
   ASSERT_EQ(kTempFile, type);
+
+  fname = CowFileName("/what/goes/moo");
+  ASSERT_EQ("/what/goes/moo/COW", fname);
+
+  fname = BackupPath("/var/db/riak/data/leveldb/0",0);
+  ASSERT_EQ("/var/db/riak/data/leveldb/0/backup", fname);
+
+  fname = BackupPath("/var/db/riak/data/leveldb/0",1);
+  ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.1", fname);
+
+  fname = BackupPath("/var/db/riak/data/leveldb/0",5);
+  ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.5", fname);
+
+  options.tiered_slow_level=4;
+  options.tiered_fast_prefix="fast";
+  options.tiered_slow_prefix="slow";
+  fname = SetBackupPaths(options,0);
+  ASSERT_EQ("fast/backup", options.tiered_fast_prefix);
+  ASSERT_EQ("slow/backup", options.tiered_slow_prefix);
+
+  options.tiered_slow_level=4;
+  options.tiered_fast_prefix="fast";
+  options.tiered_slow_prefix="slow";
+  fname = SetBackupPaths(options,3);
+  ASSERT_EQ("fast/backup.3", options.tiered_fast_prefix);
+  ASSERT_EQ("slow/backup.3", options.tiered_slow_prefix);
+
+
+  options.tiered_slow_level=4;
+  options.tiered_fast_prefix="//mnt/fast";
+  options.tiered_slow_prefix="//mnt/slow";
+  fname=MakeTieredDbname("riak/data/leveldb", options);
+  ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname);
+  ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix);
+  ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix);
+
+  // special case with no dbname given, should have no changes
+  fname=MakeTieredDbname("", options);
+  ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname);
+  ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix);
+  ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix);
+
 }
 
 }  // namespace leveldb
diff --git a/src/leveldb/db/leveldbutil.cc b/src/leveldb/db/leveldbutil.cc
deleted file mode 100644
index d06d64d64..000000000
--- a/src/leveldb/db/leveldbutil.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include <stdio.h>
-#include "leveldb/dumpfile.h"
-#include "leveldb/env.h"
-#include "leveldb/status.h"
-
-namespace leveldb {
-namespace {
-
-class StdoutPrinter : public WritableFile {
- public:
-  virtual Status Append(const Slice& data) {
-    fwrite(data.data(), 1, data.size(), stdout);
-    return Status::OK();
-  }
-  virtual Status Close() { return Status::OK(); }
-  virtual Status Flush() { return Status::OK(); }
-  virtual Status Sync() { return Status::OK(); }
-  virtual std::string GetName() const { return "[stdout]"; }
-};
-
-bool HandleDumpCommand(Env* env, char** files, int num) {
-  StdoutPrinter printer;
-  bool ok = true;
-  for (int i = 0; i < num; i++) {
-    Status s = DumpFile(env, files[i], &printer);
-    if (!s.ok()) {
-      fprintf(stderr, "%s\n", s.ToString().c_str());
-      ok = false;
-    }
-  }
-  return ok;
-}
-
-}  // namespace
-}  // namespace leveldb
-
-static void Usage() {
-  fprintf(
-      stderr,
-      "Usage: leveldbutil command...\n"
-      "   dump files...         -- dump contents of specified files\n"
-      );
-}
-
-int main(int argc, char** argv) {
-  leveldb::Env* env = leveldb::Env::Default();
-  bool ok = true;
-  if (argc < 2) {
-    Usage();
-    ok = false;
-  } else {
-    std::string command = argv[1];
-    if (command == "dump") {
-      ok = leveldb::HandleDumpCommand(env, argv+2, argc-2);
-    } else {
-      Usage();
-      ok = false;
-    }
-  }
-  return (ok ? 0 : 1);
-}
diff --git a/src/leveldb/db/log_format.h b/src/leveldb/db/log_format.h
index 356e69fca..2690cb978 100644
--- a/src/leveldb/db/log_format.h
+++ b/src/leveldb/db/log_format.h
@@ -3,7 +3,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 //
 // Log format information shared by reader and writer.
-// See ../doc/log_format.md for more detail.
+// See ../doc/log_format.txt for more detail.
 
 #ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
 #define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
@@ -26,8 +26,8 @@ static const int kMaxRecordType = kLastType;
 
 static const int kBlockSize = 32768;
 
-// Header is checksum (4 bytes), length (2 bytes), type (1 byte).
-static const int kHeaderSize = 4 + 2 + 1;
+// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
+static const int kHeaderSize = 4 + 1 + 2;
 
 }  // namespace log
 }  // namespace leveldb
diff --git a/src/leveldb/db/log_reader.cc b/src/leveldb/db/log_reader.cc
index 8b6ad136d..ddd620246 100644
--- a/src/leveldb/db/log_reader.cc
+++ b/src/leveldb/db/log_reader.cc
@@ -25,8 +25,7 @@ Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
       eof_(false),
       last_record_offset_(0),
       end_of_buffer_offset_(0),
-      initial_offset_(initial_offset),
-      resyncing_(initial_offset > 0) {
+      initial_offset_(initial_offset) {
 }
 
 Reader::~Reader() {
@@ -73,25 +72,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
 
   Slice fragment;
   while (true) {
+    uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
     const unsigned int record_type = ReadPhysicalRecord(&fragment);
-
-    // ReadPhysicalRecord may have only had an empty trailer remaining in its
-    // internal buffer. Calculate the offset of the next physical record now
-    // that it has returned, properly accounting for its header size.
-    uint64_t physical_record_offset =
-        end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size();
-
-    if (resyncing_) {
-      if (record_type == kMiddleType) {
-        continue;
-      } else if (record_type == kLastType) {
-        resyncing_ = false;
-        continue;
-      } else {
-        resyncing_ = false;
-      }
-    }
-
     switch (record_type) {
       case kFullType:
         if (in_fragmented_record) {
@@ -151,9 +133,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
 
       case kEof:
         if (in_fragmented_record) {
-          // This can be caused by the writer dying immediately after
-          // writing a physical record but before completing the next; don't
-          // treat it as a corruption, just ignore the entire logical record.
+          ReportCorruption(scratch->size(), "partial record without end(3)");
           scratch->clear();
         }
         return false;
@@ -185,20 +165,20 @@ uint64_t Reader::LastRecordOffset() {
   return last_record_offset_;
 }
 
-void Reader::ReportCorruption(uint64_t bytes, const char* reason) {
-  ReportDrop(bytes, Status::Corruption(reason, file_->GetName()));
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+  ReportDrop(bytes, Status::Corruption(reason));
 }
 
-void Reader::ReportDrop(uint64_t bytes, const Status& reason) {
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
   if (reporter_ != NULL &&
       end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
-    reporter_->Corruption(static_cast<size_t>(bytes), reason);
+    reporter_->Corruption(bytes, reason);
   }
 }
 
 unsigned int Reader::ReadPhysicalRecord(Slice* result) {
   while (true) {
-    if (buffer_.size() < kHeaderSize) {
+    if (buffer_.size() < (size_t)kHeaderSize) {
       if (!eof_) {
         // Last read was a full read, so this is a trailer to skip
         buffer_.clear();
@@ -209,16 +189,17 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
           ReportDrop(kBlockSize, status);
           eof_ = true;
           return kEof;
-        } else if (buffer_.size() < kBlockSize) {
+        } else if (buffer_.size() < (size_t)kBlockSize) {
           eof_ = true;
         }
         continue;
+      } else if (buffer_.size() == 0) {
+        // End of file
+        return kEof;
       } else {
-        // Note that if buffer_ is non-empty, we have a truncated header at the
-        // end of the file, which can be caused by the writer crashing in the
-        // middle of writing the header. Instead of considering this an error,
-        // just report EOF.
+        size_t drop_size = buffer_.size();
         buffer_.clear();
+        ReportCorruption(drop_size, "truncated record at end of file");
         return kEof;
       }
     }
@@ -232,14 +213,8 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
     if (kHeaderSize + length > buffer_.size()) {
       size_t drop_size = buffer_.size();
       buffer_.clear();
-      if (!eof_) {
-        ReportCorruption(drop_size, "bad record length");
-        return kBadRecord;
-      }
-      // If the end of the file has been reached without reading |length| bytes
-      // of payload, assume the writer died in the middle of writing the record.
-      // Don't report a corruption.
-      return kEof;
+      ReportCorruption(drop_size, "bad record length");
+      return kBadRecord;
     }
 
     if (type == kZeroType && length == 0) {
diff --git a/src/leveldb/db/log_reader.h b/src/leveldb/db/log_reader.h
index 8389d61f8..82d4bee68 100644
--- a/src/leveldb/db/log_reader.h
+++ b/src/leveldb/db/log_reader.h
@@ -73,11 +73,6 @@ class Reader {
   // Offset at which to start looking for the first record to return
   uint64_t const initial_offset_;
 
-  // True if we are resynchronizing after a seek (initial_offset_ > 0). In
-  // particular, a run of kMiddleType and kLastType records can be silently
-  // skipped in this mode
-  bool resyncing_;
-
   // Extend record types with the following special values
   enum {
     kEof = kMaxRecordType + 1,
@@ -99,8 +94,8 @@ class Reader {
 
   // Reports dropped bytes to the reporter.
   // buffer_ must be updated to remove the dropped bytes prior to invocation.
-  void ReportCorruption(uint64_t bytes, const char* reason);
-  void ReportDrop(uint64_t bytes, const Status& reason);
+  void ReportCorruption(size_t bytes, const char* reason);
+  void ReportDrop(size_t bytes, const Status& reason);
 
   // No copying allowed
   Reader(const Reader&);
diff --git a/src/leveldb/db/log_test.cc b/src/leveldb/db/log_test.cc
index 48a592865..4c5cf8757 100644
--- a/src/leveldb/db/log_test.cc
+++ b/src/leveldb/db/log_test.cc
@@ -79,7 +79,7 @@ class LogTest {
     virtual Status Skip(uint64_t n) {
       if (n > contents_.size()) {
         contents_.clear();
-        return Status::NotFound("in-memory file skipped past end");
+        return Status::NotFound("in-memory file skipepd past end");
       }
 
       contents_.remove_prefix(n);
@@ -104,34 +104,23 @@ class LogTest {
   StringSource source_;
   ReportCollector report_;
   bool reading_;
-  Writer* writer_;
-  Reader* reader_;
+  Writer writer_;
+  Reader reader_;
 
   // Record metadata for testing initial offset functionality
   static size_t initial_offset_record_sizes_[];
   static uint64_t initial_offset_last_record_offsets_[];
-  static int num_initial_offset_records_;
 
  public:
   LogTest() : reading_(false),
-              writer_(new Writer(&dest_)),
-              reader_(new Reader(&source_, &report_, true/*checksum*/,
-                      0/*initial_offset*/)) {
-  }
-
-  ~LogTest() {
-    delete writer_;
-    delete reader_;
-  }
-
-  void ReopenForAppend() {
-    delete writer_;
-    writer_ = new Writer(&dest_, dest_.contents_.size());
+              writer_(&dest_),
+              reader_(&source_, &report_, true/*checksum*/,
+                      0/*initial_offset*/) {
   }
 
   void Write(const std::string& msg) {
     ASSERT_TRUE(!reading_) << "Write() after starting to read";
-    writer_->AddRecord(Slice(msg));
+    writer_.AddRecord(Slice(msg));
   }
 
   size_t WrittenBytes() const {
@@ -145,7 +134,7 @@ class LogTest {
     }
     std::string scratch;
     Slice record;
-    if (reader_->ReadRecord(&record, &scratch)) {
+    if (reader_.ReadRecord(&record, &scratch)) {
       return record.ToString();
     } else {
       return "EOF";
@@ -193,18 +182,13 @@ class LogTest {
   }
 
   void WriteInitialOffsetLog() {
-    for (int i = 0; i < num_initial_offset_records_; i++) {
+    for (int i = 0; i < 4; i++) {
       std::string record(initial_offset_record_sizes_[i],
                          static_cast<char>('a' + i));
       Write(record);
     }
   }
 
-  void StartReadingAt(uint64_t initial_offset) {
-    delete reader_;
-    reader_ = new Reader(&source_, &report_, true/*checksum*/, initial_offset);
-  }
-
   void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
     WriteInitialOffsetLog();
     reading_ = true;
@@ -224,48 +208,32 @@ class LogTest {
     source_.contents_ = Slice(dest_.contents_);
     Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
                                        initial_offset);
-
-    // Read all records from expected_record_offset through the last one.
-    ASSERT_LT(expected_record_offset, num_initial_offset_records_);
-    for (; expected_record_offset < num_initial_offset_records_;
-         ++expected_record_offset) {
-      Slice record;
-      std::string scratch;
-      ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
-      ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
-                record.size());
-      ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
-                offset_reader->LastRecordOffset());
-      ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
-    }
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
+    ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
+              record.size());
+    ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
+              offset_reader->LastRecordOffset());
+    ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
     delete offset_reader;
   }
+
 };
 
 size_t LogTest::initial_offset_record_sizes_[] =
     {10000,  // Two sizable records in first block
      10000,
      2 * log::kBlockSize - 1000,  // Span three blocks
-     1,
-     13716,  // Consume all but two bytes of block 3.
-     log::kBlockSize - kHeaderSize, // Consume the entirety of block 4.
-    };
+     1};
 
 uint64_t LogTest::initial_offset_last_record_offsets_[] =
     {0,
      kHeaderSize + 10000,
      2 * (kHeaderSize + 10000),
      2 * (kHeaderSize + 10000) +
-         (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
-     2 * (kHeaderSize + 10000) +
-         (2 * log::kBlockSize - 1000) + 3 * kHeaderSize
-         + kHeaderSize + 1,
-     3 * log::kBlockSize,
-    };
+         (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
 
-// LogTest::initial_offset_last_record_offsets_ must be defined before this.
-int LogTest::num_initial_offset_records_ =
-    sizeof(LogTest::initial_offset_last_record_offsets_)/sizeof(uint64_t);
 
 TEST(LogTest, Empty) {
   ASSERT_EQ("EOF", Read());
@@ -350,15 +318,6 @@ TEST(LogTest, AlignedEof) {
   ASSERT_EQ("EOF", Read());
 }
 
-TEST(LogTest, OpenForAppend) {
-  Write("hello");
-  ReopenForAppend();
-  Write("world");
-  ASSERT_EQ("hello", Read());
-  ASSERT_EQ("world", Read());
-  ASSERT_EQ("EOF", Read());
-}
-
 TEST(LogTest, RandomRead) {
   const int N = 500;
   Random write_rnd(301);
@@ -392,32 +351,20 @@ TEST(LogTest, BadRecordType) {
   ASSERT_EQ("OK", MatchError("unknown record type"));
 }
 
-TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
+TEST(LogTest, TruncatedTrailingRecord) {
   Write("foo");
   ShrinkSize(4);   // Drop all payload as well as a header byte
   ASSERT_EQ("EOF", Read());
-  // Truncated last record is ignored, not treated as an error.
-  ASSERT_EQ(0, DroppedBytes());
-  ASSERT_EQ("", ReportMessage());
+  ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("truncated record at end of file"));
 }
 
 TEST(LogTest, BadLength) {
-  const int kPayloadSize = kBlockSize - kHeaderSize;
-  Write(BigString("bar", kPayloadSize));
-  Write("foo");
-  // Least significant size byte is stored in header[4].
-  IncrementByte(4, 1);
-  ASSERT_EQ("foo", Read());
-  ASSERT_EQ(kBlockSize, DroppedBytes());
-  ASSERT_EQ("OK", MatchError("bad record length"));
-}
-
-TEST(LogTest, BadLengthAtEndIsIgnored) {
   Write("foo");
   ShrinkSize(1);
   ASSERT_EQ("EOF", Read());
-  ASSERT_EQ(0, DroppedBytes());
-  ASSERT_EQ("", ReportMessage());
+  ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("bad record length"));
 }
 
 TEST(LogTest, ChecksumMismatch) {
@@ -468,40 +415,6 @@ TEST(LogTest, UnexpectedFirstType) {
   ASSERT_EQ("OK", MatchError("partial record without end"));
 }
 
-TEST(LogTest, MissingLastIsIgnored) {
-  Write(BigString("bar", kBlockSize));
-  // Remove the LAST block, including header.
-  ShrinkSize(14);
-  ASSERT_EQ("EOF", Read());
-  ASSERT_EQ("", ReportMessage());
-  ASSERT_EQ(0, DroppedBytes());
-}
-
-TEST(LogTest, PartialLastIsIgnored) {
-  Write(BigString("bar", kBlockSize));
-  // Cause a bad record length in the LAST block.
-  ShrinkSize(1);
-  ASSERT_EQ("EOF", Read());
-  ASSERT_EQ("", ReportMessage());
-  ASSERT_EQ(0, DroppedBytes());
-}
-
-TEST(LogTest, SkipIntoMultiRecord) {
-  // Consider a fragmented record:
-  //    first(R1), middle(R1), last(R1), first(R2)
-  // If initial_offset points to a record after first(R1) but before first(R2)
-  // incomplete fragment errors are not actual errors, and must be suppressed
-  // until a new first or full record is encountered.
-  Write(BigString("foo", 3*kBlockSize));
-  Write("correct");
-  StartReadingAt(kBlockSize);
-
-  ASSERT_EQ("correct", Read());
-  ASSERT_EQ("", ReportMessage());
-  ASSERT_EQ(0, DroppedBytes());
-  ASSERT_EQ("EOF", Read());
-}
-
 TEST(LogTest, ErrorJoinsRecords) {
   // Consider two fragmented records:
   //    first(R1) last(R1) first(R2) last(R2)
@@ -520,7 +433,7 @@ TEST(LogTest, ErrorJoinsRecords) {
 
   ASSERT_EQ("correct", Read());
   ASSERT_EQ("EOF", Read());
-  const size_t dropped = DroppedBytes();
+  const int dropped = DroppedBytes();
   ASSERT_LE(dropped, 2*kBlockSize + 100);
   ASSERT_GE(dropped, 2*kBlockSize);
 }
@@ -571,10 +484,6 @@ TEST(LogTest, ReadFourthStart) {
       3);
 }
 
-TEST(LogTest, ReadInitialOffsetIntoBlockPadding) {
-  CheckInitialOffsetRecord(3 * log::kBlockSize - 3, 5);
-}
-
 TEST(LogTest, ReadEnd) {
   CheckOffsetPastEndReturnsNoRecords(0);
 }
diff --git a/src/leveldb/db/log_writer.cc b/src/leveldb/db/log_writer.cc
index 74a03270d..18c7bd837 100644
--- a/src/leveldb/db/log_writer.cc
+++ b/src/leveldb/db/log_writer.cc
@@ -12,22 +12,13 @@
 namespace leveldb {
 namespace log {
 
-static void InitTypeCrc(uint32_t* type_crc) {
-  for (int i = 0; i <= kMaxRecordType; i++) {
-    char t = static_cast<char>(i);
-    type_crc[i] = crc32c::Value(&t, 1);
-  }
-}
-
 Writer::Writer(WritableFile* dest)
     : dest_(dest),
       block_offset_(0) {
-  InitTypeCrc(type_crc_);
-}
-
-Writer::Writer(WritableFile* dest, uint64_t dest_length)
-    : dest_(dest), block_offset_(dest_length % kBlockSize) {
-  InitTypeCrc(type_crc_);
+  for (int i = 0; i <= kMaxRecordType; i++) {
+    char t = static_cast<char>(i);
+    type_crc_[i] = crc32c::Value(&t, 1);
+  }
 }
 
 Writer::~Writer() {
@@ -83,7 +74,7 @@ Status Writer::AddRecord(const Slice& slice) {
 
 Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
   assert(n <= 0xffff);  // Must fit in two bytes
-  assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+  assert(block_offset_ + kHeaderSize + (int)n <= kBlockSize);
 
   // Format the header
   char buf[kHeaderSize];
diff --git a/src/leveldb/db/log_writer.h b/src/leveldb/db/log_writer.h
index 9e7cc4705..c6ad7a4ff 100644
--- a/src/leveldb/db/log_writer.h
+++ b/src/leveldb/db/log_writer.h
@@ -9,11 +9,10 @@
 #include "db/log_format.h"
 #include "leveldb/slice.h"
 #include "leveldb/status.h"
+#include "leveldb/env.h"
 
 namespace leveldb {
 
-class WritableFile;
-
 namespace log {
 
 class Writer {
@@ -22,16 +21,12 @@ class Writer {
   // "*dest" must be initially empty.
   // "*dest" must remain live while this Writer is in use.
   explicit Writer(WritableFile* dest);
-
-  // Create a writer that will append data to "*dest".
-  // "*dest" must have initial length "dest_length".
-  // "*dest" must remain live while this Writer is in use.
-  Writer(WritableFile* dest, uint64_t dest_length);
-
   ~Writer();
 
   Status AddRecord(const Slice& slice);
 
+  void Close() {delete dest_; dest_=NULL;};
+
  private:
   WritableFile* dest_;
   int block_offset_;       // Current offset in block
diff --git a/src/leveldb/db/memtable.cc b/src/leveldb/db/memtable.cc
index 287afdbdc..965c9d9c0 100644
--- a/src/leveldb/db/memtable.cc
+++ b/src/leveldb/db/memtable.cc
@@ -6,6 +6,7 @@
 #include "db/dbformat.h"
 #include "leveldb/comparator.h"
 #include "leveldb/env.h"
+#include "leveldb/expiry.h"
 #include "leveldb/iterator.h"
 #include "util/coding.h"
 
@@ -63,6 +64,8 @@ class MemTableIterator: public Iterator {
     Slice key_slice = GetLengthPrefixedSlice(iter_.key());
     return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
   }
+  virtual KeyMetaData & keymetadata() const
+   {MemTable::DecodeKeyMetaData(iter_.key(), keymetadata_); return(keymetadata_);};
 
   virtual Status status() const { return Status::OK(); }
 
@@ -81,7 +84,8 @@ Iterator* MemTable::NewIterator() {
 
 void MemTable::Add(SequenceNumber s, ValueType type,
                    const Slice& key,
-                   const Slice& value) {
+                   const Slice& value,
+                   const ExpiryTimeMicros & expiry) {
   // Format of an entry is concatenation of:
   //  key_size     : varint32 of internal_key.size()
   //  key bytes    : char[internal_key.size()]
@@ -89,7 +93,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   //  value bytes  : char[value.size()]
   size_t key_size = key.size();
   size_t val_size = value.size();
-  size_t internal_key_size = key_size + 8;
+  size_t internal_key_size = key_size + KeySuffixSize(type);
   const size_t encoded_len =
       VarintLength(internal_key_size) + internal_key_size +
       VarintLength(val_size) + val_size;
@@ -97,15 +101,22 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   char* p = EncodeVarint32(buf, internal_key_size);
   memcpy(p, key.data(), key_size);
   p += key_size;
+  if (IsExpiryKey(type))
+  {
+      EncodeFixed64(p, expiry);
+      p+=8;
+  }
   EncodeFixed64(p, (s << 8) | type);
   p += 8;
   p = EncodeVarint32(p, val_size);
   memcpy(p, value.data(), val_size);
-  assert(p + val_size == buf + encoded_len);
+  assert((size_t)((p + val_size) - buf) == encoded_len);
   table_.Insert(buf);
 }
 
-bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
+bool MemTable::Get(const LookupKey& key, Value* value, Status* s,
+    const Options * options) {
+  bool ret_flag(false);
   Slice memkey = key.memtable_key();
   Table::Iterator iter(&table_);
   iter.Seek(memkey.data());
@@ -113,6 +124,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
     // entry format is:
     //    klength  varint32
     //    userkey  char[klength]
+    //    optional uint64
     //    tag      uint64
     //    vlength  varint32
     //    value    char[vlength]
@@ -122,24 +134,66 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
     const char* entry = iter.key();
     uint32_t key_length;
     const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length);
+    Slice internal_key(key_ptr, key_length);
     if (comparator_.comparator.user_comparator()->Compare(
-            Slice(key_ptr, key_length - 8),
+            ExtractUserKey(internal_key),
             key.user_key()) == 0) {
       // Correct user key
-      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
-      switch (static_cast<ValueType>(tag & 0xff)) {
-        case kTypeValue: {
+      KeyMetaData meta;
+      DecodeKeyMetaData(entry, meta);
+
+      switch (meta.m_Type) {
+        case kTypeValueWriteTime:
+        case kTypeValueExplicitExpiry:
+        {
+            bool expired=false;
+            if (NULL!=options && options->ExpiryActivated())
+                expired=options->expiry_module->MemTableCallback(internal_key);
+            if (expired)
+            {
+                // like kTypeDeletion
+                *s = Status::NotFound(Slice());
+                ret_flag=true;
+                break;
+            }   // if
+            //otherwise fall into kTypeValue code
+        }   // case
+
+        case kTypeValue:
+        {
           Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
           value->assign(v.data(), v.size());
-          return true;
+          ret_flag=true;
+          break;
         }
         case kTypeDeletion:
           *s = Status::NotFound(Slice());
-          return true;
-      }
+          ret_flag=true;
+          break;
+      } // switch
+
+      // only unpack metadata if requested
+      if (key.WantsKeyMetaData())
+          key.SetKeyMetaData(meta);
     }
   }
-  return false;
+  return ret_flag;
 }
 
+// this is a static function
+void MemTable::DecodeKeyMetaData(
+    const char * key,
+    KeyMetaData & meta)
+{
+    Slice key_slice = GetLengthPrefixedSlice(key);
+
+    meta.m_Type=ExtractValueType(key_slice);
+    meta.m_Sequence=ExtractSequenceNumber(key_slice);
+    if (IsExpiryKey(meta.m_Type))
+        meta.m_Expiry=ExtractExpiry(key_slice);
+    else
+        meta.m_Expiry=0;
+
+} // DecodeKeyMetaData
+
 }  // namespace leveldb
diff --git a/src/leveldb/db/memtable.h b/src/leveldb/db/memtable.h
index 9f41567cd..ff0e98220 100644
--- a/src/leveldb/db/memtable.h
+++ b/src/leveldb/db/memtable.h
@@ -24,10 +24,10 @@ class MemTable {
   explicit MemTable(const InternalKeyComparator& comparator);
 
   // Increase reference count.
-  void Ref() { ++refs_; }
+  void Ref() volatile { ++refs_; }
 
   // Drop reference count.  Delete if no more references exist.
-  void Unref() {
+  void Unref() volatile {
     --refs_;
     assert(refs_ >= 0);
     if (refs_ <= 0) {
@@ -36,7 +36,10 @@ class MemTable {
   }
 
   // Returns an estimate of the number of bytes of data in use by this
-  // data structure. It is safe to call when MemTable is being modified.
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
   size_t ApproximateMemoryUsage();
 
   // Return an iterator that yields the contents of the memtable.
@@ -52,13 +55,17 @@ class MemTable {
   // Typically value will be empty if type==kTypeDeletion.
   void Add(SequenceNumber seq, ValueType type,
            const Slice& key,
-           const Slice& value);
+           const Slice& value,
+           const ExpiryTimeMicros& expiry=0);
 
   // If memtable contains a value for key, store it in *value and return true.
   // If memtable contains a deletion for key, store a NotFound() error
   // in *status and return true.
   // Else, return false.
-  bool Get(const LookupKey& key, std::string* value, Status* s);
+  bool Get(const LookupKey& key, Value* value, Status* s, const Options * options);
+
+  // parse keymetadata from skiplist key string
+  static void DecodeKeyMetaData(const char * key, KeyMetaData & meta);
 
  private:
   ~MemTable();  // Private since only Unref() should be used to delete it
@@ -69,7 +76,7 @@ class MemTable {
     int operator()(const char* a, const char* b) const;
   };
   friend class MemTableIterator;
-  friend class MemTableBackwardIterator;
+  friend class MemTableBackwardIterator; // does not exist
 
   typedef SkipList<const char*, KeyComparator> Table;
 
diff --git a/src/leveldb/db/penalty_test.cc b/src/leveldb/db/penalty_test.cc
new file mode 100644
index 000000000..fc28ae887
--- /dev/null
+++ b/src/leveldb/db/penalty_test.cc
@@ -0,0 +1,248 @@
+// -------------------------------------------------------------------
+//
+// penalty_test.cc
+//
+// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+#include "leveldb/comparator.h"
+
+#include "db/version_set.h"
+
+/**
+ * Execution routine
+ */
+int main(int argc, char** argv)
+{
+  return leveldb::test::RunAllTests();
+}
+
+
+namespace leveldb {
+
+class TestVersion : public Version
+{
+public:
+    TestVersion()
+        : Version(NULL)
+    {
+        int loop;
+
+        for (loop=0; loop<config::kNumLevels; ++loop)
+        {
+            m_FalseFile[loop].file_size=0;
+            m_LevelFileCount[loop]=0;
+        }   // for
+    };
+
+    virtual size_t NumFiles(int level) const {return(m_LevelFileCount[level]);};
+
+    virtual const std::vector<FileMetaData*> & GetFileList(int level) const
+    {
+        m_FalseVector.clear();
+        m_FalseVector.push_back(&m_FalseFile[level]);
+        return(m_FalseVector);
+    };
+
+    mutable std::vector<FileMetaData*> m_FalseVector;
+    mutable FileMetaData m_FalseFile[config::kNumLevels];
+
+    size_t m_LevelFileCount[config::kNumLevels];
+
+};  // class TestVersion
+
+/**
+ * Wrapper class for tests.  Holds working variables
+ * and helper functions.
+ */
+class PenaltyTester : public VersionSet
+{
+public:
+    PenaltyTester()
+        : m_IntCompare(m_Options.comparator), VersionSet("", &m_Options, NULL, &m_IntCompare)
+    {
+    };
+
+    ~PenaltyTester()
+    {
+    };
+
+    Options m_Options;
+    InternalKeyComparator m_IntCompare;
+
+};  // class PenaltyTester
+
+
+  /*******************
+   * Form note:
+   *   using     ASSERT_TRUE(0==version.WritePenalty());
+   *    instead of ASSERT_EQ / ASSERT_NE because WritePenalty
+   *    returns a volatile int, which older compilers believe is
+   *    not an equivalent type to a constant.  RedHat 5, Solaris,
+   *    and SmartOS were giving grief.
+   *******************/
+
+/**
+ * Debug 1
+ */
+#if 0
+TEST(PenaltyTester, Debug1)
+{
+    TestVersion version;
+    int penalty;
+
+    m_Options.write_buffer_size=46416847;
+
+    version.m_FalseFile[2].file_size=1075676398;
+    version.m_LevelFileCount[1]=1;
+
+    UpdatePenalty(&version);
+
+    ASSERT_TRUE(0==version.WritePenalty());
+
+}   // test Debug1
+#endif
+
+
+/**
+ * No penalty scenarios
+ */
+TEST(PenaltyTester, NoPenalty)
+{
+    TestVersion version;
+    int level;
+
+    m_Options.write_buffer_size=46416847;
+
+    // nothing
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0==version.WritePenalty());
+
+    /**
+     * Level 0
+     *  (overlapped level, penalty is count based)
+     */
+    // no penalty
+    version.m_LevelFileCount[0]=config::kL0_CompactionTrigger;
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0==version.WritePenalty());
+
+    version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger;
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0==version.WritePenalty());
+
+#if 0   // needs rewrite to be time based
+    // threshold reached ... some penalty
+    version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger+1;
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0!=version.WritePenalty());
+
+    // clean up
+    version.m_LevelFileCount[0]=0;
+
+    /**
+     * Level 1
+     *  (overlapped level, penalty is count based)
+     */
+    // no penalty
+    version.m_LevelFileCount[1]=config::kL0_CompactionTrigger;
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0==version.WritePenalty());
+
+    version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger;
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0==version.WritePenalty());
+
+    // threshold reached ... some penalty
+    version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger+1;
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0!=version.WritePenalty());
+
+    // clean up
+    version.m_LevelFileCount[1]=0;
+
+    /**
+     * Level 2
+     *  (landing level, penalty size based)
+     */
+    // no penalty
+    version.m_FalseFile[2].file_size=0;
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0==version.WritePenalty());
+
+    version.m_FalseFile[2].file_size=VersionSet::DesiredBytesForLevel(2);
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0==version.WritePenalty());
+
+    version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1;
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0==version.WritePenalty());
+
+    version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2);
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0!=version.WritePenalty());
+
+    // interaction rule with level 1
+    version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1;
+    version.m_LevelFileCount[1]=config::kL0_CompactionTrigger/2;
+    UpdatePenalty(&version);
+    ASSERT_TRUE(0!=version.WritePenalty());
+
+    // clean up
+    version.m_LevelFileCount[1]=0;
+    version.m_FalseFile[2].file_size=0;
+
+    /**
+     * Level 3+
+     *  (landing level, penalty size based)
+     */
+    for (level=3; level<config::kNumLevels; ++level)
+    {
+        // no penalty
+        version.m_FalseFile[level].file_size=0;
+        UpdatePenalty(&version);
+	ASSERT_TRUE(0==version.WritePenalty());
+
+        version.m_FalseFile[level].file_size=VersionSet::DesiredBytesForLevel(level);
+        UpdatePenalty(&version);
+	ASSERT_TRUE(0==version.WritePenalty());
+
+        version.m_FalseFile[level].file_size=VersionSet::MaxBytesForLevel(level)-1;
+        UpdatePenalty(&version);
+	ASSERT_TRUE(0==version.WritePenalty());
+
+        version.m_FalseFile[level].file_size=VersionSet::MaxBytesForLevel(level);
+        UpdatePenalty(&version);
+        if ((config::kNumLevels-1)!=level)
+	  ASSERT_TRUE(0!=version.WritePenalty());
+        else
+	  ASSERT_TRUE(0==version.WritePenalty());
+
+        // clean up
+        version.m_FalseFile[level].file_size=0;
+    }   // for
+#endif
+}   // test NoPenalty
+
+
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/recovery_test.cc b/src/leveldb/db/recovery_test.cc
deleted file mode 100644
index 9596f4288..000000000
--- a/src/leveldb/db/recovery_test.cc
+++ /dev/null
@@ -1,324 +0,0 @@
-// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "db/db_impl.h"
-#include "db/filename.h"
-#include "db/version_set.h"
-#include "db/write_batch_internal.h"
-#include "leveldb/db.h"
-#include "leveldb/env.h"
-#include "leveldb/write_batch.h"
-#include "util/logging.h"
-#include "util/testharness.h"
-#include "util/testutil.h"
-
-namespace leveldb {
-
-class RecoveryTest {
- public:
-  RecoveryTest() : env_(Env::Default()), db_(NULL) {
-    dbname_ = test::TmpDir() + "/recovery_test";
-    DestroyDB(dbname_, Options());
-    Open();
-  }
-
-  ~RecoveryTest() {
-    Close();
-    DestroyDB(dbname_, Options());
-  }
-
-  DBImpl* dbfull() const { return reinterpret_cast<DBImpl*>(db_); }
-  Env* env() const { return env_; }
-
-  bool CanAppend() {
-    WritableFile* tmp;
-    Status s = env_->NewAppendableFile(CurrentFileName(dbname_), &tmp);
-    delete tmp;
-    if (s.IsNotSupportedError()) {
-      return false;
-    } else {
-      return true;
-    }
-  }
-
-  void Close() {
-    delete db_;
-    db_ = NULL;
-  }
-
-  void Open(Options* options = NULL) {
-    Close();
-    Options opts;
-    if (options != NULL) {
-      opts = *options;
-    } else {
-      opts.reuse_logs = true;  // TODO(sanjay): test both ways
-      opts.create_if_missing = true;
-    }
-    if (opts.env == NULL) {
-      opts.env = env_;
-    }
-    ASSERT_OK(DB::Open(opts, dbname_, &db_));
-    ASSERT_EQ(1, NumLogs());
-  }
-
-  Status Put(const std::string& k, const std::string& v) {
-    return db_->Put(WriteOptions(), k, v);
-  }
-
-  std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
-    std::string result;
-    Status s = db_->Get(ReadOptions(), k, &result);
-    if (s.IsNotFound()) {
-      result = "NOT_FOUND";
-    } else if (!s.ok()) {
-      result = s.ToString();
-    }
-    return result;
-  }
-
-  std::string ManifestFileName() {
-    std::string current;
-    ASSERT_OK(ReadFileToString(env_, CurrentFileName(dbname_), &current));
-    size_t len = current.size();
-    if (len > 0 && current[len-1] == '\n') {
-      current.resize(len - 1);
-    }
-    return dbname_ + "/" + current;
-  }
-
-  std::string LogName(uint64_t number) {
-    return LogFileName(dbname_, number);
-  }
-
-  size_t DeleteLogFiles() {
-    std::vector<uint64_t> logs = GetFiles(kLogFile);
-    for (size_t i = 0; i < logs.size(); i++) {
-      ASSERT_OK(env_->DeleteFile(LogName(logs[i]))) << LogName(logs[i]);
-    }
-    return logs.size();
-  }
-
-  uint64_t FirstLogFile() {
-    return GetFiles(kLogFile)[0];
-  }
-
-  std::vector<uint64_t> GetFiles(FileType t) {
-    std::vector<std::string> filenames;
-    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
-    std::vector<uint64_t> result;
-    for (size_t i = 0; i < filenames.size(); i++) {
-      uint64_t number;
-      FileType type;
-      if (ParseFileName(filenames[i], &number, &type) && type == t) {
-        result.push_back(number);
-      }
-    }
-    return result;
-  }
-
-  int NumLogs() {
-    return GetFiles(kLogFile).size();
-  }
-
-  int NumTables() {
-    return GetFiles(kTableFile).size();
-  }
-
-  uint64_t FileSize(const std::string& fname) {
-    uint64_t result;
-    ASSERT_OK(env_->GetFileSize(fname, &result)) << fname;
-    return result;
-  }
-
-  void CompactMemTable() {
-    dbfull()->TEST_CompactMemTable();
-  }
-
-  // Directly construct a log file that sets key to val.
-  void MakeLogFile(uint64_t lognum, SequenceNumber seq, Slice key, Slice val) {
-    std::string fname = LogFileName(dbname_, lognum);
-    WritableFile* file;
-    ASSERT_OK(env_->NewWritableFile(fname, &file));
-    log::Writer writer(file);
-    WriteBatch batch;
-    batch.Put(key, val);
-    WriteBatchInternal::SetSequence(&batch, seq);
-    ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch)));
-    ASSERT_OK(file->Flush());
-    delete file;
-  }
-
- private:
-  std::string dbname_;
-  Env* env_;
-  DB* db_;
-};
-
-TEST(RecoveryTest, ManifestReused) {
-  if (!CanAppend()) {
-    fprintf(stderr, "skipping test because env does not support appending\n");
-    return;
-  }
-  ASSERT_OK(Put("foo", "bar"));
-  Close();
-  std::string old_manifest = ManifestFileName();
-  Open();
-  ASSERT_EQ(old_manifest, ManifestFileName());
-  ASSERT_EQ("bar", Get("foo"));
-  Open();
-  ASSERT_EQ(old_manifest, ManifestFileName());
-  ASSERT_EQ("bar", Get("foo"));
-}
-
-TEST(RecoveryTest, LargeManifestCompacted) {
-  if (!CanAppend()) {
-    fprintf(stderr, "skipping test because env does not support appending\n");
-    return;
-  }
-  ASSERT_OK(Put("foo", "bar"));
-  Close();
-  std::string old_manifest = ManifestFileName();
-
-  // Pad with zeroes to make manifest file very big.
-  {
-    uint64_t len = FileSize(old_manifest);
-    WritableFile* file;
-    ASSERT_OK(env()->NewAppendableFile(old_manifest, &file));
-    std::string zeroes(3*1048576 - static_cast<size_t>(len), 0);
-    ASSERT_OK(file->Append(zeroes));
-    ASSERT_OK(file->Flush());
-    delete file;
-  }
-
-  Open();
-  std::string new_manifest = ManifestFileName();
-  ASSERT_NE(old_manifest, new_manifest);
-  ASSERT_GT(10000, FileSize(new_manifest));
-  ASSERT_EQ("bar", Get("foo"));
-
-  Open();
-  ASSERT_EQ(new_manifest, ManifestFileName());
-  ASSERT_EQ("bar", Get("foo"));
-}
-
-TEST(RecoveryTest, NoLogFiles) {
-  ASSERT_OK(Put("foo", "bar"));
-  ASSERT_EQ(1, DeleteLogFiles());
-  Open();
-  ASSERT_EQ("NOT_FOUND", Get("foo"));
-  Open();
-  ASSERT_EQ("NOT_FOUND", Get("foo"));
-}
-
-TEST(RecoveryTest, LogFileReuse) {
-  if (!CanAppend()) {
-    fprintf(stderr, "skipping test because env does not support appending\n");
-    return;
-  }
-  for (int i = 0; i < 2; i++) {
-    ASSERT_OK(Put("foo", "bar"));
-    if (i == 0) {
-      // Compact to ensure current log is empty
-      CompactMemTable();
-    }
-    Close();
-    ASSERT_EQ(1, NumLogs());
-    uint64_t number = FirstLogFile();
-    if (i == 0) {
-      ASSERT_EQ(0, FileSize(LogName(number)));
-    } else {
-      ASSERT_LT(0, FileSize(LogName(number)));
-    }
-    Open();
-    ASSERT_EQ(1, NumLogs());
-    ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
-    ASSERT_EQ("bar", Get("foo"));
-    Open();
-    ASSERT_EQ(1, NumLogs());
-    ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
-    ASSERT_EQ("bar", Get("foo"));
-  }
-}
-
-TEST(RecoveryTest, MultipleMemTables) {
-  // Make a large log.
-  const int kNum = 1000;
-  for (int i = 0; i < kNum; i++) {
-    char buf[100];
-    snprintf(buf, sizeof(buf), "%050d", i);
-    ASSERT_OK(Put(buf, buf));
-  }
-  ASSERT_EQ(0, NumTables());
-  Close();
-  ASSERT_EQ(0, NumTables());
-  ASSERT_EQ(1, NumLogs());
-  uint64_t old_log_file = FirstLogFile();
-
-  // Force creation of multiple memtables by reducing the write buffer size.
-  Options opt;
-  opt.reuse_logs = true;
-  opt.write_buffer_size = (kNum*100) / 2;
-  Open(&opt);
-  ASSERT_LE(2, NumTables());
-  ASSERT_EQ(1, NumLogs());
-  ASSERT_NE(old_log_file, FirstLogFile()) << "must not reuse log";
-  for (int i = 0; i < kNum; i++) {
-    char buf[100];
-    snprintf(buf, sizeof(buf), "%050d", i);
-    ASSERT_EQ(buf, Get(buf));
-  }
-}
-
-TEST(RecoveryTest, MultipleLogFiles) {
-  ASSERT_OK(Put("foo", "bar"));
-  Close();
-  ASSERT_EQ(1, NumLogs());
-
-  // Make a bunch of uncompacted log files.
-  uint64_t old_log = FirstLogFile();
-  MakeLogFile(old_log+1, 1000, "hello", "world");
-  MakeLogFile(old_log+2, 1001, "hi", "there");
-  MakeLogFile(old_log+3, 1002, "foo", "bar2");
-
-  // Recover and check that all log files were processed.
-  Open();
-  ASSERT_LE(1, NumTables());
-  ASSERT_EQ(1, NumLogs());
-  uint64_t new_log = FirstLogFile();
-  ASSERT_LE(old_log+3, new_log);
-  ASSERT_EQ("bar2", Get("foo"));
-  ASSERT_EQ("world", Get("hello"));
-  ASSERT_EQ("there", Get("hi"));
-
-  // Test that previous recovery produced recoverable state.
-  Open();
-  ASSERT_LE(1, NumTables());
-  ASSERT_EQ(1, NumLogs());
-  if (CanAppend()) {
-    ASSERT_EQ(new_log, FirstLogFile());
-  }
-  ASSERT_EQ("bar2", Get("foo"));
-  ASSERT_EQ("world", Get("hello"));
-  ASSERT_EQ("there", Get("hi"));
-
-  // Check that introducing an older log file does not cause it to be re-read.
-  Close();
-  MakeLogFile(old_log+1, 2000, "hello", "stale write");
-  Open();
-  ASSERT_LE(1, NumTables());
-  ASSERT_EQ(1, NumLogs());
-  if (CanAppend()) {
-    ASSERT_EQ(new_log, FirstLogFile());
-  }
-  ASSERT_EQ("bar2", Get("foo"));
-  ASSERT_EQ("world", Get("hello"));
-  ASSERT_EQ("there", Get("hi"));
-}
-
-}  // namespace leveldb
-
-int main(int argc, char** argv) {
-  return leveldb::test::RunAllTests();
-}
diff --git a/src/leveldb/db/repair.cc b/src/leveldb/db/repair.cc
index 7281e3d34..b1c1bc2c2 100644
--- a/src/leveldb/db/repair.cc
+++ b/src/leveldb/db/repair.cc
@@ -45,49 +45,113 @@ namespace {
 class Repairer {
  public:
   Repairer(const std::string& dbname, const Options& options)
-      : dbname_(dbname),
+      : double_cache_(options),
+        options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options, double_cache_.GetBlockCache())),
+        org_options_(options),
+        dbname_(options_.tiered_fast_prefix),
+        org_dbname_(dbname),
         env_(options.env),
         icmp_(options.comparator),
         ipolicy_(options.filter_policy),
-        options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
         owns_info_log_(options_.info_log != options.info_log),
-        owns_cache_(options_.block_cache != options.block_cache),
-        next_file_number_(1) {
+        db_lock_(NULL),
+        next_file_number_(1)
+  {
     // TableCache can be small since we expect each table to be opened once.
-    table_cache_ = new TableCache(dbname_, &options_, 10);
+    table_cache_ = new TableCache(dbname_, &options_, double_cache_.GetFileCache(), double_cache_);
+
   }
 
   ~Repairer() {
-    delete table_cache_;
     if (owns_info_log_) {
       delete options_.info_log;
     }
-    if (owns_cache_) {
-      delete options_.block_cache;
-    }
+//    if (owns_cache_) {
+//      delete options_.block_cache;
+//    }
+
+    // must remove second ref counter that keeps overlapped files locked
+    //  table cache
+    bool is_overlap;
+    for (int level = 0; level < config::kNumLevels; level++) {
+        {
+            is_overlap=(level < leveldb::config::kNumOverlapLevels);
+            for (size_t i = 0; i < table_numbers_[level].size(); i++) {
+                table_cache_->Evict(table_numbers_[level][i], is_overlap);
+            }   // for
+        }   // if
+    } // for
+
+    delete table_cache_;
   }
 
   Status Run() {
-    Status status = FindFiles();
+    Status status;
+
+    status = env_->LockFile(LockFileName(dbname_), &db_lock_);
+
+    if (status.ok())
+        status = MakeLevelDirectories(env_, options_);
+
     if (status.ok()) {
-      ConvertLogFilesToTables();
-      ExtractMetaData();
-      status = WriteDescriptor();
-    }
-    if (status.ok()) {
-      unsigned long long bytes = 0;
-      for (size_t i = 0; i < tables_.size(); i++) {
-        bytes += tables_[i].meta.file_size;
+      status = FindFiles();
+      if (status.ok()) {
+          ConvertLogFilesToTables();
+          ExtractMetaData();
+          status = WriteDescriptor();
+      }
+      if (status.ok()) {
+        unsigned long long bytes = 0;
+        unsigned long long files = 0;
+
+        // calculate size for log information
+        for (int level=0; level<config::kNumLevels;++level)
+        {
+          std::vector<TableInfo> * table_ptr;
+          std::vector<TableInfo>::const_iterator i;
+
+          table_ptr=&tables_[level];
+          files+=table_ptr->size();
+
+          for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
+            bytes += i->meta.file_size;
+          }
+        } // for
+
+        Log(options_.info_log,
+            "**** Repaired leveldb %s; "
+            "recovered %d files; %llu bytes. "
+            "Some data may have been lost. "
+            "****",
+            dbname_.c_str(),
+            static_cast<int>(files),
+            bytes);
+      }
+      if (db_lock_ != NULL) {
+        env_->UnlockFile(db_lock_);
       }
-      Log(options_.info_log,
-          "**** Repaired leveldb %s; "
-          "recovered %d files; %llu bytes. "
-          "Some data may have been lost. "
-          "****",
-          dbname_.c_str(),
-          static_cast<int>(tables_.size()),
-          bytes);
     }
+
+    // perform Riak specific scan for overlapping .sst files
+    //  within a level
+    if (status.ok())
+    {
+        leveldb::DB * db_ptr;
+        Options options;
+
+        db_ptr=NULL;
+        options=org_options_;
+//        options.block_cache=NULL;  // not reusing for fear of edge cases
+        options.is_repair=true;
+        options.error_if_exists=false;
+        status=leveldb::DB::Open(options, org_dbname_, &db_ptr);
+
+        if (status.ok())
+            status=db_ptr->VerifyLevels();
+
+        delete db_ptr;
+
+    }   // if
     return status;
   }
 
@@ -97,34 +161,36 @@ class Repairer {
     SequenceNumber max_sequence;
   };
 
-  std::string const dbname_;
+  DoubleCache double_cache_;
+  Options const options_, org_options_;
+  std::string const dbname_, org_dbname_;
   Env* const env_;
   InternalKeyComparator const icmp_;
   InternalFilterPolicy const ipolicy_;
-  Options const options_;
   bool owns_info_log_;
-  bool owns_cache_;
+  FileLock* db_lock_;
   TableCache* table_cache_;
   VersionEdit edit_;
 
   std::vector<std::string> manifests_;
-  std::vector<uint64_t> table_numbers_;
+  std::vector<uint64_t> table_numbers_[config::kNumLevels];
   std::vector<uint64_t> logs_;
-  std::vector<TableInfo> tables_;
+  std::vector<TableInfo> tables_[config::kNumLevels];
   uint64_t next_file_number_;
 
-  Status FindFiles() {
+  Status FindFiles()
+  {
     std::vector<std::string> filenames;
+    uint64_t number;
+    FileType type;
+    int level;
+
+    // base directory
     Status status = env_->GetChildren(dbname_, &filenames);
     if (!status.ok()) {
       return status;
     }
-    if (filenames.empty()) {
-      return Status::IOError(dbname_, "repair found no files");
-    }
 
-    uint64_t number;
-    FileType type;
     for (size_t i = 0; i < filenames.size(); i++) {
       if (ParseFileName(filenames[i], &number, &type)) {
         if (type == kDescriptorFile) {
@@ -136,13 +202,38 @@ class Repairer {
           if (type == kLogFile) {
             logs_.push_back(number);
           } else if (type == kTableFile) {
-            table_numbers_.push_back(number);
+            table_numbers_[0].push_back(number);
           } else {
             // Ignore other files
-          }
-        }
+          } // else
+        } // else
+      } // if
+    } // for
+
+    for (level=0; level < config::kNumLevels; ++level)
+    {
+      std::string dirname;
+
+      filenames.clear();
+      dirname=MakeDirName2(options_, level, "sst");
+      Status status = env_->GetChildren(dirname, &filenames);
+      if (!status.ok()) {
+          return status;
       }
-    }
+
+      for (size_t i = 0; i < filenames.size(); i++) {
+        if (ParseFileName(filenames[i], &number, &type)) {
+          if (number + 1 > next_file_number_) {
+            next_file_number_ = number + 1;
+          }
+
+          if (type == kTableFile) {
+            table_numbers_[level].push_back(number);
+          }
+        } // if
+      } // for
+    } // for
+
     return status;
   }
 
@@ -186,7 +277,7 @@ class Repairer {
     reporter.env = env_;
     reporter.info_log = options_.info_log;
     reporter.lognum = log;
-    // We intentionally make log::Reader do checksumming so that
+    // We intentially make log::Reader do checksumming so that
     // corruptions cause entire commits to be skipped instead of
     // propagating bad information (like overly large sequence
     // numbers).
@@ -203,11 +294,11 @@ class Repairer {
     while (reader.ReadRecord(&record, &scratch)) {
       if (record.size() < 12) {
         reporter.Corruption(
-            record.size(), Status::Corruption("log record too small", logname));
+            record.size(), Status::Corruption("log record too small"));
         continue;
       }
       WriteBatchInternal::SetContents(&batch, record);
-      status = WriteBatchInternal::InsertInto(&batch, mem);
+      status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
       if (status.ok()) {
         counter += WriteBatchInternal::Count(&batch);
       } else {
@@ -223,14 +314,15 @@ class Repairer {
     // since ExtractMetaData() will also generate edits.
     FileMetaData meta;
     meta.number = next_file_number_++;
+    meta.level = 0;
     Iterator* iter = mem->NewIterator();
-    status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
+    status = BuildTable(dbname_, env_, options_, icmp_.user_comparator(), table_cache_, iter, &meta, 0);
     delete iter;
     mem->Unref();
     mem = NULL;
     if (status.ok()) {
       if (meta.file_size > 0) {
-        table_numbers_.push_back(meta.number);
+        table_numbers_[0].push_back(meta.number);
       }
     }
     Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
@@ -242,168 +334,128 @@ class Repairer {
   }
 
   void ExtractMetaData() {
-    for (size_t i = 0; i < table_numbers_.size(); i++) {
-      ScanTable(table_numbers_[i]);
-    }
-  }
+    for (int level=0; level < config::kNumLevels; ++level)
+    {
+      std::vector<uint64_t> * number_ptr;
+      std::vector<uint64_t>::const_iterator i;
 
-  Iterator* NewTableIterator(const FileMetaData& meta) {
-    // Same as compaction iterators: if paranoid_checks are on, turn
-    // on checksum verification.
-    ReadOptions r;
-    r.verify_checksums = options_.paranoid_checks;
-    return table_cache_->NewIterator(r, meta.number, meta.file_size);
-  }
-
-  void ScanTable(uint64_t number) {
-    TableInfo t;
-    t.meta.number = number;
-    std::string fname = TableFileName(dbname_, number);
-    Status status = env_->GetFileSize(fname, &t.meta.file_size);
-    if (!status.ok()) {
-      // Try alternate file name.
-      fname = SSTTableFileName(dbname_, number);
-      Status s2 = env_->GetFileSize(fname, &t.meta.file_size);
-      if (s2.ok()) {
-        status = Status::OK();
+      number_ptr=&table_numbers_[level];
+      for (i = number_ptr->begin(); number_ptr->end()!= i; ++i) {
+        TableInfo t;
+        t.meta.number = *i;
+        t.meta.level = level;
+        Status status = ScanTable(&t);
+        if (!status.ok())
+        {
+          std::string fname = TableFileName(options_, t.meta.number, t.meta.level);
+          Log(options_.info_log, "Table #%llu: ignoring %s",
+              (unsigned long long) t.meta.number,
+              status.ToString().c_str());
+          ArchiveFile(fname, true);
+        } else {
+          tables_[level].push_back(t);
+        }
       }
     }
-    if (!status.ok()) {
-      ArchiveFile(TableFileName(dbname_, number));
-      ArchiveFile(SSTTableFileName(dbname_, number));
-      Log(options_.info_log, "Table #%llu: dropped: %s",
-          (unsigned long long) t.meta.number,
-          status.ToString().c_str());
-      return;
-    }
+  }
 
-    // Extract metadata by scanning through table.
+  Status ScanTable(TableInfo* t) {
+    Table * table_ptr;
+    SstCounters counters;
+    std::string fname = TableFileName(options_, t->meta.number, t->meta.level);
     int counter = 0;
-    Iterator* iter = NewTableIterator(t.meta);
-    bool empty = true;
-    ParsedInternalKey parsed;
-    t.max_sequence = 0;
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      Slice key = iter->key();
-      if (!ParseInternalKey(key, &parsed)) {
-        Log(options_.info_log, "Table #%llu: unparsable key %s",
-            (unsigned long long) t.meta.number,
-            EscapeString(key).c_str());
-        continue;
-      }
+    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    if (status.ok()) {
+      Iterator* iter = table_cache_->NewIterator(
+          ReadOptions(), t->meta.number, t->meta.file_size, t->meta.level, &table_ptr);
+      bool empty = true;
+      ParsedInternalKey parsed;
+      t->max_sequence = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (!ParseInternalKey(key, &parsed)) {
+          Log(options_.info_log, "Table #%llu: unparsable key %s",
+              (unsigned long long) t->meta.number,
+              EscapeString(key).c_str());
+          continue;
+        }
 
-      counter++;
-      if (empty) {
-        empty = false;
-        t.meta.smallest.DecodeFrom(key);
+        counter++;
+        if (empty) {
+          empty = false;
+          t->meta.smallest.DecodeFrom(key);
+        }
+        t->meta.largest.DecodeFrom(key);
+        if (parsed.sequence > t->max_sequence) {
+          t->max_sequence = parsed.sequence;
+        }
       }
-      t.meta.largest.DecodeFrom(key);
-      if (parsed.sequence > t.max_sequence) {
-        t.max_sequence = parsed.sequence;
+      if (!iter->status().ok()) {
+        status = iter->status();
       }
+      else {
+        counters=table_ptr->GetSstCounters();
+        t->meta.exp_write_low=counters.Value(eSstCountExpiry1);
+        t->meta.exp_write_high=counters.Value(eSstCountExpiry2);
+        t->meta.exp_explicit_high=counters.Value(eSstCountExpiry3);
+      }
+      delete iter;
     }
-    if (!iter->status().ok()) {
-      status = iter->status();
-    }
-    delete iter;
     Log(options_.info_log, "Table #%llu: %d entries %s",
-        (unsigned long long) t.meta.number,
+        (unsigned long long) t->meta.number,
         counter,
         status.ToString().c_str());
-
-    if (status.ok()) {
-      tables_.push_back(t);
-    } else {
-      RepairTable(fname, t);  // RepairTable archives input file.
-    }
-  }
-
-  void RepairTable(const std::string& src, TableInfo t) {
-    // We will copy src contents to a new table and then rename the
-    // new table over the source.
-
-    // Create builder.
-    std::string copy = TableFileName(dbname_, next_file_number_++);
-    WritableFile* file;
-    Status s = env_->NewWritableFile(copy, &file);
-    if (!s.ok()) {
-      return;
-    }
-    TableBuilder* builder = new TableBuilder(options_, file);
-
-    // Copy data.
-    Iterator* iter = NewTableIterator(t.meta);
-    int counter = 0;
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      builder->Add(iter->key(), iter->value());
-      counter++;
-    }
-    delete iter;
-
-    ArchiveFile(src);
-    if (counter == 0) {
-      builder->Abandon();  // Nothing to save
-    } else {
-      s = builder->Finish();
-      if (s.ok()) {
-        t.meta.file_size = builder->FileSize();
-      }
-    }
-    delete builder;
-    builder = NULL;
-
-    if (s.ok()) {
-      s = file->Close();
-    }
-    delete file;
-    file = NULL;
-
-    if (counter > 0 && s.ok()) {
-      std::string orig = TableFileName(dbname_, t.meta.number);
-      s = env_->RenameFile(copy, orig);
-      if (s.ok()) {
-        Log(options_.info_log, "Table #%llu: %d entries repaired",
-            (unsigned long long) t.meta.number, counter);
-        tables_.push_back(t);
-      }
-    }
-    if (!s.ok()) {
-      env_->DeleteFile(copy);
-    }
+    return status;
   }
 
   Status WriteDescriptor() {
     std::string tmp = TempFileName(dbname_, 1);
     WritableFile* file;
-    Status status = env_->NewWritableFile(tmp, &file);
+    Status status = env_->NewWritableFile(tmp, &file, 4096);
     if (!status.ok()) {
       return status;
     }
 
     SequenceNumber max_sequence = 0;
-    for (size_t i = 0; i < tables_.size(); i++) {
-      if (max_sequence < tables_[i].max_sequence) {
-        max_sequence = tables_[i].max_sequence;
-      }
-    }
+    for (int level=0; level<config::kNumLevels;++level)
+    {
+      std::vector<TableInfo> * table_ptr;
+      std::vector<TableInfo>::const_iterator i;
+
+      table_ptr=&tables_[level];
+
+      for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
+        if (max_sequence < i->max_sequence) {
+          max_sequence = i->max_sequence;
+        }
+      } // for
+    } // for
 
     edit_.SetComparatorName(icmp_.user_comparator()->Name());
     edit_.SetLogNumber(0);
     edit_.SetNextFile(next_file_number_);
     edit_.SetLastSequence(max_sequence);
 
-    for (size_t i = 0; i < tables_.size(); i++) {
-      // TODO(opt): separate out into multiple levels
-      const TableInfo& t = tables_[i];
-      edit_.AddFile(0, t.meta.number, t.meta.file_size,
-                    t.meta.smallest, t.meta.largest);
-    }
+    for (int level=0; level<config::kNumLevels;++level)
+    {
+      std::vector<TableInfo> * table_ptr;
+      std::vector<TableInfo>::const_iterator i;
+
+      table_ptr=&tables_[level];
+
+      for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
+          edit_.AddFile2(level, i->meta.number, i->meta.file_size,
+                         i->meta.smallest, i->meta.largest,
+                         i->meta.exp_write_low, i->meta.exp_write_high, i->meta.exp_explicit_high);
+
+      } // for
+    } // for
 
     //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
     {
       log::Writer log(file);
       std::string record;
-      edit_.EncodeTo(&record);
+      edit_.EncodeTo(&record);  // manifest format is default for release, options_ often incomplete
       status = log.AddRecord(record);
     }
     if (status.ok()) {
@@ -431,21 +483,33 @@ class Repairer {
     return status;
   }
 
-  void ArchiveFile(const std::string& fname) {
+  void ArchiveFile(const std::string& fname, bool two_levels=false) {
     // Move into another directory.  E.g., for
     //    dir/foo
     // rename to
     //    dir/lost/foo
-    const char* slash = strrchr(fname.c_str(), '/');
+    std::string::size_type slash, slash2;
+
+    slash=fname.rfind('/');
+    if (two_levels && std::string::npos!=slash && 0<slash)
+    {
+        slash2=fname.rfind('/',slash-1);
+        if (std::string::npos==slash2)
+            slash2=slash;
+    }   // if
+    else
+        slash2=slash;
+
     std::string new_dir;
-    if (slash != NULL) {
-      new_dir.assign(fname.data(), slash - fname.data());
-    }
+
+    if (std::string::npos != slash2 && 0<slash2)
+      new_dir.append(fname,0,slash2);
+
     new_dir.append("/lost");
     env_->CreateDir(new_dir);  // Ignore error
     std::string new_file = new_dir;
     new_file.append("/");
-    new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
+    new_file.append((std::string::npos!=slash) ? fname.substr(slash+1) : fname);
     Status s = env_->RenameFile(fname, new_file);
     Log(options_.info_log, "Archiving %s: %s\n",
         fname.c_str(), s.ToString().c_str());
diff --git a/src/leveldb/db/skiplist.h b/src/leveldb/db/skiplist.h
index 8bd77764d..2ad4c6642 100644
--- a/src/leveldb/db/skiplist.h
+++ b/src/leveldb/db/skiplist.h
@@ -1,10 +1,7 @@
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_
-#define STORAGE_LEVELDB_DB_SKIPLIST_H_
-
+//
 // Thread safety
 // -------------
 //
@@ -55,6 +52,12 @@ class SkipList {
   // Returns true iff an entry that compares equal to key is in the list.
   bool Contains(const Key& key) const;
 
+  // Returns true if all inserts have been sequentially increasing;
+  // else this SkipList has had keys inserted in non-sequential order
+  bool InSequentialInsertMode() const {
+    return sequentialInsertMode_;
+  }
+
   // Iteration over the contents of a skip list
   class Iterator {
    public:
@@ -94,8 +97,22 @@ class SkipList {
     // Intentionally copyable
   };
 
+ protected:
+  // Checks the structure of this SkipList object, ensuring the keys are
+  // properly ordered
+  //
+  // This is protected since it is intended for use by unit tests; if a lock
+  // is used to protect Insert(), then it should be used to protect this
+  // method as well
+  bool Valid() const;
+
+  // Disables the sequential insert optimizations (used in performance testing)
+  void DisableSequentialInsertMode() {
+    sequentialInsertMode_ = false;
+  }
+
  private:
-  enum { kMaxHeight = 12 };
+  enum { kMaxHeight = 17 };
 
   // Immutable after construction
   Comparator const compare_;
@@ -115,6 +132,18 @@ class SkipList {
   // Read/written only by Insert().
   Random rnd_;
 
+  // Points to the last node in the list; modified only by Insert()
+  Node* tail_;
+
+  // Pointers to the nodes previous to the tail node; have max_height_ entries
+  Node* tailPrev_[kMaxHeight];
+
+  // The height of the tail_ node
+  int tailHeight_;
+
+  // We track the tail node until we have a non-sequential insert
+  bool sequentialInsertMode_;
+
   Node* NewNode(const Key& key, int height);
   int RandomHeight();
   bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
@@ -129,6 +158,11 @@ class SkipList {
   // node at "level" for every level in [0..max_height_-1].
   Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
 
+  // Similar to FindGreaterOrEqual() except it uses the barrier-free
+  // variant of Next(); this is used only by Insert() and it
+  // checks the tail_ pointer in case we're doing a sequential insert
+  Node* NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const;
+
   // Return the latest node with a key < key.
   // Return head_ if there is no such node.
   Node* FindLessThan(const Key& key) const;
@@ -280,6 +314,54 @@ typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOr
   }
 }
 
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const {
+  int level = GetMaxHeight() - 1;
+
+  // If we have only seen sequential inserts up to this point, we can use
+  // the tail_ node
+  if ( sequentialInsertMode_ ) {
+    if (tail_ == NULL) {
+      // The list is currently empty, so the node being inserted
+      // will be the new tail_
+      assert(level == 0);
+      if (prev != NULL) prev[0] = head_;
+      return NULL;
+    }
+    else if (KeyIsAfterNode(key, tail_)) {
+      // The new key must be inserted after the current tail_ node
+      if (prev != NULL) {
+        int i;
+        for (i = 0; i < tailHeight_; ++i) {
+          prev[i] = tail_;
+        }
+        for (/*continue with i*/; i <= level; ++i) {
+          prev[i] = tailPrev_[i];
+        }
+      }
+      return NULL;
+    }
+  }
+
+  Node* x = head_;
+  while (true) {
+    Node* next = x->NoBarrier_Next(level);
+    if (KeyIsAfterNode(key, next)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      if (prev != NULL) prev[level] = x;
+      if (level == 0) {
+        return next;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    }
+  }
+}
+
 template<typename Key, class Comparator>
 typename SkipList<Key,Comparator>::Node*
 SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
@@ -327,25 +409,41 @@ SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
       arena_(arena),
       head_(NewNode(0 /* any key will do */, kMaxHeight)),
       max_height_(reinterpret_cast<void*>(1)),
-      rnd_(0xdeadbeef) {
+      rnd_(0xdeadbeef),
+      tail_(NULL),
+      tailHeight_(0),
+      sequentialInsertMode_(true) {
   for (int i = 0; i < kMaxHeight; i++) {
     head_->SetNext(i, NULL);
+    tailPrev_[i] = NULL;
   }
 }
 
 template<typename Key, class Comparator>
 void SkipList<Key,Comparator>::Insert(const Key& key) {
-  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+  // We use a barrier-free variant of FindGreaterOrEqual()
   // here since Insert() is externally synchronized.
   Node* prev[kMaxHeight];
-  Node* x = FindGreaterOrEqual(key, prev);
+  Node* x = NoBarrier_FindGreaterOrEqual(key, prev);
+
+  // If we're still in sequential-insert mode, check if the new node is being
+  // inserted at the end of the list, which is indicated by x being NULL
+  if (sequentialInsertMode_) {
+    if (x != NULL) {
+      // we have a non-sequential (AKA random) insert, so stop maintaining
+      // the tail bookkeeping overhead
+      sequentialInsertMode_ = false;
+    }
+  }
 
   // Our data structure does not allow duplicate insertion
   assert(x == NULL || !Equal(key, x->key));
 
-  int height = RandomHeight();
+  int i, height = RandomHeight();
   if (height > GetMaxHeight()) {
-    for (int i = GetMaxHeight(); i < height; i++) {
+    // We are extending max_height_ which means we need to fill in the blanks
+    // in prev[] that were not filled in by NoBarrier_FindGreaterOrEqual()
+    for (i = GetMaxHeight(); i < height; ++i) {
       prev[i] = head_;
     }
     //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
@@ -361,12 +459,37 @@ void SkipList<Key,Comparator>::Insert(const Key& key) {
   }
 
   x = NewNode(key, height);
-  for (int i = 0; i < height; i++) {
+  for (i = 0; i < height; ++i) {
     // NoBarrier_SetNext() suffices since we will add a barrier when
     // we publish a pointer to "x" in prev[i].
     x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
     prev[i]->SetNext(i, x);
   }
+
+  // Do we need to update our tail_ pointer?
+  if (sequentialInsertMode_) {
+    Node* prevTail = tail_;
+    int prevTailHeight = tailHeight_;
+
+    tail_ = x;
+    tailHeight_ = height;
+
+    // We also need to update our tailPrev_ pointers; first we capture
+    // the nodes already pointing to the new tail_
+    for (i = 0; i < height; ++i) {
+      tailPrev_[i] = prev[i];
+    }
+
+    // If the previous tail node was taller than the new tail node, then
+    // the prev pointers above the current tail node's height (up to the
+    // height of the previous tail node) are simply the previous tail node
+    for (/*continue with i*/; i < prevTailHeight; ++i) {
+      tailPrev_[i] = prevTail;
+    }
+
+    // NOTE: any prev pointers above prevTailHeight (up to max_height_) were
+    // already set in tailPrev_ by previous calls to this method
+  }
 }
 
 template<typename Key, class Comparator>
@@ -379,6 +502,115 @@ bool SkipList<Key,Comparator>::Contains(const Key& key) const {
   }
 }
 
-}  // namespace leveldb
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::Valid() const
+{
+  // Note that we can use barrier-free overloads in this method since it is
+  // protected by the same lock as Insert().
 
-#endif  // STORAGE_LEVELDB_DB_SKIPLIST_H_
+  // Ensure that the list is properly sorted; use an iterator for this check
+  const Key* pPrevKey = NULL;
+  typename SkipList<Key, Comparator>::Iterator iter(this);
+  for ( iter.SeekToFirst(); iter.Valid(); iter.Next() ) {
+    if ( pPrevKey != NULL ) {
+      if ( compare_( *pPrevKey, iter.key() ) >= 0 ) {
+        return false;
+      }
+    }
+    pPrevKey = &iter.key();
+  }
+
+  // Now walk the linked list at each level and ensure it's sorted. Also track
+  // how many nodes we see at each level; the number of nodes in the linked
+  // list at level n must not be larger than the number of nodes at level n-1.
+  std::vector<int> nodeCounts( GetMaxHeight() );
+  int level;
+  for ( level = GetMaxHeight() - 1; level >= 0; --level ) {
+    int nodeCount = 0;
+    pPrevKey = NULL;
+    for ( Node* pNode = head_->NoBarrier_Next( level );
+          pNode != NULL;
+          pNode = pNode->NoBarrier_Next( level ) ) {
+      ++nodeCount;
+      if ( pPrevKey != NULL ) {
+        if ( compare_( *pPrevKey, pNode->key ) >= 0 ) {
+          return false;
+        }
+      }
+      pPrevKey = &pNode->key;
+    }
+    nodeCounts[ level ] = nodeCount;
+  }
+
+  // Ensure the node counts do not increase as we move up the levels
+  int prevNodeCount = nodeCounts[0];
+  for ( level = 1; level < GetMaxHeight(); ++level ) {
+    int currentNodeCount = nodeCounts[ level ];
+    if ( currentNodeCount > prevNodeCount ) {
+      return false;
+    }
+    prevNodeCount = currentNodeCount;
+  }
+
+  // Ensure that tail_ points to the last node
+  if ( sequentialInsertMode_ ) {
+    if ( tail_ == NULL ) {
+      // tail_ is not set, so the list must be empty
+      if ( tailPrev_[0] != NULL || head_->NoBarrier_Next(0) != NULL ) {
+        return false;
+      }
+    }
+    else {
+      // we have a tail_ node; first ensure that its prev pointer actually
+      // points to it
+      if ( tailPrev_[0] == NULL || tailPrev_[0]->NoBarrier_Next(0) != tail_ ) {
+        return false;
+      }
+      if ( compare_( tailPrev_[0]->key, tail_->key ) >= 0 ) {
+        return false;
+      }
+
+      // now check the rest of the pointers in tailPrev_; up to tailHeight_,
+      // the next pointer of the node in tailPrev_ should point to tail_; after
+      // that, the next pointer should be NULL
+      for ( level = 1; level < GetMaxHeight(); ++level ) {
+        Node* tailPrev = tailPrev_[ level ];
+        if ( tailPrev == NULL ) {
+          return false;
+        }
+        if ( level < tailHeight_ ) {
+          if ( tailPrev->NoBarrier_Next( level ) != tail_ ) {
+            return false;
+          }
+          if ( compare_( tailPrev->key, tail_->key ) >= 0 ) {
+            return false;
+          }
+        }
+        else {
+          if ( tailPrev->NoBarrier_Next( level ) != NULL ) {
+            return false;
+          }
+        }
+      }
+
+      // the remainder of the tailPrev_ pointers (above max_height_)
+      // should be NULL
+      for ( /*continue with level*/; level < kMaxHeight; ++level ) {
+        if ( tailPrev_[ level ] != NULL ) {
+          return false;
+        }
+      }
+
+      // now ensure that FindLast() returns tail_
+      Node* lastNode = FindLast();
+      if ( lastNode != tail_ ) {
+        return false;
+      }
+    }
+  }
+
+  // if we get here, all is good
+  return true;
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/skiplist_test.cc b/src/leveldb/db/skiplist_test.cc
index aee1461e1..c8643071c 100644
--- a/src/leveldb/db/skiplist_test.cc
+++ b/src/leveldb/db/skiplist_test.cc
@@ -2,11 +2,15 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
 #include "db/skiplist.h"
 #include <set>
 #include "leveldb/env.h"
 #include "util/arena.h"
 #include "util/hash.h"
+#include "util/mutexlock.h"
 #include "util/random.h"
 #include "util/testharness.h"
 
@@ -26,15 +30,29 @@ struct Comparator {
   }
 };
 
+template<typename Key, class Comparator>
+class SkipListTest : public SkipList<Key, Comparator>
+{
+ public:
+  SkipListTest(Comparator cmp, Arena* arena) : SkipList<Key, Comparator>(cmp, arena) {}
+
+  // check the validity of this SkipList object by calling the Valid() method
+  // in the base class
+  bool Valid() const { return SkipList<Key, Comparator>::Valid(); }
+
+  void DisableSequentialInsertMode() { SkipList<Key, Comparator>::DisableSequentialInsertMode(); }
+};
+
 class SkipTest { };
 
 TEST(SkipTest, Empty) {
   Arena arena;
   Comparator cmp;
-  SkipList<Key, Comparator> list(cmp, &arena);
+  SkipListTest<Key, Comparator> list(cmp, &arena);
   ASSERT_TRUE(!list.Contains(10));
+  ASSERT_TRUE(list.Valid());
 
-  SkipList<Key, Comparator>::Iterator iter(&list);
+  SkipListTest<Key, Comparator>::Iterator iter(&list);
   ASSERT_TRUE(!iter.Valid());
   iter.SeekToFirst();
   ASSERT_TRUE(!iter.Valid());
@@ -51,13 +69,14 @@ TEST(SkipTest, InsertAndLookup) {
   std::set<Key> keys;
   Arena arena;
   Comparator cmp;
-  SkipList<Key, Comparator> list(cmp, &arena);
+  SkipListTest<Key, Comparator> list(cmp, &arena);
   for (int i = 0; i < N; i++) {
     Key key = rnd.Next() % R;
     if (keys.insert(key).second) {
       list.Insert(key);
     }
   }
+  ASSERT_TRUE(list.Valid());
 
   for (int i = 0; i < R; i++) {
     if (list.Contains(i)) {
@@ -69,7 +88,7 @@ TEST(SkipTest, InsertAndLookup) {
 
   // Simple iterator tests
   {
-    SkipList<Key, Comparator>::Iterator iter(&list);
+    SkipListTest<Key, Comparator>::Iterator iter(&list);
     ASSERT_TRUE(!iter.Valid());
 
     iter.Seek(0);
@@ -87,7 +106,7 @@ TEST(SkipTest, InsertAndLookup) {
 
   // Forward iteration test
   for (int i = 0; i < R; i++) {
-    SkipList<Key, Comparator>::Iterator iter(&list);
+    SkipListTest<Key, Comparator>::Iterator iter(&list);
     iter.Seek(i);
 
     // Compare against model iterator
@@ -107,7 +126,7 @@ TEST(SkipTest, InsertAndLookup) {
 
   // Backward iteration test
   {
-    SkipList<Key, Comparator>::Iterator iter(&list);
+    SkipListTest<Key, Comparator>::Iterator iter(&list);
     iter.SeekToLast();
 
     // Compare against model iterator
@@ -250,7 +269,7 @@ class ConcurrentTest {
         // Note that generation 0 is never inserted, so it is ok if
         // <*,0,*> is missing.
         ASSERT_TRUE((gen(pos) == 0) ||
-                    (gen(pos) > static_cast<Key>(initial_state.Get(key(pos))))
+                    (gen(pos) > initial_state.Get(key(pos)))
                     ) << "key: " << key(pos)
                       << "; gen: " << gen(pos)
                       << "; initgen: "
@@ -313,18 +332,16 @@ class TestState {
         state_cv_(&mu_) {}
 
   void Wait(ReaderState s) {
-    mu_.Lock();
+    MutexLock lock(&mu_);
     while (state_ != s) {
       state_cv_.Wait();
     }
-    mu_.Unlock();
   }
 
   void Change(ReaderState s) {
-    mu_.Lock();
+    MutexLock lock(&mu_);
     state_ = s;
     state_cv_.Signal();
-    mu_.Unlock();
   }
 
  private:
@@ -371,6 +388,211 @@ TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
 TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
 TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
 
+static void
+RunSequentialInsert(
+  const int NumKeys,
+  bool      AcquireLock,
+  bool      ReverseInsert,
+  bool      SequentialInsertModeEnabled )
+{
+  const int loopCount = 5; // repeat the whole process this many times and average the time spent
+  std::vector<uint64_t> timeSpent;
+
+  port::Mutex mutex;
+  Env* env = Env::Default();
+
+  fprintf( stderr,
+           "Sequentially inserting %d keys in %s order,\n"
+           "      seqential insert mode is initially %sabled,\n"
+           "      %sacquiring a lock for each insert (averaging over %d runs)\n",
+           NumKeys, ReverseInsert ? "reverse" : "forward",
+           SequentialInsertModeEnabled ? "en" : "dis",
+           AcquireLock ? "" : "not ", loopCount );
+
+  int k;
+  for ( k = 0; k < loopCount; ++k ) {
+    int j;
+    Arena arena;
+    Comparator cmp;
+    SkipListTest<Key, Comparator> list( cmp, &arena );
+
+    // initially the SkipList should be in sequential mode
+    ASSERT_TRUE( list.InSequentialInsertMode() );
+
+    // were we instructed to disable sequential insert mode?
+    if ( !SequentialInsertModeEnabled ) {
+      list.DisableSequentialInsertMode();
+      ASSERT_TRUE( !list.InSequentialInsertMode() );
+    }
+
+    uint64_t start = env->NowMicros();
+    for ( j = 0; j < NumKeys; ++j ) {
+      Key key = ReverseInsert ? NumKeys - 1 - j : j;
+
+      if ( AcquireLock ) mutex.Lock();
+      list.Insert( key );
+      if ( AcquireLock ) mutex.Unlock();
+    }
+    uint64_t stop = env->NowMicros();
+    timeSpent.push_back( stop - start );
+    //fprintf( stderr, "  Time for run %d: %llu\n", k, timeSpent[k] );
+
+    // if SequentialInsertModeEnabled is true, the SkipList should still be
+    // in sequential mode iff ReverseInsert is false
+    if ( SequentialInsertModeEnabled ) {
+      ASSERT_TRUE( list.InSequentialInsertMode() != ReverseInsert );
+    }
+    else {
+      ASSERT_TRUE( !list.InSequentialInsertMode() );
+    }
+
+    // ensure the SkipLlist is properly sorted
+    if ( AcquireLock ) mutex.Lock();
+    ASSERT_TRUE( list.Valid() );
+    if ( AcquireLock ) mutex.Unlock();
+
+    // ensure the SkipList contains all the keys we inserted
+    for ( j = 0; j < NumKeys; ++j ) {
+      ASSERT_TRUE( list.Contains( j ) );
+    }
+  }
+
+  // throw out the low and high times and average the rest
+  uint64_t totalTime, lowTime, highTime;
+  totalTime = lowTime = highTime = timeSpent[0];
+  for ( k = 1; k < loopCount; ++k ) {
+    uint64_t currentTime = timeSpent[k];
+    totalTime += currentTime;
+    if ( lowTime > currentTime ) lowTime = currentTime;
+    if ( highTime < currentTime ) highTime = currentTime;
+  }
+
+  totalTime -= (lowTime + highTime);
+
+  uint64_t averageTime = (totalTime / (loopCount - 2));
+  double timePerKey = (double)averageTime / (double)NumKeys;
+  fprintf( stderr, "   Average insertion time: %" PRIu64 " (%f/key)\n", averageTime, timePerKey );
+}
+
+TEST(SkipTest, SequentialInsert_NoLock_ForwardInsert)
+{
+  int numKeys = 100000;
+  bool acquireLock = false;
+  bool reverseInsert = false;
+  bool sequentialInsertModeEnabled = true;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+
+  sequentialInsertModeEnabled = false;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+}
+
+TEST(SkipTest, SequentialInsert_Lock_ForwardInsert)
+{
+  int numKeys = 100000;
+  bool acquireLock = true;
+  bool reverseInsert = false;
+  bool sequentialInsertModeEnabled = true;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+
+  sequentialInsertModeEnabled = false;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+}
+
+TEST(SkipTest, SequentialInsert_NoLock_ReverseInsert)
+{
+  int numKeys = 100000;
+  bool acquireLock = false;
+  bool reverseInsert = true;
+  bool sequentialInsertModeEnabled = true;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+}
+
+TEST(SkipTest, SequentialInsert_Lock_ReverseInsert)
+{
+  int numKeys = 100000;
+  bool acquireLock = true;
+  bool reverseInsert = true;
+  bool sequentialInsertModeEnabled = true;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+}
+
+TEST(SkipTest, SequentialInsert_IncreasingNumberOfInserts)
+{
+  // test with increasing numbers of keys, with sequential-insert mode both
+  // enabled and disabled; we're looking to see if per-key insertion times
+  // trend upward as the number of keys increases
+  int numKeys = 10000;
+  bool acquireLock = false;
+  bool reverseInsert = false;
+  bool sequentialInsertModeEnabled = true;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+
+  sequentialInsertModeEnabled = false;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+
+  numKeys = 100000;
+  sequentialInsertModeEnabled = true;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+
+  sequentialInsertModeEnabled = false;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+
+  numKeys = 1000000;
+  sequentialInsertModeEnabled = true;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+
+  sequentialInsertModeEnabled = false;
+  RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
+}
+
+TEST(SkipTest, SequentialInsert_MixedInsertionModes)
+{
+  // start inserting sequentially, then switch to non-sequential inserts,
+  // ensuring all works as intended
+  int j, numSequentialKeys = 100000, numNonSequentialKeys = 100000;
+  int totalNumKeys = numSequentialKeys + numNonSequentialKeys;
+  Arena arena;
+  Comparator cmp;
+  SkipListTest<Key, Comparator> list( cmp, &arena );
+
+  // initially the SkipList should be in sequential mode
+  ASSERT_TRUE( list.InSequentialInsertMode() );
+
+  // start inserting at key=1; when we insert 0 below, the list should switch
+  // out of sequential insert mode
+  for ( j = 1; j < numSequentialKeys; ++j ) {
+    list.Insert( j );
+  }
+
+  // the SkipList should still be in sequential mode
+  ASSERT_TRUE( list.InSequentialInsertMode() );
+  ASSERT_TRUE( list.Valid() );
+
+  list.Insert( 0 );
+  ASSERT_TRUE( !list.InSequentialInsertMode() );
+  ASSERT_TRUE( list.Valid() );
+
+  // now insert the remaining keys in non-sequential order (they're not
+  // random, but that doesn't matter here; just ensure we switch to
+  // non-sequential mode and that all continues to work)
+  for ( j = 0; j < numNonSequentialKeys; j += 2 ) {
+    int key = totalNumKeys - j - 1;
+    list.Insert( key );
+  }
+  for ( j = 0; j < numNonSequentialKeys; j += 2 ) {
+    int key = numSequentialKeys + j;
+    list.Insert( key );
+  }
+
+  ASSERT_TRUE( !list.InSequentialInsertMode() );
+  ASSERT_TRUE( list.Valid() );
+
+  // ensure the SkipList contains all the keys we inserted
+  for ( j = 0; j < totalNumKeys; ++j ) {
+    ASSERT_TRUE( list.Contains( j ) );
+  }
+}
+
 }  // namespace leveldb
 
 int main(int argc, char** argv) {
diff --git a/src/leveldb/db/snapshot.h b/src/leveldb/db/snapshot.h
index 6ed413c42..e7f8fd2c3 100644
--- a/src/leveldb/db/snapshot.h
+++ b/src/leveldb/db/snapshot.h
@@ -5,7 +5,6 @@
 #ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
 #define STORAGE_LEVELDB_DB_SNAPSHOT_H_
 
-#include "db/dbformat.h"
 #include "leveldb/db.h"
 
 namespace leveldb {
diff --git a/src/leveldb/db/table_cache.cc b/src/leveldb/db/table_cache.cc
index e3d82cd3e..34b03c7aa 100644
--- a/src/leveldb/db/table_cache.cc
+++ b/src/leveldb/db/table_cache.cc
@@ -5,22 +5,26 @@
 #include "db/table_cache.h"
 
 #include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/version_edit.h"
 #include "leveldb/env.h"
 #include "leveldb/table.h"
 #include "util/coding.h"
+#include "leveldb/perf_count.h"
 
 namespace leveldb {
 
-struct TableAndFile {
-  RandomAccessFile* file;
-  Table* table;
-};
-
 static void DeleteEntry(const Slice& key, void* value) {
   TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
-  delete tf->table;
-  delete tf->file;
-  delete tf;
+  if (0==dec_and_fetch(&tf->user_count))
+  {
+    if (NULL!=tf->doublecache)
+      tf->doublecache->SubFileSize(tf->table->GetFileSize());
+    delete tf->table;
+    delete tf->file;
+    delete tf;
+  }   // if
 }
 
 static void UnrefEntry(void* arg1, void* arg2) {
@@ -31,37 +35,38 @@ static void UnrefEntry(void* arg1, void* arg2) {
 
 TableCache::TableCache(const std::string& dbname,
                        const Options* options,
-                       int entries)
+                       Cache * file_cache,
+                       DoubleCache & doublecache)
     : env_(options->env),
       dbname_(dbname),
       options_(options),
-      cache_(NewLRUCache(entries)) {
+      cache_(file_cache),
+      doublecache_(doublecache)
+{
 }
 
 TableCache::~TableCache() {
-  delete cache_;
 }
 
-Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
-                             Cache::Handle** handle) {
+Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, int level,
+                             Cache::Handle** handle, bool is_compaction,
+                             bool for_iterator) {
   Status s;
   char buf[sizeof(file_number)];
   EncodeFixed64(buf, file_number);
   Slice key(buf, sizeof(buf));
   *handle = cache_->Lookup(key);
   if (*handle == NULL) {
-    std::string fname = TableFileName(dbname_, file_number);
+    std::string fname = TableFileName(*options_, file_number, level);
     RandomAccessFile* file = NULL;
     Table* table = NULL;
     s = env_->NewRandomAccessFile(fname, &file);
-    if (!s.ok()) {
-      std::string old_fname = SSTTableFileName(dbname_, file_number);
-      if (env_->NewRandomAccessFile(old_fname, &file).ok()) {
-        s = Status::OK();
-      }
-    }
     if (s.ok()) {
       s = Table::Open(*options_, file, file_size, &table);
+
+      // Riak:  support opportunity to manage Linux page cache
+      if (is_compaction)
+          file->SetForCompaction(file_size);
     }
 
     if (!s.ok()) {
@@ -73,22 +78,74 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
       TableAndFile* tf = new TableAndFile;
       tf->file = file;
       tf->table = table;
-      *handle = cache_->Insert(key, tf, 1, &DeleteEntry);
+      tf->doublecache = &doublecache_;
+      tf->file_number = file_number;
+      tf->level = level;
+
+      *handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry);
+      gPerfCounters->Inc(ePerfTableOpened);
+      doublecache_.AddFileSize(table->GetFileSize());
+
+      // temporary hardcoding to match number of levels defined as
+      //  overlapped in version_set.cc
+      if (level<config::kNumOverlapLevels)
+          cache_->Addref(*handle);
     }
   }
+  else
+  {
+    Table *table = reinterpret_cast<TableAndFile*>(cache_->Value(*handle))->table;
+
+    // this is NOT first access, see if bloom filter can load now
+    if (!for_iterator && table->ReadFilter())
+    {
+      // TableAndFile now going to be present in two cache entries
+      //  1. retrieve old entry within file cache
+      TableAndFile* tf = reinterpret_cast<TableAndFile*>(cache_->Value(*handle));
+      inc_and_fetch(&tf->user_count);
+
+      //  2. must clean file size, do not want double count
+      if (NULL!=tf->doublecache)
+        tf->doublecache->SubFileSize(tf->table->GetFileSize());
+
+      //  3. release current reference (and possible special overlap reference)
+      cache_->Release(*handle);
+      if (tf->level<config::kNumOverlapLevels)
+        cache_->Release(*handle);
+
+      //  4. create second table cache entry using TableObjectSize that now includes
+      //     bloom filter size
+      *handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry);
+
+      //  5. set double reference if an overlapped file (prevents from being flushed)
+      if (level<config::kNumOverlapLevels)
+        cache_->Addref(*handle);
+    }   // if
+
+    // for Linux, let fadvise start precaching
+    if (is_compaction)
+    {
+        RandomAccessFile *file = reinterpret_cast<TableAndFile*>(cache_->Value(*handle))->file;
+        file->SetForCompaction(file_size);
+    }   // if
+
+    gPerfCounters->Inc(ePerfTableCached);
+  }   // else
   return s;
 }
 
 Iterator* TableCache::NewIterator(const ReadOptions& options,
                                   uint64_t file_number,
                                   uint64_t file_size,
+                                  int level,
                                   Table** tableptr) {
   if (tableptr != NULL) {
     *tableptr = NULL;
   }
 
   Cache::Handle* handle = NULL;
-  Status s = FindTable(file_number, file_size, &handle);
+  Status s = FindTable(file_number, file_size, level, &handle, options.IsCompaction(), true);
+
   if (!s.ok()) {
     return NewErrorIterator(s);
   }
@@ -105,11 +162,13 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
 Status TableCache::Get(const ReadOptions& options,
                        uint64_t file_number,
                        uint64_t file_size,
+                       int level,
                        const Slice& k,
                        void* arg,
-                       void (*saver)(void*, const Slice&, const Slice&)) {
+                       bool (*saver)(void*, const Slice&, const Slice&)) {
   Cache::Handle* handle = NULL;
-  Status s = FindTable(file_number, file_size, &handle);
+  Status s = FindTable(file_number, file_size, level, &handle);
+
   if (s.ok()) {
     Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
     s = t->InternalGet(options, k, arg, saver);
@@ -118,10 +177,60 @@ Status TableCache::Get(const ReadOptions& options,
   return s;
 }
 
-void TableCache::Evict(uint64_t file_number) {
+void TableCache::Evict(uint64_t file_number, bool is_overlapped) {
   char buf[sizeof(file_number)];
   EncodeFixed64(buf, file_number);
+
+  // overlapped files have extra reference to prevent their purge,
+  //  release that reference now
+  if (is_overlapped)
+  {
+      Cache::Handle *handle;
+
+      // the Lookup call adds a reference too, back out both
+      handle=cache_->Lookup(Slice(buf, sizeof(buf)));
+
+      // with multiple background threads, file might already be
+      //  evicted
+      if (NULL!=handle)
+      {
+          cache_->Release(handle);  // release for Lookup() call just made
+          cache_->Release(handle);  // release for extra reference
+      }   // if
+  }   // if
+
   cache_->Erase(Slice(buf, sizeof(buf)));
 }
 
+/**
+ * Riak specific routine to return table statistic ONLY if table metadata
+ *  already within cache ... otherwise return 0.
+ */
+uint64_t
+TableCache::GetStatisticValue(
+    uint64_t file_number,
+    unsigned Index)
+{
+    uint64_t ret_val;
+    char buf[sizeof(file_number)];
+    Cache::Handle *handle;
+
+    ret_val=0;
+    EncodeFixed64(buf, file_number);
+    Slice key(buf, sizeof(buf));
+    handle = cache_->Lookup(key);
+
+    if (NULL != handle)
+    {
+        TableAndFile * tf;
+
+        tf=reinterpret_cast<TableAndFile*>(cache_->Value(handle));
+        ret_val=tf->table->GetSstCounters().Value(Index);
+        cache_->Release(handle);
+    }   // if
+
+    return(ret_val);
+
+}   // TableCache::GetStatisticValue
+
 }  // namespace leveldb
diff --git a/src/leveldb/db/table_cache.h b/src/leveldb/db/table_cache.h
index 8cf4aaf12..8f77c58dd 100644
--- a/src/leveldb/db/table_cache.h
+++ b/src/leveldb/db/table_cache.h
@@ -13,6 +13,7 @@
 #include "leveldb/cache.h"
 #include "leveldb/table.h"
 #include "port/port.h"
+#include "util/cache2.h"
 
 namespace leveldb {
 
@@ -20,8 +21,10 @@ class Env;
 
 class TableCache {
  public:
-  TableCache(const std::string& dbname, const Options* options, int entries);
-  ~TableCache();
+  // clean up note:  file_cache is redundant to GetFileCache available from doublecache
+  TableCache(const std::string& dbname, const Options* options, Cache * file_cache,
+             DoubleCache & doublecache);
+  virtual ~TableCache();
 
   // Return an iterator for the specified file number (the corresponding
   // file length must be exactly "file_size" bytes).  If "tableptr" is
@@ -33,6 +36,7 @@ class TableCache {
   Iterator* NewIterator(const ReadOptions& options,
                         uint64_t file_number,
                         uint64_t file_size,
+                        int level,
                         Table** tableptr = NULL);
 
   // If a seek to internal key "k" in specified file finds an entry,
@@ -40,22 +44,65 @@ class TableCache {
   Status Get(const ReadOptions& options,
              uint64_t file_number,
              uint64_t file_size,
+             int level,
              const Slice& k,
              void* arg,
-             void (*handle_result)(void*, const Slice&, const Slice&));
+             bool (*handle_result)(void*, const Slice&, const Slice&));
 
   // Evict any entry for the specified file number
-  void Evict(uint64_t file_number);
+  void Evict(uint64_t file_number, bool is_overlapped);
 
- private:
+  // Riak specific:  return table statistic ONLY if table in cache, otherwise zero
+  uint64_t GetStatisticValue(uint64_t file_number, unsigned Index);
+
+
+  // access for testing tools, not for public access
+  Status TEST_FindTable(uint64_t file_number, uint64_t file_size, int level, Cache::Handle** handle)
+  {return( FindTable(file_number, file_size, level, handle));};
+
+  Cache* TEST_GetInternalCache() {return(cache_);};
+
+  void Release(Cache::Handle * handle) {cache_->Release(handle);};
+
+  // routine called if Options::cache_object_warming is true.
+  //  Writes list of all file names currently in file cache to disk.
+  Status SaveOpenFileList();
+
+  // routine called if Options::cache_object_warming is true.
+  //  Reads file created by SaveOpenFileList() and attempts to open
+  //  every file.
+  Status PreloadTableCache();
+
+ // was private, now protected to allow easy unit test overrides
+ protected:
   Env* const env_;
   const std::string dbname_;
   const Options* options_;
-  Cache* cache_;
+  Cache * cache_;
+  DoubleCache & doublecache_;
 
-  Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
+  // virtual to enable unit test overrides
+  virtual Status FindTable(uint64_t file_number, uint64_t file_size, int level,
+                           Cache::Handle**, bool is_compaction=false,
+                           bool for_iterator=false);
 };
 
+
+struct TableAndFile {
+  RandomAccessFile* file;
+  Table* table;
+  DoubleCache * doublecache;
+  uint64_t file_number;     // saved for cache object warming
+  int level;                // saved for cache object warming
+  volatile uint32_t user_count;
+
+   TableAndFile()
+   : file(NULL), table(NULL), doublecache(NULL),
+     file_number(0), level(0), user_count(1)
+   {};
+};
+
+
 }  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
diff --git a/src/leveldb/db/version_edit.cc b/src/leveldb/db/version_edit.cc
index f10a2d58b..17b565679 100644
--- a/src/leveldb/db/version_edit.cc
+++ b/src/leveldb/db/version_edit.cc
@@ -9,20 +9,6 @@
 
 namespace leveldb {
 
-// Tag numbers for serialized VersionEdit.  These numbers are written to
-// disk and should not be changed.
-enum Tag {
-  kComparator           = 1,
-  kLogNumber            = 2,
-  kNextFileNumber       = 3,
-  kLastSequence         = 4,
-  kCompactPointer       = 5,
-  kDeletedFile          = 6,
-  kNewFile              = 7,
-  // 8 was used for large value refs
-  kPrevLogNumber        = 9
-};
-
 void VersionEdit::Clear() {
   comparator_.clear();
   log_number_ = 0;
@@ -34,11 +20,21 @@ void VersionEdit::Clear() {
   has_prev_log_number_ = false;
   has_next_file_number_ = false;
   has_last_sequence_ = false;
+  has_f1_files_ = false;
+  has_f2_files_ = false;
+
   deleted_files_.clear();
   new_files_.clear();
 }
 
-void VersionEdit::EncodeTo(std::string* dst) const {
+/**
+ * EncodeTo serializes the VersionEdit object
+ *  to the "dst" string parameter.  "format2" flag
+ *  indicates whether serialization should use original
+ *  Google format for file objects (false) or Basho's updated
+ *  file2 format for expiry enabled file objects (true)
+ */
+void VersionEdit::EncodeTo(std::string* dst, bool format2) const {
   if (has_comparator_) {
     PutVarint32(dst, kComparator);
     PutLengthPrefixedSlice(dst, comparator_);
@@ -76,12 +72,21 @@ void VersionEdit::EncodeTo(std::string* dst) const {
 
   for (size_t i = 0; i < new_files_.size(); i++) {
     const FileMetaData& f = new_files_[i].second;
-    PutVarint32(dst, kNewFile);
+    if (format2)
+      PutVarint32(dst, kNewFile2);
+    else
+      PutVarint32(dst, kNewFile);
     PutVarint32(dst, new_files_[i].first);  // level
     PutVarint64(dst, f.number);
     PutVarint64(dst, f.file_size);
     PutLengthPrefixedSlice(dst, f.smallest.Encode());
     PutLengthPrefixedSlice(dst, f.largest.Encode());
+    if (format2)
+    {
+      PutVarint64(dst, f.exp_write_low);
+      PutVarint64(dst, f.exp_write_high);
+      PutVarint64(dst, f.exp_explicit_high);
+    }
   }
 }
 
@@ -98,7 +103,7 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) {
 static bool GetLevel(Slice* input, int* level) {
   uint32_t v;
   if (GetVarint32(input, &v) &&
-      v < config::kNumLevels) {
+      v < (unsigned)config::kNumLevels) {
     *level = v;
     return true;
   } else {
@@ -185,13 +190,34 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
             GetVarint64(&input, &f.number) &&
             GetVarint64(&input, &f.file_size) &&
             GetInternalKey(&input, &f.smallest) &&
-            GetInternalKey(&input, &f.largest)) {
+            GetInternalKey(&input, &f.largest))
+        {
+          has_f1_files_ = true;
+          f.level=level;
           new_files_.push_back(std::make_pair(level, f));
         } else {
           msg = "new-file entry";
         }
         break;
 
+      case kNewFile2:
+        if (GetLevel(&input, &level) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest) &&
+            GetVarint64(&input, &f.exp_write_low) &&
+            GetVarint64(&input, &f.exp_write_high) &&
+            GetVarint64(&input, &f.exp_explicit_high))
+        {
+          has_f2_files_ = true;
+          f.level=level;
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          msg = "new-file2 entry";
+        }
+        break;
+
       default:
         msg = "unknown tag";
         break;
@@ -258,6 +284,12 @@ std::string VersionEdit::DebugString() const {
     r.append(f.smallest.DebugString());
     r.append(" .. ");
     r.append(f.largest.DebugString());
+    r.append(" ");
+    AppendNumberTo(&r, f.exp_write_low);
+    r.append(" ");
+    AppendNumberTo(&r, f.exp_write_high);
+    r.append(" ");
+    AppendNumberTo(&r, f.exp_explicit_high);
   }
   r.append("\n}\n");
   return r;
diff --git a/src/leveldb/db/version_edit.h b/src/leveldb/db/version_edit.h
index eaef77b32..ba0c8f8ae 100644
--- a/src/leveldb/db/version_edit.h
+++ b/src/leveldb/db/version_edit.h
@@ -16,15 +16,41 @@ class VersionSet;
 
 struct FileMetaData {
   int refs;
-  int allowed_seeks;          // Seeks allowed until compaction
+//  int allowed_seeks;          // Seeks allowed until compaction
   uint64_t number;
   uint64_t file_size;         // File size in bytes
+  uint64_t num_entries;       // count of values in .sst file, only valid during table build
   InternalKey smallest;       // Smallest internal key served by table
   InternalKey largest;        // Largest internal key served by table
+  int level;
+  ExpiryTimeMicros exp_write_low;     // oldest write time in file:
+                                //  0 - non-expiry keys exist too
+                                //  ULLONG_MAX - no write time expiry & no plain keys
+  ExpiryTimeMicros exp_write_high;    // most recent write time in file
+  ExpiryTimeMicros exp_explicit_high; // most recent/furthest into future explicit expiry
 
-  FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { }
+  FileMetaData()
+  : refs(0), /*allowed_seeks(1 << 30),*/ file_size(0),
+      num_entries(0), level(-1), exp_write_low(0), exp_write_high(0), exp_explicit_high(0)
+  { }
 };
 
+
+class FileMetaDataPtrCompare
+{
+protected:
+    const Comparator * comparator_;
+
+public:
+    explicit FileMetaDataPtrCompare(const Comparator * Comparer)
+        : comparator_(Comparer) {};
+
+    bool operator() (const FileMetaData * file1, const FileMetaData * file2) const
+    {
+        return(comparator_->Compare(file1->smallest.user_key(), file2->smallest.user_key()) < 0);
+    }
+};  // class FileMetaDataPtrCompare
+
 class VersionEdit {
  public:
   VersionEdit() { Clear(); }
@@ -59,6 +85,7 @@ class VersionEdit {
   // Add the specified file at the specified number.
   // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
   // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+#if 0
   void AddFile(int level, uint64_t file,
                uint64_t file_size,
                const InternalKey& smallest,
@@ -68,6 +95,27 @@ class VersionEdit {
     f.file_size = file_size;
     f.smallest = smallest;
     f.largest = largest;
+    f.level = level;
+    new_files_.push_back(std::make_pair(level, f));
+  }
+#endif
+
+  void AddFile2(int level, uint64_t file,
+                uint64_t file_size,
+                const InternalKey& smallest,
+                const InternalKey& largest,
+                uint64_t exp_write_low,
+                uint64_t exp_write_high,
+                uint64_t exp_explicit_high) {
+    FileMetaData f;
+    f.number = file;
+    f.file_size = file_size;
+    f.smallest = smallest;
+    f.largest = largest;
+    f.level = level;
+    f.exp_write_low = exp_write_low;
+    f.exp_write_high = exp_write_high;
+    f.exp_explicit_high = exp_explicit_high;
     new_files_.push_back(std::make_pair(level, f));
   }
 
@@ -75,16 +123,37 @@ class VersionEdit {
   void DeleteFile(int level, uint64_t file) {
     deleted_files_.insert(std::make_pair(level, file));
   }
+  size_t DeletedFileCount() const {return(deleted_files_.size());};
 
-  void EncodeTo(std::string* dst) const;
+  void EncodeTo(std::string* dst, bool format2=true) const;
   Status DecodeFrom(const Slice& src);
 
+  // unit test access to validate file entries' format types
+  bool HasF1Files() const {return(has_f1_files_);};
+  bool HasF2Files() const {return(has_f2_files_);};
+
   std::string DebugString() const;
 
+// Tag numbers for serialized VersionEdit.  These numbers are written to
+// disk and should not be changed.
+enum Tag {
+  kComparator           = 1,
+  kLogNumber            = 2,
+  kNextFileNumber       = 3,
+  kLastSequence         = 4,
+  kCompactPointer       = 5,
+  kDeletedFile          = 6,
+  kNewFile              = 7,
+  // 8 was used for large value refs
+  kPrevLogNumber        = 9,
+  kFileCacheObject      = 10,
+  kNewFile2             = 11  // expiry capable file
+};
+
  private:
   friend class VersionSet;
 
-  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
+  USED_BY_NESTED_FRIEND2(typedef std::set< std::pair<int, uint64_t> > DeletedFileSet)
 
   std::string comparator_;
   uint64_t log_number_;
@@ -96,10 +165,13 @@ class VersionEdit {
   bool has_prev_log_number_;
   bool has_next_file_number_;
   bool has_last_sequence_;
+  // following should be mutually exclusive, but tested independently to be sure
+  bool has_f1_files_;         // manifest uses format 1 (for unit tests)
+  bool has_f2_files_;         // manifest uses format 2 (for unit tests)
 
-  std::vector< std::pair<int, InternalKey> > compact_pointers_;
-  DeletedFileSet deleted_files_;
-  std::vector< std::pair<int, FileMetaData> > new_files_;
+  USED_BY_NESTED_FRIEND2(std::vector< std::pair<int, InternalKey> > compact_pointers_)
+  USED_BY_NESTED_FRIEND(DeletedFileSet deleted_files_)
+  USED_BY_NESTED_FRIEND2(std::vector< std::pair<int, FileMetaData> > new_files_)
 };
 
 }  // namespace leveldb
diff --git a/src/leveldb/db/version_edit_test.cc b/src/leveldb/db/version_edit_test.cc
index 280310b49..bd2c9a31c 100644
--- a/src/leveldb/db/version_edit_test.cc
+++ b/src/leveldb/db/version_edit_test.cc
@@ -7,14 +7,22 @@
 
 namespace leveldb {
 
-static void TestEncodeDecode(const VersionEdit& edit) {
+static void TestEncodeDecode(
+    const VersionEdit& edit,
+    bool format2=false) {
   std::string encoded, encoded2;
-  edit.EncodeTo(&encoded);
+  edit.EncodeTo(&encoded,format2);
   VersionEdit parsed;
   Status s = parsed.DecodeFrom(encoded);
   ASSERT_TRUE(s.ok()) << s.ToString();
-  parsed.EncodeTo(&encoded2);
+  parsed.EncodeTo(&encoded2,format2);
   ASSERT_EQ(encoded, encoded2);
+
+  if (parsed.HasF1Files() || parsed.HasF2Files())
+  {
+      ASSERT_EQ(parsed.HasF1Files(), !format2);
+      ASSERT_EQ(parsed.HasF2Files(), format2);
+  }   // if
 }
 
 class VersionEditTest { };
@@ -25,11 +33,12 @@ TEST(VersionEditTest, EncodeDecode) {
   VersionEdit edit;
   for (int i = 0; i < 4; i++) {
     TestEncodeDecode(edit);
-    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
-                 InternalKey("foo", kBig + 500 + i, kTypeValue),
-                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
+    edit.AddFile2(3, kBig + 300 + i, kBig + 400 + i,
+                  InternalKey("foo", 0, kBig + 500 + i, kTypeValue),
+                  InternalKey("zoo", 0, kBig + 600 + i, kTypeDeletion),
+                  0,0,0);
     edit.DeleteFile(4, kBig + 700 + i);
-    edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
+    edit.SetCompactPointer(i, InternalKey("x", 0, kBig + 900 + i, kTypeValue));
   }
 
   edit.SetComparatorName("foo");
@@ -39,6 +48,29 @@ TEST(VersionEditTest, EncodeDecode) {
   TestEncodeDecode(edit);
 }
 
+TEST(VersionEditTest, EncodeDecodeExpiry) {
+  static const uint64_t kBig = 1ull << 25;
+
+  VersionEdit edit;
+  for (int i = 0; i < 4; i++) {
+    TestEncodeDecode(edit, false); // only testing for s.ok()
+    edit.AddFile2(3, kBig + 300 + i, kBig + 400 + i,
+                 InternalKey("foo", 700+i, kBig + 500 + i, kTypeValueExplicitExpiry),
+                 InternalKey("zoo", 800+i, kBig + 600 + i, kTypeDeletion),
+                 10203040,
+                 123456789,
+                 987654321);
+    edit.DeleteFile(4, kBig + 700 + i);
+    edit.SetCompactPointer(i, InternalKey("x", 0, kBig + 900 + i, kTypeValue));
+  }
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit, true);
+}
+
 }  // namespace leveldb
 
 int main(int argc, char** argv) {
diff --git a/src/leveldb/db/version_set.cc b/src/leveldb/db/version_set.cc
index 2cb6d80ed..4a35306cc 100644
--- a/src/leveldb/db/version_set.cc
+++ b/src/leveldb/db/version_set.cc
@@ -12,48 +12,65 @@
 #include "db/memtable.h"
 #include "db/table_cache.h"
 #include "leveldb/env.h"
+#include "leveldb/expiry.h"
 #include "leveldb/table_builder.h"
+#include "table/block.h"
 #include "table/merger.h"
 #include "table/two_level_iterator.h"
 #include "util/coding.h"
+#include "util/db_list.h"
+#include "util/hot_threads.h"
 #include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/thread_tasks.h"
+#include "leveldb/perf_count.h"
 
 namespace leveldb {
 
-static size_t TargetFileSize(const Options* options) {
-  return options->max_file_size;
-}
+// branch mv-level-work1, March 2013
+//
+// Notes:
+//
+static struct
+{
+    uint64_t m_TargetFileSize;                   //!< mostly useless
+    uint64_t m_MaxGrandParentOverlapBytes;       //!< needs tuning, but not essential
+                                                 //!<   since moves eliminated
+    int64_t  m_ExpandedCompactionByteSizeLimit;  //!< needs tuning
 
-// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
-// stop building a single file in a level->level+1 compaction.
-static int64_t MaxGrandParentOverlapBytes(const Options* options) {
-  return 10 * TargetFileSize(options);
-}
+    // next two ignored if m_OverlappedFiles is true
+    uint64_t m_MaxBytesForLevel;                 //!< start write throttle above this
+    uint64_t m_DesiredBytesForLevel;             //!< compact into next level until this
 
-// Maximum number of bytes in all compacted files.  We avoid expanding
-// the lower level file set of a compaction if it would make the
-// total compaction cover more than this many bytes.
-static int64_t ExpandedCompactionByteSizeLimit(const Options* options) {
-  return 25 * TargetFileSize(options);
-}
+    uint64_t m_MaxFileSizeForLevel;              //!< google really applies this
+                                                 //!<   to file size of NEXT level
+    bool m_OverlappedFiles;                      //!< false means sst files are sorted
+                                                 //!<   and do not overlap
+} gLevelTraits[config::kNumLevels]=
 
-static double MaxBytesForLevel(const Options* options, int level) {
-  // Note: the result for level zero is not really used since we set
-  // the level-0 compaction threshold based on number of files.
+// level-0 and level-1 create .sst table files that have overlapping key spaces.
+//   The compaction selection logic within VersionSet::Finalize() selects based
+//   upon file count, not accumulated file size.  Write throttle is harsh if too
+//   many files accumulate.  Timed grooming (if activated) adjusts the file
+//   count threshold by time since last compaction.
+// level-2 is the "landing zone" / first sorted level.  Try to keep it clear,
+//   hence the low m_DesiredBytes for level.
+// level-2+:  VersionSet::Finalize() selects compaction files when the
+//   total bytes for level exceeds m_DesiredBytesForLevel.  Write throttle
+//   starts when total bytes exceeds m_MaxFileSizeForLevel.
 
-  // Result for both level-0 and level-1
-  double result = 10. * 1048576.0;
-  while (level > 1) {
-    result *= 10;
-    level--;
-  }
-  return result;
-}
+// WARNING: m_OverlappedFiles flags need to match config::kNumOverlapFiles ... until unified
+{
+    {10485760,  262144000,  57671680,      209715200,                 0,     420000000, true},
+    {10485760,   82914560,  57671680,      419430400,                 0,     209715200, true},
+    {10485760,  314572800,  57671680,     3082813440,         200000000,     314572800, false},
+    {10485760,  419430400,  57671680,     6442450944ULL,     4294967296ULL,  419430400, false},
+    {10485760,  524288000,  57671680,   128849018880ULL,    85899345920ULL,  524288000, false},
+    {10485760,  629145600,  57671680,  2576980377600ULL,  1717986918400ULL,  629145600, false},
+    {10485760,  734003200,  57671680, 51539607552000ULL, 34359738368000ULL,  734003200, false}
+};
 
-static uint64_t MaxFileSizeForLevel(const Options* options, int level) {
-  // We could vary per level to reduce number of files?
-  return TargetFileSize(options);
-}
+/// ULL above needed to compile on OSX 10.7.3
 
 static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
   int64_t sum = 0;
@@ -76,7 +93,12 @@ Version::~Version() {
       FileMetaData* f = files_[level][i];
       assert(f->refs > 0);
       f->refs--;
+
       if (f->refs <= 0) {
+        // clear Riak's double reference of overlapped files
+        if (vset_->IsLevelOverlapped(level))
+          vset_->GetTableCache()->Evict(f->number, true);
+
         delete f;
       }
     }
@@ -143,7 +165,7 @@ bool SomeFileOverlapsRange(
   uint32_t index = 0;
   if (smallest_user_key != NULL) {
     // Find the earliest possible internal key for smallest_user_key
-    InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
+    InternalKey small(*smallest_user_key, 0, kMaxSequenceNumber, kValueTypeForSeek);
     index = FindFile(icmp, files, small.Encode());
   }
 
@@ -198,6 +220,7 @@ class Version::LevelFileNumIterator : public Iterator {
     assert(Valid());
     EncodeFixed64(value_buf_, (*flist_)[index_]->number);
     EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size);
+    EncodeFixed32(value_buf_+16, (*flist_)[index_]->level);
     return Slice(value_buf_, sizeof(value_buf_));
   }
   virtual Status status() const { return Status::OK(); }
@@ -206,21 +229,22 @@ class Version::LevelFileNumIterator : public Iterator {
   const std::vector<FileMetaData*>* const flist_;
   uint32_t index_;
 
-  // Backing store for value().  Holds the file number and size.
-  mutable char value_buf_[16];
+  // Backing store for value().  Holds the file number and size (and level).
+  mutable char value_buf_[20];
 };
 
 static Iterator* GetFileIterator(void* arg,
                                  const ReadOptions& options,
                                  const Slice& file_value) {
   TableCache* cache = reinterpret_cast<TableCache*>(arg);
-  if (file_value.size() != 16) {
+  if (file_value.size() != 20) {
     return NewErrorIterator(
         Status::Corruption("FileReader invoked with unexpected value"));
   } else {
     return cache->NewIterator(options,
                               DecodeFixed64(file_value.data()),
-                              DecodeFixed64(file_value.data() + 8));
+                              DecodeFixed64(file_value.data() + 8),
+                              DecodeFixed32(file_value.data() + 16));
   }
 }
 
@@ -233,22 +257,35 @@ Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
 
 void Version::AddIterators(const ReadOptions& options,
                            std::vector<Iterator*>* iters) {
-  // Merge all level zero files together since they may overlap
-  for (size_t i = 0; i < files_[0].size(); i++) {
-    iters->push_back(
-        vset_->table_cache_->NewIterator(
-            options, files_[0][i]->number, files_[0][i]->file_size));
-  }
 
-  // For levels > 0, we can use a concatenating iterator that sequentially
-  // walks through the non-overlapping files in the level, opening them
-  // lazily.
-  for (int level = 1; level < config::kNumLevels; level++) {
-    if (!files_[level].empty()) {
-      iters->push_back(NewConcatenatingIterator(options, level));
-    }
-  }
-}
+    int level;
+
+    for (level=0; level < config::kNumLevels; ++level)
+    {
+        if (gLevelTraits[level].m_OverlappedFiles)
+        {
+            // Merge all level files together since they may overlap
+            for (size_t i = 0; i < files_[level].size(); i++)
+            {
+                iters->push_back(
+                    vset_->table_cache_->NewIterator(
+                        options, files_[level][i]->number, files_[level][i]->file_size, level));
+            }   // for
+        }   // if
+
+        else
+        {
+            // For sorted levels, we can use a concatenating iterator that sequentially
+            // walks through the non-overlapping files in the level, opening them
+            // lazily.
+            if (!files_[level].empty())
+            {
+                iters->push_back(NewConcatenatingIterator(options, level));
+            }   // if
+        }   // else
+    }   // for
+}   // Version::NewConcatenatingIterator
+
 
 // Callback from TableCache::Get()
 namespace {
@@ -261,77 +298,42 @@ enum SaverState {
 struct Saver {
   SaverState state;
   const Comparator* ucmp;
+  const Options* options;
   Slice user_key;
-  std::string* value;
+  Value* value;
+  const LookupKey * lookup;
 };
 }
-static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
+static bool SaveValue(void* arg, const Slice& ikey, const Slice& v) {
+  bool match=false;
+  bool expired=false;
   Saver* s = reinterpret_cast<Saver*>(arg);
   ParsedInternalKey parsed_key;
   if (!ParseInternalKey(ikey, &parsed_key)) {
     s->state = kCorrupt;
   } else {
     if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
-      s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
+      match=true;
+      if (NULL!=s->options && s->options->ExpiryActivated())
+        expired=s->options->expiry_module->KeyRetirementCallback(parsed_key);
+      s->state = (parsed_key.type != kTypeDeletion && !expired) ? kFound : kDeleted;
       if (s->state == kFound) {
         s->value->assign(v.data(), v.size());
       }
+      if (NULL!=s->lookup)
+        s->lookup->SetKeyMetaData(parsed_key);
     }
   }
+  return(match);
 }
 
 static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
   return a->number > b->number;
 }
 
-void Version::ForEachOverlapping(Slice user_key, Slice internal_key,
-                                 void* arg,
-                                 bool (*func)(void*, int, FileMetaData*)) {
-  // TODO(sanjay): Change Version::Get() to use this function.
-  const Comparator* ucmp = vset_->icmp_.user_comparator();
-
-  // Search level-0 in order from newest to oldest.
-  std::vector<FileMetaData*> tmp;
-  tmp.reserve(files_[0].size());
-  for (uint32_t i = 0; i < files_[0].size(); i++) {
-    FileMetaData* f = files_[0][i];
-    if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
-        ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
-      tmp.push_back(f);
-    }
-  }
-  if (!tmp.empty()) {
-    std::sort(tmp.begin(), tmp.end(), NewestFirst);
-    for (uint32_t i = 0; i < tmp.size(); i++) {
-      if (!(*func)(arg, 0, tmp[i])) {
-        return;
-      }
-    }
-  }
-
-  // Search other levels.
-  for (int level = 1; level < config::kNumLevels; level++) {
-    size_t num_files = files_[level].size();
-    if (num_files == 0) continue;
-
-    // Binary search to find earliest index whose largest key >= internal_key.
-    uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key);
-    if (index < num_files) {
-      FileMetaData* f = files_[level][index];
-      if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) {
-        // All of "f" is past any data for user_key
-      } else {
-        if (!(*func)(arg, level, f)) {
-          return;
-        }
-      }
-    }
-  }
-}
-
 Status Version::Get(const ReadOptions& options,
                     const LookupKey& k,
-                    std::string* value,
+                    Value* value,
                     GetStats* stats) {
   Slice ikey = k.internal_key();
   Slice user_key = k.user_key();
@@ -354,8 +356,8 @@ Status Version::Get(const ReadOptions& options,
 
     // Get the list of files to search in this level
     FileMetaData* const* files = &files_[level][0];
-    if (level == 0) {
-      // Level-0 files may overlap each other.  Find all files that
+    if (gLevelTraits[level].m_OverlappedFiles) {
+      // Level files may overlap each other.  Find all files that
       // overlap user_key and process them in order from newest to oldest.
       tmp.reserve(num_files);
       for (uint32_t i = 0; i < num_files; i++) {
@@ -389,6 +391,9 @@ Status Version::Get(const ReadOptions& options,
       }
     }
 
+    if (0!=num_files)
+        gPerfCounters->Add(ePerfSearchLevel0 + level, num_files);
+
     for (uint32_t i = 0; i < num_files; ++i) {
       if (last_file_read != NULL && stats->seek_file == NULL) {
         // We have had more than one seek for this read.  Charge the 1st file.
@@ -403,9 +408,11 @@ Status Version::Get(const ReadOptions& options,
       Saver saver;
       saver.state = kNotFound;
       saver.ucmp = ucmp;
+      saver.options = vset_->options_;
       saver.user_key = user_key;
       saver.value = value;
-      s = vset_->table_cache_->Get(options, f->number, f->file_size,
+      saver.lookup = &k;
+      s = vset_->table_cache_->Get(options, f->number, f->file_size, level,
                                    ikey, &saver, SaveValue);
       if (!s.ok()) {
         return s;
@@ -429,6 +436,7 @@ Status Version::Get(const ReadOptions& options,
 }
 
 bool Version::UpdateStats(const GetStats& stats) {
+#if 0
   FileMetaData* f = stats.seek_file;
   if (f != NULL) {
     f->allowed_seeks--;
@@ -438,44 +446,7 @@ bool Version::UpdateStats(const GetStats& stats) {
       return true;
     }
   }
-  return false;
-}
-
-bool Version::RecordReadSample(Slice internal_key) {
-  ParsedInternalKey ikey;
-  if (!ParseInternalKey(internal_key, &ikey)) {
-    return false;
-  }
-
-  struct State {
-    GetStats stats;  // Holds first matching file
-    int matches;
-
-    static bool Match(void* arg, int level, FileMetaData* f) {
-      State* state = reinterpret_cast<State*>(arg);
-      state->matches++;
-      if (state->matches == 1) {
-        // Remember first match.
-        state->stats.seek_file = f;
-        state->stats.seek_file_level = level;
-      }
-      // We can stop iterating once we have a second match.
-      return state->matches < 2;
-    }
-  };
-
-  State state;
-  state.matches = 0;
-  ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match);
-
-  // Must have at least two matches since we want to merge across
-  // files. But what if we have a single file that contains many
-  // overwrites and deletions?  Should we have another mechanism for
-  // finding such files?
-  if (state.matches >= 2) {
-    // 1MB cost is about 1 seek (see comment in Builder::Apply).
-    return UpdateStats(state.stats);
-  }
+#endif
   return false;
 }
 
@@ -494,36 +465,43 @@ void Version::Unref() {
 
 bool Version::OverlapInLevel(int level,
                              const Slice* smallest_user_key,
-                             const Slice* largest_user_key) {
-  return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
+                             const Slice* largest_user_key) const {
+  return SomeFileOverlapsRange(vset_->icmp_,
+                               !gLevelTraits[level].m_OverlappedFiles,
+                               files_[level],
                                smallest_user_key, largest_user_key);
 }
 
 int Version::PickLevelForMemTableOutput(
     const Slice& smallest_user_key,
-    const Slice& largest_user_key) {
+    const Slice& largest_user_key,
+    const int level_limit) {
   int level = 0;
+
+// test if level 1 m_OverlappedFiles is false, proceded only then
   if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
     // Push to next level if there is no overlap in next level,
     // and the #bytes overlapping in the level after that are limited.
-    InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
-    InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
+    InternalKey start(smallest_user_key, 0, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey limit(largest_user_key, 0, 0, static_cast<ValueType>(0));
     std::vector<FileMetaData*> overlaps;
-    while (level < config::kMaxMemCompactLevel) {
+    while (level < level_limit) {
       if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
         break;
       }
-      if (level + 2 < config::kNumLevels) {
-        // Check that file does not overlap too many grandparent bytes.
-        GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
-        const int64_t sum = TotalFileSize(overlaps);
-        if (sum > MaxGrandParentOverlapBytes(vset_->options_)) {
-          break;
-        }
+      GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
+      const uint64_t sum = TotalFileSize(overlaps);
+      if (sum > gLevelTraits[level].m_MaxGrandParentOverlapBytes) {
+        break;
       }
       level++;
     }
+    // do not waste a move into an overlapped level, breaks
+    //  different performance improvement
+    if (gLevelTraits[level].m_OverlappedFiles)
+        level=0;
   }
+
   return level;
 }
 
@@ -533,44 +511,89 @@ void Version::GetOverlappingInputs(
     const InternalKey* begin,
     const InternalKey* end,
     std::vector<FileMetaData*>* inputs) {
-  assert(level >= 0);
-  assert(level < config::kNumLevels);
   inputs->clear();
   Slice user_begin, user_end;
+
+  // overlap takes everything
+  bool test_inputs(!gLevelTraits[level].m_OverlappedFiles);
+
   if (begin != NULL) {
-    user_begin = begin->user_key();
+      user_begin = begin->user_key();
   }
   if (end != NULL) {
-    user_end = end->user_key();
+      user_end = end->user_key();
   }
+
   const Comparator* user_cmp = vset_->icmp_.user_comparator();
   for (size_t i = 0; i < files_[level].size(); ) {
     FileMetaData* f = files_[level][i++];
     const Slice file_start = f->smallest.user_key();
     const Slice file_limit = f->largest.user_key();
-    if (begin != NULL && user_cmp->Compare(file_limit, user_begin) < 0) {
+    if (test_inputs && begin != NULL && user_cmp->Compare(file_limit, user_begin) < 0) {
       // "f" is completely before specified range; skip it
-    } else if (end != NULL && user_cmp->Compare(file_start, user_end) > 0) {
+    } else if (test_inputs && end != NULL && user_cmp->Compare(file_start, user_end) > 0) {
       // "f" is completely after specified range; skip it
     } else {
       inputs->push_back(f);
-      if (level == 0) {
-        // Level-0 files may overlap each other.  So check if the newly
-        // added file has expanded the range.  If so, restart search.
-        if (begin != NULL && user_cmp->Compare(file_start, user_begin) < 0) {
-          user_begin = file_start;
-          inputs->clear();
-          i = 0;
-        } else if (end != NULL && user_cmp->Compare(file_limit, user_end) > 0) {
-          user_end = file_limit;
-          inputs->clear();
-          i = 0;
-        }
-      }
     }
   }
 }
 
+
+bool
+Version::VerifyLevels(
+    int & level,           // input / output for current level to inspect
+    InternalKey & begin,   // output of lowest key in first overlapped file
+    InternalKey & end)     // output of highest key in first overlapped file
+{
+    bool overlap_found;
+    const Comparator* user_cmp;
+
+    overlap_found=false;
+    user_cmp = vset_->icmp_.user_comparator();
+
+    do
+    {
+        // test only levels that do not expect overlapped .sst files
+        if (!gLevelTraits[level].m_OverlappedFiles && 1<files_[level].size())
+        {
+            const std::vector<FileMetaData*>& files = files_[level];
+            size_t inner, outer;
+
+            for (outer=0; outer<files.size()-1 && !overlap_found; ++outer)
+            {
+                FileMetaData* outer_meta = files_[level][outer];
+                const Slice outer_limit = outer_meta->largest.user_key();
+
+                for (inner=outer+1; inner<files.size() && !overlap_found; ++inner)
+                {
+                    FileMetaData* inner_meta = files_[level][inner];
+                    const Slice inner_start = inner_meta->smallest.user_key();
+
+                    // do files overlap? assumes vector sorted by "start"
+                    if (user_cmp->Compare(inner_start, outer_limit) <= 0)
+                    {
+                        overlap_found=true;
+                        begin=outer_meta->smallest;
+                        end=outer_meta->largest;
+                    }   // if
+                }   // for
+            }   // for
+        }   // if
+
+        // current level is clean, move to next
+        if (!overlap_found)
+            ++level;
+
+        // stopping before the last level.  that needs much
+        //  more support code ... later project
+    } while(!overlap_found && (level+1)<config::kNumLevels);
+
+    return(overlap_found);
+
+}   // VersionSet::VerifyLevels
+
+
 std::string Version::DebugString() const {
   std::string r;
   for (int level = 0; level < config::kNumLevels; level++) {
@@ -686,6 +709,7 @@ class VersionSet::Builder {
       FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
       f->refs = 1;
 
+#if 0
       // We arrange to automatically compact this file after
       // a certain number of seeks.  Let's assume:
       //   (1) One seek costs 10ms
@@ -701,6 +725,7 @@ class VersionSet::Builder {
       // of data before triggering a compaction.
       f->allowed_seeks = (f->file_size / 16384);
       if (f->allowed_seeks < 100) f->allowed_seeks = 100;
+#endif
 
       levels_[level].deleted_files.erase(f->number);
       levels_[level].added_files->insert(f);
@@ -740,11 +765,12 @@ class VersionSet::Builder {
 
 #ifndef NDEBUG
       // Make sure there is no overlap in levels > 0
-      if (level > 0) {
+      if (!gLevelTraits[level].m_OverlappedFiles) {
         for (uint32_t i = 1; i < v->files_[level].size(); i++) {
           const InternalKey& prev_end = v->files_[level][i-1]->largest;
           const InternalKey& this_begin = v->files_[level][i]->smallest;
-          if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
+          if (vset_->icmp_.Compare(prev_end, this_begin) >= 0
+              && !vset_->options_->is_repair) {
             fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
                     prev_end.DebugString().c_str(),
                     this_begin.DebugString().c_str());
@@ -761,7 +787,8 @@ class VersionSet::Builder {
       // File is deleted: do nothing
     } else {
       std::vector<FileMetaData*>* files = &v->files_[level];
-      if (level > 0 && !files->empty()) {
+      if (!gLevelTraits[level].m_OverlappedFiles && !files->empty()
+          && !vset_->options_->is_repair) {
         // Must not overlap
         assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest,
                                     f->smallest) < 0);
@@ -789,11 +816,17 @@ VersionSet::VersionSet(const std::string& dbname,
       descriptor_file_(NULL),
       descriptor_log_(NULL),
       dummy_versions_(this),
-      current_(NULL) {
+      current_(NULL),
+      last_penalty_minutes_(0),
+      prev_write_penalty_(0)
+{
   AppendVersion(new Version(this));
 }
 
 VersionSet::~VersionSet() {
+  // must remove second ref counter that keeps overlapped files locked
+  //  table cache
+
   current_->Unref();
   assert(dummy_versions_.next_ == &dummy_versions_);  // List must be empty
   delete descriptor_log_;
@@ -838,7 +871,6 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
     builder.Apply(edit);
     builder.SaveTo(v);
   }
-  Finalize(v);
 
   // Initialize new descriptor log file if necessary by creating
   // a temporary file that contains a snapshot of the current version.
@@ -850,45 +882,54 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
     assert(descriptor_file_ == NULL);
     new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
     edit->SetNextFile(next_file_number_);
-    s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
+    s = env_->NewWritableFile(new_manifest_file, &descriptor_file_, 4*1024L);
     if (s.ok()) {
       descriptor_log_ = new log::Writer(descriptor_file_);
       s = WriteSnapshot(descriptor_log_);
     }
   }
 
-  // Unlock during expensive MANIFEST log write
-  {
-    mu->Unlock();
-
-    // Write new record to MANIFEST log
-    if (s.ok()) {
-      std::string record;
-      edit->EncodeTo(&record);
-      s = descriptor_log_->AddRecord(record);
-      if (s.ok()) {
-        s = descriptor_file_->Sync();
-      }
-      if (!s.ok()) {
-        Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
-      }
-    }
-
-    // If we just created a new descriptor file, install it by writing a
-    // new CURRENT file that points to it.
-    if (s.ok() && !new_manifest_file.empty()) {
-      s = SetCurrentFile(env_, dbname_, manifest_file_number_);
-    }
-
-    mu->Lock();
-  }
-
   // Install the new version
+  //  matthewv Oct 2013 - this used to be after the MANIFEST write
+  //  but overlapping compactions allow for a file to get lost
+  //  if first does not post to version completely.
   if (s.ok()) {
     AppendVersion(v);
     log_number_ = edit->log_number_;
     prev_log_number_ = edit->prev_log_number_;
-  } else {
+
+    // Unlock during expensive MANIFEST log write
+    {
+        mu->Unlock();
+
+        // but only one writer at a time
+        {
+            MutexLock lock(&manifest_mutex_);
+            // Write new record to MANIFEST log
+            if (s.ok()) {
+                std::string record;
+                edit->EncodeTo(&record, options_->ExpiryActivated());
+                s = descriptor_log_->AddRecord(record);
+                if (s.ok()) {
+                    s = descriptor_file_->Sync();
+                }
+            }
+
+            // If we just created a new descriptor file, install it by writing a
+            // new CURRENT file that points to it.
+            if (s.ok() && !new_manifest_file.empty()) {
+                s = SetCurrentFile(env_, dbname_, manifest_file_number_);
+            }
+        }   // manifest_lock_
+
+        mu->Lock();
+    }
+  }
+
+  // this used to be "else" clause to if(s.ok)
+  //  moved on Oct 2013
+  else
+  {
     delete v;
     if (!new_manifest_file.empty()) {
       delete descriptor_log_;
@@ -902,7 +943,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
   return s;
 }
 
-Status VersionSet::Recover(bool *save_manifest) {
+Status VersionSet::Recover() {
   struct LogReporter : public log::Reader::Reporter {
     Status* status;
     virtual void Corruption(size_t bytes, const Status& s) {
@@ -951,7 +992,7 @@ Status VersionSet::Recover(bool *save_manifest) {
         if (edit.has_comparator_ &&
             edit.comparator_ != icmp_.user_comparator()->Name()) {
           s = Status::InvalidArgument(
-              edit.comparator_ + " does not match existing comparator ",
+              edit.comparator_ + "does not match existing comparator ",
               icmp_.user_comparator()->Name());
         }
       }
@@ -1005,99 +1046,368 @@ Status VersionSet::Recover(bool *save_manifest) {
     Version* v = new Version(this);
     builder.SaveTo(v);
     // Install recovered version
-    Finalize(v);
     AppendVersion(v);
     manifest_file_number_ = next_file;
     next_file_number_ = next_file + 1;
     last_sequence_ = last_sequence;
     log_number_ = log_number;
     prev_log_number_ = prev_log_number;
-
-    // See if we can reuse the existing MANIFEST file.
-    if (ReuseManifest(dscname, current)) {
-      // No need to save new manifest
-    } else {
-      *save_manifest = true;
-    }
   }
 
   return s;
 }
 
-bool VersionSet::ReuseManifest(const std::string& dscname,
-                               const std::string& dscbase) {
-  if (!options_->reuse_logs) {
-    return false;
-  }
-  FileType manifest_type;
-  uint64_t manifest_number;
-  uint64_t manifest_size;
-  if (!ParseFileName(dscbase, &manifest_number, &manifest_type) ||
-      manifest_type != kDescriptorFile ||
-      !env_->GetFileSize(dscname, &manifest_size).ok() ||
-      // Make new compacted MANIFEST if old one is too big
-      manifest_size >= TargetFileSize(options_)) {
-    return false;
-  }
-
-  assert(descriptor_file_ == NULL);
-  assert(descriptor_log_ == NULL);
-  Status r = env_->NewAppendableFile(dscname, &descriptor_file_);
-  if (!r.ok()) {
-    Log(options_->info_log, "Reuse MANIFEST: %s\n", r.ToString().c_str());
-    assert(descriptor_file_ == NULL);
-    return false;
-  }
-
-  Log(options_->info_log, "Reusing MANIFEST %s\n", dscname.c_str());
-  descriptor_log_ = new log::Writer(descriptor_file_, manifest_size);
-  manifest_file_number_ = manifest_number;
-  return true;
-}
-
 void VersionSet::MarkFileNumberUsed(uint64_t number) {
   if (next_file_number_ <= number) {
     next_file_number_ = number + 1;
   }
 }
 
-void VersionSet::Finalize(Version* v) {
-  // Precomputed best level for next compaction
-  int best_level = -1;
-  double best_score = -1;
 
-  for (int level = 0; level < config::kNumLevels-1; level++) {
-    double score;
-    if (level == 0) {
-      // We treat level-0 specially by bounding the number of files
-      // instead of number of bytes for two reasons:
-      //
-      // (1) With larger write-buffer sizes, it is nice not to do too
-      // many level-0 compactions.
-      //
-      // (2) The files in level-0 are merged on every read and
-      // therefore we wish to avoid too many files when the individual
-      // file size is small (perhaps because of a small write-buffer
-      // setting, or very high compression ratios, or lots of
-      // overwrites/deletions).
-      score = v->files_[level].size() /
-          static_cast<double>(config::kL0_CompactionTrigger);
-    } else {
-      // Compute the ratio of current size to size limit.
-      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
-      score =
-          static_cast<double>(level_bytes) / MaxBytesForLevel(options_, level);
-    }
+bool
+VersionSet::NeighborCompactionsQuiet(int level)
+{
+    uint64_t parent_level_bytes(0);
 
-    if (score > best_score) {
-      best_level = level;
-      best_score = score;
-    }
-  }
+    if (level < config::kNumLevels-1)
+        parent_level_bytes = TotalFileSize(current_->files_[level+1]);
+
+    // not an overlapped level and must not have compactions
+    //   scheduled on either level below or level above
+    return((0==level || !m_CompactionStatus[level-1].m_Submitted)
+           && !gLevelTraits[level].m_OverlappedFiles
+           && (level==config::kNumLevels-1
+               || (!m_CompactionStatus[level+1].m_Submitted
+                   && parent_level_bytes<=((gLevelTraits[level+1].m_MaxBytesForLevel
+                                            +gLevelTraits[level+1].m_DesiredBytesForLevel)/2))));
+}   // VersionSet::NeighborCompactionsQuiet
+
+
+bool
+VersionSet::Finalize(Version* v)
+{
+    // Riak:  looking for first compaction needed in level order
+    int best_level = -1;
+    double best_score = -1;
+    bool compaction_found;
+    bool is_grooming, no_move, expire_file;
+    uint64_t micros_now;
+
+    compaction_found=false;
+    is_grooming=false;
+    no_move=false;
+    expire_file=false;
+    micros_now=env_->NowMicros();
+
+    // Note: level kNumLevels-1 only examined for whole file expiry
+    for (int level = v->compaction_level_+1; level < config::kNumLevels && !compaction_found; ++level)
+    {
+        bool compact_ok;
+        double score(0);
+        uint64_t parent_level_bytes(0);
+
+        is_grooming=false;
+        // is this level eligible for compaction consideration?
+        compact_ok=!m_CompactionStatus[level].m_Submitted;
+
+        // not already scheduled for compaction
+        if (compact_ok)
+        {
+            if (level < (config::kNumLevels-1))
+                parent_level_bytes = TotalFileSize(v->files_[level+1]);
+
+            // is overlapped and so is next level
+            if (gLevelTraits[level].m_OverlappedFiles && gLevelTraits[level+1].m_OverlappedFiles)
+            {
+                // good ... stop consideration
+            }   // if
+
+            // overlapped and next level is not compacting
+            else if (gLevelTraits[level].m_OverlappedFiles && !m_CompactionStatus[level+1].m_Submitted
+                     && (parent_level_bytes<=gLevelTraits[level+1].m_DesiredBytesForLevel
+                         || config::kL0_CompactionTrigger <= v->files_[level].size()))
+            {
+                // good ... stop consideration
+            }   // else if
+
+            else
+            {
+                // must not have compactions scheduled on neither level below nor level above
+                compact_ok=NeighborCompactionsQuiet(level);
+            }   // else
+        }   // if
+
+        // consider this level
+        if (compact_ok)
+        {
+            size_t grooming_trigger;
+            uint64_t elapsed_micros;
+
+            // some platforms use gettimeofday() which can move backward
+            if ( m_CompactionStatus[level].m_LastCompaction < micros_now
+                 && 0 != m_CompactionStatus[level].m_LastCompaction)
+                elapsed_micros=micros_now - m_CompactionStatus[level].m_LastCompaction;
+            else
+                elapsed_micros=0;
+
+            // reevaluating timed grooming ... seems to crush caching
+            //  this disables the code but leaves it in place for future
+            //  reuse after block cache flushing impact addressed
+            elapsed_micros=0;
+
+            // which grooming trigger point?  based upon how long
+            //  since last compaction on this level
+            //   - less than 10 minutes?
+            if (elapsed_micros < config::kL0_Grooming10minMicros)
+                grooming_trigger=config::kL0_GroomingTrigger;
+
+            //   - less than 20 minutes?
+            else if (elapsed_micros < config::kL0_Grooming20minMicros)
+                grooming_trigger=config::kL0_GroomingTrigger10min;
+
+            //   - more than 20 minutes
+            else
+                grooming_trigger=config::kL0_GroomingTrigger20min;
+
+            if (gLevelTraits[level].m_OverlappedFiles) {
+                // We treat level-0 specially by bounding the number of files
+                // instead of number of bytes for two reasons:
+                //
+                // (1) With larger write-buffer sizes, it is nice not to do too
+                // many level-0 compactions.
+                //
+                // (2) The files in level-0 are merged on every read and
+                // therefore we wish to avoid too many files when the individual
+                // file size is small (perhaps because of a small write-buffer
+                // setting, or very high compression ratios, or lots of
+                // overwrites/deletions).
+                score=0;
+
+                // score of 1 at compaction trigger, incrementing for each thereafter
+                if ( config::kL0_CompactionTrigger <= v->files_[level].size())
+                    score += v->files_[level].size() - config::kL0_CompactionTrigger +1;
+
+                is_grooming=false;
+
+                // early overlapped compaction
+                //  only occurs if no other compactions running on groomer thread
+                //  (no grooming if landing level is still overloaded)
+                if (0==score && grooming_trigger<=v->files_[level].size()
+                    && 2<DBList()->GetDBCount(false)   // for non-Riak use cases, helps throughput
+                    && (uint64_t)TotalFileSize(v->files_[config::kNumOverlapLevels])
+		    < gLevelTraits[config::kNumOverlapLevels].m_DesiredBytesForLevel)
+                {
+                    // secondary test, don't push too much to next Overlap too soon
+                    if (!gLevelTraits[level+1].m_OverlappedFiles
+                         || v->files_[level+1].size()<=config::kL0_CompactionTrigger)
+                    {
+                        score=1;
+                        is_grooming=true;
+                    }   // if
+                }   // if
+            }   // if
+
+            // highest level, kNumLevels-1, only considered for expiry not compaction
+            else if (level < config::kNumLevels-1) {
+                // Compute the ratio of current size to size limit.
+                const uint64_t level_bytes = TotalFileSize(v->files_[level]);
+                score = static_cast<double>(level_bytes) / gLevelTraits[level].m_DesiredBytesForLevel;
+                is_grooming=(level_bytes < gLevelTraits[level].m_MaxFileSizeForLevel);
+
+                // force landing level to not be grooming ... ever
+                if (gLevelTraits[level-1].m_OverlappedFiles)
+                    is_grooming=false;
+
+                // within size constraints, are there any deletes worthy of consideration
+                //  (must not do this on overlapped levels.  causes huge throughput problems
+                //   on heavy loads)
+                if (score < 1 && 0!=options_->delete_threshold)
+                {
+                    Version::FileMetaDataVector_t::iterator it;
+
+                    for (it=v->files_[level].begin();
+                         v->files_[level].end()!=it && !compaction_found;
+                         ++it)
+                    {
+                        // if number of tombstones in stats exceeds threshold,
+                        //  we have a compaction candidate
+                        if (options_->delete_threshold <= GetTableCache()->GetStatisticValue((*it)->number, eSstCountDeleteKey))
+                        {
+                            compaction_found=true;
+                            best_level=level;
+                            best_score=0;
+                            v->file_to_compact_=*it;
+                            v->file_to_compact_level_=level;
+                            is_grooming=true;
+                            no_move=true;
+                        }
+                    }   // for
+                }   // if
+            }   // else
+
+            // this code block is old, should be rewritten
+            if (1<=score)
+            {
+                best_level = level;
+                best_score = score;
+                compaction_found=true;
+            }   // if
+
+            // finally test for expiry if no compaction candidates
+            if (!compaction_found && options_->ExpiryActivated())
+            {
+                compaction_found=options_->expiry_module->CompactionFinalizeCallback(false,
+                                                                                     *v,
+                                                                                     level,
+                                                                                     NULL);
+
+                if (compaction_found)
+                {
+                    best_level=level;
+                    best_score=0;
+                    is_grooming=false;
+                    no_move=true;
+                    expire_file=true;
+                    v->file_to_compact_level_=level;
+                }   // if
+            }   // if
+        }   // if
+    }   // for
+
+    // set (almost) all at once to ensure
+    //  no hold over from prior Finalize() call on this version.
+    //  (could rewrite cleaner by doing reset of these at top of function)
+    v->compaction_level_ = best_level;
+    v->compaction_score_ = best_score;
+    v->compaction_grooming_ = is_grooming;
+    v->compaction_no_move_ = no_move;
+    v->compaction_expirefile_ = expire_file;
+
+    return(compaction_found);
+
+} // VersionSet::Finalize
+
+
+/**
+ * UpdatePenalty was previous part of Finalize().  It is now
+ *  an independent routine dedicated to setting the penalty
+ *  value used within the WriteThrottle calculations.
+ *
+ * Penalty is an estimate of how many compactions/keys of work
+ *  are overdue.
+ */
+void
+VersionSet::UpdatePenalty(
+    Version* v)
+{
+    int penalty=0;
+
+    for (int level = 0; level < config::kNumLevels-1; ++level)
+    {
+        int loop, count, value;
+
+        value=0;
+        count=0;
+
+        if (gLevelTraits[level].m_OverlappedFiles)
+        {
+
+            // compute penalty for write throttle if too many Level-0 files accumulating
+            if (config::kL0_SlowdownWritesTrigger < v->NumFiles(level))
+            {
+                // assume each overlapped file represents another pass at same key
+                //   and we are "close" on compaction backlog
+                if ( v->NumFiles(level) < config::kL0_SlowdownWritesTrigger)
+                {
+                    // this code block will not execute due both "if"s using same values now
+                    value = 1;
+                    count = 0;
+                }   // if
+
+                // no longer estimating work, now trying to throw on the breaks
+                //  to keep leveldb from stalling
+                else
+                {
+                    count=(v->NumFiles(level) - config::kL0_SlowdownWritesTrigger);
+
+                    // level 0 has own thread pool and will stall writes,
+                    //  heavy penalty
+                    if (0==level)
+                    {   // non-linear penalty
+                        value=2;
+                    }   // if
+                    else
+                    {   // slightly less penalty
+                        value=1;
+                    }   // else
+                }   // else
+            }   // if
+        }   // if
+        else
+        {
+            const uint64_t level_bytes = TotalFileSize(v->GetFileList(level));
+
+	    // how dire is the situation
+            count=(int)(static_cast<double>(level_bytes) / gLevelTraits[level].m_MaxBytesForLevel);
+
+            if (0<count)
+            {
+	        // how many compaction behind
+                value=(level_bytes-gLevelTraits[level].m_MaxBytesForLevel) / options_->write_buffer_size;
+                value+=1;
+            }   // if
+
+            // this penalty is about reducing write amplification, its
+            //  side effect is to also improve compaction performance across
+            //  the level 1 to 2 to 3 boundry.
+            else if (config::kNumOverlapLevels==level
+                && gLevelTraits[level].m_DesiredBytesForLevel < level_bytes)
+            {
+                // this approximates the number of compactions needed, no other penalty
+                value=(int)(static_cast<double>(level_bytes-gLevelTraits[level].m_DesiredBytesForLevel) / options_->write_buffer_size);
+
+		// how urgent is the need to clear this level before next flood
+		//  (negative value is ignored)
+                count= v->NumFiles(level-1) - (config::kL0_CompactionTrigger/2);
+
+                // only throttle if backlog on the horizon
+                if (count < 0)
+                    value=0;
+            }   // else if
+
+        }   // else
+
+        penalty+=value;
+
+    }   // for
+
+    // put a ceiling on the value
+    if (1000<penalty || penalty<0)
+        penalty=1000;
+
+    uint64_t temp_min;
+    temp_min=port::TimeMicros();
+
+    if (last_penalty_minutes_<temp_min)
+    {
+        last_penalty_minutes_=temp_min+15*1000000;
+
+
+        if (prev_write_penalty_<penalty)
+            prev_write_penalty_+=(penalty - prev_write_penalty_)/7 +1;
+        else
+            prev_write_penalty_-=(prev_write_penalty_ - penalty)/5 +1;
+
+        if (prev_write_penalty_ < 0)
+            prev_write_penalty_ = 0;
+    }   // if
+
+    v->write_penalty_=prev_write_penalty_;
+
+    return;
+
+}   // VersionSet::UpdatePenalty
 
-  v->compaction_level_ = best_level;
-  v->compaction_score_ = best_score;
-}
 
 Status VersionSet::WriteSnapshot(log::Writer* log) {
   // TODO: Break up into multiple records to reduce memory usage on recovery?
@@ -1120,21 +1430,46 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
     const std::vector<FileMetaData*>& files = current_->files_[level];
     for (size_t i = 0; i < files.size(); i++) {
       const FileMetaData* f = files[i];
-      edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
+      edit.AddFile2(level, f->number, f->file_size, f->smallest, f->largest,
+                    f->exp_write_low, f->exp_write_high, f->exp_explicit_high);
     }
   }
 
   std::string record;
-  edit.EncodeTo(&record);
+  edit.EncodeTo(&record, options_->ExpiryActivated());
   return log->AddRecord(record);
 }
 
-int VersionSet::NumLevelFiles(int level) const {
+size_t VersionSet::NumLevelFiles(int level) const {
   assert(level >= 0);
   assert(level < config::kNumLevels);
   return current_->files_[level].size();
 }
 
+bool VersionSet::IsLevelOverlapped(int level) {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  return(gLevelTraits[level].m_OverlappedFiles);
+}
+
+uint64_t VersionSet::DesiredBytesForLevel(int level) {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  return(gLevelTraits[level].m_DesiredBytesForLevel);
+}
+
+uint64_t VersionSet::MaxBytesForLevel(int level) {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  return(gLevelTraits[level].m_MaxBytesForLevel);
+}
+
+uint64_t VersionSet::MaxFileSizeForLevel(int level) {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  return(gLevelTraits[level].m_MaxFileSizeForLevel);
+}
+
 const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
   // Update code if kNumLevels changes
   assert(config::kNumLevels == 7);
@@ -1150,6 +1485,22 @@ const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
   return scratch->buffer;
 }
 
+const char* VersionSet::CompactionSummary(LevelSummaryStorage* scratch) const {
+  // Update code if kNumLevels changes
+  assert(config::kNumLevels == 7);
+  snprintf(scratch->buffer, sizeof(scratch->buffer),
+           "files[ %d,%d %d,%d %d,%d %d,%d %d,%d %d,%d %d,%d ]",
+           m_CompactionStatus[0].m_Submitted, m_CompactionStatus[0].m_Running,
+           m_CompactionStatus[1].m_Submitted, m_CompactionStatus[1].m_Running,
+           m_CompactionStatus[2].m_Submitted, m_CompactionStatus[2].m_Running,
+           m_CompactionStatus[3].m_Submitted, m_CompactionStatus[3].m_Running,
+           m_CompactionStatus[4].m_Submitted, m_CompactionStatus[4].m_Running,
+           m_CompactionStatus[5].m_Submitted, m_CompactionStatus[5].m_Running,
+           m_CompactionStatus[6].m_Submitted, m_CompactionStatus[6].m_Running);
+
+  return scratch->buffer;
+}
+
 uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
   uint64_t result = 0;
   for (int level = 0; level < config::kNumLevels; level++) {
@@ -1160,8 +1511,8 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
         result += files[i]->file_size;
       } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
         // Entire file is after "ikey", so ignore
-        if (level > 0) {
-          // Files other than level 0 are sorted by meta->smallest, so
+        if (!gLevelTraits[level].m_OverlappedFiles) {
+          // Non-overlapped files are sorted by meta->smallest, so
           // no further files in this level will contain data for
           // "ikey".
           break;
@@ -1171,7 +1522,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
         // approximate offset of "ikey" within the table.
         Table* tableptr;
         Iterator* iter = table_cache_->NewIterator(
-            ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
+            ReadOptions(), files[i]->number, files[i]->file_size, level, &tableptr);
         if (tableptr != NULL) {
           result += tableptr->ApproximateOffsetOf(ikey.Encode());
         }
@@ -1257,22 +1608,35 @@ void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
 
 Iterator* VersionSet::MakeInputIterator(Compaction* c) {
   ReadOptions options;
-  options.verify_checksums = options_->paranoid_checks;
+  options.verify_checksums = options_->verify_compactions;
   options.fill_cache = false;
+  options.is_compaction = true;
+  options.info_log = options_->info_log;
+  options.dbname = dbname_;
+  options.env = env_;
+
+  int which_limit, space;
 
   // Level-0 files have to be merged together.  For other levels,
   // we will make a concatenating iterator per level.
   // TODO(opt): use concatenating iterator for level-0 if there is no overlap
-  const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);
+  // (during a repair, all levels use merge iterator as a precaution)
+  if (!options_->is_repair)
+      space = (gLevelTraits[c->level()].m_OverlappedFiles ? c->inputs_[0].size() + 1 : 2);
+  else
+      space =  c->inputs_[0].size() + c->inputs_[1].size();
+
   Iterator** list = new Iterator*[space];
   int num = 0;
-  for (int which = 0; which < 2; which++) {
+
+  which_limit=gLevelTraits[c->level()+1].m_OverlappedFiles ? 1 : 2;
+  for (int which = 0; which < which_limit; which++) {
     if (!c->inputs_[which].empty()) {
-      if (c->level() + which == 0) {
+      if (gLevelTraits[c->level() + which].m_OverlappedFiles || options_->is_repair) {
         const std::vector<FileMetaData*>& files = c->inputs_[which];
         for (size_t i = 0; i < files.size(); i++) {
           list[num++] = table_cache_->NewIterator(
-              options, files[i]->number, files[i]->file_size);
+              options, files[i]->number, files[i]->file_size, c->level() + which);
         }
       } else {
         // Create concatenating iterator for the files from this level
@@ -1288,58 +1652,126 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
   return result;
 }
 
-Compaction* VersionSet::PickCompaction() {
+
+/**
+ * PickCompactions() directly feeds hot_thread pools as of October 2013
+ */
+void
+VersionSet::PickCompaction(
+    class DBImpl * db_impl)
+{
   Compaction* c;
   int level;
 
-  // We prefer compactions triggered by too much data in a level over
-  // the compactions triggered by seeks.
-  const bool size_compaction = (current_->compaction_score_ >= 1);
-  const bool seek_compaction = (current_->file_to_compact_ != NULL);
-  if (size_compaction) {
-    level = current_->compaction_level_;
-    assert(level >= 0);
-    assert(level+1 < config::kNumLevels);
-    c = new Compaction(options_, level);
+  // perform this once per call ... since Finalize now loops
+  UpdatePenalty(current_);
 
-    // Pick the first file that comes after compact_pointer_[level]
-    for (size_t i = 0; i < current_->files_[level].size(); i++) {
-      FileMetaData* f = current_->files_[level][i];
-      if (compact_pointer_[level].empty() ||
-          icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
-        c->inputs_[0].push_back(f);
-        break;
+  // submit a work object for every valid compaction needed
+  current_->compaction_level_=-1;
+  while(Finalize(current_))
+  {
+      bool submit_flag;
+
+      Log(options_->info_log,"Finalize level: %d, grooming %d",
+	  current_->compaction_level_, current_->compaction_grooming_);
+
+      c=NULL;
+
+      // We prefer compactions triggered by too much data in a level over
+      // the compactions triggered by seeks.  (Riak redefines "seeks" to
+      // "files containing delete tombstones")
+      const bool size_compaction = (current_->compaction_score_ >= 1);
+      const bool seek_compaction = (current_->file_to_compact_ != NULL);
+      if (size_compaction)
+      {
+          level = current_->compaction_level_;
+          assert(level >= 0);
+          assert(level+1 < config::kNumLevels);
+
+          c = new Compaction(level);
+
+          // Pick the first file that comes after compact_pointer_[level]
+          for (size_t i = 0; i < current_->files_[level].size(); i++) {
+              FileMetaData* f = current_->files_[level][i];
+              if (compact_pointer_[level].empty() ||
+                  icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
+                  c->inputs_[0].push_back(f);
+                  break;
+              }
+          }
+          if (c->inputs_[0].empty()) {
+              // Wrap-around to the beginning of the key space
+              c->inputs_[0].push_back(current_->files_[level][0]);
+          }
+      } else if (seek_compaction) {
+          level = current_->file_to_compact_level_;
+          c = new Compaction(level);
+          c->inputs_[0].push_back(current_->file_to_compact_);
+      } else if (current_->compaction_expirefile_) {
+          level = current_->file_to_compact_level_;
+          c = new Compaction(level);
+          c->compaction_type_=kExpiryFileCompaction;
+      } else {
+          return;
       }
-    }
-    if (c->inputs_[0].empty()) {
-      // Wrap-around to the beginning of the key space
-      c->inputs_[0].push_back(current_->files_[level][0]);
-    }
-  } else if (seek_compaction) {
-    level = current_->file_to_compact_level_;
-    c = new Compaction(options_, level);
-    c->inputs_[0].push_back(current_->file_to_compact_);
-  } else {
-    return NULL;
-  }
 
-  c->input_version_ = current_;
-  c->input_version_->Ref();
+      c->input_version_ = current_;
+      c->input_version_->Ref();
+      c->no_move_ = current_->compaction_no_move_;
 
-  // Files in level 0 may overlap each other, so pick up all overlapping ones
-  if (level == 0) {
-    InternalKey smallest, largest;
-    GetRange(c->inputs_[0], &smallest, &largest);
-    // Note that the next call will discard the file we placed in
-    // c->inputs_[0] earlier and replace it with an overlapping set
-    // which will include the picked file.
-    current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
-    assert(!c->inputs_[0].empty());
-  }
+      // set submitted as race defense
+      m_CompactionStatus[level].m_Submitted=true;
 
-  SetupOtherInputs(c);
+      if (!current_->compaction_expirefile_)
+      {
+          // m_OverlappedFiles==true levels have files that
+          //   may overlap each other, so pick up all overlapping ones
+          if (gLevelTraits[level].m_OverlappedFiles) {
+              InternalKey smallest, largest;
+              GetRange(c->inputs_[0], &smallest, &largest);
+              // Note that the next call will discard the file we placed in
+              // c->inputs_[0] earlier and replace it with an overlapping set
+              // which will include the picked file.
+              current_->GetOverlappingInputs(level, &smallest, &largest, &c->inputs_[0]);
+              assert(!c->inputs_[0].empty());
 
-  return c;
+              // this can get into tens of thousands after a repair
+              //  keep it sane
+              size_t max_open_files=100;  // previously an options_ member variable
+              if (max_open_files < c->inputs_[0].size())
+              {
+                  std::nth_element(c->inputs_[0].begin(),
+                                   c->inputs_[0].begin()+max_open_files-1,
+                                   c->inputs_[0].end(),FileMetaDataPtrCompare(options_->comparator));
+                  c->inputs_[0].erase(c->inputs_[0].begin()+max_open_files,
+                                      c->inputs_[0].end());
+              }   // if
+          }   // if
+
+          SetupOtherInputs(c);
+
+          ThreadTask * task=new CompactionTask(db_impl, c);
+
+          if (0==level)
+              submit_flag=gLevel0Threads->Submit(task, !current_->compaction_grooming_);
+          else
+              submit_flag=gCompactionThreads->Submit(task, !current_->compaction_grooming_);
+      }   // if
+
+      // expiry compaction
+      else
+      {
+          ThreadTask * task=new CompactionTask(db_impl, c);
+          submit_flag=gCompactionThreads->Submit(task, true);
+      }   // else
+
+      // set/reset submitted based upon truth of queuing
+      //  (ref counting will auto delete task rejected)
+      m_CompactionStatus[level].m_Submitted=submit_flag;
+
+  }   // while
+
+  return;
 }
 
 void VersionSet::SetupOtherInputs(Compaction* c) {
@@ -1347,53 +1779,79 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
   InternalKey smallest, largest;
   GetRange(c->inputs_[0], &smallest, &largest);
 
-  current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]);
+  if (!gLevelTraits[level+1].m_OverlappedFiles)
+  {
+      current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]);
 
-  // Get entire range covered by compaction
-  InternalKey all_start, all_limit;
-  GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+      // Get entire range covered by compaction
+      InternalKey all_start, all_limit;
+      GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
 
-  // See if we can grow the number of inputs in "level" without
-  // changing the number of "level+1" files we pick up.
-  if (!c->inputs_[1].empty()) {
-    std::vector<FileMetaData*> expanded0;
-    current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
-    const int64_t inputs0_size = TotalFileSize(c->inputs_[0]);
-    const int64_t inputs1_size = TotalFileSize(c->inputs_[1]);
-    const int64_t expanded0_size = TotalFileSize(expanded0);
-    if (expanded0.size() > c->inputs_[0].size() &&
-        inputs1_size + expanded0_size <
-            ExpandedCompactionByteSizeLimit(options_)) {
-      InternalKey new_start, new_limit;
-      GetRange(expanded0, &new_start, &new_limit);
-      std::vector<FileMetaData*> expanded1;
-      current_->GetOverlappingInputs(level+1, &new_start, &new_limit,
-                                     &expanded1);
-      if (expanded1.size() == c->inputs_[1].size()) {
-        Log(options_->info_log,
-            "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n",
-            level,
-            int(c->inputs_[0].size()),
-            int(c->inputs_[1].size()),
-            long(inputs0_size), long(inputs1_size),
-            int(expanded0.size()),
-            int(expanded1.size()),
-            long(expanded0_size), long(inputs1_size));
-        smallest = new_start;
-        largest = new_limit;
-        c->inputs_[0] = expanded0;
-        c->inputs_[1] = expanded1;
-        GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+      // See if we can grow the number of inputs in "level" without
+      // changing the number of "level+1" files we pick up.
+      if (!c->inputs_[1].empty()) {
+          std::vector<FileMetaData*> expanded0;
+          current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
+          //const int64_t inputs0_size = TotalFileSize(c->inputs_[0]);
+          const int64_t inputs1_size = TotalFileSize(c->inputs_[1]);
+          const int64_t expanded0_size = TotalFileSize(expanded0);
+          if (expanded0.size() > c->inputs_[0].size() &&
+              inputs1_size + expanded0_size < gLevelTraits[level].m_ExpandedCompactionByteSizeLimit) {
+              InternalKey new_start, new_limit;
+              GetRange(expanded0, &new_start, &new_limit);
+              std::vector<FileMetaData*> expanded1;
+              current_->GetOverlappingInputs(level+1, &new_start, &new_limit,
+                                             &expanded1);
+              if (expanded1.size() == c->inputs_[1].size()) {
+#if 0  // mutex_ held
+                  Log(options_->info_log,
+                      "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n",
+                      level,
+                      int(c->inputs_[0].size()),
+                      int(c->inputs_[1].size()),
+                      long(inputs0_size), long(inputs1_size),
+                      int(expanded0.size()),
+                      int(expanded1.size()),
+                      long(expanded0_size), long(inputs1_size));
+#endif
+                  smallest = new_start;
+                  largest = new_limit;
+                  c->inputs_[0] = expanded0;
+                  c->inputs_[1] = expanded1;
+                  GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+              }
+          }
       }
-    }
-  }
 
-  // Compute the set of grandparent files that overlap this compaction
-  // (parent == level+1; grandparent == level+2)
-  if (level + 2 < config::kNumLevels) {
-    current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
-                                   &c->grandparents_);
-  }
+      // Compute the set of grandparent files that overlap this compaction
+      // (parent == level+1; grandparent == level+2)
+      if (level + 2 < config::kNumLevels) {
+          current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
+                                         &c->grandparents_);
+      }
+  }   // if
+#if 1
+  // compacting into an overlapped layer
+  else
+  {
+      // if this is NOT a repair (or panic) situation, take all files
+      //  to reduce write amplification
+      if (c->inputs_[0].size()<=config::kL0_StopWritesTrigger
+          && c->inputs_[0].size()!=current_->files_[level].size())
+      {
+          c->inputs_[0].clear();
+          c->inputs_[0].reserve(current_->files_[level].size());
+
+          for (size_t i = 0; i < current_->files_[level].size(); ++i )
+          {
+              FileMetaData* f = current_->files_[level][i];
+              c->inputs_[0].push_back(f);
+          }   // for
+
+          GetRange(c->inputs_[0], &smallest, &largest);
+      }   // if
+  }   // else
+#endif
 
   if (false) {
     Log(options_->info_log, "Compacting %d '%s' .. '%s'",
@@ -1421,23 +1879,18 @@ Compaction* VersionSet::CompactRange(
   }
 
   // Avoid compacting too much in one shot in case the range is large.
-  // But we cannot do this for level-0 since level-0 files can overlap
-  // and we must not pick one file and drop another older file if the
-  // two files overlap.
-  if (level > 0) {
-    const uint64_t limit = MaxFileSizeForLevel(options_, level);
-    uint64_t total = 0;
-    for (size_t i = 0; i < inputs.size(); i++) {
-      uint64_t s = inputs[i]->file_size;
-      total += s;
-      if (total >= limit) {
-        inputs.resize(i + 1);
-        break;
-      }
+  const uint64_t limit = gLevelTraits[level].m_MaxFileSizeForLevel;
+  uint64_t total = 0;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    uint64_t s = inputs[i]->file_size;
+    total += s;
+    if (total >= limit) {
+      inputs.resize(i + 1);
+      break;
     }
   }
 
-  Compaction* c = new Compaction(options_, level);
+  Compaction* c = new Compaction(level);
   c->input_version_ = current_;
   c->input_version_->Ref();
   c->inputs_[0] = inputs;
@@ -1445,13 +1898,21 @@ Compaction* VersionSet::CompactRange(
   return c;
 }
 
-Compaction::Compaction(const Options* options, int level)
+
+Compaction::Compaction(int level)
     : level_(level),
-      max_output_file_size_(MaxFileSizeForLevel(options, level)),
+      max_output_file_size_(gLevelTraits[level].m_MaxFileSizeForLevel),
       input_version_(NULL),
+      compaction_type_(kNormalCompaction),
       grandparent_index_(0),
       seen_key_(false),
-      overlapped_bytes_(0) {
+      overlapped_bytes_(0),
+      tot_user_data_(0), tot_index_keys_(0),
+      avg_value_size_(0), avg_key_size_(0), avg_block_size_(0),
+      compressible_(true),
+      stats_done_(false),
+      no_move_(false)
+  {
   for (int i = 0; i < config::kNumLevels; i++) {
     level_ptrs_[i] = 0;
   }
@@ -1464,13 +1925,24 @@ Compaction::~Compaction() {
 }
 
 bool Compaction::IsTrivialMove() const {
-  const VersionSet* vset = input_version_->vset_;
   // Avoid a move if there is lots of overlapping grandparent data.
   // Otherwise, the move could create a parent file that will require
   // a very expensive merge later on.
-  return (num_input_files(0) == 1 && num_input_files(1) == 0 &&
-          TotalFileSize(grandparents_) <=
-              MaxGrandParentOverlapBytes(vset->options_));
+#if 1
+  return (!gLevelTraits[level_].m_OverlappedFiles &&
+          IsMoveOk() &&
+          num_input_files(0) == 1 &&
+          num_input_files(1) == 0 &&
+          (uint64_t)TotalFileSize(grandparents_) <= gLevelTraits[level_].m_MaxGrandParentOverlapBytes);
+#else
+  // removed this functionality when creating gLevelTraits[].m_OverlappedFiles
+  //  flag.  "Move" was intented by Google to delay compaction by moving small
+  //  files in-between non-overlapping sorted files.  New concept is to delay
+  //  all compactions by creating larger log files before starting to thrash
+  //  disk by maintaining smaller sorted files.  Less thrash -> higher throughput
+  return(false);
+#endif
+
 }
 
 void Compaction::AddInputDeletions(VersionEdit* edit) {
@@ -1482,47 +1954,76 @@ void Compaction::AddInputDeletions(VersionEdit* edit) {
 }
 
 bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
-  // Maybe use binary search to find right entry instead of linear search?
-  const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
-  for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) {
-    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
-    for (; level_ptrs_[lvl] < files.size(); ) {
-      FileMetaData* f = files[level_ptrs_[lvl]];
-      if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
-        // We've advanced far enough
-        if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
-          // Key falls in this file's range, so definitely not base level
-          return false;
+    bool ret_flag;
+
+    ret_flag=true;
+
+    if (gLevelTraits[level_].m_OverlappedFiles
+        || gLevelTraits[level_+1].m_OverlappedFiles)
+    {
+        ret_flag=false;
+    }   // if
+    else
+    {
+        // Maybe use binary search to find right entry instead of linear search?
+        const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
+        for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) {
+            const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
+            for (; level_ptrs_[lvl] < files.size(); ) {
+                FileMetaData* f = files[level_ptrs_[lvl]];
+                if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+                    // We've advanced far enough
+                    if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
+                        // Key falls in this file's range, so definitely not base level
+                        return false;
+                    }
+                    break;
+                }
+                level_ptrs_[lvl]++;
+            }
         }
-        break;
-      }
-      level_ptrs_[lvl]++;
-    }
-  }
-  return true;
+    }   // else
+
+    return ret_flag;
 }
 
-bool Compaction::ShouldStopBefore(const Slice& internal_key) {
-  const VersionSet* vset = input_version_->vset_;
-  // Scan to find earliest grandparent file that contains key.
-  const InternalKeyComparator* icmp = &vset->icmp_;
-  while (grandparent_index_ < grandparents_.size() &&
-      icmp->Compare(internal_key,
-                    grandparents_[grandparent_index_]->largest.Encode()) > 0) {
-    if (seen_key_) {
-      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
-    }
-    grandparent_index_++;
-  }
-  seen_key_ = true;
+bool Compaction::ShouldStopBefore(const Slice& internal_key, size_t key_count) {
 
-  if (overlapped_bytes_ > MaxGrandParentOverlapBytes(vset->options_)) {
-    // Too much overlap for current output; start new output
+  bool ret_flag(false);
+
+  // This is a look ahead to see how costly this key will make the subsequent compaction
+  //  of this new file to the next higher level.  Start a new file if the cost is high.
+  if (!gLevelTraits[level()+1].m_OverlappedFiles)
+  {
+    // Scan to find earliest grandparent file that contains key.
+    const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
+    while (grandparent_index_ < grandparents_.size() &&
+           icmp->Compare(internal_key,
+                         grandparents_[grandparent_index_]->largest.Encode()) > 0) {
+      if (seen_key_) {
+        overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
+      }
+        grandparent_index_++;
+    }
+    seen_key_ = true;
+
+    if (overlapped_bytes_ > gLevelTraits[level_].m_MaxGrandParentOverlapBytes) {
+      // Too much overlap for current output; start new output
+      ret_flag=true;
+    } // if
+
+    // Second consideration:  sorted files need to keep the bloom filter size controlled
+    //  to meet file open speed goals
+    else
+    {
+      ret_flag=(300000<key_count);
+    } // else
+  }  // if
+
+  if (ret_flag)
     overlapped_bytes_ = 0;
-    return true;
-  } else {
-    return false;
-  }
+
+  return(ret_flag);
 }
 
 void Compaction::ReleaseInputs() {
@@ -1532,4 +2033,170 @@ void Compaction::ReleaseInputs() {
   }
 }
 
-}  // namespace leveldb
+/**
+ * Riak specific:  populate statistics data about this compaction
+ */
+void
+Compaction::CalcInputStats(
+    TableCache & tables)
+{
+    uint64_t temp, temp_cnt;
+    size_t value_count, key_count, block_count;
+
+    if (!stats_done_)
+    {
+        tot_user_data_=0;
+        tot_index_keys_=0;
+        avg_value_size_=0; value_count=0;
+        avg_key_size_=0;   key_count=0;
+        avg_block_size_=0; block_count=0;
+        compressible_=(0==level_);
+
+        // walk both levels of input files
+        const size_t level0Count = inputs_[0].size();
+        const size_t level1Count = inputs_[1].size();
+        const size_t totalCount = level0Count + level1Count;
+
+        for (size_t j = 0; j < totalCount; ++j)
+        {
+            FileMetaData * fmd;
+            Status s;
+            Cache::Handle * handle;
+            size_t user_est, idx_est;
+
+            fmd=(j < level0Count) ? inputs_[0][j] : inputs_[1][j-level0Count];
+
+            // compression test
+            // true if more data blocks than data blocks that did not compress
+            //    or if no statistics available
+            compressible_ = compressible_
+                            || (tables.GetStatisticValue(fmd->number, eSstCountBlocks)
+                               >tables.GetStatisticValue(fmd->number, eSstCountCompressAborted))
+                              || 0==tables.GetStatisticValue(fmd->number, eSstCountBlocks);
+
+            // block sizing algorithm
+            temp=0;
+            temp_cnt=0;
+            user_est=0;
+            idx_est=0;
+
+            // get and hold handle to cache entry
+            s=tables.TEST_FindTable(fmd->number, fmd->file_size, fmd->level, &handle);
+
+            if (s.ok())
+            {
+                // 1. total size of all blocks before compaction
+                temp=tables.GetStatisticValue(fmd->number, eSstCountBlockSize);
+
+                // estimate size when counter does not exist
+                if (0==temp)
+                {
+                    TableAndFile * tf;
+
+                    tf=reinterpret_cast<TableAndFile*>(tables.TEST_GetInternalCache()->Value(handle));
+                    if (tf->table->TableObjectSize() < fmd->file_size)
+                        temp=fmd->file_size - tf->table->TableObjectSize();
+                }   // if
+
+                user_est=temp;
+                tot_user_data_+=temp;
+
+                // 2. total keys in the indexes
+                temp=tables.GetStatisticValue(fmd->number, eSstCountIndexKeys);
+
+                // estimate total when counter does not exist
+                if (0==temp)
+                {
+                    TableAndFile * tf;
+                    Block * index_block;
+
+                    tf=reinterpret_cast<TableAndFile*>(tables.TEST_GetInternalCache()->Value(handle));
+                    index_block=tf->table->TEST_GetIndexBlock();
+                    temp=index_block->NumRestarts();
+                }   // if
+
+                idx_est=temp;
+                tot_index_keys_+=temp;
+
+                // 3. average size of values in input set
+                //    (value is really size of value plus size of key)
+                temp=tables.GetStatisticValue(fmd->number, eSstCountValueSize);
+                temp+=tables.GetStatisticValue(fmd->number, eSstCountKeySize);
+                temp_cnt=tables.GetStatisticValue(fmd->number, eSstCountKeys);
+
+                // estimate total when counter does not exist
+                if (0==temp || 0==temp_cnt)
+                {
+                    // no way to estimate total key count
+                    //  (ok, could try from bloom filter size ... but likely no
+                    //   bloom filter if no stats)
+                    temp=0;
+                    temp_cnt=0;
+                }   // if
+
+                avg_value_size_+=temp;
+                value_count+=temp_cnt;
+
+                // 4. average key size
+                temp=tables.GetStatisticValue(fmd->number, eSstCountKeySize);
+                temp_cnt=tables.GetStatisticValue(fmd->number, eSstCountKeys);
+
+                // estimate total when counter does not exist
+                if (0==temp || 0==temp_cnt)
+                {
+                    // no way to estimate total key count
+                    //  (ok, could try from bloom filter size ... but likely no
+                    //   bloom filter if no stats)
+                    temp=0;
+                    temp_cnt=0;
+                }   // if
+
+                avg_key_size_+=temp;
+                key_count+=temp_cnt;
+
+                // 5. block key size
+                temp=tables.GetStatisticValue(fmd->number, eSstCountBlockSizeUsed);
+                temp_cnt=tables.GetStatisticValue(fmd->number, eSstCountBlocks);
+                temp*=temp_cnt;
+
+                // estimate total when counter does not exist
+                if (0==temp || 0==temp_cnt)
+                {
+                    temp=user_est;
+                    temp_cnt=idx_est;
+                }   // if
+
+                avg_block_size_+=temp;
+                block_count+=temp_cnt;
+
+                // cleanup
+                tables.Release(handle);
+            }   // if
+        }   // for
+
+        // compute averages
+        if (0!=value_count)
+            avg_value_size_/=value_count;
+        else
+            avg_value_size_=0;
+
+        if (0!=key_count)
+            avg_key_size_/=key_count;
+        else
+            avg_key_size_=0;
+
+        if (0!=block_count)
+            avg_block_size_/=block_count;
+        else
+            avg_block_size_=0;
+
+        // only want to do this once per compaction
+        stats_done_=true;
+    }   // if
+
+    return;
+
+}   // Compaction::CalcInputStats
+
+
+}   // namespace leveldb
diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h
index 7935a965a..477c3d8d4 100644
--- a/src/leveldb/db/version_set.h
+++ b/src/leveldb/db/version_set.h
@@ -21,7 +21,9 @@
 #include "db/dbformat.h"
 #include "db/version_edit.h"
 #include "port/port.h"
-#include "port/thread_annotations.h"
+#include "leveldb/atomics.h"
+#include "leveldb/env.h"
+#include "util/throttle.h"
 
 namespace leveldb {
 
@@ -70,7 +72,7 @@ class Version {
     FileMetaData* seek_file;
     int seek_file_level;
   };
-  Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
+  Status Get(const ReadOptions&, const LookupKey& key, Value* val,
              GetStats* stats);
 
   // Adds "stats" into the current state.  Returns true if a new
@@ -78,12 +80,6 @@ class Version {
   // REQUIRES: lock is held
   bool UpdateStats(const GetStats& stats);
 
-  // Record a sample of bytes read at the specified internal key.
-  // Samples are taken approximately once every config::kReadBytesPeriod
-  // bytes.  Returns true if a new compaction may need to be triggered.
-  // REQUIRES: lock is held
-  bool RecordReadSample(Slice key);
-
   // Reference count management (so Versions do not disappear out from
   // under live iterators)
   void Ref();
@@ -101,43 +97,47 @@ class Version {
   // largest_user_key==NULL represents a key largest than all keys in the DB.
   bool OverlapInLevel(int level,
                       const Slice* smallest_user_key,
-                      const Slice* largest_user_key);
+                      const Slice* largest_user_key) const;
 
   // Return the level at which we should place a new memtable compaction
   // result that covers the range [smallest_user_key,largest_user_key].
   int PickLevelForMemTableOutput(const Slice& smallest_user_key,
-                                 const Slice& largest_user_key);
+                                 const Slice& largest_user_key,
+                                 const int level_limit);
 
-  int NumFiles(int level) const { return files_[level].size(); }
+  virtual size_t NumFiles(int level) const { return files_[level].size(); }
+
+  const VersionSet * GetVersionSet() const { return vset_; }
+
+  typedef std::vector<FileMetaData*> FileMetaDataVector_t;
+
+  virtual const std::vector<FileMetaData*> & GetFileList(int level) const {return files_[level];};
+
+  volatile int WritePenalty() const {return write_penalty_; }
+
+  // Riak specific repair routine
+  bool VerifyLevels(int & level, InternalKey & begin, InternalKey & end);
 
   // Return a human readable string that describes this version's contents.
   std::string DebugString() const;
 
- private:
+protected:
   friend class Compaction;
   friend class VersionSet;
 
   class LevelFileNumIterator;
   Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
 
-  // Call func(arg, level, f) for every file that overlaps user_key in
-  // order from newest to oldest.  If an invocation of func returns
-  // false, makes no more calls.
-  //
-  // REQUIRES: user portion of internal_key == user_key.
-  void ForEachOverlapping(Slice user_key, Slice internal_key,
-                          void* arg,
-                          bool (*func)(void*, int, FileMetaData*));
-
   VersionSet* vset_;            // VersionSet to which this Version belongs
   Version* next_;               // Next version in linked list
   Version* prev_;               // Previous version in linked list
   int refs_;                    // Number of live refs to this version
 
   // List of files per level
-  std::vector<FileMetaData*> files_[config::kNumLevels];
+  USED_BY_NESTED_FRIEND(std::vector<FileMetaData*> files_[config::kNumLevels];)
 
-  // Next file to compact based on seek stats.
+ protected:
+  // Next file to compact based on seek stats (or Riak delete test)
   FileMetaData* file_to_compact_;
   int file_to_compact_level_;
 
@@ -146,17 +146,29 @@ class Version {
   // are initialized by Finalize().
   double compaction_score_;
   int compaction_level_;
+  bool compaction_grooming_;
+  bool compaction_no_move_;
+  bool compaction_expirefile_;
+  volatile int write_penalty_;
 
+ protected:
+  // make the ctor/dtor protected, so that a unit test can subclass
   explicit Version(VersionSet* vset)
       : vset_(vset), next_(this), prev_(this), refs_(0),
         file_to_compact_(NULL),
         file_to_compact_level_(-1),
         compaction_score_(-1),
-        compaction_level_(-1) {
+        compaction_level_(-1),
+        compaction_grooming_(false),
+        compaction_no_move_(false),
+        compaction_expirefile_(false),
+        write_penalty_(0)
+  {
   }
 
-  ~Version();
+  virtual ~Version();
 
+private:
   // No copying allowed
   Version(const Version&);
   void operator=(const Version&);
@@ -175,11 +187,10 @@ class VersionSet {
   // current version.  Will release *mu while actually writing to the file.
   // REQUIRES: *mu is held on entry.
   // REQUIRES: no other thread concurrently calls LogAndApply()
-  Status LogAndApply(VersionEdit* edit, port::Mutex* mu)
-      EXCLUSIVE_LOCKS_REQUIRED(mu);
+  Status LogAndApply(VersionEdit* edit, port::Mutex* mu);
 
   // Recover the last saved descriptor from persistent storage.
-  Status Recover(bool *save_manifest);
+  Status Recover();
 
   // Return the current version.
   Version* current() const { return current_; }
@@ -188,19 +199,29 @@ class VersionSet {
   uint64_t ManifestFileNumber() const { return manifest_file_number_; }
 
   // Allocate and return a new file number
-  uint64_t NewFileNumber() { return next_file_number_++; }
+  //  (-1 is to "duplicate" old post-increment logic while maintaining
+  //   some threading integrity ... next_file_number_ used naked a bunch)
+  uint64_t NewFileNumber() { return(inc_and_fetch(&next_file_number_) -1); }
 
   // Arrange to reuse "file_number" unless a newer file number has
   // already been allocated.
   // REQUIRES: "file_number" was returned by a call to NewFileNumber().
+  //  (disabled due to threading concerns ... and desire NOT to use mutex, matthewv)
   void ReuseFileNumber(uint64_t file_number) {
-    if (next_file_number_ == file_number + 1) {
-      next_file_number_ = file_number;
-    }
+//    if (next_file_number_ == file_number + 1) {
+//      next_file_number_ = file_number;
+//    }
   }
 
   // Return the number of Table files at the specified level.
-  int NumLevelFiles(int level) const;
+  size_t NumLevelFiles(int level) const;
+
+  // is the specified level overlapped (or if false->sorted)
+  static bool IsLevelOverlapped(int level);
+
+  static uint64_t DesiredBytesForLevel(int level);
+  static uint64_t MaxBytesForLevel(int level);
+  static uint64_t MaxFileSizeForLevel(int level);
 
   // Return the combined file size of all files at the specified level.
   int64_t NumLevelBytes(int level) const;
@@ -224,11 +245,36 @@ class VersionSet {
   // being compacted, or zero if there is no such log file.
   uint64_t PrevLogNumber() const { return prev_log_number_; }
 
+  int WriteThrottleUsec(bool active_compaction)
+  {
+      uint64_t penalty, throttle;
+      int ret_val;
+
+      penalty=current_->write_penalty_;
+      throttle=GetThrottleWriteRate();
+
+      ret_val=0;
+      if (0==penalty && 1!=throttle)
+          ret_val=(int)throttle;
+      else if (0!=penalty)
+      {
+          if (1==throttle)
+              throttle=GetUnadjustedThrottleWriteRate();
+          ret_val=(int)penalty * throttle;
+      }   // else if
+
+      return(ret_val);
+  }
+
+
   // Pick level and inputs for a new compaction.
   // Returns NULL if there is no compaction to be done.
   // Otherwise returns a pointer to a heap-allocated object that
   // describes the compaction.  Caller should delete the result.
-  Compaction* PickCompaction();
+  //
+  // Riak October 2013:  Pick Compaction now posts work directly
+  //  to hot_thread pools
+  void PickCompaction(class DBImpl * db_impl);
 
   // Return a compaction object for compacting the range [begin,end] in
   // the specified level.  Returns NULL if there is nothing in that
@@ -267,16 +313,42 @@ class VersionSet {
     char buffer[100];
   };
   const char* LevelSummary(LevelSummaryStorage* scratch) const;
+  const char* CompactionSummary(LevelSummaryStorage* scratch) const;
 
- private:
+  TableCache* GetTableCache() {return(table_cache_);};
+
+  const Options * GetOptions() const {return(options_);};
+
+  bool IsCompactionSubmitted(int level)
+  {return(m_CompactionStatus[level].m_Submitted);}
+
+  void SetCompactionSubmitted(int level)
+  {m_CompactionStatus[level].m_Submitted=true;}
+
+  void SetCompactionRunning(int level)
+  {m_CompactionStatus[level].m_Running=true;}
+
+  void SetCompactionDone(int level, uint64_t Now)
+  {   m_CompactionStatus[level].m_Running=false;
+      m_CompactionStatus[level].m_Submitted=false;
+      // must set both source and destination.  otherwise
+      //  destination might immediately decide it needs a
+      //  timed grooming too ... defeating idea to spreadout the groomings
+      m_CompactionStatus[level].m_LastCompaction=Now;
+      if ((level+1)<config::kNumLevels)
+          m_CompactionStatus[level+1].m_LastCompaction=Now;
+  }
+
+  bool NeighborCompactionsQuiet(int level);
+
+protected:
   class Builder;
 
   friend class Compaction;
   friend class Version;
 
-  bool ReuseManifest(const std::string& dscname, const std::string& dscbase);
-
-  void Finalize(Version* v);
+  bool Finalize(Version* v);
+  void UpdatePenalty(Version *v);
 
   void GetRange(const std::vector<FileMetaData*>& inputs,
                 InternalKey* smallest,
@@ -299,7 +371,7 @@ class VersionSet {
   const Options* const options_;
   TableCache* const table_cache_;
   const InternalKeyComparator icmp_;
-  uint64_t next_file_number_;
+  volatile uint64_t next_file_number_;
   uint64_t manifest_file_number_;
   uint64_t last_sequence_;
   uint64_t log_number_;
@@ -315,11 +387,44 @@ class VersionSet {
   // Either an empty string, or a valid InternalKey.
   std::string compact_pointer_[config::kNumLevels];
 
+  // Riak allows multiple compaction threads, this mutex allows
+  //  only one to write to manifest at a time.  Only used in LogAndApply
+  port::Mutex manifest_mutex_;
+
+  volatile uint64_t last_penalty_minutes_;
+  volatile int prev_write_penalty_;
+
+
+
+  struct CompactionStatus_s
+  {
+      bool m_Submitted;     //!< level submitted to hot thread pool
+      bool m_Running;       //!< thread actually running compaction
+      uint64_t m_LastCompaction; //!<NowMicros() when last compaction completed
+
+      CompactionStatus_s()
+      : m_Submitted(false), m_Running(false), m_LastCompaction(0)
+      {};
+  } m_CompactionStatus[config::kNumLevels];
+
+private:
   // No copying allowed
   VersionSet(const VersionSet&);
   void operator=(const VersionSet&);
 };
 
+//
+// allows routing of compaction request to
+//  diverse processing routines via common
+//  BackgroundCall2 thread entry
+//
+enum CompactionType
+{
+    kNormalCompaction = 0x0,
+    kExpiryFileCompaction = 0x1
+};  // CompactionType
+
+
 // A Compaction encapsulates information about a compaction.
 class Compaction {
  public:
@@ -329,6 +434,9 @@ class Compaction {
   // and "level+1" will be merged to produce a set of "level+1" files.
   int level() const { return level_; }
 
+  // Return parent Version object
+  const Version * version() const { return input_version_; }
+
   // Return the object that holds the edits to the descriptor done
   // by this compaction.
   VersionEdit* edit() { return &edit_; }
@@ -356,32 +464,47 @@ class Compaction {
 
   // Returns true iff we should stop building the current output
   // before processing "internal_key".
-  bool ShouldStopBefore(const Slice& internal_key);
+  bool ShouldStopBefore(const Slice& internal_key, size_t key_count);
 
   // Release the input version for the compaction, once the compaction
   // is successful.
   void ReleaseInputs();
 
+  // Riak specific:  get summary statistics from compaction inputs
+  void CalcInputStats(TableCache & tables);
+  size_t TotalUserDataSize() const {return(tot_user_data_);};
+  size_t TotalIndexKeys()    const {return(tot_index_keys_);};
+  size_t AverageValueSize()  const {return(avg_value_size_);};
+  size_t AverageKeySize()    const {return(avg_key_size_);};
+  size_t AverageBlockSize()  const {return(avg_block_size_);};
+  bool IsCompressible()      const {return(compressible_);};
+
+  // Riak specific:  is move operation ok for compaction?
+  bool IsMoveOk()            const {return(!no_move_);};
+
+  enum CompactionType GetCompactionType() const {return(compaction_type_);};
+
  private:
   friend class Version;
   friend class VersionSet;
 
-  Compaction(const Options* options, int level);
+  explicit Compaction(int level);
 
   int level_;
   uint64_t max_output_file_size_;
   Version* input_version_;
   VersionEdit edit_;
+  CompactionType compaction_type_;
 
   // Each compaction reads inputs from "level_" and "level_+1"
   std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs
 
-  // State used to check for number of overlapping grandparent files
+  // State used to check for number of of overlapping grandparent files
   // (parent == level_ + 1, grandparent == level_ + 2)
   std::vector<FileMetaData*> grandparents_;
   size_t grandparent_index_;  // Index in grandparent_starts_
   bool seen_key_;             // Some output key has been seen
-  int64_t overlapped_bytes_;  // Bytes of overlap between current output
+  uint64_t overlapped_bytes_;  // Bytes of overlap between current output
                               // and grandparent files
 
   // State for implementing IsBaseLevelForKey
@@ -391,6 +514,16 @@ class Compaction {
   // higher level than the ones involved in this compaction (i.e. for
   // all L >= level_ + 2).
   size_t level_ptrs_[config::kNumLevels];
+
+  // Riak specific:  output statistics from CalcInputStats
+  size_t tot_user_data_;
+  size_t tot_index_keys_;
+  size_t avg_value_size_;
+  size_t avg_key_size_;
+  size_t avg_block_size_;
+  bool compressible_;
+  bool stats_done_;
+  bool no_move_;
 };
 
 }  // namespace leveldb
diff --git a/src/leveldb/db/version_set_test.cc b/src/leveldb/db/version_set_test.cc
index 501e34d13..aa36b4ee7 100644
--- a/src/leveldb/db/version_set_test.cc
+++ b/src/leveldb/db/version_set_test.cc
@@ -27,13 +27,13 @@ class FindFileTest {
            SequenceNumber largest_seq = 100) {
     FileMetaData* f = new FileMetaData;
     f->number = files_.size() + 1;
-    f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
-    f->largest = InternalKey(largest, largest_seq, kTypeValue);
+    f->smallest = InternalKey(smallest, 0, smallest_seq, kTypeValue);
+    f->largest = InternalKey(largest, 0, largest_seq, kTypeValue);
     files_.push_back(f);
   }
 
   int Find(const char* key) {
-    InternalKey target(key, 100, kTypeValue);
+    InternalKey target(key, 0, 100, kTypeValue);
     InternalKeyComparator cmp(BytewiseComparator());
     return FindFile(cmp, files_, target.Encode());
   }
diff --git a/src/leveldb/db/write_batch.cc b/src/leveldb/db/write_batch.cc
index 33f4a4257..116e717a9 100644
--- a/src/leveldb/db/write_batch.cc
+++ b/src/leveldb/db/write_batch.cc
@@ -13,13 +13,17 @@
 //    len: varint32
 //    data: uint8[len]
 
-#include "leveldb/write_batch.h"
+#include <stdint.h>
 
 #include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/expiry.h"
+#include "leveldb/write_batch.h"
 #include "db/dbformat.h"
 #include "db/memtable.h"
 #include "db/write_batch_internal.h"
 #include "util/coding.h"
+#include "util/throttle.h"
 
 namespace leveldb {
 
@@ -47,16 +51,17 @@ Status WriteBatch::Iterate(Handler* handler) const {
 
   input.remove_prefix(kHeader);
   Slice key, value;
+  ExpiryTimeMicros expiry;
   int found = 0;
   while (!input.empty()) {
     found++;
-    char tag = input[0];
+    ValueType tag = (ValueType)input[0];
     input.remove_prefix(1);
     switch (tag) {
       case kTypeValue:
         if (GetLengthPrefixedSlice(&input, &key) &&
             GetLengthPrefixedSlice(&input, &value)) {
-          handler->Put(key, value);
+            handler->Put(key, value, kTypeValue, 0);
         } else {
           return Status::Corruption("bad WriteBatch Put");
         }
@@ -68,6 +73,16 @@ Status WriteBatch::Iterate(Handler* handler) const {
           return Status::Corruption("bad WriteBatch Delete");
         }
         break;
+      case kTypeValueWriteTime:
+      case kTypeValueExplicitExpiry:
+        if (GetLengthPrefixedSlice(&input, &key) &&
+            GetVarint64(&input, &expiry) &&
+            GetLengthPrefixedSlice(&input, &value)) {
+            handler->Put(key, value, tag, expiry);
+        } else {
+          return Status::Corruption("bad WriteBatch Expiry");
+        }
+        break;
       default:
         return Status::Corruption("unknown WriteBatch tag");
     }
@@ -95,10 +110,20 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
   EncodeFixed64(&b->rep_[0], seq);
 }
 
-void WriteBatch::Put(const Slice& key, const Slice& value) {
+void WriteBatch::Put(const Slice& key, const Slice& value, const KeyMetaData * meta) {
+  KeyMetaData local_meta;
   WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
-  rep_.push_back(static_cast<char>(kTypeValue));
+  if (NULL!=meta)
+      local_meta=*meta;
+  rep_.push_back(static_cast<char>(local_meta.m_Type));
   PutLengthPrefixedSlice(&rep_, key);
+  if (kTypeValueExplicitExpiry==local_meta.m_Type
+      || kTypeValueWriteTime==local_meta.m_Type)
+  {
+      if (kTypeValueWriteTime==local_meta.m_Type && 0==local_meta.m_Expiry)
+          local_meta.m_Expiry=GetCachedTimeMicros();
+      PutVarint64(&rep_, local_meta.m_Expiry);
+  }   // if
   PutLengthPrefixedSlice(&rep_, value);
 }
 
@@ -113,23 +138,33 @@ class MemTableInserter : public WriteBatch::Handler {
  public:
   SequenceNumber sequence_;
   MemTable* mem_;
+  const Options * options_;
 
-  virtual void Put(const Slice& key, const Slice& value) {
-    mem_->Add(sequence_, kTypeValue, key, value);
+  MemTableInserter() : mem_(NULL), options_(NULL) {};
+
+  virtual void Put(const Slice& key, const Slice& value, const ValueType &type, const ExpiryTimeMicros &expiry) {
+    ValueType type_use(type);
+    ExpiryTimeMicros expiry_use(expiry);
+
+    if (NULL!=options_ && options_->ExpiryActivated())
+        options_->expiry_module->MemTableInserterCallback(key, value, type_use, expiry_use);
+    mem_->Add(sequence_, (ValueType)type_use, key, value, expiry_use);
     sequence_++;
   }
   virtual void Delete(const Slice& key) {
-    mem_->Add(sequence_, kTypeDeletion, key, Slice());
+    mem_->Add(sequence_, kTypeDeletion, key, Slice(), 0);
     sequence_++;
   }
 };
 }  // namespace
 
 Status WriteBatchInternal::InsertInto(const WriteBatch* b,
-                                      MemTable* memtable) {
+                                      MemTable* memtable,
+                                      const Options * options) {
   MemTableInserter inserter;
   inserter.sequence_ = WriteBatchInternal::Sequence(b);
   inserter.mem_ = memtable;
+  inserter.options_ = options;
   return b->Iterate(&inserter);
 }
 
diff --git a/src/leveldb/db/write_batch_internal.h b/src/leveldb/db/write_batch_internal.h
index 9448ef7b2..d313d02da 100644
--- a/src/leveldb/db/write_batch_internal.h
+++ b/src/leveldb/db/write_batch_internal.h
@@ -5,7 +5,6 @@
 #ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
 #define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
 
-#include "db/dbformat.h"
 #include "leveldb/write_batch.h"
 
 namespace leveldb {
@@ -22,10 +21,10 @@ class WriteBatchInternal {
   // Set the count for the number of entries in the batch.
   static void SetCount(WriteBatch* batch, int n);
 
-  // Return the sequence number for the start of this batch.
+  // Return the seqeunce number for the start of this batch.
   static SequenceNumber Sequence(const WriteBatch* batch);
 
-  // Store the specified number as the sequence number for the start of
+  // Store the specified number as the seqeunce number for the start of
   // this batch.
   static void SetSequence(WriteBatch* batch, SequenceNumber seq);
 
@@ -39,7 +38,7 @@ class WriteBatchInternal {
 
   static void SetContents(WriteBatch* batch, const Slice& contents);
 
-  static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
+  static Status InsertInto(const WriteBatch* batch, MemTable* memtable, const Options * options);
 
   static void Append(WriteBatch* dst, const WriteBatch* src);
 };
diff --git a/src/leveldb/db/write_batch_test.cc b/src/leveldb/db/write_batch_test.cc
index 9064e3d85..4854af429 100644
--- a/src/leveldb/db/write_batch_test.cc
+++ b/src/leveldb/db/write_batch_test.cc
@@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include <sstream>
 #include "leveldb/db.h"
 
 #include "db/memtable.h"
@@ -17,11 +18,12 @@ static std::string PrintContents(WriteBatch* b) {
   MemTable* mem = new MemTable(cmp);
   mem->Ref();
   std::string state;
-  Status s = WriteBatchInternal::InsertInto(b, mem);
+  Status s = WriteBatchInternal::InsertInto(b, mem, NULL);
   int count = 0;
   Iterator* iter = mem->NewIterator();
   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
     ParsedInternalKey ikey;
+    std::stringstream sstr;
     ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
     switch (ikey.type) {
       case kTypeValue:
@@ -32,6 +34,28 @@ static std::string PrintContents(WriteBatch* b) {
         state.append(")");
         count++;
         break;
+      case kTypeValueWriteTime:
+        state.append("PutWT(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        sstr << ikey.expiry;
+        state.append(sstr.str());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case kTypeValueExplicitExpiry:
+        state.append("PutEE(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        sstr << ikey.expiry;
+        state.append(sstr.str());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
       case kTypeDeletion:
         state.append("Delete(");
         state.append(ikey.user_key.ToString());
@@ -74,6 +98,32 @@ TEST(WriteBatchTest, Multiple) {
             PrintContents(&batch));
 }
 
+TEST(WriteBatchTest, MultipleExpiry) {
+  WriteBatch batch;
+  KeyMetaData meta;
+  batch.Put(Slice("Mary"), Slice("Lamb"));
+  meta.m_Type=kTypeValueExplicitExpiry;
+  meta.m_Expiry=2347;
+  batch.Put(Slice("Adam"), Slice("Ant"), &meta);
+  //batch.PutExplicitExpiry(Slice("Adam"), Slice("Ant"), 2347);
+  batch.Put(Slice("Frosty"), Slice("Snowman"));
+  batch.Put(Slice("Tip"), Slice("ONeal"));
+  batch.Delete(Slice("Frosty"));
+  meta.m_Type=kTypeValueExplicitExpiry;
+  meta.m_Expiry=987654321;
+  batch.Put(Slice("The"), Slice("Fonz"), &meta);
+  WriteBatchInternal::SetSequence(&batch, 200);
+  ASSERT_EQ(200, WriteBatchInternal::Sequence(&batch));
+  ASSERT_EQ(6, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ("PutEE(Adam, 2347, Ant)@201"
+            "Delete(Frosty)@204"
+            "Put(Frosty, Snowman)@202"
+            "Put(Mary, Lamb)@200"
+            "PutEE(The, 987654321, Fonz)@205"
+            "Put(Tip, ONeal)@203",
+            PrintContents(&batch));
+}
+
 TEST(WriteBatchTest, Corruption) {
   WriteBatch batch;
   batch.Put(Slice("foo"), Slice("bar"));
diff --git a/src/leveldb/doc/bench/db_bench_sqlite3.cc b/src/leveldb/doc/bench/db_bench_sqlite3.cc
index e63aaa8dc..256793a9d 100644
--- a/src/leveldb/doc/bench/db_bench_sqlite3.cc
+++ b/src/leveldb/doc/bench/db_bench_sqlite3.cc
@@ -618,7 +618,7 @@ class Benchmark {
         ErrorCheck(status);
 
         // Execute read statement
-        while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW) {}
+        while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW);
         StepErrorCheck(status);
 
         // Reset SQLite statement for another use
diff --git a/src/leveldb/doc/bench/db_bench_tree_db.cc b/src/leveldb/doc/bench/db_bench_tree_db.cc
index 4ca381f11..ed86f031c 100644
--- a/src/leveldb/doc/bench/db_bench_tree_db.cc
+++ b/src/leveldb/doc/bench/db_bench_tree_db.cc
@@ -338,7 +338,7 @@ class Benchmark {
       bool write_sync = false;
       if (name == Slice("fillseq")) {
         Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
-        DBSynchronize(db_);
+        
       } else if (name == Slice("fillrandom")) {
         Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1);
         DBSynchronize(db_);
diff --git a/src/leveldb/doc/doc.css b/src/leveldb/doc/doc.css
new file mode 100644
index 000000000..700c564e4
--- /dev/null
+++ b/src/leveldb/doc/doc.css
@@ -0,0 +1,89 @@
+body {
+  margin-left: 0.5in;
+  margin-right: 0.5in;
+  background: white;
+  color: black;
+}
+
+h1 {
+  margin-left: -0.2in;
+  font-size: 14pt;
+}
+h2 {
+  margin-left: -0in;
+  font-size: 12pt;
+}
+h3 {
+  margin-left: -0in;
+}
+h4 {
+  margin-left: -0in;
+}
+hr {
+  margin-left: -0in;
+}
+
+/* Definition lists: definition term bold */
+dt {
+  font-weight: bold;
+}
+
+address {
+  text-align: center;
+}
+code,samp,var {
+  color: blue;
+}
+kbd {
+  color: #600000;
+}
+div.note p {
+  float: right;
+  width: 3in;
+  margin-right: 0%;
+  padding: 1px;
+  border: 2px solid #6060a0;
+  background-color: #fffff0;
+}
+
+ul {
+  margin-top: -0em;
+  margin-bottom: -0em;
+}
+
+ol {
+  margin-top: -0em;
+  margin-bottom: -0em;
+}
+
+UL.nobullets {
+  list-style-type: none;
+  list-style-image: none;
+  margin-left: -1em;
+}
+
+p {
+  margin: 1em 0 1em 0;
+  padding: 0 0 0 0;
+}
+
+pre {
+  line-height: 1.3em;
+  padding: 0.4em 0 0.8em 0;
+  margin:  0 0 0 0;
+  border:  0 0 0 0;
+  color: blue;
+}
+
+.datatable {
+  margin-left: auto;
+  margin-right: auto;
+  margin-top: 2em;
+  margin-bottom: 2em;
+  border: 1px solid;
+}
+
+.datatable td,th {
+  padding: 0 0.5em 0 0.5em;
+  text-align: right;
+}
diff --git a/src/leveldb/doc/impl.html b/src/leveldb/doc/impl.html
new file mode 100644
index 000000000..e870795d2
--- /dev/null
+++ b/src/leveldb/doc/impl.html
@@ -0,0 +1,213 @@
+<!DOCTYPE html>
+<html>
+<head>
+<link rel="stylesheet" type="text/css" href="doc.css" />
+<title>Leveldb file layout and compactions</title>
+</head>
+
+<body>
+
+<h1>Files</h1>
+
+The implementation of leveldb is similar in spirit to the
+representation of a single
+<a href="http://labs.google.com/papers/bigtable.html">
+Bigtable tablet (section 5.3)</a>.
+However the organization of the files that make up the representation
+is somewhat different and is explained below.
+
+<p>
+Each database is represented by a set of files stored in a directory.
+There are several different types of files as documented below:
+<p>
+<h2>Log files</h2>
+<p>
+A log file (*.log) stores a sequence of recent updates.  Each update
+is appended to the current log file.  When the log file reaches a
+pre-determined size (approximately 4MB by default), it is converted
+to a sorted table (see below) and a new log file is created for future
+updates.
+<p>
+A copy of the current log file is kept in an in-memory structure (the
+<code>memtable</code>).  This copy is consulted on every read so that read
+operations reflect all logged updates.
+<p>
+<h2>Sorted tables</h2>
+<p>
+A sorted table (*.sst) stores a sequence of entries sorted by key.
+Each entry is either a value for the key, or a deletion marker for the
+key.  (Deletion markers are kept around to hide obsolete values
+present in older sorted tables).
+<p>
+The set of sorted tables are organized into a sequence of levels.  The
+sorted table generated from a log file is placed in a special <code>young</code>
+level (also called level-0).  When the number of young files exceeds a
+certain threshold (currently four), all of the young files are merged
+together with all of the overlapping level-1 files to produce a
+sequence of new level-1 files (we create a new level-1 file for every
+2MB of data.)
+<p>
+Files in the young level may contain overlapping keys.  However files
+in other levels have distinct non-overlapping key ranges.  Consider
+level number L where L >= 1.  When the combined size of files in
+level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2,
+...), one file in level-L, and all of the overlapping files in
+level-(L+1) are merged to form a set of new files for level-(L+1).
+These merges have the effect of gradually migrating new updates from
+the young level to the largest level using only bulk reads and writes
+(i.e., minimizing expensive seeks).
+
+<h2>Manifest</h2>
+<p>
+A MANIFEST file lists the set of sorted tables that make up each
+level, the corresponding key ranges, and other important metadata.
+A new MANIFEST file (with a new number embedded in the file name)
+is created whenever the database is reopened.  The MANIFEST file is
+formatted as a log, and changes made to the serving state (as files
+are added or removed) are appended to this log.
+<p>
+<h2>Current</h2>
+<p>
+CURRENT is a simple text file that contains the name of the latest
+MANIFEST file.
+<p>
+<h2>Info logs</h2>
+<p>
+Informational messages are printed to files named LOG and LOG.old.
+<p>
+<h2>Others</h2>
+<p>
+Other files used for miscellaneous purposes may also be present
+(LOCK, *.dbtmp).
+
+<h1>Level 0</h1>
+When the log file grows above a certain size (1MB by default):
+<ul>
+<li>Create a brand new memtable and log file and direct future updates here
+<li>In the background:
+<ul>
+<li>Write the contents of the previous memtable to an sstable
+<li>Discard the memtable
+<li>Delete the old log file and the old memtable
+<li>Add the new sstable to the young (level-0) level.
+</ul>
+</ul>
+
+<h1>Compactions</h1>
+
+<p>
+When the size of level L exceeds its limit, we compact it in a
+background thread.  The compaction picks a file from level L and all
+overlapping files from the next level L+1.  Note that if a level-L
+file overlaps only part of a level-(L+1) file, the entire file at
+level-(L+1) is used as an input to the compaction and will be
+discarded after the compaction.  Aside: because level-0 is special
+(files in it may overlap each other), we treat compactions from
+level-0 to level-1 specially: a level-0 compaction may pick more than
+one level-0 file in case some of these files overlap each other.
+
+<p>
+A compaction merges the contents of the picked files to produce a
+sequence of level-(L+1) files.  We switch to producing a new
+level-(L+1) file after the current output file has reached the target
+file size (2MB).  We also switch to a new output file when the key
+range of the current output file has grown enough to overlap more then
+ten level-(L+2) files.  This last rule ensures that a later compaction
+of a level-(L+1) file will not pick up too much data from level-(L+2).
+
+<p>
+The old files are discarded and the new files are added to the serving
+state.
+
+<p>
+Compactions for a particular level rotate through the key space.  In
+more detail, for each level L, we remember the ending key of the last
+compaction at level L.  The next compaction for level L will pick the
+first file that starts after this key (wrapping around to the
+beginning of the key space if there is no such file).
+
+<p>
+Compactions drop overwritten values.  They also drop deletion markers
+if there are no higher numbered levels that contain a file whose range
+overlaps the current key.
+
+<h2>Timing</h2>
+
+Level-0 compactions will read up to four 1MB files from level-0, and
+at worst all the level-1 files (10MB).  I.e., we will read 14MB and
+write 14MB.
+
+<p>
+Other than the special level-0 compactions, we will pick one 2MB file
+from level L.  In the worst case, this will overlap ~ 12 files from
+level L+1 (10 because level-(L+1) is ten times the size of level-L,
+and another two at the boundaries since the file ranges at level-L
+will usually not be aligned with the file ranges at level-L+1).  The
+compaction will therefore read 26MB and write 26MB.  Assuming a disk
+IO rate of 100MB/s (ballpark range for modern drives), the worst
+compaction cost will be approximately 0.5 second.
+
+<p>
+If we throttle the background writing to something small, say 10% of
+the full 100MB/s speed, a compaction may take up to 5 seconds.  If the
+user is writing at 10MB/s, we might build up lots of level-0 files
+(~50 to hold the 5*10MB).  This may signficantly increase the cost of
+reads due to the overhead of merging more files together on every
+read.
+
+<p>
+Solution 1: To reduce this problem, we might want to increase the log
+switching threshold when the number of level-0 files is large.  Though
+the downside is that the larger this threshold, the more memory we will
+need to hold the corresponding memtable.
+
+<p>
+Solution 2: We might want to decrease write rate artificially when the
+number of level-0 files goes up.
+
+<p>
+Solution 3: We work on reducing the cost of very wide merges.
+Perhaps most of the level-0 files will have their blocks sitting
+uncompressed in the cache and we will only need to worry about the
+O(N) complexity in the merging iterator.
+
+<h2>Number of files</h2>
+
+Instead of always making 2MB files, we could make larger files for
+larger levels to reduce the total file count, though at the expense of
+more bursty compactions.  Alternatively, we could shard the set of
+files into multiple directories.
+
+<p>
+An experiment on an <code>ext3</code> filesystem on Feb 04, 2011 shows
+the following timings to do 100K file opens in directories with
+varying number of files:
+<table class="datatable">
+<tr><th>Files in directory</th><th>Microseconds to open a file</th></tr>
+<tr><td>1000</td><td>9</td>
+<tr><td>10000</td><td>10</td>
+<tr><td>100000</td><td>16</td>
+</table>
+So maybe even the sharding is not necessary on modern filesystems?
+
+<h1>Recovery</h1>
+
+<ul>
+<li> Read CURRENT to find name of the latest committed MANIFEST
+<li> Read the named MANIFEST file
+<li> Clean up stale files
+<li> We could open all sstables here, but it is probably better to be lazy...
+<li> Convert log chunk to a new level-0 sstable
+<li> Start directing new writes to a new log file with recovered sequence#
+</ul>
+
+<h1>Garbage collection of files</h1>
+
+<code>DeleteObsoleteFiles()</code> is called at the end of every
+compaction and at the end of recovery.  It finds the names of all
+files in the database.  It deletes all log files that are not the
+current log file.  It deletes all table files that are not referenced
+from some level and are not the output of an active compaction.
+
+</body>
+</html>
diff --git a/src/leveldb/doc/impl.md b/src/leveldb/doc/impl.md
deleted file mode 100644
index 4b13f2a6b..000000000
--- a/src/leveldb/doc/impl.md
+++ /dev/null
@@ -1,170 +0,0 @@
-## Files
-
-The implementation of leveldb is similar in spirit to the representation of a
-single [Bigtable tablet (section 5.3)](http://research.google.com/archive/bigtable.html).
-However the organization of the files that make up the representation is
-somewhat different and is explained below.
-
-Each database is represented by a set of files stored in a directory. There are
-several different types of files as documented below:
-
-### Log files
-
-A log file (*.log) stores a sequence of recent updates. Each update is appended
-to the current log file. When the log file reaches a pre-determined size
-(approximately 4MB by default), it is converted to a sorted table (see below)
-and a new log file is created for future updates.
-
-A copy of the current log file is kept in an in-memory structure (the
-`memtable`). This copy is consulted on every read so that read operations
-reflect all logged updates.
-
-## Sorted tables
-
-A sorted table (*.ldb) stores a sequence of entries sorted by key. Each entry is
-either a value for the key, or a deletion marker for the key. (Deletion markers
-are kept around to hide obsolete values present in older sorted tables).
-
-The set of sorted tables are organized into a sequence of levels. The sorted
-table generated from a log file is placed in a special **young** level (also
-called level-0). When the number of young files exceeds a certain threshold
-(currently four), all of the young files are merged together with all of the
-overlapping level-1 files to produce a sequence of new level-1 files (we create
-a new level-1 file for every 2MB of data.)
-
-Files in the young level may contain overlapping keys. However files in other
-levels have distinct non-overlapping key ranges. Consider level number L where
-L >= 1. When the combined size of files in level-L exceeds (10^L) MB (i.e., 10MB
-for level-1, 100MB for level-2, ...), one file in level-L, and all of the
-overlapping files in level-(L+1) are merged to form a set of new files for
-level-(L+1). These merges have the effect of gradually migrating new updates
-from the young level to the largest level using only bulk reads and writes
-(i.e., minimizing expensive seeks).
-
-### Manifest
-
-A MANIFEST file lists the set of sorted tables that make up each level, the
-corresponding key ranges, and other important metadata. A new MANIFEST file
-(with a new number embedded in the file name) is created whenever the database
-is reopened. The MANIFEST file is formatted as a log, and changes made to the
-serving state (as files are added or removed) are appended to this log.
-
-### Current
-
-CURRENT is a simple text file that contains the name of the latest MANIFEST
-file.
-
-### Info logs
-
-Informational messages are printed to files named LOG and LOG.old.
-
-### Others
-
-Other files used for miscellaneous purposes may also be present (LOCK, *.dbtmp).
-
-## Level 0
-
-When the log file grows above a certain size (1MB by default):
-Create a brand new memtable and log file and direct future updates here
-In the background:
-Write the contents of the previous memtable to an sstable
-Discard the memtable
-Delete the old log file and the old memtable
-Add the new sstable to the young (level-0) level.
-
-## Compactions
-
-When the size of level L exceeds its limit, we compact it in a background
-thread. The compaction picks a file from level L and all overlapping files from
-the next level L+1. Note that if a level-L file overlaps only part of a
-level-(L+1) file, the entire file at level-(L+1) is used as an input to the
-compaction and will be discarded after the compaction.  Aside: because level-0
-is special (files in it may overlap each other), we treat compactions from
-level-0 to level-1 specially: a level-0 compaction may pick more than one
-level-0 file in case some of these files overlap each other.
-
-A compaction merges the contents of the picked files to produce a sequence of
-level-(L+1) files. We switch to producing a new level-(L+1) file after the
-current output file has reached the target file size (2MB). We also switch to a
-new output file when the key range of the current output file has grown enough
-to overlap more than ten level-(L+2) files.  This last rule ensures that a later
-compaction of a level-(L+1) file will not pick up too much data from
-level-(L+2).
-
-The old files are discarded and the new files are added to the serving state.
-
-Compactions for a particular level rotate through the key space. In more detail,
-for each level L, we remember the ending key of the last compaction at level L.
-The next compaction for level L will pick the first file that starts after this
-key (wrapping around to the beginning of the key space if there is no such
-file).
-
-Compactions drop overwritten values. They also drop deletion markers if there
-are no higher numbered levels that contain a file whose range overlaps the
-current key.
-
-### Timing
-
-Level-0 compactions will read up to four 1MB files from level-0, and at worst
-all the level-1 files (10MB). I.e., we will read 14MB and write 14MB.
-
-Other than the special level-0 compactions, we will pick one 2MB file from level
-L. In the worst case, this will overlap ~ 12 files from level L+1 (10 because
-level-(L+1) is ten times the size of level-L, and another two at the boundaries
-since the file ranges at level-L will usually not be aligned with the file
-ranges at level-L+1). The compaction will therefore read 26MB and write 26MB.
-Assuming a disk IO rate of 100MB/s (ballpark range for modern drives), the worst
-compaction cost will be approximately 0.5 second.
-
-If we throttle the background writing to something small, say 10% of the full
-100MB/s speed, a compaction may take up to 5 seconds. If the user is writing at
-10MB/s, we might build up lots of level-0 files (~50 to hold the 5*10MB). This
-may significantly increase the cost of reads due to the overhead of merging more
-files together on every read.
-
-Solution 1: To reduce this problem, we might want to increase the log switching
-threshold when the number of level-0 files is large. Though the downside is that
-the larger this threshold, the more memory we will need to hold the
-corresponding memtable.
-
-Solution 2: We might want to decrease write rate artificially when the number of
-level-0 files goes up.
-
-Solution 3: We work on reducing the cost of very wide merges. Perhaps most of
-the level-0 files will have their blocks sitting uncompressed in the cache and
-we will only need to worry about the O(N) complexity in the merging iterator.
-
-### Number of files
-
-Instead of always making 2MB files, we could make larger files for larger levels
-to reduce the total file count, though at the expense of more bursty
-compactions.  Alternatively, we could shard the set of files into multiple
-directories.
-
-An experiment on an ext3 filesystem on Feb 04, 2011 shows the following timings
-to do 100K file opens in directories with varying number of files:
-
-
-| Files in directory | Microseconds to open a file |
-|-------------------:|----------------------------:|
-|               1000 |                           9 |
-|              10000 |                          10 |
-|             100000 |                          16 |
-
-So maybe even the sharding is not necessary on modern filesystems?
-
-## Recovery
-
-* Read CURRENT to find name of the latest committed MANIFEST
-* Read the named MANIFEST file
-* Clean up stale files
-* We could open all sstables here, but it is probably better to be lazy...
-* Convert log chunk to a new level-0 sstable
-* Start directing new writes to a new log file with recovered sequence#
-
-## Garbage collection of files
-
-`DeleteObsoleteFiles()` is called at the end of every compaction and at the end
-of recovery. It finds the names of all files in the database. It deletes all log
-files that are not the current log file. It deletes all table files that are not
-referenced from some level and are not the output of an active compaction.
diff --git a/src/leveldb/doc/index.html b/src/leveldb/doc/index.html
new file mode 100644
index 000000000..521d2baf4
--- /dev/null
+++ b/src/leveldb/doc/index.html
@@ -0,0 +1,549 @@
+<!DOCTYPE html>
+<html>
+<head>
+<link rel="stylesheet" type="text/css" href="doc.css" />
+<title>Leveldb</title>
+</head>
+
+<body>
+<h1>Leveldb</h1>
+<address>Jeff Dean, Sanjay Ghemawat</address>
+<p>
+The <code>leveldb</code> library provides a persistent key value store.  Keys and
+values are arbitrary byte arrays.  The keys are ordered within the key
+value store according to a user-specified comparator function.
+
+<p>
+<h1>Opening A Database</h1>
+<p>
+A <code>leveldb</code> database has a name which corresponds to a file system
+directory.  All of the contents of database are stored in this
+directory.  The following example shows how to open a database,
+creating it if necessary:
+<p>
+<pre>
+  #include &lt;assert&gt;
+  #include "leveldb/db.h"
+
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
+  assert(status.ok());
+  ...
+</pre>
+If you want to raise an error if the database already exists, add
+the following line before the <code>leveldb::DB::Open</code> call:
+<pre>
+  options.error_if_exists = true;
+</pre>
+<h1>Status</h1>
+<p>
+You may have noticed the <code>leveldb::Status</code> type above.  Values of this
+type are returned by most functions in <code>leveldb</code> that may encounter an
+error.  You can check if such a result is ok, and also print an
+associated error message:
+<p>
+<pre>
+   leveldb::Status s = ...;
+   if (!s.ok()) cerr &lt;&lt; s.ToString() &lt;&lt; endl;
+</pre>
+<h1>Closing A Database</h1>
+<p>
+When you are done with a database, just delete the database object.
+Example:
+<p>
+<pre>
+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+</pre>
+<h1>Reads And Writes</h1>
+<p>
+The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
+modify/query the database.  For example, the following code
+moves the value stored under key1 to key2.
+<pre>
+  std::string value;
+  leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
+  if (s.ok()) s = db-&gt;Put(leveldb::WriteOptions(), key2, value);
+  if (s.ok()) s = db-&gt;Delete(leveldb::WriteOptions(), key1);
+</pre>
+
+<h1>Atomic Updates</h1>
+<p>
+Note that if the process dies after the Put of key2 but before the
+delete of key1, the same value may be left stored under multiple keys.
+Such problems can be avoided by using the <code>WriteBatch</code> class to
+atomically apply a set of updates:
+<p>
+<pre>
+  #include "leveldb/write_batch.h"
+  ...
+  std::string value;
+  leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
+  if (s.ok()) {
+    leveldb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db-&gt;Write(leveldb::WriteOptions(), &amp;batch);
+  }
+</pre>
+The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
+and these edits within the batch are applied in order.  Note that we
+called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
+we do not end up erroneously dropping the value entirely.
+<p>
+Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
+speed up bulk updates by placing lots of individual mutations into the
+same batch.
+
+<h1>Synchronous Writes</h1>
+By default, each write to <code>leveldb</code> is asynchronous: it
+returns after pushing the write from the process into the operating
+system.  The transfer from operating system memory to the underlying
+persistent storage happens asynchronously.  The <code>sync</code> flag
+can be turned on for a particular write to make the write operation
+not return until the data being written has been pushed all the way to
+persistent storage.  (On Posix systems, this is implemented by calling
+either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
+<code>msync(..., MS_SYNC)</code> before the write operation returns.)
+<pre>
+  leveldb::WriteOptions write_options;
+  write_options.sync = true;
+  db-&gt;Put(write_options, ...);
+</pre>
+Asynchronous writes are often more than a thousand times as fast as
+synchronous writes.  The downside of asynchronous writes is that a
+crash of the machine may cause the last few updates to be lost.  Note
+that a crash of just the writing process (i.e., not a reboot) will not
+cause any loss since even when <code>sync</code> is false, an update
+is pushed from the process memory into the operating system before it
+is considered done.
+
+<p>
+Asynchronous writes can often be used safely.  For example, when
+loading a large amount of data into the database you can handle lost
+updates by restarting the bulk load after a crash.  A hybrid scheme is
+also possible where every Nth write is synchronous, and in the event
+of a crash, the bulk load is restarted just after the last synchronous
+write finished by the previous run.  (The synchronous write can update
+a marker that describes where to restart on a crash.)
+
+<p>
+<code>WriteBatch</code> provides an alternative to asynchronous writes.
+Multiple updates may be placed in the same <code>WriteBatch</code> and
+applied together using a synchronous write (i.e.,
+<code>write_options.sync</code> is set to true).  The extra cost of
+the synchronous write will be amortized across all of the writes in
+the batch.
+
+<p>
+<h1>Concurrency</h1>
+<p>
+A database may only be opened by one process at a time.
+The <code>leveldb</code> implementation acquires a lock from the
+operating system to prevent misuse.  Within a single process, the
+same <code>leveldb::DB</code> object may be safely shared by multiple
+concurrent threads.  I.e., different threads may write into or fetch
+iterators or call <code>Get</code> on the same database without any
+external synchronization (the leveldb implementation will
+automatically do the required synchronization).  However other objects
+(like Iterator and WriteBatch) may require external synchronization.
+If two threads share such an object, they must protect access to it
+using their own locking protocol.  More details are available in
+the public header files.
+<p>
+<h1>Iteration</h1>
+<p>
+The following example demonstrates how to print all key,value pairs
+in a database.
+<p>
+<pre>
+  leveldb::Iterator* it = db-&gt;NewIterator(leveldb::ReadOptions());
+  for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
+    cout &lt;&lt; it-&gt;key().ToString() &lt;&lt; ": "  &lt;&lt; it-&gt;value().ToString() &lt;&lt; endl;
+  }
+  assert(it-&gt;status().ok());  // Check for any errors found during the scan
+  delete it;
+</pre>
+The following variation shows how to process just the keys in the
+range <code>[start,limit)</code>:
+<p>
+<pre>
+  for (it-&gt;Seek(start);
+       it-&gt;Valid() &amp;&amp; it-&gt;key().ToString() &lt; limit;
+       it-&gt;Next()) {
+    ...
+  }
+</pre>
+You can also process entries in reverse order.  (Caveat: reverse
+iteration may be somewhat slower than forward iteration.)
+<p>
+<pre>
+  for (it-&gt;SeekToLast(); it-&gt;Valid(); it-&gt;Prev()) {
+    ...
+  }
+</pre>
+<h1>Snapshots</h1>
+<p>
+Snapshots provide consistent read-only views over the entire state of
+the key-value store.  <code>ReadOptions::snapshot</code> may be non-NULL to indicate
+that a read should operate on a particular version of the DB state.
+If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
+implicit snapshot of the current state.
+<p>
+Snapshots are created by the DB::GetSnapshot() method:
+<p>
+<pre>
+  leveldb::ReadOptions options;
+  options.snapshot = db-&gt;GetSnapshot();
+  ... apply some updates to db ...
+  leveldb::Iterator* iter = db-&gt;NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db-&gt;ReleaseSnapshot(options.snapshot);
+</pre>
+Note that when a snapshot is no longer needed, it should be released
+using the DB::ReleaseSnapshot interface.  This allows the
+implementation to get rid of state that was being maintained just to
+support reading as of that snapshot.
+<h1>Slice</h1>
+<p>
+The return value of the <code>it->key()</code> and <code>it->value()</code> calls above
+are instances of the <code>leveldb::Slice</code> type.  <code>Slice</code> is a simple
+structure that contains a length and a pointer to an external byte
+array.  Returning a <code>Slice</code> is a cheaper alternative to returning a
+<code>std::string</code> since we do not need to copy potentially large keys and
+values.  In addition, <code>leveldb</code> methods do not return null-terminated
+C-style strings since <code>leveldb</code> keys and values are allowed to
+contain '\0' bytes.
+<p>
+C++ strings and null-terminated C-style strings can be easily converted
+to a Slice:
+<p>
+<pre>
+   leveldb::Slice s1 = "hello";
+
+   std::string str("world");
+   leveldb::Slice s2 = str;
+</pre>
+A Slice can be easily converted back to a C++ string:
+<pre>
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+</pre>
+Be careful when using Slices since it is up to the caller to ensure that
+the external byte array into which the Slice points remains live while
+the Slice is in use.  For example, the following is buggy:
+<p>
+<pre>
+   leveldb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+</pre>
+When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
+backing storage for <code>slice</code> will disappear.
+<p>
+<h1>Comparators</h1>
+<p>
+The preceding examples used the default ordering function for key,
+which orders bytes lexicographically.  You can however supply a custom
+comparator when opening a database.  For example, suppose each
+database key consists of two numbers and we should sort by the first
+number, breaking ties by the second number.  First, define a proper
+subclass of <code>leveldb::Comparator</code> that expresses these rules:
+<p>
+<pre>
+  class TwoPartComparator : public leveldb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a &lt; b: negative result
+    //   if a &gt; b: positive result
+    //   else: zero result
+    int Compare(const leveldb::Slice&amp; a, const leveldb::Slice&amp; b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &amp;a1, &amp;a2);
+      ParseKey(b, &amp;b1, &amp;b2);
+      if (a1 &lt; b1) return -1;
+      if (a1 &gt; b1) return +1;
+      if (a2 &lt; b2) return -1;
+      if (a2 &gt; b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() const { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const leveldb::Slice&amp;) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+</pre>
+Now create a database using this custom comparator:
+<p>
+<pre>
+  TwoPartComparator cmp;
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &amp;cmp;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
+  ...
+</pre>
+<h2>Backwards compatibility</h2>
+<p>
+The result of the comparator's <code>Name</code> method is attached to the
+database when it is created, and is checked on every subsequent
+database open.  If the name changes, the <code>leveldb::DB::Open</code> call will
+fail.  Therefore, change the name if and only if the new key format
+and comparison function are incompatible with existing databases, and
+it is ok to discard the contents of all existing databases.
+<p>
+You can however still gradually evolve your key format over time with
+a little bit of pre-planning.  For example, you could store a version
+number at the end of each key (one byte should suffice for most uses).
+When you wish to switch to a new key format (e.g., adding an optional
+third part to the keys processed by <code>TwoPartComparator</code>),
+(a) keep the same comparator name (b) increment the version number
+for new keys (c) change the comparator function so it uses the
+version numbers found in the keys to decide how to interpret them.
+<p>
+<h1>Performance</h1>
+<p>
+Performance can be tuned by changing the default values of the
+types defined in <code>include/leveldb/options.h</code>.
+
+<p>
+<h2>Block size</h2>
+<p>
+<code>leveldb</code> groups adjacent keys together into the same block and such a
+block is the unit of transfer to and from persistent storage.  The
+default block size is approximately 4096 uncompressed bytes.
+Applications that mostly do bulk scans over the contents of the
+database may wish to increase this size.  Applications that do a lot
+of point reads of small values may wish to switch to a smaller block
+size if performance measurements indicate an improvement.  There isn't
+much benefit in using blocks smaller than one kilobyte, or larger than
+a few megabytes.  Also note that compression will be more effective
+with larger block sizes.
+<p>
+<h2>Compression</h2>
+<p>
+Each block is individually compressed before being written to
+persistent storage.  Compression is on by default since the default
+compression method is very fast, and is automatically disabled for
+uncompressible data.  In rare cases, applications may want to disable
+compression entirely, but should only do so if benchmarks show a
+performance improvement:
+<p>
+<pre>
+  leveldb::Options options;
+  options.compression = leveldb::kNoCompression;
+  ... leveldb::DB::Open(options, name, ...) ....
+</pre>
+<h2>Cache</h2>
+<p>
+The contents of the database are stored in a set of files in the
+filesystem and each file stores a sequence of compressed blocks.  If
+<code>options.cache</code> is non-NULL, it is used to cache frequently used
+uncompressed block contents.
+<p>
+<pre>
+  #include "leveldb/cache.h"
+
+  leveldb::Options options;
+  options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
+  leveldb::DB* db;
+  leveldb::DB::Open(options, name, &db);
+  ... use the db ...
+  delete db
+  delete options.cache;
+</pre>
+Note that the cache holds uncompressed data, and therefore it should
+be sized according to application level data sizes, without any
+reduction from compression.  (Caching of compressed blocks is left to
+the operating system buffer cache, or any custom <code>Env</code>
+implementation provided by the client.)
+<p>
+When performing a bulk read, the application may wish to disable
+caching so that the data processed by the bulk read does not end up
+displacing most of the cached contents.  A per-iterator option can be
+used to achieve this:
+<p>
+<pre>
+  leveldb::ReadOptions options;
+  options.fill_cache = false;
+  leveldb::Iterator* it = db-&gt;NewIterator(options);
+  for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
+    ...
+  }
+</pre>
+<h2>Key Layout</h2>
+<p>
+Note that the unit of disk transfer and caching is a block.  Adjacent
+keys (according to the database sort order) will usually be placed in
+the same block.  Therefore the application can improve its performance
+by placing keys that are accessed together near each other and placing
+infrequently used keys in a separate region of the key space.
+<p>
+For example, suppose we are implementing a simple file system on top
+of <code>leveldb</code>.  The types of entries we might wish to store are:
+<p>
+<pre>
+   filename -&gt; permission-bits, length, list of file_block_ids
+   file_block_id -&gt; data
+</pre>
+We might want to prefix <code>filename</code> keys with one letter (say '/') and the
+<code>file_block_id</code> keys with a different letter (say '0') so that scans
+over just the metadata do not force us to fetch and cache bulky file
+contents.
+<p>
+<h2>Filters</h2>
+<p>
+Because of the way <code>leveldb</code> data is organized on disk,
+a single <code>Get()</code> call may involve multiple reads from disk.
+The optional <code>FilterPolicy</code> mechanism can be used to reduce
+the number of disk reads substantially.
+<pre>
+   leveldb::Options options;
+   options.filter_policy = NewBloomFilter(10);
+   leveldb::DB* db;
+   leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
+   ... use the database ...
+   delete db;
+   delete options.filter_policy;
+</pre>
+The preceding code associates a
+<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
+based filtering policy with the database.  Bloom filter based
+filtering relies on keeping some number of bits of data in memory per
+key (in this case 10 bits per key since that is the argument we passed
+to NewBloomFilter).  This filter will reduce the number of unnecessary
+disk reads needed for <code>Get()</code> calls by a factor of
+approximately a 100.  Increasing the bits per key will lead to a
+larger reduction at the cost of more memory usage.  We recommend that
+applications whose working set does not fit in memory and that do a
+lot of random reads set a filter policy.
+<p>
+If you are using a custom comparator, you should ensure that the filter
+policy you are using is compatible with your comparator.  For example,
+consider a comparator that ignores trailing spaces when comparing keys.
+<code>NewBloomFilter</code> must not be used with such a comparator.
+Instead, the application should provide a custom filter policy that
+also ignores trailing spaces.  For example:
+<pre>
+  class CustomFilterPolicy : public leveldb::FilterPolicy {
+   private:
+    FilterPolicy* builtin_policy_;
+   public:
+    CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
+    ~CustomFilterPolicy() { delete builtin_policy_; }
+
+    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
+
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      std::vector&lt;Slice&gt; trimmed(n);
+      for (int i = 0; i &lt; n; i++) {
+        trimmed[i] = RemoveTrailingSpaces(keys[i]);
+      }
+      return builtin_policy_-&gt;CreateFilter(&amp;trimmed[i], n, dst);
+    }
+
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      return builtin_policy_-&gt;KeyMayMatch(RemoveTrailingSpaces(key), filter);
+    }
+  };
+</pre>
+<p>
+Advanced applications may provide a filter policy that does not use
+a bloom filter but uses some other mechanism for summarizing a set
+of keys.  See <code>leveldb/filter_policy.h</code> for detail.
+<p>
+<h1>Checksums</h1>
+<p>
+<code>leveldb</code> associates checksums with all data it stores in the file system.
+There are two separate controls provided over how aggressively these
+checksums are verified:
+<p>
+<ul>
+<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
+  checksum verification of all data that is read from the file system on
+  behalf of a particular read.  By default, no such verification is
+  done.
+<p>
+<li> <code>Options::paranoid_checks</code> may be set to true before opening a
+  database to make the database implementation raise an error as soon as
+  it detects an internal corruption.  Depending on which portion of the
+  database has been corrupted, the error may be raised when the database
+  is opened, or later by another database operation.  By default,
+  paranoid checking is off so that the database can be used even if
+  parts of its persistent storage have been corrupted.
+<p>
+  If a database is corrupted (perhaps it cannot be opened when
+  paranoid checking is turned on), the <code>leveldb::RepairDB</code> function
+  may be used to recover as much of the data as possible
+<p>
+</ul>
+<h1>Approximate Sizes</h1>
+<p>
+The <code>GetApproximateSizes</code> method can used to get the approximate
+number of bytes of file system space used by one or more key ranges.
+<p>
+<pre>
+   leveldb::Range ranges[2];
+   ranges[0] = leveldb::Range("a", "c");
+   ranges[1] = leveldb::Range("x", "z");
+   uint64_t sizes[2];
+   leveldb::Status s = db-&gt;GetApproximateSizes(ranges, 2, sizes);
+</pre>
+The preceding call will set <code>sizes[0]</code> to the approximate number of
+bytes of file system space used by the key range <code>[a..c)</code> and
+<code>sizes[1]</code> to the approximate number of bytes used by the key range
+<code>[x..z)</code>.
+<p>
+<h1>Environment</h1>
+<p>
+All file operations (and other operating system calls) issued by the
+<code>leveldb</code> implementation are routed through a <code>leveldb::Env</code> object.
+Sophisticated clients may wish to provide their own <code>Env</code>
+implementation to get better control.  For example, an application may
+introduce artificial delays in the file IO paths to limit the impact
+of <code>leveldb</code> on other activities in the system.
+<p>
+<pre>
+  class SlowEnv : public leveldb::Env {
+    .. implementation of the Env interface ...
+  };
+
+  SlowEnv env;
+  leveldb::Options options;
+  options.env = &amp;env;
+  Status s = leveldb::DB::Open(options, ...);
+</pre>
+<h1>Porting</h1>
+<p>
+<code>leveldb</code> may be ported to a new platform by providing platform
+specific implementations of the types/methods/functions exported by
+<code>leveldb/port/port.h</code>.  See <code>leveldb/port/port_example.h</code> for more
+details.
+<p>
+In addition, the new platform may need a new default <code>leveldb::Env</code>
+implementation.  See <code>leveldb/util/env_posix.h</code> for an example.
+
+<h1>Other Information</h1>
+
+<p>
+Details about the <code>leveldb</code> implementation may be found in
+the following documents:
+<ul>
+<li> <a href="impl.html">Implementation notes</a>
+<li> <a href="table_format.txt">Format of an immutable Table file</a>
+<li> <a href="log_format.txt">Format of a log file</a>
+</ul>
+
+</body>
+</html>
diff --git a/src/leveldb/doc/index.md b/src/leveldb/doc/index.md
deleted file mode 100644
index be8569692..000000000
--- a/src/leveldb/doc/index.md
+++ /dev/null
@@ -1,523 +0,0 @@
-leveldb
-=======
-
-_Jeff Dean, Sanjay Ghemawat_
-
-The leveldb library provides a persistent key value store. Keys and values are
-arbitrary byte arrays.  The keys are ordered within the key value store
-according to a user-specified comparator function.
-
-## Opening A Database
-
-A leveldb database has a name which corresponds to a file system directory. All
-of the contents of database are stored in this directory. The following example
-shows how to open a database, creating it if necessary:
-
-```c++
-#include <cassert>
-#include "leveldb/db.h"
-
-leveldb::DB* db;
-leveldb::Options options;
-options.create_if_missing = true;
-leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
-assert(status.ok());
-...
-```
-
-If you want to raise an error if the database already exists, add the following
-line before the `leveldb::DB::Open` call:
-
-```c++
-options.error_if_exists = true;
-```
-
-## Status
-
-You may have noticed the `leveldb::Status` type above. Values of this type are
-returned by most functions in leveldb that may encounter an error. You can check
-if such a result is ok, and also print an associated error message:
-
-```c++
-leveldb::Status s = ...;
-if (!s.ok()) cerr << s.ToString() << endl;
-```
-
-## Closing A Database
-
-When you are done with a database, just delete the database object. Example:
-
-```c++
-... open the db as described above ...
-... do something with db ...
-delete db;
-```
-
-## Reads And Writes
-
-The database provides Put, Delete, and Get methods to modify/query the database.
-For example, the following code moves the value stored under key1 to key2.
-
-```c++
-std::string value;
-leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
-if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
-if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
-```
-
-## Atomic Updates
-
-Note that if the process dies after the Put of key2 but before the delete of
-key1, the same value may be left stored under multiple keys. Such problems can
-be avoided by using the `WriteBatch` class to atomically apply a set of updates:
-
-```c++
-#include "leveldb/write_batch.h"
-...
-std::string value;
-leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
-if (s.ok()) {
-  leveldb::WriteBatch batch;
-  batch.Delete(key1);
-  batch.Put(key2, value);
-  s = db->Write(leveldb::WriteOptions(), &batch);
-}
-```
-
-The `WriteBatch` holds a sequence of edits to be made to the database, and these
-edits within the batch are applied in order. Note that we called Delete before
-Put so that if key1 is identical to key2, we do not end up erroneously dropping
-the value entirely.
-
-Apart from its atomicity benefits, `WriteBatch` may also be used to speed up
-bulk updates by placing lots of individual mutations into the same batch.
-
-## Synchronous Writes
-
-By default, each write to leveldb is asynchronous: it returns after pushing the
-write from the process into the operating system. The transfer from operating
-system memory to the underlying persistent storage happens asynchronously. The
-sync flag can be turned on for a particular write to make the write operation
-not return until the data being written has been pushed all the way to
-persistent storage. (On Posix systems, this is implemented by calling either
-`fsync(...)` or `fdatasync(...)` or `msync(..., MS_SYNC)` before the write
-operation returns.)
-
-```c++
-leveldb::WriteOptions write_options;
-write_options.sync = true;
-db->Put(write_options, ...);
-```
-
-Asynchronous writes are often more than a thousand times as fast as synchronous
-writes. The downside of asynchronous writes is that a crash of the machine may
-cause the last few updates to be lost. Note that a crash of just the writing
-process (i.e., not a reboot) will not cause any loss since even when sync is
-false, an update is pushed from the process memory into the operating system
-before it is considered done.
-
-Asynchronous writes can often be used safely. For example, when loading a large
-amount of data into the database you can handle lost updates by restarting the
-bulk load after a crash. A hybrid scheme is also possible where every Nth write
-is synchronous, and in the event of a crash, the bulk load is restarted just
-after the last synchronous write finished by the previous run. (The synchronous
-write can update a marker that describes where to restart on a crash.)
-
-`WriteBatch` provides an alternative to asynchronous writes. Multiple updates
-may be placed in the same WriteBatch and applied together using a synchronous
-write (i.e., `write_options.sync` is set to true). The extra cost of the
-synchronous write will be amortized across all of the writes in the batch.
-
-## Concurrency
-
-A database may only be opened by one process at a time. The leveldb
-implementation acquires a lock from the operating system to prevent misuse.
-Within a single process, the same `leveldb::DB` object may be safely shared by
-multiple concurrent threads. I.e., different threads may write into or fetch
-iterators or call Get on the same database without any external synchronization
-(the leveldb implementation will automatically do the required synchronization).
-However other objects (like Iterator and `WriteBatch`) may require external
-synchronization. If two threads share such an object, they must protect access
-to it using their own locking protocol. More details are available in the public
-header files.
-
-## Iteration
-
-The following example demonstrates how to print all key,value pairs in a
-database.
-
-```c++
-leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
-for (it->SeekToFirst(); it->Valid(); it->Next()) {
-  cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
-}
-assert(it->status().ok());  // Check for any errors found during the scan
-delete it;
-```
-
-The following variation shows how to process just the keys in the range
-[start,limit):
-
-```c++
-for (it->Seek(start);
-   it->Valid() && it->key().ToString() < limit;
-   it->Next()) {
-  ...
-}
-```
-
-You can also process entries in reverse order. (Caveat: reverse iteration may be
-somewhat slower than forward iteration.)
-
-```c++
-for (it->SeekToLast(); it->Valid(); it->Prev()) {
-  ...
-}
-```
-
-## Snapshots
-
-Snapshots provide consistent read-only views over the entire state of the
-key-value store.  `ReadOptions::snapshot` may be non-NULL to indicate that a
-read should operate on a particular version of the DB state. If
-`ReadOptions::snapshot` is NULL, the read will operate on an implicit snapshot
-of the current state.
-
-Snapshots are created by the `DB::GetSnapshot()` method:
-
-```c++
-leveldb::ReadOptions options;
-options.snapshot = db->GetSnapshot();
-... apply some updates to db ...
-leveldb::Iterator* iter = db->NewIterator(options);
-... read using iter to view the state when the snapshot was created ...
-delete iter;
-db->ReleaseSnapshot(options.snapshot);
-```
-
-Note that when a snapshot is no longer needed, it should be released using the
-`DB::ReleaseSnapshot` interface. This allows the implementation to get rid of
-state that was being maintained just to support reading as of that snapshot.
-
-## Slice
-
-The return value of the `it->key()` and `it->value()` calls above are instances
-of the `leveldb::Slice` type. Slice is a simple structure that contains a length
-and a pointer to an external byte array. Returning a Slice is a cheaper
-alternative to returning a `std::string` since we do not need to copy
-potentially large keys and values. In addition, leveldb methods do not return
-null-terminated C-style strings since leveldb keys and values are allowed to
-contain `'\0'` bytes.
-
-C++ strings and null-terminated C-style strings can be easily converted to a
-Slice:
-
-```c++
-leveldb::Slice s1 = "hello";
-
-std::string str("world");
-leveldb::Slice s2 = str;
-```
-
-A Slice can be easily converted back to a C++ string:
-
-```c++
-std::string str = s1.ToString();
-assert(str == std::string("hello"));
-```
-
-Be careful when using Slices since it is up to the caller to ensure that the
-external byte array into which the Slice points remains live while the Slice is
-in use. For example, the following is buggy:
-
-```c++
-leveldb::Slice slice;
-if (...) {
-  std::string str = ...;
-  slice = str;
-}
-Use(slice);
-```
-
-When the if statement goes out of scope, str will be destroyed and the backing
-storage for slice will disappear.
-
-## Comparators
-
-The preceding examples used the default ordering function for key, which orders
-bytes lexicographically. You can however supply a custom comparator when opening
-a database.  For example, suppose each database key consists of two numbers and
-we should sort by the first number, breaking ties by the second number. First,
-define a proper subclass of `leveldb::Comparator` that expresses these rules:
-
-```c++
-class TwoPartComparator : public leveldb::Comparator {
- public:
-  // Three-way comparison function:
-  //   if a < b: negative result
-  //   if a > b: positive result
-  //   else: zero result
-  int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
-    int a1, a2, b1, b2;
-    ParseKey(a, &a1, &a2);
-    ParseKey(b, &b1, &b2);
-    if (a1 < b1) return -1;
-    if (a1 > b1) return +1;
-    if (a2 < b2) return -1;
-    if (a2 > b2) return +1;
-    return 0;
-  }
-
-  // Ignore the following methods for now:
-  const char* Name() const { return "TwoPartComparator"; }
-  void FindShortestSeparator(std::string*, const leveldb::Slice&) const {}
-  void FindShortSuccessor(std::string*) const {}
-};
-```
-
-Now create a database using this custom comparator:
-
-```c++
-TwoPartComparator cmp;
-leveldb::DB* db;
-leveldb::Options options;
-options.create_if_missing = true;
-options.comparator = &cmp;
-leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
-...
-```
-
-### Backwards compatibility
-
-The result of the comparator's Name method is attached to the database when it
-is created, and is checked on every subsequent database open. If the name
-changes, the `leveldb::DB::Open` call will fail. Therefore, change the name if
-and only if the new key format and comparison function are incompatible with
-existing databases, and it is ok to discard the contents of all existing
-databases.
-
-You can however still gradually evolve your key format over time with a little
-bit of pre-planning. For example, you could store a version number at the end of
-each key (one byte should suffice for most uses). When you wish to switch to a
-new key format (e.g., adding an optional third part to the keys processed by
-`TwoPartComparator`), (a) keep the same comparator name (b) increment the
-version number for new keys (c) change the comparator function so it uses the
-version numbers found in the keys to decide how to interpret them.
-
-## Performance
-
-Performance can be tuned by changing the default values of the types defined in
-`include/leveldb/options.h`.
-
-### Block size
-
-leveldb groups adjacent keys together into the same block and such a block is
-the unit of transfer to and from persistent storage. The default block size is
-approximately 4096 uncompressed bytes.  Applications that mostly do bulk scans
-over the contents of the database may wish to increase this size. Applications
-that do a lot of point reads of small values may wish to switch to a smaller
-block size if performance measurements indicate an improvement. There isn't much
-benefit in using blocks smaller than one kilobyte, or larger than a few
-megabytes. Also note that compression will be more effective with larger block
-sizes.
-
-### Compression
-
-Each block is individually compressed before being written to persistent
-storage. Compression is on by default since the default compression method is
-very fast, and is automatically disabled for uncompressible data. In rare cases,
-applications may want to disable compression entirely, but should only do so if
-benchmarks show a performance improvement:
-
-```c++
-leveldb::Options options;
-options.compression = leveldb::kNoCompression;
-... leveldb::DB::Open(options, name, ...) ....
-```
-
-### Cache
-
-The contents of the database are stored in a set of files in the filesystem and
-each file stores a sequence of compressed blocks. If options.cache is non-NULL,
-it is used to cache frequently used uncompressed block contents.
-
-```c++
-#include "leveldb/cache.h"
-
-leveldb::Options options;
-options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
-leveldb::DB* db;
-leveldb::DB::Open(options, name, &db);
-... use the db ...
-delete db
-delete options.cache;
-```
-
-Note that the cache holds uncompressed data, and therefore it should be sized
-according to application level data sizes, without any reduction from
-compression. (Caching of compressed blocks is left to the operating system
-buffer cache, or any custom Env implementation provided by the client.)
-
-When performing a bulk read, the application may wish to disable caching so that
-the data processed by the bulk read does not end up displacing most of the
-cached contents. A per-iterator option can be used to achieve this:
-
-```c++
-leveldb::ReadOptions options;
-options.fill_cache = false;
-leveldb::Iterator* it = db->NewIterator(options);
-for (it->SeekToFirst(); it->Valid(); it->Next()) {
-  ...
-}
-```
-
-### Key Layout
-
-Note that the unit of disk transfer and caching is a block. Adjacent keys
-(according to the database sort order) will usually be placed in the same block.
-Therefore the application can improve its performance by placing keys that are
-accessed together near each other and placing infrequently used keys in a
-separate region of the key space.
-
-For example, suppose we are implementing a simple file system on top of leveldb.
-The types of entries we might wish to store are:
-
-    filename -> permission-bits, length, list of file_block_ids
-    file_block_id -> data
-
-We might want to prefix filename keys with one letter (say '/') and the
-`file_block_id` keys with a different letter (say '0') so that scans over just
-the metadata do not force us to fetch and cache bulky file contents.
-
-### Filters
-
-Because of the way leveldb data is organized on disk, a single `Get()` call may
-involve multiple reads from disk. The optional FilterPolicy mechanism can be
-used to reduce the number of disk reads substantially.
-
-```c++
-leveldb::Options options;
-options.filter_policy = NewBloomFilterPolicy(10);
-leveldb::DB* db;
-leveldb::DB::Open(options, "/tmp/testdb", &db);
-... use the database ...
-delete db;
-delete options.filter_policy;
-```
-
-The preceding code associates a Bloom filter based filtering policy with the
-database.  Bloom filter based filtering relies on keeping some number of bits of
-data in memory per key (in this case 10 bits per key since that is the argument
-we passed to `NewBloomFilterPolicy`). This filter will reduce the number of
-unnecessary disk reads needed for Get() calls by a factor of approximately
-a 100. Increasing the bits per key will lead to a larger reduction at the cost
-of more memory usage. We recommend that applications whose working set does not
-fit in memory and that do a lot of random reads set a filter policy.
-
-If you are using a custom comparator, you should ensure that the filter policy
-you are using is compatible with your comparator. For example, consider a
-comparator that ignores trailing spaces when comparing keys.
-`NewBloomFilterPolicy` must not be used with such a comparator. Instead, the
-application should provide a custom filter policy that also ignores trailing
-spaces. For example:
-
-```c++
-class CustomFilterPolicy : public leveldb::FilterPolicy {
- private:
-  FilterPolicy* builtin_policy_;
-
- public:
-  CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) {}
-  ~CustomFilterPolicy() { delete builtin_policy_; }
-
-  const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
-
-  void CreateFilter(const Slice* keys, int n, std::string* dst) const {
-    // Use builtin bloom filter code after removing trailing spaces
-    std::vector<Slice> trimmed(n);
-    for (int i = 0; i < n; i++) {
-      trimmed[i] = RemoveTrailingSpaces(keys[i]);
-    }
-    return builtin_policy_->CreateFilter(&trimmed[i], n, dst);
-  }
-};
-```
-
-Advanced applications may provide a filter policy that does not use a bloom
-filter but uses some other mechanism for summarizing a set of keys. See
-`leveldb/filter_policy.h` for detail.
-
-## Checksums
-
-leveldb associates checksums with all data it stores in the file system. There
-are two separate controls provided over how aggressively these checksums are
-verified:
-
-`ReadOptions::verify_checksums` may be set to true to force checksum
-verification of all data that is read from the file system on behalf of a
-particular read.  By default, no such verification is done.
-
-`Options::paranoid_checks` may be set to true before opening a database to make
-the database implementation raise an error as soon as it detects an internal
-corruption. Depending on which portion of the database has been corrupted, the
-error may be raised when the database is opened, or later by another database
-operation. By default, paranoid checking is off so that the database can be used
-even if parts of its persistent storage have been corrupted.
-
-If a database is corrupted (perhaps it cannot be opened when paranoid checking
-is turned on), the `leveldb::RepairDB` function may be used to recover as much
-of the data as possible
-
-## Approximate Sizes
-
-The `GetApproximateSizes` method can used to get the approximate number of bytes
-of file system space used by one or more key ranges.
-
-```c++
-leveldb::Range ranges[2];
-ranges[0] = leveldb::Range("a", "c");
-ranges[1] = leveldb::Range("x", "z");
-uint64_t sizes[2];
-leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
-```
-
-The preceding call will set `sizes[0]` to the approximate number of bytes of
-file system space used by the key range `[a..c)` and `sizes[1]` to the
-approximate number of bytes used by the key range `[x..z)`.
-
-## Environment
-
-All file operations (and other operating system calls) issued by the leveldb
-implementation are routed through a `leveldb::Env` object. Sophisticated clients
-may wish to provide their own Env implementation to get better control.
-For example, an application may introduce artificial delays in the file IO
-paths to limit the impact of leveldb on other activities in the system.
-
-```c++
-class SlowEnv : public leveldb::Env {
-  ... implementation of the Env interface ...
-};
-
-SlowEnv env;
-leveldb::Options options;
-options.env = &env;
-Status s = leveldb::DB::Open(options, ...);
-```
-
-## Porting
-
-leveldb may be ported to a new platform by providing platform specific
-implementations of the types/methods/functions exported by
-`leveldb/port/port.h`.  See `leveldb/port/port_example.h` for more details.
-
-In addition, the new platform may need a new default `leveldb::Env`
-implementation.  See `leveldb/util/env_posix.h` for an example.
-
-## Other Information
-
-Details about the leveldb implementation may be found in the following
-documents:
-
-1. [Implementation notes](impl.md)
-2. [Format of an immutable Table file](table_format.md)
-3. [Format of a log file](log_format.md)
diff --git a/src/leveldb/doc/log_format.md b/src/leveldb/doc/log_format.md
deleted file mode 100644
index f32cb5d7d..000000000
--- a/src/leveldb/doc/log_format.md
+++ /dev/null
@@ -1,75 +0,0 @@
-leveldb Log format
-==================
-The log file contents are a sequence of 32KB blocks.  The only exception is that
-the tail of the file may contain a partial block.
-
-Each block consists of a sequence of records:
-
-    block := record* trailer?
-    record :=
-      checksum: uint32     // crc32c of type and data[] ; little-endian
-      length: uint16       // little-endian
-      type: uint8          // One of FULL, FIRST, MIDDLE, LAST
-      data: uint8[length]
-
-A record never starts within the last six bytes of a block (since it won't fit).
-Any leftover bytes here form the trailer, which must consist entirely of zero
-bytes and must be skipped by readers.
-
-Aside: if exactly seven bytes are left in the current block, and a new non-zero
-length record is added, the writer must emit a FIRST record (which contains zero
-bytes of user data) to fill up the trailing seven bytes of the block and then
-emit all of the user data in subsequent blocks.
-
-More types may be added in the future.  Some Readers may skip record types they
-do not understand, others may report that some data was skipped.
-
-    FULL == 1
-    FIRST == 2
-    MIDDLE == 3
-    LAST == 4
-
-The FULL record contains the contents of an entire user record.
-
-FIRST, MIDDLE, LAST are types used for user records that have been split into
-multiple fragments (typically because of block boundaries).  FIRST is the type
-of the first fragment of a user record, LAST is the type of the last fragment of
-a user record, and MIDDLE is the type of all interior fragments of a user
-record.
-
-Example: consider a sequence of user records:
-
-    A: length 1000
-    B: length 97270
-    C: length 8000
-
-**A** will be stored as a FULL record in the first block.
-
-**B** will be split into three fragments: first fragment occupies the rest of
-the first block, second fragment occupies the entirety of the second block, and
-the third fragment occupies a prefix of the third block.  This will leave six
-bytes free in the third block, which will be left empty as the trailer.
-
-**C** will be stored as a FULL record in the fourth block.
-
-----
-
-## Some benefits over the recordio format:
-
-1. We do not need any heuristics for resyncing - just go to next block boundary
-   and scan.  If there is a corruption, skip to the next block.  As a
-   side-benefit, we do not get confused when part of the contents of one log
-   file are embedded as a record inside another log file.
-
-2. Splitting at approximate boundaries (e.g., for mapreduce) is simple: find the
-   next block boundary and skip records until we hit a FULL or FIRST record.
-
-3. We do not need extra buffering for large records.
-
-## Some downsides compared to recordio format:
-
-1. No packing of tiny records.  This could be fixed by adding a new record type,
-   so it is a shortcoming of the current implementation, not necessarily the
-   format.
-
-2. No compression.  Again, this could be fixed by adding new record types.
diff --git a/src/leveldb/doc/log_format.txt b/src/leveldb/doc/log_format.txt
new file mode 100644
index 000000000..3a0414b65
--- /dev/null
+++ b/src/leveldb/doc/log_format.txt
@@ -0,0 +1,75 @@
+The log file contents are a sequence of 32KB blocks.  The only
+exception is that the tail of the file may contain a partial block.
+
+Each block consists of a sequence of records:
+   block := record* trailer?
+   record :=
+	checksum: uint32	// crc32c of type and data[]
+	length: uint16
+	type: uint8		// One of FULL, FIRST, MIDDLE, LAST
+	data: uint8[length]
+
+A record never starts within the last six bytes of a block (since it
+won't fit).  Any leftover bytes here form the trailer, which must
+consist entirely of zero bytes and must be skipped by readers.  
+
+Aside: if exactly seven bytes are left in the current block, and a new
+non-zero length record is added, the writer must emit a FIRST record
+(which contains zero bytes of user data) to fill up the trailing seven
+bytes of the block and then emit all of the user data in subsequent
+blocks.
+
+More types may be added in the future.  Some Readers may skip record
+types they do not understand, others may report that some data was
+skipped.
+
+FULL == 1
+FIRST == 2
+MIDDLE == 3
+LAST == 4
+
+The FULL record contains the contents of an entire user record.
+
+FIRST, MIDDLE, LAST are types used for user records that have been
+split into multiple fragments (typically because of block boundaries).
+FIRST is the type of the first fragment of a user record, LAST is the
+type of the last fragment of a user record, and MID is the type of all
+interior fragments of a user record.
+
+Example: consider a sequence of user records:
+   A: length 1000
+   B: length 97270
+   C: length 8000
+A will be stored as a FULL record in the first block.
+
+B will be split into three fragments: first fragment occupies the rest
+of the first block, second fragment occupies the entirety of the
+second block, and the third fragment occupies a prefix of the third
+block.  This will leave six bytes free in the third block, which will
+be left empty as the trailer.
+
+C will be stored as a FULL record in the fourth block.
+
+===================
+
+Some benefits over the recordio format:
+
+(1) We do not need any heuristics for resyncing - just go to next
+block boundary and scan.  If there is a corruption, skip to the next
+block.  As a side-benefit, we do not get confused when part of the
+contents of one log file are embedded as a record inside another log
+file.
+
+(2) Splitting at approximate boundaries (e.g., for mapreduce) is
+simple: find the next block boundary and skip records until we
+hit a FULL or FIRST record.
+
+(3) We do not need extra buffering for large records.
+
+Some downsides compared to recordio format:
+
+(1) No packing of tiny records.  This could be fixed by adding a new
+record type, so it is a shortcoming of the current implementation,
+not necessarily the format.
+
+(2) No compression.  Again, this could be fixed by adding new record types.
diff --git a/src/leveldb/doc/table_format.md b/src/leveldb/doc/table_format.md
deleted file mode 100644
index 5fe7e7241..000000000
--- a/src/leveldb/doc/table_format.md
+++ /dev/null
@@ -1,107 +0,0 @@
-leveldb File format
-===================
-
-    <beginning_of_file>
-    [data block 1]
-    [data block 2]
-    ...
-    [data block N]
-    [meta block 1]
-    ...
-    [meta block K]
-    [metaindex block]
-    [index block]
-    [Footer]        (fixed size; starts at file_size - sizeof(Footer))
-    <end_of_file>
-
-The file contains internal pointers.  Each such pointer is called
-a BlockHandle and contains the following information:
-
-    offset:   varint64
-    size:     varint64
-
-See [varints](https://developers.google.com/protocol-buffers/docs/encoding#varints)
-for an explanation of varint64 format.
-
-1.  The sequence of key/value pairs in the file are stored in sorted
-order and partitioned into a sequence of data blocks.  These blocks
-come one after another at the beginning of the file.  Each data block
-is formatted according to the code in `block_builder.cc`, and then
-optionally compressed.
-
-2. After the data blocks we store a bunch of meta blocks.  The
-supported meta block types are described below.  More meta block types
-may be added in the future.  Each meta block is again formatted using
-`block_builder.cc` and then optionally compressed.
-
-3. A "metaindex" block.  It contains one entry for every other meta
-block where the key is the name of the meta block and the value is a
-BlockHandle pointing to that meta block.
-
-4. An "index" block.  This block contains one entry per data block,
-where the key is a string >= last key in that data block and before
-the first key in the successive data block.  The value is the
-BlockHandle for the data block.
-
-5. At the very end of the file is a fixed length footer that contains
-the BlockHandle of the metaindex and index blocks as well as a magic number.
-
-        metaindex_handle: char[p];     // Block handle for metaindex
-        index_handle:     char[q];     // Block handle for index
-        padding:          char[40-p-q];// zeroed bytes to make fixed length
-                                       // (40==2*BlockHandle::kMaxEncodedLength)
-        magic:            fixed64;     // == 0xdb4775248b80fb57 (little-endian)
-
-## "filter" Meta Block
-
-If a `FilterPolicy` was specified when the database was opened, a
-filter block is stored in each table.  The "metaindex" block contains
-an entry that maps from `filter.<N>` to the BlockHandle for the filter
-block where `<N>` is the string returned by the filter policy's
-`Name()` method.
-
-The filter block stores a sequence of filters, where filter i contains
-the output of `FilterPolicy::CreateFilter()` on all keys that are stored
-in a block whose file offset falls within the range
-
-    [ i*base ... (i+1)*base-1 ]
-
-Currently, "base" is 2KB.  So for example, if blocks X and Y start in
-the range `[ 0KB .. 2KB-1 ]`, all of the keys in X and Y will be
-converted to a filter by calling `FilterPolicy::CreateFilter()`, and the
-resulting filter will be stored as the first filter in the filter
-block.
-
-The filter block is formatted as follows:
-
-    [filter 0]
-    [filter 1]
-    [filter 2]
-    ...
-    [filter N-1]
-
-    [offset of filter 0]                  : 4 bytes
-    [offset of filter 1]                  : 4 bytes
-    [offset of filter 2]                  : 4 bytes
-    ...
-    [offset of filter N-1]                : 4 bytes
-
-    [offset of beginning of offset array] : 4 bytes
-    lg(base)                              : 1 byte
-
-The offset array at the end of the filter block allows efficient
-mapping from a data block offset to the corresponding filter.
-
-## "stats" Meta Block
-
-This meta block contains a bunch of stats.  The key is the name
-of the statistic.  The value contains the statistic.
-
-TODO(postrelease): record following stats.
-
-    data size
-    index size
-    key size (uncompressed)
-    value size (uncompressed)
-    number of entries
-    number of data blocks
diff --git a/src/leveldb/doc/table_format.txt b/src/leveldb/doc/table_format.txt
new file mode 100644
index 000000000..d0f3065ed
--- /dev/null
+++ b/src/leveldb/doc/table_format.txt
@@ -0,0 +1,102 @@
+File format
+===========
+
+  <beginning_of_file>
+  [data block 1]
+  [data block 2]
+  ...
+  [data block N]
+  [meta block 1]
+  ...
+  [meta block K]
+  [metaindex block]
+  [index block]
+  [Footer]        (fixed size; starts at file_size - sizeof(Footer))
+  <end_of_file>
+
+The file contains internal pointers.  Each such pointer is called
+a BlockHandle and contains the following information:
+  offset:	    varint64
+  size:		    varint64
+
+(1) The sequence of key/value pairs in the file are stored in sorted
+order and partitioned into a sequence of data blocks.  These blocks
+come one after another at the beginning of the file.  Each data block
+is formatted according to the code in block_builder.cc, and then
+optionally compressed.
+
+(2) After the data blocks we store a bunch of meta blocks.  The
+supported meta block types are described below.  More meta block types
+may be added in the future.  Each meta block is again formatted using
+block_builder.cc and then optionally compressed.
+
+(3) A "metaindex" block.  It contains one entry for every other meta
+block where the key is the name of the meta block and the value is a
+BlockHandle pointing to that meta block.
+
+(4) An "index" block.  This block contains one entry per data block,
+where the key is a string >= last key in that data block and before
+the first key in the successive data block.  The value is the
+BlockHandle for the data block.
+
+(6) At the very end of the file is a fixed length footer that contains
+the BlockHandle of the metaindex and index blocks as well as a magic number.
+       metaindex_handle:       char[p];    // Block handle for metaindex
+       index_handle:	       char[q];    // Block handle for index
+       padding:		       char[40-p-q]; // 0 bytes to make fixed length
+       			 	       // (40==2*BlockHandle::kMaxEncodedLength)
+       magic:		       fixed64;    // == 0xdb4775248b80fb57
+
+"filter" Meta Block
+-------------------
+
+If a "FilterPolicy" was specified when the database was opened, a
+filter block is stored in each table.  The "metaindex" block contains
+an entry that maps from "filter.<N>" to the BlockHandle for the filter
+block where "<N>" is the string returned by the filter policy's
+"Name()" method.
+
+The filter block stores a sequence of filters, where filter i contains
+the output of FilterPolicy::CreateFilter() on all keys that are stored
+in a block whose file offset falls within the range
+
+    [ i*base ... (i+1)*base-1 ]
+
+Currently, "base" is 2KB.  So for example, if blocks X and Y start in
+the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be
+converted to a filter by calling FilterPolicy::CreateFilter(), and the
+resulting filter will be stored as the first filter in the filter
+block.
+
+The filter block is formatted as follows:
+
+     [filter 0]
+     [filter 1]
+     [filter 2]
+     ...
+     [filter N-1]
+
+     [offset of filter 0]                  : 4 bytes
+     [offset of filter 1]                  : 4 bytes
+     [offset of filter 2]                  : 4 bytes
+     ...
+     [offset of filter N-1]                : 4 bytes
+
+     [offset of beginning of offset array] : 4 bytes
+     lg(base)                              : 1 byte
+
+The offset array at the end of the filter block allows efficient
+mapping from a data block offset to the corresponding filter.
+
+"stats" Meta Block
+------------------
+
+This meta block contains a bunch of stats.  The key is the name
+of the statistic.  The value contains the statistic.
+TODO(postrelease): record following stats.
+  data size
+  index size
+  key size (uncompressed)
+  value size (uncompressed)
+  number of entries
+  number of data blocks
diff --git a/src/leveldb/helpers/memenv/memenv.cc b/src/leveldb/helpers/memenv/memenv.cc
index 68c0614a5..efad9524a 100644
--- a/src/leveldb/helpers/memenv/memenv.cc
+++ b/src/leveldb/helpers/memenv/memenv.cc
@@ -55,15 +55,14 @@ class FileState {
     }
     const uint64_t available = size_ - offset;
     if (n > available) {
-      n = static_cast<size_t>(available);
+      n = available;
     }
     if (n == 0) {
       *result = Slice();
       return Status::OK();
     }
 
-    assert(offset / kBlockSize <= SIZE_MAX);
-    size_t block = static_cast<size_t>(offset / kBlockSize);
+    size_t block = offset / kBlockSize;
     size_t block_offset = offset % kBlockSize;
 
     if (n <= kBlockSize - block_offset) {
@@ -168,7 +167,7 @@ class SequentialFileImpl : public SequentialFile {
     if (pos_ > file_->Size()) {
       return Status::IOError("pos_ > file_->Size()");
     }
-    const uint64_t available = file_->Size() - pos_;
+    const size_t available = file_->Size() - pos_;
     if (n > available) {
       n = available;
     }
@@ -176,10 +175,9 @@ class SequentialFileImpl : public SequentialFile {
     return Status::OK();
   }
 
-  virtual std::string GetName() const { return "[memenv]"; }
  private:
   FileState* file_;
-  uint64_t pos_;
+  size_t pos_;
 };
 
 class RandomAccessFileImpl : public RandomAccessFile {
@@ -197,7 +195,6 @@ class RandomAccessFileImpl : public RandomAccessFile {
     return file_->Read(offset, n, result, scratch);
   }
 
-  virtual std::string GetName() const { return "[memenv]"; }
  private:
   FileState* file_;
 };
@@ -220,16 +217,10 @@ class WritableFileImpl : public WritableFile {
   virtual Status Flush() { return Status::OK(); }
   virtual Status Sync() { return Status::OK(); }
 
-  virtual std::string GetName() const { return "[memenv]"; }
  private:
   FileState* file_;
 };
 
-class NoOpLogger : public Logger {
- public:
-  virtual void Logv(const char* format, va_list ap) { }
-};
-
 class InMemoryEnv : public EnvWrapper {
  public:
   explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
@@ -266,7 +257,7 @@ class InMemoryEnv : public EnvWrapper {
   }
 
   virtual Status NewWritableFile(const std::string& fname,
-                                 WritableFile** result) {
+                                 WritableFile** result, size_t) {
     MutexLock lock(&mutex_);
     if (file_map_.find(fname) != file_map_.end()) {
       DeleteFileInternal(fname);
@@ -280,19 +271,6 @@ class InMemoryEnv : public EnvWrapper {
     return Status::OK();
   }
 
-  virtual Status NewAppendableFile(const std::string& fname,
-                                   WritableFile** result) {
-    MutexLock lock(&mutex_);
-    FileState** sptr = &file_map_[fname];
-    FileState* file = *sptr;
-    if (file == NULL) {
-      file = new FileState();
-      file->Ref();
-    }
-    *result = new WritableFileImpl(file);
-    return Status::OK();
-  }
-
   virtual bool FileExists(const std::string& fname) {
     MutexLock lock(&mutex_);
     return file_map_.find(fname) != file_map_.end();
@@ -380,11 +358,6 @@ class InMemoryEnv : public EnvWrapper {
     return Status::OK();
   }
 
-  virtual Status NewLogger(const std::string& fname, Logger** result) {
-    *result = new NoOpLogger;
-    return Status::OK();
-  }
-
  private:
   // Map from filenames to FileState objects, representing a simple file system.
   typedef std::map<std::string, FileState*> FileSystem;
diff --git a/src/leveldb/helpers/memenv/memenv_test.cc b/src/leveldb/helpers/memenv/memenv_test.cc
index 5cff77613..38ee6ac3e 100644
--- a/src/leveldb/helpers/memenv/memenv_test.cc
+++ b/src/leveldb/helpers/memenv/memenv_test.cc
@@ -29,68 +29,61 @@ TEST(MemEnvTest, Basics) {
   uint64_t file_size;
   WritableFile* writable_file;
   std::vector<std::string> children;
+  std::string dbname;
 
-  ASSERT_OK(env_->CreateDir("/dir"));
+  dbname=test::TmpDir();
+  ASSERT_OK(env_->CreateDir(dbname.c_str()));
 
   // Check that the directory is empty.
-  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
-  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
-  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_TRUE(!env_->FileExists(dbname + "/non_existent"));
+  ASSERT_TRUE(!env_->GetFileSize(dbname + "/non_existent", &file_size).ok());
+  ASSERT_OK(env_->GetChildren(dbname + "", &children));
   ASSERT_EQ(0, children.size());
 
   // Create a file.
-  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
-  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
-  ASSERT_EQ(0, file_size);
+  ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
   delete writable_file;
 
   // Check that the file exists.
-  ASSERT_TRUE(env_->FileExists("/dir/f"));
-  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_TRUE(env_->FileExists(dbname + "/f"));
+  ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size));
   ASSERT_EQ(0, file_size);
-  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_OK(env_->GetChildren(dbname + "", &children));
   ASSERT_EQ(1, children.size());
   ASSERT_EQ("f", children[0]);
 
   // Write to the file.
-  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
+  ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
   ASSERT_OK(writable_file->Append("abc"));
   delete writable_file;
 
-  // Check that append works.
-  ASSERT_OK(env_->NewAppendableFile("/dir/f", &writable_file));
-  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
-  ASSERT_EQ(3, file_size);
-  ASSERT_OK(writable_file->Append("hello"));
-  delete writable_file;
-
   // Check for expected size.
-  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
-  ASSERT_EQ(8, file_size);
+  ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size));
+  ASSERT_EQ(3, file_size);
 
   // Check that renaming works.
-  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
-  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
-  ASSERT_TRUE(!env_->FileExists("/dir/f"));
-  ASSERT_TRUE(env_->FileExists("/dir/g"));
-  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
-  ASSERT_EQ(8, file_size);
+  ASSERT_TRUE(!env_->RenameFile(dbname + "/non_existent", dbname + "/g").ok());
+  ASSERT_OK(env_->RenameFile(dbname + "/f", dbname + "/g"));
+  ASSERT_TRUE(!env_->FileExists(dbname + "/f"));
+  ASSERT_TRUE(env_->FileExists(dbname + "/g"));
+  ASSERT_OK(env_->GetFileSize(dbname + "/g", &file_size));
+  ASSERT_EQ(3, file_size);
 
   // Check that opening non-existent file fails.
   SequentialFile* seq_file;
   RandomAccessFile* rand_file;
-  ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file).ok());
+  ASSERT_TRUE(!env_->NewSequentialFile(dbname + "/non_existent", &seq_file).ok());
   ASSERT_TRUE(!seq_file);
-  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file).ok());
+  ASSERT_TRUE(!env_->NewRandomAccessFile(dbname + "/non_existent", &rand_file).ok());
   ASSERT_TRUE(!rand_file);
 
   // Check that deleting works.
-  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
-  ASSERT_OK(env_->DeleteFile("/dir/g"));
-  ASSERT_TRUE(!env_->FileExists("/dir/g"));
-  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_TRUE(!env_->DeleteFile(dbname + "/non_existent").ok());
+  ASSERT_OK(env_->DeleteFile(dbname + "/g"));
+  ASSERT_TRUE(!env_->FileExists(dbname + "/g"));
+  ASSERT_OK(env_->GetChildren(dbname + "", &children));
   ASSERT_EQ(0, children.size());
-  ASSERT_OK(env_->DeleteDir("/dir"));
+  ASSERT_OK(env_->DeleteDir(dbname + ""));
 }
 
 TEST(MemEnvTest, ReadWrite) {
@@ -99,16 +92,19 @@ TEST(MemEnvTest, ReadWrite) {
   RandomAccessFile* rand_file;
   Slice result;
   char scratch[100];
+  std::string dbname;
 
-  ASSERT_OK(env_->CreateDir("/dir"));
+  dbname=test::TmpDir();
 
-  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
+  ASSERT_OK(env_->CreateDir(dbname + ""));
+
+  ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
   ASSERT_OK(writable_file->Append("hello "));
   ASSERT_OK(writable_file->Append("world"));
   delete writable_file;
 
   // Read sequentially.
-  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
+  ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file));
   ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
   ASSERT_EQ(0, result.compare("hello"));
   ASSERT_OK(seq_file->Skip(1));
@@ -122,7 +118,7 @@ TEST(MemEnvTest, ReadWrite) {
   delete seq_file;
 
   // Random reads.
-  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file));
+  ASSERT_OK(env_->NewRandomAccessFile(dbname + "/f", &rand_file));
   ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
   ASSERT_EQ(0, result.compare("world"));
   ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
@@ -149,7 +145,7 @@ TEST(MemEnvTest, Misc) {
   ASSERT_TRUE(!test_dir.empty());
 
   WritableFile* writable_file;
-  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file));
+  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, 2<<20));
 
   // These are no-ops, but we test they return success.
   ASSERT_OK(writable_file->Sync());
@@ -161,6 +157,9 @@ TEST(MemEnvTest, Misc) {
 TEST(MemEnvTest, LargeWrite) {
   const size_t kWriteSize = 300 * 1024;
   char* scratch = new char[kWriteSize * 2];
+  std::string dbname;
+
+  dbname=test::TmpDir();
 
   std::string write_data;
   for (size_t i = 0; i < kWriteSize; ++i) {
@@ -168,14 +167,14 @@ TEST(MemEnvTest, LargeWrite) {
   }
 
   WritableFile* writable_file;
-  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
+  ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
   ASSERT_OK(writable_file->Append("foo"));
   ASSERT_OK(writable_file->Append(write_data));
   delete writable_file;
 
   SequentialFile* seq_file;
   Slice result;
-  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
+  ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file));
   ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
   ASSERT_EQ(0, result.compare("foo"));
 
@@ -190,17 +189,21 @@ TEST(MemEnvTest, LargeWrite) {
   delete seq_file;
   delete [] scratch;
 }
-
+#if 0
 TEST(MemEnvTest, DBTest) {
   Options options;
   options.create_if_missing = true;
   options.env = env_;
   DB* db;
+  std::string dbname;
+
+  dbname=test::TmpDir();
+  ASSERT_OK(env_->CreateDir(dbname+ "/db"));
 
   const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
   const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
 
-  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  ASSERT_OK(DB::Open(options, dbname + "/db", &db));
   for (size_t i = 0; i < 3; ++i) {
     ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
   }
@@ -233,7 +236,7 @@ TEST(MemEnvTest, DBTest) {
 
   delete db;
 }
-
+#endif
 }  // namespace leveldb
 
 int main(int argc, char** argv) {
diff --git a/src/leveldb/include/leveldb/atomics.h b/src/leveldb/include/leveldb/atomics.h
new file mode 100644
index 000000000..6b2a4887b
--- /dev/null
+++ b/src/leveldb/include/leveldb/atomics.h
@@ -0,0 +1,227 @@
+// -------------------------------------------------------------------
+//
+// atomics.h: portable atomic operations for leveldb/eleveldb (http://code.google.com/p/leveldb/)
+//
+// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+/// Copied from basho/eleveldb/c_src/detail.hpp September 8, 2013
+
+#ifndef LEVELDB_ATOMIC_H
+ #define LEVELDB_ATOMIC_H 1
+
+#include <stdint.h>
+#include <stddef.h>
+
+/* These can be hopefully-replaced with constexpr or compile-time assert later: */
+#if defined(OS_SOLARIS) || defined(SOLARIS) || defined(sun)
+ #define LEVELDB_IS_SOLARIS 1
+#else
+ #undef LEVELDB_IS_SOLARIS
+#endif
+
+#ifdef LEVELDB_IS_SOLARIS
+ #include <atomic.h>
+#endif
+
+namespace leveldb {
+
+/**
+ * Compare and swap
+ */
+
+// primary template
+template <typename PtrT, typename ValueT>
+inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val);
+
+
+// uint32 size (needed for solaris)
+template <>
+inline bool compare_and_swap(volatile uint32_t *ptr, const int& comp_val, const int& exchange_val)
+{
+#if LEVELDB_IS_SOLARIS
+  return ((uint32_t) comp_val==atomic_cas_32(ptr, comp_val, exchange_val));
+#else
+    return __sync_bool_compare_and_swap(ptr, comp_val, exchange_val);
+#endif
+}
+
+
+// generic specification ... for pointers
+template <typename PtrT, typename ValueT>
+inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val)
+{
+#if LEVELDB_IS_SOLARIS
+    return (comp_val==atomic_cas_ptr(ptr, comp_val, exchange_val));
+#else
+    return __sync_bool_compare_and_swap(ptr, comp_val, exchange_val);
+#endif
+}
+
+
+/**
+ * Atomic increment
+ */
+
+template <typename ValueT>
+inline ValueT inc_and_fetch(volatile ValueT *ptr);
+
+template <>
+inline uint64_t inc_and_fetch(volatile uint64_t *ptr)
+{
+#if LEVELDB_IS_SOLARIS
+    return atomic_inc_64_nv(ptr);
+#else
+    return __sync_add_and_fetch(ptr, 1);
+#endif
+}
+
+template <>
+inline uint32_t inc_and_fetch(volatile uint32_t *ptr)
+{
+#if LEVELDB_IS_SOLARIS
+    return atomic_inc_32_nv(ptr);
+#else
+    return __sync_add_and_fetch(ptr, 1);
+#endif
+}
+
+#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
+template <>
+inline size_t inc_and_fetch(volatile size_t *ptr)
+{
+    return __sync_add_and_fetch(ptr, 1);
+}
+#endif
+
+
+/**
+ * atomic decrement
+ */
+
+template <typename ValueT>
+inline ValueT dec_and_fetch(volatile ValueT *ptr);
+
+template <>
+inline uint64_t dec_and_fetch(volatile uint64_t *ptr)
+{
+#if LEVELDB_IS_SOLARIS
+    return atomic_dec_64_nv(ptr);
+#else
+    return __sync_sub_and_fetch(ptr, 1);
+#endif
+}
+
+template <>
+inline uint32_t dec_and_fetch(volatile uint32_t *ptr)
+{
+#if LEVELDB_IS_SOLARIS
+    return atomic_dec_32_nv(ptr);
+#else
+    return __sync_sub_and_fetch(ptr, 1);
+#endif
+}
+
+#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
+template <>
+inline size_t dec_and_fetch(volatile size_t *ptr)
+{
+    return __sync_sub_and_fetch(ptr, 1);
+}
+#endif
+
+
+/**
+ * Atomic add
+ */
+
+
+template <typename ValueT>
+inline ValueT add_and_fetch(volatile ValueT *ptr, ValueT val);
+
+template <>
+inline uint64_t add_and_fetch(volatile uint64_t *ptr, uint64_t val)
+{
+#if LEVELDB_IS_SOLARIS
+    return atomic_add_64_nv(ptr, val);
+#else
+    return __sync_add_and_fetch(ptr, val);
+#endif
+}
+
+template <>
+inline uint32_t add_and_fetch(volatile uint32_t *ptr, uint32_t val)
+{
+#if LEVELDB_IS_SOLARIS
+    return atomic_add_32_nv(ptr, val);
+#else
+    return __sync_add_and_fetch(ptr, val);
+#endif
+}
+
+#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
+template <>
+inline size_t add_and_fetch(volatile size_t *ptr, size_t val)
+{
+    return __sync_add_and_fetch(ptr, val);
+}
+#endif
+
+
+/**
+ * Atomic subtract
+ */
+
+template <typename ValueT>
+inline ValueT sub_and_fetch(volatile ValueT *ptr, ValueT val);
+
+template <>
+inline uint64_t sub_and_fetch(volatile uint64_t *ptr, uint64_t val)
+{
+#if LEVELDB_IS_SOLARIS
+    uint64_t temp=(~val)+1;  // 2's complement, bypass sign warnings
+    return atomic_add_64_nv(ptr, temp);
+#else
+    return __sync_sub_and_fetch(ptr, val);
+#endif
+}
+
+template <>
+inline uint32_t sub_and_fetch(volatile uint32_t *ptr, uint32_t val)
+{
+#if LEVELDB_IS_SOLARIS
+    uint32_t temp=(~val)+1;  // 2's complement, bypass sign warnings
+    return atomic_add_32_nv(ptr, temp);
+#else
+    return __sync_sub_and_fetch(ptr, val);
+#endif
+}
+
+#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
+template <>
+inline size_t sub_and_fetch(volatile size_t *ptr, size_t val)
+{
+    return __sync_sub_and_fetch(ptr, val);
+}
+#endif
+
+
+
+} // namespace leveldb
+
+#endif
diff --git a/src/leveldb/include/leveldb/c.h b/src/leveldb/include/leveldb/c.h
index 1048fe3b8..d3eda280e 100644
--- a/src/leveldb/include/leveldb/c.h
+++ b/src/leveldb/include/leveldb/c.h
@@ -9,6 +9,7 @@
   Does not support:
   . getters for the option types
   . custom comparators that implement key shortening
+  . capturing post-write-snapshot
   . custom iter, db, env, cache implementations using just the C bindings
 
   Some conventions:
@@ -27,7 +28,6 @@
   be true on entry:
      *errptr == NULL
      *errptr points to a malloc()ed null-terminated error message
-       (On Windows, *errptr must have been malloc()-ed by this library.)
   On success, a leveldb routine leaves *errptr unchanged.
   On failure, leveldb frees the old value of *errptr and
   set *errptr to a malloc()ed error message.
@@ -66,7 +66,7 @@ typedef struct leveldb_snapshot_t      leveldb_snapshot_t;
 typedef struct leveldb_writablefile_t  leveldb_writablefile_t;
 typedef struct leveldb_writebatch_t    leveldb_writebatch_t;
 typedef struct leveldb_writeoptions_t  leveldb_writeoptions_t;
-
+typedef struct leveldb_keymetadata_t   leveldb_keymetadata_t;
 /* DB operations */
 
 extern leveldb_t* leveldb_open(
@@ -83,6 +83,14 @@ extern void leveldb_put(
     const char* val, size_t vallen,
     char** errptr);
 
+extern void leveldb_put2(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr,
+    const leveldb_keymetadata_t * metadata);
+
 extern void leveldb_delete(
     leveldb_t* db,
     const leveldb_writeoptions_t* options,
@@ -104,6 +112,14 @@ extern char* leveldb_get(
     size_t* vallen,
     char** errptr);
 
+extern char* leveldb_get2(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr,
+    leveldb_keymetadata_t * metadata);
+
 extern leveldb_iterator_t* leveldb_create_iterator(
     leveldb_t* db,
     const leveldb_readoptions_t* options);
@@ -156,6 +172,7 @@ extern void leveldb_iter_next(leveldb_iterator_t*);
 extern void leveldb_iter_prev(leveldb_iterator_t*);
 extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
 extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
+extern const void leveldb_iter_keymetadata(const leveldb_iterator_t *, leveldb_keymetadata_t *);
 extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
 
 /* Write batch */
@@ -167,13 +184,19 @@ extern void leveldb_writebatch_put(
     leveldb_writebatch_t*,
     const char* key, size_t klen,
     const char* val, size_t vlen);
+extern void leveldb_writebatch_put2(
+    leveldb_writebatch_t*,
+    const char* key, size_t klen,
+    const char* val, size_t vlen,
+    const leveldb_keymetadata_t * meta);
 extern void leveldb_writebatch_delete(
     leveldb_writebatch_t*,
     const char* key, size_t klen);
 extern void leveldb_writebatch_iterate(
     leveldb_writebatch_t*,
     void* state,
-    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen,
+                const int & type, const uint64_t & expiry),
     void (*deleted)(void*, const char* k, size_t klen));
 
 /* Options */
@@ -192,6 +215,8 @@ extern void leveldb_options_set_error_if_exists(
     leveldb_options_t*, unsigned char);
 extern void leveldb_options_set_paranoid_checks(
     leveldb_options_t*, unsigned char);
+extern void leveldb_options_set_verify_compactions(
+    leveldb_options_t*, unsigned char);
 extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
 extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
 extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
@@ -199,6 +224,7 @@ extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
 extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
 extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
 extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
+extern void leveldb_options_set_total_leveldb_mem(leveldb_options_t*, size_t);
 
 enum {
   leveldb_no_compression = 0,
@@ -267,20 +293,20 @@ extern void leveldb_cache_destroy(leveldb_cache_t* cache);
 
 extern leveldb_env_t* leveldb_create_default_env();
 extern void leveldb_env_destroy(leveldb_env_t*);
+extern void leveldb_env_shutdown();
 
-/* Utility */
+/* Util */
 
-/* Calls free(ptr).
-   REQUIRES: ptr was malloc()-ed and returned by one of the routines
-   in this file.  Note that in certain cases (typically on Windows), you
-   may need to call this routine instead of free(ptr) to dispose of
-   malloc()-ed memory returned by this library. */
+/**
+ * CAUTION:  this call is only for char * objects returned by
+ *           functions like leveldb_get and leveldb_property_value.
+ *           Also used to release errptr strings.
+ */
 extern void leveldb_free(void* ptr);
 
-/* Return the major version number for this release. */
-extern int leveldb_major_version();
+/* Version */
 
-/* Return the minor version number for this release. */
+extern int leveldb_major_version();
 extern int leveldb_minor_version();
 
 #ifdef __cplusplus
diff --git a/src/leveldb/include/leveldb/cache.h b/src/leveldb/include/leveldb/cache.h
index 6819d5bc4..224e18d2a 100644
--- a/src/leveldb/include/leveldb/cache.h
+++ b/src/leveldb/include/leveldb/cache.h
@@ -29,6 +29,11 @@ class Cache;
 // of Cache uses a least-recently-used eviction policy.
 extern Cache* NewLRUCache(size_t capacity);
 
+// Riak customization - just like NewLRUCache except the underlying
+//  structure is NOT sharded.  Better for file cache.
+extern Cache* NewLRUCache2(size_t capacity);
+
+
 class Cache {
  public:
   Cache() { }
@@ -81,16 +86,17 @@ class Cache {
   // its cache keys.
   virtual uint64_t NewId() = 0;
 
-  // Remove all cache entries that are not actively in use.  Memory-constrained
-  // applications may wish to call this method to reduce memory usage.
-  // Default implementation of Prune() does nothing.  Subclasses are strongly
-  // encouraged to override the default implementation.  A future release of
-  // leveldb may change Prune() to a pure abstract method.
-  virtual void Prune() {}
+  // Return size, if any, of per entry overhead for item placed in cache.
+  // Allows more accurate tracking of "charge" against each cache item.
+  virtual size_t EntryOverheadSize() {return(0);};
 
-  // Return an estimate of the combined charges of all elements stored in the
-  // cache.
-  virtual size_t TotalCharge() const = 0;
+  // Riak specific:  Add a reference to cache object to help hold it
+  //  in memory
+  virtual void Addref(Handle* e) = 0;
+
+  // Riak specific:  walk contents of entire cache, calling functor Acc
+  // with the "value" for each cache entry.  Locks cache throughout call.
+  virtual bool WalkCache(class CacheAccumulator & Acc) {return(true);};
 
  private:
   void LRU_Remove(Handle* e);
@@ -107,4 +113,4 @@ class Cache {
 
 }  // namespace leveldb
 
-#endif  // STORAGE_LEVELDB_INCLUDE_CACHE_H_
+#endif  // STORAGE_LEVELDB_UTIL_CACHE_H_
diff --git a/src/leveldb/include/leveldb/comparator.h b/src/leveldb/include/leveldb/comparator.h
index 556b984c7..38b59539e 100644
--- a/src/leveldb/include/leveldb/comparator.h
+++ b/src/leveldb/include/leveldb/comparator.h
@@ -58,6 +58,10 @@ class Comparator {
 // must not be deleted.
 extern const Comparator* BytewiseComparator();
 
+// Riak specific: cleans up the default comparitor to make
+//  valgrind results clean
+extern void ComparatorShutdown();
+
 }  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
diff --git a/src/leveldb/include/leveldb/db.h b/src/leveldb/include/leveldb/db.h
index bfab10a0b..d2bd6dce3 100644
--- a/src/leveldb/include/leveldb/db.h
+++ b/src/leveldb/include/leveldb/db.h
@@ -14,7 +14,7 @@ namespace leveldb {
 
 // Update Makefile if you change these
 static const int kMajorVersion = 1;
-static const int kMinorVersion = 20;
+static const int kMinorVersion = 9;
 
 struct Options;
 struct ReadOptions;
@@ -38,6 +38,17 @@ struct Range {
   Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
 };
 
+// Abstract holder for a DB value.
+// This allows callers to manage their own value buffers and have
+// DB values copied directly into those buffers.
+class Value {
+ public:
+  virtual Value& assign(const char* data, size_t size) = 0;
+
+ protected:
+  virtual ~Value();
+};
+
 // A DB is a persistent ordered map from keys to values.
 // A DB is safe for concurrent access from multiple threads without
 // any external synchronization.
@@ -60,7 +71,8 @@ class DB {
   // Note: consider setting options.sync = true.
   virtual Status Put(const WriteOptions& options,
                      const Slice& key,
-                     const Slice& value) = 0;
+                     const Slice& value,
+                     const KeyMetaData * meta=NULL) = 0;
 
   // Remove the database entry (if any) for "key".  Returns OK on
   // success, and a non-OK status on error.  It is not an error if "key"
@@ -81,7 +93,11 @@ class DB {
   //
   // May return some other Status on an error.
   virtual Status Get(const ReadOptions& options,
-                     const Slice& key, std::string* value) = 0;
+                     const Slice& key, std::string* value,
+                     KeyMetaData * meta=NULL) = 0;
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key, Value* value,
+                     KeyMetaData * meta=NULL) = 0;
 
   // Return a heap-allocated iterator over the contents of the database.
   // The result of NewIterator() is initially invalid (caller must
@@ -115,8 +131,6 @@ class DB {
   //     about the internal operation of the DB.
   //  "leveldb.sstables" - returns a multi-line string that describes all
   //     of the sstables that make up the db contents.
-  //  "leveldb.approximate-memory-usage" - returns the approximate number of
-  //     bytes of memory in use by the DB.
   virtual bool GetProperty(const Slice& property, std::string* value) = 0;
 
   // For each i in [0,n-1], store in "sizes[i]", the approximate
@@ -142,6 +156,21 @@ class DB {
   //    db->CompactRange(NULL, NULL);
   virtual void CompactRange(const Slice* begin, const Slice* end) = 0;
 
+  // Riak specific function:  Verify that no .sst files overlap
+  // within the levels that expect non-overlapping files.  Run
+  // compactions as necessary to correct.  Assumes DB opened
+  // with Options.is_repair=true
+  virtual Status VerifyLevels();
+
+  // Riak specific function:  Request database check for
+  // available compactions.  This is to stimulate retry of
+  // grooming that might have been offered and rejected previously
+  virtual void CheckAvailableCompactions();
+
+  // Riak specific function:  Give external code, namely
+  // eleveldb, access to leveldb's logging routines.
+  virtual Logger* GetLogger() const { return NULL; }
+
  private:
   // No copying allowed
   DB(const DB&);
diff --git a/src/leveldb/include/leveldb/dumpfile.h b/src/leveldb/include/leveldb/dumpfile.h
deleted file mode 100644
index 3f97fda16..000000000
--- a/src/leveldb/include/leveldb/dumpfile.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
-#define STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
-
-#include <string>
-#include "leveldb/env.h"
-#include "leveldb/status.h"
-
-namespace leveldb {
-
-// Dump the contents of the file named by fname in text format to
-// *dst.  Makes a sequence of dst->Append() calls; each call is passed
-// the newline-terminated text corresponding to a single item found
-// in the file.
-//
-// Returns a non-OK result if fname does not name a leveldb storage
-// file, or if the file cannot be read.
-Status DumpFile(Env* env, const std::string& fname, WritableFile* dst);
-
-}  // namespace leveldb
-
-#endif  // STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
diff --git a/src/leveldb/include/leveldb/env.h b/src/leveldb/include/leveldb/env.h
index 275d441ea..e1df0c78d 100644
--- a/src/leveldb/include/leveldb/env.h
+++ b/src/leveldb/include/leveldb/env.h
@@ -13,15 +13,19 @@
 #ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
 #define STORAGE_LEVELDB_INCLUDE_ENV_H_
 
+#include <cstdarg>
+#include <pthread.h>
 #include <string>
 #include <vector>
-#include <stdarg.h>
 #include <stdint.h>
+#include "leveldb/perf_count.h"
 #include "leveldb/status.h"
 
 namespace leveldb {
 
+class AppendableFile;
 class FileLock;
+struct Options;
 class Logger;
 class RandomAccessFile;
 class SequentialFile;
@@ -40,6 +44,11 @@ class Env {
   // The result of Default() belongs to leveldb and must never be deleted.
   static Env* Default();
 
+  // Riak specific:  Shutdown background work threads and other objects
+  //  to get clean environment for valgrind memory test.  No restart supported
+  //  after this call.  Not thread safe.
+  static void Shutdown();
+
   // Create a brand new sequentially-readable file with the specified name.
   // On success, stores a pointer to the new file in *result and returns OK.
   // On failure stores NULL in *result and returns non-OK.  If the file does
@@ -67,22 +76,31 @@ class Env {
   //
   // The returned file will only be accessed by one thread at a time.
   virtual Status NewWritableFile(const std::string& fname,
-                                 WritableFile** result) = 0;
+                                 WritableFile** result,
+                                 size_t map_size) = 0;
 
-  // Create an object that either appends to an existing file, or
-  // writes to a new file (if the file does not exist to begin with).
-  // On success, stores a pointer to the new file in *result and
-  // returns OK.  On failure stores NULL in *result and returns
-  // non-OK.
+  // Riak specific:
+  // Derived from NewWritableFile.  One change: if the file exists,
+  // move to the end of the file and continue writing.
+  // new file.  On success, stores a pointer to the open file in
+  // *result and returns OK.  On failure stores NULL in *result and
+  // returns non-OK.
   //
   // The returned file will only be accessed by one thread at a time.
-  //
-  // May return an IsNotSupportedError error if this Env does
-  // not allow appending to an existing file.  Users of Env (including
-  // the leveldb implementation) must be prepared to deal with
-  // an Env that does not support appending.
   virtual Status NewAppendableFile(const std::string& fname,
-                                   WritableFile** result);
+                                   WritableFile** result,
+                                   size_t map_size) = 0;
+
+  // Riak specific:
+  // Allows for virtualized version of NewWritableFile that enables write
+  // and close operations to execute on background threads
+  //  (where platform supported).
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewWriteOnlyFile(const std::string& fname,
+                                  WritableFile** result,
+                                  size_t map_size)
+  {return(NewWritableFile(fname, result, map_size));};
 
   // Returns true iff the named file exists.
   virtual bool FileExists(const std::string& fname) = 0;
@@ -142,7 +160,7 @@ class Env {
 
   // Start a new thread, invoking "function(arg)" within the new thread.
   // When "function(arg)" returns, the thread will be destroyed.
-  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+  virtual pthread_t StartThread(void (*function)(void* arg), void* arg) = 0;
 
   // *path is set to a temporary directory that can be used for testing. It may
   // or many not have just been created. The directory may or may not differ
@@ -157,9 +175,16 @@ class Env {
   // useful for computing deltas of time.
   virtual uint64_t NowMicros() = 0;
 
-  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  // Sleep/delay the thread for the perscribed number of micro-seconds.
   virtual void SleepForMicroseconds(int micros) = 0;
 
+  // Riak specific:  Get object that is tracking various software counters
+  virtual PerformanceCounters * GetPerformanceCounters() {return(gPerfCounters);};
+
+  // Riak specific:  Request size of recovery memory map, potentially using
+  //  Options data for the decision.  Default 2Mbyte is Google's original size.
+  virtual size_t RecoveryMmapSize(const struct Options *) const {return(2*1024*1024L);};
+
  private:
   // No copying allowed
   Env(const Env&);
@@ -190,14 +215,6 @@ class SequentialFile {
   //
   // REQUIRES: External synchronization
   virtual Status Skip(uint64_t n) = 0;
-
-  // Get a name for the file, only for error reporting
-  virtual std::string GetName() const = 0;
-
- private:
-  // No copying allowed
-  SequentialFile(const SequentialFile&);
-  void operator=(const SequentialFile&);
 };
 
 // A file abstraction for randomly reading the contents of a file.
@@ -218,13 +235,11 @@ class RandomAccessFile {
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const = 0;
 
-  // Get a name for the file, only for error reporting
-  virtual std::string GetName() const = 0;
+  // Riak optimization:  allows advising Linux page cache
+  virtual void SetForCompaction(uint64_t file_size) {};
 
- private:
-  // No copying allowed
-  RandomAccessFile(const RandomAccessFile&);
-  void operator=(const RandomAccessFile&);
+  // Riak addition:  size of this structure in bytes
+  virtual size_t ObjectSize() {return(sizeof(RandomAccessFile));};
 };
 
 // A file abstraction for sequential writing.  The implementation
@@ -240,8 +255,10 @@ class WritableFile {
   virtual Status Flush() = 0;
   virtual Status Sync() = 0;
 
-  // Get a name for the file, only for error reporting
-  virtual std::string GetName() const = 0;
+  // Riak specific:
+  // Provide hint where key/value data ends and metadata starts
+  //  in an .sst table file.
+  virtual void SetMetadataOffset(uint64_t) {};
 
  private:
   // No copying allowed
@@ -249,12 +266,30 @@ class WritableFile {
   void operator=(const WritableFile&);
 };
 
+// A file abstraction for sequential writing at end of existing file.
+class AppendableFile: public WritableFile {
+ public:
+  AppendableFile() { }
+  virtual ~AppendableFile();
+
+ private:
+  // No copying allowed
+  AppendableFile(const AppendableFile&);
+  void operator=(const AppendableFile&);
+};
+
 // An interface for writing log messages.
 class Logger {
  public:
   Logger() { }
   virtual ~Logger();
 
+  // Riak specific function for hot backup.
+  //  hot_backup.cc assumes that it can rotate the LOG file
+  //  via standard Env routines if this function returns a
+  //  non-zero value.
+  virtual long LogSize() {return(0);};
+
   // Write an entry to the log file with the specified format.
   virtual void Logv(const char* format, va_list ap) = 0;
 
@@ -310,11 +345,14 @@ class EnvWrapper : public Env {
   Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
     return target_->NewRandomAccessFile(f, r);
   }
-  Status NewWritableFile(const std::string& f, WritableFile** r) {
-    return target_->NewWritableFile(f, r);
+  Status NewWritableFile(const std::string& f, WritableFile** r, size_t s=0) {
+    return target_->NewWritableFile(f, r, s);
   }
-  Status NewAppendableFile(const std::string& f, WritableFile** r) {
-    return target_->NewAppendableFile(f, r);
+  Status NewAppendableFile(const std::string& f, WritableFile** r, size_t s=0) {
+      return target_->NewAppendableFile(f, r, s);
+  }
+  Status NewWriteOnlyFile(const std::string& f, WritableFile** r, size_t s=0) {
+    return target_->NewWriteOnlyFile(f, r, s);
   }
   bool FileExists(const std::string& f) { return target_->FileExists(f); }
   Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
@@ -334,9 +372,9 @@ class EnvWrapper : public Env {
   }
   Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
   void Schedule(void (*f)(void*), void* a) {
-    return target_->Schedule(f, a);
+      return target_->Schedule(f, a);
   }
-  void StartThread(void (*f)(void*), void* a) {
+  pthread_t StartThread(void (*f)(void*), void* a) {
     return target_->StartThread(f, a);
   }
   virtual Status GetTestDirectory(std::string* path) {
@@ -355,6 +393,12 @@ class EnvWrapper : public Env {
   Env* target_;
 };
 
+// Riak specific hack to allow runtime change
+//  of mapping size
+extern volatile size_t gMapSize;
+
+extern bool gFadviseWillNeed;
+
 }  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_INCLUDE_ENV_H_
diff --git a/src/leveldb/include/leveldb/expiry.h b/src/leveldb/include/leveldb/expiry.h
new file mode 100644
index 000000000..c5be6603a
--- /dev/null
+++ b/src/leveldb/include/leveldb/expiry.h
@@ -0,0 +1,135 @@
+// -------------------------------------------------------------------
+//
+// expiry.h:  background expiry management for Basho's modified leveldb
+//
+// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#ifndef EXPIRY_H
+#define EXPIRY_H
+
+#include <limits.h>
+#include <stdint.h>
+#include "leveldb/env.h"
+#include "leveldb/options.h"
+#include "util/refobject_base.h"
+
+namespace leveldb {
+
+class Compaction;
+class Logger;
+struct ParsedInternalKey;
+class Slice;
+class SstCounters;
+class Version;
+class VersionEdit;
+struct FileMetaData;
+
+
+enum EleveldbRouterActions_t
+{
+    eGetBucketProperties=1
+};  // enum EleveldbRouterActions_t
+
+
+typedef bool (* EleveldbRouter_t)(EleveldbRouterActions_t Action, int ParamCount, const void ** Params);
+
+
+class ExpiryModule : public RefObjectBase
+{
+public:
+    virtual ~ExpiryModule() {};
+
+    // Print expiry options to LOG file
+    virtual void Dump(Logger * log) const
+    {Log(log,"                        Expiry: (none)");};
+
+    // Quick test to allow manifest logic and such know if
+    //  extra expiry logic should be checked
+    virtual bool ExpiryActivated() const {return(false);};
+
+    // db/write_batch.cc MemTableInserter::Put() calls this.
+    // returns false on internal error
+    virtual bool MemTableInserterCallback(
+        const Slice & Key,   // input: user's key about to be written
+        const Slice & Value, // input: user's value object
+        ValueType & ValType,   // input/output: key type. call might change
+        ExpiryTimeMicros & Expiry) const  // input/output: 0 or specific expiry. call might change
+    {return(true);};
+
+    // db/dbformat.cc KeyRetirement::operator() calls this.
+    // db/version_set.cc SaveValue() calls this too.
+    // returns true if key is expired, returns false if key not expired
+    virtual bool KeyRetirementCallback(
+        const ParsedInternalKey & Ikey) const
+    {return(false);};
+
+    // table/table_builder.cc TableBuilder::Add() calls this.
+    // returns false on internal error
+    virtual bool TableBuilderCallback(
+        const Slice & Key,       // input: internal key
+        SstCounters & Counters) const // input/output: counters for new sst table
+    {return(true);};
+
+    // db/memtable.cc MemTable::Get() calls this.
+    // returns true if type/expiry is expired, returns false if not expired
+    virtual bool MemTableCallback(
+        const Slice & Key) const        // input: leveldb internal key
+    {return(false);};
+
+    // db/version_set.cc VersionSet::Finalize() calls this if no
+    //  other compaction selected for a level
+    // returns true if there is an expiry compaction eligible
+    virtual bool CompactionFinalizeCallback(
+        bool WantAll,                 // input: true - examine all expired files
+        const Version & Ver,          // input: database state for examination
+        int Level,                    // input: level to review for expiry
+        VersionEdit * Edit) const     // output: NULL or destination of delete list
+    {return(false);};
+
+    // yep, sometimes we want to expiry this expiry module object.
+    //  mostly for bucket level properties in Riak EE
+    virtual uint64_t ExpiryModuleExpiryMicros() {return(0);};
+
+    // Creates derived ExpiryModule object that matches compile time
+    //  switch for open source or Basho enterprise edition features.
+    static ExpiryModule * CreateExpiryModule(EleveldbRouter_t Router);
+
+    // Cleans up global objects related to expiry
+    //  switch for open source or Basho enterprise edition features.
+    static void ShutdownExpiryModule();
+
+    // Riak EE:  stash a user created module with settings
+    virtual void NoteUserExpirySettings() {};
+
+protected:
+    ExpiryModule() {};
+
+private:
+    ExpiryModule(const ExpiryModule &);
+    ExpiryModule & operator=(const ExpiryModule &);
+
+};  // ExpiryModule
+
+
+typedef RefPtr<class ExpiryModule> ExpiryPtr_t;
+
+} // namespace leveldb
+
+#endif // ifndef
+
diff --git a/src/leveldb/include/leveldb/filter_policy.h b/src/leveldb/include/leveldb/filter_policy.h
index 1fba08001..9369f7224 100644
--- a/src/leveldb/include/leveldb/filter_policy.h
+++ b/src/leveldb/include/leveldb/filter_policy.h
@@ -23,9 +23,21 @@ namespace leveldb {
 class Slice;
 
 class FilterPolicy {
- public:
+protected:
+  mutable const FilterPolicy * m_Next;      // used by FilterInventory
+
+public:
+  FilterPolicy()
+      : m_Next(NULL)
+  {};
+
   virtual ~FilterPolicy();
 
+  // list pointer accessors
+  const FilterPolicy * GetNext() const {return(m_Next);};
+  void SetNext(const FilterPolicy * Next) const {m_Next=Next;};
+
+
   // Return the name of this policy.  Note that if the filter encoding
   // changes in an incompatible way, the name returned by this method
   // must be changed.  Otherwise, old incompatible filters may be
@@ -47,6 +59,7 @@ class FilterPolicy {
   // This method may return true or false if the key was not on the
   // list, but it should aim to return false with a high probability.
   virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+
 };
 
 // Return a new filter policy that uses a bloom filter with approximately
@@ -64,7 +77,29 @@ class FilterPolicy {
 // FilterPolicy (like NewBloomFilterPolicy) that does not ignore
 // trailing spaces in keys.
 extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
+extern const FilterPolicy* NewBloomFilterPolicy2(int bits_per_key);
 
-}
+
+class FilterInventory
+{
+public:
+    // MUST be static variable so that it initializes before any static objects
+    //  have their initializers called
+    static const FilterPolicy * ListHead;
+
+    // This might be called prior to singleton FilterInventory object
+    //  being initialized.  NOT THREAD SAFE.
+    static void AddFilterToInventory(const FilterPolicy * Filter)
+    {
+        if (NULL!=Filter)
+        {
+            Filter->SetNext(ListHead);
+            ListHead=Filter;
+        }   // if
+        return;
+    }
+};  // class FilterInventory
+
+}   // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
diff --git a/src/leveldb/include/leveldb/iterator.h b/src/leveldb/include/leveldb/iterator.h
index da631ed9d..71d201f62 100644
--- a/src/leveldb/include/leveldb/iterator.h
+++ b/src/leveldb/include/leveldb/iterator.h
@@ -17,6 +17,7 @@
 
 #include "leveldb/slice.h"
 #include "leveldb/status.h"
+#include "leveldb/options.h"
 
 namespace leveldb {
 
@@ -37,7 +38,7 @@ class Iterator {
   // Valid() after this call iff the source is not empty.
   virtual void SeekToLast() = 0;
 
-  // Position at the first key in the source that is at or past target.
+  // Position at the first key in the source that at or past target
   // The iterator is Valid() after this call iff the source contains
   // an entry that comes at or past target.
   virtual void Seek(const Slice& target) = 0;
@@ -61,9 +62,13 @@ class Iterator {
   // Return the value for the current entry.  The underlying storage for
   // the returned slice is valid only until the next modification of
   // the iterator.
-  // REQUIRES: Valid()
+  // REQUIRES: !AtEnd() && !AtStart()
   virtual Slice value() const = 0;
 
+  // Riak specific:  if a database iterator, returns key meta data
+  // REQUIRES: Valid()
+  virtual KeyMetaData & keymetadata() const {return(keymetadata_); };
+
   // If an error has occurred, return it.  Else return an ok status.
   virtual Status status() const = 0;
 
@@ -75,6 +80,10 @@ class Iterator {
   typedef void (*CleanupFunction)(void* arg1, void* arg2);
   void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
 
+ protected:
+  // mutable so reusable by derived classes
+  mutable KeyMetaData keymetadata_;
+
  private:
   struct Cleanup {
     CleanupFunction function;
diff --git a/src/leveldb/include/leveldb/options.h b/src/leveldb/include/leveldb/options.h
index 976e38122..00efa3333 100644
--- a/src/leveldb/include/leveldb/options.h
+++ b/src/leveldb/include/leveldb/options.h
@@ -6,15 +6,23 @@
 #define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
 
 #include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <memory>
 
 namespace leveldb {
 
 class Cache;
 class Comparator;
 class Env;
+class ExpiryModule;
 class FilterPolicy;
 class Logger;
 class Snapshot;
+namespace log
+{
+    class Writer;
+}  // namespace log
 
 // DB contents are stored in a set of blocks, each of which holds a
 // sequence of key,value pairs.  Each block may be compressed before
@@ -24,9 +32,34 @@ enum CompressionType {
   // NOTE: do not change the values of existing entries, as these are
   // part of the persistent format on disk.
   kNoCompression     = 0x0,
-  kSnappyCompression = 0x1
+  kSnappyCompression = 0x1,
+  kLZ4Compression    = 0x2,
+  kNoCompressionAutomated = 0x3
 };
 
+//  Originally located in db/dbformat.h.  Now available publically.
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+enum ValueType {
+  kTypeDeletion = 0x0,
+  kTypeValue = 0x1,
+  kTypeValueWriteTime = 0x2,
+  kTypeValueExplicitExpiry = 0x3
+};
+
+//  Originally located in db/dbformat.h
+typedef uint64_t SequenceNumber;
+typedef uint64_t ExpiryTimeMicros;
+
+};  // namespace leveldb
+
+//
+// must follow ValueType declaration
+#include "leveldb/expiry.h"
+
+namespace leveldb {
+
 // Options to control the behavior of a database (passed to DB::Open)
 struct Options {
   // -------------------
@@ -56,6 +89,14 @@ struct Options {
   // Default: false
   bool paranoid_checks;
 
+  // Riak specific: this variable replaces paranoid_checks at one
+  // one place in the code.  This variable alone controls whether or not
+  // compaction read operations check CRC values.  Riak needs
+  // the compaction CRC check, but not other paranoid_checks ... so
+  // this independent control.
+  // Default: true
+  bool verify_compactions;
+
   // Use the specified object to interact with the environment,
   // e.g. to read/write files, schedule background work, etc.
   // Default: Env::Default()
@@ -85,7 +126,7 @@ struct Options {
   // Number of open files that can be used by the DB.  You may need to
   // increase this if your database has a large working set (budget
   // one open file per 2MB of working set).
-  //
+  // RIAK: NO LONGER USED
   // Default: 1000
   int max_open_files;
 
@@ -105,6 +146,15 @@ struct Options {
   // Default: 4K
   size_t block_size;
 
+  // Riak specific:  non-zero value activates code to automatically
+  // increase block_size as needed to ensure maximum number of files
+  // are available in the file cache.  The value indicates how many
+  // incremental increases to use between the original block_size
+  // and largest, reasonable block_size.
+  //
+  // Default: 16
+  int block_size_steps;
+
   // Number of keys between restart points for delta encoding of keys.
   // This parameter can be changed dynamically.  Most clients should
   // leave this parameter alone.
@@ -112,18 +162,6 @@ struct Options {
   // Default: 16
   int block_restart_interval;
 
-  // Leveldb will write up to this amount of bytes to a file before
-  // switching to a new one.
-  // Most clients should leave this parameter alone.  However if your
-  // filesystem is more efficient with larger files, you could
-  // consider increasing the value.  The downside will be longer
-  // compactions and hence longer latency/performance hiccups.
-  // Another reason to increase this parameter might be when you are
-  // initially populating a large database.
-  //
-  // Default: 2MB
-  size_t max_file_size;
-
   // Compress blocks using the specified compression algorithm.  This
   // parameter can be changed dynamically.
   //
@@ -140,12 +178,6 @@ struct Options {
   // efficiently detect that and will switch to uncompressed mode.
   CompressionType compression;
 
-  // EXPERIMENTAL: If true, append to existing MANIFEST and log files
-  // when a database is opened.  This can significantly speed up open.
-  //
-  // Default: currently false, but may become true later.
-  bool reuse_logs;
-
   // If non-NULL, use the specified filter policy to reduce disk reads.
   // Many applications will benefit from passing the result of
   // NewBloomFilterPolicy() here.
@@ -153,8 +185,84 @@ struct Options {
   // Default: NULL
   const FilterPolicy* filter_policy;
 
+  // Riak specific flag used to indicate when database is open
+  // as part of a Repair operation.  Default is false
+  bool is_repair;
+
+  // Riak specific flag to mark Riak internal database versus
+  //  user database.  (User database gets larger cache resources.)
+  bool is_internal_db;
+
+  // Riak replacement for max_open_files and block_cache.  This is
+  //  TOTAL memory to be used by leveldb across ALL DATABASES.
+  //  Most recent value seen upon database open, wins.  Zero for default.
+  uint64_t total_leveldb_mem;
+
+  // Riak specific option specifying block cache space that cannot
+  //  be released for page cache use.  The space may still be
+  //  released for file cache.
+  uint64_t block_cache_threshold;
+
+  // Riak option to override most memory modeling and create
+  //  smaller memory footprint for developers.  Helps when
+  //  running large number of databases and multiple VMs. Do
+  //  NOT use this option if making performance measurements.
+  // Default: false
+  bool limited_developer_mem;
+
+  // The size of each MMAped file, choose 0 for the default (20M)
+  uint64_t mmap_size;
+
+  // Riak option to adjust aggressive delete behavior.
+  //  - zero disables aggressive delete
+  //  - positive value indicates how many deletes must exist
+  //     in a file for it to be compacted due to deletes
+  uint64_t delete_threshold;
+
+  // Riak specific flag used to indicate when fadvise() management
+  // should default to WILLNEED instead of DONTNEED.  Default is false
+  bool fadvise_willneed;
+
+  // *****
+  // Riak specific options for establishing two tiers of disk arrays.
+  // All three tier options must be valid for the option to activate.
+  // When active, leveldb directories are constructed using either
+  // the fast or slow prefix followed by the database name given
+  // in the DB::Open call.  (a synonym for "prefix" is "mount")
+  // *****
+
+  // Riak specific option setting the level number at which the
+  // "tiered_slow_prefix" should be used.  Default is zero which
+  // disables the option.  Valid values are 1 to 6.  3 or 4 recommended.
+  unsigned tiered_slow_level;
+
+  // Riak specific option with the path prefix used for "fast" disk
+  // array.  levels 0 to tiered_slow_level-1 use this path prefix
+  std::string tiered_fast_prefix;
+
+  // Riak specific option with the path prefix used for "slow" disk
+  // array.  levels tiered_slow_level through 6 use this path prefix
+  std::string tiered_slow_prefix;
+
+  // Riak specific option that writes a list of open table files
+  // to disk on close then automatically opens same files again
+  // upon restart.
+  bool cache_object_warming;
+
+  // Riak specific object that defines expiry policy for data
+  // written to leveldb.
+  ExpiryPtr_t expiry_module;
+
   // Create an Options object with default values for all fields.
   Options();
+
+  void Dump(Logger * log) const;
+
+  bool ExpiryActivated() const
+        {return(NULL!=expiry_module.get() && expiry_module->ExpiryActivated());};
+
+private:
+
 };
 
 // Options that control read operations
@@ -171,16 +279,57 @@ struct ReadOptions {
 
   // If "snapshot" is non-NULL, read as of the supplied snapshot
   // (which must belong to the DB that is being read and which must
-  // not have been released).  If "snapshot" is NULL, use an implicit
+  // not have been released).  If "snapshot" is NULL, use an impliicit
   // snapshot of the state at the beginning of this read operation.
   // Default: NULL
   const Snapshot* snapshot;
 
+  // Riak specific flag, currently used within Erlang adaptor
+  //  to enable automatic delete and new of fresh snapshot
+  //  and database iterator objects for long running iterations
+  //  (only supports iterator NEXT operations).
+  // Default: false
+  bool iterator_refresh;
+
   ReadOptions()
-      : verify_checksums(false),
-        fill_cache(true),
-        snapshot(NULL) {
+  : verify_checksums(true),
+      fill_cache(true),
+      snapshot(NULL),
+      iterator_refresh(false),
+      is_compaction(false),
+      env(NULL),
+      info_log(NULL)
+  {
   }
+
+
+  // accessors to the private data
+  bool IsCompaction() const {return(is_compaction);};
+
+  Logger * GetInfoLog() const {return(info_log);};
+
+  const std::string & GetDBName() const {return(dbname);};
+
+  Env * GetEnv() const {return(env);};
+
+  // The items below are internal options, not for external manipulation.
+  //  They are populated by VersionSet::MakeInputIterator only during compaction operations
+private:
+  friend class VersionSet;
+
+  // true when used on background compaction
+  bool is_compaction;
+
+  // Database name for potential creation of bad blocks file
+  std::string dbname;
+
+  // Needed for file operations if creating bad blocks file
+  Env * env;
+
+  // Open log file for error notifications
+  // Only valid when is_compation==true
+  Logger* info_log;
+
 };
 
 // Options that control write operations
@@ -208,6 +357,22 @@ struct WriteOptions {
   }
 };
 
+
+// Riak specific object that can return key metadata
+//  during get or iterate operation
+struct KeyMetaData
+{
+    ValueType m_Type;          // see above
+    SequenceNumber m_Sequence; // output only, leveldb internal
+    ExpiryTimeMicros m_Expiry; // microseconds since Epoch, UTC
+
+    KeyMetaData()
+    : m_Type(kTypeValue), m_Sequence(0), m_Expiry(0)
+    {};
+};  // struct KeyMetaData
+
+const char * CompileOptionsString();
+
 }  // namespace leveldb
 
 #endif  // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
diff --git a/src/leveldb/include/leveldb/perf_count.h b/src/leveldb/include/leveldb/perf_count.h
new file mode 100644
index 000000000..2f957f4fe
--- /dev/null
+++ b/src/leveldb/include/leveldb/perf_count.h
@@ -0,0 +1,329 @@
+// -------------------------------------------------------------------
+//
+// perf_count.h:  performance counters LevelDB
+//
+// Copyright (c) 2012-2016 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
+#define STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
+
+#include <stdint.h>
+#include <string>
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+enum SstCountEnum
+{
+    //
+    // array index values/names
+    //
+    eSstCountKeys=0,           //!< how many keys in this sst
+    eSstCountBlocks=1,         //!< how many blocks in this sst
+    eSstCountCompressAborted=2,//!< how many blocks attempted compression and aborted use
+    eSstCountKeySize=3,        //!< byte count of all keys
+    eSstCountValueSize=4,      //!< byte count of all values
+    eSstCountBlockSize=5,      //!< byte count of all blocks (pre-compression)
+    eSstCountBlockWriteSize=6, //!< post-compression size, or BlockSize if no compression
+    eSstCountIndexKeys=7,      //!< how many keys in the index block
+    eSstCountKeyLargest=8,     //!< largest key in sst
+    eSstCountKeySmallest=9,    //!< smallest key in sst
+    eSstCountValueLargest=10,  //!< largest value in sst
+    eSstCountValueSmallest=11, //!< smallest value in sst
+    eSstCountDeleteKey=12,     //!< tombstone count
+    eSstCountBlockSizeUsed=13, //!< Options::block_size used with this file
+    eSstCountUserDataSize=14,  //!< post-compression size of non-metadata (user keys/values/block overhead)
+    eSstCountExpiry1=15,       //!< undocumented expiry counter 1
+    eSstCountExpiry2=16,       //!< undocumented expiry counter 2
+    eSstCountExpiry3=17,       //!< undocumented expiry counter 3
+    eSstCountSequence=18,      //!< highest sequence number in file
+
+    // must follow last index name to represent size of array
+    eSstCountEnumSize,          //!< size of the array described by the enum values
+
+    eSstCountVersion=1
+
+};  // enum SstCountEnum
+
+
+class SstCounters
+{
+protected:
+    bool m_IsReadOnly;         //!< set when data decoded from a file
+    uint32_t m_Version;        //!< object revision identification
+    uint32_t m_CounterSize;    //!< number of objects in m_Counter
+
+    uint64_t m_Counter[eSstCountEnumSize];
+
+public:
+    // constructors / destructor
+    SstCounters();
+
+    // Put data into disk form
+    void EncodeTo(std::string & Dst) const;
+
+    // Populate member data from prior EncodeTo block
+    Status DecodeFrom(const Slice& src);
+
+    // increment the counter
+    uint64_t Inc(unsigned Index);
+
+    // add value to the counter
+    uint64_t Add(unsigned Index, uint64_t Amount);
+
+    // return value of a counter
+    uint64_t Value(unsigned Index) const;
+
+    // set a value
+    void Set(unsigned Index, uint64_t);
+
+    // return number of counters
+    uint32_t Size() const {return(m_CounterSize);};
+
+    // printf all values
+    void Dump() const;
+
+};  // class SstCounters
+
+
+extern struct PerformanceCounters * gPerfCounters;
+
+
+enum PerformanceCountersEnum
+{
+    //
+    // array index values/names
+    //  (enum explicitly numbered to allow future edits / moves / inserts)
+    //
+    ePerfROFileOpen=0,      //!< PosixMmapReadableFile open
+    ePerfROFileClose=1,     //!<  closed
+    ePerfROFileUnmap=2,     //!<  unmap without close
+
+    ePerfRWFileOpen=3,      //!< PosixMmapFile open
+    ePerfRWFileClose=4,     //!<  closed
+    ePerfRWFileUnmap=5,     //!<  unmap without close
+
+    ePerfApiOpen=6,         //!< Count of DB::Open completions
+    ePerfApiGet=7,          //!< Count of DBImpl::Get completions
+    ePerfApiWrite=8,        //!< Count of DBImpl::Get completions
+
+    ePerfWriteSleep=9,      //!< DBImpl::MakeRoomForWrite called sleep
+    ePerfWriteWaitImm=10,   //!< DBImpl::MakeRoomForWrite called Wait on Imm compact
+    ePerfWriteWaitLevel0=11,//!< DBImpl::MakeRoomForWrite called Wait on Level0 compact
+    ePerfWriteNewMem=12,    //!< DBImpl::MakeRoomForWrite created new memory log
+    ePerfWriteError=13,     //!< DBImpl::MakeRoomForWrite saw bg_error_
+    ePerfWriteNoWait=14,    //!< DBImpl::MakeRoomForWrite took no action
+
+    ePerfGetMem=15,         //!< DBImpl::Get read from memory log
+    ePerfGetImm=16,         //!< DBImpl::Get read from previous memory log
+    ePerfGetVersion=17,     //!< DBImpl::Get read from Version object
+
+    // code ASSUMES the levels are in numerical order,
+    //  i.e. based off of ePerfSearchLevel0
+    ePerfSearchLevel0=18,   //!< Version::Get read searched one or more files here
+    ePerfSearchLevel1=19,   //!< Version::Get read searched one or more files here
+    ePerfSearchLevel2=20,   //!< Version::Get read searched one or more files here
+    ePerfSearchLevel3=21,   //!< Version::Get read searched one or more files here
+    ePerfSearchLevel4=22,   //!< Version::Get read searched one or more files here
+    ePerfSearchLevel5=23,   //!< Version::Get read searched one or more files here
+    ePerfSearchLevel6=24,   //!< Version::Get read searched one or more files here
+
+    ePerfTableCached=25,    //!< TableCache::FindTable found table in cache
+    ePerfTableOpened=26,    //!< TableCache::FindTable had to open table file
+    ePerfTableGet=27,       //!< TableCache::Get used to retrieve a key
+
+    ePerfBGCloseUnmap=28,   //!< PosixEnv::BGThreaed started Unmap/Close job
+    ePerfBGCompactImm=29,   //!< PosixEnv::BGThreaed started compaction of Imm
+    ePerfBGNormal=30,       //!< PosixEnv::BGThreaed started normal compaction job
+    ePerfBGCompactLevel0=31,//!< PosixEnv::BGThreaed started compaction of Level0
+
+    ePerfBlockFiltered=32,  //!< Table::BlockReader search stopped due to filter
+    ePerfBlockFilterFalse=33,//!< Table::BlockReader gave a false positive for match
+    ePerfBlockCached=34,    //!< Table::BlockReader found block in cache
+    ePerfBlockRead=35,      //!< Table::BlockReader read block from disk
+    ePerfBlockFilterRead=36,//!< Table::ReadMeta filter loaded from file
+    ePerfBlockValidGet=37,  //!< Table::InternalGet has valid iterator
+
+    ePerfDebug0=38,         //!< Developer debug counters, moveable
+    ePerfDebug1=39,         //!< Developer debug counters, moveable
+    ePerfDebug2=40,         //!< Developer debug counters, moveable
+    ePerfDebug3=41,         //!< Developer debug counters, moveable
+    ePerfDebug4=42,         //!< Developer debug counters, moveable
+
+    ePerfReadBlockError=43, //!< crc or compression error in ReadBlock (format.cc)
+
+    ePerfIterNew=44,        //!< Count of DBImpl::NewDBIterator calls
+    ePerfIterNext=45,       //!< Count of DBIter::Next calls
+    ePerfIterPrev=46,       //!< Count of DBIter::Prev calls
+    ePerfIterSeek=47,       //!< Count of DBIter::Seek calls
+    ePerfIterSeekFirst=48,  //!< Count of DBIter::SeekFirst calls
+    ePerfIterSeekLast=49,   //!< Count of DBIter::SeekLast calls
+    ePerfIterDelete=50,     //!< Count of DBIter::~DBIter
+
+    ePerfElevelDirect=51,   //!< eleveldb's FindWaitingThread went direct to thread
+    ePerfElevelQueued=52,   //!< eleveldb's FindWaitingThread queued work item
+    ePerfElevelDequeued=53, //!< eleveldb's worker took item from backlog queue
+
+    ePerfElevelRefCreate=54,//!< eleveldb RefObject constructed
+    ePerfElevelRefDelete=55,//!< eleveldb RefObject destructed
+
+    ePerfThrottleGauge=56,  //!< current throttle value
+    ePerfThrottleCounter=57,//!< running throttle by seconds
+
+    ePerfThrottleMicros0=58,//!< level 0 micros spent compacting
+    ePerfThrottleKeys0=59,  //!< level 0 keys processed
+    ePerfThrottleBacklog0=60,//!< backlog at time of posting (level0)
+    ePerfThrottleCompacts0=61,//!< number of level 0 compactions
+
+    ePerfThrottleMicros1=62,//!< level 1+ micros spent compacting
+    ePerfThrottleKeys1=63,  //!< level 1+ keys processed
+    ePerfThrottleBacklog1=64,//!< backlog at time of posting (level1+)
+    ePerfThrottleCompacts1=65,//!< number of level 1+ compactions
+
+    ePerfBGWriteError=66,   //!< error in write/close, see syslog
+
+    ePerfThrottleWait=67,   //!< milliseconds of throttle wait
+    ePerfThreadError=68,    //!< system error on thread related call, no LOG access
+
+    ePerfBGImmDirect=69,    //!< count Imm compactions happened directly
+    ePerfBGImmQueued=70,    //!< count Imm compactions placed on queue
+    ePerfBGImmDequeued=71,  //!< count Imm compactions removed from queue
+    ePerfBGImmWeighted=72,  //!< total microseconds item spent on queue
+
+    ePerfBGUnmapDirect=73,  //!< count Unmap operations happened directly
+    ePerfBGUnmapQueued=74,  //!< count Unmap operations placed on queue
+    ePerfBGUnmapDequeued=75,//!< count Unmap operations removed from queue
+    ePerfBGUnmapWeighted=76,//!< total microseconds item spent on queue
+
+    ePerfBGLevel0Direct=77,  //!< count Level0 compactions happened directly
+    ePerfBGLevel0Queued=78,  //!< count Level0 compactions placed on queue
+    ePerfBGLevel0Dequeued=79,//!< count Level0 compactions removed from queue
+    ePerfBGLevel0Weighted=80,//!< total microseconds item spent on queue
+
+    ePerfBGCompactDirect=81,  //!< count generic compactions happened directly
+    ePerfBGCompactQueued=82,  //!< count generic compactions placed on queue
+    ePerfBGCompactDequeued=83,//!< count generic compactions removed from queue
+    ePerfBGCompactWeighted=84,//!< total microseconds item spent on queue
+
+    ePerfFileCacheInsert=85,  //!< total bytes inserted into file cache
+    ePerfFileCacheRemove=86,  //!< total bytes removed from file cache
+
+    ePerfBlockCacheInsert=87, //!< total bytes inserted into block cache
+    ePerfBlockCacheRemove=88, //!< total bytes removed from block cache
+
+    ePerfApiDelete=89,        //!< Count of DB::Delete
+
+    ePerfBGMove=90,           //!< compaction was a successful move
+    ePerfBGMoveFail=91,       //!< compaction move failed, regular compaction attempted
+
+    ePerfThrottleUnadjusted=92,//!< current unadjusted throttle gauge
+
+    // this one was added to the other ePerfElevelXxx counters above when we backported HotThreadPool to eleveldb
+    ePerfElevelWeighted=93,   //!< total microseconds item spent on queue
+
+    ePerfExpiredKeys=94,      //!< key physically removed because it expired
+    ePerfExpiredFiles=95,     //!< entire file removed because all keys expired
+
+    ePerfSyslogWrite=96,      //!< logged message to syslog
+    ePerfBackupStarted=97,    //!< hot backup initiated
+    ePerfBackupError=98,      //!< hot backup had an error
+
+    ePerfPropCacheHit=99,     //!< property cache had data
+    ePerfPropCacheMiss=100,   //!< property cache had to look up data
+    ePerfPropCacheError=101,  //!< no property cache entry built/located
+
+    // must follow last index name to represent size of array
+    //  (ASSUMES previous enum is highest value)
+    ePerfCountEnumSize,     //!< size of the array described by the enum values
+
+    ePerfVersion=1,         //!< structure versioning
+    ePerfKey=41207          //!< random number as shared memory identifier
+};
+
+
+struct PerfCounterAttributes
+{
+    const char * m_PerfCounterName;  //!< text description
+    const bool m_PerfDiscretionary;  //!< true if ok to disable
+};  // PerfCounterAttributes
+
+
+//
+// Do NOT use virtual functions.  This structure will be aligned at different
+//  locations in multiple processes.  Things can get messy with virtuals.
+
+struct PerformanceCounters
+{
+public:
+    static int m_LastError;
+
+protected:
+    uint32_t m_Version;        //!< object revision identification
+    uint32_t m_CounterSize;    //!< number of objects in m_Counter
+
+    volatile uint64_t m_Counter[ePerfCountEnumSize];
+
+    static const PerfCounterAttributes m_PerfCounterAttr[];
+    static int m_PerfSharedId;
+    static volatile uint64_t m_BogusCounter;  //!< for out of range GetPtr calls
+
+public:
+    // only called for local object, not for shared memory
+    PerformanceCounters();
+
+    //!< does executable's idea of version match shared object?
+    bool VersionTest()
+        {return(ePerfCountEnumSize<=m_CounterSize && ePerfVersion==m_Version);};
+
+    //!< mostly for perf_count_test.cc
+    void SetVersion(uint32_t Version, uint32_t CounterSize)
+    {m_Version=Version; m_CounterSize=CounterSize;};
+
+    static PerformanceCounters * Init(bool IsReadOnly);
+    static int Close(PerformanceCounters * Counts);
+
+    uint64_t Inc(unsigned Index);
+    uint64_t Dec(unsigned Index);
+
+    // add value to the counter
+    uint64_t Add(unsigned Index, uint64_t Amount);
+
+    // return value of a counter
+    uint64_t Value(unsigned Index) const;
+
+    // set a value
+    void Set(unsigned Index, uint64_t);
+
+    volatile const uint64_t * GetPtr(unsigned Index) const;
+
+    static const char * GetNamePtr(unsigned Index);
+
+    int LookupCounter(const char * Name);
+
+    void Dump();
+
+};  // struct PerformanceCounters
+
+extern PerformanceCounters * gPerfCounters;
+
+extern volatile bool gPerfCountersDisabled;
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
diff --git a/src/leveldb/include/leveldb/slice.h b/src/leveldb/include/leveldb/slice.h
index bc367986f..74ea8fa49 100644
--- a/src/leveldb/include/leveldb/slice.h
+++ b/src/leveldb/include/leveldb/slice.h
@@ -94,7 +94,7 @@ inline bool operator!=(const Slice& x, const Slice& y) {
 }
 
 inline int Slice::compare(const Slice& b) const {
-  const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
+  const int min_len = (size_ < b.size_) ? size_ : b.size_;
   int r = memcmp(data_, b.data_, min_len);
   if (r == 0) {
     if (size_ < b.size_) r = -1;
diff --git a/src/leveldb/include/leveldb/status.h b/src/leveldb/include/leveldb/status.h
index d9575f975..11dbd4b47 100644
--- a/src/leveldb/include/leveldb/status.h
+++ b/src/leveldb/include/leveldb/status.h
@@ -60,12 +60,6 @@ class Status {
   // Returns true iff the status indicates an IOError.
   bool IsIOError() const { return code() == kIOError; }
 
-  // Returns true iff the status indicates a NotSupportedError.
-  bool IsNotSupportedError() const { return code() == kNotSupported; }
-
-  // Returns true iff the status indicates an InvalidArgument.
-  bool IsInvalidArgument() const { return code() == kInvalidArgument; }
-
   // Return a string representation of this status suitable for printing.
   // Returns the string "OK" for success.
   std::string ToString() const;
diff --git a/src/leveldb/include/leveldb/table.h b/src/leveldb/include/leveldb/table.h
index a9746c3f5..96e8e81d9 100644
--- a/src/leveldb/include/leveldb/table.h
+++ b/src/leveldb/include/leveldb/table.h
@@ -7,6 +7,7 @@
 
 #include <stdint.h>
 #include "leveldb/iterator.h"
+#include "leveldb/perf_count.h"
 
 namespace leveldb {
 
@@ -40,7 +41,7 @@ class Table {
                      uint64_t file_size,
                      Table** table);
 
-  ~Table();
+  virtual ~Table();
 
   // Returns a new iterator over the table contents.
   // The result of NewIterator() is initially invalid (caller must
@@ -55,7 +56,29 @@ class Table {
   // be close to the file length.
   uint64_t ApproximateOffsetOf(const Slice& key) const;
 
- private:
+  // return a static copy of the table's counters.
+  SstCounters GetSstCounters() const;
+
+  // riak routine to retrieve total memory footprint of an open table
+  //  object in memory
+  size_t TableObjectSize();
+
+  // riak routine to retrieve disk size of table file
+  //  ("virtual" is for unit test activites)
+  virtual uint64_t GetFileSize();
+
+  // Riak routine to request bloom filter load on
+  //  second read operation (not iterator read)
+  bool ReadFilter();
+
+  // access routines for testing tools, not for public use
+  Block * TEST_GetIndexBlock();
+  size_t TEST_TableObjectSize() {return(TableObjectSize());};
+  size_t TEST_FilterDataSize();
+  static Iterator* TEST_BlockReader(void* Ptr, const ReadOptions& ROptions, const Slice& SliceReturn)
+    {return(BlockReader(Ptr, ROptions, SliceReturn));};
+
+ protected:  // was private, made protected for unit tests
   struct Rep;
   Rep* rep_;
 
@@ -69,11 +92,12 @@ class Table {
   Status InternalGet(
       const ReadOptions&, const Slice& key,
       void* arg,
-      void (*handle_result)(void* arg, const Slice& k, const Slice& v));
+      bool (*handle_result)(void* arg, const Slice& k, const Slice& v));
 
 
   void ReadMeta(const Footer& footer);
-  void ReadFilter(const Slice& filter_handle_value);
+  void ReadFilter(class BlockHandle & filter_handle_value, const class FilterPolicy * policy);
+  void ReadSstCounters(const Slice& sst_counters_handle_value);
 
   // No copying allowed
   Table(const Table&);
diff --git a/src/leveldb/include/leveldb/table_builder.h b/src/leveldb/include/leveldb/table_builder.h
index 5fd1dc71f..cbe741f59 100644
--- a/src/leveldb/include/leveldb/table_builder.h
+++ b/src/leveldb/include/leveldb/table_builder.h
@@ -74,6 +74,14 @@ class TableBuilder {
   // Finish() call, returns the size of the final generated file.
   uint64_t FileSize() const;
 
+  // Number of delete tombstones so far.
+  uint64_t NumDeletes() const;
+
+  // Retrieve expiry control values
+  uint64_t GetExpiryWriteLow() const;
+  uint64_t GetExpiryWriteHigh() const;
+  uint64_t GetExpiryExplicitHigh() const;
+
  private:
   bool ok() const { return status().ok(); }
   void WriteBlock(BlockBuilder* block, BlockHandle* handle);
diff --git a/src/leveldb/include/leveldb/write_batch.h b/src/leveldb/include/leveldb/write_batch.h
index ee9aab68e..bd887fd62 100644
--- a/src/leveldb/include/leveldb/write_batch.h
+++ b/src/leveldb/include/leveldb/write_batch.h
@@ -23,6 +23,7 @@
 
 #include <string>
 #include "leveldb/status.h"
+#include "leveldb/options.h"
 
 namespace leveldb {
 
@@ -34,7 +35,7 @@ class WriteBatch {
   ~WriteBatch();
 
   // Store the mapping "key->value" in the database.
-  void Put(const Slice& key, const Slice& value);
+  void Put(const Slice& key, const Slice& value, const KeyMetaData * meta=NULL);
 
   // If the database contains a mapping for "key", erase it.  Else do nothing.
   void Delete(const Slice& key);
@@ -46,7 +47,8 @@ class WriteBatch {
   class Handler {
    public:
     virtual ~Handler();
-    virtual void Put(const Slice& key, const Slice& value) = 0;
+    virtual void Put(const Slice& key, const Slice& value,
+                     const ValueType & type, const ExpiryTimeMicros & expiry) = 0;
     virtual void Delete(const Slice& key) = 0;
   };
   Status Iterate(Handler* handler) const;
diff --git a/src/leveldb/issues/issue178_test.cc b/src/leveldb/issues/issue178_test.cc
deleted file mode 100644
index 1b1cf8bb2..000000000
--- a/src/leveldb/issues/issue178_test.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-// Test for issue 178: a manual compaction causes deleted data to reappear.
-#include <iostream>
-#include <sstream>
-#include <cstdlib>
-
-#include "leveldb/db.h"
-#include "leveldb/write_batch.h"
-#include "util/testharness.h"
-
-namespace {
-
-const int kNumKeys = 1100000;
-
-std::string Key1(int i) {
-  char buf[100];
-  snprintf(buf, sizeof(buf), "my_key_%d", i);
-  return buf;
-}
-
-std::string Key2(int i) {
-  return Key1(i) + "_xxx";
-}
-
-class Issue178 { };
-
-TEST(Issue178, Test) {
-  // Get rid of any state from an old run.
-  std::string dbpath = leveldb::test::TmpDir() + "/leveldb_cbug_test";
-  DestroyDB(dbpath, leveldb::Options());
-
-  // Open database.  Disable compression since it affects the creation
-  // of layers and the code below is trying to test against a very
-  // specific scenario.
-  leveldb::DB* db;
-  leveldb::Options db_options;
-  db_options.create_if_missing = true;
-  db_options.compression = leveldb::kNoCompression;
-  ASSERT_OK(leveldb::DB::Open(db_options, dbpath, &db));
-
-  // create first key range
-  leveldb::WriteBatch batch;
-  for (size_t i = 0; i < kNumKeys; i++) {
-    batch.Put(Key1(i), "value for range 1 key");
-  }
-  ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));
-
-  // create second key range
-  batch.Clear();
-  for (size_t i = 0; i < kNumKeys; i++) {
-    batch.Put(Key2(i), "value for range 2 key");
-  }
-  ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));
-
-  // delete second key range
-  batch.Clear();
-  for (size_t i = 0; i < kNumKeys; i++) {
-    batch.Delete(Key2(i));
-  }
-  ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));
-
-  // compact database
-  std::string start_key = Key1(0);
-  std::string end_key = Key1(kNumKeys - 1);
-  leveldb::Slice least(start_key.data(), start_key.size());
-  leveldb::Slice greatest(end_key.data(), end_key.size());
-
-  // commenting out the line below causes the example to work correctly
-  db->CompactRange(&least, &greatest);
-
-  // count the keys
-  leveldb::Iterator* iter = db->NewIterator(leveldb::ReadOptions());
-  size_t num_keys = 0;
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    num_keys++;
-  }
-  delete iter;
-  ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";
-
-  // close database
-  delete db;
-  DestroyDB(dbpath, leveldb::Options());
-}
-
-}  // anonymous namespace
-
-int main(int argc, char** argv) {
-  return leveldb::test::RunAllTests();
-}
diff --git a/src/leveldb/issues/issue200_test.cc b/src/leveldb/issues/issue200_test.cc
deleted file mode 100644
index 1cec79f44..000000000
--- a/src/leveldb/issues/issue200_test.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-// Test for issue 200: when iterator switches direction from backward
-// to forward, the current key can be yielded unexpectedly if a new
-// mutation has been added just before the current key.
-
-#include "leveldb/db.h"
-#include "util/testharness.h"
-
-namespace leveldb {
-
-class Issue200 { };
-
-TEST(Issue200, Test) {
-  // Get rid of any state from an old run.
-  std::string dbpath = test::TmpDir() + "/leveldb_issue200_test";
-  DestroyDB(dbpath, Options());
-
-  DB *db;
-  Options options;
-  options.create_if_missing = true;
-  ASSERT_OK(DB::Open(options, dbpath, &db));
-
-  WriteOptions write_options;
-  ASSERT_OK(db->Put(write_options, "1", "b"));
-  ASSERT_OK(db->Put(write_options, "2", "c"));
-  ASSERT_OK(db->Put(write_options, "3", "d"));
-  ASSERT_OK(db->Put(write_options, "4", "e"));
-  ASSERT_OK(db->Put(write_options, "5", "f"));
-
-  ReadOptions read_options;
-  Iterator *iter = db->NewIterator(read_options);
-
-  // Add an element that should not be reflected in the iterator.
-  ASSERT_OK(db->Put(write_options, "25", "cd"));
-
-  iter->Seek("5");
-  ASSERT_EQ(iter->key().ToString(), "5");
-  iter->Prev();
-  ASSERT_EQ(iter->key().ToString(), "4");
-  iter->Prev();
-  ASSERT_EQ(iter->key().ToString(), "3");
-  iter->Next();
-  ASSERT_EQ(iter->key().ToString(), "4");
-  iter->Next();
-  ASSERT_EQ(iter->key().ToString(), "5");
-
-  delete iter;
-  delete db;
-  DestroyDB(dbpath, options);
-}
-
-}  // namespace leveldb
-
-int main(int argc, char** argv) {
-  return leveldb::test::RunAllTests();
-}
diff --git a/src/leveldb/leveldb_os/compile_opt.cc b/src/leveldb/leveldb_os/compile_opt.cc
new file mode 100644
index 000000000..b311bcd43
--- /dev/null
+++ b/src/leveldb/leveldb_os/compile_opt.cc
@@ -0,0 +1,32 @@
+// -------------------------------------------------------------------
+//
+// compile_opt.h
+//
+// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include "leveldb/options.h"
+
+namespace leveldb
+{
+    const char * CompileOptionsString()
+    {
+        return("(open source)");
+    }
+}  // namespace leveldb
+
diff --git a/src/leveldb/leveldb_os/expiry_os_stub.cc b/src/leveldb/leveldb_os/expiry_os_stub.cc
new file mode 100644
index 000000000..a8463e233
--- /dev/null
+++ b/src/leveldb/leveldb_os/expiry_os_stub.cc
@@ -0,0 +1,62 @@
+// -------------------------------------------------------------------
+//
+// expiry_os_stub.cc
+//
+// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include "db/dbformat.h"
+#include "leveldb/expiry.h"
+#include "util/expiry_os.h"
+
+namespace leveldb {
+
+/**
+ * This is the factory function to create
+ *  an open source version of object expiry
+ */
+ExpiryModule *
+ExpiryModule::CreateExpiryModule(
+    EleveldbRouter_t Router)
+{
+
+    return(new leveldb::ExpiryModuleOS);
+
+}   // ExpiryModule::CreateExpiryModule()
+
+
+void
+ExpiryModule::ShutdownExpiryModule()
+{
+
+    return;
+
+}   // ExpiryModule::ShutdownExpiryModule
+
+
+uint64_t
+CuttlefishDurationMinutes(
+    const char * Buffer)
+{
+
+    // zero is safe return since it implies "disable write time expiry"
+    return(0);
+
+}   // CuttlefishDurationMinutes
+
+}  // namespace leveldb
diff --git a/src/leveldb/leveldb_os/hot_backup_stub.cc b/src/leveldb/leveldb_os/hot_backup_stub.cc
new file mode 100644
index 000000000..73190975f
--- /dev/null
+++ b/src/leveldb/leveldb_os/hot_backup_stub.cc
@@ -0,0 +1,37 @@
+// -------------------------------------------------------------------
+//
+// hot_backup_stub.cc
+//
+// Copyright (c) 2011-2016 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include "util/thread_tasks.h"
+
+namespace leveldb {
+
+/**
+ * Called by throttle.cc's thread once a minute.  Used to
+ *  test for trigger condition
+ */
+void
+CheckHotBackupTrigger()
+{
+    return;
+}   // CheckHotBackupTrigger
+
+}  // namespace leveldb
diff --git a/src/leveldb/leveldb_os/prop_cache_stub.cc b/src/leveldb/leveldb_os/prop_cache_stub.cc
new file mode 100644
index 000000000..47778977d
--- /dev/null
+++ b/src/leveldb/leveldb_os/prop_cache_stub.cc
@@ -0,0 +1,41 @@
+// -------------------------------------------------------------------
+//
+// hot_backup_stub.cc
+//
+// Copyright (c) 2011-2016 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include "util/prop_cache.h"
+
+namespace leveldb {
+
+/**
+ * Internal Lookup function that first requests property
+ *  data from Eleveldb Router, then waits for the data
+ *  to post to the cache.
+ */
+Cache::Handle *
+PropertyCache::LookupWait(
+    const Slice & CompositeBucket)
+{
+
+    return(NULL);
+
+}   // PropertyCache::LookupWait
+
+}  // namespace leveldb
diff --git a/src/leveldb/leveldb_os/warming_stub.cc b/src/leveldb/leveldb_os/warming_stub.cc
new file mode 100644
index 000000000..6db93dfc4
--- /dev/null
+++ b/src/leveldb/leveldb_os/warming_stub.cc
@@ -0,0 +1,48 @@
+// -------------------------------------------------------------------
+//
+// cache_warm.cc
+//
+// Copyright (c) 2011-2016 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include "db/table_cache.h"
+
+namespace leveldb {
+
+
+/**
+ * Riak specific routine to push list of open files to disk
+ */
+Status
+TableCache::SaveOpenFileList()
+{
+    return(Status::OK());
+}   // TableCache::SaveOpenFiles
+
+
+/**
+ * Riak specific routine to read list of previously open files
+ *  and preload them into the table cache
+ */
+Status
+TableCache::PreloadTableCache()
+{
+    return(Status::OK());
+}   // TableCache::PreloadTableCache
+
+}  // namespace leveldb
diff --git a/src/leveldb/port/atomic_pointer.h b/src/leveldb/port/atomic_pointer.h
index d79a02230..2b485c7f7 100644
--- a/src/leveldb/port/atomic_pointer.h
+++ b/src/leveldb/port/atomic_pointer.h
@@ -5,13 +5,14 @@
 // AtomicPointer provides storage for a lock-free pointer.
 // Platform-dependent implementation of AtomicPointer:
 // - If the platform provides a cheap barrier, we use it with raw pointers
-// - If <atomic> is present (on newer versions of gcc, it is), we use
-//   a <atomic>-based AtomicPointer.  However we prefer the memory
+// - If cstdatomic is present (on newer versions of gcc, it is), we use
+//   a cstdatomic-based AtomicPointer.  However we prefer the memory
 //   barrier based version, because at least on a gcc 4.4 32-bit build
-//   on linux, we have encountered a buggy <atomic> implementation.
-//   Also, some <atomic> implementations are much slower than a memory-barrier
-//   based implementation (~16ns for <atomic> based acquire-load vs. ~1ns for
-//   a barrier based acquire-load).
+//   on linux, we have encountered a buggy <cstdatomic>
+//   implementation.  Also, some <cstdatomic> implementations are much
+//   slower than a memory-barrier based implementation (~16ns for
+//   <cstdatomic> based acquire-load vs. ~1ns for a barrier based
+//   acquire-load).
 // This code is based on atomicops-internals-* in Google's perftools:
 // http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase
 
@@ -19,9 +20,9 @@
 #define PORT_ATOMIC_POINTER_H_
 
 #include <stdint.h>
-#ifdef LEVELDB_ATOMIC_PRESENT
-#include <atomic>
-#endif
+//#ifdef LEVELDB_CSTDATOMIC_PRESENT
+//#include <cstdatomic>              ... moved below
+//#endif
 #ifdef OS_WIN
 #include <windows.h>
 #endif
@@ -35,41 +36,11 @@
 #define ARCH_CPU_X86_FAMILY 1
 #elif defined(__ARMEL__)
 #define ARCH_CPU_ARM_FAMILY 1
-#elif defined(__aarch64__)
-#define ARCH_CPU_ARM64_FAMILY 1
-#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
-#define ARCH_CPU_PPC_FAMILY 1
-#elif defined(__mips__)
-#define ARCH_CPU_MIPS_FAMILY 1
 #endif
 
 namespace leveldb {
 namespace port {
 
-// AtomicPointer based on <cstdatomic> if available
-#if defined(LEVELDB_ATOMIC_PRESENT)
-class AtomicPointer {
- private:
-  std::atomic<void*> rep_;
- public:
-  AtomicPointer() { }
-  explicit AtomicPointer(void* v) : rep_(v) { }
-  inline void* Acquire_Load() const {
-    return rep_.load(std::memory_order_acquire);
-  }
-  inline void Release_Store(void* v) {
-    rep_.store(v, std::memory_order_release);
-  }
-  inline void* NoBarrier_Load() const {
-    return rep_.load(std::memory_order_relaxed);
-  }
-  inline void NoBarrier_Store(void* v) {
-    rep_.store(v, std::memory_order_relaxed);
-  }
-};
-
-#else
-
 // Define MemoryBarrier() if available
 // Windows on x86
 #if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY)
@@ -77,13 +48,6 @@ class AtomicPointer {
 // http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx
 #define LEVELDB_HAVE_MEMORY_BARRIER
 
-// Mac OS
-#elif defined(OS_MACOSX)
-inline void MemoryBarrier() {
-  OSMemoryBarrier();
-}
-#define LEVELDB_HAVE_MEMORY_BARRIER
-
 // Gcc on x86
 #elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__)
 inline void MemoryBarrier() {
@@ -102,6 +66,13 @@ inline void MemoryBarrier() {
 }
 #define LEVELDB_HAVE_MEMORY_BARRIER
 
+// Mac OS
+#elif defined(OS_MACOSX)
+inline void MemoryBarrier() {
+  OSMemoryBarrier();
+}
+#define LEVELDB_HAVE_MEMORY_BARRIER
+
 // ARM Linux
 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__)
 typedef void (*LinuxKernelMemoryBarrierFunc)(void);
@@ -120,29 +91,6 @@ inline void MemoryBarrier() {
 }
 #define LEVELDB_HAVE_MEMORY_BARRIER
 
-// ARM64
-#elif defined(ARCH_CPU_ARM64_FAMILY)
-inline void MemoryBarrier() {
-  asm volatile("dmb sy" : : : "memory");
-}
-#define LEVELDB_HAVE_MEMORY_BARRIER
-
-// PPC
-#elif defined(ARCH_CPU_PPC_FAMILY) && defined(__GNUC__)
-inline void MemoryBarrier() {
-  // TODO for some powerpc expert: is there a cheaper suitable variant?
-  // Perhaps by having separate barriers for acquire and release ops.
-  asm volatile("sync" : : : "memory");
-}
-#define LEVELDB_HAVE_MEMORY_BARRIER
-
-// MIPS
-#elif defined(ARCH_CPU_MIPS_FAMILY) && defined(__GNUC__)
-inline void MemoryBarrier() {
-  __asm__ __volatile__("sync" : : : "memory");
-}
-#define LEVELDB_HAVE_MEMORY_BARRIER
-
 #endif
 
 // AtomicPointer built using platform-specific MemoryBarrier()
@@ -166,78 +114,39 @@ class AtomicPointer {
   }
 };
 
-// Atomic pointer based on sparc memory barriers
-#elif defined(__sparcv9) && defined(__GNUC__)
+// AtomicPointer based on <cstdatomic>
+#elif defined(LEVELDB_CSTDATOMIC_PRESENT)
+#include <cstdatomic>
+
 class AtomicPointer {
  private:
-  void* rep_;
+  std::atomic<void*> rep_;
  public:
   AtomicPointer() { }
   explicit AtomicPointer(void* v) : rep_(v) { }
   inline void* Acquire_Load() const {
-    void* val;
-    __asm__ __volatile__ (
-        "ldx [%[rep_]], %[val] \n\t"
-         "membar #LoadLoad|#LoadStore \n\t"
-        : [val] "=r" (val)
-        : [rep_] "r" (&rep_)
-        : "memory");
-    return val;
+    return rep_.load(std::memory_order_acquire);
   }
   inline void Release_Store(void* v) {
-    __asm__ __volatile__ (
-        "membar #LoadStore|#StoreStore \n\t"
-        "stx %[v], [%[rep_]] \n\t"
-        :
-        : [rep_] "r" (&rep_), [v] "r" (v)
-        : "memory");
+    rep_.store(v, std::memory_order_release);
+  }
+  inline void* NoBarrier_Load() const {
+    return rep_.load(std::memory_order_relaxed);
+  }
+  inline void NoBarrier_Store(void* v) {
+    rep_.store(v, std::memory_order_relaxed);
   }
-  inline void* NoBarrier_Load() const { return rep_; }
-  inline void NoBarrier_Store(void* v) { rep_ = v; }
 };
 
-// Atomic pointer based on ia64 acq/rel
-#elif defined(__ia64) && defined(__GNUC__)
-class AtomicPointer {
- private:
-  void* rep_;
- public:
-  AtomicPointer() { }
-  explicit AtomicPointer(void* v) : rep_(v) { }
-  inline void* Acquire_Load() const {
-    void* val    ;
-    __asm__ __volatile__ (
-        "ld8.acq %[val] = [%[rep_]] \n\t"
-        : [val] "=r" (val)
-        : [rep_] "r" (&rep_)
-        : "memory"
-        );
-    return val;
-  }
-  inline void Release_Store(void* v) {
-    __asm__ __volatile__ (
-        "st8.rel [%[rep_]] = %[v]  \n\t"
-        :
-        : [rep_] "r" (&rep_), [v] "r" (v)
-        : "memory"
-        );
-  }
-  inline void* NoBarrier_Load() const { return rep_; }
-  inline void NoBarrier_Store(void* v) { rep_ = v; }
-};
-
-// We have neither MemoryBarrier(), nor <atomic>
+// We have neither MemoryBarrier(), nor <cstdatomic>
 #else
 #error Please implement AtomicPointer for this platform.
 
-#endif
 #endif
 
 #undef LEVELDB_HAVE_MEMORY_BARRIER
 #undef ARCH_CPU_X86_FAMILY
 #undef ARCH_CPU_ARM_FAMILY
-#undef ARCH_CPU_ARM64_FAMILY
-#undef ARCH_CPU_PPC_FAMILY
 
 }  // namespace port
 }  // namespace leveldb
diff --git a/src/leveldb/port/port.h b/src/leveldb/port/port.h
index 4baafa8e2..d3c5d6aad 100644
--- a/src/leveldb/port/port.h
+++ b/src/leveldb/port/port.h
@@ -6,6 +6,7 @@
 #define STORAGE_LEVELDB_PORT_PORT_H_
 
 #include <string.h>
+#include "leveldb/ldb_config.h"
 
 // Include the appropriate platform specific file below.  If you are
 // porting to a new platform, see "port_example.h" for documentation
@@ -14,8 +15,6 @@
 #  include "port/port_posix.h"
 #elif defined(LEVELDB_PLATFORM_CHROMIUM)
 #  include "port/port_chromium.h"
-#elif defined(LEVELDB_PLATFORM_WINDOWS)
-#  include "port/port_win.h"
 #endif
 
 #endif  // STORAGE_LEVELDB_PORT_PORT_H_
diff --git a/src/leveldb/port/port_android.cc b/src/leveldb/port/port_android.cc
new file mode 100644
index 000000000..815abf299
--- /dev/null
+++ b/src/leveldb/port/port_android.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/port_android.h"
+
+#include <cstdlib>
+
+extern "C" {
+size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) {
+  return fread(a, b, c, d);
+}
+
+size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) {
+  return fwrite(a, b, c, d);
+}
+
+int fflush_unlocked(FILE *f) {
+  return fflush(f);
+}
+
+int fdatasync(int fd) {
+  return fsync(fd);
+}
+}
+
+namespace leveldb {
+namespace port {
+
+static void PthreadCall(const char* label, int result) {
+  if (result != 0) {
+    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+    abort();
+  }
+}
+
+Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }
+Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
+void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }
+void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }
+
+CondVar::CondVar(Mutex* mu)
+    : mu_(mu) {
+  PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
+}
+
+CondVar::~CondVar() { 
+  PthreadCall("destroy cv", pthread_cond_destroy(&cv_));
+}
+
+void CondVar::Wait() {
+  PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
+}
+
+void CondVar::Signal(){
+  PthreadCall("signal", pthread_cond_signal(&cv_));
+}
+
+void CondVar::SignalAll() {
+  PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
+}
+
+}  // namespace port
+}  // namespace leveldb
diff --git a/src/leveldb/port/port_android.h b/src/leveldb/port/port_android.h
new file mode 100644
index 000000000..b733388d8
--- /dev/null
+++ b/src/leveldb/port/port_android.h
@@ -0,0 +1,159 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
+#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
+
+#include <endian.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string>
+#include <cctype>
+
+// Collapse the plethora of ARM flavors available to an easier to manage set
+// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto
+#if defined(__ARM_ARCH_6__) || \
+    defined(__ARM_ARCH_6J__) || \
+    defined(__ARM_ARCH_6K__) || \
+    defined(__ARM_ARCH_6Z__) || \
+    defined(__ARM_ARCH_6T2__) || \
+    defined(__ARM_ARCH_6ZK__) || \
+    defined(__ARM_ARCH_7__) || \
+    defined(__ARM_ARCH_7R__) || \
+    defined(__ARM_ARCH_7A__)
+#define ARMV6_OR_7 1
+#endif
+
+extern "C" {
+  size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d);
+  size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d);
+  int fflush_unlocked(FILE *f);
+  int fdatasync (int fd);
+}
+
+namespace leveldb {
+namespace port {
+
+static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN;
+
+class CondVar;
+
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+
+  void Lock();
+  void Unlock();
+  void AssertHeld() {
+    //TODO(gabor): How can I implement this?
+  }
+
+ private:
+  friend class CondVar;
+  pthread_mutex_t mu_;
+
+  // No copying
+  Mutex(const Mutex&);
+  void operator=(const Mutex&);
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  void Signal();
+  void SignalAll();
+ private:
+  Mutex* mu_;
+  pthread_cond_t cv_;
+};
+
+#ifndef ARMV6_OR_7
+// On ARM chipsets <V6, 0xffff0fa0 is the hard coded address of a 
+// memory barrier function provided by the kernel.
+typedef void (*LinuxKernelMemoryBarrierFunc)(void);
+// TODO(user): ATTRIBUTE_WEAK is undefined, so this fails to build on
+// non-ARMV6_OR_7. We may be able to replace it with __attribute__((weak)) for
+// older ARM builds, but x86 builds will require a different memory barrier.
+LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier ATTRIBUTE_WEAK =
+    (LinuxKernelMemoryBarrierFunc) 0xffff0fa0;
+#endif
+
+// Storage for a lock-free pointer
+class AtomicPointer {
+ private:
+  void* rep_;
+
+  inline void MemoryBarrier() const {
+    // TODO(gabor): This only works on Android instruction sets >= V6
+#ifdef ARMV6_OR_7
+    __asm__ __volatile__("dmb" : : : "memory");
+#else
+    pLinuxKernelMemoryBarrier();
+#endif
+  }
+
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* v) : rep_(v) { }
+  inline void* Acquire_Load() const {
+    void* r = rep_;
+    MemoryBarrier();
+    return r;
+  }
+  inline void Release_Store(void* v) {
+    MemoryBarrier();
+    rep_ = v;
+  }
+  inline void* NoBarrier_Load() const {
+    void* r = rep_;
+    return r;
+  }
+  inline void NoBarrier_Store(void* v) {
+    rep_ = v;
+  }
+};
+
+// TODO(gabor): Implement compress
+inline bool Snappy_Compress(
+    const char* input,
+    size_t input_length,
+    std::string* output) {
+  return false;
+}
+
+// TODO(gabor): Implement uncompress
+inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                         size_t* result) {
+  return false;
+}
+
+// TODO(gabor): Implement uncompress
+inline bool Snappy_Uncompress(
+    const char* input_data,
+    size_t input_length,
+    char* output) {
+  return false;
+}
+
+inline uint64_t ThreadIdentifier() {
+  pthread_t tid = pthread_self();
+  uint64_t r = 0;
+  memcpy(&r, &tid, sizeof(r) < sizeof(tid) ? sizeof(r) : sizeof(tid));
+  return r;
+}
+
+inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
+  return false;
+}
+
+}  // namespace port
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
diff --git a/src/leveldb/port/port_example.h b/src/leveldb/port/port_example.h
index 5b1d027de..ab9e489b3 100644
--- a/src/leveldb/port/port_example.h
+++ b/src/leveldb/port/port_example.h
@@ -129,16 +129,6 @@ extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
 // The concatenation of all "data[0,n-1]" fragments is the heap profile.
 extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
 
-// Determine whether a working accelerated crc32 implementation exists
-// Returns true if AcceleratedCRC32C is safe to call
-bool HasAcceleratedCRC32C();
-
-// Extend the CRC to include the first n bytes of buf.
-//
-// Returns zero if the CRC cannot be extended using acceleration, else returns
-// the newly extended CRC value (which may also be zero).
-uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);
-
 }  // namespace port
 }  // namespace leveldb
 
diff --git a/src/leveldb/port/port_posix.cc b/src/leveldb/port/port_posix.cc
index 4b80203bd..280c29f6e 100644
--- a/src/leveldb/port/port_posix.cc
+++ b/src/leveldb/port/port_posix.cc
@@ -7,10 +7,9 @@
 #include <cstdlib>
 #include <stdio.h>
 #include <string.h>
-
-#if (defined(__x86_64__) || defined(__i386__)) && defined(__GNUC__)
-#include <cpuid.h>
-#endif
+#include <errno.h>
+#include "leveldb/env.h"
+#include "util/logging.h"
 
 namespace leveldb {
 namespace port {
@@ -18,11 +17,24 @@ namespace port {
 static void PthreadCall(const char* label, int result) {
   if (result != 0) {
     fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+    Log(NULL, "pthread %s: %s\n", label, strerror(result));
     abort();
   }
 }
 
-Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }
+Mutex::Mutex(bool recursive) {
+  if (recursive) {
+    pthread_mutexattr_t attr;
+
+    PthreadCall("init mutex attr", pthread_mutexattr_init(&attr));
+    PthreadCall("set mutex recursive", pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE));
+    PthreadCall("init recursive mutex", pthread_mutex_init(&mu_, &attr));
+    PthreadCall("destroy mutex attr", pthread_mutexattr_destroy(&attr));
+  }
+  else {
+    PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL));
+  }
+}
 
 Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
 
@@ -30,6 +42,16 @@ void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }
 
 void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }
 
+#if defined(_POSIX_SPIN_LOCKS) && 0<_POSIX_SPIN_LOCKS
+Spin::Spin() { PthreadCall("init spinlock", pthread_spin_init(&sp_, PTHREAD_PROCESS_PRIVATE)); }
+
+Spin::~Spin() { PthreadCall("destroy spinlock", pthread_spin_destroy(&sp_)); }
+
+void Spin::Lock() { PthreadCall("lock spin", pthread_spin_lock(&sp_)); }
+
+void Spin::Unlock() { PthreadCall("unlock spin", pthread_spin_unlock(&sp_)); }
+#endif
+
 CondVar::CondVar(Mutex* mu)
     : mu_(mu) {
     PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
@@ -41,6 +63,20 @@ void CondVar::Wait() {
   PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
 }
 
+bool CondVar::Wait(struct timespec* pTimespec) {
+  bool signaled = true;
+  int result = pthread_cond_timedwait(&cv_, &mu_->mu_, pTimespec);
+  if (0 != result) {
+    signaled = false;
+
+    // the only expected errno is ETIMEDOUT; anything else is a real error
+    if (ETIMEDOUT != result) {
+      PthreadCall("timed wait", result);
+    }
+  }
+  return signaled;
+}
+
 void CondVar::Signal() {
   PthreadCall("signal", pthread_cond_signal(&cv_));
 }
@@ -53,15 +89,15 @@ void InitOnce(OnceType* once, void (*initializer)()) {
   PthreadCall("once", pthread_once(once, initializer));
 }
 
-bool HasAcceleratedCRC32C() {
-#if (defined(__x86_64__) || defined(__i386__)) && defined(__GNUC__)
-  unsigned int eax, ebx, ecx = 0, edx;
-  __get_cpuid(1, &eax, &ebx, &ecx, &edx);
-  return (ecx & (1 << 20)) != 0;
-#else
-  return false;
-#endif
-}
+RWMutex::RWMutex() { PthreadCall("init mutex", pthread_rwlock_init(&mu_, NULL)); }
+
+RWMutex::~RWMutex() { PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); }
+
+void RWMutex::ReadLock() { PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); }
+
+void RWMutex::WriteLock() { PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); }
+
+void RWMutex::Unlock() { PthreadCall("unlock", pthread_rwlock_unlock(&mu_)); }
 
 }  // namespace port
 }  // namespace leveldb
diff --git a/src/leveldb/port/port_posix.h b/src/leveldb/port/port_posix.h
index d85fa5d63..4d9146289 100644
--- a/src/leveldb/port/port_posix.h
+++ b/src/leveldb/port/port_posix.h
@@ -7,6 +7,16 @@
 #ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_
 #define STORAGE_LEVELDB_PORT_PORT_POSIX_H_
 
+// to properly pull in bits/posix_opt.h on Linux
+#include <unistd.h>
+#include <assert.h>
+
+#if _POSIX_TIMERS >= 200801L
+   #include <time.h> // declares clock_gettime()
+#else
+   #include <sys/time.h> // declares gettimeofday()
+#endif
+
 #undef PLATFORM_IS_LITTLE_ENDIAN
 #if defined(OS_MACOSX)
   #include <machine/endian.h>
@@ -21,23 +31,17 @@
   #else
     #define PLATFORM_IS_LITTLE_ENDIAN false
   #endif
-#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) ||\
-      defined(OS_NETBSD) || defined(OS_DRAGONFLYBSD)
+#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) ||\
+      defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID)
   #include <sys/types.h>
   #include <sys/endian.h>
-  #define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN)
-#elif defined(OS_HPUX)
-  #define PLATFORM_IS_LITTLE_ENDIAN false
-#elif defined(OS_ANDROID)
-  // Due to a bug in the NDK x86 <sys/endian.h> definition,
-  // _BYTE_ORDER must be used instead of __BYTE_ORDER on Android.
-  // See http://code.google.com/p/android/issues/detail?id=39824
-  #include <endian.h>
-  #define PLATFORM_IS_LITTLE_ENDIAN  (_BYTE_ORDER == _LITTLE_ENDIAN)
+
+  #if !defined(PLATFORM_IS_LITTLE_ENDIAN) && defined(_BYTE_ORDER)
+    #define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN)
+  #endif
 #else
   #include <endian.h>
 #endif
-
 #include <pthread.h>
 #ifdef SNAPPY
 #include <snappy.h>
@@ -52,21 +56,28 @@
 
 #if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\
     defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\
-    defined(OS_ANDROID) || defined(OS_HPUX) || defined(CYGWIN)
+    defined(OS_ANDROID)
 // Use fread/fwrite/fflush on platforms without _unlocked variants
 #define fread_unlocked fread
 #define fwrite_unlocked fwrite
 #define fflush_unlocked fflush
 #endif
 
-#if defined(OS_FREEBSD) ||\
+#if defined(OS_MACOSX) || defined(OS_FREEBSD) ||\
     defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD)
 // Use fsync() on platforms without fdatasync()
 #define fdatasync fsync
 #endif
 
-#if defined(OS_MACOSX)
-#define fdatasync(fd) fcntl(fd, F_FULLFSYNC, 0)
+// Some compilers do not provide access to nested classes of a declared friend class
+// Defining PUBLIC_NESTED_FRIEND_ACCESS will cause those declarations to be made
+// public as a workaround.  Added by David Smith, Basho.
+#if defined(OS_MACOSX) || defined(OS_SOLARIS)
+#define USED_BY_NESTED_FRIEND(a) public: a; private:
+#define USED_BY_NESTED_FRIEND2(a,b) public: a,b; private:
+#else
+#define USED_BY_NESTED_FRIEND(a) a;
+#define USED_BY_NESTED_FRIEND2(a,b) a,b;
 #endif
 
 #if defined(OS_ANDROID) && __ANDROID_API__ < 9
@@ -85,12 +96,12 @@ class CondVar;
 
 class Mutex {
  public:
-  Mutex();
+  Mutex(bool recursive=false); // true => creates a mutex that can be locked recursively
   ~Mutex();
 
   void Lock();
   void Unlock();
-  void AssertHeld() { }
+  void AssertHeld() {assert(0!=pthread_mutex_trylock(&mu_));}
 
  private:
   friend class CondVar;
@@ -101,11 +112,40 @@ class Mutex {
   void operator=(const Mutex&);
 };
 
+
+#if defined(_POSIX_SPIN_LOCKS) && 0<_POSIX_SPIN_LOCKS
+class Spin {
+ public:
+  Spin();
+  ~Spin();
+
+  void Lock();
+  void Unlock();
+  void AssertHeld() {assert(0!=pthread_spin_trylock(&sp_));}
+
+ private:
+  friend class CondVar;
+  pthread_spinlock_t sp_;
+
+  // No copying
+  Spin(const Spin&);
+  void operator=(const Spin&);
+};
+#else
+typedef Mutex Spin;
+#endif
+
+
 class CondVar {
  public:
   explicit CondVar(Mutex* mu);
   ~CondVar();
   void Wait();
+
+  // waits on the condition variable until the specified time is reached
+  bool // true => the condition variable was signaled, else timed out
+  Wait(struct timespec* pTimespec);
+
   void Signal();
   void SignalAll();
  private:
@@ -117,6 +157,27 @@ typedef pthread_once_t OnceType;
 #define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT
 extern void InitOnce(OnceType* once, void (*initializer)());
 
+
+class RWMutex {
+ public:
+  RWMutex();
+  ~RWMutex();
+
+  void ReadLock();
+  void WriteLock();
+  void Unlock();
+  void AssertHeld() { }
+
+ private:
+  pthread_rwlock_t mu_;
+
+  // No copying
+  RWMutex(const RWMutex&);
+  void operator=(const RWMutex&);
+
+};
+
+
 inline bool Snappy_Compress(const char* input, size_t length,
                             ::std::string* output) {
 #ifdef SNAPPY
@@ -152,8 +213,45 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
   return false;
 }
 
-bool HasAcceleratedCRC32C();
-uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);
+// sets the name of the current thread
+inline void SetCurrentThreadName(const char* threadName) {
+  if (NULL == threadName) {
+    threadName = "";
+  }
+#if defined(OS_MACOSX)
+  pthread_setname_np(threadName);
+//#elif defined(OS_LINUX)
+#elif defined(__GLIBC__)
+#if  __GLIBC_PREREQ(2,12)
+  pthread_setname_np(pthread_self(), threadName);
+#endif
+#elif defined(OS_NETBSD)
+  pthread_setname_np(pthread_self(), threadName, NULL);
+#else
+  // we have some other platform(s) to support
+  //   defined(OS_FREEBSD) ... freebsd-9.2, Feb 19, 2016 not working
+  //
+  // NOTE: do not fail here since this functionality is optional
+#endif
+}
+
+// similar to Env::NowMicros except guaranteed to return "time" instead
+//  of potentially only ticks since reboot
+const uint64_t UINT64_ONE_SECOND_MICROS=1000000;
+
+inline uint64_t TimeMicros() {
+#if _POSIX_TIMERS >= 200801L
+    struct timespec ts;
+
+    // this is rumored to be faster than gettimeofday(),
+    clock_gettime(CLOCK_REALTIME, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000 + ts.tv_nsec/1000;
+#else
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#endif
+} // TimeMicros
 
 } // namespace port
 } // namespace leveldb
diff --git a/src/leveldb/port/port_posix_sse.cc b/src/leveldb/port/port_posix_sse.cc
deleted file mode 100644
index 2d49c21dd..000000000
--- a/src/leveldb/port/port_posix_sse.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright 2016 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// A portable implementation of crc32c, optimized to handle
-// four bytes at a time.
-//
-// In a separate source file to allow this accelerated CRC32C function to be
-// compiled with the appropriate compiler flags to enable x86 SSE 4.2
-// instructions.
-
-#include <stdint.h>
-#include <string.h>
-#include "port/port.h"
-
-#if defined(LEVELDB_PLATFORM_POSIX_SSE)
-
-#if defined(_MSC_VER)
-#include <intrin.h>
-#elif defined(__GNUC__) && defined(__SSE4_2__)
-#include <nmmintrin.h>
-#endif
-
-#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)
-
-namespace leveldb {
-namespace port {
-
-#if defined(LEVELDB_PLATFORM_POSIX_SSE)
-
-// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
-static inline uint32_t LE_LOAD32(const uint8_t *p) {
-  // SSE is x86 only, so ensured that |p| is always little-endian.
-  uint32_t word;
-  memcpy(&word, p, sizeof(word));
-  return word;
-}
-
-#if defined(_M_X64) || defined(__x86_64__)  // LE_LOAD64 is only used on x64.
-
-// Used to fetch a naturally-aligned 64-bit word in little endian byte-order
-static inline uint64_t LE_LOAD64(const uint8_t *p) {
-  uint64_t dword;
-  memcpy(&dword, p, sizeof(dword));
-  return dword;
-}
-
-#endif  // defined(_M_X64) || defined(__x86_64__)
-
-#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)
-
-// For further improvements see Intel publication at:
-// http://download.intel.com/design/intarch/papers/323405.pdf
-uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) {
-#if !defined(LEVELDB_PLATFORM_POSIX_SSE)
-  return 0;
-#else
-
-  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
-  const uint8_t *e = p + size;
-  uint32_t l = crc ^ 0xffffffffu;
-
-#define STEP1 do {                              \
-    l = _mm_crc32_u8(l, *p++);                  \
-} while (0)
-#define STEP4 do {                              \
-    l = _mm_crc32_u32(l, LE_LOAD32(p));         \
-    p += 4;                                     \
-} while (0)
-#define STEP8 do {                              \
-    l = _mm_crc32_u64(l, LE_LOAD64(p));         \
-    p += 8;                                     \
-} while (0)
-
-  if (size > 16) {
-    // Process unaligned bytes
-    for (unsigned int i = reinterpret_cast<uintptr_t>(p) % 8; i; --i) {
-      STEP1;
-    }
-
-    // _mm_crc32_u64 is only available on x64.
-#if defined(_M_X64) || defined(__x86_64__)
-    // Process 8 bytes at a time
-    while ((e-p) >= 8) {
-      STEP8;
-    }
-    // Process 4 bytes at a time
-    if ((e-p) >= 4) {
-      STEP4;
-    }
-#else  // !(defined(_M_X64) || defined(__x86_64__))
-    // Process 4 bytes at a time
-    while ((e-p) >= 4) {
-      STEP4;
-    }
-#endif  // defined(_M_X64) || defined(__x86_64__)
-  }
-  // Process the last few bytes
-  while (p != e) {
-    STEP1;
-  }
-#undef STEP8
-#undef STEP4
-#undef STEP1
-  return l ^ 0xffffffffu;
-#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)
-}
-
-}  // namespace port
-}  // namespace leveldb
diff --git a/src/leveldb/port/port_win.cc b/src/leveldb/port/port_win.cc
deleted file mode 100644
index 1be9e8d5b..000000000
--- a/src/leveldb/port/port_win.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-// LevelDB Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// See port_example.h for documentation for the following types/functions.
-
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-// 
-//  * Redistributions of source code must retain the above copyright
-//    notice, this list of conditions and the following disclaimer.
-//  * Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimer in the
-//    documentation and/or other materials provided with the distribution.
-//  * Neither the name of the University of California, Berkeley nor the
-//    names of its contributors may be used to endorse or promote products
-//    derived from this software without specific prior written permission.
-// 
-// THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-
-#include "port/port_win.h"
-
-#include <windows.h>
-#include <cassert>
-#include <intrin.h>
-
-namespace leveldb {
-namespace port {
-
-Mutex::Mutex() :
-    cs_(NULL) {
-  assert(!cs_);
-  cs_ = static_cast<void *>(new CRITICAL_SECTION());
-  ::InitializeCriticalSection(static_cast<CRITICAL_SECTION *>(cs_));
-  assert(cs_);
-}
-
-Mutex::~Mutex() {
-  assert(cs_);
-  ::DeleteCriticalSection(static_cast<CRITICAL_SECTION *>(cs_));
-  delete static_cast<CRITICAL_SECTION *>(cs_);
-  cs_ = NULL;
-  assert(!cs_);
-}
-
-void Mutex::Lock() {
-  assert(cs_);
-  ::EnterCriticalSection(static_cast<CRITICAL_SECTION *>(cs_));
-}
-
-void Mutex::Unlock() {
-  assert(cs_);
-  ::LeaveCriticalSection(static_cast<CRITICAL_SECTION *>(cs_));
-}
-
-void Mutex::AssertHeld() {
-  assert(cs_);
-  assert(1);
-}
-
-CondVar::CondVar(Mutex* mu) :
-    waiting_(0), 
-    mu_(mu), 
-    sem1_(::CreateSemaphore(NULL, 0, 10000, NULL)), 
-    sem2_(::CreateSemaphore(NULL, 0, 10000, NULL)) {
-  assert(mu_);
-}
-
-CondVar::~CondVar() {
-  ::CloseHandle(sem1_);
-  ::CloseHandle(sem2_);
-}
-
-void CondVar::Wait() {
-  mu_->AssertHeld();
-
-  wait_mtx_.Lock();
-  ++waiting_;
-  wait_mtx_.Unlock();
-
-  mu_->Unlock();
-
-  // initiate handshake
-  ::WaitForSingleObject(sem1_, INFINITE);
-  ::ReleaseSemaphore(sem2_, 1, NULL);
-  mu_->Lock();
-}
-
-void CondVar::Signal() {
-  wait_mtx_.Lock();
-  if (waiting_ > 0) {
-    --waiting_;
-
-    // finalize handshake
-    ::ReleaseSemaphore(sem1_, 1, NULL);
-    ::WaitForSingleObject(sem2_, INFINITE);
-  }
-  wait_mtx_.Unlock();
-}
-
-void CondVar::SignalAll() {
-  wait_mtx_.Lock();
-  ::ReleaseSemaphore(sem1_, waiting_, NULL);
-  while(waiting_ > 0) {
-    --waiting_;
-    ::WaitForSingleObject(sem2_, INFINITE);
-  }
-  wait_mtx_.Unlock();
-}
-
-AtomicPointer::AtomicPointer(void* v) {
-  Release_Store(v);
-}
-
-void InitOnce(OnceType* once, void (*initializer)()) {
-  once->InitOnce(initializer);
-}
-
-void* AtomicPointer::Acquire_Load() const {
-  void * p = NULL;
-  InterlockedExchangePointer(&p, rep_);
-  return p;
-}
-
-void AtomicPointer::Release_Store(void* v) {
-  InterlockedExchangePointer(&rep_, v);
-}
-
-void* AtomicPointer::NoBarrier_Load() const {
-  return rep_;
-}
-
-void AtomicPointer::NoBarrier_Store(void* v) {
-  rep_ = v;
-}
-
-bool HasAcceleratedCRC32C() {
-#if defined(__x86_64__) || defined(__i386__)
-  int cpu_info[4];
-  __cpuid(cpu_info, 1);
-  return (cpu_info[2] & (1 << 20)) != 0;
-#else
-  return false;
-#endif
-}
-
-}
-}
diff --git a/src/leveldb/port/port_win.h b/src/leveldb/port/port_win.h
deleted file mode 100644
index e8bf46ef2..000000000
--- a/src/leveldb/port/port_win.h
+++ /dev/null
@@ -1,177 +0,0 @@
-// LevelDB Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// See port_example.h for documentation for the following types/functions.
-
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-// 
-//  * Redistributions of source code must retain the above copyright
-//    notice, this list of conditions and the following disclaimer.
-//  * Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimer in the
-//    documentation and/or other materials provided with the distribution.
-//  * Neither the name of the University of California, Berkeley nor the
-//    names of its contributors may be used to endorse or promote products
-//    derived from this software without specific prior written permission.
-// 
-// THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-
-#ifndef STORAGE_LEVELDB_PORT_PORT_WIN_H_
-#define STORAGE_LEVELDB_PORT_PORT_WIN_H_
-
-#ifdef _MSC_VER
-#define snprintf _snprintf
-#define close _close
-#define fread_unlocked _fread_nolock
-#endif
-
-#include <string>
-#include <stdint.h>
-#ifdef SNAPPY
-#include <snappy.h>
-#endif
-
-namespace leveldb {
-namespace port {
-
-// Windows is little endian (for now :p)
-static const bool kLittleEndian = true;
-
-class CondVar;
-
-class Mutex {
- public:
-  Mutex();
-  ~Mutex();
-
-  void Lock();
-  void Unlock();
-  void AssertHeld();
-
- private:
-  friend class CondVar;
-  // critical sections are more efficient than mutexes
-  // but they are not recursive and can only be used to synchronize threads within the same process
-  // we use opaque void * to avoid including windows.h in port_win.h
-  void * cs_;
-
-  // No copying
-  Mutex(const Mutex&);
-  void operator=(const Mutex&);
-};
-
-// the Win32 API offers a dependable condition variable mechanism, but only starting with
-// Windows 2008 and Vista
-// no matter what we will implement our own condition variable with a semaphore
-// implementation as described in a paper written by Andrew D. Birrell in 2003
-class CondVar {
- public:
-  explicit CondVar(Mutex* mu);
-  ~CondVar();
-  void Wait();
-  void Signal();
-  void SignalAll();
- private:
-  Mutex* mu_;
-  
-  Mutex wait_mtx_;
-  long waiting_;
-  
-  void * sem1_;
-  void * sem2_;
-  
-  
-};
-
-class OnceType {
-public:
-//    OnceType() : init_(false) {}
-    OnceType(const OnceType &once) : init_(once.init_) {}
-    OnceType(bool f) : init_(f) {}
-    void InitOnce(void (*initializer)()) {
-        mutex_.Lock();
-        if (!init_) {
-            init_ = true;
-            initializer();
-        }
-        mutex_.Unlock();
-    }
-
-private:
-    bool init_;
-    Mutex mutex_;
-};
-
-#define LEVELDB_ONCE_INIT false
-extern void InitOnce(port::OnceType*, void (*initializer)());
-
-// Storage for a lock-free pointer
-class AtomicPointer {
- private:
-  void * rep_;
- public:
-  AtomicPointer() : rep_(NULL) { }
-  explicit AtomicPointer(void* v); 
-  void* Acquire_Load() const;
-
-  void Release_Store(void* v);
-
-  void* NoBarrier_Load() const;
-
-  void NoBarrier_Store(void* v);
-};
-
-inline bool Snappy_Compress(const char* input, size_t length,
-                            ::std::string* output) {
-#ifdef SNAPPY
-  output->resize(snappy::MaxCompressedLength(length));
-  size_t outlen;
-  snappy::RawCompress(input, length, &(*output)[0], &outlen);
-  output->resize(outlen);
-  return true;
-#endif
-
-  return false;
-}
-
-inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
-                                         size_t* result) {
-#ifdef SNAPPY
-  return snappy::GetUncompressedLength(input, length, result);
-#else
-  return false;
-#endif
-}
-
-inline bool Snappy_Uncompress(const char* input, size_t length,
-                              char* output) {
-#ifdef SNAPPY
-  return snappy::RawUncompress(input, length, output);
-#else
-  return false;
-#endif
-}
-
-inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
-  return false;
-}
-
-bool HasAcceleratedCRC32C();
-uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);
-
-}
-}
-
-#endif  // STORAGE_LEVELDB_PORT_PORT_WIN_H_
diff --git a/src/leveldb/port/thread_annotations.h b/src/leveldb/port/thread_annotations.h
index 9470ef587..6f9b6a792 100644
--- a/src/leveldb/port/thread_annotations.h
+++ b/src/leveldb/port/thread_annotations.h
@@ -2,8 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#ifndef STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H_
-#define STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H_
+#ifndef STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H
 
 // Some environments provide custom macros to aid in static thread-safety
 // analysis.  Provide empty definitions of such macros unless they are already
@@ -57,4 +56,4 @@
 #define NO_THREAD_SAFETY_ANALYSIS
 #endif
 
-#endif  // STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H_
+#endif  // STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H
diff --git a/src/leveldb/table/block.cc b/src/leveldb/table/block.cc
index 43e402c9c..c27c912e7 100644
--- a/src/leveldb/table/block.cc
+++ b/src/leveldb/table/block.cc
@@ -15,8 +15,8 @@
 
 namespace leveldb {
 
-inline uint32_t Block::NumRestarts() const {
-  assert(size_ >= sizeof(uint32_t));
+uint32_t Block::NumRestarts() const {
+  assert(size_ >= 2*sizeof(uint32_t));
   return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
 }
 
@@ -27,12 +27,11 @@ Block::Block(const BlockContents& contents)
   if (size_ < sizeof(uint32_t)) {
     size_ = 0;  // Error marker
   } else {
-    size_t max_restarts_allowed = (size_-sizeof(uint32_t)) / sizeof(uint32_t);
-    if (NumRestarts() > max_restarts_allowed) {
-      // The size is too small for NumRestarts()
+    restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
+    if (restart_offset_ > size_ - sizeof(uint32_t)) {
+      // The size is too small for NumRestarts() and therefore
+      // restart_offset_ wrapped around.
       size_ = 0;
-    } else {
-      restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
     }
   }
 }
@@ -46,7 +45,7 @@ Block::~Block() {
 // Helper routine: decode the next block entry starting at "p",
 // storing the number of shared key bytes, non_shared key bytes,
 // and the length of the value in "*shared", "*non_shared", and
-// "*value_length", respectively.  Will not dereference past "limit".
+// "*value_length", respectively.  Will not derefence past "limit".
 //
 // If any errors are detected, returns NULL.  Otherwise, returns a
 // pointer to the key delta (just past the three decoded values).
@@ -163,8 +162,8 @@ class Block::Iter : public Iterator {
   }
 
   virtual void Seek(const Slice& target) {
-    // Binary search in restart array to find the last restart point
-    // with a key < target
+    // Binary search in restart array to find the first restart point
+    // with a key >= target
     uint32_t left = 0;
     uint32_t right = num_restarts_ - 1;
     while (left < right) {
@@ -254,7 +253,7 @@ class Block::Iter : public Iterator {
 };
 
 Iterator* Block::NewIterator(const Comparator* cmp) {
-  if (size_ < sizeof(uint32_t)) {
+  if (size_ < 2*sizeof(uint32_t)) {
     return NewErrorIterator(Status::Corruption("bad block contents"));
   }
   const uint32_t num_restarts = NumRestarts();
diff --git a/src/leveldb/table/block.h b/src/leveldb/table/block.h
index 2493eb9f9..f29f08186 100644
--- a/src/leveldb/table/block.h
+++ b/src/leveldb/table/block.h
@@ -24,9 +24,10 @@ class Block {
   size_t size() const { return size_; }
   Iterator* NewIterator(const Comparator* comparator);
 
- private:
   uint32_t NumRestarts() const;
 
+ private:
+
   const char* data_;
   size_t size_;
   uint32_t restart_offset_;     // Offset in data_ of restart array
diff --git a/src/leveldb/table/block_builder.h b/src/leveldb/table/block_builder.h
index 4fbcb3397..5b545bd1a 100644
--- a/src/leveldb/table/block_builder.h
+++ b/src/leveldb/table/block_builder.h
@@ -21,7 +21,7 @@ class BlockBuilder {
   // Reset the contents as if the BlockBuilder was just constructed.
   void Reset();
 
-  // REQUIRES: Finish() has not been called since the last call to Reset().
+  // REQUIRES: Finish() has not been callled since the last call to Reset().
   // REQUIRES: key is larger than any previously added key
   void Add(const Slice& key, const Slice& value);
 
diff --git a/src/leveldb/table/filter_block.cc b/src/leveldb/table/filter_block.cc
index 1ed513417..fb171e698 100644
--- a/src/leveldb/table/filter_block.cc
+++ b/src/leveldb/table/filter_block.cc
@@ -9,22 +9,31 @@
 
 namespace leveldb {
 
-// See doc/table_format.md for an explanation of the filter block format.
+// See doc/table_format.txt for an explanation of the filter block format.
 
-// Generate new filter every 2KB of data
-static const size_t kFilterBaseLg = 11;
-static const size_t kFilterBase = 1 << kFilterBaseLg;
+// list of available filters within code base
+const FilterPolicy * FilterInventory::ListHead(NULL);
 
 FilterBlockBuilder::FilterBlockBuilder(const FilterPolicy* policy)
-    : policy_(policy) {
+    : policy_(policy), filter_base_lg_(0), filter_base_(0), last_offset_(0)
+{
 }
 
 void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
-  uint64_t filter_index = (block_offset / kFilterBase);
-  assert(filter_index >= filter_offsets_.size());
-  while (filter_index > filter_offsets_.size()) {
-    GenerateFilter();
-  }
+    if (0==filter_base_lg_ && (1500<start_.size() || 268435456<block_offset))
+        PickFilterBase(block_offset);
+
+    if (0!=filter_base_lg_)
+    {
+        uint64_t filter_index = (block_offset / filter_base_);
+        assert(filter_index >= filter_offsets_.size());
+        while (filter_index > filter_offsets_.size())
+        {
+            GenerateFilter();
+        }   // if
+    }   // if
+
+    last_offset_=block_offset;
 }
 
 void FilterBlockBuilder::AddKey(const Slice& key) {
@@ -34,6 +43,9 @@ void FilterBlockBuilder::AddKey(const Slice& key) {
 }
 
 Slice FilterBlockBuilder::Finish() {
+    if (0==filter_base_lg_)
+        PickFilterBase(last_offset_);
+
   if (!start_.empty()) {
     GenerateFilter();
   }
@@ -45,7 +57,7 @@ Slice FilterBlockBuilder::Finish() {
   }
 
   PutFixed32(&result_, array_offset);
-  result_.push_back(kFilterBaseLg);  // Save encoding parameter in result
+  result_.push_back(filter_base_lg_);  // Save encoding parameter in result
   return Slice(result_);
 }
 
@@ -68,7 +80,7 @@ void FilterBlockBuilder::GenerateFilter() {
 
   // Generate filter for current set of keys and append to result_.
   filter_offsets_.push_back(result_.size());
-  policy_->CreateFilter(&tmp_keys_[0], static_cast<int>(num_keys), &result_);
+  policy_->CreateFilter(&tmp_keys_[0], num_keys, &result_);
 
   tmp_keys_.clear();
   keys_.clear();
@@ -97,7 +109,7 @@ bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) {
   if (index < num_) {
     uint32_t start = DecodeFixed32(offset_ + index*4);
     uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
-    if (start <= limit && limit <= static_cast<size_t>(offset_ - data_)) {
+    if (start <= limit && limit <= (offset_ - data_)) {
       Slice filter = Slice(data_ + start, limit - start);
       return policy_->KeyMayMatch(key, filter);
     } else if (start == limit) {
@@ -108,4 +120,48 @@ bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) {
   return true;  // Errors are treated as potential matches
 }
 
+
+// wikipedia.com quotes following as source
+//  Warren Jr., Henry S. (2002). Hacker's Delight. Addison Wesley. pp. 48. ISBN 978-0-201-91465-8
+// Numerical Recipes, Third Edition credits
+//   Anderson, S.E. 2001, "BitTwiddling Hacks", http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+// latter states public domain.
+static uint32_t
+PowerOfTwoGreater(uint32_t num)
+{
+    uint32_t n;
+
+    n=num;
+    --n;
+    n |= n >> 1;
+    n |= n >> 2;
+    n |= n >> 4;
+    n |= n >> 8;
+    n |= n >> 16;
+    ++n;
+
+    return n;
+}   // CalcFilterBaseLg
+
+
+void
+FilterBlockBuilder::PickFilterBase(
+    size_t BlockOffset)
+{
+    // create limits just for safety sake
+    if (0==BlockOffset || 268435456<BlockOffset)
+    {
+        filter_base_lg_=28;
+        filter_base_=268435456;
+    }   // if
+    else
+    {
+        uint32_t temp;
+        filter_base_=PowerOfTwoGreater((uint32_t)BlockOffset);
+        for (filter_base_lg_=0, temp=filter_base_>>1; 0!=temp; ++filter_base_lg_, temp=temp >> 1);
+    }   // else
+
+}   // FilterBlockBuilder::PickFilterBase
+
+
 }
diff --git a/src/leveldb/table/filter_block.h b/src/leveldb/table/filter_block.h
index c67d010bd..5acf337a5 100644
--- a/src/leveldb/table/filter_block.h
+++ b/src/leveldb/table/filter_block.h
@@ -36,8 +36,13 @@ class FilterBlockBuilder {
 
  private:
   void GenerateFilter();
+  void PickFilterBase(size_t BlockOffset);
 
   const FilterPolicy* policy_;
+  size_t filter_base_lg_;
+  size_t filter_base_;
+  size_t last_offset_;
+
   std::string keys_;              // Flattened key contents
   std::vector<size_t> start_;     // Starting index in keys_ of each key
   std::string result_;            // Filter data computed so far
diff --git a/src/leveldb/table/filter_block_test.cc b/src/leveldb/table/filter_block_test.cc
index 8c4a4741f..8d0752819 100644
--- a/src/leveldb/table/filter_block_test.cc
+++ b/src/leveldb/table/filter_block_test.cc
@@ -29,7 +29,7 @@ class TestHashFilter : public FilterPolicy {
 
   virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
     uint32_t h = Hash(key.data(), key.size(), 1);
-    for (size_t i = 0; i + 4 <= filter.size(); i += 4) {
+    for (int i = 0; i + 4 <= filter.size(); i += 4) {
       if (h == DecodeFixed32(filter.data() + i)) {
         return true;
       }
@@ -46,7 +46,7 @@ class FilterBlockTest {
 TEST(FilterBlockTest, EmptyBuilder) {
   FilterBlockBuilder builder(&policy_);
   Slice block = builder.Finish();
-  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
+  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x1c", EscapeString(block));
   FilterBlockReader reader(&policy_, block);
   ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
   ASSERT_TRUE(reader.KeyMayMatch(100000, "foo"));
@@ -95,7 +95,7 @@ TEST(FilterBlockTest, MultiChunk) {
 
   Slice block = builder.Finish();
   FilterBlockReader reader(&policy_, block);
-
+#if 0 // all in first/only filter with riak
   // Check first filter
   ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
   ASSERT_TRUE(reader.KeyMayMatch(2000, "bar"));
@@ -119,6 +119,30 @@ TEST(FilterBlockTest, MultiChunk) {
   ASSERT_TRUE(reader.KeyMayMatch(9000, "hello"));
   ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo"));
   ASSERT_TRUE(! reader.KeyMayMatch(9000, "bar"));
+#else
+  ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(2000, "bar"));
+  ASSERT_TRUE(reader.KeyMayMatch(0, "box"));
+  ASSERT_TRUE(reader.KeyMayMatch(0, "hello"));
+
+  // Check second filter
+  ASSERT_TRUE(reader.KeyMayMatch(3100, "box"));
+  ASSERT_TRUE(reader.KeyMayMatch(3100, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(3100, "bar"));
+  ASSERT_TRUE(reader.KeyMayMatch(3100, "hello"));
+
+  // Check third filter (empty)
+  ASSERT_TRUE(reader.KeyMayMatch(4100, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(4100, "bar"));
+  ASSERT_TRUE(reader.KeyMayMatch(4100, "box"));
+  ASSERT_TRUE(reader.KeyMayMatch(4100, "hello"));
+
+  // Check last filter
+  ASSERT_TRUE(reader.KeyMayMatch(9000, "box"));
+  ASSERT_TRUE(reader.KeyMayMatch(9000, "hello"));
+  ASSERT_TRUE(reader.KeyMayMatch(9000, "foo"));
+  ASSERT_TRUE(reader.KeyMayMatch(9000, "bar"));
+#endif
 }
 
 }  // namespace leveldb
diff --git a/src/leveldb/table/format.cc b/src/leveldb/table/format.cc
index 285e1c0de..c98ef930f 100644
--- a/src/leveldb/table/format.cc
+++ b/src/leveldb/table/format.cc
@@ -5,13 +5,23 @@
 #include "table/format.h"
 
 #include "leveldb/env.h"
+#include "leveldb/perf_count.h"
 #include "port/port.h"
 #include "table/block.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
+#include "util/lz4.h"
+#include "db/log_writer.h"
 
 namespace leveldb {
 
+static struct
+{
+    uint32_t filler_;  //!< don't know and don't care
+    uint32_t zero_restarts_;  //!< path to an EmptyIterator
+} gEmptyBlock={0,0};
+
+
 void BlockHandle::EncodeTo(std::string* dst) const {
   // Sanity check that all fields have been set
   assert(offset_ != ~static_cast<uint64_t>(0));
@@ -30,14 +40,15 @@ Status BlockHandle::DecodeFrom(Slice* input) {
 }
 
 void Footer::EncodeTo(std::string* dst) const {
+#ifndef NDEBUG
   const size_t original_size = dst->size();
+#endif
   metaindex_handle_.EncodeTo(dst);
   index_handle_.EncodeTo(dst);
   dst->resize(2 * BlockHandle::kMaxEncodedLength);  // Padding
   PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber & 0xffffffffu));
   PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32));
   assert(dst->size() == original_size + kEncodedLength);
-  (void)original_size;  // Disable unused variable warning.
 }
 
 Status Footer::DecodeFrom(Slice* input) {
@@ -47,7 +58,7 @@ Status Footer::DecodeFrom(Slice* input) {
   const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
                           (static_cast<uint64_t>(magic_lo)));
   if (magic != kTableMagicNumber) {
-    return Status::Corruption("not an sstable (bad magic number)");
+    return Status::InvalidArgument("not an sstable (bad magic number)");
   }
 
   Status result = metaindex_handle_.DecodeFrom(input);
@@ -65,7 +76,14 @@ Status Footer::DecodeFrom(Slice* input) {
 Status ReadBlock(RandomAccessFile* file,
                  const ReadOptions& options,
                  const BlockHandle& handle,
-                 BlockContents* result) {
+                 BlockContents* result)
+{
+  char * buf, * ubuf;
+  const char * data;
+
+  buf=NULL;
+  ubuf=NULL;
+  data=NULL;
   result->data = Slice();
   result->cachable = false;
   result->heap_allocated = false;
@@ -73,72 +91,161 @@ Status ReadBlock(RandomAccessFile* file,
   // Read the block contents as well as the type/crc footer.
   // See table_builder.cc for the code that built this structure.
   size_t n = static_cast<size_t>(handle.size());
-  char* buf = new char[n + kBlockTrailerSize];
+  buf = new char[n + kBlockTrailerSize];
   Slice contents;
   Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
-  if (!s.ok()) {
-    delete[] buf;
-    return s;
-  }
-  if (contents.size() != n + kBlockTrailerSize) {
-    delete[] buf;
-    return Status::Corruption("truncated block read", file->GetName());
-  }
+  if (s.ok())
+  {
+      if (contents.size() != n + kBlockTrailerSize) {
+          s=Status::Corruption("truncated block read");
+      }
+  }   // if
 
   // Check the crc of the type and the block contents
-  const char* data = contents.data();    // Pointer to where Read put the data
-  if (options.verify_checksums) {
-    const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1));
-    const uint32_t actual = crc32c::Value(data, n + 1);
-    if (actual != crc) {
-      delete[] buf;
-      s = Status::Corruption("block checksum mismatch", file->GetName());
-      return s;
-    }
-  }
+  if (s.ok())
+  {
+      data = contents.data();    // Pointer to where Read put the data
+      if (options.verify_checksums) {
+          const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1));
+          const uint32_t actual = crc32c::Value(data, n + 1);
+          if (actual != crc) {
+              s = Status::Corruption("block checksum mismatch");
+          }   // if
+      }   // if
+  }   // if
 
-  switch (data[n]) {
-    case kNoCompression:
-      if (data != buf) {
-        // File implementation gave us pointer to some other data.
-        // Use it directly under the assumption that it will be live
-        // while the file is open.
-        delete[] buf;
-        result->data = Slice(data, n);
-        result->heap_allocated = false;
-        result->cachable = false;  // Do not double-cache
-      } else {
-        result->data = Slice(buf, n);
-        result->heap_allocated = true;
-        result->cachable = true;
-      }
+  if (s.ok())
+  {
+      switch (data[n]) {
+          case kNoCompression:
+              if (data != buf) {
+                  // File implementation gave us pointer to some other data.
+                  // Use it directly under the assumption that it will be live
+                  // while the file is open.
+                  delete[] buf;
+                  buf=NULL;
+                  result->data = Slice(data, n);
+                  result->heap_allocated = false;
+                  result->cachable = false;  // Do not double-cache
+              } else {
+                  result->data = Slice(buf, n);
+                  result->heap_allocated = true;
+                  result->cachable = true;
+              }   // else
+              // Ok
+              break;
 
-      // Ok
-      break;
-    case kSnappyCompression: {
-      size_t ulength = 0;
-      if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) {
-        delete[] buf;
-        return Status::Corruption("corrupted compressed block contents", file->GetName());
-      }
-      char* ubuf = new char[ulength];
-      if (!port::Snappy_Uncompress(data, n, ubuf)) {
-        delete[] buf;
-        delete[] ubuf;
-        return Status::Corruption("corrupted compressed block contents", file->GetName());
-      }
-      delete[] buf;
-      result->data = Slice(ubuf, ulength);
-      result->heap_allocated = true;
-      result->cachable = true;
-      break;
-    }
-    default:
-      delete[] buf;
-      return Status::Corruption("bad block type", file->GetName());
-  }
+          case kSnappyCompression: {
+              size_t ulength = 0;
+              if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) {
+                  s = Status::Corruption("corrupted compressed block contents");
+              }
 
-  return Status::OK();
+              if (s.ok())
+              {
+                  ubuf = new char[ulength];
+                  if (!port::Snappy_Uncompress(data, n, ubuf)) {
+                      s=Status::Corruption("corrupted compressed block contents");
+                  }
+              }   // if
+
+              if (s.ok())
+              {
+                  delete[] buf;
+                  buf=NULL;
+                  result->data = Slice(ubuf, ulength);
+                  result->heap_allocated = true;
+                  result->cachable = true;
+              }   // if
+              break;
+          }
+
+          case kLZ4Compression: {
+              size_t ulength = DecodeFixed32(data);
+              size_t ret_val;
+              ubuf = new char[ulength];
+
+              ret_val=LZ4_decompress_safe(data+4, ubuf, n-4, ulength);
+              if (ret_val != ulength)
+              {
+                  s=Status::Corruption("corrupted LZ4 compressed block");
+              }   // if
+
+              if (s.ok())
+              {
+                  delete[] buf;
+                  buf=NULL;
+                  result->data = Slice(ubuf, ulength);
+                  result->heap_allocated = true;
+                  result->cachable = true;
+              }   // if
+              break;
+          }
+
+          default:
+              s=Status::Corruption("bad block type");
+              break;
+      }   // switch
+  }   // if
+
+  // clean up error and decide what to do with it
+  if (!s.ok())
+  {
+      gPerfCounters->Inc(ePerfReadBlockError);
+
+      if (options.IsCompaction() && 0!=options.GetDBName().length())
+      {
+          // this process is slow.  assumption is that it does not happen often.
+          if (NULL!=data)
+          {
+              std::string new_name;
+              WritableFile *bad_file;
+              log::Writer *bad_logger;
+              Status s2;
+
+              bad_file=NULL;
+              bad_logger=NULL;
+
+              // potentially create the "lost" directory.  It might already exist.
+              new_name=options.GetDBName();
+              new_name+="/lost";
+              options.GetEnv()->CreateDir(new_name);
+
+              // create / append file to hold removed blocks
+              new_name+="/BLOCKS.bad";
+              s2=options.GetEnv()->NewAppendableFile(new_name, &bad_file, 4*1024);
+              if (s2.ok())
+              {
+                  // need a try/catch
+                  bad_logger=new log::Writer(bad_file);
+                  bad_logger->AddRecord(Slice(data, n));
+                  Log(options.GetInfoLog(),"Moving corrupted block to lost/BLOCKS.bad (size %zd)", n);
+
+                  // Close also deletes bad_file
+                  bad_logger->Close();
+                  delete bad_logger;
+                  bad_logger=NULL;
+                  bad_file=NULL;
+              }   // if
+              else
+              {
+                  Log(options.GetInfoLog(), "Unable to create file for bad/corrupted blocks: %s", new_name.c_str());
+              }   // else
+          }   // if
+
+          // lie to the upper layers to keep compaction from going into an infinite loop
+          s = Status::OK();
+      }   // if
+
+      delete [] buf;
+      delete [] ubuf;
+
+      result->data = Slice((char *)&gEmptyBlock, sizeof(gEmptyBlock));
+      result->cachable = false;
+      result->heap_allocated = false;
+  }   // if
+
+  return s;
 }
 
 }  // namespace leveldb
diff --git a/src/leveldb/table/iterator_wrapper.h b/src/leveldb/table/iterator_wrapper.h
index f410c3fab..9e16b3dbe 100644
--- a/src/leveldb/table/iterator_wrapper.h
+++ b/src/leveldb/table/iterator_wrapper.h
@@ -5,9 +5,6 @@
 #ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
 #define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
 
-#include "leveldb/iterator.h"
-#include "leveldb/slice.h"
-
 namespace leveldb {
 
 // A internal wrapper class with an interface similar to Iterator that
diff --git a/src/leveldb/table/table.cc b/src/leveldb/table/table.cc
index decf8082c..877ee4fc6 100644
--- a/src/leveldb/table/table.cc
+++ b/src/leveldb/table/table.cc
@@ -9,6 +9,7 @@
 #include "leveldb/env.h"
 #include "leveldb/filter_policy.h"
 #include "leveldb/options.h"
+#include "leveldb/perf_count.h"
 #include "table/block.h"
 #include "table/filter_block.h"
 #include "table/format.h"
@@ -27,12 +28,18 @@ struct Table::Rep {
   Options options;
   Status status;
   RandomAccessFile* file;
+  uint64_t file_size;
   uint64_t cache_id;
   FilterBlockReader* filter;
   const char* filter_data;
+  size_t filter_data_size;
 
   BlockHandle metaindex_handle;  // Handle to metaindex_block: saved from footer
   Block* index_block;
+  SstCounters sst_counters;
+  BlockHandle filter_handle;
+  const FilterPolicy * filter_policy;
+  volatile uint32_t filter_flag;
 };
 
 Status Table::Open(const Options& options,
@@ -41,10 +48,14 @@ Status Table::Open(const Options& options,
                    Table** table) {
   *table = NULL;
   if (size < Footer::kEncodedLength) {
-    return Status::Corruption("file is too short to be an sstable");
+    return Status::InvalidArgument("file is too short to be an sstable");
   }
 
   char footer_space[Footer::kEncodedLength];
+  // stop valgrind uninitialize warning
+  // let footer.DecodeFrom returned status do the talking for read of bad info
+  memset(footer_space, 0, Footer::kEncodedLength);
+
   Slice footer_input;
   Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength,
                         &footer_input, footer_space);
@@ -58,11 +69,7 @@ Status Table::Open(const Options& options,
   BlockContents contents;
   Block* index_block = NULL;
   if (s.ok()) {
-    ReadOptions opt;
-    if (options.paranoid_checks) {
-      opt.verify_checksums = true;
-    }
-    s = ReadBlock(file, opt, footer.index_handle(), &contents);
+    s = ReadBlock(file, ReadOptions(), footer.index_handle(), &contents);
     if (s.ok()) {
       index_block = new Block(contents);
     }
@@ -74,32 +81,32 @@ Status Table::Open(const Options& options,
     Rep* rep = new Table::Rep;
     rep->options = options;
     rep->file = file;
+    rep->file_size = size;
     rep->metaindex_handle = footer.metaindex_handle();
     rep->index_block = index_block;
     rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
     rep->filter_data = NULL;
+    rep->filter_data_size = 0;
     rep->filter = NULL;
+    rep->filter_policy = NULL;
+    rep->filter_flag = 0;
     *table = new Table(rep);
     (*table)->ReadMeta(footer);
   } else {
-    delete index_block;
+    if (index_block) delete index_block;
   }
 
   return s;
 }
 
 void Table::ReadMeta(const Footer& footer) {
-  if (rep_->options.filter_policy == NULL) {
-    return;  // Do not need any metadata
-  }
 
   // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
   // it is an empty block.
+  std::string key;
   ReadOptions opt;
-  if (rep_->options.paranoid_checks) {
-    opt.verify_checksums = true;
-  }
   BlockContents contents;
+
   if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) {
     // Do not propagate errors since meta info is not needed for operation
     return;
@@ -107,39 +114,139 @@ void Table::ReadMeta(const Footer& footer) {
   Block* meta = new Block(contents);
 
   Iterator* iter = meta->NewIterator(BytewiseComparator());
-  std::string key = "filter.";
-  key.append(rep_->options.filter_policy->Name());
+
+  // read filter only if policy set
+  if (NULL != rep_->options.filter_policy) {
+      bool found,first;
+      const FilterPolicy * policy, * next;
+
+      first=true;
+      next=NULL;
+
+      do
+      {
+          found=false;
+
+          if (first)
+          {
+              policy=rep_->options.filter_policy;
+              next=FilterInventory::ListHead;
+              first=false;
+          }   // if
+          else
+          {
+              policy=next;
+              if (NULL!=policy)
+                  next=policy->GetNext();
+              else
+                  next=NULL;
+          }   // else
+
+          if (NULL!=policy)
+          {
+              key = "filter.";
+              key.append(policy->Name());
+              iter->Seek(key);
+              if (iter->Valid() && iter->key() == Slice(key))
+              {
+                  // store information needed to load bloom filter
+                  //  at a later time
+                  Slice v = iter->value();
+                  rep_->filter_handle.DecodeFrom(&v);
+                  rep_->filter_policy = policy;
+
+                  found=true;
+              }   // if
+          }   //if
+      } while(!found && NULL!=policy);
+  }   // if
+
+  // always read counters
+  key="stats.sst1";
   iter->Seek(key);
   if (iter->Valid() && iter->key() == Slice(key)) {
-    ReadFilter(iter->value());
+      ReadSstCounters(iter->value());
   }
+
   delete iter;
   delete meta;
 }
 
-void Table::ReadFilter(const Slice& filter_handle_value) {
-  Slice v = filter_handle_value;
-  BlockHandle filter_handle;
-  if (!filter_handle.DecodeFrom(&v).ok()) {
-    return;
-  }
 
+// public version that reads filter at some time
+//  after open ... true if filter read
+bool
+Table::ReadFilter()
+{
+    bool ret_flag;
+
+    ret_flag=false;
+
+    if (0!=rep_->filter_handle.size()
+        && NULL!=rep_->filter_policy
+        && 1 == inc_and_fetch(&rep_->filter_flag))
+    {
+        gPerfCounters->Inc(ePerfBlockFilterRead);
+
+        ReadFilter(rep_->filter_handle, rep_->filter_policy);
+        ret_flag=(NULL != rep_->filter);
+
+        // only attempt the read once
+        rep_->filter_handle.set_size(0);
+    }   // if
+
+    return(ret_flag);
+}   // ReadFilter
+
+// Private version of ReadFilter that does the actual work
+void
+Table::ReadFilter(
+    BlockHandle & filter_handle,
+    const FilterPolicy * policy)
+{
   // We might want to unify with ReadBlock() if we start
   // requiring checksum verification in Table::Open.
   ReadOptions opt;
-  if (rep_->options.paranoid_checks) {
-    opt.verify_checksums = true;
-  }
   BlockContents block;
   if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) {
     return;
   }
   if (block.heap_allocated) {
     rep_->filter_data = block.data.data();     // Will need to delete later
+    rep_->filter_data_size = block.data.size();
   }
-  rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data);
+
+  rep_->filter = new FilterBlockReader(policy, block.data);
 }
 
+
+void Table::ReadSstCounters(const Slice& sst_counters_handle_value) {
+  Slice v = sst_counters_handle_value;
+  BlockHandle counters_handle;
+  if (!counters_handle.DecodeFrom(&v).ok()) {
+    return;
+  }
+
+  // We might want to unify with ReadBlock() if we start
+  // requiring checksum verification in Table::Open.
+  ReadOptions opt;
+  BlockContents block;
+  if (!ReadBlock(rep_->file, opt, counters_handle, &block).ok()) {
+    return;
+  }
+  if (block.heap_allocated) {
+    rep_->sst_counters.DecodeFrom(block.data);
+    delete [] block.data.data();
+  }
+
+}
+
+SstCounters Table::GetSstCounters() const
+{
+    return(rep_->sst_counters);
+}   // Table::GetSstCounters
+
+
 Table::~Table() {
   delete rep_;
 }
@@ -185,18 +292,23 @@ Iterator* Table::BlockReader(void* arg,
       cache_handle = block_cache->Lookup(key);
       if (cache_handle != NULL) {
         block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
+        gPerfCounters->Inc(ePerfBlockCached);
       } else {
         s = ReadBlock(table->rep_->file, options, handle, &contents);
+        gPerfCounters->Inc(ePerfBlockRead);
         if (s.ok()) {
           block = new Block(contents);
           if (contents.cachable && options.fill_cache) {
             cache_handle = block_cache->Insert(
-                key, block, block->size(), &DeleteCachedBlock);
+                key, block,
+                (block->size() + /*block_cache->EntryOverheadSize() +*/ sizeof(cache_key_buffer)),
+                &DeleteCachedBlock);
           }
         }
       }
     } else {
       s = ReadBlock(table->rep_->file, options, handle, &contents);
+        gPerfCounters->Inc(ePerfBlockRead);
       if (s.ok()) {
         block = new Block(contents);
       }
@@ -225,7 +337,7 @@ Iterator* Table::NewIterator(const ReadOptions& options) const {
 
 Status Table::InternalGet(const ReadOptions& options, const Slice& k,
                           void* arg,
-                          void (*saver)(void*, const Slice&, const Slice&)) {
+                          bool (*saver)(void*, const Slice&, const Slice&)) {
   Status s;
   Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator);
   iiter->Seek(k);
@@ -237,12 +349,19 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k,
         handle.DecodeFrom(&handle_value).ok() &&
         !filter->KeyMayMatch(handle.offset(), k)) {
       // Not found
+        gPerfCounters->Inc(ePerfBlockFiltered);
     } else {
       Iterator* block_iter = BlockReader(this, options, iiter->value());
       block_iter->Seek(k);
       if (block_iter->Valid()) {
-        (*saver)(arg, block_iter->key(), block_iter->value());
+        bool match;
+        match=(*saver)(arg, block_iter->key(), block_iter->value());
+        if (!match && NULL!=filter)
+            gPerfCounters->Inc(ePerfBlockFilterFalse);
+        if (match)
+            gPerfCounters->Inc(ePerfBlockValidGet);
       }
+
       s = block_iter->status();
       delete block_iter;
     }
@@ -282,4 +401,27 @@ uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
   return result;
 }
 
+
+uint64_t
+Table::GetFileSize()
+{
+    return(rep_->file_size);
+};
+
+Block *
+Table::TEST_GetIndexBlock() {return(rep_->index_block);};
+
+// Riak specific routine.  Calculates total footprint of an open
+//  table in memory.
+size_t
+Table::TableObjectSize()
+{
+    return(sizeof(Table) + sizeof(Table::Rep) + rep_->index_block->size() + rep_->filter_data_size + rep_->file->ObjectSize()
+           + sizeof(FilterBlockReader) + sizeof(Block));
+};
+
+size_t
+Table::TEST_FilterDataSize() {return(rep_->filter_data_size);};
+
+
 }  // namespace leveldb
diff --git a/src/leveldb/table/table_builder.cc b/src/leveldb/table/table_builder.cc
index 62002c84f..0672cc742 100644
--- a/src/leveldb/table/table_builder.cc
+++ b/src/leveldb/table/table_builder.cc
@@ -5,15 +5,19 @@
 #include "leveldb/table_builder.h"
 
 #include <assert.h>
+#include "db/dbformat.h"
 #include "leveldb/comparator.h"
 #include "leveldb/env.h"
+#include "leveldb/expiry.h"
 #include "leveldb/filter_policy.h"
 #include "leveldb/options.h"
+#include "leveldb/perf_count.h"
 #include "table/block_builder.h"
 #include "table/filter_block.h"
 #include "table/format.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
+#include "util/lz4.h"
 
 namespace leveldb {
 
@@ -29,6 +33,7 @@ struct TableBuilder::Rep {
   int64_t num_entries;
   bool closed;          // Either Finish() or Abandon() has been called.
   FilterBlockBuilder* filter_block;
+  SstCounters sst_counters;
 
   // We do not emit the index entry for a block until we have seen the
   // first key for the next data block.  This allows us to use shorter
@@ -104,6 +109,7 @@ void TableBuilder::Add(const Slice& key, const Slice& value) {
     r->pending_handle.EncodeTo(&handle_encoding);
     r->index_block.Add(r->last_key, Slice(handle_encoding));
     r->pending_index_entry = false;
+    r->sst_counters.Inc(eSstCountIndexKeys);
   }
 
   if (r->filter_block != NULL) {
@@ -114,6 +120,38 @@ void TableBuilder::Add(const Slice& key, const Slice& value) {
   r->num_entries++;
   r->data_block.Add(key, value);
 
+  // statistics
+  r->sst_counters.Inc(eSstCountKeys);
+  r->sst_counters.Add(eSstCountKeySize, key.size());
+  r->sst_counters.Add(eSstCountValueSize, value.size());
+
+  if (key.size() < r->sst_counters.Value(eSstCountKeySmallest))
+      r->sst_counters.Set(eSstCountKeySmallest, key.size());
+  if (r->sst_counters.Value(eSstCountKeyLargest) < key.size())
+      r->sst_counters.Set(eSstCountKeyLargest, key.size());
+
+  if (value.size() < r->sst_counters.Value(eSstCountValueSmallest))
+      r->sst_counters.Set(eSstCountValueSmallest, value.size());
+  if (r->sst_counters.Value(eSstCountValueLargest) < value.size())
+      r->sst_counters.Set(eSstCountValueLargest, value.size());
+
+  // unit tests use non-standard keys ... must ignore the short ones
+  if (8 < key.size() && kTypeDeletion==ExtractValueType(key))
+      r->sst_counters.Inc(eSstCountDeleteKey);
+
+  // again ignore short keys, save high sequence number for abbreviated repair
+  if (8 <= key.size()
+      && r->sst_counters.Value(eSstCountSequence)<ExtractSequenceNumber(key))
+      r->sst_counters.Set(eSstCountSequence,ExtractSequenceNumber(key));
+
+  // statistics if an expiry key
+  //  Note: not using ExpiryActivated().  Forcing expiry statistics which
+  //  are upgrade / downgrade safe.
+  if (NULL!=r->options.expiry_module.get())
+  {
+      r->options.expiry_module->TableBuilderCallback(key, r->sst_counters);
+  } // if
+
   const size_t estimated_block_size = r->data_block.CurrentSizeEstimate();
   if (estimated_block_size >= r->options.block_size) {
     Flush();
@@ -145,16 +183,28 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
   Rep* r = rep_;
   Slice raw = block->Finish();
 
+  r->sst_counters.Inc(eSstCountBlocks);
+  r->sst_counters.Add(eSstCountBlockSize, raw.size());
+
   Slice block_contents;
   CompressionType type = r->options.compression;
   // TODO(postrelease): Support more compression options: zlib?
+  std::string * compressed;
+
   switch (type) {
+    case kNoCompressionAutomated:
+      // automation disabled compression
+      type=kNoCompression;
+      r->sst_counters.Inc(eSstCountCompressAborted);
+      block_contents = raw;
+      break;
+
     case kNoCompression:
       block_contents = raw;
       break;
 
-    case kSnappyCompression: {
-      std::string* compressed = &r->compressed_output;
+    case kSnappyCompression:
+      compressed = &r->compressed_output;
       if (port::Snappy_Compress(raw.data(), raw.size(), compressed) &&
           compressed->size() < raw.size() - (raw.size() / 8u)) {
         block_contents = *compressed;
@@ -163,11 +213,36 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
         // store uncompressed form
         block_contents = raw;
         type = kNoCompression;
+        r->sst_counters.Inc(eSstCountCompressAborted);
       }
       break;
-    }
+
+    case kLZ4Compression:
+      compressed = &r->compressed_output;
+      int limit, result_size;
+      limit=raw.size() - (raw.size() / 8u);
+
+      compressed->resize(limit+4);
+      result_size=LZ4_compress_default(raw.data(), (char *)(compressed->data())+4, raw.size(), limit);
+      if (result_size)
+      {
+          EncodeFixed32((char *)compressed->data(), raw.size());
+          compressed->resize(result_size+4);
+          block_contents = *compressed;
+      }
+      else {
+        // Snappy not supported, or compressed less than 12.5%, so just
+        // store uncompressed form
+        block_contents = raw;
+        type = kNoCompression;
+        r->sst_counters.Inc(eSstCountCompressAborted);
+      }
+      break;
+
+
   }
   WriteRawBlock(block_contents, type, handle);
+  r->sst_counters.Add(eSstCountBlockWriteSize, block_contents.size());
   r->compressed_output.clear();
   block->Reset();
 }
@@ -202,7 +277,12 @@ Status TableBuilder::Finish() {
   assert(!r->closed);
   r->closed = true;
 
-  BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle;
+  BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle,
+      sst_stats_block_handle;
+
+  // pass hint to Linux fadvise management
+  r->sst_counters.Set(eSstCountUserDataSize, r->offset);
+  r->file->SetMetadataOffset(r->offset);
 
   // Write filter block
   if (ok() && r->filter_block != NULL) {
@@ -210,18 +290,42 @@ Status TableBuilder::Finish() {
                   &filter_block_handle);
   }
 
+  // Write sst statistic counters
+  if (ok())
+  {
+      std::string encoded_stats;
+
+      r->sst_counters.Set(eSstCountBlockSizeUsed, r->options.block_size);
+
+      if (r->pending_index_entry)
+          r->sst_counters.Inc(eSstCountIndexKeys);
+
+      r->sst_counters.EncodeTo(encoded_stats);
+      WriteRawBlock(Slice(encoded_stats), kNoCompression,
+                    &sst_stats_block_handle);
+  }   // if
+
   // Write metaindex block
   if (ok()) {
     BlockBuilder meta_index_block(&r->options);
+    std::string key, handle_encoding;
+
     if (r->filter_block != NULL) {
       // Add mapping from "filter.Name" to location of filter data
-      std::string key = "filter.";
+      key = "filter.";
       key.append(r->options.filter_policy->Name());
-      std::string handle_encoding;
+      handle_encoding.clear();
       filter_block_handle.EncodeTo(&handle_encoding);
       meta_index_block.Add(key, handle_encoding);
+
     }
 
+    // Add mapping for "stats.sst1"
+    key = "stats.sst1";
+    handle_encoding.clear();
+    sst_stats_block_handle.EncodeTo(&handle_encoding);
+    meta_index_block.Add(key, handle_encoding);
+
     // TODO(postrelease): Add stats and other meta blocks
     WriteBlock(&meta_index_block, &metaindex_block_handle);
   }
@@ -267,4 +371,20 @@ uint64_t TableBuilder::FileSize() const {
   return rep_->offset;
 }
 
+uint64_t TableBuilder::NumDeletes() const {
+  return rep_->sst_counters.Value(eSstCountDeleteKey);
+}
+
+uint64_t TableBuilder::GetExpiryWriteLow() const {
+  return rep_->sst_counters.Value(eSstCountExpiry1);
+}
+
+uint64_t TableBuilder::GetExpiryWriteHigh() const {
+  return rep_->sst_counters.Value(eSstCountExpiry2);
+}
+
+uint64_t TableBuilder::GetExpiryExplicitHigh() const {
+  return rep_->sst_counters.Value(eSstCountExpiry3);
+}
+
 }  // namespace leveldb
diff --git a/src/leveldb/table/table_test.cc b/src/leveldb/table/table_test.cc
index abf6e246f..4382c8e01 100644
--- a/src/leveldb/table/table_test.cc
+++ b/src/leveldb/table/table_test.cc
@@ -279,7 +279,7 @@ class KeyConvertingIterator: public Iterator {
   virtual ~KeyConvertingIterator() { delete iter_; }
   virtual bool Valid() const { return iter_->Valid(); }
   virtual void Seek(const Slice& target) {
-    ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
+    ParsedInternalKey ikey(target, 0, kMaxSequenceNumber, kTypeValue);
     std::string encoded;
     AppendInternalKey(&encoded, ikey);
     iter_->Seek(encoded);
@@ -644,36 +644,6 @@ class Harness {
   Constructor* constructor_;
 };
 
-// Test empty table/block.
-TEST(Harness, Empty) {
-  for (int i = 0; i < kNumTestArgs; i++) {
-    Init(kTestArgList[i]);
-    Random rnd(test::RandomSeed() + 1);
-    Test(&rnd);
-  }
-}
-
-// Special test for a block with no restart entries.  The C++ leveldb
-// code never generates such blocks, but the Java version of leveldb
-// seems to.
-TEST(Harness, ZeroRestartPointsInBlock) {
-  char data[sizeof(uint32_t)];
-  memset(data, 0, sizeof(data));
-  BlockContents contents;
-  contents.data = Slice(data, sizeof(data));
-  contents.cachable = false;
-  contents.heap_allocated = false;
-  Block block(contents);
-  Iterator* iter = block.NewIterator(BytewiseComparator());
-  iter->SeekToFirst();
-  ASSERT_TRUE(!iter->Valid());
-  iter->SeekToLast();
-  ASSERT_TRUE(!iter->Valid());
-  iter->Seek("foo");
-  ASSERT_TRUE(!iter->Valid());
-  delete iter;
-}
-
 // Test the empty key
 TEST(Harness, SimpleEmptyKey) {
   for (int i = 0; i < kNumTestArgs; i++) {
@@ -769,7 +739,7 @@ TEST(MemTableTest, Simple) {
   batch.Put(std::string("k2"), std::string("v2"));
   batch.Put(std::string("k3"), std::string("v3"));
   batch.Put(std::string("largekey"), std::string("vlarge"));
-  ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable).ok());
+  ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable, NULL).ok());
 
   Iterator* iter = memtable->NewIterator();
   iter->SeekToFirst();
@@ -853,20 +823,12 @@ TEST(TableTest, ApproximateOffsetOfCompressed) {
   options.compression = kSnappyCompression;
   c.Finish(options, &keys, &kvmap);
 
-  // Expected upper and lower bounds of space used by compressible strings.
-  static const int kSlop = 1000;  // Compressor effectiveness varies.
-  const int expected = 2500;  // 10000 * compression ratio (0.25)
-  const int min_z = expected - kSlop;
-  const int max_z = expected + kSlop;
-
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, kSlop));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, kSlop));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, kSlop));
-  // Have now emitted a large compressible string, so adjust expected offset.
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), min_z, max_z));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), min_z, max_z));
-  // Have now emitted two large compressible strings, so adjust expected offset.
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 2 * min_z, 2 * max_z));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"),       0,      0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"),    2000,   3000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"),    2000,   3000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),    4000,   6000));
 }
 
 }  // namespace leveldb
diff --git a/src/leveldb/tools/builder.list b/src/leveldb/tools/builder.list
new file mode 100644
index 000000000..0f7baedaa
--- /dev/null
+++ b/src/leveldb/tools/builder.list
@@ -0,0 +1,15 @@
+10.0.27.221    # Centos 6.3
+10.0.27.222    # Debian Squeeze
+10.0.27.231    # Fedora 17
+10.0.27.248    # Centos 7
+10.0.27.234    # SmartOS 1.8.4
+10.0.27.211    # SmartOS 1.6.3
+10.0.27.220    # Centos 5.8
+10.0.27.251    # SLES 11
+10.0.27.190    # FreeBSD 9.2
+10.0.27.214    # FreeBSD 9 64
+10.0.27.213    # Ubuntu Lucid 64
+10.0.27.219    # Ubuntu Natty 64
+10.0.27.212    # Ubuntu Precise 64
+10.0.27.250    # Ubuntu 14
+10.0.27.217    # Solaris
diff --git a/src/leveldb/tools/buildtester.sh b/src/leveldb/tools/buildtester.sh
new file mode 100755
index 000000000..6524a1b7b
--- /dev/null
+++ b/src/leveldb/tools/buildtester.sh
@@ -0,0 +1,89 @@
+#! /bin/bash
+## ./buildtester.sh builder.list leveldb_tag
+##    or
+## ./buildertest.sh builder.list      ""       tarfile.tgz
+##       $0         $1                $2           $3
+##
+## NOTE: you must manually ssh to each buildbot to get RSA fingerprint
+##       into your local known_hosts file before script works
+
+# eleveldb requires knowing which erlang is installed where 
+REPO=leveldb
+
+##
+##  Subroutines must appear before code using them
+##
+
+ssh_command()
+{
+#   ssh_command "ip address" "command to execute"
+    if ssh -q -o 'BatchMode yes' buildbot@$1 "$2"
+    then
+        echo "success: $2"
+    else
+        echo "Error on $1 executing $2"
+        exit 1
+    fi
+}
+
+ssh_command_test()
+{
+#   ssh_command "ip address" "command to execute"
+    if ssh -q -o 'BatchMode yes' buildbot@$1 "$2"
+    then
+        return 0
+    else
+        return 1
+    fi
+}
+
+#
+# main
+#
+
+if [ $# == 2 ]; then
+    echo "2 parameters"
+    builder_list=$(cut -d ' ' -f 1 $1)
+    for builder in $builder_list
+    do
+        echo "builder: " $builder
+
+        ## remove previous eleveldb instance
+        #ssh_command $builder "cd ~/$USER && if [ -d eleveldb ] rm -rf eleveldb"
+        echo -n "Start $builder: " >>./builder.log
+        date >>./builder.log
+
+        ssh_command $builder "rm -rf ~/$USER/$REPO"
+        ssh_command $builder "mkdir -p ~/$USER"
+        ssh_command $builder "cd ~/$USER && git clone git@github.com:basho/$REPO"
+        ssh_command $builder "cd ~/$USER/$REPO && git checkout $2"
+
+        # freeBSD needs gmake explicitly, otherwise "Missing dependency operator" errors
+        #  but other platforms assume "make" is gnumake
+        if ssh_command_test $builder "which gmake"
+        then 
+            ssh_command $builder "cd ~/$USER/$REPO && gmake -j 4"
+            echo -n "Test $builder: " >>./builder.log
+            date >>./builder.log
+            ssh_command $builder "cd ~/$USER/$REPO && export LD_LIBRARY_PATH=. && gmake -j 4 check"
+            # freebsd error: util/cache2_test.cc:170: failed: -1 == 201 ... fixed
+        else
+            ssh_command $builder "cd ~/$USER/$REPO && make -j 4"
+            echo -n "Test $builder: " >>./builder.log
+            date >>./builder.log
+            ssh_command $builder "cd ~/$USER/$REPO && export LD_LIBRARY_PATH=. && make -j 4 check"
+        fi
+
+        echo -n "End $builder: " >>./builder.log
+        date >>./builder.log
+        echo "" >>./builder.log
+    done
+elif [ $# == 3 ]; then
+    echo "3 parameters"
+else
+    echo " ./buildtester.sh builder.list leveldb_tag"
+    echo "    or"
+    echo " ./buildertest.sh builder.list \"\" tarfile.tgz"
+fi
+
+exit 0
diff --git a/src/leveldb/db/dumpfile.cc b/src/leveldb/tools/leveldb_main.cc
similarity index 51%
rename from src/leveldb/db/dumpfile.cc
rename to src/leveldb/tools/leveldb_main.cc
index 61c47c2ff..995d76107 100644
--- a/src/leveldb/db/dumpfile.cc
+++ b/src/leveldb/tools/leveldb_main.cc
@@ -35,112 +35,93 @@ bool GuessType(const std::string& fname, FileType* type) {
 // Notified when log reader encounters corruption.
 class CorruptionReporter : public log::Reader::Reporter {
  public:
-  WritableFile* dst_;
   virtual void Corruption(size_t bytes, const Status& status) {
-    std::string r = "corruption: ";
-    AppendNumberTo(&r, bytes);
-    r += " bytes; ";
-    r += status.ToString();
-    r.push_back('\n');
-    dst_->Append(r);
+    printf("corruption: %d bytes; %s\n",
+            static_cast<int>(bytes),
+            status.ToString().c_str());
   }
 };
 
 // Print contents of a log file. (*func)() is called on every record.
-Status PrintLogContents(Env* env, const std::string& fname,
-                        void (*func)(uint64_t, Slice, WritableFile*),
-                        WritableFile* dst) {
+bool PrintLogContents(Env* env, const std::string& fname,
+                      void (*func)(Slice)) {
   SequentialFile* file;
   Status s = env->NewSequentialFile(fname, &file);
   if (!s.ok()) {
-    return s;
+    fprintf(stderr, "%s\n", s.ToString().c_str());
+    return false;
   }
   CorruptionReporter reporter;
-  reporter.dst_ = dst;
   log::Reader reader(file, &reporter, true, 0);
   Slice record;
   std::string scratch;
   while (reader.ReadRecord(&record, &scratch)) {
-    (*func)(reader.LastRecordOffset(), record, dst);
+    printf("--- offset %llu; ",
+           static_cast<unsigned long long>(reader.LastRecordOffset()));
+    (*func)(record);
   }
   delete file;
-  return Status::OK();
+  return true;
 }
 
 // Called on every item found in a WriteBatch.
 class WriteBatchItemPrinter : public WriteBatch::Handler {
  public:
-  WritableFile* dst_;
+  uint64_t offset_;
+  uint64_t sequence_;
+
   virtual void Put(const Slice& key, const Slice& value) {
-    std::string r = "  put '";
-    AppendEscapedStringTo(&r, key);
-    r += "' '";
-    AppendEscapedStringTo(&r, value);
-    r += "'\n";
-    dst_->Append(r);
+    printf("  put '%s' '%s'\n",
+           EscapeString(key).c_str(),
+           EscapeString(value).c_str());
   }
   virtual void Delete(const Slice& key) {
-    std::string r = "  del '";
-    AppendEscapedStringTo(&r, key);
-    r += "'\n";
-    dst_->Append(r);
+    printf("  del '%s'\n",
+           EscapeString(key).c_str());
   }
 };
 
 
 // Called on every log record (each one of which is a WriteBatch)
 // found in a kLogFile.
-static void WriteBatchPrinter(uint64_t pos, Slice record, WritableFile* dst) {
-  std::string r = "--- offset ";
-  AppendNumberTo(&r, pos);
-  r += "; ";
+static void WriteBatchPrinter(Slice record) {
   if (record.size() < 12) {
-    r += "log record length ";
-    AppendNumberTo(&r, record.size());
-    r += " is too small\n";
-    dst->Append(r);
+    printf("log record length %d is too small\n",
+           static_cast<int>(record.size()));
     return;
   }
   WriteBatch batch;
   WriteBatchInternal::SetContents(&batch, record);
-  r += "sequence ";
-  AppendNumberTo(&r, WriteBatchInternal::Sequence(&batch));
-  r.push_back('\n');
-  dst->Append(r);
+  printf("sequence %llu\n",
+         static_cast<unsigned long long>(WriteBatchInternal::Sequence(&batch)));
   WriteBatchItemPrinter batch_item_printer;
-  batch_item_printer.dst_ = dst;
   Status s = batch.Iterate(&batch_item_printer);
   if (!s.ok()) {
-    dst->Append("  error: " + s.ToString() + "\n");
+    printf("  error: %s\n", s.ToString().c_str());
   }
 }
 
-Status DumpLog(Env* env, const std::string& fname, WritableFile* dst) {
-  return PrintLogContents(env, fname, WriteBatchPrinter, dst);
+bool DumpLog(Env* env, const std::string& fname) {
+  return PrintLogContents(env, fname, WriteBatchPrinter);
 }
 
 // Called on every log record (each one of which is a WriteBatch)
 // found in a kDescriptorFile.
-static void VersionEditPrinter(uint64_t pos, Slice record, WritableFile* dst) {
-  std::string r = "--- offset ";
-  AppendNumberTo(&r, pos);
-  r += "; ";
+static void VersionEditPrinter(Slice record) {
   VersionEdit edit;
   Status s = edit.DecodeFrom(record);
   if (!s.ok()) {
-    r += s.ToString();
-    r.push_back('\n');
-  } else {
-    r += edit.DebugString();
+    printf("%s\n", s.ToString().c_str());
+    return;
   }
-  dst->Append(r);
+  printf("%s", edit.DebugString().c_str());
 }
 
-Status DumpDescriptor(Env* env, const std::string& fname, WritableFile* dst) {
-  return PrintLogContents(env, fname, VersionEditPrinter, dst);
+bool DumpDescriptor(Env* env, const std::string& fname) {
+  return PrintLogContents(env, fname, VersionEditPrinter);
 }
 
-Status DumpTable(Env* env, const std::string& fname, WritableFile* dst) {
+bool DumpTable(Env* env, const std::string& fname) {
   uint64_t file_size;
   RandomAccessFile* file = NULL;
   Table* table = NULL;
@@ -156,70 +137,102 @@ Status DumpTable(Env* env, const std::string& fname, WritableFile* dst) {
     s = Table::Open(Options(), file, file_size, &table);
   }
   if (!s.ok()) {
+    fprintf(stderr, "%s\n", s.ToString().c_str());
     delete table;
     delete file;
-    return s;
+    return false;
   }
 
   ReadOptions ro;
   ro.fill_cache = false;
   Iterator* iter = table->NewIterator(ro);
-  std::string r;
   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    r.clear();
     ParsedInternalKey key;
     if (!ParseInternalKey(iter->key(), &key)) {
-      r = "badkey '";
-      AppendEscapedStringTo(&r, iter->key());
-      r += "' => '";
-      AppendEscapedStringTo(&r, iter->value());
-      r += "'\n";
-      dst->Append(r);
+      printf("badkey '%s' => '%s'\n",
+             EscapeString(iter->key()).c_str(),
+             EscapeString(iter->value()).c_str());
     } else {
-      r = "'";
-      AppendEscapedStringTo(&r, key.user_key);
-      r += "' @ ";
-      AppendNumberTo(&r, key.sequence);
-      r += " : ";
+      char kbuf[20];
+      const char* type;
       if (key.type == kTypeDeletion) {
-        r += "del";
+        type = "del";
       } else if (key.type == kTypeValue) {
-        r += "val";
+        type = "val";
       } else {
-        AppendNumberTo(&r, key.type);
+        snprintf(kbuf, sizeof(kbuf), "%d", static_cast<int>(key.type));
+        type = kbuf;
       }
-      r += " => '";
-      AppendEscapedStringTo(&r, iter->value());
-      r += "'\n";
-      dst->Append(r);
+      printf("'%s' @ %8llu : %s => '%s'\n",
+             EscapeString(key.user_key).c_str(),
+             static_cast<unsigned long long>(key.sequence),
+             type,
+             EscapeString(iter->value()).c_str());
     }
   }
   s = iter->status();
   if (!s.ok()) {
-    dst->Append("iterator error: " + s.ToString() + "\n");
+    printf("iterator error: %s\n", s.ToString().c_str());
   }
 
   delete iter;
   delete table;
   delete file;
-  return Status::OK();
+  return true;
 }
 
-}  // namespace
-
-Status DumpFile(Env* env, const std::string& fname, WritableFile* dst) {
+bool DumpFile(Env* env, const std::string& fname) {
   FileType ftype;
   if (!GuessType(fname, &ftype)) {
-    return Status::InvalidArgument(fname + ": unknown file type");
+    fprintf(stderr, "%s: unknown file type\n", fname.c_str());
+    return false;
   }
   switch (ftype) {
-    case kLogFile:         return DumpLog(env, fname, dst);
-    case kDescriptorFile:  return DumpDescriptor(env, fname, dst);
-    case kTableFile:       return DumpTable(env, fname, dst);
-    default:
+    case kLogFile:         return DumpLog(env, fname);
+    case kDescriptorFile:  return DumpDescriptor(env, fname);
+    case kTableFile:       return DumpTable(env, fname);
+
+    default: {
+      fprintf(stderr, "%s: not a dump-able file type\n", fname.c_str());
       break;
+    }
   }
-  return Status::InvalidArgument(fname + ": not a dump-able file type");
+  return false;
 }
 
+bool HandleDumpCommand(Env* env, char** files, int num) {
+  bool ok = true;
+  for (int i = 0; i < num; i++) {
+    ok &= DumpFile(env, files[i]);
+  }
+  return ok;
+}
+
+}
 }  // namespace leveldb
+
+static void Usage() {
+  fprintf(
+      stderr,
+      "Usage: leveldbutil command...\n"
+      "   dump files...         -- dump contents of specified files\n"
+      );
+}
+
+int main(int argc, char** argv) {
+  leveldb::Env* env = leveldb::Env::Default();
+  bool ok = true;
+  if (argc < 2) {
+    Usage();
+    ok = false;
+  } else {
+    std::string command = argv[1];
+    if (command == "dump") {
+      ok = leveldb::HandleDumpCommand(env, argv+2, argc-2);
+    } else {
+      Usage();
+      ok = false;
+    }
+  }
+  return (ok ? 0 : 1);
+}
diff --git a/src/leveldb/tools/leveldb_repair.cc b/src/leveldb/tools/leveldb_repair.cc
new file mode 100644
index 000000000..a0cfb08fb
--- /dev/null
+++ b/src/leveldb/tools/leveldb_repair.cc
@@ -0,0 +1,99 @@
+#include <stdlib.h>
+#include <libgen.h>
+
+#include "db/filename.h"
+#include "leveldb/env.h"
+#include "leveldb/db.h"
+#include "leveldb/cache.h"
+#include "leveldb/iterator.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/slice.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "table/format.h"
+#include "table/block.h"
+#include "table/filter_block.h"
+
+//#include "util/logging.h"
+//#include "db/log_reader.h"
+
+void command_help();
+
+int
+main(
+    int argc,
+    char ** argv)
+{
+    bool error_seen, running;
+    int error_counter;
+    char ** cursor;
+
+    running=true;
+    error_seen=false;
+    error_counter=0;
+
+
+    for (cursor=argv+1; NULL!=*cursor && running; ++cursor)
+    {
+        // option flag?
+        if ('-'==**cursor)
+        {
+            char flag;
+
+            flag=*((*cursor)+1);
+            switch(flag)
+            {
+                default:
+                    fprintf(stderr, " option \'%c\' is not valid\n", flag);
+                    command_help();
+                    running=false;
+                    error_counter+=1;
+                    error_seen=true;
+                    break;
+            }   // switch
+        }   // if
+
+        // database path
+        else
+        {
+            std::string dbname;
+            leveldb::Options options;
+            leveldb::Status status;
+
+            dbname=*cursor;
+            options.env=leveldb::Env::Default();
+
+            status=leveldb::RepairDB(dbname.c_str(), options);
+            printf("Repair of %s %s.\n",
+                   dbname.c_str(),
+                   (status.ok() ? "successful" : "failed"));
+
+            if (!status.ok())
+            {
+                ++error_counter;
+                error_seen=true;
+            }   // if
+        }   // else
+    }   // for
+
+    if (1==argc)
+        command_help();
+
+    return( error_seen && 0!=error_counter ? 1 : 0 );
+
+}   // main
+
+
+void
+command_help()
+{
+    fprintf(stderr, "leveldb_repair [option | data_base]*\n");
+    fprintf(stderr, "  options\n");
+    fprintf(stderr, "      (none at this time)\n");
+}   // command_help
+
+namespace leveldb {
+
+
+}  // namespace leveldb
+
diff --git a/src/leveldb/tools/pbuilder.list b/src/leveldb/tools/pbuilder.list
new file mode 100644
index 000000000..bb9e82172
--- /dev/null
+++ b/src/leveldb/tools/pbuilder.list
@@ -0,0 +1,16 @@
+10.0.27.222    debian6
+10.0.27.239    debian7
+10.0.27.249    fedora19
+10.0.27.190    freebsd9.2
+10.0.27.220    rhel5
+10.0.27.221    rhel6
+10.0.27.248    rhel7
+10.0.27.251    sles11
+10.0.27.234    smartos1.8
+10.0.27.217    solaris10
+10.0.27.213    ubuntuLucid
+10.0.27.212    ubuntuPrecise
+10.0.27.250    ubuntuTrusty
+bsd-build.bos1 freebsd10
+mac-mini.bos1  osx10.8
+10.0.27.240    smartos13.1
diff --git a/src/leveldb/tools/pbuilder.sh b/src/leveldb/tools/pbuilder.sh
new file mode 100755
index 000000000..fd28dcf40
--- /dev/null
+++ b/src/leveldb/tools/pbuilder.sh
@@ -0,0 +1,99 @@
+#! /bin/bash
+## ./pbuilder.sh builder.list leveldb_tag
+##    or
+## ./pbuilder.sh builder.list      ""       tarfile.tgz
+##       $0         $1                $2           $3
+##
+## NOTE: you must manually ssh to each buildbot to get RSA fingerprint
+##       into your local known_hosts file before script works
+
+REPO=leveldb
+
+#
+# main
+#
+
+if [ $# == 2 ]; then
+    echo "2 parameters"
+    temp_path=$(mktemp)
+    temp_name=$(basename $temp_path)
+    echo "temp file " $temp_path
+    cat <<EOF >$temp_path
+rm -rf ~/$USER/$REPO
+mkdir -p ~/$USER
+cd ~/$USER
+echo " Git start:: " \$(date)
+git clone git@github.com:basho/$REPO
+cd $REPO
+git checkout $2
+export LD_LIBRARY_PATH=.
+
+echo "Make start: " \$(date)
+if which gmake
+then
+    if gmake -j 2 -s
+    then 
+       echo "Build successful."
+    else
+       echo "Build failed."
+       exit 1
+    fi
+    echo "Test start: " \$(date)
+    if gmake -j 2 -s check >/dev/null
+    then
+        echo "Test successful."
+    else
+        echo "Test failed."
+        exit 1
+    fi
+else
+    if make -j 2 -s
+    then 
+       echo "Build successful."
+    else
+       echo "Build failed."
+       exit 1
+    fi
+    echo "Test start: " \$(date)
+    if make -j 2 -s check >/dev/null
+    then
+        echo "Test successful."
+    else
+        echo "Test failed."
+        exit 1
+    fi
+fi
+echo "  Test end: " \$(date)
+EOF
+
+#
+#. /usr/local/erlang-r16b02/activate
+#echo "Build name: " $REPO\_$2_"\$1"
+#export build_name="$REPO\_$2_\$1"
+#echo "Again: $build_name"
+#env
+
+
+    chmod a+x $temp_path
+
+    mkdir -p ~/builds/$REPO
+    rm ~/builds/$REPO/out
+    rm ~/builds/$REPO/err
+    parallel --tag -a $1 --gnu --colsep '[ ]{1,}' scp $temp_path buildbot@{1}:~/.  >>~/builds/$REPO/out 2>>~/builds/$REPO/err
+    parallel --tag -a $1 --gnu --colsep '[ ]{1,}' ssh -q buildbot@{1} ./$temp_name {2} >>~/builds/$REPO/out 2>>~/builds/$REPO/err
+    parallel --tag -a $1 --gnu --colsep '[ ]{1,}' ssh -q buildbot@{1} rm $temp_name >>~/builds/$REPO/out 2>>~/builds/$REPO/err
+    echo "done"
+    rm $temp_path
+
+    grep 'Test successful.' ~/builds/$REPO/out
+    grep 'Test successful.' ~/builds/$REPO/out | wc -l
+    echo "Builder count: " $(wc -l $1)
+elif [ $# == 3 ]; then
+    echo "3 parameters"
+else
+    echo " ./pbuilder.sh leveldb_tag"
+    echo "    or"
+    echo " ./pbuilder.sh builder.list \"\" tarfile.tgz"
+fi
+
+exit 0
diff --git a/src/leveldb/tools/perf_dump.cc b/src/leveldb/tools/perf_dump.cc
new file mode 100644
index 000000000..551923268
--- /dev/null
+++ b/src/leveldb/tools/perf_dump.cc
@@ -0,0 +1,173 @@
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+
+#include "leveldb/env.h"
+#include "leveldb/perf_count.h"
+#include "port/port.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
+void command_help();
+
+int
+main(
+    int argc,
+    char ** argv)
+{
+    bool error_seen, csv_header, diff_mode, running;
+    int error_counter;
+    unsigned diff_seconds;
+    char ** cursor;
+
+    running=true;
+    error_seen=false;
+    error_counter=0;
+
+    csv_header=false;
+    diff_mode=false;
+    diff_seconds=1;
+
+
+    for (cursor=argv+1; NULL!=*cursor && running; ++cursor)
+    {
+        // option flag?
+        if ('-'==**cursor)
+        {
+            char flag;
+
+            flag=*((*cursor)+1);
+            switch(flag)
+            {
+                case 'h':  csv_header=true; break;
+                case 'd':
+                    diff_mode=true;
+                    ++cursor;
+                    diff_seconds=strtoul(*cursor, NULL, 10);
+                    break;
+
+                default:
+                    fprintf(stderr, " option \'%c\' is not valid\n", flag);
+                    command_help();
+                    running=false;
+                    error_counter=1;
+                    error_seen=true;
+                    break;
+            }   // switch
+        }   // if
+
+        // non flag params
+        else
+        {
+            fprintf(stderr, " option \'%s\' is not valid\n", *cursor);
+            command_help();
+            running=false;
+            error_counter=1;
+            error_seen=true;
+        }   // else
+    }   // for
+
+    // attach to shared memory if params looking good
+    if (!error_seen)
+    {
+        const leveldb::PerformanceCounters * perf_ptr;
+        bool first_pass;
+
+        first_pass=true;
+        perf_ptr=leveldb::PerformanceCounters::Init(true);
+
+        if (NULL!=perf_ptr)
+        {
+            uint64_t first_time;
+            int loop;
+
+            first_time=leveldb::port::TimeMicros();
+
+            if (csv_header)
+            {
+                csv_header=false;
+                printf("time, diff time, name, count\n");
+            }   // if
+
+            if (diff_mode)
+            {
+                uint64_t prev_counters[leveldb::ePerfCountEnumSize], cur_counters[leveldb::ePerfCountEnumSize];
+                uint64_t cur_time;
+
+                do
+                {
+                    // capture state before reporting
+                    cur_time=leveldb::port::TimeMicros();
+                    for (loop=0; loop<leveldb::ePerfCountEnumSize; ++loop)
+                    {
+                        cur_counters[loop]=perf_ptr->Value(loop);
+                    }   // for
+
+                    if (!first_pass)
+                    {
+                        for (loop=0; loop<leveldb::ePerfCountEnumSize; ++loop)
+                        {
+                            printf("%" PRIu64 ", %" PRIu64 ", %s, %" PRIu64 "\n",
+                                   cur_time, cur_time-first_time,
+                                   leveldb::PerformanceCounters::GetNamePtr(loop),
+                                   cur_counters[loop]-prev_counters[loop]);
+                        }   // for
+                    }   // if
+
+                    first_pass=false;
+
+                    // save for next pass
+                    //  (counters are "live" so use data previously reported to maintain some consistency)
+                    for (loop=0; loop<leveldb::ePerfCountEnumSize; ++loop)
+                    {
+                        prev_counters[loop]=cur_counters[loop];
+                    }   // for
+
+                    sleep(diff_seconds);
+                } while(true);
+            }   // if
+
+            // one time dump
+            else
+            {
+                for (loop=0; loop<leveldb::ePerfCountEnumSize; ++loop)
+                {
+                    printf("%" PRIu64 ", %u, %s, %" PRIu64 "\n",
+                           first_time, 0,
+                           leveldb::PerformanceCounters::GetNamePtr(loop),
+                           perf_ptr->Value(loop));
+                }   // for
+            }   // else
+        }   // if
+        else
+        {
+            fprintf(stderr, "unable to attach to shared memory, error %d\n",
+                    leveldb::PerformanceCounters::m_LastError);
+            ++error_counter;
+            error_seen=true;
+        }   // else
+    }   // if
+
+    if (error_seen)
+        command_help();
+
+    return( error_seen && 0!=error_counter ? 1 : 0 );
+
+}   // main
+
+
+void
+command_help()
+{
+    fprintf(stderr, "perf_dump [option]*\n");
+    fprintf(stderr, "  options\n");
+    fprintf(stderr, "      -h    print csv formatted header line (once)\n");
+    fprintf(stderr, "      -d n  print diff ever n seconds\n");
+}   // command_help
+
+namespace leveldb {
+
+
+}  // namespace leveldb
+
diff --git a/src/leveldb/tools/ppackager.sh b/src/leveldb/tools/ppackager.sh
new file mode 100755
index 000000000..46f912083
--- /dev/null
+++ b/src/leveldb/tools/ppackager.sh
@@ -0,0 +1,136 @@
+#! /bin/bash
+## ./ppackager.sh builder.list leveldb_tag
+##    or
+## ./ppackager.sh builder.list      ""       tarfile.tgz
+##       $0         $1                $2           $3
+##
+## NOTE: you must manually ssh to each buildbot to get RSA fingerprint
+##       into your local known_hosts file before script works
+
+REPO=eleveldb
+
+#
+# main
+#
+
+if [ $# == 2 ]; then
+    echo "2 parameters"
+    temp_path=$(mktemp)
+    temp_name=$(basename $temp_path)
+    echo "temp file " $temp_path
+    cat <<EOF >$temp_path
+rm -rf ~/$USER/$REPO
+mkdir -p ~/$USER
+cd ~/$USER
+echo " Git start:: " \$(date)
+git clone git@github.com:basho/$REPO
+cd $REPO
+git checkout $2
+sed -i -e 's/% #!sed//' rebar.config test/eleveldb_schema_tests.erl
+export LD_LIBRARY_PATH=.
+rm ~/$USER/eleveldb_$2\*
+
+. /usr/local/erlang-r16b02/activate
+
+echo "Make start: " \$(date)
+if hash gmake 2>/dev/null
+then
+    if gmake -j 2 -s
+    then
+       echo "Build successful."
+    else
+       echo "Build failed."
+       exit 1
+    fi
+    echo "Test start: " \$(date)
+    if gmake -s test >/dev/null
+    then
+        echo "Test successful."
+    else
+        echo "Test failed."
+        #exit 1
+    fi
+else
+    if make -j 2 -s
+    then
+       echo "Build successful."
+    else
+       echo "Build failed."
+       exit 1
+    fi
+    echo "Test start: " \$(date)
+    if make -s test >/dev/null
+    then
+        echo "Test successful."
+    else
+        echo "Test failed."
+        #exit 1
+    fi
+fi
+echo "  Test end: " \$(date)
+
+cd priv
+cp -p ../ebin/eleveldb.beam .
+
+# hack to deal with the fact that md5sum may be in a weird place on smartos
+export PATH=$PATH:/opt/local/gnu/bin
+
+if hash md5sum 2>/dev/null
+then
+    echo calling md5sum
+    md5sum eleveldb.beam eleveldb.so >md5sum.txt
+else
+    if hash md5 2>/dev/null
+    then
+        echo calling md5
+        md5 -r eleveldb.beam eleveldb.so >md5sum.txt
+    else
+        // solaris does not have the md5sum or md5 commands, so use digest
+        echo calling digest
+        digest -a md5 eleveldb.beam eleveldb.so >md5sum.txt
+    fi
+fi
+
+if uname -a | grep solaris >/dev/null
+then
+    echo running tar and gzip on solaris
+    tar cf - eleveldb.beam eleveldb.so md5sum.txt | gzip -c > ~/$USER/eleveldb_$2_\$1.tar.gz
+else
+    echo running gnu tar with -z option
+    tar -czf ~/$USER/eleveldb_$2_\$1.tar.gz eleveldb.beam eleveldb.so md5sum.txt
+fi
+
+EOF
+
+#
+#echo "Build name: " $REPO\_$2_"\$1"
+#export build_name="$REPO\_$2_\$1"
+#echo "Again: $build_name"
+#env
+
+
+    chmod a+x $temp_path
+
+    mkdir -p ~/builds/$REPO
+    rm ~/builds/$REPO/out
+    rm ~/builds/$REPO/err
+    rm ~/builds/$REPO/eleveldb_$2*
+    parallel --tag -a $1 --gnu --colsep '[ ]{1,}' scp $temp_path buildbot@{1}:~/.  >>~/builds/$REPO/out 2>>~/builds/$REPO/err
+    parallel --tag -a $1 --gnu --colsep '[ ]{1,}' ssh -q buildbot@{1} ./$temp_name {2} >>~/builds/$REPO/out 2>>~/builds/$REPO/err
+    parallel --tag -a $1 --gnu --colsep '[ ]{1,}' ssh -q buildbot@{1} rm $temp_name >>~/builds/$REPO/out 2>>~/builds/$REPO/err
+    parallel --tag -a $1 --gnu --colsep '[ ]{1,}' scp -q buildbot@{1}:~/$USER/eleveldb_$2\* ~/builds/$REPO/.
+    echo "done"
+    rm $temp_path
+
+    grep 'Test successful.' ~/builds/$REPO/out
+    grep 'Test successful.' ~/builds/$REPO/out | wc -l
+    echo "Packager count: " $(wc -l $1)
+elif [ $# == 3 ]; then
+    echo "3 parameters"
+else
+    echo " ./ppackager.sh leveldb_tag"
+    echo "    or"
+    echo " ./ppackager.sh builder.list \"\" tarfile.tgz"
+fi
+
+exit 0
diff --git a/src/leveldb/tools/sst_rewrite.cc b/src/leveldb/tools/sst_rewrite.cc
new file mode 100644
index 000000000..49f233038
--- /dev/null
+++ b/src/leveldb/tools/sst_rewrite.cc
@@ -0,0 +1,398 @@
+// -------------------------------------------------------------------
+//
+// sst_rewrite.cc
+//
+// Copyright (c) 2015 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <memory>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
+#include "leveldb/env.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/options.h"
+#include "leveldb/table.h"
+#include "leveldb/table_builder.h"
+
+void command_help();
+
+// wrapper class for opening / closing existing leveldb tables
+class LDbTable
+{
+public:
+    LDbTable(leveldb::Options &, std::string &);
+    virtual ~LDbTable();
+
+    bool Ok() const {return(m_IsOpen);};
+    leveldb::Iterator * NewIterator();
+
+    const leveldb::Status & GetStatus() const {return(m_LastStatus);};
+    const char * GetFileName() const {return(m_FileName.c_str());};
+
+    uint64_t GetSstCounter(unsigned Idx) const
+        {return(m_IsOpen ? m_TablePtr->GetSstCounters().Value(Idx) : 0);};
+
+protected:
+    leveldb::Options & m_Options;
+    const std::string m_FileName;
+    leveldb::RandomAccessFile * m_FilePtr;
+    leveldb::Table * m_TablePtr;
+    uint64_t m_FileSize;
+    leveldb::Status m_LastStatus;
+
+    bool m_IsOpen;
+
+    void Reset();
+
+private:
+    // disable these
+    LDbTable();
+    LDbTable(const LDbTable &);
+    const LDbTable operator=(const LDbTable&);
+};  // LDbTable
+
+
+LDbTable::LDbTable(
+    leveldb::Options & Options,
+    std::string & FileName)
+    : m_Options(Options), m_FileName(FileName),
+      m_FilePtr(NULL), m_TablePtr(NULL), m_FileSize(0), m_IsOpen(false)
+{
+    m_LastStatus=m_Options.env->GetFileSize(m_FileName, &m_FileSize);
+
+    if (m_LastStatus.ok())
+        {m_LastStatus=m_Options.env->NewRandomAccessFile(m_FileName, &m_FilePtr);}
+
+    if (m_LastStatus.ok())
+    {
+        m_LastStatus=leveldb::Table::Open(m_Options, m_FilePtr, m_FileSize, &m_TablePtr);
+
+        // use fadvise to start file pre-read
+        m_FilePtr->SetForCompaction(m_FileSize);
+    }   // if
+
+    m_IsOpen=m_LastStatus.ok();
+
+    if (!m_IsOpen)
+    {
+        // some people would throw() at this point, but not me
+        Reset();
+    }   // if
+
+    return;
+
+}   // LDbTable::LDbTable
+
+
+LDbTable::~LDbTable()
+{
+    Reset();
+
+    return;
+
+}   // LDbTable::~LDbTable
+
+
+void
+LDbTable::Reset()
+{
+    m_IsOpen=false;
+    delete m_TablePtr;
+    m_TablePtr=NULL;
+    delete m_FilePtr;
+    m_FilePtr=NULL;
+    m_FileSize=0;
+
+    return;
+
+}   // LDbTable::Reset
+
+
+leveldb::Iterator *
+LDbTable::NewIterator()
+{
+    leveldb::Iterator * ret_ptr(NULL);
+
+    if (m_IsOpen)
+    {
+        leveldb::ReadOptions read_options;
+
+        read_options.fill_cache=false;
+        ret_ptr=m_TablePtr->NewIterator(read_options);
+    }   // if
+
+    return(ret_ptr);
+
+}   // LDbTable::NewIterator
+
+
+int
+main(
+    int argc,
+    char ** argv)
+{
+    bool error_seen, running, compare_files;
+    char ** cursor;
+
+    compare_files=false;
+    error_seen=false;
+    running=true;
+
+    // Options: needs filter & total_leveldb_mem initialized
+    leveldb::Options options;
+
+    // using 16 bit width per key in bloom filter
+    options.filter_policy=leveldb::NewBloomFilterPolicy2(16);
+    // tell leveldb it can use 512Mbyte of memory
+    options.total_leveldb_mem=(512 << 20);
+
+    for (cursor=argv+1;
+         NULL!=*cursor && running && !error_seen;
+         ++cursor)
+    {
+        // option flag?
+        if ('-'==**cursor)
+        {
+            char flag;
+
+            flag=*((*cursor)+1);
+            switch(flag)
+            {
+                case 'b':
+                {
+                    error_seen=(NULL==(cursor+1));
+                    if (!error_seen)
+                    {
+                        ++cursor;
+                        options.block_size=atol(*cursor);
+                    };
+                    break;
+                }   // case b
+
+                case 's':  options.compression=leveldb::kSnappyCompression; break;
+                case 'z':  options.compression=leveldb::kLZ4Compression; break;
+                case 'n':  options.compression=leveldb::kNoCompression; break;
+
+                case 'c':
+                {
+                    // test for first pair ... but after that user beware
+                    error_seen=(NULL==(cursor+1)) || (NULL==(cursor+2));
+                    if (!error_seen)
+                         {compare_files=true;}
+                    break;
+                }   // case c
+
+                case 'w':  compare_files=false; break;
+
+                default:
+                    fprintf(stderr, " option \'%c\' is not valid\n", flag);
+                    command_help();
+                    running=false;
+                    error_seen=true;
+                    break;
+            }   // switch
+        }   // if
+
+        // sst file
+        else
+        {
+            std::string fname;
+            fname=*cursor;
+
+            // do a rewrite
+            if (!compare_files)
+            {
+                leveldb::WritableFile * outfile;
+                leveldb::Status s;
+                std::auto_ptr<leveldb::Iterator> it;
+                std::auto_ptr<leveldb::TableBuilder> builder;
+
+                LDbTable in_file(options, fname);
+
+                if (in_file.GetStatus().ok())
+                {
+                    it.reset(in_file.NewIterator());
+
+                    fname.append(".new");
+                    s = options.env->NewWritableFile(fname, &outfile,
+                                                     options.env->RecoveryMmapSize(&options));
+                    if (s.ok())
+                        builder.reset(new leveldb::TableBuilder(options, outfile));
+                    else
+                    {
+                        // Table::Open failed on file "fname"
+                        fprintf(stderr, "%s: NewWritableFile failed (%s)\n",
+                                fname.c_str(), s.ToString().c_str());
+                        error_seen=true;
+                    }   // else
+
+                    for (it->SeekToFirst();
+                         it->Valid() && s.ok() && builder->status().ok();
+                         it->Next())
+                    {
+                        leveldb::Slice key = it->key();
+                        builder->Add(key, it->value());
+                    }   // for
+
+                    // hmmm, nothing new setting status right now.
+                    if (s.ok() && builder->status().ok()) {
+                        s = builder->Finish();
+                    } else {
+                        builder->Abandon();
+                    }
+
+                    if (NULL!=outfile)
+                        outfile->Close();
+                    delete outfile;
+                }   // if
+                else
+                {
+                    fprintf(stderr, "%s: Input table open failed (%s)\n",
+                            fname.c_str(), in_file.GetStatus().ToString().c_str());
+                    error_seen=true;
+                }   // else
+            }   // if
+
+            // compare two files
+            else
+            {
+                LDbTable file1(options, fname);
+
+                ++cursor;
+                if (NULL!=*cursor)
+                {
+                    fname=*cursor;
+                    LDbTable file2(options, fname);
+
+                    if (file1.GetStatus().ok() && file2.GetStatus().ok())
+                    {
+                        // quick check: same number of keys and bytes of user data?
+                        //     do this before reading entire files
+                        if (file1.GetSstCounter(leveldb::eSstCountKeys)==file2.GetSstCounter(leveldb::eSstCountKeys)
+                            && file1.GetSstCounter(leveldb::eSstCountKeySize)==file2.GetSstCounter(leveldb::eSstCountKeySize)
+                            && file1.GetSstCounter(leveldb::eSstCountValueSize)==file2.GetSstCounter(leveldb::eSstCountValueSize))
+                        {
+                            leveldb::Iterator * it1, *it2;
+                            uint64_t key_count;
+                            bool match;
+
+                            it1=file1.NewIterator();
+                            it2=file2.NewIterator();
+                            match=true;
+
+                            for (it1->SeekToFirst(), it2->SeekToFirst(), key_count=1;
+                                 it1->Valid() && it2->Valid() && match;
+                                 it1->Next(), it2->Next(), ++key_count)
+                            {
+                                match=(0==it1->key().compare(it2->key())) && (0==it1->value().compare(it2->value()));
+
+                                if (!match)
+                                {
+                                    fprintf(stderr, "%s, %s: Content mismatch at key position %d (%d, %d).\n",
+                                            file1.GetFileName(), file2.GetFileName(),
+                                            (int)key_count,
+                                            it1->key().compare(it2->key()), it1->value().compare(it2->value()));
+                                    error_seen=true;
+                                }   // if
+
+                            }   // for
+
+                            if (it1->Valid() != it2->Valid())
+                            {
+                                fprintf(stderr, "%s, %s: Walk of keys terminated early (%d, %d).\n",
+                                        file1.GetFileName(), file2.GetFileName(),
+                                        (int)it1->Valid(), (int)it2->Valid());
+                                error_seen=true;
+                            }
+                        }   // if
+                        else
+                        {
+                            if (file1.GetSstCounter(leveldb::eSstCountKeys)==file2.GetSstCounter(leveldb::eSstCountKeys))
+                                fprintf(stderr, "%s, %s: Number of keys different, %" PRIu64 " vs %" PRIu64 ".\n",
+                                        file1.GetFileName(), file2.GetFileName(),
+                                        file1.GetSstCounter(leveldb::eSstCountKeys),
+                                        file2.GetSstCounter(leveldb::eSstCountKeys));
+
+                            if (file1.GetSstCounter(leveldb::eSstCountKeySize)==file2.GetSstCounter(leveldb::eSstCountKeySize))
+                                fprintf(stderr, "%s, %s: Byte size of all keys different, %" PRIu64 " vs %" PRIu64 "\n",
+                                        file1.GetFileName(), file2.GetFileName(),
+                                        file1.GetSstCounter(leveldb::eSstCountKeySize),
+                                        file2.GetSstCounter(leveldb::eSstCountKeySize));
+
+                            if (file1.GetSstCounter(leveldb::eSstCountValueSize)==file2.GetSstCounter(leveldb::eSstCountValueSize))
+                                fprintf(stderr, "%s, %s: Byte size of all values different, %" PRIu64 " vs %" PRIu64 "\n",
+                                        file1.GetFileName(), file2.GetFileName(),
+                                        file1.GetSstCounter(leveldb::eSstCountValueSize),
+                                        file2.GetSstCounter(leveldb::eSstCountValueSize));
+                            error_seen=true;
+                        }   // else
+                    }   // if
+                    else
+                    {
+                        if (!file1.GetStatus().ok())
+                            fprintf(stderr, "%s: Input table open failed (%s)\n",
+                                    file1.GetFileName(), file1.GetStatus().ToString().c_str());
+                        if (!file2.GetStatus().ok())
+                            fprintf(stderr, "%s: Input table open failed (%s)\n",
+                                    file2.GetFileName(), file2.GetStatus().ToString().c_str());
+                        error_seen=true;
+                    }   // else
+                }   // if
+                else
+                {
+                    fprintf(stderr, "%s: compare needs two file names, only have one\n",
+                            fname.c_str());
+                }   // else
+            }   // else
+        }   // else
+    }   // for
+
+    // cleanup
+    options.env->Shutdown();
+    delete options.filter_policy;
+
+    if (1==argc)
+        command_help();
+
+    return( error_seen ? 1 : 0 );
+
+}   // main
+
+
+void
+command_help()
+{
+    fprintf(stderr, "sst_rewrite [option | file]*\n");
+    fprintf(stderr, "  options\n");
+    fprintf(stderr, "      -b  value  set Options.block_size to value\n");
+    fprintf(stderr, "      -n  set Options.compression to No compression\n");
+    fprintf(stderr, "      -s  set Options.compression to Snappy compression\n");
+    fprintf(stderr, "      -z  set Options.compression to LZ4 compression\n");
+    fprintf(stderr, "      -c  compare next two files (inverse of -w)\n");
+    fprintf(stderr, "      -w  rewrite next file (default, inverse of -c)\n");
+}   // command_help
+
+namespace leveldb {
+
+
+}  // namespace leveldb
+
diff --git a/src/leveldb/tools/sst_scan.cc b/src/leveldb/tools/sst_scan.cc
new file mode 100644
index 000000000..3c93a9b42
--- /dev/null
+++ b/src/leveldb/tools/sst_scan.cc
@@ -0,0 +1,563 @@
+// -------------------------------------------------------------------
+//
+// sst_scan.cc
+//
+// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <stdlib.h>
+#include <libgen.h>
+#include <arpa/inet.h>
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
+#include "db/filename.h"
+#include "leveldb/env.h"
+#include "leveldb/db.h"
+#include "leveldb/cache.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/slice.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "table/format.h"
+#include "table/block.h"
+#include "table/filter_block.h"
+#include "util/cache2.h"
+
+//#include "leveldb_ee/riak_object.h"
+
+//#include "util/logging.h"
+//#include "db/log_reader.h"
+
+void command_help();
+bool PrintSextKey(leveldb::Slice & Cursor, int Limit=1);
+bool PrintSextAtom(leveldb::Slice & Cursor);
+void PrintInternalKeyInfo(leveldb::ParsedInternalKey & ParsedKey);
+
+int
+main(
+    int argc,
+    char ** argv)
+{
+    bool error_seen, index_keys, all_keys, block_info, csv_header, counter_info,
+        running, no_csv, summary_only, riak_translations, value_dump;
+    int error_counter;
+    char ** cursor;
+
+    running=true;
+    error_seen=false;
+
+    block_info=false;
+    counter_info=false;
+    index_keys=false;
+    csv_header=false;
+    all_keys=false;
+    no_csv=false;
+    summary_only=false;
+    riak_translations=false;
+    value_dump=false;
+
+    error_counter=0;
+
+
+    for (cursor=argv+1; NULL!=*cursor && running; ++cursor)
+    {
+        // option flag?
+        if ('-'==**cursor)
+        {
+            char flag;
+
+            flag=*((*cursor)+1);
+            switch(flag)
+            {
+                case 'b':  block_info=true; break;
+                case 'c':  counter_info=true; break;
+                case 'h':  csv_header=true; break;
+                case 'i':  index_keys=true; break;
+                case 'k':  all_keys=true; break;
+                case 'n':  no_csv=true; break;
+                case 'r':  riak_translations=true; break;
+                case 's':  summary_only=true; break;
+                case 'v':  all_keys=true; value_dump=true; break;
+                default:
+                    fprintf(stderr, " option \'%c\' is not valid\n", flag);
+                    command_help();
+                    running=false;
+                    error_counter=1;
+                    error_seen=true;
+                    break;
+            }   // switch
+        }   // if
+
+        // sst file
+        else
+        {
+            leveldb::Options options;
+            leveldb::DoubleCache double_cache(options);
+            leveldb::ReadOptions read_options;
+            std::string table_name, dbname, path_temp;
+            leveldb::Env * env;
+            leveldb::FileMetaData meta;
+            leveldb::TableCache * table_cache;
+            env=leveldb::Env::Default();
+
+            const int search_level = -2;
+            const bool is_overlapped = search_level < 3; // temporary: see TableCache::Evict()
+
+            // make copy since basename() and dirname() may modify
+            path_temp=*cursor;
+            dbname=dirname((char *)path_temp.c_str());
+            dbname=MakeTieredDbname(dbname, options);
+            path_temp=*cursor;
+            table_name=basename((char *)path_temp.c_str());
+            meta.number=strtol(table_name.c_str(), NULL, 10);
+
+            options.filter_policy=leveldb::NewBloomFilterPolicy(10);
+            table_cache=new leveldb::TableCache(dbname, &options, double_cache.GetFileCache(), double_cache);
+            table_name = leveldb::TableFileName(options, meta.number, search_level);
+
+            // open table, step 1 get file size
+            leveldb::Status status = env->GetFileSize(table_name, &meta.file_size);
+            if (!status.ok())
+            {
+                fprintf(stderr, "%s: GetFileSize failed (%s)\n", table_name.c_str(),status.ToString().c_str());
+                error_seen=true;
+                error_counter=10;
+            }   // if
+
+            //open table, step 2 find table (cache or open)
+            if (status.ok())
+            {
+                leveldb::Cache::Handle * fhandle;
+
+                fhandle=NULL;
+
+                status=table_cache->TEST_FindTable(meta.number, meta.file_size, search_level, &fhandle);
+
+                // count keys and size keys/filter
+                if (status.ok())
+                {
+                    leveldb::Table* table;
+                    leveldb::Iterator *it, *it2;
+                    int count, count2, total, block_count;
+                    size_t tot_size, smallest_block, tot_compress, tot_uncompress;
+                    bool first;
+                    leveldb::Status status;
+                    leveldb::RandomAccessFile * file;
+
+                    total=0;
+                    count=0;
+                    count2=0;
+                    tot_size=0;
+
+                    table = reinterpret_cast<leveldb::TableAndFile*>(table_cache->TEST_GetInternalCache()->Value(fhandle))->table;
+                    table->ReadFilter();
+                    file = reinterpret_cast<leveldb::TableAndFile*>(table_cache->TEST_GetInternalCache()->Value(fhandle))->file;
+                    it = table->TEST_GetIndexBlock()->NewIterator(options.comparator);
+
+
+                    // walk keys in index block
+                    if (index_keys)
+                    {
+                        for (it->SeekToFirst(), count=0; it->Valid(); it->Next())
+                        {
+                            ++count;
+                            if (it->status().ok())
+                            {
+                                leveldb::ParsedInternalKey parsed;
+                                leveldb::Slice key = it->key();
+                                leveldb::Slice value = it->value();
+
+                                ParseInternalKey(key, &parsed);
+                                printf("key %zd, value %zd: %s\n", key.size(), value.size(), parsed.DebugStringHex().c_str());
+                            }   // if
+                            else
+                            {
+                                fprintf(stderr, "%s: index iterator failed (%s)\n", table_name.c_str(),it->status().ToString().c_str());
+                            }   // else
+                        }   // for
+                    }   // if
+
+                    // Walk all blocks (but nothing within block)
+                    smallest_block=0;
+                    first=true;
+                    block_count=0;
+                    tot_compress=0;
+                    tot_uncompress=0;
+
+                    for (it->SeekToFirst(), count=0; it->Valid() && !summary_only; it->Next())
+                    {
+                        leveldb::BlockContents contents;
+                        leveldb::BlockHandle bhandle;
+                        leveldb::Slice slice;
+
+                        ++block_count;
+                        slice=it->value();
+                        bhandle.DecodeFrom(&slice);
+
+                        if (block_info)
+                        {
+                            printf("block %d, offset %" PRIu64 ", size %" PRIu64 ", next %" PRIu64 "\n",
+                                   block_count, bhandle.offset(), bhandle.size(), bhandle.offset()+bhandle.size());
+                        }   // if
+
+                        tot_compress+=bhandle.size();
+                        status=leveldb::ReadBlock(file, read_options, bhandle, &contents);
+                        if (status.ok())
+                        {
+                            if (first)
+                            {
+                                first=false;
+                                smallest_block=contents.data.size();
+                            }   // if
+                            else if (contents.data.size()<smallest_block)
+                            {
+                                smallest_block=contents.data.size();
+                            }   // else if
+                            tot_uncompress+=contents.data.size();
+
+                            if (contents.heap_allocated)
+                            {
+                                delete [] contents.data.data();
+                            }   // if
+                        }   // if
+                        else
+                        {
+                            fprintf(stderr, "ReadBlock failed on block %d\n", block_count);
+                        }   // else
+                    }   // for
+
+                    // Walk all keys in each block.
+                    for (it->SeekToFirst(), count=0; it->Valid() && !summary_only; it->Next())
+                    {
+                        ++count;
+                        it2=leveldb::Table::TEST_BlockReader(table, read_options, it->value());
+                        for (it2->SeekToFirst(), count2=0; it2->Valid(); it2->Next())
+                        {
+                            ++count2;
+                            ++total;
+                            if (it2->status().ok())
+                            {
+                                tot_size+=it2->value().size();
+
+                                if (all_keys)
+                                {
+                                    leveldb::ParsedInternalKey parsed;
+                                    leveldb::Slice key = it2->key();
+
+                                    ParseInternalKey(key, &parsed);
+                                    printf("%s block_key %s\n", parsed.DebugStringHex().c_str(), table_name.c_str());
+
+                                    if (riak_translations && '\x10'==*parsed.user_key.data())
+                                    {
+                                        leveldb::Slice cursor_slice;
+
+                                        cursor_slice=parsed.user_key;
+                                        printf("     ");
+                                        PrintSextKey(cursor_slice);
+                                        printf("\n");
+                                        printf("     ");
+                                        PrintInternalKeyInfo(parsed);
+                                        printf("\n");
+
+                                        cursor_slice=parsed.user_key;
+                                    }   // if
+
+                                    if (value_dump)
+                                    {
+                                        printf("  %s\n", HexString(it2->value()).c_str());
+                                    }   // if
+                                }   // if
+                            }   // if
+                            else
+                            {
+                                fprintf(stderr, "%s: value iterator failed, location [%d, %d] (%s)\n",
+                                       table_name.c_str(),count, count2,it2->status().ToString().c_str());
+                            }   // else
+                        }   // for
+
+                        delete it2;
+                    }   // for
+
+                    delete it;
+
+                    if (!no_csv)
+                    {
+                        if (csv_header)
+                        {
+                            csv_header=false;
+                            printf("Table File, File size, Index size, Index key count, ");
+                            printf("total key count, total value size, average value size, smallest block, ratio*100, ");
+                            printf("table object size, filter size");
+
+                            if (counter_info)
+                            {
+                                unsigned loop;
+                                leveldb::SstCounters counters;
+
+                                counters=table->GetSstCounters();
+
+                                for (loop=0; loop<counters.Size(); ++loop)
+                                    printf(", Counter %u", loop);
+                            }   // if
+
+                            printf("\n");
+                        }   // if
+
+                        printf("%s, %" PRIu64 ", %zd, %d,",
+                               table_name.c_str(), meta.file_size, table->TEST_GetIndexBlock()->size(), count);
+
+                        printf(" %d, %zd, %zd, %zd, %zd,",
+                               total, tot_size, (0!=total) ? tot_size/total : 0, smallest_block,
+                               (0!=tot_compress) ? (tot_uncompress*100)/tot_compress: 0);
+
+                        printf(" %zd, %zd",
+                               table->TEST_TableObjectSize(), table->TEST_FilterDataSize());
+
+                        if (counter_info || summary_only)
+                        {
+                            unsigned loop;
+                            leveldb::SstCounters counters;
+
+                            counters=table->GetSstCounters();
+
+                            for (loop=0; loop<counters.Size(); ++loop)
+                                printf(", %" PRIu64 "", counters.Value(loop));
+                        }   // if
+
+                        printf("\n");
+                    }   // if
+
+                    // cleanup
+                    table_cache->Evict(meta.number, is_overlapped);
+                }   // if
+                else
+                {
+                    fprintf(stderr, "%s: FindTable failed (%s)\n", table_name.c_str(),status.ToString().c_str());
+                    error_seen=true;
+                    error_counter=1;
+                }   // else
+            }   // if
+
+            // cleanup
+            delete table_cache;
+            delete options.filter_policy;
+
+        }   // else
+    }   // for
+
+    if (1==argc)
+        command_help();
+
+    return( error_seen && 0!=error_counter ? 1 : 0 );
+
+}   // main
+
+
+void
+command_help()
+{
+    fprintf(stderr, "sst_scan [option | file]*\n");
+    fprintf(stderr, "  options\n");
+    fprintf(stderr, "      -b  print details about block\n");
+    fprintf(stderr, "      -c  print sst counters\n");
+    fprintf(stderr, "      -h  print csv formatted header line (once)\n");
+    fprintf(stderr, "      -i  print index keys\n");
+    fprintf(stderr, "      -k  print all keys\n");
+    fprintf(stderr, "      -n  NO csv data (or header)\n");
+    fprintf(stderr, "      -r  print riak translations\n");
+    fprintf(stderr, "      -v  print all keys and their values\n");
+
+}   // command_help
+
+
+/**
+ * Recursive routine to give idea of key contents
+ */
+bool
+PrintSextKey(
+    leveldb::Slice & Cursor,
+    int Limit)
+{
+    int loop;
+    bool good(true);
+
+    for (loop=0; loop<Limit && good; ++loop)
+    {
+        if (0<loop)
+            printf(",");
+
+        switch(*Cursor.data())
+        {
+            case(16):   // tuple
+            {
+                uint32_t count;
+
+                Cursor.remove_prefix(1);
+                count=ntohl(*(uint32_t*)Cursor.data());
+                Cursor.remove_prefix(4);
+
+                printf("{");
+                good=PrintSextKey(Cursor, count);
+                printf("}");
+                break;
+            }   // tuple
+
+            case(12):   // atom
+            {
+                Cursor.remove_prefix(1);
+                good=PrintSextAtom(Cursor);
+                break;
+            }   // atom
+
+            case(18):   // binary
+            {
+                Cursor.remove_prefix(1);
+                printf("<<");
+                good=PrintSextAtom(Cursor);
+                printf(">>");
+                break;
+            }   // atom
+        }   // switch
+    }   // for
+
+    return(good);
+
+}   // PrintSextKey
+
+
+bool
+PrintSextAtom(
+    leveldb::Slice & Cursor)
+{
+    bool good(true);
+    uint8_t mask(0x80);
+    char output;
+
+    while(good && (uint8_t)*Cursor.data() & mask)
+    {
+        // this could be done easier with variables instead of fixed constants
+        switch(mask)
+        {
+            case(0x80):
+            {
+                output=(*Cursor.data() & 0x7f) << 1;
+                Cursor.remove_prefix(1);
+                output+=(*Cursor.data() & 0x80) >> 7;
+                printf("%c",output);
+                mask=0x40;
+                break;
+            }
+
+            case(0x40):
+            {
+                output=(*Cursor.data() & 0x3f) << 2;
+                Cursor.remove_prefix(1);
+                output+=(*Cursor.data() & 0xc0) >> 6;
+                printf("%c",output);
+                mask=0x20;
+                break;
+            }
+
+            case(0x20):
+            {
+                output=(*Cursor.data() & 0x1f) << 3;
+                Cursor.remove_prefix(1);
+                output+=(*Cursor.data() & 0xe0) >> 5;
+                printf("%c",output);
+                mask=0x10;
+                break;
+            }
+
+            case(0x10):
+            {
+                output=(*Cursor.data() & 0x0f) << 4;
+                Cursor.remove_prefix(1);
+                output+=(*Cursor.data() & 0xf0) >> 4;
+                printf("%c",output);
+                mask=0x08;
+                break;
+            }
+
+            case(0x08):
+            {
+                output=(*Cursor.data() & 0x07) << 5;
+                Cursor.remove_prefix(1);
+                output+=(*Cursor.data() & 0xf8) >> 3;
+                printf("%c",output);
+                mask=0x04;
+                break;
+            }
+
+            case(0x04):
+            {
+                output=(*Cursor.data() & 0x03) << 6;
+                Cursor.remove_prefix(1);
+                output+=(*Cursor.data() & 0xfc) >> 2;
+                printf("%c",output);
+                mask=0x02;
+                break;
+            }
+
+            case(0x02):
+            {
+                output=(*Cursor.data() & 0x01) << 7;
+                Cursor.remove_prefix(1);
+                output+=(*Cursor.data() & 0xfe) >> 1;
+                printf("%c",output);
+                mask=0x01;
+                break;
+            }
+
+            case(0x01):
+            {
+                Cursor.remove_prefix(1);
+                output=*Cursor.data();
+                Cursor.remove_prefix(1);
+                printf("%c",output);
+                mask=0x80;
+                break;
+            }
+        }   // switch
+
+    }   // while
+
+    Cursor.remove_prefix(2);
+
+    return(good);
+
+}   // PrintSextAtom
+
+
+void
+PrintInternalKeyInfo(
+    leveldb::ParsedInternalKey & ParsedKey)
+{
+    printf("%s, seq: %" PRIu64, leveldb::KeyTypeString(ParsedKey.type), ParsedKey.sequence);
+
+    if (leveldb::IsExpiryKey(ParsedKey.type))
+        printf(", expiry: %" PRIu64, ParsedKey.expiry);
+
+}   // PrintInternalKeyInfo
+
+namespace leveldb {
+
+
+}  // namespace leveldb
+
diff --git a/src/leveldb/util/arena.cc b/src/leveldb/util/arena.cc
index 74078213e..9551d6a3a 100644
--- a/src/leveldb/util/arena.cc
+++ b/src/leveldb/util/arena.cc
@@ -9,7 +9,8 @@ namespace leveldb {
 
 static const int kBlockSize = 4096;
 
-Arena::Arena() : memory_usage_(0) {
+Arena::Arena() {
+  blocks_memory_ = 0;
   alloc_ptr_ = NULL;  // First allocation will allocate a block
   alloc_bytes_remaining_ = 0;
 }
@@ -39,7 +40,7 @@ char* Arena::AllocateFallback(size_t bytes) {
 }
 
 char* Arena::AllocateAligned(size_t bytes) {
-  const int align = (sizeof(void*) > 8) ? sizeof(void*) : 8;
+  const int align = sizeof(void*);    // We'll align to pointer size
   assert((align & (align-1)) == 0);   // Pointer size should be a power of 2
   size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1);
   size_t slop = (current_mod == 0 ? 0 : align - current_mod);
@@ -59,9 +60,8 @@ char* Arena::AllocateAligned(size_t bytes) {
 
 char* Arena::AllocateNewBlock(size_t block_bytes) {
   char* result = new char[block_bytes];
+  blocks_memory_ += block_bytes;
   blocks_.push_back(result);
-  memory_usage_.NoBarrier_Store(
-      reinterpret_cast<void*>(MemoryUsage() + block_bytes + sizeof(char*)));
   return result;
 }
 
diff --git a/src/leveldb/util/arena.h b/src/leveldb/util/arena.h
index 48bab3374..8f7dde226 100644
--- a/src/leveldb/util/arena.h
+++ b/src/leveldb/util/arena.h
@@ -5,11 +5,10 @@
 #ifndef STORAGE_LEVELDB_UTIL_ARENA_H_
 #define STORAGE_LEVELDB_UTIL_ARENA_H_
 
+#include <cstddef>
 #include <vector>
 #include <assert.h>
-#include <stddef.h>
 #include <stdint.h>
-#include "port/port.h"
 
 namespace leveldb {
 
@@ -25,9 +24,10 @@ class Arena {
   char* AllocateAligned(size_t bytes);
 
   // Returns an estimate of the total memory usage of data allocated
-  // by the arena.
+  // by the arena (including space allocated but not yet used for user
+  // allocations).
   size_t MemoryUsage() const {
-    return reinterpret_cast<uintptr_t>(memory_usage_.NoBarrier_Load());
+    return blocks_memory_ + blocks_.capacity() * sizeof(char*);
   }
 
  private:
@@ -41,8 +41,8 @@ class Arena {
   // Array of new[] allocated memory blocks
   std::vector<char*> blocks_;
 
-  // Total memory usage of the arena.
-  port::AtomicPointer memory_usage_;
+  // Bytes of memory in blocks allocated so far
+  size_t blocks_memory_;
 
   // No copying allowed
   Arena(const Arena&);
diff --git a/src/leveldb/util/arena_test.cc b/src/leveldb/util/arena_test.cc
index 58e870ec4..63d177803 100644
--- a/src/leveldb/util/arena_test.cc
+++ b/src/leveldb/util/arena_test.cc
@@ -40,7 +40,7 @@ TEST(ArenaTest, Simple) {
       r = arena.Allocate(s);
     }
 
-    for (size_t b = 0; b < s; b++) {
+    for (int b = 0; b < s; b++) {
       // Fill the "i"th allocation with a known bit pattern
       r[b] = i % 256;
     }
@@ -51,10 +51,10 @@ TEST(ArenaTest, Simple) {
       ASSERT_LE(arena.MemoryUsage(), bytes * 1.10);
     }
   }
-  for (size_t i = 0; i < allocated.size(); i++) {
+  for (int i = 0; i < allocated.size(); i++) {
     size_t num_bytes = allocated[i].first;
     const char* p = allocated[i].second;
-    for (size_t b = 0; b < num_bytes; b++) {
+    for (int b = 0; b < num_bytes; b++) {
       // Check the "i"th allocation for the known bit pattern
       ASSERT_EQ(int(p[b]) & 0xff, i % 256);
     }
diff --git a/src/leveldb/util/bloom.cc b/src/leveldb/util/bloom.cc
index bf3e4ca6e..1cb63d1c3 100644
--- a/src/leveldb/util/bloom.cc
+++ b/src/leveldb/util/bloom.cc
@@ -2,8 +2,10 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include <memory>
 #include "leveldb/filter_policy.h"
 
+#include "db/dbformat.h"
 #include "leveldb/slice.h"
 #include "util/hash.h"
 
@@ -29,7 +31,7 @@ class BloomFilterPolicy : public FilterPolicy {
   }
 
   virtual const char* Name() const {
-    return "leveldb.BuiltinBloomFilter2";
+    return "leveldb.BuiltinBloomFilter";
   }
 
   virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
@@ -47,7 +49,7 @@ class BloomFilterPolicy : public FilterPolicy {
     dst->resize(init_size + bytes, 0);
     dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
     char* array = &(*dst)[init_size];
-    for (int i = 0; i < n; i++) {
+    for (size_t i = 0; i < (size_t)n; i++) {
       // Use double-hashing to generate a sequence of hash values.
       // See analysis in [Kirsch,Mitzenmacher 2006].
       uint32_t h = BloomHash(keys[i]);
@@ -92,4 +94,19 @@ const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) {
   return new BloomFilterPolicy(bits_per_key);
 }
 
+// container to hold one bloom filter and auto destruct
+struct BloomInventoryItem
+{
+    std::auto_ptr<const FilterPolicy> m_Item;
+
+    BloomInventoryItem()
+    {
+        m_Item.reset(new InternalFilterPolicy2(NewBloomFilterPolicy(16)));
+        FilterInventory::AddFilterToInventory(m_Item.get());
+    };
+};  // struct BloomInventoryItem
+
+// bloom filter for reading, created on start-up
+static BloomInventoryItem lBloomItem;
+
 }  // namespace leveldb
diff --git a/src/leveldb/util/bloom2.cc b/src/leveldb/util/bloom2.cc
new file mode 100644
index 000000000..5ffb2840c
--- /dev/null
+++ b/src/leveldb/util/bloom2.cc
@@ -0,0 +1,1447 @@
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+#include "leveldb/filter_policy.h"
+
+#include "db/dbformat.h"
+#include "leveldb/slice.h"
+#include "util/hash.h"
+#include "util/murmurhash.h"
+
+namespace leveldb {
+
+static unsigned Bytes2Prime(unsigned Bytes);
+static unsigned Bits2PrimeNBytes(unsigned Bits, unsigned & BytesOut);
+
+
+namespace {
+static uint32_t BloomHash0(const Slice& key) {
+  return Hash(key.data(), key.size(), 0xbc9f1d34);
+}
+
+static uint32_t BloomHash1(const Slice& key) {
+    return((uint32_t)MurmurHash(key.data(), key.size(), 0x5bd1e995));
+}
+
+class BloomFilterPolicy2 : public FilterPolicy {
+ private:
+  size_t bits_per_key_;
+  size_t k_;
+
+ public:
+  explicit BloomFilterPolicy2(int bits_per_key)
+      : bits_per_key_(bits_per_key) {
+    // We intentionally round down to reduce probing cost a little bit
+    k_ = static_cast<size_t>(bits_per_key * 0.69);  // 0.69 =~ ln(2)
+    if (k_ < 1) k_ = 1;
+    if (k_ > 30) k_ = 30;
+  }
+
+  virtual const char* Name() const {
+    return "leveldb.BuiltinBloomFilter2";
+  }
+
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const
+  {
+    unsigned bytes;
+
+    // Compute bloom filter size (in both bits and bytes)
+    size_t bits = n * bits_per_key_;
+
+    // For small n, we can see a very high false positive rate.  Fix it
+    // by enforcing a minimum bloom filter length.
+    if (bits < 61) bits = 61;
+
+    const unsigned prime=Bits2PrimeNBytes(bits, bytes);
+
+    const size_t init_size = dst->size();
+    dst->resize(init_size + bytes, 0);
+    dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
+    char* array = &(*dst)[init_size];
+    for (size_t i = 0; i < (size_t)n; i++) {
+      // Use double-hashing to generate a sequence of hash values.
+      // See analysis in [Kirsch,Mitzenmacher 2006].
+      uint32_t h = BloomHash0(keys[i]);
+      uint32_t h2= BloomHash1(keys[i]);
+      const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+      for (size_t j = 0; j < k_; j++) {
+          const uint32_t bitpos = (h + ((j+1)*h2)) % prime;
+        array[bitpos/8] |= (1 << (bitpos % 8));
+        h += delta;
+      }
+    }
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const {
+    const size_t len = bloom_filter.size();
+    if (len < 2) return false;
+
+    const char* array = bloom_filter.data();
+    const unsigned prime=Bytes2Prime(len-1);
+
+    // Use the encoded k so that we can read filters generated by
+    // bloom filters created using different parameters.
+    const size_t k = array[len-1];
+    if (k > 30) {
+      // Reserved for potentially new encodings for short bloom filters.
+      // Consider it a match.
+      return true;
+    }
+
+    uint32_t h = BloomHash0(key);
+    uint32_t h2= BloomHash1(key);
+    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+    for (size_t j = 0; j < k; j++) {
+      const uint32_t bitpos = (h + ((j+1)*h2)) % prime;
+      if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
+      h += delta;
+    }
+    return true;
+  }
+};
+}
+
+const FilterPolicy* NewBloomFilterPolicy2(int bits_per_key) {
+  return new BloomFilterPolicy2(bits_per_key);
+}
+
+
+// container to hold one bloom filter and auto destruct
+struct BloomInventoryItem2
+{
+    std::auto_ptr<const FilterPolicy> m_Item;
+
+    BloomInventoryItem2()
+    {
+        m_Item.reset(new InternalFilterPolicy2(NewBloomFilterPolicy2(16)));
+        FilterInventory::AddFilterToInventory(m_Item.get());
+    };
+};  // struct BloomInventoryItem2
+
+// bloom filter for reading, created on start-up
+static BloomInventoryItem2 lBloom2Item;
+
+
+
+// sparse table of primes where index to array is count
+//  of bytes to contain the prime number of bits
+// The prime table only helps where key count is roughly 6,250
+//  or less.  It adds accuracy to smaller populations.
+static unsigned ByteSizePrimes[]=
+{
+ 0, 7, 13, 23, 31, 37, 47, 53, 61, 71,
+ 79, 83, 89, 103, 109, 113, 127, 131, 139, 151,
+ 157, 167, 173, 181, 191, 199, 199, 211, 223, 229,
+ 239, 241, 251, 263, 271, 277, 283, 293, 293, 311,
+ 317, 317, 331, 337, 349, 359, 367, 373, 383, 389,
+ 397, 401, 409, 421, 431, 439, 443, 449, 463, 467,
+ 479, 487, 491, 503, 509, 509, 523, 523, 541, 547,
+ 557, 563, 571, 577, 587, 599, 607, 613, 619, 631,
+ 631, 647, 653, 661, 661, 677, 683, 691, 701, 709,
+ 719, 727, 733, 743, 751, 757, 761, 773, 773, 787,
+ 797, 797, 811, 823, 829, 839, 839, 853, 863, 863,
+ 877, 887, 887, 887, 911, 919, 919, 929, 941, 947,
+ 953, 967, 971, 983, 991, 997, 997, 1013, 1021, 1031,
+ 1039, 1039, 1051, 1063, 1069, 1069, 1087, 1093, 1103, 1109,
+ 1117, 1123, 1129, 1129, 1151, 1153, 1163, 1171, 1181, 1187,
+ 1193, 1201, 1213, 1223, 1231, 1237, 1237, 1249, 1259, 1259,
+ 1279, 1283, 1291, 1303, 1307, 1319, 1327, 1327, 1327, 1327,
+ 1327, 1367, 1373, 1381, 1381, 1399, 1399, 1409, 1423, 1429,
+ 1439, 1447, 1453, 1459, 1471, 1471, 1487, 1493, 1499, 1511,
+ 1511, 1523, 1531, 1543, 1549, 1559, 1567, 1571, 1583, 1583,
+ 1597, 1607, 1613, 1621, 1627, 1637, 1637, 1637, 1663, 1669,
+ 1669, 1669, 1693, 1699, 1709, 1709, 1723, 1733, 1741, 1747,
+ 1759, 1759, 1759, 1783, 1789, 1789, 1801, 1811, 1823, 1831,
+ 1831, 1847, 1847, 1861, 1871, 1879, 1879, 1889, 1901, 1907,
+ 1913, 1913, 1933, 1933, 1951, 1951, 1951, 1973, 1979, 1987,
+ 1999, 2003, 2011, 2017, 2029, 2039, 2039, 2053, 2063, 2069,
+ 2069, 2087, 2089, 2099, 2111, 2113, 2113, 2131, 2143, 2143,
+ 2153, 2161, 2161, 2179, 2179, 2179, 2207, 2213, 2221, 2221,
+ 2239, 2243, 2251, 2251, 2269, 2273, 2287, 2293, 2297, 2311,
+ 2311, 2311, 2333, 2341, 2351, 2357, 2357, 2371, 2383, 2389,
+ 2399, 2399, 2411, 2423, 2423, 2437, 2447, 2447, 2459, 2467,
+ 2477, 2477, 2477, 2503, 2503, 2503, 2521, 2531, 2543, 2551,
+ 2557, 2557, 2557, 2579, 2591, 2593, 2593, 2609, 2621, 2621,
+ 2633, 2647, 2647, 2663, 2671, 2677, 2687, 2693, 2699, 2711,
+ 2719, 2719, 2731, 2741, 2749, 2753, 2767, 2767, 2777, 2791,
+ 2797, 2803, 2803, 2819, 2819, 2837, 2843, 2851, 2861, 2861,
+ 2879, 2887, 2887, 2903, 2909, 2917, 2927, 2927, 2939, 2939,
+ 2957, 2963, 2971, 2971, 2971, 2999, 3001, 3011, 3023, 3023,
+ 3037, 3041, 3049, 3061, 3067, 3079, 3083, 3089, 3089, 3109,
+ 3119, 3121, 3121, 3137, 3137, 3137, 3167, 3169, 3181, 3191,
+ 3191, 3203, 3209, 3221, 3229, 3229, 3229, 3253, 3259, 3271,
+ 3271, 3271, 3271, 3301, 3307, 3319, 3323, 3331, 3343, 3347,
+ 3359, 3361, 3373, 3373, 3391, 3391, 3407, 3413, 3413, 3413,
+ 3433, 3433, 3449, 3463, 3469, 3469, 3469, 3491, 3499, 3511,
+ 3517, 3527, 3533, 3541, 3547, 3559, 3559, 3571, 3583, 3583,
+ 3593, 3607, 3613, 3623, 3631, 3637, 3643, 3643, 3659, 3671,
+ 3677, 3677, 3691, 3701, 3709, 3719, 3727, 3733, 3739, 3739,
+ 3739, 3767, 3769, 3779, 3779, 3797, 3803, 3803, 3823, 3823,
+ 3833, 3847, 3853, 3863, 3863, 3877, 3881, 3889, 3889, 3911,
+ 3919, 3923, 3931, 3943, 3947, 3947, 3967, 3967, 3967, 3989,
+ 3989, 4007, 4013, 4021, 4027, 4027, 4027, 4051, 4057, 4057,
+ 4079, 4079, 4093, 4099, 4111, 4111, 4127, 4133, 4139, 4139,
+ 4159, 4159, 4159, 4177, 4177, 4177, 4201, 4211, 4219, 4231,
+ 4231, 4243, 4253, 4261, 4271, 4273, 4283, 4289, 4297, 4297,
+ 4297, 4327, 4327, 4339, 4349, 4357, 4363, 4373, 4373, 4391,
+ 4397, 4397, 4409, 4423, 4423, 4423, 4447, 4451, 4463, 4463,
+ 4463, 4483, 4493, 4493, 4507, 4519, 4523, 4523, 4523, 4549,
+ 4549, 4567, 4567, 4583, 4591, 4597, 4603, 4603, 4621, 4621,
+ 4639, 4643, 4651, 4663, 4663, 4679, 4679, 4691, 4703, 4703,
+ 4703, 4723, 4733, 4733, 4751, 4759, 4759, 4759, 4783, 4789,
+ 4799, 4801, 4813, 4817, 4831, 4831, 4831, 4831, 4861, 4871,
+ 4877, 4877, 4889, 4903, 4909, 4919, 4919, 4933, 4943, 4951,
+ 4957, 4967, 4973, 4973, 4987, 4999, 5003, 5011, 5023, 5023,
+ 5039, 5039, 5051, 5059, 5059, 5077, 5087, 5087, 5101, 5107,
+ 5119, 5119, 5119, 5119, 5147, 5153, 5167, 5171, 5179, 5189,
+ 5197, 5197, 5209, 5209, 5231, 5237, 5237, 5237, 5261, 5261,
+ 5279, 5281, 5281, 5303, 5309, 5309, 5323, 5333, 5333, 5351,
+ 5351, 5351, 5351, 5381, 5387, 5399, 5407, 5413, 5419, 5431,
+ 5437, 5443, 5449, 5449, 5471, 5479, 5483, 5483, 5503, 5507,
+ 5519, 5527, 5531, 5531, 5531, 5557, 5563, 5573, 5581, 5591,
+ 5591, 5591, 5591, 5623, 5623, 5639, 5647, 5653, 5659, 5669,
+ 5669, 5683, 5693, 5701, 5711, 5717, 5717, 5717, 5743, 5749,
+ 5749, 5749, 5749, 5783, 5791, 5791, 5807, 5813, 5821, 5827,
+ 5839, 5843, 5851, 5861, 5869, 5879, 5881, 5881, 5903, 5903,
+ 5903, 5927, 5927, 5939, 5939, 5953, 5953, 5953, 5981, 5987,
+ 5987, 6007, 6011, 6011, 6029, 6037, 6047, 6053, 6053, 6067,
+ 6079, 6079, 6091, 6101, 6101, 6113, 6121, 6133, 6143, 6151,
+ 6151, 6163, 6173, 6173, 6173, 6199, 6203, 6211, 6221, 6229,
+ 6229, 6247, 6247, 6263, 6271, 6277, 6287, 6287, 6301, 6311,
+ 6317, 6323, 6329, 6343, 6343, 6359, 6367, 6373, 6379, 6389,
+ 6397, 6397, 6397, 6421, 6427, 6427, 6427, 6451, 6451, 6469,
+ 6473, 6481, 6491, 6491, 6491, 6491, 6521, 6529, 6529, 6551,
+ 6553, 6563, 6571, 6581, 6581, 6599, 6607, 6607, 6619, 6619,
+ 6637, 6637, 6653, 6661, 6661, 6679, 6679, 6691, 6703, 6709,
+ 6719, 6719, 6733, 6737, 6737, 6737, 6763, 6763, 6781, 6791,
+ 6793, 6803, 6803, 6823, 6829, 6833, 6841, 6841, 6863, 6871,
+ 6871, 6883, 6883, 6899, 6911, 6917, 6917, 6917, 6917, 6949,
+ 6959, 6967, 6971, 6983, 6991, 6997, 7001, 7013, 7019, 7027,
+ 7039, 7043, 7043, 7057, 7069, 7079, 7079, 7079, 7103, 7109,
+ 7109, 7127, 7129, 7129, 7151, 7159, 7159, 7159, 7177, 7187,
+ 7193, 7207, 7213, 7219, 7229, 7237, 7247, 7253, 7253, 7253,
+ 7253, 7283, 7283, 7297, 7309, 7309, 7321, 7333, 7333, 7351,
+ 7351, 7351, 7369, 7369, 7369, 7393, 7393, 7411, 7417, 7417,
+ 7433, 7433, 7451, 7459, 7459, 7477, 7487, 7489, 7499, 7507,
+ 7517, 7523, 7529, 7541, 7549, 7559, 7561, 7573, 7583, 7591,
+ 7591, 7607, 7607, 7621, 7621, 7639, 7643, 7649, 7649, 7669,
+ 7673, 7687, 7691, 7703, 7703, 7717, 7727, 7727, 7741, 7741,
+ 7759, 7759, 7759, 7759, 7789, 7793, 7793, 7793, 7823, 7829,
+ 7829, 7841, 7853, 7853, 7867, 7879, 7883, 7883, 7901, 7907,
+ 7919, 7927, 7933, 7937, 7951, 7951, 7963, 7963, 7963, 7963,
+ 7993, 7993, 8011, 8017, 8017, 8039, 8039, 8053, 8059, 8069,
+ 8069, 8087, 8093, 8101, 8111, 8117, 8123, 8123, 8123, 8147,
+ 8147, 8167, 8171, 8179, 8191, 8191, 8191, 8209, 8221, 8231,
+ 8237, 8243, 8243, 8263, 8269, 8273, 8287, 8293, 8297, 8311,
+ 8317, 8317, 8329, 8329, 8329, 8353, 8363, 8369, 8377, 8389,
+ 8389, 8389, 8389, 8423, 8431, 8431, 8447, 8447, 8461, 8467,
+ 8467, 8467, 8467, 8501, 8501, 8513, 8527, 8527, 8543, 8543,
+ 8543, 8563, 8573, 8581, 8581, 8599, 8599, 8609, 8623, 8629,
+ 8629, 8647, 8647, 8663, 8669, 8677, 8681, 8693, 8699, 8707,
+ 8719, 8719, 8731, 8741, 8747, 8753, 8761, 8761, 8783, 8783,
+ 8783, 8807, 8807, 8821, 8831, 8839, 8839, 8849, 8863, 8867,
+ 8867, 8887, 8893, 8893, 8893, 8893, 8923, 8933, 8941, 8951,
+ 8951, 8963, 8971, 8971, 8971, 8999, 9007, 9013, 9013, 9029,
+ 9029, 9043, 9049, 9059, 9067, 9067, 9067, 9091, 9103, 9109,
+ 9109, 9127, 9133, 9137, 9151, 9157, 9161, 9173, 9181, 9187,
+ 9199, 9203, 9209, 9221, 9227, 9239, 9241, 9241, 9257, 9257,
+ 9277, 9283, 9293, 9293, 9311, 9319, 9323, 9323, 9343, 9349,
+ 9349, 9349, 9371, 9377, 9391, 9397, 9403, 9413, 9421, 9431,
+ 9439, 9439, 9439, 9463, 9467, 9479, 9479, 9491, 9497, 9511,
+ 9511, 9521, 9533, 9539, 9551, 9551, 9551, 9551, 9551, 9587,
+ 9587, 9601, 9613, 9623, 9631, 9631, 9643, 9649, 9661, 9661,
+ 9679, 9679, 9689, 9697, 9697, 9719, 9721, 9733, 9743, 9749,
+ 9749, 9767, 9769, 9781, 9791, 9791, 9803, 9811, 9817, 9829,
+ 9839, 9839, 9851, 9859, 9871, 9871, 9887, 9887, 9901, 9907,
+ 9907, 9923, 9931, 9941, 9949, 9949, 9967, 9973, 9973, 9973,
+ 9973, 10007, 10009, 10009, 10009, 10039, 10039, 10039, 10061, 10069,
+ 10079, 10079, 10093, 10103, 10111, 10111, 10111, 10133, 10141, 10151,
+ 10159, 10163, 10169, 10181, 10181, 10193, 10193, 10211, 10223, 10223,
+ 10223, 10247, 10253, 10259, 10271, 10273, 10273, 10289, 10303, 10303,
+ 10313, 10321, 10333, 10343, 10343, 10357, 10357, 10369, 10369, 10391,
+ 10399, 10399, 10399, 10399, 10429, 10433, 10433, 10453, 10463, 10463,
+ 10477, 10487, 10487, 10501, 10501, 10513, 10513, 10531, 10531, 10531,
+ 10559, 10567, 10567, 10567, 10589, 10597, 10607, 10613, 10613, 10631,
+ 10639, 10639, 10651, 10663, 10667, 10667, 10687, 10691, 10691, 10711,
+ 10711, 10723, 10733, 10739, 10739, 10753, 10753, 10771, 10781, 10789,
+ 10799, 10799, 10799, 10799, 10831, 10837, 10847, 10853, 10861, 10867,
+ 10867, 10883, 10891, 10903, 10909, 10909, 10909, 10909, 10939, 10949,
+ 10957, 10957, 10973, 10979, 10987, 10993, 11003, 11003, 11003, 11027,
+ 11027, 11047, 11047, 11059, 11071, 11071, 11087, 11093, 11093, 11093,
+ 11119, 11119, 11131, 11131, 11149, 11159, 11161, 11173, 11177, 11177,
+ 11197, 11197, 11213, 11213, 11213, 11239, 11243, 11251, 11261, 11261,
+ 11279, 11287, 11287, 11299, 11311, 11317, 11321, 11329, 11329, 11351,
+ 11353, 11353, 11369, 11383, 11383, 11399, 11399, 11411, 11423, 11423,
+ 11437, 11447, 11447, 11447, 11471, 11471, 11483, 11491, 11503, 11503,
+ 11519, 11527, 11527, 11527, 11551, 11551, 11551, 11551, 11579, 11587,
+ 11597, 11597, 11597, 11621, 11621, 11633, 11633, 11633, 11657, 11657,
+ 11677, 11681, 11689, 11701, 11701, 11719, 11719, 11731, 11743, 11743,
+ 11743, 11743, 11743, 11783, 11789, 11789, 11807, 11813, 11821, 11831,
+ 11839, 11839, 11839, 11863, 11867, 11867, 11887, 11887, 11903, 11909,
+ 11909, 11927, 11933, 11941, 11941, 11959, 11959, 11971, 11981, 11987,
+ 11987, 12007, 12011, 12011, 12011, 12037, 12043, 12049, 12049, 12071,
+ 12073, 12073, 12073, 12101, 12109, 12119, 12119, 12119, 12143, 12149,
+ 12157, 12163, 12163, 12163, 12163, 12197, 12203, 12211, 12211, 12227,
+ 12239, 12241, 12253, 12263, 12269, 12277, 12281, 12289, 12301, 12301,
+ 12301, 12323, 12329, 12343, 12347, 12347, 12347, 12373, 12379, 12391,
+ 12391, 12401, 12413, 12421, 12421, 12437, 12437, 12451, 12457, 12457,
+ 12479, 12487, 12491, 12503, 12511, 12517, 12527, 12527, 12541, 12547,
+ 12553, 12553, 12569, 12583, 12589, 12589, 12601, 12613, 12619, 12619,
+ 12637, 12647, 12653, 12659, 12671, 12671, 12671, 12689, 12703, 12703,
+ 12713, 12721, 12721, 12743, 12743, 12757, 12763, 12763, 12781, 12791,
+ 12799, 12799, 12809, 12823, 12829, 12829, 12841, 12853, 12853, 12853,
+ 12853, 12853, 12893, 12899, 12911, 12919, 12923, 12923, 12941, 12941,
+ 12959, 12967, 12973, 12983, 12983, 12983, 13007, 13009, 13009, 13009,
+ 13037, 13043, 13049, 13063, 13063, 13063, 13063, 13093, 13103, 13109,
+ 13109, 13127, 13127, 13127, 13151, 13159, 13163, 13171, 13183, 13187,
+ 13187, 13187, 13187, 13219, 13229, 13229, 13241, 13249, 13259, 13267,
+ 13267, 13267, 13291, 13297, 13309, 13313, 13327, 13331, 13339, 13339,
+ 13339, 13367, 13367, 13381, 13381, 13399, 13399, 13411, 13421, 13421,
+ 13421, 13441, 13451, 13463, 13469, 13477, 13487, 13487, 13499, 13499,
+ 13513, 13523, 13523, 13537, 13537, 13553, 13567, 13567, 13577, 13591,
+ 13597, 13597, 13613, 13619, 13627, 13633, 13633, 13649, 13649, 13669,
+ 13679, 13687, 13693, 13697, 13711, 13711, 13723, 13729, 13729, 13751,
+ 13759, 13763, 13763, 13781, 13789, 13799, 13807, 13807, 13807, 13831,
+ 13831, 13841, 13841, 13859, 13859, 13879, 13883, 13883, 13903, 13907,
+ 13913, 13921, 13933, 13933, 13933, 13933, 13967, 13967, 13967, 13967,
+ 13999, 13999, 14011, 14011, 14029, 14033, 14033, 14051, 14057, 14071,
+ 14071, 14087, 14087, 14087, 14107, 14107, 14107, 14107, 14143, 14149,
+ 14159, 14159, 14173, 14177, 14177, 14197, 14207, 14207, 14221, 14221,
+ 14221, 14243, 14251, 14251, 14251, 14251, 14281, 14293, 14303, 14303,
+ 14303, 14327, 14327, 14341, 14347, 14347, 14347, 14369, 14369, 14389,
+ 14389, 14407, 14411, 14423, 14431, 14437, 14447, 14449, 14461, 14461,
+ 14479, 14479, 14489, 14503, 14503, 14519, 14519, 14533, 14543, 14551,
+ 14557, 14563, 14563, 14563, 14591, 14593, 14593, 14593, 14621, 14629,
+ 14639, 14639, 14653, 14657, 14669, 14669, 14683, 14683, 14699, 14699,
+ 14717, 14723, 14731, 14741, 14747, 14759, 14767, 14771, 14783, 14783,
+ 14797, 14797, 14813, 14821, 14831, 14831, 14843, 14851, 14851, 14869,
+ 14879, 14887, 14891, 14897, 14897, 14897, 14923, 14929, 14939, 14951,
+ 14957, 14957, 14969, 14983, 14983, 14983, 14983, 15013, 15017, 15031,
+ 15031, 15031, 15053, 15061, 15061, 15077, 15083, 15091, 15101, 15107,
+ 15107, 15121, 15131, 15139, 15149, 15149, 15161, 15173, 15173, 15187,
+ 15199, 15199, 15199, 15217, 15227, 15233, 15241, 15241, 15263, 15271,
+ 15277, 15287, 15289, 15299, 15307, 15319, 15319, 15331, 15331, 15349,
+ 15359, 15361, 15373, 15383, 15391, 15391, 15401, 15413, 15413, 15427,
+ 15439, 15443, 15451, 15461, 15467, 15473, 15473, 15493, 15497, 15511,
+ 15511, 15527, 15527, 15541, 15551, 15559, 15559, 15569, 15583, 15583,
+ 15583, 15607, 15607, 15619, 15629, 15629, 15647, 15649, 15661, 15671,
+ 15679, 15683, 15683, 15683, 15683, 15683, 15727, 15733, 15739, 15749,
+ 15749, 15767, 15773, 15773, 15791, 15797, 15803, 15809, 15823, 15823,
+ 15823, 15823, 15823, 15859, 15859, 15877, 15887, 15889, 15901, 15907,
+ 15919, 15923, 15923, 15937, 15937, 15959, 15959, 15973, 15973, 15991,
+ 15991, 16007, 16007, 16007, 16007, 16033, 16033, 16033, 16063, 16069,
+ 16073, 16087, 16091, 16103, 16111, 16111, 16127, 16127, 16141, 16141,
+ 16141, 16141, 16141, 16183, 16189, 16193, 16193, 16193, 16223, 16231,
+ 16231, 16231, 16253, 16253, 16267, 16273, 16273, 16273, 16301, 16301,
+ 16319, 16319, 16333, 16339, 16349, 16349, 16363, 16369, 16381, 16381,
+ 16381, 16381, 16411, 16421, 16427, 16433, 16447, 16453, 16453, 16453,
+ 16477, 16487, 16493, 16493, 16493, 16519, 16519, 16529, 16529, 16547,
+ 16553, 16567, 16573, 16573, 16573, 16573, 16607, 16607, 16619, 16631,
+ 16633, 16633, 16651, 16661, 16661, 16673, 16673, 16693, 16703, 16703,
+ 16703, 16703, 16729, 16741, 16747, 16759, 16763, 16763, 16763, 16787,
+ 16787, 16787, 16811, 16823, 16831, 16831, 16843, 16843, 16843, 16871,
+ 16879, 16883, 16889, 16903, 16903, 16903, 16927, 16931, 16943, 16943,
+ 16943, 16963, 16963, 16981, 16987, 16993, 16993, 17011, 17021, 17029,
+ 17033, 17047, 17053, 17053, 17053, 17077, 17077, 17093, 17099, 17107,
+ 17117, 17123, 17123, 17137, 17137, 17159, 17167, 17167, 17183, 17191,
+ 17191, 17207, 17209, 17209, 17231, 17239, 17239, 17239, 17257, 17257,
+ 17257, 17257, 17293, 17299, 17299, 17317, 17327, 17333, 17341, 17351,
+ 17359, 17359, 17359, 17383, 17389, 17393, 17401, 17401, 17419, 17431,
+ 17431, 17443, 17449, 17449, 17471, 17477, 17483, 17491, 17497, 17509,
+ 17519, 17519, 17519, 17539, 17551, 17551, 17551, 17573, 17581, 17581,
+ 17599, 17599, 17609, 17623, 17627, 17627, 17627, 17627, 17659, 17669,
+ 17669, 17683, 17683, 17683, 17707, 17713, 17713, 17729, 17737, 17749,
+ 17749, 17761, 17761, 17783, 17791, 17791, 17807, 17807, 17807, 17827,
+ 17839, 17839, 17851, 17863, 17863, 17863, 17881, 17891, 17903, 17911,
+ 17911, 17923, 17929, 17939, 17939, 17959, 17959, 17971, 17981, 17989,
+ 17989, 17989, 18013, 18013, 18013, 18013, 18047, 18049, 18061, 18061,
+ 18077, 18077, 18089, 18097, 18097, 18119, 18127, 18133, 18143, 18149,
+ 18149, 18149, 18169, 18181, 18191, 18199, 18199, 18211, 18223, 18229,
+ 18233, 18233, 18253, 18257, 18269, 18269, 18287, 18289, 18301, 18311,
+ 18313, 18313, 18329, 18341, 18341, 18353, 18367, 18371, 18379, 18379,
+ 18397, 18401, 18413, 18413, 18427, 18439, 18443, 18451, 18461, 18461,
+ 18461, 18481, 18493, 18503, 18503, 18517, 18523, 18523, 18541, 18541,
+ 18553, 18553, 18553, 18583, 18587, 18593, 18593, 18593, 18617, 18617,
+ 18637, 18637, 18637, 18661, 18671, 18679, 18679, 18691, 18701, 18701,
+ 18719, 18719, 18731, 18743, 18749, 18757, 18757, 18773, 18773, 18787,
+ 18797, 18803, 18803, 18803, 18803, 18839, 18839, 18839, 18859, 18869,
+ 18869, 18869, 18869, 18899, 18911, 18919, 18919, 18919, 18919, 18947,
+ 18959, 18959, 18973, 18979, 18979, 18979, 19001, 19013, 19013, 19031,
+ 19037, 19037, 19051, 19051, 19069, 19079, 19087, 19087, 19087, 19087,
+ 19087, 19121, 19121, 19141, 19141, 19157, 19163, 19163, 19183, 19183,
+ 19183, 19207, 19213, 19219, 19231, 19237, 19237, 19249, 19259, 19267,
+ 19273, 19273, 19289, 19301, 19309, 19319, 19319, 19333, 19333, 19333,
+ 19333, 19333, 19373, 19381, 19391, 19391, 19403, 19403, 19423, 19429,
+ 19433, 19447, 19447, 19463, 19471, 19477, 19483, 19489, 19501, 19507,
+ 19507, 19507, 19531, 19543, 19543, 19559, 19559, 19571, 19583, 19583,
+ 19597, 19603, 19609, 19609, 19609, 19609, 19609, 19609, 19661, 19661,
+ 19661, 19687, 19687, 19699, 19709, 19717, 19727, 19727, 19739, 19751,
+ 19759, 19763, 19763, 19777, 19777, 19793, 19801, 19813, 19819, 19819,
+ 19819, 19843, 19853, 19861, 19867, 19867, 19867, 19891, 19891, 19891,
+ 19919, 19927, 19927, 19937, 19949, 19949, 19963, 19973, 19979, 19991,
+ 19997, 19997, 20011, 20023, 20029, 20029, 20047, 20051, 20063, 20071,
+ 20071, 20071, 20089, 20101, 20107, 20117, 20123, 20129, 20143, 20149,
+ 20149, 20161, 20173, 20183, 20183, 20183, 20201, 20201, 20219, 20231,
+ 20233, 20233, 20249, 20261, 20269, 20269, 20287, 20287, 20297, 20297,
+ 20297, 20327, 20333, 20341, 20347, 20359, 20359, 20369, 20369, 20389,
+ 20399, 20407, 20411, 20411, 20431, 20431, 20443, 20443, 20443, 20443,
+ 20479, 20483, 20483, 20483, 20509, 20509, 20521, 20533, 20543, 20551,
+ 20551, 20563, 20563, 20563, 20563, 20599, 20599, 20611, 20611, 20627,
+ 20639, 20641, 20641, 20663, 20663, 20663, 20681, 20693, 20693, 20707,
+ 20719, 20719, 20731, 20743, 20749, 20759, 20759, 20773, 20773, 20789,
+ 20789, 20807, 20809, 20809, 20809, 20809, 20809, 20849, 20857, 20857,
+ 20879, 20887, 20887, 20903, 20903, 20903, 20921, 20929, 20939, 20947,
+ 20959, 20963, 20963, 20983, 20983, 20983, 21001, 21013, 21023, 21031,
+ 21031, 21031, 21031, 21061, 21067, 21067, 21067, 21089, 21101, 21107,
+ 21107, 21121, 21121, 21143, 21149, 21157, 21163, 21169, 21179, 21191,
+ 21193, 21193, 21211, 21221, 21227, 21227, 21247, 21247, 21247, 21269,
+ 21277, 21283, 21283, 21283, 21283, 21319, 21323, 21323, 21341, 21347,
+ 21347, 21347, 21347, 21383, 21391, 21397, 21407, 21407, 21419, 21419,
+ 21433, 21433, 21433, 21433, 21467, 21467, 21487, 21493, 21503, 21503,
+ 21517, 21523, 21529, 21529, 21529, 21559, 21563, 21569, 21577, 21589,
+ 21599, 21601, 21613, 21617, 21617, 21617, 21647, 21649, 21661, 21661,
+ 21673, 21683, 21683, 21701, 21701, 21713, 21727, 21727, 21739, 21751,
+ 21757, 21767, 21773, 21773, 21787, 21799, 21803, 21803, 21821, 21821,
+ 21839, 21841, 21851, 21863, 21871, 21871, 21881, 21893, 21893, 21911,
+ 21911, 21911, 21929, 21943, 21943, 21943, 21961, 21961, 21977, 21991,
+ 21997, 22003, 22013, 22013, 22031, 22039, 22039, 22051, 22063, 22067,
+ 22079, 22079, 22093, 22093, 22111, 22111, 22123, 22133, 22133, 22147,
+ 22159, 22159, 22171, 22171, 22189, 22193, 22193, 22193, 22193, 22229,
+ 22229, 22247, 22247, 22259, 22271, 22279, 22283, 22291, 22303, 22307,
+ 22307, 22307, 22307, 22343, 22349, 22349, 22367, 22369, 22381, 22391,
+ 22397, 22397, 22409, 22409, 22409, 22433, 22447, 22453, 22453, 22469,
+ 22469, 22483, 22483, 22501, 22511, 22511, 22511, 22531, 22543, 22549,
+ 22549, 22567, 22573, 22573, 22573, 22573, 22573, 22613, 22621, 22621,
+ 22639, 22643, 22651, 22651, 22669, 22679, 22679, 22691, 22699, 22709,
+ 22717, 22727, 22727, 22741, 22751, 22751, 22751, 22769, 22783, 22787,
+ 22787, 22807, 22811, 22817, 22817, 22817, 22817, 22853, 22861, 22871,
+ 22877, 22877, 22877, 22901, 22907, 22907, 22921, 22921, 22943, 22943,
+ 22943, 22963, 22973, 22973, 22973, 22993, 23003, 23011, 23021, 23029,
+ 23039, 23041, 23053, 23063, 23071, 23071, 23087, 23087, 23099, 23099,
+ 23117, 23117, 23131, 23143, 23143, 23159, 23167, 23173, 23173, 23189,
+ 23197, 23203, 23209, 23209, 23227, 23227, 23227, 23251, 23251, 23269,
+ 23279, 23279, 23293, 23297, 23311, 23311, 23327, 23333, 23339, 23339,
+ 23357, 23357, 23371, 23371, 23371, 23399, 23399, 23399, 23417, 23431,
+ 23431, 23447, 23447, 23459, 23459, 23473, 23473, 23473, 23497, 23509,
+ 23509, 23509, 23531, 23539, 23549, 23557, 23567, 23567, 23581, 23581,
+ 23599, 23603, 23609, 23623, 23629, 23633, 23633, 23633, 23663, 23671,
+ 23677, 23687, 23689, 23689, 23689, 23719, 23719, 23719, 23743, 23747,
+ 23753, 23767, 23773, 23773, 23789, 23789, 23801, 23813, 23819, 23831,
+ 23833, 23833, 23833, 23857, 23869, 23879, 23887, 23893, 23899, 23911,
+ 23917, 23917, 23929, 23929, 23929, 23957, 23957, 23971, 23981, 23981,
+ 23993, 24007, 24007, 24023, 24029, 24029, 24043, 24049, 24061, 24071,
+ 24077, 24083, 24091, 24103, 24109, 24113, 24121, 24133, 24137, 24151,
+ 24151, 24151, 24169, 24181, 24181, 24197, 24203, 24203, 24223, 24229,
+ 24239, 24247, 24251, 24251, 24251, 24251, 24281, 24281, 24281, 24281,
+ 24317, 24317, 24329, 24337, 24337, 24359, 24359, 24373, 24379, 24391,
+ 24391, 24407, 24413, 24421, 24421, 24439, 24443, 24443, 24443, 24469,
+ 24473, 24481, 24481, 24499, 24509, 24517, 24527, 24533, 24533, 24551,
+ 24551, 24551, 24571, 24571, 24571, 24593, 24593, 24611, 24623, 24631,
+ 24631, 24631, 24631, 24659, 24671, 24677, 24683, 24691, 24697, 24709,
+ 24709, 24709, 24733, 24733, 24749, 24749, 24767, 24767, 24781, 24781,
+ 24799, 24799, 24809, 24821, 24821, 24821, 24847, 24851, 24859, 24859,
+ 24877, 24877, 24889, 24889, 24907, 24919, 24923, 24923, 24943, 24943,
+ 24953, 24967, 24971, 24979, 24989, 24989, 24989, 25013, 25013, 25031,
+ 25037, 25037, 25037, 25057, 25057, 25073, 25087, 25087, 25097, 25111,
+ 25117, 25127, 25127, 25127, 25147, 25153, 25163, 25171, 25183, 25189,
+ 25189, 25189, 25189, 25219, 25229, 25237, 25247, 25253, 25261, 25261,
+ 25261, 25261, 25261, 25303, 25309, 25309, 25321, 25321, 25343, 25349,
+ 25357, 25367, 25373, 25373, 25391, 25391, 25391, 25411, 25423, 25423,
+ 25439, 25447, 25453, 25463, 25471, 25471, 25471, 25471, 25471, 25471,
+ 25471, 25523, 25523, 25541, 25541, 25541, 25561, 25561, 25583, 25589,
+ 25589, 25603, 25609, 25621, 25621, 25639, 25643, 25643, 25657, 25667,
+ 25679, 25679, 25693, 25703, 25703, 25717, 25717, 25733, 25741, 25747,
+ 25759, 25763, 25771, 25771, 25771, 25799, 25801, 25801, 25819, 25819,
+ 25819, 25847, 25849, 25849, 25867, 25873, 25873, 25889, 25903, 25903,
+ 25919, 25919, 25933, 25943, 25951, 25951, 25951, 25969, 25981, 25981,
+ 25999, 26003, 26003, 26021, 26029, 26029, 26041, 26053, 26053, 26053,
+ 26053, 26083, 26083, 26099, 26111, 26119, 26119, 26119, 26141, 26141,
+ 26153, 26161, 26171, 26183, 26189, 26189, 26203, 26209, 26209, 26227,
+ 26237, 26237, 26251, 26263, 26267, 26267, 26267, 26293, 26297, 26309,
+ 26317, 26321, 26321, 26339, 26347, 26357, 26357, 26371, 26371, 26387,
+ 26399, 26407, 26407, 26423, 26431, 26437, 26437, 26449, 26459, 26459,
+ 26479, 26479, 26489, 26501, 26501, 26513, 26513, 26513, 26539, 26539,
+ 26557, 26561, 26573, 26573, 26591, 26597, 26597, 26597, 26597, 26627,
+ 26633, 26647, 26647, 26647, 26669, 26669, 26687, 26693, 26701, 26711,
+ 26717, 26723, 26731, 26737, 26737, 26759, 26759, 26759, 26783, 26783,
+ 26783, 26801, 26813, 26821, 26821, 26839, 26839, 26849, 26863, 26863,
+ 26879, 26881, 26893, 26903, 26903, 26903, 26927, 26927, 26927, 26951,
+ 26959, 26959, 26959, 26981, 26987, 26993, 26993, 27011, 27017, 27031,
+ 27031, 27043, 27043, 27061, 27067, 27077, 27077, 27091, 27103, 27109,
+ 27109, 27127, 27127, 27143, 27143, 27143, 27143, 27143, 27179, 27191,
+ 27197, 27197, 27211, 27211, 27211, 27239, 27241, 27253, 27259, 27271,
+ 27277, 27283, 27283, 27299, 27299, 27299, 27299, 27329, 27337, 27337,
+ 27337, 27367, 27367, 27367, 27367, 27397, 27407, 27409, 27409, 27431,
+ 27437, 27437, 27449, 27457, 27457, 27479, 27487, 27487, 27487, 27509,
+ 27509, 27527, 27529, 27541, 27551, 27551, 27551, 27551, 27583, 27583,
+ 27583, 27583, 27611, 27617, 27631, 27631, 27647, 27653, 27653, 27653,
+ 27673, 27673, 27691, 27701, 27701, 27701, 27701, 27733, 27743, 27751,
+ 27751, 27767, 27773, 27779, 27791, 27799, 27803, 27809, 27823, 27827,
+ 27827, 27847, 27851, 27851, 27851, 27851, 27883, 27893, 27901, 27901,
+ 27919, 27919, 27919, 27943, 27947, 27953, 27967, 27967, 27983, 27983,
+ 27997, 28001, 28001, 28019, 28031, 28031, 28031, 28051, 28057, 28069,
+ 28069, 28087, 28087, 28099, 28111, 28111, 28123, 28123, 28123, 28151,
+ 28151, 28163, 28163, 28183, 28183, 28183, 28201, 28211, 28219, 28229,
+ 28229, 28229, 28229, 28229, 28229, 28279, 28283, 28289, 28297, 28309,
+ 28319, 28319, 28319, 28319, 28351, 28351, 28351, 28351, 28351, 28387,
+ 28393, 28403, 28411, 28411, 28429, 28439, 28447, 28447, 28463, 28463,
+ 28477, 28477, 28493, 28499, 28499, 28517, 28517, 28517, 28541, 28549,
+ 28559, 28559, 28573, 28579, 28591, 28597, 28607, 28607, 28621, 28631,
+ 28631, 28643, 28649, 28663, 28669, 28669, 28687, 28687, 28703, 28711,
+ 28711, 28723, 28729, 28729, 28751, 28759, 28759, 28771, 28771, 28789,
+ 28793, 28807, 28813, 28817, 28817, 28837, 28843, 28843, 28859, 28871,
+ 28879, 28879, 28879, 28901, 28909, 28909, 28927, 28933, 28933, 28949,
+ 28949, 28961, 28961, 28979, 28979, 28979, 28979, 29009, 29023, 29027,
+ 29033, 29033, 29033, 29063, 29063, 29077, 29077, 29077, 29101, 29101,
+ 29101, 29123, 29131, 29137, 29147, 29153, 29167, 29173, 29179, 29191,
+ 29191, 29207, 29209, 29221, 29231, 29231, 29243, 29251, 29251, 29269,
+ 29269, 29287, 29287, 29303, 29311, 29311, 29327, 29333, 29339, 29347,
+ 29347, 29363, 29363, 29383, 29389, 29399, 29401, 29411, 29423, 29429,
+ 29437, 29443, 29453, 29453, 29453, 29473, 29483, 29483, 29501, 29501,
+ 29501, 29527, 29531, 29537, 29537, 29537, 29567, 29573, 29581, 29587,
+ 29599, 29599, 29611, 29611, 29629, 29633, 29641, 29641, 29663, 29671,
+ 29671, 29683, 29683, 29683, 29683, 29717, 29723, 29723, 29741, 29741,
+ 29759, 29761, 29761, 29761, 29789, 29789, 29803, 29803, 29819, 29819,
+ 29837, 29837, 29851, 29863, 29867, 29879, 29881, 29881, 29881, 29881,
+ 29917, 29927, 29927, 29927, 29947, 29959, 29959, 29959, 29983, 29989,
+ 29989, 29989, 30013, 30013, 30029, 30029, 30047, 30047, 30059, 30071,
+ 30071, 30071, 30091, 30103, 30109, 30119, 30119, 30133, 30139, 30139,
+ 30139, 30161, 30169, 30181, 30187, 30197, 30203, 30211, 30223, 30223,
+ 30223, 30241, 30253, 30259, 30271, 30271, 30271, 30293, 30293, 30307,
+ 30319, 30323, 30323, 30341, 30347, 30347, 30367, 30367, 30367, 30391,
+ 30391, 30403, 30403, 30403, 30431, 30431, 30431, 30449, 30449, 30469,
+ 30469, 30469, 30493, 30497, 30509, 30517, 30517, 30529, 30539, 30539,
+ 30559, 30559, 30559, 30577, 30577, 30593, 30593, 30593, 30593, 30631,
+ 30637, 30643, 30649, 30661, 30671, 30677, 30677, 30689, 30703, 30707,
+ 30713, 30727, 30727, 30727, 30727, 30757, 30763, 30773, 30781, 30781,
+ 30781, 30803, 30809, 30817, 30829, 30839, 30841, 30853, 30859, 30871,
+ 30871, 30881, 30893, 30893, 30911, 30911, 30911, 30931, 30941, 30949,
+ 30949, 30949, 30971, 30983, 30983, 30983, 30983, 31013, 31019, 31019,
+ 31039, 31039, 31051, 31063, 31069, 31079, 31081, 31091, 31091, 31091,
+ 31091, 31123, 31123, 31139, 31151, 31159, 31159, 31159, 31183, 31189,
+ 31193, 31193, 31193, 31223, 31231, 31237, 31247, 31253, 31259, 31271,
+ 31277, 31277, 31277, 31277, 31307, 31319, 31327, 31333, 31337, 31337,
+ 31357, 31357, 31357, 31379, 31391, 31397, 31397, 31397, 31397, 31397,
+ 31397, 31397, 31397, 31397, 31469, 31477, 31481, 31489, 31489, 31511,
+ 31517, 31517, 31531, 31543, 31547, 31547, 31567, 31573, 31583, 31583,
+ 31583, 31607, 31607, 31607, 31627, 31627, 31643, 31649, 31663, 31667,
+ 31667, 31687, 31687, 31699, 31699, 31699, 31727, 31729, 31741, 31751,
+ 31751, 31751, 31771, 31771, 31771, 31799, 31799, 31799, 31817, 31817,
+ 31817, 31847, 31849, 31859, 31859, 31873, 31883, 31891, 31891, 31907,
+ 31907, 31907, 31907, 31907, 31907, 31957, 31963, 31973, 31981, 31991,
+ 31991, 32003, 32009, 32009, 32029, 32029, 32029, 32051, 32063, 32069,
+ 32077, 32083, 32089, 32099, 32099, 32119, 32119, 32119, 32143, 32143,
+ 32159, 32159, 32173, 32183, 32191, 32191, 32203, 32213, 32213, 32213,
+ 32237, 32237, 32251, 32261, 32261, 32261, 32261, 32261, 32303, 32309,
+ 32309, 32327, 32327, 32341, 32341, 32359, 32363, 32371, 32381, 32381,
+ 32381, 32401, 32413, 32423, 32429, 32429, 32443, 32443, 32443, 32467,
+ 32479, 32479, 32491, 32503, 32507, 32507, 32507, 32533, 32537, 32537,
+ 32537, 32563, 32573, 32579, 32587, 32587, 32603, 32611, 32621, 32621,
+ 32633, 32647, 32653, 32653, 32653, 32653, 32687, 32693, 32693, 32707,
+ 32719, 32719, 32719, 32719, 32749, 32749, 32749, 32771, 32783, 32789,
+ 32797, 32803, 32803, 32803, 32831, 32839, 32843, 32843, 32843, 32869,
+ 32869, 32887, 32887, 32887, 32911, 32917, 32917, 32933, 32941, 32941,
+ 32957, 32957, 32971, 32983, 32987, 32999, 32999, 33013, 33023, 33029,
+ 33037, 33037, 33053, 33053, 33071, 33073, 33083, 33091, 33091, 33107,
+ 33119, 33119, 33119, 33119, 33151, 33151, 33161, 33161, 33181, 33191,
+ 33199, 33203, 33211, 33223, 33223, 33223, 33247, 33247, 33247, 33247,
+ 33247, 33287, 33289, 33301, 33311, 33317, 33317, 33331, 33343, 33349,
+ 33359, 33359, 33359, 33377, 33391, 33391, 33403, 33413, 33413, 33427,
+ 33427, 33427, 33427, 33461, 33469, 33479, 33487, 33493, 33503, 33503,
+ 33503, 33521, 33533, 33533, 33547, 33547, 33563, 33569, 33581, 33589,
+ 33599, 33601, 33613, 33623, 33629, 33637, 33647, 33647, 33647, 33647,
+ 33679, 33679, 33679, 33703, 33703, 33713, 33721, 33721, 33739, 33751,
+ 33757, 33767, 33773, 33773, 33791, 33797, 33797, 33811, 33811, 33829,
+ 33829, 33829, 33851, 33863, 33871, 33871, 33871, 33893, 33893, 33911,
+ 33911, 33923, 33931, 33941, 33941, 33941, 33967, 33967, 33967, 33967,
+ 33997, 33997, 33997, 34019, 34031, 34039, 34039, 34039, 34061, 34061,
+ 34061, 34061, 34061, 34061, 34061, 34061, 34127, 34129, 34141, 34147,
+ 34159, 34159, 34171, 34183, 34183, 34183, 34183, 34213, 34217, 34231,
+ 34231, 34231, 34253, 34261, 34267, 34273, 34283, 34283, 34303, 34303,
+ 34319, 34327, 34327, 34337, 34351, 34351, 34367, 34369, 34381, 34381,
+ 34381, 34403, 34403, 34421, 34429, 34439, 34439, 34439, 34457, 34471,
+ 34471, 34487, 34487, 34501, 34511, 34519, 34519, 34519, 34543, 34549,
+ 34549, 34549, 34549, 34583, 34591, 34591, 34607, 34613, 34613, 34631,
+ 34631, 34631, 34651, 34651, 34667, 34679, 34687, 34693, 34703, 34703,
+ 34703, 34721, 34729, 34739, 34747, 34759, 34763, 34763, 34781, 34781,
+ 34781, 34807, 34807, 34819, 34819, 34819, 34847, 34849, 34849, 34871,
+ 34877, 34883, 34883, 34897, 34897, 34919, 34919, 34919, 34939, 34949,
+ 34949, 34963, 34963, 34981, 34981, 34981, 34981, 34981, 35023, 35027,
+ 35027, 35027, 35053, 35059, 35069, 35069, 35083, 35089, 35099, 35111,
+ 35117, 35117, 35129, 35141, 35149, 35159, 35159, 35171, 35171, 35171,
+ 35171, 35201, 35201, 35221, 35227, 35227, 35227, 35251, 35257, 35267,
+ 35279, 35281, 35291, 35291, 35311, 35317, 35327, 35327, 35339, 35339,
+ 35353, 35363, 35363, 35381, 35381, 35393, 35407, 35407, 35423, 35423,
+ 35437, 35447, 35449, 35461, 35461, 35461, 35461, 35491, 35491, 35509,
+ 35509, 35527, 35533, 35543, 35543, 35543, 35543, 35573, 35573, 35591,
+ 35597, 35603, 35603, 35617, 35617, 35617, 35617, 35617, 35617, 35671,
+ 35677, 35677, 35677, 35677, 35677, 35677, 35677, 35731, 35731, 35747,
+ 35759, 35759, 35771, 35771, 35771, 35797, 35803, 35809, 35809, 35831,
+ 35839, 35839, 35851, 35863, 35869, 35879, 35879, 35879, 35899, 35911,
+ 35911, 35923, 35933, 35933, 35951, 35951, 35963, 35969, 35983, 35983,
+ 35999, 36007, 36013, 36017, 36017, 36037, 36037, 36037, 36061, 36067,
+ 36073, 36083, 36083, 36097, 36109, 36109, 36109, 36131, 36137, 36151,
+ 36151, 36161, 36161, 36161, 36191, 36191, 36191, 36209, 36217, 36229,
+ 36229, 36241, 36251, 36263, 36269, 36277, 36277, 36293, 36299, 36307,
+ 36319, 36319, 36319, 36343, 36343, 36353, 36353, 36373, 36383, 36389,
+ 36389, 36389, 36389, 36389, 36389, 36433, 36433, 36451, 36457, 36469,
+ 36479, 36479, 36493, 36497, 36497, 36497, 36527, 36529, 36541, 36551,
+ 36559, 36563, 36571, 36583, 36587, 36599, 36607, 36607, 36607, 36629,
+ 36637, 36643, 36653, 36653, 36671, 36677, 36683, 36691, 36697, 36709,
+ 36713, 36721, 36721, 36739, 36749, 36749, 36767, 36767, 36781, 36791,
+ 36793, 36793, 36809, 36821, 36821, 36833, 36847, 36847, 36857, 36871,
+ 36877, 36887, 36887, 36901, 36901, 36919, 36923, 36931, 36943, 36947,
+ 36947, 36947, 36973, 36979, 36979, 36997, 37003, 37013, 37021, 37021,
+ 37039, 37039, 37049, 37061, 37061, 37061, 37087, 37087, 37097, 37097,
+ 37117, 37123, 37123, 37139, 37139, 37159, 37159, 37171, 37181, 37189,
+ 37199, 37201, 37201, 37223, 37223, 37223, 37243, 37253, 37253, 37253,
+ 37277, 37277, 37277, 37277, 37309, 37313, 37321, 37321, 37339, 37339,
+ 37357, 37363, 37369, 37379, 37379, 37397, 37397, 37409, 37423, 37423,
+ 37423, 37447, 37447, 37463, 37463, 37463, 37483, 37493, 37501, 37511,
+ 37517, 37517, 37529, 37537, 37549, 37549, 37567, 37573, 37579, 37591,
+ 37591, 37607, 37607, 37619, 37619, 37633, 37643, 37649, 37663, 37663,
+ 37663, 37663, 37693, 37699, 37699, 37717, 37717, 37717, 37717, 37747,
+ 37747, 37747, 37747, 37783, 37783, 37799, 37799, 37813, 37813, 37831,
+ 37831, 37847, 37853, 37861, 37871, 37879, 37879, 37889, 37897, 37907,
+ 37907, 37907, 37907, 37907, 37951, 37957, 37967, 37967, 37967, 37991,
+ 37997, 37997, 38011, 38011, 38011, 38039, 38047, 38053, 38053, 38069,
+ 38069, 38083, 38083, 38083, 38083, 38119, 38119, 38119, 38119, 38149,
+ 38153, 38167, 38167, 38183, 38189, 38197, 38201, 38201, 38219, 38231,
+ 38239, 38239, 38239, 38261, 38261, 38273, 38287, 38287, 38303, 38303,
+ 38317, 38327, 38333, 38333, 38351, 38351, 38351, 38371, 38377, 38377,
+ 38393, 38393, 38393, 38393, 38431, 38431, 38447, 38453, 38461, 38461,
+ 38461, 38461, 38461, 38501, 38501, 38501, 38501, 38501, 38543, 38543,
+ 38557, 38567, 38569, 38569, 38569, 38593, 38603, 38611, 38611, 38629,
+ 38639, 38639, 38653, 38653, 38671, 38677, 38677, 38693, 38699, 38711,
+ 38713, 38723, 38729, 38737, 38749, 38749, 38767, 38767, 38783, 38791,
+ 38791, 38803, 38803, 38821, 38821, 38839, 38839, 38851, 38861, 38867,
+ 38873, 38873, 38891, 38903, 38903, 38917, 38923, 38933, 38933, 38933,
+ 38959, 38959, 38971, 38977, 38977, 38993, 38993, 38993, 39023, 39023,
+ 39023, 39047, 39047, 39047, 39047, 39079, 39079, 39089, 39103, 39107,
+ 39119, 39119, 39133, 39139, 39139, 39157, 39163, 39163, 39181, 39191,
+ 39199, 39199, 39209, 39217, 39229, 39239, 39241, 39251, 39251, 39251,
+ 39251, 39251, 39293, 39301, 39301, 39317, 39323, 39323, 39343, 39343,
+ 39359, 39367, 39373, 39383, 39383, 39397, 39397, 39409, 39419, 39419,
+ 39439, 39443, 39451, 39461, 39461, 39461, 39461, 39461, 39503, 39511,
+ 39511, 39521, 39521, 39541, 39551, 39551, 39563, 39569, 39581, 39581,
+ 39581, 39607, 39607, 39623, 39631, 39631, 39631, 39631, 39659, 39671,
+ 39679, 39679, 39679, 39703, 39709, 39719, 39727, 39733, 39733, 39749,
+ 39749, 39761, 39769, 39779, 39791, 39799, 39799, 39799, 39821, 39829,
+ 39839, 39847, 39847, 39863, 39869, 39877, 39887, 39887, 39901, 39901,
+ 39901, 39901, 39929, 39937, 39937, 39953, 39953, 39971, 39983, 39989,
+ 39989, 39989, 40013, 40013, 40031, 40039, 40039, 40039, 40063, 40063,
+ 40063, 40087, 40093, 40099, 40111, 40111, 40127, 40129, 40129, 40151,
+ 40153, 40163, 40169, 40177, 40189, 40193, 40193, 40213, 40213, 40231,
+ 40237, 40241, 40253, 40253, 40253, 40277, 40283, 40289, 40289, 40289,
+ 40289, 40289, 40289, 40343, 40351, 40357, 40361, 40361, 40361, 40387,
+ 40387, 40387, 40387, 40423, 40429, 40433, 40433, 40433, 40459, 40471,
+ 40471, 40487, 40493, 40499, 40507, 40519, 40519, 40531, 40543, 40543,
+ 40559, 40559, 40559, 40583, 40591, 40597, 40597, 40609, 40609, 40627,
+ 40639, 40639, 40639, 40639, 40639, 40639, 40639, 40693, 40699, 40709,
+ 40709, 40709, 40709, 40739, 40751, 40759, 40763, 40771, 40771, 40787,
+ 40787, 40801, 40813, 40823, 40829, 40829, 40847, 40853, 40853, 40867,
+ 40879, 40883, 40883, 40903, 40903, 40903, 40927, 40933, 40939, 40949,
+ 40949, 40961, 40973, 40973, 40973, 40993, 40993, 41011, 41023, 41023,
+ 41039, 41047, 41051, 41057, 41057, 41077, 41081, 41081, 41081, 41081,
+ 41117, 41117, 41131, 41143, 41149, 41149, 41161, 41161, 41183, 41189,
+ 41189, 41203, 41213, 41221, 41231, 41233, 41243, 41243, 41263, 41269,
+ 41269, 41281, 41281, 41299, 41299, 41299, 41299, 41333, 41341, 41351,
+ 41357, 41357, 41357, 41381, 41389, 41399, 41399, 41413, 41413, 41413,
+ 41413, 41443, 41453, 41453, 41467, 41479, 41479, 41491, 41491, 41507,
+ 41519, 41521, 41521, 41543, 41549, 41549, 41549, 41549, 41579, 41579,
+ 41597, 41603, 41611, 41621, 41627, 41627, 41647, 41651, 41659, 41669,
+ 41669, 41687, 41687, 41687, 41687, 41719, 41719, 41729, 41737, 41737,
+ 41759, 41761, 41771, 41777, 41777, 41777, 41801, 41813, 41813, 41813,
+ 41813, 41843, 41851, 41863, 41863, 41879, 41887, 41893, 41903, 41911,
+ 41911, 41927, 41927, 41941, 41947, 41959, 41959, 41969, 41983, 41983,
+ 41999, 41999, 42013, 42023, 42023, 42023, 42043, 42043, 42061, 42071,
+ 42073, 42083, 42089, 42101, 42101, 42101, 42101, 42131, 42139, 42139,
+ 42157, 42157, 42169, 42181, 42187, 42197, 42197, 42209, 42223, 42227,
+ 42239, 42239, 42239, 42257, 42257, 42257, 42283, 42293, 42299, 42307,
+ 42307, 42323, 42331, 42337, 42349, 42359, 42359, 42373, 42379, 42391,
+ 42397, 42407, 42409, 42409, 42409, 42437, 42443, 42451, 42463, 42467,
+ 42473, 42487, 42491, 42499, 42509, 42509, 42509, 42533, 42533, 42533,
+ 42557, 42557, 42571, 42577, 42589, 42589, 42589, 42611, 42611, 42611,
+ 42611, 42643, 42649, 42649, 42667, 42677, 42683, 42689, 42703, 42709,
+ 42719, 42727, 42727, 42743, 42751, 42751, 42767, 42773, 42773, 42787,
+ 42797, 42797, 42797, 42821, 42829, 42839, 42841, 42853, 42863, 42863,
+ 42863, 42863, 42863, 42901, 42901, 42901, 42923, 42929, 42943, 42943,
+ 42953, 42967, 42967, 42979, 42989, 42989, 43003, 43013, 43019, 43019,
+ 43037, 43037, 43051, 43063, 43067, 43067, 43067, 43093, 43103, 43103,
+ 43117, 43117, 43133, 43133, 43151, 43159, 43159, 43159, 43177, 43189,
+ 43189, 43207, 43207, 43223, 43223, 43237, 43237, 43237, 43261, 43271,
+ 43271, 43283, 43291, 43291, 43291, 43319, 43321, 43331, 43331, 43331,
+ 43331, 43331, 43331, 43331, 43391, 43399, 43403, 43411, 43411, 43427,
+ 43427, 43441, 43451, 43457, 43457, 43457, 43487, 43487, 43499, 43499,
+ 43517, 43517, 43517, 43543, 43543, 43543, 43543, 43573, 43579, 43591,
+ 43597, 43607, 43613, 43613, 43627, 43633, 43633, 43651, 43661, 43669,
+ 43669, 43669, 43691, 43691, 43711, 43717, 43721, 43721, 43721, 43721,
+ 43759, 43759, 43759, 43783, 43789, 43793, 43801, 43801, 43801, 43801,
+ 43801, 43801, 43853, 43853, 43867, 43867, 43867, 43891, 43891, 43891,
+ 43913, 43913, 43933, 43943, 43951, 43951, 43963, 43973, 43973, 43991,
+ 43997, 43997, 43997, 44021, 44029, 44029, 44041, 44053, 44059, 44071,
+ 44071, 44087, 44089, 44101, 44111, 44119, 44123, 44131, 44131, 44131,
+ 44159, 44159, 44171, 44179, 44189, 44189, 44207, 44207, 44221, 44221,
+ 44221, 44221, 44249, 44263, 44269, 44279, 44281, 44293, 44293, 44293,
+ 44293, 44293, 44293, 44293, 44351, 44357, 44357, 44371, 44383, 44389,
+ 44389, 44389, 44389, 44417, 44417, 44417, 44417, 44453, 44453, 44453,
+ 44453, 44483, 44491, 44501, 44507, 44519, 44519, 44533, 44543, 44549,
+ 44549, 44563, 44563, 44579, 44587, 44587, 44587, 44587, 44623, 44623,
+ 44633, 44647, 44651, 44657, 44657, 44657, 44687, 44687, 44701, 44711,
+ 44711, 44711, 44729, 44741, 44741, 44753, 44753, 44773, 44777, 44789,
+ 44797, 44797, 44809, 44819, 44819, 44839, 44843, 44851, 44851, 44867,
+ 44879, 44887, 44893, 44893, 44909, 44917, 44927, 44927, 44939, 44939,
+ 44959, 44963, 44971, 44983, 44987, 44987, 45007, 45013, 45013, 45013,
+ 45013, 45013, 45053, 45061, 45061, 45077, 45083, 45083, 45083, 45083,
+ 45119, 45127, 45131, 45139, 45139, 45139, 45161, 45161, 45181, 45191,
+ 45197, 45197, 45197, 45197, 45197, 45233, 45247, 45247, 45263, 45263,
+ 45263, 45281, 45293, 45293, 45307, 45319, 45319, 45329, 45343, 45343,
+ 45343, 45361, 45361, 45377, 45389, 45389, 45403, 45413, 45413, 45427,
+ 45439, 45439, 45439, 45439, 45439, 45439, 45481, 45491, 45503, 45503,
+ 45503, 45523, 45533, 45541, 45541, 45557, 45557, 45569, 45569, 45589,
+ 45599, 45599, 45613, 45613, 45631, 45631, 45641, 45641, 45659, 45667,
+ 45677, 45677, 45691, 45697, 45707, 45707, 45707, 45707, 45737, 45751,
+ 45757, 45767, 45767, 45779, 45779, 45779, 45779, 45779, 45823, 45827,
+ 45833, 45841, 45853, 45863, 45869, 45869, 45887, 45893, 45893, 45893,
+ 45893, 45893, 45893, 45943, 45949, 45959, 45959, 45971, 45979, 45989,
+ 45989, 45989, 45989, 46021, 46027, 46027, 46027, 46051, 46061, 46061,
+ 46073, 46073, 46093, 46103, 46103, 46103, 46103, 46133, 46141, 46147,
+ 46153, 46153, 46171, 46183, 46187, 46199, 46199, 46199, 46219, 46229,
+ 46237, 46237, 46237, 46261, 46271, 46279, 46279, 46279, 46301, 46309,
+ 46309, 46327, 46327, 46337, 46351, 46351, 46351, 46351, 46381, 46381,
+ 46399, 46399, 46411, 46411, 46411, 46439, 46447, 46451, 46457, 46471,
+ 46477, 46477, 46489, 46499, 46511, 46511, 46523, 46523, 46523, 46549,
+ 46559, 46567, 46573, 46573, 46591, 46591, 46601, 46601, 46619, 46619,
+ 46639, 46643, 46649, 46663, 46663, 46679, 46687, 46691, 46703, 46703,
+ 46703, 46727, 46727, 46727, 46751, 46757, 46757, 46771, 46771, 46771,
+ 46771, 46807, 46811, 46819, 46831, 46831, 46831, 46853, 46861, 46867,
+ 46877, 46877, 46889, 46901, 46901, 46919, 46919, 46933, 46933, 46933,
+ 46957, 46957, 46957, 46957, 46957, 46997, 46997, 46997, 47017, 47017,
+ 47017, 47041, 47051, 47059, 47059, 47059, 47087, 47093, 47093, 47111,
+ 47119, 47123, 47129, 47143, 47149, 47149, 47161, 47161, 47161, 47189,
+ 47189, 47207, 47207, 47221, 47221, 47237, 47237, 47251, 47251, 47269,
+ 47279, 47287, 47293, 47303, 47309, 47317, 47317, 47317, 47339, 47351,
+ 47353, 47363, 47363, 47381, 47389, 47389, 47407, 47407, 47419, 47431,
+ 47431, 47441, 47441, 47459, 47459, 47459, 47459, 47491, 47501, 47507,
+ 47513, 47527, 47533, 47543, 47543, 47543, 47563, 47569, 47581, 47591,
+ 47599, 47599, 47609, 47623, 47629, 47639, 47639, 47653, 47659, 47659,
+ 47659, 47681, 47681, 47701, 47711, 47717, 47717, 47717, 47743, 47743,
+ 47743, 47743, 47743, 47779, 47791, 47797, 47807, 47809, 47819, 47819,
+ 47837, 47843, 47843, 47857, 47869, 47869, 47881, 47881, 47903, 47911,
+ 47917, 47917, 47933, 47939, 47951, 47951, 47963, 47969, 47981, 47981,
+ 47981, 47981, 47981, 48023, 48029, 48029, 48029, 48049, 48049, 48049,
+ 48079, 48079, 48091, 48091, 48109, 48119, 48121, 48131, 48131, 48131,
+ 48157, 48163, 48163, 48179, 48187, 48197, 48197, 48197, 48221, 48221,
+ 48239, 48247, 48247, 48259, 48271, 48271, 48281, 48281, 48299, 48311,
+ 48313, 48313, 48313, 48341, 48341, 48353, 48353, 48371, 48383, 48383,
+ 48397, 48407, 48413, 48413, 48413, 48437, 48437, 48449, 48463, 48463,
+ 48479, 48487, 48491, 48497, 48497, 48497, 48527, 48533, 48541, 48541,
+ 48541, 48563, 48571, 48571, 48589, 48593, 48593, 48611, 48623, 48623,
+ 48623, 48647, 48649, 48661, 48661, 48679, 48679, 48679, 48679, 48679,
+ 48679, 48679, 48733, 48733, 48751, 48757, 48767, 48767, 48781, 48787,
+ 48799, 48799, 48809, 48823, 48823, 48823, 48847, 48847, 48859, 48871,
+ 48871, 48883, 48889, 48889, 48907, 48907, 48907, 48907, 48907, 48947,
+ 48953, 48953, 48973, 48973, 48991, 48991, 49003, 49009, 49019, 49031,
+ 49037, 49043, 49043, 49057, 49069, 49069, 49081, 49081, 49103, 49109,
+ 49117, 49123, 49123, 49139, 49139, 49157, 49157, 49171, 49177, 49177,
+ 49199, 49207, 49211, 49223, 49223, 49223, 49223, 49253, 49261, 49261,
+ 49279, 49279, 49279, 49297, 49307, 49307, 49307, 49333, 49339, 49339,
+ 49339, 49367, 49369, 49369, 49391, 49393, 49393, 49411, 49417, 49429,
+ 49433, 49433, 49451, 49463, 49463, 49477, 49481, 49481, 49499, 49499,
+ 49499, 49523, 49531, 49537, 49549, 49559, 49559, 49559, 49559, 49559,
+ 49597, 49603, 49613, 49613, 49627, 49639, 49639, 49639, 49663, 49669,
+ 49669, 49681, 49681, 49697, 49711, 49711, 49727, 49727, 49741, 49747,
+ 49757, 49757, 49757, 49783, 49789, 49789, 49807, 49811, 49823, 49831,
+ 49831, 49843, 49853, 49853, 49871, 49877, 49877, 49891, 49891, 49891,
+ 49919, 49927, 49927, 49943, 49943, 49957, 49957, 49957, 49957, 49991,
+ 49999, 49999, 49999, 50023, 50023, 50033, 50047, 50053, 50053, 50069,
+ 50077, 50087, 50093, 50101, 50111, 50119, 50123, 50131, 50131, 50147,
+ 50159, 50159, 50159, 50177, 50177, 50177, 50207, 50207, 50221, 50231,
+ 50231, 50231, 50231, 50263, 50263, 50273, 50287, 50291, 50291, 50311,
+ 50311, 50321, 50333, 50341, 50341, 50359, 50363, 50363, 50383, 50387,
+ 50387, 50387, 50411, 50423, 50423, 50423, 50441, 50441, 50461, 50461,
+ 50461, 50461, 50461, 50503, 50503, 50513, 50527, 50527, 50543, 50551,
+ 50551, 50551, 50551, 50581, 50591, 50599, 50599, 50599, 50599, 50627,
+ 50627, 50647, 50651, 50651, 50671, 50671, 50683, 50683, 50683, 50707,
+ 50707, 50723, 50723, 50741, 50741, 50753, 50767, 50773, 50777, 50789,
+ 50789, 50789, 50789, 50821, 50821, 50839, 50839, 50849, 50857, 50867,
+ 50873, 50873, 50893, 50893, 50909, 50909, 50923, 50929, 50929, 50951,
+ 50957, 50957, 50971, 50971, 50989, 50993, 51001, 51001, 51001, 51031,
+ 51031, 51047, 51047, 51061, 51071, 51071, 51071, 51071, 51071, 51109,
+ 51109, 51109, 51133, 51137, 51151, 51157, 51157, 51169, 51169, 51169,
+ 51199, 51203, 51203, 51217, 51229, 51239, 51241, 51241, 51263, 51263,
+ 51263, 51287, 51287, 51287, 51307, 51307, 51307, 51329, 51343, 51349,
+ 51349, 51361, 51361, 51383, 51383, 51383, 51407, 51413, 51421, 51431,
+ 51439, 51439, 51449, 51461, 51461, 51479, 51487, 51487, 51503, 51511,
+ 51517, 51521, 51521, 51539, 51551, 51551, 51563, 51563, 51581, 51581,
+ 51599, 51607, 51613, 51613, 51631, 51637, 51647, 51647, 51659, 51659,
+ 51679, 51683, 51691, 51691, 51691, 51719, 51721, 51721, 51721, 51749,
+ 51749, 51767, 51769, 51769, 51787, 51797, 51803, 51803, 51817, 51829,
+ 51839, 51839, 51853, 51859, 51871, 51871, 51871, 51893, 51899, 51907,
+ 51913, 51913, 51929, 51941, 51949, 51949, 51949, 51973, 51977, 51991,
+ 51991, 51991, 52009, 52021, 52027, 52027, 52027, 52051, 52057, 52069,
+ 52069, 52081, 52081, 52103, 52103, 52103, 52127, 52127, 52127, 52147,
+ 52153, 52163, 52163, 52183, 52189, 52189, 52201, 52201, 52223, 52223,
+ 52237, 52237, 52253, 52259, 52267, 52267, 52267, 52291, 52301, 52301,
+ 52313, 52321, 52321, 52321, 52321, 52321, 52363, 52369, 52379, 52391,
+ 52391, 52391, 52391, 52391, 52391, 52433, 52433, 52453, 52457, 52457,
+ 52457, 52457, 52489, 52501, 52511, 52517, 52517, 52529, 52543, 52543,
+ 52553, 52567, 52571, 52583, 52583, 52583, 52583, 52609, 52609, 52631,
+ 52639, 52639, 52639, 52639, 52667, 52673, 52673, 52691, 52697, 52711,
+ 52711, 52727, 52733, 52733, 52747, 52757, 52757, 52769, 52783, 52783,
+ 52783, 52807, 52813, 52817, 52817, 52837, 52837, 52837, 52861, 52861,
+ 52879, 52883, 52889, 52903, 52903, 52919, 52919, 52919, 52937, 52951,
+ 52957, 52967, 52973, 52981, 52981, 52999, 53003, 53003, 53017, 53017,
+ 53017, 53047, 53051, 53051, 53069, 53077, 53087, 53093, 53101, 53101,
+ 53117, 53117, 53129, 53129, 53149, 53149, 53161, 53173, 53173, 53189,
+ 53197, 53201, 53201, 53201, 53231, 53239, 53239, 53239, 53239, 53269,
+ 53279, 53281, 53281, 53299, 53309, 53309, 53327, 53327, 53327, 53327,
+ 53359, 53359, 53359, 53381, 53381, 53381, 53407, 53411, 53419, 53419,
+ 53437, 53441, 53453, 53453, 53453, 53479, 53479, 53479, 53503, 53507,
+ 53507, 53527, 53527, 53527, 53551, 53551, 53551, 53569, 53569, 53591,
+ 53597, 53597, 53611, 53623, 53629, 53639, 53639, 53653, 53657, 53657,
+ 53657, 53681, 53693, 53699, 53699, 53719, 53719, 53731, 53731, 53731,
+ 53759, 53759, 53773, 53783, 53791, 53791, 53791, 53813, 53819, 53831,
+ 53831, 53831, 53849, 53861, 53861, 53861, 53887, 53891, 53899, 53899,
+ 53917, 53927, 53927, 53939, 53951, 53959, 53959, 53959, 53959, 53987,
+ 53993, 54001, 54013, 54013, 54013, 54037, 54037, 54049, 54059, 54059,
+ 54059, 54083, 54091, 54101, 54101, 54101, 54121, 54133, 54139, 54151,
+ 54151, 54167, 54167, 54181, 54181, 54193, 54193, 54193, 54217, 54217,
+ 54217, 54217, 54251, 54251, 54269, 54277, 54287, 54293, 54293, 54311,
+ 54319, 54323, 54331, 54331, 54347, 54347, 54367, 54371, 54377, 54377,
+ 54377, 54403, 54413, 54421, 54421, 54437, 54443, 54449, 54449, 54469,
+ 54469, 54469, 54493, 54503, 54503, 54517, 54521, 54521, 54541, 54547,
+ 54559, 54563, 54563, 54583, 54583, 54583, 54601, 54601, 54623, 54631,
+ 54631, 54647, 54647, 54647, 54667, 54679, 54679, 54679, 54679, 54709,
+ 54713, 54727, 54727, 54727, 54751, 54751, 54767, 54773, 54779, 54787,
+ 54799, 54799, 54799, 54799, 54829, 54833, 54833, 54851, 54851, 54869,
+ 54877, 54881, 54881, 54881, 54907, 54919, 54919, 54919, 54941, 54949,
+ 54959, 54959, 54973, 54983, 54983, 54983, 55001, 55009, 55021, 55021,
+ 55021, 55021, 55051, 55061, 55061, 55079, 55079, 55079, 55103, 55109,
+ 55117, 55127, 55127, 55127, 55147, 55147, 55163, 55171, 55171, 55171,
+ 55171, 55207, 55213, 55219, 55229, 55229, 55243, 55249, 55259, 55259,
+ 55259, 55259, 55291, 55291, 55291, 55313, 55313, 55333, 55343, 55351,
+ 55351, 55351, 55373, 55381, 55381, 55399, 55399, 55411, 55411, 55411,
+ 55439, 55441, 55441, 55457, 55469, 55469, 55487, 55487, 55501, 55511,
+ 55511, 55511, 55529, 55541, 55547, 55547, 55547, 55547, 55579, 55589,
+ 55589, 55603, 55609, 55621, 55631, 55639, 55639, 55639, 55663, 55667,
+ 55673, 55681, 55691, 55697, 55711, 55717, 55721, 55733, 55733, 55733,
+ 55733, 55763, 55763, 55763, 55787, 55799, 55807, 55813, 55823, 55829,
+ 55837, 55843, 55849, 55849, 55871, 55871, 55871, 55889, 55903, 55903,
+ 55903, 55927, 55933, 55933, 55949, 55949, 55967, 55967, 55967, 55987,
+ 55997, 56003, 56009, 56009, 56009, 56039, 56041, 56053, 56053, 56053,
+ 56053, 56087, 56093, 56101, 56101, 56113, 56123, 56131, 56131, 56149,
+ 56149, 56167, 56171, 56179, 56179, 56197, 56207, 56209, 56209, 56209,
+ 56239, 56239, 56249, 56263, 56269, 56269, 56269, 56269, 56299, 56311,
+ 56311, 56311, 56333, 56333, 56333, 56359, 56359, 56369, 56383, 56383,
+ 56393, 56401, 56401, 56417, 56431, 56437, 56443, 56453, 56453, 56467,
+ 56479, 56479, 56489, 56503, 56509, 56519, 56527, 56533, 56543, 56543,
+ 56543, 56543, 56569, 56569, 56591, 56599, 56599, 56611, 56611, 56629,
+ 56633, 56633, 56633, 56663, 56671, 56671, 56687, 56687, 56701, 56711,
+ 56713, 56713, 56731, 56737, 56747, 56747, 56767, 56773, 56783, 56783,
+ 56783, 56807, 56813, 56821, 56827, 56827, 56843, 56843, 56857, 56857,
+ 56873, 56873, 56893, 56897, 56911, 56911, 56923, 56929, 56941, 56951,
+ 56957, 56963, 56963, 56983, 56989, 56999, 56999, 56999, 56999, 56999,
+ 57037, 57047, 57047, 57059, 57059, 57077, 57077, 57089, 57097, 57107,
+ 57119, 57119, 57131, 57143, 57149, 57149, 57163, 57173, 57179, 57191,
+ 57193, 57203, 57203, 57223, 57223, 57223, 57241, 57251, 57259, 57271,
+ 57271, 57287, 57287, 57301, 57301, 57301, 57301, 57331, 57331, 57349,
+ 57349, 57367, 57373, 57383, 57389, 57397, 57397, 57413, 57413, 57427,
+ 57427, 57427, 57427, 57457, 57467, 57467, 57487, 57493, 57503, 57503,
+ 57503, 57527, 57529, 57529, 57529, 57559, 57559, 57571, 57571, 57587,
+ 57593, 57601, 57601, 57601, 57601, 57637, 57641, 57653, 57653, 57667,
+ 57679, 57679, 57689, 57697, 57709, 57719, 57727, 57731, 57737, 57751,
+ 57751, 57751, 57773, 57781, 57791, 57793, 57803, 57809, 57809, 57829,
+ 57839, 57847, 57853, 57859, 57859, 57859, 57881, 57881, 57901, 57901,
+ 57917, 57923, 57923, 57943, 57947, 57947, 57947, 57973, 57977, 57991,
+ 57991, 57991, 58013, 58013, 58031, 58031, 58043, 58049, 58061, 58067,
+ 58073, 58073, 58073, 58099, 58111, 58111, 58111, 58129, 58129, 58151,
+ 58153, 58153, 58171, 58171, 58189, 58199, 58207, 58211, 58217, 58231,
+ 58237, 58243, 58243, 58243, 58271, 58271, 58271, 58271, 58271, 58309,
+ 58313, 58321, 58321, 58337, 58337, 58337, 58367, 58369, 58379, 58391,
+ 58393, 58403, 58411, 58417, 58427, 58439, 58441, 58453, 58453, 58453,
+ 58477, 58481, 58481, 58481, 58511, 58511, 58511, 58511, 58543, 58549,
+ 58549, 58567, 58573, 58579, 58579, 58579, 58603, 58613, 58613, 58631,
+ 58631, 58631, 58631, 58661, 58661, 58679, 58687, 58693, 58699, 58711,
+ 58711, 58727, 58733, 58741, 58741, 58757, 58763, 58771, 58771, 58789,
+ 58789, 58789, 58789, 58789, 58831, 58831, 58831, 58831, 58831, 58831,
+ 58831, 58831, 58889, 58901, 58909, 58913, 58921, 58921, 58943, 58943,
+ 58943, 58967, 58967, 58979, 58991, 58997, 58997, 59011, 59023, 59029,
+ 59029, 59029, 59053, 59063, 59069, 59077, 59083, 59093, 59093, 59107,
+ 59119, 59123, 59123, 59141, 59149, 59159, 59167, 59167, 59183, 59183,
+ 59197, 59207, 59209, 59221, 59221, 59239, 59243, 59243, 59263, 59263,
+ 59273, 59281, 59281, 59281, 59281, 59281, 59281, 59333, 59341, 59351,
+ 59359, 59359, 59369, 59377, 59387, 59399, 59407, 59407, 59419, 59419,
+ 59419, 59447, 59453, 59453, 59471, 59473, 59473, 59473, 59497, 59509,
+ 59513, 59513, 59513, 59539, 59539, 59557, 59567, 59567, 59581, 59581,
+ 59581, 59581, 59611, 59621, 59629, 59629, 59629, 59651, 59663, 59671,
+ 59671, 59671, 59693, 59699, 59707, 59707, 59723, 59729, 59743, 59747,
+ 59753, 59753, 59771, 59779, 59791, 59797, 59797, 59809, 59809, 59809,
+ 59833, 59833, 59833, 59863, 59863, 59879, 59887, 59887, 59887, 59887,
+ 59887, 59921, 59929, 59929, 59951, 59957, 59957, 59971, 59981, 59981,
+ 59999, 59999, 60013, 60017, 60029, 60037, 60041, 60041, 60041, 60041,
+ 60077, 60083, 60091, 60103, 60107, 60107, 60127, 60133, 60139, 60149,
+ 60149, 60167, 60169, 60169, 60169, 60169, 60169, 60209, 60223, 60223,
+ 60223, 60223, 60251, 60259, 60271, 60271, 60271, 60293, 60293, 60293,
+ 60317, 60317, 60331, 60343, 60343, 60353, 60353, 60373, 60383, 60383,
+ 60397, 60397, 60413, 60413, 60427, 60427, 60443, 60449, 60457, 60457,
+ 60457, 60457, 60493, 60497, 60509, 60509, 60527, 60527, 60539, 60539,
+ 60539, 60539, 60539, 60539, 60589, 60589, 60607, 60611, 60623, 60631,
+ 60637, 60647, 60649, 60661, 60661, 60679, 60679, 60689, 60703, 60703,
+ 60719, 60727, 60733, 60737, 60737, 60757, 60763, 60773, 60779, 60779,
+ 60793, 60793, 60811, 60821, 60821, 60821, 60821, 60821, 60859, 60869,
+ 60869, 60887, 60889, 60901, 60901, 60919, 60923, 60923, 60943, 60943,
+ 60953, 60961, 60961, 60961, 60961, 60961, 61007, 61007, 61007, 61031,
+ 61031, 61043, 61051, 61057, 61057, 61057, 61057, 61091, 61099, 61099,
+ 61099, 61121, 61129, 61141, 61151, 61153, 61153, 61169, 61169, 61169,
+ 61169, 61169, 61211, 61223, 61231, 61231, 61231, 61253, 61261, 61261,
+ 61261, 61283, 61291, 61297, 61297, 61297, 61297, 61333, 61343, 61343,
+ 61357, 61363, 61363, 61381, 61381, 61381, 61403, 61409, 61417, 61417,
+ 61417, 61441, 61441, 61463, 61471, 61471, 61487, 61493, 61493, 61511,
+ 61519, 61519, 61519, 61543, 61547, 61559, 61561, 61561, 61583, 61583,
+ 61583, 61603, 61613, 61613, 61631, 61637, 61643, 61651, 61657, 61667,
+ 61673, 61687, 61687, 61703, 61703, 61717, 61723, 61729, 61729, 61751,
+ 61757, 61757, 61757, 61781, 61781, 61781, 61781, 61813, 61819, 61819,
+ 61837, 61843, 61843, 61861, 61871, 61879, 61879, 61879, 61879, 61909,
+ 61909, 61927, 61933, 61933, 61949, 61949, 61967, 61967, 61981, 61991,
+ 61991, 62003, 62011, 62017, 62017, 62039, 62047, 62053, 62057, 62071,
+ 62071, 62081, 62081, 62099, 62099, 62119, 62119, 62131, 62143, 62143,
+ 62143, 62143, 62171, 62171, 62191, 62191, 62207, 62213, 62219, 62219,
+ 62233, 62233, 62233, 62233, 62233, 62273, 62273, 62273, 62303, 62311,
+ 62311, 62327, 62327, 62327, 62351, 62351, 62351, 62351, 62383, 62383,
+ 62383, 62401, 62401, 62423, 62423, 62423, 62423, 62423, 62459, 62467,
+ 62477, 62483, 62483, 62501, 62507, 62507, 62507, 62533, 62539, 62549,
+ 62549, 62563, 62563, 62581, 62591, 62597, 62603, 62603, 62617, 62627,
+ 62639, 62639, 62653, 62659, 62659, 62659, 62687, 62687, 62701, 62701,
+ 62701, 62723, 62731, 62743, 62743, 62753, 62761, 62773, 62773, 62791,
+ 62791, 62801, 62801, 62819, 62827, 62827, 62827, 62851, 62861, 62869,
+ 62873, 62873, 62873, 62903, 62903, 62903, 62927, 62929, 62939, 62939,
+ 62939, 62939, 62971, 62983, 62989, 62989, 62989, 62989, 62989, 63031,
+ 63031, 63031, 63031, 63059, 63067, 63079, 63079, 63079, 63103, 63103,
+ 63113, 63127, 63131, 63131, 63149, 63149, 63149, 63149, 63179, 63179,
+ 63199, 63199, 63211, 63211, 63211, 63211, 63247, 63247, 63247, 63247,
+ 63277, 63281, 63281, 63299, 63311, 63317, 63317, 63331, 63337, 63347,
+ 63353, 63367, 63367, 63377, 63391, 63397, 63397, 63409, 63421, 63421,
+ 63439, 63443, 63443, 63463, 63467, 63473, 63487, 63493, 63499, 63499,
+ 63499, 63527, 63533, 63541, 63541, 63559, 63559, 63559, 63577, 63589,
+ 63599, 63607, 63611, 63617, 63629, 63629, 63647, 63649, 63659, 63671,
+ 63671, 63671, 63691, 63703, 63709, 63719, 63727, 63727, 63743, 63743,
+ 63743, 63761, 63773, 63781, 63781, 63799, 63803, 63809, 63823, 63823,
+ 63839, 63841, 63853, 63863, 63863, 63863, 63863, 63863, 63901, 63907,
+ 63913, 63913, 63929, 63929, 63949, 63949, 63949, 63949, 63977, 63977,
+ 63997, 64007, 64013, 64019, 64019, 64037, 64037, 64037, 64063, 64067,
+ 64067, 64081, 64091, 64091, 64109, 64109, 64123, 64123, 64123, 64151,
+ 64157, 64157, 64171, 64171, 64189, 64189, 64189, 64189, 64223, 64231,
+ 64237, 64237, 64237, 64237, 64271, 64279, 64283, 64283, 64303, 64303,
+ 64319, 64327, 64333, 64333, 64333, 64333, 64333, 64373, 64381, 64381,
+ 64399, 64403, 64403, 64403, 64403, 64439, 64439, 64453, 64453, 64453,
+ 64453, 64483, 64489, 64499, 64499, 64513, 64513, 64513, 64513, 64513,
+ 64553, 64567, 64567, 64579, 64591, 64591, 64601, 64613, 64621, 64627,
+ 64633, 64633, 64633, 64663, 64667, 64679, 64679, 64693, 64693, 64709,
+ 64717, 64717, 64717, 64717, 64747, 64747, 64763, 64763, 64783, 64783,
+ 64793, 64793, 64811, 64817, 64817, 64817, 64817, 64853, 64853, 64871,
+ 64879, 64879, 64891, 64901, 64901, 64919, 64927, 64927, 64937, 64951,
+ 64951, 64951, 64969, 64969, 64969, 64997, 65003, 65011, 65011, 65029,
+ 65033, 65033, 65053, 65063, 65071, 65071, 65071, 65089, 65101, 65111,
+ 65119, 65123, 65129, 65141, 65147, 65147, 65167, 65173, 65183, 65183,
+ 65183, 65203, 65213, 65213, 65213, 65239, 65239, 65239, 65257, 65269,
+ 65269, 65287, 65293, 65293, 65309, 65309, 65327, 65327, 65327, 65327,
+ 65357, 65357, 65371, 65381, 65381, 65393, 65407, 65413, 65423, 65423,
+ 65437, 65447, 65449, 65449, 65449, 65479, 65479, 65479, 65497, 65497,
+ 65519, 65521, 65521, 65543, 65551, 65557, 65563, 65563, 65581, 65587,
+ 65599, 65599, 65609, 65617, 65629, 65633, 65647, 65651, 65657, 65657,
+ 65677, 65687, 65687, 65701, 65707, 65719, 65719, 65731, 65731, 65731,
+ 65731, 65761, 65761, 65777, 65789, 65789, 65789, 65809, 65809, 65831,
+ 65839, 65843, 65851, 65851, 65867, 65867, 65881, 65881, 65899, 65899,
+ 65899, 65927, 65929, 65929, 65951, 65957, 65963, 65963, 65983, 65983,
+ 65993, 65993, 65993, 65993, 66029, 66037, 66047, 66047, 66047, 66071,
+ 66071, 66083, 66089, 66103, 66109, 66109, 66109, 66109, 66137, 66137,
+ 66137, 66161, 66173, 66179, 66191, 66191, 66191, 66191, 66221, 66221,
+ 66239, 66239, 66239, 66239, 66271, 66271, 66271, 66293, 66301, 66301,
+ 66301, 66301, 66301, 66343, 66347, 66359, 66361, 66373, 66383, 66383,
+ 66383, 66403, 66413, 66413, 66431, 66431, 66431, 66449, 66463, 66467,
+ 66467, 66467, 66491, 66499, 66509, 66509, 66523, 66533, 66541, 66541,
+ 66553, 66553, 66571, 66571, 66587, 66593, 66601, 66601, 66617, 66629,
+ 66629, 66643, 66653, 66653, 66653, 66653, 66683, 66683, 66701, 66701,
+ 66713, 66721, 66733, 66739, 66751, 66751, 66763, 66763, 66763, 66791,
+ 66797, 66797, 66809, 66821, 66821, 66821, 66841, 66853, 66863, 66863,
+ 66877, 66883, 66889, 66889, 66889, 66919, 66923, 66931, 66943, 66949,
+ 66959, 66959, 66973, 66977, 66977, 66977, 67003, 67003, 67021, 67021,
+ 67033, 67043, 67049, 67061, 67061, 67079, 67079, 67079, 67103, 67103,
+ 67103, 67121, 67129, 67141, 67141, 67157, 67157, 67169, 67181, 67189,
+ 67189, 67189, 67213, 67219, 67231, 67231, 67247, 67247, 67261, 67271,
+ 67273, 67273, 67289, 67289, 67307, 67307, 67307, 67307, 67343, 67349,
+ 67349, 67349, 67369, 67369, 67391, 67399, 67399, 67411, 67421, 67429,
+ 67433, 67447, 67453, 67453, 67453, 67477, 67481, 67493, 67499, 67511,
+ 67511, 67523, 67531, 67537, 67547, 67559, 67567, 67567, 67579, 67589,
+ 67589, 67607, 67607, 67619, 67631, 67631, 67631, 67651, 67651, 67651,
+ 67679, 67679, 67679, 67699, 67709, 67709, 67723, 67733, 67741, 67751,
+ 67759, 67763, 67763, 67783, 67789, 67789, 67807, 67807, 67819, 67829,
+ 67829, 67843, 67853, 67853, 67867, 67867, 67883, 67891, 67901, 67901,
+ 67901, 67927, 67933, 67943, 67943, 67957, 67967, 67967, 67979, 67987,
+ 67993, 67993, 67993, 68023, 68023, 68023, 68041, 68053, 68059, 68071,
+ 68071, 68087, 68087, 68099, 68111, 68113, 68113, 68113, 68141, 68147,
+ 68147, 68161, 68171, 68171, 68171, 68171, 68207, 68213, 68219, 68227,
+ 68239, 68239, 68239, 68261, 68261, 68279, 68281, 68281, 68281, 68311,
+ 68311, 68311, 68329, 68329, 68351, 68351, 68351, 68371, 68371, 68389,
+ 68399, 68399, 68399, 68399, 68399, 68437, 68447, 68449, 68449, 68449,
+ 68477, 68483, 68491, 68501, 68507, 68507, 68521, 68531, 68543, 68543,
+ 68543, 68567, 68567, 68581, 68581, 68597, 68597, 68611, 68611, 68611,
+ 68639, 68639, 68639, 68659, 68669, 68669, 68687, 68687, 68699, 68711,
+ 68713, 68713, 68729, 68743, 68749, 68749, 68767, 68771, 68777, 68791,
+ 68791, 68791, 68813, 68821, 68821, 68821, 68821, 68821, 68863, 68863,
+ 68879, 68881, 68891, 68903, 68909, 68917, 68927, 68927, 68927, 68947,
+ 68947, 68963, 68963, 68963, 68963, 68993, 69001, 69011, 69019, 69031,
+ 69031, 69031, 69031, 69061, 69067, 69073, 69073, 69073, 69073, 69109,
+ 69119, 69127, 69127, 69143, 69151, 69151, 69163, 69163, 69163, 69191,
+ 69197, 69203, 69203, 69221, 69221, 69239, 69247, 69247, 69263, 69263,
+ 69263, 69263, 69263, 69263, 69263, 69317, 69317, 69317, 69341, 69341,
+ 69341, 69341, 69371, 69383, 69389, 69389, 69403, 69403, 69403, 69431,
+ 69439, 69439, 69439, 69463, 69467, 69473, 69481, 69493, 69499, 69499,
+ 69499, 69499, 69499, 69539, 69539, 69557, 69557, 69557, 69557, 69557,
+ 69593, 69593, 69593, 69623, 69623, 69623, 69623, 69653, 69661, 69661,
+ 69677, 69677, 69691, 69697, 69709, 69709, 69709, 69709, 69739, 69739,
+ 69739, 69767, 69767, 69779, 69779, 69779, 69779, 69809, 69821, 69829,
+ 69833, 69847, 69847, 69859, 69859, 69877, 69877, 69877, 69899, 69911,
+ 69911, 69911, 69931, 69941, 69941, 69959, 69959, 69959, 69959, 69991,
+ 69997, 70003, 70009, 70019, 70019, 70039, 70039, 70051, 70061, 70067,
+ 70079, 70079, 70079, 70099, 70111, 70117, 70123, 70123, 70141, 70141,
+ 70157, 70163, 70163, 70183, 70183, 70199, 70207, 70207, 70223, 70229,
+ 70237, 70241, 70249, 70249, 70271, 70271, 70271, 70289, 70297, 70309,
+ 70313, 70327, 70327, 70327, 70351, 70351, 70351, 70373, 70381, 70381,
+ 70393, 70393, 70393, 70423, 70429, 70439, 70439, 70451, 70459, 70459,
+ 70459, 70487, 70489, 70501, 70507, 70507, 70507, 70529, 70537, 70549,
+ 70549, 70549, 70573, 70583, 70589, 70589, 70607, 70607, 70621, 70627,
+ 70639, 70639, 70639, 70663, 70667, 70667, 70687, 70687, 70687, 70709,
+ 70717, 70717, 70729, 70729, 70729, 70753, 70753, 70769, 70783, 70783,
+ 70793, 70793, 70793, 70823, 70823, 70823, 70843, 70853, 70853, 70867,
+ 70879, 70879, 70891, 70901, 70901, 70919, 70921, 70921, 70937, 70951,
+ 70957, 70957, 70969, 70981, 70991, 70999, 70999, 71011, 71023, 71023,
+ 71039, 71039, 71039, 71059, 71069, 71069, 71081, 71089, 71089, 71089,
+ 71119, 71119, 71129, 71143, 71147, 71153, 71167, 71171, 71171, 71191,
+ 71191, 71191, 71209, 71209, 71209, 71237, 71237, 71249, 71263, 71263,
+ 71263, 71287, 71293, 71293, 71293, 71317, 71327, 71333, 71341, 71347,
+ 71359, 71363, 71363, 71363, 71389, 71399, 71399, 71413, 71419, 71429,
+ 71437, 71443, 71453, 71453, 71471, 71479, 71483, 71483, 71503, 71503,
+ 71503, 71527, 71527, 71537, 71551, 71551, 71563, 71569, 71569, 71569,
+ 71597, 71597, 71597, 71597, 71597, 71633, 71647, 71647, 71663, 71671,
+ 71671, 71671, 71693, 71699, 71711, 71719, 71719, 71719, 71741, 71741,
+ 71741, 71761, 71761, 71777, 71789, 71789, 71807, 71809, 71821, 71821,
+ 71837, 71843, 71849, 71861, 71867, 71879, 71887, 71887, 71899, 71909,
+ 71917, 71917, 71933, 71941, 71947, 71947, 71963, 71971, 71983, 71987,
+ 71999, 71999, 71999, 72019, 72031, 72031, 72047, 72053, 72053, 72053,
+ 72077, 72077, 72091, 72103, 72109, 72109, 72109, 72109, 72139, 72139,
+ 72139, 72167, 72173, 72173, 72173, 72173, 72173, 72211, 72223, 72229,
+ 72229, 72229, 72253, 72253, 72271, 72277, 72287, 72287, 72287, 72307,
+ 72313, 72313, 72313, 72341, 72341, 72353, 72367, 72367, 72383, 72383,
+ 72383, 72383, 72383, 72421, 72431, 72431, 72431, 72431, 72461, 72469,
+ 72469, 72481, 72493, 72503, 72503, 72503, 72503, 72533, 72533, 72551,
+ 72559, 72559, 72559, 72577, 72577, 72577, 72577, 72613, 72623, 72623,
+ 72623, 72647, 72649, 72661, 72671, 72679, 72679, 72689, 72701, 72707,
+ 72719, 72727, 72733, 72739, 72739, 72739, 72767, 72767, 72767, 72767,
+ 72797, 72797, 72797, 72823, 72823, 72823, 72823, 72823, 72859, 72871,
+ 72871, 72883, 72893, 72901, 72911, 72911, 72923, 72931, 72937, 72949,
+ 72959, 72959, 72973, 72977, 72977, 72997, 72997, 73013, 73019, 73019,
+ 73039, 73043, 73043, 73063, 73063, 73079, 73079, 73091, 73091, 73091,
+ 73091, 73127, 73133, 73141, 73141, 73141, 73141, 73141, 73181, 73189,
+ 73189, 73189, 73189, 73189, 73189, 73237, 73243, 73243, 73259, 73259,
+ 73277, 73277, 73291, 73303, 73309, 73309, 73327, 73331, 73331, 73351,
+ 73351, 73363, 73369, 73379, 73387, 73387, 73387, 73387, 73421, 73421,
+ 73433, 73433, 73453, 73459, 73471, 73477, 73483, 73483, 73483, 73483,
+ 73517, 73523, 73529, 73529, 73547, 73553, 73561, 73571, 73583, 73589,
+ 73597, 73607, 73613, 73613, 73613, 73637, 73643, 73651, 73651, 73651,
+ 73679, 73681, 73693, 73699, 73709, 73709, 73727, 73727, 73727, 73751,
+ 73757, 73757, 73771, 73783, 73783, 73783, 73783, 73783, 73823, 73823,
+ 73823, 73847, 73849, 73859, 73867, 73877, 73883, 73883, 73897, 73907,
+ 73907, 73907, 73907, 73943, 73951, 73951, 73961, 73973, 73973, 73973,
+ 73999, 73999, 73999, 74021, 74027, 74027, 74047, 74051, 74051, 74071,
+ 74077, 74077, 74093, 74101, 74101, 74101, 74101, 74131, 74143, 74149,
+ 74159, 74167, 74167, 74177, 74189, 74197, 74203, 74209, 74219, 74231,
+ 74231, 74231, 74231, 74257, 74257, 74279, 74287, 74293, 74297, 74311,
+ 74317, 74323, 74323, 74323, 74323, 74357, 74363, 74363, 74383, 74383,
+ 74383, 74383, 74413, 74419, 74419, 74419, 74441, 74453, 74453, 74471,
+ 74471, 74471, 74489, 74489, 74509, 74509, 74527, 74531, 74531, 74551,
+ 74551, 74567, 74573, 74573, 74587, 74597, 74597, 74611, 74623, 74623,
+ 74623, 74623, 74653, 74653, 74653, 74653, 74687, 74687, 74699, 74707,
+ 74719, 74719, 74731, 74731, 74747, 74759, 74761, 74771, 74779, 74779,
+ 74797, 74797, 74797, 74821, 74831, 74831, 74843, 74843, 74861, 74869,
+ 74873, 74887, 74891, 74903, 74903, 74903, 74923, 74933, 74941, 74941,
+ 74959, 74959, 74959, 74959, 74959, 74959, 74959, 75013, 75017, 75029,
+ 75037, 75041, 75041, 75041, 75041, 75079, 75083, 75083, 75083, 75109,
+ 75109, 75109, 75133, 75133, 75149, 75149, 75167, 75169, 75181, 75181,
+ 75193, 75193, 75211, 75223, 75227, 75239, 75239, 75253, 75253, 75269,
+ 75277, 75277, 75289, 75289, 75307, 75307, 75323, 75329, 75337, 75347,
+ 75353, 75367, 75367, 75377, 75391, 75391, 75407, 75407, 75407, 75431,
+ 75437, 75437, 75437, 75437, 75437, 75479, 75479, 75479, 75503, 75511,
+ 75511, 75527, 75533, 75541, 75541, 75557, 75557, 75571, 75583, 75583,
+ 75583, 75583, 75611, 75619, 75629, 75629, 75641, 75653, 75659, 75659,
+ 75679, 75683, 75689, 75703, 75709, 75709, 75721, 75731, 75743, 75743,
+ 75743, 75767, 75773, 75781, 75787, 75797, 75797, 75797, 75821, 75821,
+ 75833, 75833, 75853, 75853, 75869, 75869, 75883, 75883, 75883, 75883,
+ 75913, 75913, 75931, 75941, 75941, 75941, 75967, 75967, 75983, 75991,
+ 75997, 76003, 76003, 76003, 76031, 76039, 76039, 76039, 76039, 76039,
+ 76079, 76081, 76091, 76103, 76103, 76103, 76123, 76129, 76129, 76147,
+ 76159, 76163, 76163, 76163, 76163, 76163, 76207, 76213, 76213, 76231,
+ 76231, 76243, 76253, 76261, 76261, 76261, 76283, 76289, 76303, 76303,
+ 76303, 76303, 76333, 76343, 76343, 76343, 76367, 76369, 76379, 76387,
+ 76387, 76403, 76403, 76423, 76423, 76423, 76441, 76441, 76463, 76471,
+ 76471, 76487, 76493, 76493, 76511, 76519, 76519, 76519, 76543, 76543,
+ 76543, 76561, 76561, 76579, 76579, 76597, 76607, 76607, 76607, 76631,
+ 76631, 76631, 76651, 76651, 76667, 76679, 76679, 76679, 76697, 76697,
+ 76717, 76717, 76733, 76733, 76733, 76757, 76757, 76771, 76781, 76781,
+ 76781, 76801, 76801, 76819, 76831, 76837, 76847, 76847, 76847, 76871,
+ 76873, 76883, 76883, 76883, 76907, 76919, 76919, 76919, 76943, 76949,
+ 76949, 76963, 76963, 76963, 76991, 76991, 77003, 77003, 77023, 77029,
+ 77029, 77047, 77047, 77047, 77069, 77069, 77081, 77093, 77101, 77101,
+ 77101, 77101, 77101, 77141, 77141, 77153, 77167, 77171, 77171, 77191,
+ 77191, 77201, 77213, 77213, 77213, 77239, 77243, 77249, 77263, 77269,
+ 77279, 77279, 77291, 77291, 77291, 77317, 77323, 77323, 77339, 77351,
+ 77359, 77359, 77369, 77383, 77383, 77383, 77383, 77383, 77419, 77431,
+ 77431, 77447, 77447, 77447, 77471, 77479, 77479, 77491, 77491, 77509,
+ 77513, 77527, 77527, 77543, 77551, 77557, 77563, 77573, 77573, 77591,
+ 77591, 77591, 77611, 77621, 77621, 77621, 77647, 77647, 77659, 77659,
+ 77659, 77687, 77689, 77699, 77711, 77719, 77723, 77731, 77743, 77747,
+ 77747, 77761, 77773, 77783, 77783, 77797, 77801, 77813, 77813, 77813,
+ 77839, 77839, 77849, 77863, 77867, 77867, 77867, 77893, 77899, 77899,
+ 77899, 77899, 77933, 77933, 77951, 77951, 77951, 77969, 77983, 77983,
+ 77999, 78007, 78007, 78017, 78031, 78031, 78041, 78049, 78059, 78059,
+ 78079, 78079, 78079, 78101, 78101, 78101, 78121, 78121, 78139, 78139,
+ 78157, 78167, 78173, 78179, 78191, 78193, 78203, 78203, 78203, 78229,
+ 78233, 78241, 78241, 78259, 78259, 78277, 78283, 78283, 78301, 78311,
+ 78317, 78317, 78317, 78341, 78347, 78347, 78367, 78367, 78367, 78367,
+ 78367, 78401, 78401, 78401, 78427, 78439, 78439, 78439, 78439, 78467,
+ 78479, 78487, 78487, 78497, 78511, 78517, 78517, 78517, 78541, 78541,
+ 78553, 78553, 78571, 78583, 78583, 78593, 78607, 78607, 78623, 78623,
+ 78623, 78643, 78653, 78653, 78653, 78653, 78653, 78691, 78697, 78707,
+ 78713, 78721, 78721, 78737, 78737, 78737, 78737, 78737, 78781, 78791,
+ 78797, 78803, 78809, 78823, 78823, 78839, 78839, 78853, 78857, 78857,
+ 78877, 78887, 78893, 78901, 78901, 78919, 78919, 78929, 78941, 78941,
+ 78941, 78941, 78941, 78979, 78989, 78989, 78989, 78989, 78989, 79031,
+ 79039, 79043, 79043, 79063, 79063, 79063, 79087, 79087, 79103, 79111,
+ 79111, 79111, 79133, 79139, 79151, 79159, 79159, 79159, 79181, 79187,
+ 79193, 79201, 79201, 79201, 79231, 79231, 79241, 79241, 79259, 79259,
+ 79279, 79283, 79283, 79301, 79309, 79319, 79319, 79333, 79337, 79349,
+ 79357, 79367, 79367, 79379, 79379, 79399, 79399, 79411, 79423, 79427,
+ 79433, 79433, 79451, 79451, 79451, 79451, 79481, 79493, 79493, 79493,
+ 79493, 79493, 79531, 79537, 79549, 79559, 79561, 79561, 79579, 79589,
+ 79589, 79601, 79613, 79621, 79631, 79633, 79633, 79633, 79657, 79669,
+ 79669, 79687, 79693, 79699, 79699, 79699, 79699, 79699, 79699, 79699,
+ 79757, 79757, 79769, 79777, 79777, 79777, 79801, 79813, 79823, 79829,
+ 79829, 79847, 79847, 79861, 79867, 79873, 79873, 79889, 79903, 79907,
+ 79907, 79907, 79907, 79943, 79943, 79943, 79967, 79973, 79979, 79987,
+ 79999, 79999, 79999, 80021, 80021, 80039, 80039, 80051, 80051, 80071,
+ 80077, 80077, 80077, 80077, 80111, 80111, 80111, 80111, 80141, 80149,
+ 80153, 80167, 80173, 80177, 80191, 80191, 80207, 80209, 80221, 80231,
+ 80239, 80239, 80251, 80263, 80263, 80279, 80287, 80287, 80287, 80309,
+ 80317, 80317, 80329, 80341, 80347, 80347, 80363, 80369, 80369, 80387,
+ 80387, 80407, 80407, 80407, 80429, 80429, 80447, 80449, 80449, 80471,
+ 80473, 80473, 80491, 80491, 80491, 80513, 80527, 80527, 80537, 80537,
+ 80557, 80567, 80567, 80567, 80567, 80599, 80603, 80611, 80621, 80629,
+ 80629, 80629, 80651, 80657, 80671, 80677, 80687, 80687, 80701, 80701,
+ 80713, 80713, 80713, 80737, 80749, 80749, 80761, 80761, 80783, 80789,
+ 80789, 80803, 80809, 80819, 80831, 80833, 80833, 80849, 80863, 80863,
+ 80863, 80863, 80863, 80897, 80911, 80917, 80923, 80933, 80933, 80933,
+ 80953, 80963, 80963, 80963, 80989, 80989, 81001, 81013, 81023, 81031,
+ 81031, 81047, 81049, 81049, 81071, 81077, 81083, 81083, 81101, 81101,
+ 81119, 81119, 81131, 81131, 81131, 81157, 81163, 81173, 81181, 81181,
+ 81199, 81203, 81203, 81223, 81223, 81239, 81239, 81239, 81239, 81239,
+ 81239, 81283, 81293, 81299, 81307, 81307, 81307, 81331, 81343, 81349,
+ 81359, 81359, 81373, 81373, 81373, 81373, 81401, 81409, 81421, 81421,
+ 81439, 81439, 81439, 81463, 81463, 81463, 81463, 81463, 81463, 81509,
+ 81517, 81527, 81533, 81533, 81551, 81559, 81563, 81569, 81569, 81569,
+ 81569, 81569, 81611, 81619, 81629, 81637, 81647, 81649, 81649, 81671,
+ 81677, 81677, 81689, 81703, 81707, 81707, 81727, 81727, 81737, 81749,
+ 81749, 81761, 81773, 81773, 81773, 81799, 81799, 81799, 81817, 81817,
+ 81839, 81847, 81853, 81853, 81869, 81869, 81883, 81883, 81901, 81901,
+ 81919, 81919, 81931, 81943, 81943, 81953, 81967, 81973, 81973, 81973,
+ 81973, 82007, 82013, 82021, 82031, 82039, 82039, 82051, 82051, 82067,
+ 82073, 82073, 82073, 82073, 82073, 82073, 82073, 82129, 82141, 82141,
+ 82153, 82163, 82171, 82183, 82189, 82193, 82207, 82207, 82223, 82231,
+ 82237, 82241, 82241, 82261, 82267, 82279, 82279, 82279, 82301, 82307,
+ 82307, 82307, 82307, 82339, 82351, 82351, 82361, 82373, 82373, 82387,
+ 82393, 82393, 82393, 82421, 82421, 82421, 82421, 82421, 82463, 82471,
+ 82471, 82487, 82493, 82499, 82507, 82507, 82507, 82531, 82531, 82549,
+ 82559, 82567, 82571, 82571, 82591, 82591, 82601, 82613, 82619, 82619,
+ 82633, 82633, 82651, 82657, 82657, 82657, 82657, 82657, 82699, 82699,
+ 82699, 82727, 82729, 82729, 82729, 82759, 82763, 82763, 82781, 82787,
+ 82799, 82799, 82813, 82813, 82813, 82837, 82847, 82847, 82847, 82847,
+ 82847, 82883, 82891, 82903, 82903, 82913, 82913, 82913, 82939, 82939,
+ 82939, 82963, 82963, 82981, 82981, 82997, 83003, 83009, 83023, 83023,
+ 83023, 83047, 83047, 83063, 83071, 83077, 83077, 83093, 83101, 83101,
+ 83117, 83117, 83117, 83137, 83137, 83137, 83137, 83137, 83177, 83177,
+ 83177, 83207, 83207, 83221, 83231, 83233, 83243, 83243, 83257, 83269,
+ 83273, 83273, 83273, 83299, 83311, 83311, 83311, 83311, 83341, 83341,
+ 83357, 83357, 83357, 83383, 83389, 83399, 83407, 83407, 83423, 83431,
+ 83437, 83443, 83449, 83459, 83471, 83477, 83477, 83477, 83497, 83497,
+ 83497, 83497, 83497, 83537, 83537, 83557, 83563, 83563, 83579, 83591,
+ 83597, 83597, 83609, 83621, 83621, 83639, 83641, 83653, 83663, 83663,
+ 83663, 83663, 83689, 83701, 83701, 83719, 83719, 83719, 83737, 83737,
+ 83737, 83761, 83773, 83777, 83791, 83791, 83791, 83813, 83813, 83813,
+ 83833, 83843, 83843, 83857, 83869, 83873, 83873, 83891, 83903, 83911,
+ 83911, 83921, 83933, 83939, 83939, 83939, 83939, 83969, 83983, 83987,
+ 83987, 83987, 84011, 84017, 84017, 84017, 84047, 84053, 84061, 84067,
+ 84067, 84067, 84089, 84089, 84089, 84089, 84127, 84131, 84143, 84143,
+ 84143, 84163, 84163, 84181, 84191, 84199, 84199, 84211, 84223, 84229,
+ 84239, 84247, 84247, 84263, 84263, 84263, 84263, 84263, 84299, 84307,
+ 84319, 84319, 84319, 84319, 84349, 84349, 84349, 84349, 84377, 84391,
+ 84391, 84407, 84407, 84421, 84431, 84437, 84443, 84449, 84463, 84467,
+ 84467, 84481, 84481, 84503, 84509, 84509, 84523, 84533, 84533, 84551,
+ 84559, 84559, 84559, 84559, 84589, 84589, 84589, 84589, 84589, 84631,
+ 84631, 84631, 84653, 84659, 84659, 84673, 84673, 84691, 84701, 84701,
+ 84719, 84719, 84731, 84737, 84751, 84751, 84761, 84761, 84761, 84787,
+ 84793, 84793, 84811, 84811, 84827, 84827, 84827, 84827, 84859, 84871,
+ 84871, 84871, 84871, 84871, 84871, 84919, 84919, 84919, 84919, 84947,
+ 84947, 84967, 84967, 84979, 84991, 84991, 84991, 85009, 85021, 85027,
+ 85037, 85037, 85049, 85061, 85061, 85061, 85087, 85093, 85103, 85109,
+ 85109, 85121, 85133, 85133, 85147, 85159, 85159, 85159, 85159, 85159,
+ 85199, 85201, 85213, 85223, 85229, 85237, 85247, 85247, 85259, 85259,
+ 85259, 85259, 85259, 85303, 85303, 85313, 85313, 85333, 85333, 85333,
+ 85333, 85363, 85369, 85381, 85381, 85381, 85381, 85411, 85411, 85429,
+ 85439, 85447, 85453, 85453, 85469, 85469, 85487, 85487, 85487, 85487,
+ 85517, 85523, 85531, 85531, 85549, 85549, 85549, 85571, 85577, 85577,
+ 85597, 85607, 85607, 85621, 85627, 85639, 85643, 85643, 85661, 85669,
+ 85669, 85669, 85691, 85703, 85711, 85717, 85717, 85733, 85733, 85751,
+ 85751, 85751, 85751, 85781, 85781, 85793, 85793, 85793, 85819, 85831,
+ 85837, 85847, 85853, 85853, 85853, 85853, 85853, 85889, 85903, 85909,
+ 85909, 85909, 85933, 85933, 85933, 85933, 85933, 85933, 85933, 85991,
+ 85999, 85999, 86011, 86017, 86029, 86029, 86029, 86029, 86029, 86069,
+ 86077, 86083, 86083, 86083, 86111, 86117, 86117, 86131, 86143, 86143,
+ 86143, 86161, 86171, 86183, 86183, 86197, 86201, 86209, 86209, 86209,
+ 86239, 86243, 86249, 86263, 86269, 86269, 86287, 86293, 86297, 86311,
+ 86311, 86323, 86323, 86341, 86351, 86357, 86357, 86371, 86381, 86389,
+ 86399, 86399, 86413, 86423, 86423, 86423, 86441, 86453, 86461, 86467,
+ 86477, 86477, 86491, 86501, 86509, 86509, 86509, 86533, 86539, 86539,
+ 86539, 86561, 86573, 86579, 86587, 86599, 86599, 86599, 86599, 86629,
+ 86629, 86629, 86629, 86629, 86629, 86677, 86677, 86693, 86693, 86711,
+ 86719, 86719, 86729, 86743, 86743, 86753, 86767, 86771, 86783, 86783,
+ 86783, 86783, 86813, 86813, 86813, 86837, 86843, 86851, 86861, 86869,
+ 86869, 86869, 86869, 86869, 86869, 86869, 86927, 86929, 86939, 86951,
+ 86959, 86959, 86969, 86981, 86981, 86993, 86993, 87013, 87013, 87013,
+ 87037, 87041, 87049, 87049, 87071, 87071, 87083, 87083, 87103, 87107,
+ 87119, 87121, 87133, 87133, 87151, 87151, 87151, 87151, 87181, 87187,
+ 87187, 87187, 87211, 87223, 87223, 87223, 87223, 87253, 87257, 87257,
+ 87277, 87281, 87293, 87299, 87299, 87317, 87323, 87323, 87337, 87337,
+ 87359, 87359, 87359, 87383, 87383, 87383, 87407, 87407, 87421, 87427,
+ 87433, 87443, 87443, 87443, 87443, 87473, 87481, 87491, 87491, 87511,
+ 87517, 87523, 87523, 87541, 87547, 87559, 87559, 87559, 87583, 87589,
+ 87589, 87589, 87613, 87623, 87631, 87631, 87643, 87649, 87649, 87671,
+ 87679, 87683, 87691, 87701, 87701, 87719, 87721, 87721, 87743, 87751,
+ 87751, 87767, 87767, 87767, 87767, 87797, 87803, 87811, 87811, 87811,
+ 87833, 87833, 87853, 87853, 87869, 87877, 87887, 87887, 87887, 87911,
+ 87917, 87917, 87931, 87943, 87943, 87959, 87961, 87973, 87977, 87991,
+ 87991, 88007, 88007, 88019, 88019, 88037, 88037, 88037, 88037, 88069,
+ 88079, 88079, 88093, 88093, 88093, 88117, 88117, 88129, 88129, 88129,
+ 88129, 88129, 88169, 88177, 88177, 88177, 88177, 88211, 88223, 88223,
+ 88237, 88241, 88241, 88261, 88261, 88261, 88261, 88289, 88301, 88301,
+ 88301, 88327, 88327, 88339, 88339, 88339, 88339, 88339, 88379, 88379,
+ 88397, 88397, 88411, 88423, 88427, 88427, 88427, 88427, 88463, 88471,
+ 88471, 88471, 88493, 88499, 88499, 88513, 88523, 88523, 88523, 88547,
+ 88547, 88547, 88547, 88547, 88591, 88591, 88607, 88609, 88609, 88609,
+ 88609, 88643, 88651, 88663, 88667, 88667, 88681, 88681, 88681, 88681,
+ 88681, 88721, 88729, 88741, 88747, 88747, 88747, 88771, 88771, 88789,
+ 88799, 88807, 88813, 88819, 88819, 88819, 88843, 88853, 88861, 88867,
+ 88873, 88883, 88883, 88903, 88903, 88919, 88919, 88919, 88937, 88951,
+ 88951, 88951, 88969, 88969, 88969, 88997, 89003, 89009, 89021, 89021,
+ 89021, 89041, 89051, 89057, 89071, 89071, 89087, 89087, 89101, 89107,
+ 89119, 89123, 89123, 89137, 89137, 89153, 89153, 89153, 89153, 89189,
+ 89189, 89203, 89213, 89213, 89231, 89237, 89237, 89237, 89261, 89269,
+ 89273, 89273, 89293, 89303, 89303, 89317, 89317, 89329, 89329, 89329,
+ 89329, 89363, 89371, 89381, 89387, 89399, 89399, 89413, 89417, 89431,
+ 89431, 89443, 89449, 89459, 89459, 89477, 89477, 89491, 89501, 89501,
+ 89519, 89527, 89533, 89533, 89533, 89533, 89567, 89567, 89567, 89591,
+ 89599, 89603, 89611, 89611, 89627, 89633, 89633, 89653, 89659, 89671,
+ 89671, 89681, 89689, 89689, 89689, 89689, 89689, 89689, 89689, 89689,
+ 89759, 89767, 89767, 89783, 89783, 89797, 89797, 89809, 89821, 89821,
+ 89839, 89839, 89849, 89849, 89867, 89867, 89867, 89891, 89899, 89909,
+ 89917, 89923, 89923, 89939, 89939, 89959, 89963, 89963, 89983, 89989,
+ 89989, 90007, 90011, 90023, 90031, 90031, 90031, 90053, 90059, 90071,
+ 90073, 90073, 90089, 90089, 90107, 90107, 90127, 90127, 90127, 90149,
+ 90149, 90163, 90173, 90173, 90191, 90199, 90203, 90203, 90217, 90227,
+ 90239, 90247, 90247, 90263, 90271, 90271, 90281, 90289, 90289, 90289,
+ 90313, 90313, 90313, 90313, 90313, 90359, 90359, 90373, 90379, 90379,
+ 90397, 90407, 90407, 90407, 90407, 90439, 90439, 90439, 90439, 90469,
+ 90473, 90481, 90481, 90499, 90511, 90511, 90527, 90533, 90533, 90547,
+ 90547, 90547, 90547, 90583, 90583, 90599, 90599, 90599, 90619, 90631,
+ 90631, 90647, 90647, 90659, 90659, 90679, 90679, 90679, 90703, 90709,
+ 90709, 90709, 90731, 90731, 90749, 90749, 90749, 90749, 90749, 90787,
+ 90793, 90803, 90803, 90823, 90823, 90833, 90847, 90847, 90863, 90863,
+ 90863, 90887, 90887, 90901, 90911, 90917, 90917, 90931, 90931, 90947,
+ 90947, 90947, 90971, 90977, 90989, 90997, 90997, 91009, 91019, 91019,
+ 91033, 91033, 91033, 91033, 91033, 91079, 91081, 91081, 91099, 91099,
+ 91099, 91127, 91129, 91141, 91151, 91159, 91163, 91163, 91183, 91183,
+ 91199, 91199, 91199, 91199, 91229, 91237, 91243, 91253, 91253, 91253,
+ 91253, 91283, 91291, 91303, 91309, 91309, 91309, 91331, 91331, 91331,
+ 91331, 91367, 91373, 91381, 91387, 91397, 91397, 91411, 91423, 91423,
+ 91433, 91433, 91453, 91463, 91463, 91463, 91463, 91493, 91499, 91499,
+ 91513, 91513, 91529, 91541, 91541, 91541, 91541, 91573, 91583, 91591,
+ 91591, 91591, 91591, 91621, 91631, 91639, 91639, 91639, 91639, 91639,
+ 91673, 91673, 91691, 91703, 91711, 91711, 91711, 91733, 91733, 91733,
+ 91757, 91757, 91771, 91781, 91781, 91781, 91807, 91813, 91823, 91823,
+ 91837, 91841, 91841, 91841, 91867, 91873, 91873, 91873, 91873, 91909,
+ 91909, 91921, 91921, 91943, 91951, 91957, 91967, 91969, 91969, 91969,
+ 91997, 92003, 92009, 92009, 92009, 92033, 92041, 92051, 92051, 92051,
+ 92077, 92083, 92083, 92083, 92111, 92119, 92119, 92119, 92143, 92143,
+ 92153, 92153, 92173, 92179, 92189, 92189, 92203, 92203, 92221, 92227,
+ 92237, 92243, 92251, 92251, 92269, 92269, 92269, 92269, 92297, 92311,
+ 92317, 92317, 92333, 92333, 92347, 92357, 92363, 92369, 92383, 92387,
+ 92399, 92401, 92413, 92419, 92431, 92431, 92431, 92431, 92461, 92467,
+ 92479, 92479, 92489, 92503, 92507, 92507, 92507, 92507, 92507, 92551,
+ 92557, 92567, 92569, 92581, 92581, 92593, 92593, 92593, 92623, 92627,
+ 92639, 92647, 92647, 92657, 92671, 92671, 92683, 92693, 92699, 92707,
+ 92717, 92723, 92723, 92737, 92737, 92753, 92767, 92767, 92779, 92791,
+ 92791, 92801, 92809, 92821, 92831, 92831, 92831, 92849, 92863, 92867,
+ 92867, 92867, 92893, 92899, 92899, 92899, 92927, 92927, 92941, 92951,
+ 92959, 92959, 92959, 92959, 92987, 92993, 93001, 93001, 93001, 93001,
+ 93001, 93047, 93053, 93059, 93059, 93077, 93083, 93089, 93103, 93103,
+ 93113, 93113, 93133, 93139, 93151, 93151, 93151, 93169, 93179, 93187,
+ 93199, 93199, 93199, 93199, 93229, 93239, 93241, 93253, 93263, 93263,
+ 93263, 93287, 93287, 93287, 93307, 93319, 93323, 93329, 93337, 93337,
+ 93337, 93337, 93371, 93383, 93383, 93383, 93407, 93407, 93419, 93427,
+ 93427, 93427, 93427, 93463, 93463, 93479, 93487, 93493, 93503, 93503,
+ 93503, 93523, 93529, 93529, 93529, 93559, 93563, 93563, 93581, 93581,
+ 93581, 93607, 93607, 93607, 93629, 93637, 93637, 93637, 93637, 93637,
+ 93637, 93683, 93683, 93703, 93703, 93719, 93719, 93719, 93739, 93739,
+ 93739, 93763, 93763, 93763, 93787, 93787, 93787, 93811, 93811, 93827,
+ 93827, 93827, 93851, 93851, 93871, 93871, 93887, 93893, 93901, 93911,
+ 93913, 93923, 93923, 93941, 93949, 93949, 93967, 93971, 93983, 93983,
+ 93997, 94007, 94009, 94009, 94009, 94033, 94033, 94049, 94063, 94063,
+ 94079, 94079, 94079, 94099, 94111, 94117, 94121, 94121, 94121, 94151,
+ 94153, 94153, 94169, 94169, 94169, 94169, 94207, 94207, 94219, 94229,
+ 94229, 94229, 94253, 94261, 94261, 94273, 94273, 94291, 94291, 94309,
+ 94309, 94327, 94331, 94343, 94351, 94351, 94351, 94351, 94379, 94379,
+ 94399, 94399, 94399, 94421, 94427, 94439, 94447, 94447, 94463, 94463,
+ 94477, 94483, 94483, 94483, 94483, 94513, 94513, 94531, 94543, 94547,
+ 94559, 94561, 94573, 94583, 94583, 94597, 94603, 94613, 94621, 94621,
+ 94621, 94621, 94651, 94651, 94651, 94651, 94687, 94693, 94693, 94709,
+ 94709, 94727, 94727, 94727, 94747, 94747, 94747, 94771, 94781, 94789,
+ 94793, 94793, 94811, 94823, 94823, 94837, 94847, 94849, 94849, 94849,
+ 94873, 94873, 94889, 94903, 94907, 94907, 94907, 94933, 94933, 94951,
+ 94951, 94961, 94961, 94961, 94961, 94999, 95003, 95009, 95021, 95027,
+ 95027, 95027, 95027, 95063, 95071, 95071, 95087, 95093, 95101, 95111,
+ 95111, 95111, 95131, 95143, 95143, 95153, 95153, 95153, 95177, 95191,
+ 95191, 95203, 95213, 95219, 95231, 95239, 95239, 95239, 95261, 95267,
+ 95279, 95287, 95287, 95287, 95311, 95317, 95327, 95327, 95339, 95339,
+ 95339, 95339, 95369, 95383, 95383, 95393, 95401, 95413, 95419, 95429,
+ 95429, 95443, 95443, 95461, 95471, 95479, 95483, 95483, 95483, 95507,
+ 95507, 95527, 95531, 95539, 95549, 95549, 95561, 95569, 95581, 95581,
+ 95597, 95603, 95603, 95621, 95629, 95633, 95633, 95651, 95651, 95651,
+ 95651, 95651, 95651, 95701, 95707, 95717, 95723, 95731, 95737, 95747,
+ 95747, 95747, 95773, 95783, 95791, 95791, 95803, 95813, 95819, 95819,
+ 95819, 95819, 95819, 95857, 95869, 95873, 95881, 95891, 95891, 95911,
+ 95917, 95923, 95929, 95929, 95947, 95959, 95959, 95971, 95971, 95989,
+ 95989, 96001, 96013, 96017, 96017, 96017, 96043, 96053, 96059, 96059,
+ 96079, 96079, 96079, 96097, 96097, 96097, 96097, 96097, 96137, 96149,
+ 96157, 96167, 96167, 96181, 96181, 96199, 96199, 96211, 96223, 96223,
+ 96233, 96233, 96233, 96263, 96269, 96269, 96281, 96293, 96293, 96293,
+ 96293, 96323, 96331, 96337, 96337, 96353, 96353, 96353, 96377, 96377,
+ 96377, 96401, 96401, 96419, 96431, 96431, 96443, 96451, 96461, 96469,
+ 96479, 96487, 96493, 96497, 96497, 96517, 96527, 96527, 96527, 96527,
+ 96557, 96557, 96557, 96581, 96589, 96589, 96601, 96601, 96601, 96601,
+ 96601, 96643, 96643, 96661, 96671, 96671, 96671, 96671, 96703, 96703,
+ 96703, 96703, 96731, 96739, 96749, 96757, 96763, 96769, 96779, 96787,
+ 96799, 96799, 96799, 96823, 96827, 96827, 96847, 96851, 96857, 96857,
+ 96857, 96857, 96893, 96893, 96911, 96911, 96911, 96931, 96931, 96931,
+ 96959, 96959, 96973, 96979, 96989, 96997, 97007, 97007, 97021, 97021,
+ 97039, 97039, 97039, 97039, 97039, 97073, 97081, 97081, 97103, 97103,
+ 97117, 97127, 97127, 97127, 97151, 97159, 97159, 97171, 97177, 97187,
+ 97187, 97187, 97213, 97213, 97231, 97231, 97241, 97241, 97259, 97259,
+ 97259, 97283, 97283, 97303, 97303, 97303, 97327, 97327, 97327, 97327,
+ 97327, 97367, 97373, 97381, 97387, 97397, 97397, 97397, 97423, 97429,
+ 97429, 97441, 97453, 97463, 97463, 97463, 97463, 97463, 97501, 97511,
+ 97511, 97523, 97523, 97523, 97549, 97553, 97561, 97571, 97583, 97583,
+ 97583, 97607, 97613, 97613, 97613, 97613, 97613, 97651, 97651, 97651,
+ 97673, 97687, 97687, 97687, 97711, 97711, 97711, 97729, 97729, 97729,
+ 97729, 97729, 97771, 97777, 97789, 97789, 97789, 97813, 97813, 97829,
+ 97829, 97847, 97849, 97861, 97871, 97879, 97883, 97883, 97883, 97883,
+ 97919, 97927, 97931, 97943, 97943, 97943, 97967, 97973, 97973, 97987,
+ 97987, 97987, 98011, 98017, 98017, 98017, 98047, 98047, 98057, 98057,
+ 98057, 98081, 98081, 98101, 98101, 98101, 98123, 98129, 98143, 98143,
+ 98143, 98143, 98143, 98179, 98179, 98179, 98207, 98213, 98221, 98227,
+ 98227, 98227, 98251, 98257, 98269, 98269, 98269, 98269, 98299, 98299,
+ 98317, 98327, 98327, 98327, 98347, 98347, 98347, 98369, 98377, 98389,
+ 98389, 98407, 98411, 98419, 98429, 98429, 98443, 98453, 98459, 98467,
+ 98479, 98479, 98491, 98491, 98507, 98519, 98519, 98533, 98543, 98543,
+ 98543, 98563, 98573, 98573, 98573, 98597, 98597, 98597, 98621, 98627,
+ 98639, 98641, 98641, 98663, 98669, 98669, 98669, 98689, 98689, 98711,
+ 98717, 98717, 98731, 98737, 98737, 98737, 98737, 98773, 98779, 98779,
+ 98779, 98807, 98809, 98809, 98809, 98837, 98837, 98849, 98849, 98869,
+ 98873, 98887, 98893, 98899, 98911, 98911, 98927, 98929, 98939, 98947,
+ 98953, 98963, 98963, 98981, 98981, 98999, 98999, 99013, 99023, 99023,
+ 99023, 99041, 99053, 99053, 99053, 99079, 99083, 99089, 99103, 99109,
+ 99119, 99119, 99133, 99139, 99149, 99149, 99149, 99173, 99181, 99191,
+ 99191, 99191, 99191, 99223, 99223, 99233, 99241, 99251, 99259, 99259,
+ 99277, 99277, 99289, 99289, 99289, 99317, 99317, 99317, 99317, 99349,
+ 99349, 99367, 99371, 99377, 99391, 99397, 99401, 99409, 99409, 99431,
+ 99439, 99439, 99439, 99439, 99469, 99469, 99487, 99487, 99497, 99497,
+ 99497, 99527, 99529, 99529, 99551, 99559, 99563, 99571, 99581, 99581,
+ 99581, 99607, 99611, 99623, 99623, 99623, 99643, 99643, 99661, 99667,
+ 99679, 99679, 99689, 99689, 99709, 99719, 99721, 99733, 99733, 99733,
+ 99733, 99767, 99767, 99767, 99787, 99793, 99793, 99809, 99823, 99829,
+ 99839, 99839, 99839, 99859, 99871, 99877, 99881, 99881, 99901, 99907,
+ 99907, 99923, 99929, 99929, 99929, 99929, 99961, 99971
+};
+
+
+static const unsigned ByteSizePrimesCount=sizeof(ByteSizePrimes)/sizeof(ByteSizePrimes[0]);
+
+
+static unsigned
+Bits2PrimeNBytes(
+    unsigned Bits,
+    unsigned & BytesOut)
+{
+    unsigned prime, bytes, temp;
+
+    prime=0;
+    BytesOut=0;
+
+    if (0 < Bits)
+    {
+        bytes=(Bits+7)/8;
+        do
+        {
+            if (bytes<ByteSizePrimesCount)
+            {
+                temp=ByteSizePrimes[bytes];
+                if (Bits<=temp)
+                    prime=temp;
+                else
+                    ++bytes;
+            }   // if
+            else
+                prime=bytes*8;
+
+        } while(0==prime);
+
+        BytesOut=bytes;
+    }   // if
+
+    return(prime);
+
+}   // Bits2PrimeNBytes
+
+
+static unsigned
+Bytes2Prime(
+    unsigned Bytes)
+{
+    unsigned prime;
+
+    prime=0;
+    if (Bytes<ByteSizePrimesCount)
+    {
+        prime=ByteSizePrimes[Bytes];
+    }   // if
+    else
+    {
+        prime=Bytes*8;
+    }   // else
+
+    return(prime);
+
+}   // Bytes2Prime
+
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/bloom_test.cc b/src/leveldb/util/bloom_test.cc
index 1b87a2be3..724ebd45a 100644
--- a/src/leveldb/util/bloom_test.cc
+++ b/src/leveldb/util/bloom_test.cc
@@ -4,7 +4,6 @@
 
 #include "leveldb/filter_policy.h"
 
-#include "util/coding.h"
 #include "util/logging.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@@ -14,8 +13,8 @@ namespace leveldb {
 static const int kVerbose = 1;
 
 static Slice Key(int i, char* buffer) {
-  EncodeFixed32(buffer, i);
-  return Slice(buffer, sizeof(uint32_t));
+  memcpy(buffer, &i, sizeof(i));
+  return Slice(buffer, sizeof(i));
 }
 
 class BloomTest {
@@ -25,7 +24,8 @@ class BloomTest {
   std::vector<std::string> keys_;
 
  public:
-  BloomTest() : policy_(NewBloomFilterPolicy(10)) { }
+//  BloomTest() : policy_(NewBloomFilterPolicy(10)) { }
+  BloomTest() : policy_(NewBloomFilterPolicy2(16)) { }
 
   ~BloomTest() {
     delete policy_;
@@ -46,8 +46,7 @@ class BloomTest {
       key_slices.push_back(Slice(keys_[i]));
     }
     filter_.clear();
-    policy_->CreateFilter(&key_slices[0], static_cast<int>(key_slices.size()),
-                          &filter_);
+    policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_);
     keys_.clear();
     if (kVerbose >= 2) DumpFilter();
   }
@@ -107,8 +106,10 @@ static int NextLength(int length) {
     length += 10;
   } else if (length < 1000) {
     length += 100;
-  } else {
+  } else if (length < 15000) {
     length += 1000;
+  } else {
+    length += 15000;
   }
   return length;
 }
@@ -120,15 +121,15 @@ TEST(BloomTest, VaryingLengths) {
   int mediocre_filters = 0;
   int good_filters = 0;
 
-  for (int length = 1; length <= 10000; length = NextLength(length)) {
+  for (int length = 1; length <= 200000; length = NextLength(length)) {
     Reset();
     for (int i = 0; i < length; i++) {
       Add(Key(i, buffer));
     }
     Build();
 
-    ASSERT_LE(FilterSize(), static_cast<size_t>((length * 10 / 8) + 40))
-        << length;
+//    ASSERT_LE(FilterSize(), (length * 10 / 8) + 40) << length;
+    ASSERT_LE(FilterSize(), (length * 16 / 8) + 40) << length;
 
     // All added keys must match
     for (int i = 0; i < length; i++) {
diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc
index ce4688617..efa481f53 100644
--- a/src/leveldb/util/cache.cc
+++ b/src/leveldb/util/cache.cc
@@ -19,23 +19,6 @@ Cache::~Cache() {
 namespace {
 
 // LRU cache implementation
-//
-// Cache entries have an "in_cache" boolean indicating whether the cache has a
-// reference on the entry.  The only ways that this can become false without the
-// entry being passed to its "deleter" are via Erase(), via Insert() when
-// an element with a duplicate key is inserted, or on destruction of the cache.
-//
-// The cache keeps two linked lists of items in the cache.  All items in the
-// cache are in one list or the other, and never both.  Items still referenced
-// by clients but erased from the cache are in neither list.  The lists are:
-// - in-use:  contains the items currently referenced by clients, in no
-//   particular order.  (This list is used for invariant checking.  If we
-//   removed the check, elements that would otherwise be on this list could be
-//   left as disconnected singleton lists.)
-// - LRU:  contains the items not currently referenced by clients, in LRU order
-// Elements are moved between these lists by the Ref() and Unref() methods,
-// when they detect an element in the cache acquiring or losing its only
-// external reference.
 
 // An entry is a variable length heap-allocated structure.  Entries
 // are kept in a circular doubly linked list ordered by access time.
@@ -47,8 +30,7 @@ struct LRUHandle {
   LRUHandle* prev;
   size_t charge;      // TODO(opt): Only allow uint32_t?
   size_t key_length;
-  bool in_cache;      // Whether entry is in the cache.
-  uint32_t refs;      // References, including cache reference, if present.
+  uint32_t refs;
   uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
   char key_data[1];   // Beginning of key
 
@@ -134,6 +116,7 @@ class HandleTable {
       LRUHandle* h = list_[i];
       while (h != NULL) {
         LRUHandle* next = h->next_hash;
+        /*Slice key =*/ h->key();  // eliminate unused var warning, but allow for side-effects
         uint32_t hash = h->hash;
         LRUHandle** ptr = &new_list[hash & (new_length - 1)];
         h->next_hash = *ptr;
@@ -150,92 +133,98 @@ class HandleTable {
 };
 
 // A single shard of sharded cache.
-class LRUCache {
+class LRUCache : public Cache {
  public:
   LRUCache();
   ~LRUCache();
 
+  static inline uint32_t HashSlice(const Slice& s) {
+    return Hash(s.data(), s.size(), 0);
+  }
   // Separate from constructor so caller can easily make an array of LRUCache
   void SetCapacity(size_t capacity) { capacity_ = capacity; }
 
+  size_t GetCapacity() const {return(capacity_);};
+  size_t GetUsage() const {return(usage_);};
+
+  // Cache methods to allow direct use for single shard
+  virtual Cache::Handle* Insert(const Slice& key,
+                        void* value, size_t charge,
+                        void (*deleter)(const Slice& key, void* value))
+        {return(Insert(key, HashSlice(key), value, charge, deleter));};
+
+  virtual Cache::Handle* Lookup(const Slice& key)
+        {return(Lookup(key, HashSlice(key)));};
+
+  virtual void Release(Cache::Handle* handle);
+  virtual void Erase(const Slice& key)
+       {Erase(key, HashSlice(key));};
+  virtual void* Value(Handle* handle) {
+    return reinterpret_cast<LRUHandle*>(handle)->value;
+  }
+
+  virtual uint64_t NewId() {
+    return (++last_id_);
+  }
+
+  virtual size_t EntryOverheadSize() {return(sizeof(LRUHandle));};
+
   // Like Cache methods, but with an extra "hash" parameter.
   Cache::Handle* Insert(const Slice& key, uint32_t hash,
                         void* value, size_t charge,
                         void (*deleter)(const Slice& key, void* value));
   Cache::Handle* Lookup(const Slice& key, uint32_t hash);
-  void Release(Cache::Handle* handle);
+
   void Erase(const Slice& key, uint32_t hash);
-  void Prune();
-  size_t TotalCharge() const {
-    MutexLock l(&mutex_);
-    return usage_;
-  }
+
+    virtual void Addref(Cache::Handle* handle);
 
  private:
   void LRU_Remove(LRUHandle* e);
-  void LRU_Append(LRUHandle*list, LRUHandle* e);
-  void Ref(LRUHandle* e);
+  void LRU_Append(LRUHandle* e);
   void Unref(LRUHandle* e);
-  bool FinishErase(LRUHandle* e);
 
   // Initialized before use.
   size_t capacity_;
 
   // mutex_ protects the following state.
-  mutable port::Mutex mutex_;
+  port::Spin spin_;
   size_t usage_;
+  uint64_t last_id_;
 
   // Dummy head of LRU list.
   // lru.prev is newest entry, lru.next is oldest entry.
-  // Entries have refs==1 and in_cache==true.
   LRUHandle lru_;
 
-  // Dummy head of in-use list.
-  // Entries are in use by clients, and have refs >= 2 and in_cache==true.
-  LRUHandle in_use_;
-
   HandleTable table_;
 };
 
 LRUCache::LRUCache()
-    : usage_(0) {
-  // Make empty circular linked lists.
+    : usage_(0),
+      last_id_(0) {
+  // Make empty circular linked list
   lru_.next = &lru_;
   lru_.prev = &lru_;
-  in_use_.next = &in_use_;
-  in_use_.prev = &in_use_;
 }
 
 LRUCache::~LRUCache() {
-  assert(in_use_.next == &in_use_);  // Error if caller has an unreleased handle
   for (LRUHandle* e = lru_.next; e != &lru_; ) {
     LRUHandle* next = e->next;
-    assert(e->in_cache);
-    e->in_cache = false;
-    assert(e->refs == 1);  // Invariant of lru_ list.
+
+    assert(e->refs == 1);  // Error if caller has an unreleased handle
+
     Unref(e);
     e = next;
   }
 }
 
-void LRUCache::Ref(LRUHandle* e) {
-  if (e->refs == 1 && e->in_cache) {  // If on lru_ list, move to in_use_ list.
-    LRU_Remove(e);
-    LRU_Append(&in_use_, e);
-  }
-  e->refs++;
-}
-
 void LRUCache::Unref(LRUHandle* e) {
   assert(e->refs > 0);
   e->refs--;
-  if (e->refs == 0) { // Deallocate.
-    assert(!e->in_cache);
+  if (e->refs <= 0) {
+    usage_ -= e->charge;
     (*e->deleter)(e->key(), e->value);
     free(e);
-  } else if (e->in_cache && e->refs == 1) {  // No longer in use; move to lru_ list.
-    LRU_Remove(e);
-    LRU_Append(&lru_, e);
   }
 }
 
@@ -244,32 +233,43 @@ void LRUCache::LRU_Remove(LRUHandle* e) {
   e->prev->next = e->next;
 }
 
-void LRUCache::LRU_Append(LRUHandle* list, LRUHandle* e) {
-  // Make "e" newest entry by inserting just before *list
-  e->next = list;
-  e->prev = list->prev;
+void LRUCache::LRU_Append(LRUHandle* e) {
+  // Make "e" newest entry by inserting just before lru_
+  e->next = &lru_;
+  e->prev = lru_.prev;
   e->prev->next = e;
   e->next->prev = e;
 }
 
 Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
-  MutexLock l(&mutex_);
+  SpinLock l(&spin_);
   LRUHandle* e = table_.Lookup(key, hash);
   if (e != NULL) {
-    Ref(e);
+    e->refs++;
+    LRU_Remove(e);
+    LRU_Append(e);
   }
   return reinterpret_cast<Cache::Handle*>(e);
 }
 
 void LRUCache::Release(Cache::Handle* handle) {
-  MutexLock l(&mutex_);
+  SpinLock l(&spin_);
   Unref(reinterpret_cast<LRUHandle*>(handle));
 }
 
+void LRUCache::Addref(Cache::Handle* handle) {
+  SpinLock l(&spin_);
+  LRUHandle * e;
+
+  e=reinterpret_cast<LRUHandle*>(handle);
+  if (NULL!=e && 1 <= e->refs)
+      ++e->refs;
+}
+
 Cache::Handle* LRUCache::Insert(
     const Slice& key, uint32_t hash, void* value, size_t charge,
     void (*deleter)(const Slice& key, void* value)) {
-  MutexLock l(&mutex_);
+  SpinLock l(&spin_);
 
   LRUHandle* e = reinterpret_cast<LRUHandle*>(
       malloc(sizeof(LRUHandle)-1 + key.size()));
@@ -278,57 +278,48 @@ Cache::Handle* LRUCache::Insert(
   e->charge = charge;
   e->key_length = key.size();
   e->hash = hash;
-  e->in_cache = false;
-  e->refs = 1;  // for the returned handle.
+  e->refs = 2;  // One from LRUCache, one for the returned handle
   memcpy(e->key_data, key.data(), key.size());
+  LRU_Append(e);
+  usage_ += charge;
 
-  if (capacity_ > 0) {
-    e->refs++;  // for the cache's reference.
-    e->in_cache = true;
-    LRU_Append(&in_use_, e);
-    usage_ += charge;
-    FinishErase(table_.Insert(e));
-  } // else don't cache.  (Tests use capacity_==0 to turn off caching.)
-
-  while (usage_ > capacity_ && lru_.next != &lru_) {
-    LRUHandle* old = lru_.next;
-    assert(old->refs == 1);
-    bool erased = FinishErase(table_.Remove(old->key(), old->hash));
-    if (!erased) {  // to avoid unused variable when compiled NDEBUG
-      assert(erased);
-    }
+  LRUHandle* old = table_.Insert(e);
+  if (old != NULL) {
+    LRU_Remove(old);
+    Unref(old);
   }
 
+
+  // Riak - matthewv: code added to remove old only if it was not active.
+  //  Had scenarios where file cache would be largely or totally drained
+  //  because an active object does NOT reduce usage_ upon delete.  So
+  //  the previous while loop would basically delete everything.
+  LRUHandle * next, * cursor;
+
+  for (cursor=lru_.next; usage_ > capacity_ && cursor != &lru_; cursor=next)
+  {
+      // take next pointer before potentially destroying cursor
+      next=cursor->next;
+
+      // only delete cursor if it will actually destruct and
+      //   return value to usage_
+      if (cursor->refs <= 1)
+      {
+          LRU_Remove(cursor);
+          table_.Remove(cursor->key(), cursor->hash);
+          Unref(cursor);
+      }   // if
+  }   // for
+
   return reinterpret_cast<Cache::Handle*>(e);
 }
 
-// If e != NULL, finish removing *e from the cache; it has already been removed
-// from the hash table.  Return whether e != NULL.  Requires mutex_ held.
-bool LRUCache::FinishErase(LRUHandle* e) {
-  if (e != NULL) {
-    assert(e->in_cache);
-    LRU_Remove(e);
-    e->in_cache = false;
-    usage_ -= e->charge;
-    Unref(e);
-  }
-  return e != NULL;
-}
-
 void LRUCache::Erase(const Slice& key, uint32_t hash) {
-  MutexLock l(&mutex_);
-  FinishErase(table_.Remove(key, hash));
-}
-
-void LRUCache::Prune() {
-  MutexLock l(&mutex_);
-  while (lru_.next != &lru_) {
-    LRUHandle* e = lru_.next;
-    assert(e->refs == 1);
-    bool erased = FinishErase(table_.Remove(e->key(), e->hash));
-    if (!erased) {  // to avoid unused variable when compiled NDEBUG
-      assert(erased);
-    }
+  SpinLock l(&spin_);
+  LRUHandle* e = table_.Remove(key, hash);
+  if (e != NULL) {
+    LRU_Remove(e);
+    Unref(e);
   }
 }
 
@@ -338,7 +329,7 @@ static const int kNumShards = 1 << kNumShardBits;
 class ShardedLRUCache : public Cache {
  private:
   LRUCache shard_[kNumShards];
-  port::Mutex id_mutex_;
+  port::Spin id_spin_;
   uint64_t last_id_;
 
   static inline uint32_t HashSlice(const Slice& s) {
@@ -367,6 +358,10 @@ class ShardedLRUCache : public Cache {
     const uint32_t hash = HashSlice(key);
     return shard_[Shard(hash)].Lookup(key, hash);
   }
+  virtual void Addref(Handle* handle) {
+    LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
+    shard_[Shard(h->hash)].Addref(handle);
+  }
   virtual void Release(Handle* handle) {
     LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
     shard_[Shard(h->hash)].Release(handle);
@@ -379,21 +374,10 @@ class ShardedLRUCache : public Cache {
     return reinterpret_cast<LRUHandle*>(handle)->value;
   }
   virtual uint64_t NewId() {
-    MutexLock l(&id_mutex_);
+    SpinLock l(&id_spin_);
     return ++(last_id_);
   }
-  virtual void Prune() {
-    for (int s = 0; s < kNumShards; s++) {
-      shard_[s].Prune();
-    }
-  }
-  virtual size_t TotalCharge() const {
-    size_t total = 0;
-    for (int s = 0; s < kNumShards; s++) {
-      total += shard_[s].TotalCharge();
-    }
-    return total;
-  }
+  virtual size_t EntryOverheadSize() {return(sizeof(LRUHandle));};
 };
 
 }  // end anonymous namespace
@@ -402,4 +386,11 @@ Cache* NewLRUCache(size_t capacity) {
   return new ShardedLRUCache(capacity);
 }
 
+Cache* NewLRUCache2(size_t capacity) {
+    LRUCache * cache;
+    cache=new LRUCache();
+    cache->SetCapacity(capacity);
+    return cache;
+}
+
 }  // namespace leveldb
diff --git a/src/leveldb/util/cache2.cc b/src/leveldb/util/cache2.cc
new file mode 100644
index 000000000..3e2e3cfd1
--- /dev/null
+++ b/src/leveldb/util/cache2.cc
@@ -0,0 +1,760 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+//
+// mildly modified version of Google's original cache.cc to support
+//  Riak's flexcache.cc
+//
+
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "leveldb/atomics.h"
+#include "leveldb/env.h"
+#include "util/cache2.h"
+#include "port/port.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+//namespace {
+
+// LRU cache implementation
+
+// An entry is a variable length heap-allocated structure.  Entries
+// are kept in a circular doubly linked list ordered by access time.
+struct LRUHandle2 {
+  void* value;
+  void (*deleter)(const Slice&, void* value);
+  LRUHandle2* next_hash;
+  LRUHandle2* next;
+  LRUHandle2* prev;
+  size_t charge;      // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  uint32_t refs;
+  uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
+  time_t expire_seconds; // zero (no expire) or time when this object expires
+  char key_data[1];   // Beginning of key
+
+  Slice key() const {
+    // For cheaper lookups, we allow a temporary Handle object
+    // to store a pointer to a key in "value".
+    if (next == this) {
+      return *(reinterpret_cast<Slice*>(value));
+    } else {
+      return Slice(key_data, key_length);
+    }
+  }
+};
+
+// We provide our own simple hash table since it removes a whole bunch
+// of porting hacks and is also faster than some of the built-in hash
+// table implementations in some of the compiler/runtime combinations
+// we have tested.  E.g., readrandom speeds up by ~5% over the g++
+// 4.4.3's builtin hashtable.
+class HandleTable {
+ public:
+  HandleTable() : length_(0), elems_(0), list_(NULL) { Resize(); }
+  ~HandleTable() { delete[] list_; }
+
+  LRUHandle2* Lookup(const Slice& key, uint32_t hash) {
+    return *FindPointer(key, hash);
+  }
+
+  LRUHandle2* Insert(LRUHandle2* h) {
+    LRUHandle2** ptr = FindPointer(h->key(), h->hash);
+    LRUHandle2* old = *ptr;
+    h->next_hash = (old == NULL ? NULL : old->next_hash);
+    *ptr = h;
+    if (old == NULL) {
+      ++elems_;
+      if (elems_ > length_) {
+        // Since each cache entry is fairly large, we aim for a small
+        // average linked list length (<= 1).
+        Resize();
+      }
+    }
+    return old;
+  }
+
+  LRUHandle2* Remove(const Slice& key, uint32_t hash) {
+    LRUHandle2** ptr = FindPointer(key, hash);
+    LRUHandle2* result = *ptr;
+    if (result != NULL) {
+      *ptr = result->next_hash;
+      --elems_;
+    }
+    return result;
+  }
+
+ private:
+  // The table consists of an array of buckets where each bucket is
+  // a linked list of cache entries that hash into the bucket.
+  uint32_t length_;
+  uint32_t elems_;
+  LRUHandle2** list_;
+
+  // Return a pointer to slot that points to a cache entry that
+  // matches key/hash.  If there is no such cache entry, return a
+  // pointer to the trailing slot in the corresponding linked list.
+  LRUHandle2** FindPointer(const Slice& key, uint32_t hash) {
+    LRUHandle2** ptr = &list_[hash & (length_ - 1)];
+    while (*ptr != NULL &&
+           ((*ptr)->hash != hash || key != (*ptr)->key())) {
+      ptr = &(*ptr)->next_hash;
+    }
+    return ptr;
+  }
+
+  void Resize() {
+    uint32_t new_length = 4;
+    while (new_length < elems_) {
+      new_length *= 2;
+    }
+    LRUHandle2** new_list = new LRUHandle2*[new_length];
+    memset(new_list, 0, sizeof(new_list[0]) * new_length);
+    uint32_t count = 0;
+    for (uint32_t i = 0; i < length_; i++) {
+      LRUHandle2* h = list_[i];
+      while (h != NULL) {
+        LRUHandle2* next = h->next_hash;
+        /*Slice key =*/ h->key();  // eliminate unused var warning, but allow for side-effects
+        uint32_t hash = h->hash;
+        LRUHandle2** ptr = &new_list[hash & (new_length - 1)];
+        h->next_hash = *ptr;
+        *ptr = h;
+        h = next;
+        count++;
+      }
+    }
+    assert(elems_ == count);
+    delete[] list_;
+    list_ = new_list;
+    length_ = new_length;
+  }
+};
+
+
+// A single shard of sharded cache.
+class LRUCache2 : public Cache {
+ public:
+  LRUCache2();
+  ~LRUCache2();
+
+  static inline uint32_t HashSlice(const Slice& s) {
+    return Hash(s.data(), s.size(), 0);
+  }
+  // Separate from constructor so caller can easily make an array of LRUCache2
+
+  // Cache2 methods to allow direct use for single shard
+  virtual Cache::Handle* Insert(const Slice& key,
+                        void* value, size_t charge,
+                        void (*deleter)(const Slice& key, void* value))
+        {return(Insert(key, HashSlice(key), value, charge, deleter));};
+
+  virtual Cache::Handle* Lookup(const Slice& key)
+        {return(Lookup(key, HashSlice(key)));};
+
+  virtual void Release(Cache::Handle* handle);
+  virtual bool ReleaseOne();
+  virtual void Erase(const Slice& key)
+       {Erase(key, HashSlice(key));};
+  virtual void* Value(Handle* handle) {
+    return reinterpret_cast<LRUHandle2*>(handle)->value;
+  }
+
+  virtual uint64_t NewId() {
+      return inc_and_fetch(&last_id_);
+  }
+
+  virtual size_t EntryOverheadSize() {return(sizeof(LRUHandle2));};
+
+  // Like Cache methods, but with an extra "hash" parameter.
+  Cache::Handle* Insert(const Slice& key, uint32_t hash,
+                        void* value, size_t charge,
+                        void (*deleter)(const Slice& key, void* value));
+  Cache::Handle* Lookup(const Slice& key, uint32_t hash);
+
+  void Erase(const Slice& key, uint32_t hash);
+
+  virtual void Addref(Cache::Handle* handle);
+
+  void SetParent(ShardedLRUCache2 * Parent, bool IsFileCache)
+    {parent_=Parent; is_file_cache_=IsFileCache;};
+
+  LRUHandle2 * LRUHead() {return(&lru_);}
+
+  void LRUErase(LRUHandle2 * cursor)
+  {
+    LRU_Remove(cursor);
+    table_.Remove(cursor->key(), cursor->hash);
+    Unref(cursor);
+  }
+
+ private:
+  void LRU_Remove(LRUHandle2* e);
+  void LRU_Append(LRUHandle2* e);
+  void Unref(LRUHandle2* e);
+
+  // Initialized before use.
+  class ShardedLRUCache2 * parent_;
+  bool is_file_cache_;
+
+  // mutex_ protects the following state.
+  port::Spin spin_;
+  uint64_t last_id_;
+
+  // Dummy head of LRU list.
+  // lru.prev is newest entry, lru.next is oldest entry.
+  LRUHandle2 lru_;
+
+  HandleTable table_;
+};
+
+LRUCache2::LRUCache2()
+  : parent_(NULL), is_file_cache_(true), last_id_(0)
+{
+  // Make empty circular linked list
+  lru_.next = &lru_;
+  lru_.prev = &lru_;
+  lru_.expire_seconds=0;
+}
+
+LRUCache2::~LRUCache2() {
+  for (LRUHandle2* e = lru_.next; e != &lru_; ) {
+    LRUHandle2* next = e->next;
+
+    assert(e->refs == 1);  // Error if caller has an unreleased handle
+    Unref(e);
+    e = next;
+  }
+}
+
+void LRUCache2::LRU_Remove(LRUHandle2* e) {
+  e->next->prev = e->prev;
+  e->prev->next = e->next;
+}
+
+void LRUCache2::LRU_Append(LRUHandle2* e) {
+  // Make "e" newest entry by inserting just before lru_
+  e->next = &lru_;
+  e->prev = lru_.prev;
+  e->prev->next = e;
+  e->next->prev = e;
+}
+
+//Cache::Handle* LRUCache2::Lookup(const Slice& key, uint32_t hash);
+
+void LRUCache2::Release(Cache::Handle* handle) {
+  SpinLock l(&spin_);
+  Unref(reinterpret_cast<LRUHandle2*>(handle));
+}
+
+void LRUCache2::Addref(Cache::Handle* handle) {
+  SpinLock l(&spin_);
+  LRUHandle2 * e;
+
+  e=reinterpret_cast<LRUHandle2*>(handle);
+  if (NULL!=e && 1 <= e->refs)
+      ++e->refs;
+}
+
+
+void LRUCache2::Erase(const Slice& key, uint32_t hash) {
+  SpinLock l(&spin_);
+  LRUHandle2* e = table_.Remove(key, hash);
+  if (e != NULL) {
+    LRU_Remove(e);
+    Unref(e);
+  }
+}
+
+//}  // end anonymous namespace
+
+
+static const int kNumShardBits = 4;
+static const int kNumShards = 1 << kNumShardBits;
+
+class ShardedLRUCache2 : public Cache {
+public:
+  volatile uint64_t usage_;        // cache2's usage is across all shards,
+                                   //  simplifies FlexCache management
+
+private:
+  LRUCache2 shard_[kNumShards];
+  port::Spin id_spin_;
+  DoubleCache & parent_;
+  bool is_file_cache_;
+  size_t next_shard_;
+  volatile uint64_t last_id_;
+
+  static inline uint32_t HashSlice(const Slice& s) {
+    return Hash(s.data(), s.size(), 0);
+  }
+
+  static uint32_t Shard(uint32_t hash) {
+    return hash >> (32 - kNumShardBits);
+  }
+
+ public:
+  explicit ShardedLRUCache2(class DoubleCache & Parent, bool IsFileCache)
+      : usage_(0), parent_(Parent), is_file_cache_(IsFileCache), next_shard_(0), last_id_(0) {
+    for (int s = 0; s < kNumShards; s++)
+    {
+        shard_[s].SetParent(this, IsFileCache);
+    }
+
+  }
+  virtual ~ShardedLRUCache2() { }
+  volatile uint64_t GetUsage() const {return(usage_);};
+  volatile uint64_t * GetUsagePtr() {return(&usage_);};
+  volatile uint64_t GetCapacity() {return(parent_.GetCapacity(is_file_cache_));}
+  time_t GetFileTimeout() {return(parent_.GetFileTimeout());};
+
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) {
+    const uint32_t hash = HashSlice(key);
+    return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter);
+  }
+  virtual Handle* Lookup(const Slice& key) {
+    const uint32_t hash = HashSlice(key);
+    return shard_[Shard(hash)].Lookup(key, hash);
+  }
+  virtual void Addref(Handle* handle) {
+    LRUHandle2* h = reinterpret_cast<LRUHandle2*>(handle);
+    shard_[Shard(h->hash)].Addref(handle);
+  }
+  virtual void Release(Handle* handle) {
+    LRUHandle2* h = reinterpret_cast<LRUHandle2*>(handle);
+    shard_[Shard(h->hash)].Release(handle);
+  }
+  virtual void Erase(const Slice& key) {
+    const uint32_t hash = HashSlice(key);
+    shard_[Shard(hash)].Erase(key, hash);
+  }
+  virtual void* Value(Handle* handle) {
+    return reinterpret_cast<LRUHandle2*>(handle)->value;
+  }
+  virtual uint64_t NewId() {
+      return inc_and_fetch(&last_id_);
+  }
+  virtual size_t EntryOverheadSize() {return(sizeof(LRUHandle2));};
+
+  // reduce usage of all shards to fit within current capacity limit
+  void Resize()
+  {
+      size_t end_shard;
+      bool one_deleted;
+
+      SpinLock l(&id_spin_);
+      end_shard=next_shard_;
+      one_deleted=true;
+
+      while((parent_.GetCapacity(is_file_cache_) < usage_) && one_deleted)
+      {
+          one_deleted=false;
+
+          // round robin delete ... later, could delete from most full or such
+          //   but keep simple since using spin lock
+          do
+          {
+              one_deleted=shard_[next_shard_].ReleaseOne();
+              next_shard_=(next_shard_ +1) % kNumShards;
+          } while(end_shard!=next_shard_ && !one_deleted);
+
+      }   // while
+
+      return;
+
+  } // ShardedLRUCache2::Resize
+
+
+  // let doublecache know state of cache space
+  void SetFreeSpaceWarning(size_t FileMetaSize)
+  {
+      bool plenty_space;
+
+      plenty_space=(GetUsage() + 5*FileMetaSize < GetCapacity());
+
+      parent_.SetPlentySpace(plenty_space);
+  }   // SetFreeSpaceWarning
+
+
+  // Only used on file cache.  Remove entries that are too old
+  void PurgeExpiredFiles()
+  {
+      if (is_file_cache_)
+      {
+          int loop;
+          time_t now;
+
+          now=Env::Default()->NowMicros() / 1000000L;
+
+          SpinLock l(&id_spin_);
+
+          for (loop=0; loop<kNumShards; ++loop)
+          {
+              LRUHandle2 * next, * cursor;
+
+              for (cursor=shard_[loop].LRUHead()->next;
+                   cursor->expire_seconds <= now && cursor != shard_[loop].LRUHead();
+                   cursor=next)
+              {
+                  // take next pointer before potentially destroying cursor
+                  next=cursor->next;
+
+                  // only delete cursor if it will actually destruct and
+                  //   return value to usage_
+                  if (cursor->refs <= 1 && 0!=cursor->expire_seconds)
+                  {
+                      shard_[loop].LRUErase(cursor);
+                  }   // if
+              }   // for
+          }   // for
+      }   // if
+
+      return;
+
+  } // ShardedLRUCache2::PurgeExpiredFiles
+
+  // Walk all cache entries, calling functor Acc for each
+  bool
+  WalkCache(
+      CacheAccumulator & Acc)
+  {
+      int loop;
+      bool good(true);
+
+      SpinLock l(&id_spin_);
+
+      for (loop=0; loop<kNumShards && good; ++loop)
+      {
+          LRUHandle2 * cursor;
+
+          for (cursor=shard_[loop].LRUHead()->next;
+               cursor != shard_[loop].LRUHead() && good;
+               cursor=cursor->next)
+          {
+              good=Acc(cursor->value);
+          }   // for
+      }   // for
+
+      return(good);
+
+  } // ShardedLRUCache2::WalkCache
+
+};  //ShardedLRUCache2
+
+
+/**
+ * Initialize cache pair based upon current conditions
+ */
+DoubleCache::DoubleCache(
+    const Options & options)
+    : m_FileCache(NULL), m_BlockCache(NULL),
+      m_IsInternalDB(options.is_internal_db), m_PlentySpace(true),
+      m_Overhead(0), m_TotalAllocation(0),
+      m_FileTimeout(10*24*60*60),  // default is 10 days
+      m_BlockCacheThreshold(options.block_cache_threshold),
+      m_SizeCachedFiles(0)
+{
+    // fixed allocation for recovery log and info LOG: 20M each
+    //  (with 64 or open databases, this is a serious number)
+    // and fixed allocation for two write buffers
+
+    m_Overhead=options.write_buffer_size*2
+        + options.env->RecoveryMmapSize(&options) + 4096;
+    m_TotalAllocation=gFlexCache.GetDBCacheCapacity(m_IsInternalDB);
+
+    if (m_Overhead < m_TotalAllocation)
+        m_TotalAllocation -= m_Overhead;
+    else
+        m_TotalAllocation=0;
+
+    // build two new caches
+    Flush();
+
+}   // DoubleCache::DoubleCache
+
+
+DoubleCache::~DoubleCache()
+{
+    delete m_FileCache;
+    delete m_BlockCache;
+
+}   // DoubleCache::DoubleCache
+
+
+/**
+ * Resize each of the caches based upon new global conditions
+ */
+void
+DoubleCache::ResizeCaches()
+{
+    m_TotalAllocation=gFlexCache.GetDBCacheCapacity(m_IsInternalDB);
+    if (m_Overhead < m_TotalAllocation)
+        m_TotalAllocation -= m_Overhead;
+    else
+        m_TotalAllocation=0;
+
+    // worst case is size reduction, take from block cache first
+    m_BlockCache->Resize();
+    m_FileCache->Resize();
+
+    return;
+
+}   // DoubleCache::ResizeCaches()
+
+
+/**
+ * Calculate limit to file or block cache based upon global conditions
+ */
+size_t
+DoubleCache::GetCapacity(
+    bool IsFileCache,
+    bool EstimatePageCache)
+{
+    size_t  ret_val;
+
+    ret_val=0;
+
+    if (2*1024*1024L < m_TotalAllocation)
+    {
+        // file capacity is "fixed", it is always the entire
+        //  cache allocation less minimum block size
+        if (IsFileCache)
+        {
+            ret_val=m_TotalAllocation - (2*1024*1024L);
+        }   // if
+
+        // block cache capacity is whatever file cache is not
+        //  not using, or its minimum ... whichever is larger
+        else
+        {
+            uint64_t temp;
+
+            // usage could vary between two calls,
+            //   get it once and use same twice
+            temp=m_FileCache->GetUsage();
+
+            if (temp<m_TotalAllocation)
+            {
+                // block cache gets whatever is left after
+                //  file cache usage
+                ret_val=m_TotalAllocation - temp;
+
+                if (EstimatePageCache)
+                {
+                    // if block cache allocation exceeds threshold,
+                    //  give up some to page cache
+                    if (m_BlockCacheThreshold < ret_val)
+                    {
+                        uint32_t spare;
+
+                        spare=ret_val-m_BlockCacheThreshold;
+
+                        // use m_SizeCachedFiles as approximation of page cache
+                        //  space needed for full files, i.e. prefer page cache to block cache
+                        //  (must use temp since m_SizeCachedFiles is volatile)
+                        temp = m_SizeCachedFiles;
+                        if (temp < spare)
+                            spare -= temp;
+                        else
+                            spare=0;
+
+                        ret_val=m_BlockCacheThreshold + spare;
+                    }   // if
+                }   // if
+
+                // always allow for 2Mbyte minimum
+                //   (this minimum overrides m_BlockCacheThreshold)
+                if (ret_val < (2*1024*1024L))
+                    ret_val=(2*1024*1024L);
+            }   // if
+        }   // else
+    }   // if
+
+    return(ret_val);
+
+}   // DoubleCache::GetCapacity
+
+
+/**
+ * Wipe out existing caches (if any), create two new ones
+ *  WARNING:  this is really for UNIT TESTS.  DBImpl and TableCache
+ *  save a copy of the pointers below and will not know of a change.
+ *  The old pointer technology is holdover from original implementation.
+ */
+void
+DoubleCache::Flush()
+{
+    delete m_FileCache;
+    delete m_BlockCache;
+
+    m_FileCache=new ShardedLRUCache2(*this, true);
+    m_BlockCache=new ShardedLRUCache2(*this, false);
+
+    return;
+
+}   // DoubleCache::Flush
+
+
+/**
+ * Make room in block cache by killing off file cache
+ *  entries that have been unused for a while
+ */
+void
+DoubleCache::PurgeExpiredFiles()
+{
+    m_FileCache->PurgeExpiredFiles();
+
+    return;
+
+}   // DoubleCache::PurgExpiredFiles
+
+
+//
+// Definitions moved so they could access ShardedLRUCache members
+//  (subtle hint to Google that every object should have .h file
+//    because future reuse is unknowable ... and this ain't Java)
+//
+Cache::Handle* LRUCache2::Lookup(const Slice& key, uint32_t hash) {
+  SpinLock l(&spin_);
+  LRUHandle2* e = table_.Lookup(key, hash);
+  if (e != NULL) {
+    e->refs++;
+    LRU_Remove(e);
+    LRU_Append(e);
+
+    // establish time limit on files in file cache (like 10 days)
+    //  so they do not go stale and steal from block cache
+    if (is_file_cache_)
+    {
+        e->expire_seconds=Env::Default()->NowMicros() / 1000000L
+            + parent_->GetFileTimeout();
+    }   // if
+  }
+  return reinterpret_cast<Cache::Handle*>(e);
+}
+
+
+
+//
+// Definitions moved so they could access ShardedLRUCache members
+//  (subtle hint to Google that every object should have .h file
+//    because future reuse is unknowable)
+//
+void LRUCache2::Unref(LRUHandle2* e) {
+  assert(e->refs > 0);
+  e->refs--;
+  if (e->refs <= 0) {
+      sub_and_fetch(parent_->GetUsagePtr(), (uint64_t)e->charge);
+
+      if (is_file_cache_)
+          gPerfCounters->Add(ePerfFileCacheRemove, e->charge);
+      else
+          gPerfCounters->Add(ePerfBlockCacheRemove, e->charge);
+
+      (*e->deleter)(e->key(), e->value);
+      free(e);
+  }
+}
+
+
+Cache::Handle* LRUCache2::Insert(
+    const Slice& key, uint32_t hash, void* value, size_t charge,
+    void (*deleter)(const Slice& key, void* value)) {
+
+    size_t this_size;
+
+    this_size=sizeof(LRUHandle2)-1 + key.size();
+    LRUHandle2* e = reinterpret_cast<LRUHandle2*>(
+        malloc(this_size));
+
+    e->value = value;
+    e->deleter = deleter;
+    e->charge = charge + this_size;  // assumes charge is always byte size
+    e->key_length = key.size();
+    e->hash = hash;
+    e->refs = 2;  // One from LRUCache2, one for the returned handle
+    e->expire_seconds=0;
+    memcpy(e->key_data, key.data(), key.size());
+
+    // establish time limit on files in file cache (like 10 days)
+    //  so they do not go stale and steal from block cache
+    if (is_file_cache_)
+    {
+        e->expire_seconds=Env::Default()->NowMicros() / 1000000L
+            + parent_->GetFileTimeout();
+    }   // if
+
+    if (is_file_cache_)
+        gPerfCounters->Add(ePerfFileCacheInsert, e->charge);
+    else
+        gPerfCounters->Add(ePerfBlockCacheInsert, e->charge);
+
+
+    {
+        SpinLock l(&spin_);
+
+        LRU_Append(e);
+        add_and_fetch(parent_->GetUsagePtr(), (uint64_t)e->charge);
+
+        LRUHandle2* old = table_.Insert(e);
+        if (old != NULL) {
+            LRU_Remove(old);
+            Unref(old);
+        }
+    }   // SpinLock
+
+    // call parent to rebalance across all shards, not just this one
+    if (parent_->GetCapacity() <parent_->GetUsage())
+        parent_->Resize();
+
+    // let parent adjust free space warning level
+    if (is_file_cache_)
+        parent_->SetFreeSpaceWarning(e->charge);
+
+
+
+  return reinterpret_cast<Cache::Handle*>(e);
+}
+
+
+bool
+LRUCache2::ReleaseOne()
+{
+    bool ret_flag;
+    LRUHandle2 * next, * cursor;
+    SpinLock lock(&spin_);
+
+    ret_flag=false;
+
+    for (cursor=lru_.next; !ret_flag && parent_->GetUsage() > parent_->GetCapacity() && cursor != &lru_; cursor=next)
+    {
+        // take next pointer before potentially destroying cursor
+        next=cursor->next;
+
+        // only delete cursor if it will actually destruct and
+        //   return value to usage_
+        if (cursor->refs <= 1)
+        {
+            LRU_Remove(cursor);
+            table_.Remove(cursor->key(), cursor->hash);
+            Unref(cursor);
+            ret_flag=true;
+        }   // if
+    }   // for
+
+    return(ret_flag);
+
+}   // LRUCache2::ReleaseOne
+
+}  // namespace leveldb
+
diff --git a/src/leveldb/util/cache2.h b/src/leveldb/util/cache2.h
new file mode 100644
index 000000000..b3e3f8c2b
--- /dev/null
+++ b/src/leveldb/util/cache2.h
@@ -0,0 +1,106 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values.  It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads.  It may automatically evict entries to make room
+// for new entries.  Values have a specified charge against the cache
+// capacity.  For example, a cache where the values are variable
+// length strings, may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided.  Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+//
+// mildly modified version of Google's original cache.cc to support
+//  Riak's flexcache.cc
+//
+
+#ifndef STORAGE_LEVELDB_INCLUDE_CACHE2_H_
+#define STORAGE_LEVELDB_INCLUDE_CACHE2_H_
+
+#include <stdint.h>
+#include <string>
+#include <time.h>
+
+#include "leveldb/atomics.h"
+#include "leveldb/cache.h"
+#include "leveldb/options.h"
+#include "leveldb/slice.h"
+#include "util/flexcache.h"
+
+namespace leveldb {
+
+class ShardedLRUCache2;
+
+
+/**
+ * CacheAccumulator is an object to process values
+ *  when walking the contents of a cache, i.e. a functor
+ */
+class CacheAccumulator
+{
+public:
+    CacheAccumulator() {};
+    virtual ~CacheAccumulator() {};
+
+    virtual bool operator()(void * Value) = 0;
+};
+
+
+/**
+ * DoubleCache holds the file cache and the block cache to easy
+ *  interactive sizing
+ */
+class DoubleCache
+{
+public:
+    explicit DoubleCache(const Options & options);
+    virtual ~DoubleCache();
+
+    Cache * GetFileCache() {return((Cache *)m_FileCache);};
+    Cache * GetBlockCache() {return((Cache *)m_BlockCache);};
+
+    void ResizeCaches();
+    size_t GetCapacity(bool IsFileCache, bool EstimatePageCache=true);
+    time_t GetFileTimeout() {return(m_FileTimeout);};
+    void SetFileTimeout(time_t Timeout) {m_FileTimeout=Timeout;};
+
+    void Flush();
+    void SetPlentySpace(bool PlentySpace) {m_PlentySpace=PlentySpace;};
+    bool GetPlentySpace() const {return(m_PlentySpace);};
+    void PurgeExpiredFiles();
+
+    bool IsInternalDB() const {return(m_IsInternalDB);};
+
+    void AddFileSize(uint64_t file_size) {add_and_fetch(&m_SizeCachedFiles, file_size);};
+    void SubFileSize(uint64_t file_size) {sub_and_fetch(&m_SizeCachedFiles, file_size);};
+
+protected:
+    ShardedLRUCache2 * m_FileCache;   //!< file cache used by db/tablecache.cc
+    ShardedLRUCache2 * m_BlockCache;  //!< used by table/table.cc
+
+    bool m_IsInternalDB;        //!< internal db gets smaller allocation from FlexCache
+    bool m_PlentySpace;         //!< true when lots of spare space in file cache
+    size_t m_Overhead;          //!< reduce from allocation to better estimate limits
+    size_t m_TotalAllocation;
+    time_t m_FileTimeout;       //!< seconds to allow file to stay cached.  default 4 days.
+
+    uint64_t m_BlockCacheThreshold; //!< from Options, point where block cache canNOT be
+                                    //!< sacrificed for page cache
+    volatile uint64_t m_SizeCachedFiles; //!< disk size of .sst files in file cache
+
+private:
+    DoubleCache();                       //!< no default constructor
+    DoubleCache(const DoubleCache &);    //!< no copy constructor
+    void operator=(const DoubleCache &); //!< no assignment
+
+};  // class DoubleCache
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_UTIL_CACHE2_H_
diff --git a/src/leveldb/util/cache2_test.cc b/src/leveldb/util/cache2_test.cc
new file mode 100644
index 000000000..3dbd5ee8f
--- /dev/null
+++ b/src/leveldb/util/cache2_test.cc
@@ -0,0 +1,312 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+//
+// Google's cache_test.cc modified to support Riak DoubleCache
+//
+
+#include <vector>
+
+#include "util/cache2.h"
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+// Conversions between numeric keys/values and the types expected by Cache.
+static std::string EncodeKey(int k) {
+  std::string result;
+  PutFixed32(&result, k);
+  return result;
+}
+static int DecodeKey(const Slice& k) {
+  assert(k.size() == 4);
+  return DecodeFixed32(k.data());
+}
+static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
+static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); }
+
+class CacheTest {
+ public:
+  static CacheTest* current_;
+
+  static void Deleter(const Slice& key, void* v) {
+    current_->deleted_keys_.push_back(DecodeKey(key));
+    current_->deleted_values_.push_back(DecodeValue(v));
+  }
+
+  static const int kOneMeg = 1024*1024L;
+  static const int kCacheSize = 180;    // 180Mbytes is default
+  std::vector<int> deleted_keys_;
+  std::vector<int> deleted_values_;
+  Options options_;
+
+  DoubleCache double_cache_;
+
+  Cache* cache_;
+  Cache* file_;
+
+  CacheTest()
+     : double_cache_(options_)
+  {
+    current_ = this;
+    gFlexCache.SetTotalMemory((120+kCacheSize)*kOneMeg);
+    double_cache_.ResizeCaches();
+    cache_=double_cache_.GetBlockCache();
+    file_=double_cache_.GetFileCache();
+  }
+
+  ~CacheTest() {
+  }
+
+  void ResetCaches()
+  {
+    double_cache_.Flush();
+    cache_=double_cache_.GetBlockCache();
+    file_=double_cache_.GetFileCache();
+  }
+
+  int Lookup(int key) {
+    Cache::Handle* handle = cache_->Lookup(EncodeKey(key));
+    const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle));
+    if (handle != NULL) {
+      cache_->Release(handle);
+    }
+    return r;
+  }
+
+  void Insert(int key, int value, int charge = 1) {
+    cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge,
+                                   &CacheTest::Deleter));
+  }
+
+  void InsertFile(int key, int value, int charge = 1) {
+    file_->Release(file_->Insert(EncodeKey(key), EncodeValue(value), charge,
+                                   &CacheTest::Deleter));
+  }
+
+  void Erase(int key) {
+    cache_->Erase(EncodeKey(key));
+  }
+};
+CacheTest* CacheTest::current_;
+
+TEST(CacheTest, HitAndMiss) {
+  ASSERT_EQ(-1, Lookup(100));
+
+  Insert(100, 101);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1,  Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  Insert(200, 201);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  Insert(100, 102);
+  ASSERT_EQ(102, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  ASSERT_EQ(1, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+}
+
+TEST(CacheTest, Erase) {
+  Erase(200);
+  ASSERT_EQ(0, deleted_keys_.size());
+
+  Insert(100, 101);
+  Insert(200, 201);
+  Erase(100);
+  ASSERT_EQ(-1,  Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1,  Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1, deleted_keys_.size());
+}
+
+TEST(CacheTest, EntriesArePinned) {
+  Insert(100, 101);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
+
+  Insert(100, 102);
+  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
+  ASSERT_EQ(0, deleted_keys_.size());
+
+  cache_->Release(h1);
+  ASSERT_EQ(1, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(1, deleted_keys_.size());
+
+  cache_->Release(h2);
+  ASSERT_EQ(2, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[1]);
+  ASSERT_EQ(102, deleted_values_[1]);
+}
+
+TEST(CacheTest, EvictionPolicy) {
+  Insert(100, 101, kOneMeg);
+  Insert(200, 201, kOneMeg);
+  // Frequently used entry must be kept around
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    Insert(1000+i, 2000+i, kOneMeg);
+    ASSERT_EQ(2000+i, Lookup(1000+i));
+    ASSERT_EQ(101, Lookup(100));
+  }
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1, Lookup(200));
+}
+
+TEST(CacheTest, HeavyEntries) {
+  // Add a bunch of light and heavy entries and then count the combined
+  // size of items still in the cache, which must be approximately the
+  // same as the total capacity.
+  const int kLight = 1;
+  const int kHeavy = 10;
+  int added = 0;
+  int index = 0;
+  while (added < 2*kCacheSize) {
+    const int weight = (index & 1) ? kLight : kHeavy;
+    Insert(index, 1000+index, weight*kOneMeg);
+    added += weight;
+    index++;
+  }
+
+  int cached_weight = 0;
+  for (int i = 0; i < index; i++) {
+    const int weight = (i & 1 ? kLight : kHeavy);
+    int r = Lookup(i);
+    if (r >= 0) {
+      cached_weight += weight*kOneMeg;
+      ASSERT_EQ(1000+i, r);
+    }
+  }
+  ASSERT_LE(cached_weight, (kCacheSize + kCacheSize/10)*kOneMeg);
+}
+
+TEST(CacheTest, FlushedEntries) {
+  int added = 0;
+  int index = 0;
+  while (added < 2*kCacheSize) {
+    Insert(index, 1000+index, kOneMeg);
+    added += 1;
+    index++;
+  }
+
+  added=0;
+  while (added < kCacheSize/2) {
+    InsertFile(index, 1000+index, kOneMeg);
+    added += 1;
+    index++;
+  }
+
+  // one insert to block cache should rebalance both
+  Insert(index, 1000+index, kOneMeg);
+
+  int cached_weight = 0;
+  for (int i = 0; i < index; i++) {
+    int r = Lookup(i);
+    if (r >= 0) {
+      cached_weight += 1;
+      ASSERT_EQ(1000+i, r);
+    }
+  }
+  ASSERT_LE(cached_weight, (kCacheSize/2 + kCacheSize/10));
+}
+
+TEST(CacheTest, FileCacheExpire) {
+    time_t expire_default;
+    size_t beginning_size;
+
+    ResetCaches();
+    expire_default=double_cache_.GetFileTimeout();
+
+    // quick two second timeout
+    double_cache_.SetFileTimeout(2);
+
+    // what is block cache's starting size
+    beginning_size=double_cache_.GetCapacity(false);
+
+    // add bunch of stuff to file cache
+    int added = 0;
+    int index = 0;
+    while (added < kCacheSize/2) {
+        InsertFile(index, 1000+index, kOneMeg);
+        added += 1;
+        index++;
+    }   // while
+
+    // did file cache take away?
+    ASSERT_GT(beginning_size-(kCacheSize/2)*kOneMeg, double_cache_.GetCapacity(false));
+
+    // sleep two seconds
+    Env::Default()->SleepForMicroseconds(2000000);
+
+    // force time purge
+    double_cache_.PurgeExpiredFiles();
+
+    ASSERT_EQ(beginning_size, double_cache_.GetCapacity(false));
+
+    // add bunch of stuff to file cache with 2 second timeout
+    added = 0;
+    index = 0;
+    while (added < kCacheSize/4) {
+        InsertFile(index, 1000+index, kOneMeg);
+        added += 1;
+        index++;
+    }   // while
+
+    // add bunch of stuff to file cache with 5 second timeout
+    double_cache_.SetFileTimeout(5);
+    while (added < kCacheSize/2) {
+        InsertFile(index, 1000+index, kOneMeg);
+        added += 1;
+        index++;
+    }   // while
+
+    // did file cache take away?
+    ASSERT_GT(beginning_size-(kCacheSize/2)*kOneMeg, double_cache_.GetCapacity(false));
+
+    // sleep two seconds
+    Env::Default()->SleepForMicroseconds(2000000);
+
+    // force time purge
+    double_cache_.PurgeExpiredFiles();
+
+    // did only half get purged
+    ASSERT_GT(beginning_size-(kCacheSize/4)*kOneMeg, double_cache_.GetCapacity(false));
+
+    // reset timeout to default
+    double_cache_.SetFileTimeout(expire_default);
+
+    return;
+
+}   // CacheTest::FileCacheExpire
+
+
+TEST(CacheTest, NewId) {
+  uint64_t a = cache_->NewId();
+  uint64_t b = cache_->NewId();
+  ASSERT_NE(a, b);
+}
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/util/cache_test.cc b/src/leveldb/util/cache_test.cc
index 468f7a642..1a1b496db 100644
--- a/src/leveldb/util/cache_test.cc
+++ b/src/leveldb/util/cache_test.cc
@@ -59,11 +59,6 @@ class CacheTest {
                                    &CacheTest::Deleter));
   }
 
-  Cache::Handle* InsertAndReturnHandle(int key, int value, int charge = 1) {
-    return cache_->Insert(EncodeKey(key), EncodeValue(value), charge,
-                          &CacheTest::Deleter);
-  }
-
   void Erase(int key) {
     cache_->Erase(EncodeKey(key));
   }
@@ -140,11 +135,7 @@ TEST(CacheTest, EntriesArePinned) {
 TEST(CacheTest, EvictionPolicy) {
   Insert(100, 101);
   Insert(200, 201);
-  Insert(300, 301);
-  Cache::Handle* h = cache_->Lookup(EncodeKey(300));
-
-  // Frequently used entry must be kept around,
-  // as must things that are still in use.
+  // Frequently used entry must be kept around
   for (int i = 0; i < kCacheSize + 100; i++) {
     Insert(1000+i, 2000+i);
     ASSERT_EQ(2000+i, Lookup(1000+i));
@@ -152,25 +143,6 @@ TEST(CacheTest, EvictionPolicy) {
   }
   ASSERT_EQ(101, Lookup(100));
   ASSERT_EQ(-1, Lookup(200));
-  ASSERT_EQ(301, Lookup(300));
-  cache_->Release(h);
-}
-
-TEST(CacheTest, UseExceedsCacheSize) {
-  // Overfill the cache, keeping handles on all inserted entries.
-  std::vector<Cache::Handle*> h;
-  for (int i = 0; i < kCacheSize + 100; i++) {
-    h.push_back(InsertAndReturnHandle(1000+i, 2000+i));
-  }
-
-  // Check that all the entries can be found in the cache.
-  for (int i = 0; i < h.size(); i++) {
-    ASSERT_EQ(2000+i, Lookup(1000+i));
-  }
-
-  for (int i = 0; i < h.size(); i++) {
-    cache_->Release(h[i]);
-  }
 }
 
 TEST(CacheTest, HeavyEntries) {
@@ -206,19 +178,6 @@ TEST(CacheTest, NewId) {
   ASSERT_NE(a, b);
 }
 
-TEST(CacheTest, Prune) {
-  Insert(1, 100);
-  Insert(2, 200);
-
-  Cache::Handle* handle = cache_->Lookup(EncodeKey(1));
-  ASSERT_TRUE(handle);
-  cache_->Prune();
-  cache_->Release(handle);
-
-  ASSERT_EQ(100, Lookup(1));
-  ASSERT_EQ(-1, Lookup(2));
-}
-
 }  // namespace leveldb
 
 int main(int argc, char** argv) {
diff --git a/src/leveldb/util/coding.cc b/src/leveldb/util/coding.cc
index 21e3186d5..e133765b1 100644
--- a/src/leveldb/util/coding.cc
+++ b/src/leveldb/util/coding.cc
@@ -7,29 +7,29 @@
 namespace leveldb {
 
 void EncodeFixed32(char* buf, uint32_t value) {
-  if (port::kLittleEndian) {
-    memcpy(buf, &value, sizeof(value));
-  } else {
-    buf[0] = value & 0xff;
-    buf[1] = (value >> 8) & 0xff;
-    buf[2] = (value >> 16) & 0xff;
-    buf[3] = (value >> 24) & 0xff;
-  }
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+#endif
 }
 
 void EncodeFixed64(char* buf, uint64_t value) {
-  if (port::kLittleEndian) {
-    memcpy(buf, &value, sizeof(value));
-  } else {
-    buf[0] = value & 0xff;
-    buf[1] = (value >> 8) & 0xff;
-    buf[2] = (value >> 16) & 0xff;
-    buf[3] = (value >> 24) & 0xff;
-    buf[4] = (value >> 32) & 0xff;
-    buf[5] = (value >> 40) & 0xff;
-    buf[6] = (value >> 48) & 0xff;
-    buf[7] = (value >> 56) & 0xff;
-  }
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+  buf[4] = (value >> 32) & 0xff;
+  buf[5] = (value >> 40) & 0xff;
+  buf[6] = (value >> 48) & 0xff;
+  buf[7] = (value >> 56) & 0xff;
+#endif
 }
 
 void PutFixed32(std::string* dst, uint32_t value) {
@@ -79,7 +79,7 @@ void PutVarint32(std::string* dst, uint32_t v) {
 }
 
 char* EncodeVarint64(char* dst, uint64_t v) {
-  static const int B = 128;
+  static const uint64_t B = 128;
   unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
   while (v >= B) {
     *(ptr++) = (v & (B-1)) | B;
diff --git a/src/leveldb/util/coding.h b/src/leveldb/util/coding.h
index 3993c4a75..af6d6d52e 100644
--- a/src/leveldb/util/coding.h
+++ b/src/leveldb/util/coding.h
@@ -82,6 +82,19 @@ inline uint64_t DecodeFixed64(const char* ptr) {
   }
 }
 
+// Riak: return only lowest 8 bits of 64 bit number,
+//       optimization for internal key's ValueType
+inline unsigned char DecodeLeastFixed64(const char * ptr) {
+  unsigned char ret_char;
+
+  if (port::kLittleEndian)
+    ret_char=(unsigned char)*ptr;
+  else
+    ret_char=(unsigned char)*(ptr+7);
+  return(ret_char);
+}
+
+
 // Internal routine for use by fallback path of GetVarint32Ptr
 extern const char* GetVarint32PtrFallback(const char* p,
                                           const char* limit,
diff --git a/src/leveldb/util/coding_test.cc b/src/leveldb/util/coding_test.cc
index 521541ea6..2c52b17b6 100644
--- a/src/leveldb/util/coding_test.cc
+++ b/src/leveldb/util/coding_test.cc
@@ -109,16 +109,16 @@ TEST(Coding, Varint64) {
     values.push_back(power);
     values.push_back(power-1);
     values.push_back(power+1);
-  }
+  };
 
   std::string s;
-  for (size_t i = 0; i < values.size(); i++) {
+  for (int i = 0; i < values.size(); i++) {
     PutVarint64(&s, values[i]);
   }
 
   const char* p = s.data();
   const char* limit = p + s.size();
-  for (size_t i = 0; i < values.size(); i++) {
+  for (int i = 0; i < values.size(); i++) {
     ASSERT_TRUE(p < limit);
     uint64_t actual;
     const char* start = p;
@@ -143,7 +143,7 @@ TEST(Coding, Varint32Truncation) {
   std::string s;
   PutVarint32(&s, large_value);
   uint32_t result;
-  for (size_t len = 0; len < s.size() - 1; len++) {
+  for (int len = 0; len < s.size() - 1; len++) {
     ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL);
   }
   ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL);
@@ -162,7 +162,7 @@ TEST(Coding, Varint64Truncation) {
   std::string s;
   PutVarint64(&s, large_value);
   uint64_t result;
-  for (size_t len = 0; len < s.size() - 1; len++) {
+  for (int len = 0; len < s.size() - 1; len++) {
     ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL);
   }
   ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL);
diff --git a/src/leveldb/util/comparator.cc b/src/leveldb/util/comparator.cc
index 4b7b5724e..037d0b4f8 100644
--- a/src/leveldb/util/comparator.cc
+++ b/src/leveldb/util/comparator.cc
@@ -67,7 +67,7 @@ class BytewiseComparatorImpl : public Comparator {
 }  // namespace
 
 static port::OnceType once = LEVELDB_ONCE_INIT;
-static const Comparator* bytewise;
+static const Comparator* bytewise = NULL;
 
 static void InitModule() {
   bytewise = new BytewiseComparatorImpl;
@@ -78,4 +78,9 @@ const Comparator* BytewiseComparator() {
   return bytewise;
 }
 
+void ComparatorShutdown()
+{
+    delete bytewise;
+    bytewise=NULL;
+}
 }  // namespace leveldb
diff --git a/src/leveldb/util/crc32c.cc b/src/leveldb/util/crc32c.cc
index b3f40eeee..d52492ca5 100644
--- a/src/leveldb/util/crc32c.cc
+++ b/src/leveldb/util/crc32c.cc
@@ -8,13 +8,16 @@
 #include "util/crc32c.h"
 
 #include <stdint.h>
-
-#include "port/port.h"
 #include "util/coding.h"
 
 namespace leveldb {
 namespace crc32c {
 
+static uint32_t SoftCRC(uint32_t StartCrc, const char * BlockStart, size_t BlockSize);
+static uint32_t HardCRC(uint32_t StartCrc, const char * BlockStart, size_t BlockSize);
+
+static uint32_t (*CrcFunction)(uint32_t, const char *, size_t)=&SoftCRC;
+
 static const uint32_t table0_[256] = {
   0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
   0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
@@ -285,27 +288,22 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
   return DecodeFixed32(reinterpret_cast<const char*>(p));
 }
 
-// Determine if the CPU running this program can accelerate the CRC32C
-// calculation.
-static bool CanAccelerateCRC32C() {
-  if (!port::HasAcceleratedCRC32C())
-    return false;
 
-  // Double-check that the accelerated implementation functions correctly.
-  // port::AcceleretedCRC32C returns zero when unable to accelerate.
-  static const char kTestCRCBuffer[] = "TestCRCBuffer";
-  static const char kBufSize = sizeof(kTestCRCBuffer) - 1;
-  static const uint32_t kTestCRCValue = 0xdcbc59fa;
+uint32_t Extend(uint32_t crc, const char* buf, size_t size)
+{
+    return((*CrcFunction)(crc, buf, size));
+}   // Extend
 
-  return port::AcceleratedCRC32C(0, kTestCRCBuffer, kBufSize) == kTestCRCValue;
-}
 
-uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
-  static bool accelerate = CanAccelerateCRC32C();
-  if (accelerate) {
-    return port::AcceleratedCRC32C(crc, buf, size);
-  }
+void SwitchToHardwareCRC() {CrcFunction=&HardCRC;};
 
+
+bool IsHardwareCRC() {return(&HardCRC==CrcFunction);};
+
+
+static uint32_t
+SoftCRC(uint32_t crc, const char* buf, size_t size)
+{
   const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
   const uint8_t *e = p + size;
   uint32_t l = crc ^ 0xffffffffu;
@@ -347,8 +345,53 @@ uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
   }
 #undef STEP4
 #undef STEP1
+
   return l ^ 0xffffffffu;
-}
+}   // SoftCRC
+
+
+static uint32_t
+HardCRC(
+    uint32_t StartCrc,
+    const char * BlockStart,
+    size_t BlockSize)
+{
+#if defined(__x86_64__)
+    size_t fullqwords, remainder;
+    uint32_t ret_crc;
+    char * src_c;
+    uint64_t * src_q;
+
+    fullqwords=BlockSize / 8;
+    remainder=BlockSize % 8;
+
+    ret_crc=StartCrc ^ 0xffffffffu;
+    src_q=(uint64_t *)BlockStart;
+
+    for ( ; 0!=fullqwords; --fullqwords, ++src_q)
+    {
+        __asm__ __volatile__ (
+            ".byte 0xf2, 0x48, 0x0f, 0x38, 0xf1, 0xf1;"
+            : "=S"(ret_crc)
+            : "S"(ret_crc), "c"(*src_q));
+    }   // for
+
+    src_c=(char *)src_q;
+    for ( ; 0!=remainder; --remainder, ++src_c)
+    {
+        __asm__ __volatile__ (
+            ".byte 0xf2, 0x48, 0x0f, 0x38, 0xf0, 0xf1;"
+            : "=S"(ret_crc)
+            : "S"(ret_crc), "c"(*src_c));
+    }   // for
+
+    return(ret_crc ^ 0xffffffffu);
+#else
+    return(0);
+#endif
+
+}   // HardCRC
+
 
 }  // namespace crc32c
 }  // namespace leveldb
diff --git a/src/leveldb/util/crc32c.h b/src/leveldb/util/crc32c.h
index 1d7e5c075..61253d235 100644
--- a/src/leveldb/util/crc32c.h
+++ b/src/leveldb/util/crc32c.h
@@ -21,6 +21,10 @@ inline uint32_t Value(const char* data, size_t n) {
   return Extend(0, data, n);
 }
 
+// switch function pointer from software crc to hardware
+extern void SwitchToHardwareCRC();
+extern bool IsHardwareCRC();
+
 static const uint32_t kMaskDelta = 0xa282ead8ul;
 
 // Return a masked representation of crc.
diff --git a/src/leveldb/util/crc32c_test.cc b/src/leveldb/util/crc32c_test.cc
index 4b957ee12..e87dd9f97 100644
--- a/src/leveldb/util/crc32c_test.cc
+++ b/src/leveldb/util/crc32c_test.cc
@@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include "leveldb/env.h"
 #include "util/crc32c.h"
 #include "util/testharness.h"
 
@@ -68,5 +69,9 @@ TEST(CRC, Mask) {
 }  // namespace leveldb
 
 int main(int argc, char** argv) {
-  return leveldb::test::RunAllTests();
+
+    // identify and potentially switch to hardware CRC
+    leveldb::Env::Default();
+
+    return leveldb::test::RunAllTests();
 }
diff --git a/src/leveldb/util/db_list.cc b/src/leveldb/util/db_list.cc
new file mode 100644
index 000000000..2c3de802c
--- /dev/null
+++ b/src/leveldb/util/db_list.cc
@@ -0,0 +1,192 @@
+// -------------------------------------------------------------------
+//
+// db_list.cc
+//
+// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <algorithm>
+#include <syslog.h>
+
+#include "util/db_list.h"
+#include "util/mutexlock.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
+namespace leveldb {
+
+// using singleton model from comparator.cc
+static port::OnceType once = LEVELDB_ONCE_INIT;
+static DBListImpl * dblist=NULL;
+
+static void InitModule()
+{
+    dblist=new DBListImpl;
+}   // InitModule
+
+
+DBListImpl * DBList()
+{
+    port::InitOnce(&once, InitModule);
+    return(dblist);
+
+}   // DBList
+
+
+void
+DBListShutdown()
+{
+    // retrieve point to handle any initialization/shutdown races
+    DBList();
+    delete dblist;
+
+    return;
+
+}   // DBListShutdown
+
+
+
+DBListImpl::DBListImpl()
+    : m_UserDBCount(0), m_InternalDBCount(0)
+{
+}   // DBListImpl::DBListImpl
+
+
+bool
+DBListImpl::AddDB(
+    DBImpl * Dbase,
+    bool IsInternal)
+{
+    bool ret_flag;
+
+    SpinLock lock(&m_Lock);
+
+    if (IsInternal)
+    {
+        ret_flag=m_InternalDBs.insert(Dbase).second;
+        m_InternalDBCount=m_InternalDBs.size();
+    }   // if
+    else
+    {
+        ret_flag=m_UserDBs.insert(Dbase).second;
+        m_UserDBCount=m_UserDBs.size();
+    }   // else
+
+    return(ret_flag);
+
+}   // DBListImpl::AddDB
+
+
+void
+DBListImpl::ReleaseDB(
+    DBImpl * Dbase,
+    bool IsInternal)
+{
+    db_set_t::iterator it;
+    SpinLock lock(&m_Lock);
+
+    if (IsInternal)
+    {
+        it=m_InternalDBs.find(Dbase);
+        if (m_InternalDBs.end()!=it)
+        {
+            m_InternalDBs.erase(it);
+        }   // if
+        m_InternalDBCount=m_InternalDBs.size();
+    }   // if
+    else
+    {
+        it=m_UserDBs.find(Dbase);
+        if (m_UserDBs.end()!=it)
+        {
+            m_UserDBs.erase(it);
+        }   // if
+        m_UserDBCount=m_UserDBs.size();
+    }   // else
+
+    return;
+
+}   // DBListImpl::ReleaseDB
+
+
+size_t
+DBListImpl::GetDBCount(
+    bool IsInternal)
+{
+    size_t ret_val;
+
+    if (IsInternal)
+        ret_val=m_InternalDBCount;
+    else
+        ret_val=m_UserDBCount;
+
+    return(ret_val);
+
+}   // DBListImpl::GetDBCount
+
+
+void
+DBListImpl::ScanDBs(
+    bool IsInternal,
+    void (DBImpl::* Function)())
+{
+    db_set_t::iterator it, first, last;
+    SpinLock lock(&m_Lock);
+
+    size_t count;
+
+    // for_each() would have been fun, but setup deadlock
+    //  scenarios
+    // Now we have a race condition of us using the db object
+    //  while someone is shutting it down ... hmm
+    if (IsInternal)
+    {
+        first=m_InternalDBs.begin();
+        last=m_InternalDBs.end();
+        count=m_InternalDBs.size();
+    }   // if
+    else
+    {
+        first=m_UserDBs.begin();
+        last=m_UserDBs.end();
+        count=m_UserDBs.size();
+    }   // else
+
+#if 0  // for debugging ... sometimes
+    m_Lock.Unlock(); /// might not be needed now
+    syslog(LOG_ERR, "count %zd, total memory %" PRIu64 ", db cache size %" PRIu64 ", internal %d",
+           count, gFlexCache.GetTotalMemory(), gFlexCache.GetDBCacheCapacity(IsInternal),
+           (int)IsInternal);
+    m_Lock.Lock();
+#else
+    count=count*2;  // kill off compiler warning
+#endif
+
+    // call member function of each database
+    for (it=first; last!=it; ++it)
+    {
+        // must protect list from db add/delete during scan, leave locks
+        ((*it)->*Function)();
+    }   // for
+
+    return;
+
+}   // DBListImpl::ScanDBs
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/db_list.h b/src/leveldb/util/db_list.h
new file mode 100644
index 000000000..709ab1ab5
--- /dev/null
+++ b/src/leveldb/util/db_list.h
@@ -0,0 +1,67 @@
+// -------------------------------------------------------------------
+//
+// db_list.h
+//
+// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include "db/db_impl.h"
+#include "port/port.h"
+
+
+namespace leveldb
+{
+
+/**
+ * DBList:  class to provide management access to all
+ *  open databases (Riak vnodes)
+ */
+class DBListImpl
+{
+protected:
+   typedef std::set<DBImpl *> db_set_t;
+
+   port::Spin m_Lock;      //!< thread protection for set
+   db_set_t m_UserDBs;     //!< set of pointers for user db
+   db_set_t m_InternalDBs; //!< Riak internal dbs
+
+   volatile size_t m_UserDBCount;   //!< m_UserDBs size() for non-blocking retrieval
+   volatile size_t m_InternalDBCount;   //!< m_InternalDBs size() for non-blocking retrieval
+
+public:
+   DBListImpl();
+   virtual ~DBListImpl() {};
+
+   bool AddDB(DBImpl *, bool is_internal);
+   void ReleaseDB(DBImpl *, bool is_internal);
+
+   size_t GetDBCount(bool is_internal);
+
+   void ScanDBs(bool is_internal, void (DBImpl::*)());
+
+};  // class DBListImpl
+
+
+// Universal access to dblist ... initialization order independent
+DBListImpl * DBList();
+
+// cleanup memory, mostly for valgrind
+void DBListShutdown();
+
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/env.cc b/src/leveldb/util/env.cc
index c58a0821e..5311c3883 100644
--- a/src/leveldb/util/env.cc
+++ b/src/leveldb/util/env.cc
@@ -2,17 +2,17 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include <syslog.h>
+#include <stdarg.h>
+
 #include "leveldb/env.h"
+#include "leveldb/perf_count.h"
 
 namespace leveldb {
 
 Env::~Env() {
 }
 
-Status Env::NewAppendableFile(const std::string& fname, WritableFile** result) {
-  return Status::NotSupported("NewAppendableFile", fname);
-}
-
 SequentialFile::~SequentialFile() {
 }
 
@@ -29,19 +29,39 @@ FileLock::~FileLock() {
 }
 
 void Log(Logger* info_log, const char* format, ...) {
-  if (info_log != NULL) {
     va_list ap;
+
     va_start(ap, format);
-    info_log->Logv(format, ap);
+
+    if (info_log != NULL)
+    {
+        info_log->Logv(format, ap);
+    }   // if
+    else
+    {
+        // perf counter is clue to check syslog
+        vsyslog(LOG_ERR, format, ap);
+        gPerfCounters->Inc(ePerfSyslogWrite);
+    }   // else
+
     va_end(ap);
-  }
 }
 
 static Status DoWriteStringToFile(Env* env, const Slice& data,
                                   const std::string& fname,
                                   bool should_sync) {
   WritableFile* file;
-  Status s = env->NewWritableFile(fname, &file);
+  size_t map_size;
+
+  // adjust file map size to speed up corruption test's
+  //  writing of 40M files, but keep small for normal
+  //  case of writing CURRENT file (code will round up to page_size)
+  if (gMapSize<data.size())
+      map_size=gMapSize;
+  else
+      map_size=data.size();
+
+  Status s = env->NewWritableFile(fname, &file, map_size);
   if (!s.ok()) {
     return s;
   }
diff --git a/src/leveldb/util/env_posix.cc b/src/leveldb/util/env_posix.cc
index f77918313..b446c5a30 100644
--- a/src/leveldb/util/env_posix.cc
+++ b/src/leveldb/util/env_posix.cc
@@ -1,8 +1,9 @@
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
-#if !defined(LEVELDB_PLATFORM_WINDOWS)
 
+#include <deque>
+#include <set>
 #include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
@@ -10,84 +11,88 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <syslog.h>
 #include <sys/mman.h>
-#include <sys/resource.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
+#include <sys/file.h>
 #include <time.h>
 #include <unistd.h>
-#include <deque>
-#include <limits>
-#include <set>
+#if defined(LEVELDB_PLATFORM_ANDROID)
+#include <sys/stat.h>
+#endif
+#include "leveldb/atomics.h"
 #include "leveldb/env.h"
+#include "leveldb/filter_policy.h"
 #include "leveldb/slice.h"
 #include "port/port.h"
+#include "util/crc32c.h"
+#include "util/db_list.h"
+#include "util/hot_threads.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
 #include "util/posix_logger.h"
-#include "util/env_posix_test_helper.h"
+#include "util/thread_tasks.h"
+#include "util/throttle.h"
+#include "db/dbformat.h"
+#include "leveldb/perf_count.h"
+
+
+#if _XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L
+#define HAVE_FADVISE
+#endif
 
 namespace leveldb {
 
-namespace {
+volatile size_t gMapSize=20*1024*1024L;
 
-static int open_read_only_file_limit = -1;
-static int mmap_limit = -1;
+// ugly global used to change fadvise behaviour
+bool gFadviseWillNeed=false;
+
+namespace {
 
 static Status IOError(const std::string& context, int err_number) {
   return Status::IOError(context, strerror(err_number));
 }
 
-// Helper class to limit resource usage to avoid exhaustion.
-// Currently used to limit read-only file descriptors and mmap file usage
-// so that we do not end up running out of file descriptors, virtual memory,
-// or running into kernel performance problems for very large databases.
-class Limiter {
- public:
-  // Limit maximum number of resources to |n|.
-  Limiter(intptr_t n) {
-    SetAllowed(n);
-  }
+// background routines to close and/or unmap files
+static void BGFileUnmapper2(void* file_info);
 
-  // If another resource is available, acquire it and return true.
-  // Else return false.
-  bool Acquire() {
-    if (GetAllowed() <= 0) {
-      return false;
-    }
-    MutexLock l(&mu_);
-    intptr_t x = GetAllowed();
-    if (x <= 0) {
-      return false;
-    } else {
-      SetAllowed(x - 1);
-      return true;
-    }
-  }
+// data needed by background routines for close/unmap
+class BGCloseInfo : public ThreadTask
+{
+public:
+    int fd_;
+    void * base_;
+    size_t offset_;
+    size_t length_;
+    volatile uint64_t * ref_count_;
+    uint64_t metadata_;
 
-  // Release a resource acquired by a previous call to Acquire() that returned
-  // true.
-  void Release() {
-    MutexLock l(&mu_);
-    SetAllowed(GetAllowed() + 1);
-  }
+    BGCloseInfo(int fd, void * base, size_t offset, size_t length,
+                volatile uint64_t * ref_count, uint64_t metadata)
+        : fd_(fd), base_(base), offset_(offset), length_(length),
+          ref_count_(ref_count), metadata_(metadata)
+    {
+        // reference count of independent file object count
+        if (NULL!=ref_count_)
+            inc_and_fetch(ref_count_);
 
- private:
-  port::Mutex mu_;
-  port::AtomicPointer allowed_;
+        // reference count of threads/paths using this object
+        //  (because there is a direct path and a threaded path usage)
+        RefInc();
+    };
 
-  intptr_t GetAllowed() const {
-    return reinterpret_cast<intptr_t>(allowed_.Acquire_Load());
-  }
+    virtual ~BGCloseInfo() {};
 
-  // REQUIRES: mu_ must be held
-  void SetAllowed(intptr_t v) {
-    allowed_.Release_Store(reinterpret_cast<void*>(v));
-  }
+    virtual void operator()() {BGFileUnmapper2(this);};
+
+private:
+    BGCloseInfo();
+    BGCloseInfo(const BGCloseInfo &);
+    BGCloseInfo & operator=(const BGCloseInfo &);
 
-  Limiter(const Limiter&);
-  void operator=(const Limiter&);
 };
 
 class PosixSequentialFile: public SequentialFile {
@@ -121,183 +126,384 @@ class PosixSequentialFile: public SequentialFile {
     }
     return Status::OK();
   }
-
-  virtual std::string GetName() const { return filename_; }
 };
 
 // pread() based random-access
 class PosixRandomAccessFile: public RandomAccessFile {
  private:
   std::string filename_;
-  bool temporary_fd_;  // If true, fd_ is -1 and we open on every read.
   int fd_;
-  Limiter* limiter_;
+  bool is_compaction_;
+  uint64_t file_size_;
 
  public:
-  PosixRandomAccessFile(const std::string& fname, int fd, Limiter* limiter)
-      : filename_(fname), fd_(fd), limiter_(limiter) {
-    temporary_fd_ = !limiter->Acquire();
-    if (temporary_fd_) {
-      // Open file on every access.
-      close(fd_);
-      fd_ = -1;
-    }
+  PosixRandomAccessFile(const std::string& fname, int fd)
+      : filename_(fname), fd_(fd), is_compaction_(false), file_size_(0)
+  {
+#if defined(HAVE_FADVISE)
+    posix_fadvise(fd_, 0, file_size_, POSIX_FADV_RANDOM);
+#endif
+    gPerfCounters->Inc(ePerfROFileOpen);
   }
+  virtual ~PosixRandomAccessFile()
+  {
+      if (is_compaction_)
+      {
+#if defined(HAVE_FADVISE)
+          posix_fadvise(fd_, 0, file_size_, POSIX_FADV_DONTNEED);
+#endif
+      }   // if
 
-  virtual ~PosixRandomAccessFile() {
-    if (!temporary_fd_) {
-      close(fd_);
-      limiter_->Release();
-    }
+     gPerfCounters->Inc(ePerfROFileClose);
+     close(fd_);
   }
 
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const {
-    int fd = fd_;
-    if (temporary_fd_) {
-      fd = open(filename_.c_str(), O_RDONLY);
-      if (fd < 0) {
-        return IOError(filename_, errno);
-      }
-    }
-
     Status s;
-    ssize_t r = pread(fd, scratch, n, static_cast<off_t>(offset));
+    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
     *result = Slice(scratch, (r < 0) ? 0 : r);
     if (r < 0) {
       // An error: return a non-ok status
       s = IOError(filename_, errno);
     }
-    if (temporary_fd_) {
-      // Close the temporary file descriptor opened earlier.
-      close(fd);
-    }
     return s;
   }
 
-  virtual std::string GetName() const { return filename_; }
+  virtual void SetForCompaction(uint64_t file_size)
+  {
+      is_compaction_=true;
+      file_size_=file_size;
+#if defined(HAVE_FADVISE)
+      posix_fadvise(fd_, 0, file_size_, POSIX_FADV_SEQUENTIAL);
+#endif
+
+  };
+
+  // Riak addition:  size of this structure in bytes
+  virtual size_t ObjectSize() {return(sizeof(PosixRandomAccessFile)+filename_.length());};
+
 };
 
-// mmap() based random-access
-class PosixMmapReadableFile: public RandomAccessFile {
+
+// We preallocate up to an extra megabyte and use memcpy to append new
+// data to the file.  This is safe since we either properly close the
+// file before reading from it, or for log files, the reading code
+// knows enough to skip zero suffixes.
+class PosixMmapFile : public WritableFile {
  private:
   std::string filename_;
-  void* mmapped_region_;
-  size_t length_;
-  Limiter* limiter_;
+  int fd_;
+  size_t page_size_;
+  size_t map_size_;       // How much extra memory to map at a time
+  char* base_;            // The mapped region
+  char* limit_;           // Limit of the mapped region
+  char* dst_;             // Where to write next  (in range [base_,limit_])
+  char* last_sync_;       // Where have we synced up to
+  uint64_t file_offset_;  // Offset of base_ in file
+  uint64_t metadata_offset_; // Offset where sst metadata starts, or zero
+  bool pending_sync_;     // Have we done an munmap of unsynced data?
+  bool is_async_;        // can this file process in background
+  volatile uint64_t * ref_count_; // alternative to std:shared_ptr that is thread safe everywhere
 
- public:
-  // base[0,length-1] contains the mmapped contents of the file.
-  PosixMmapReadableFile(const std::string& fname, void* base, size_t length,
-                        Limiter* limiter)
-      : filename_(fname), mmapped_region_(base), length_(length),
-        limiter_(limiter) {
+  // Roundup x to a multiple of y
+  static size_t Roundup(size_t x, size_t y) {
+    return ((x + y - 1) / y) * y;
   }
 
-  virtual ~PosixMmapReadableFile() {
-    munmap(mmapped_region_, length_);
-    limiter_->Release();
-  }
-
-  virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const {
-    Status s;
-    if (offset + n > length_) {
-      *result = Slice();
-      s = IOError(filename_, EINVAL);
-    } else {
-      *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
-    }
+  size_t TruncateToPageBoundary(size_t s) {
+    s -= (s & (page_size_ - 1));
+    assert((s % page_size_) == 0);
     return s;
   }
 
-  virtual std::string GetName() const { return filename_; }
-};
+  bool UnmapCurrentRegion() {
+    bool result = true;
+    if (base_ != NULL) {
+      if (last_sync_ < limit_) {
+        // Defer syncing this data until next Sync() call, if any
+        pending_sync_ = true;
+      }
 
-class PosixWritableFile : public WritableFile {
- private:
-  std::string filename_;
-  FILE* file_;
+
+      // write only files can perform operations async, but not
+      //  files that might re-open and read again soon
+      if (!is_async_)
+      {
+          BGCloseInfo * ptr=new BGCloseInfo(fd_, base_, file_offset_, limit_-base_,
+                                            NULL, metadata_offset_);
+          BGFileUnmapper2(ptr);
+      }   // if
+
+      // called from user thread, move these operations to background
+      //  queue
+      else
+      {
+          BGCloseInfo * ptr=new BGCloseInfo(fd_, base_, file_offset_, limit_-base_,
+                                            ref_count_, metadata_offset_);
+          gWriteThreads->Submit(ptr);
+      }   // else
+
+      file_offset_ += limit_ - base_;
+      base_ = NULL;
+      limit_ = NULL;
+      last_sync_ = NULL;
+      dst_ = NULL;
+
+    }
+
+    return result;
+  }
+
+  bool MapNewRegion() {
+    size_t offset_adjust;
+
+    // append mode file might not have file_offset_ on a page boundry
+    offset_adjust=file_offset_ % page_size_;
+    if (0!=offset_adjust)
+        file_offset_-=offset_adjust;
+
+    assert(base_ == NULL);
+    if (ftruncate(fd_, file_offset_ + map_size_) < 0) {
+      return false;
+    }
+    void* ptr = mmap(NULL, map_size_, PROT_WRITE, MAP_SHARED,
+                     fd_, file_offset_);
+    if (ptr == MAP_FAILED) {
+      return false;
+    }
+    base_ = reinterpret_cast<char*>(ptr);
+    limit_ = base_ + map_size_;
+    dst_ = base_ + offset_adjust;
+    last_sync_ = base_;
+    return true;
+  }
 
  public:
-  PosixWritableFile(const std::string& fname, FILE* f)
-      : filename_(fname), file_(f) { }
+  PosixMmapFile(const std::string& fname, int fd,
+                size_t page_size, size_t file_offset=0L,
+                bool is_async=false,
+                size_t map_size=gMapSize)
+      : filename_(fname),
+        fd_(fd),
+        page_size_(page_size),
+        map_size_(Roundup(map_size, page_size)),
+        base_(NULL),
+        limit_(NULL),
+        dst_(NULL),
+        last_sync_(NULL),
+        file_offset_(file_offset),
+        metadata_offset_(0),
+        pending_sync_(false),
+        is_async_(is_async),
+        ref_count_(NULL)
+    {
+    assert((page_size & (page_size - 1)) == 0);
 
-  ~PosixWritableFile() {
-    if (file_ != NULL) {
-      // Ignoring any potential errors
-      fclose(file_);
+    if (is_async_)
+    {
+        ref_count_=new volatile uint64_t[2];
+        *ref_count_=1;      // one ref count for PosixMmapFile object
+        *(ref_count_+1)=0;  // filesize
+    }   // if
+
+    // when global set, make entire file use FADV_WILLNEED
+    if (gFadviseWillNeed)
+        metadata_offset_=1;
+
+    gPerfCounters->Inc(ePerfRWFileOpen);
+  }
+
+  ~PosixMmapFile() {
+    if (fd_ >= 0) {
+      PosixMmapFile::Close();
     }
   }
 
   virtual Status Append(const Slice& data) {
-    size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_);
-    if (r != data.size()) {
-      return IOError(filename_, errno);
+    const char* src = data.data();
+    size_t left = data.size();
+    while (left > 0) {
+      assert(base_ <= dst_);
+      assert(dst_ <= limit_);
+      size_t avail = limit_ - dst_;
+      if (avail == 0) {
+        if (!UnmapCurrentRegion() ||
+            !MapNewRegion()) {
+          return IOError(filename_, errno);
+        }
+      }
+
+      size_t n = (left <= avail) ? left : avail;
+      memcpy(dst_, src, n);
+      dst_ += n;
+      src += n;
+      left -= n;
     }
     return Status::OK();
   }
 
   virtual Status Close() {
-    Status result;
-    if (fclose(file_) != 0) {
-      result = IOError(filename_, errno);
+    Status s;
+    size_t file_length;
+    int ret_val;
+
+
+    // compute actual file length before final unmap
+    file_length=file_offset_ + (dst_ - base_);
+
+    if (!UnmapCurrentRegion()) {
+        s = IOError(filename_, errno);
     }
-    file_ = NULL;
-    return result;
+
+    // hard code
+    if (!is_async_)
+    {
+        ret_val=ftruncate(fd_, file_length);
+        if (0!=ret_val)
+        {
+            syslog(LOG_ERR,"Close ftruncate failed [%d, %m]", errno);
+            s = IOError(filename_, errno);
+        }   // if
+
+        ret_val=close(fd_);
+    }  // if
+
+    // async close
+    else
+    {
+        *(ref_count_ +1)=file_length;
+        ret_val=ReleaseRef(ref_count_, fd_);
+
+        // retry once if failed
+        if (0!=ret_val)
+        {
+            Env::Default()->SleepForMicroseconds(500000);
+            ret_val=ReleaseRef(ref_count_, fd_);
+            if (0!=ret_val)
+            {
+                syslog(LOG_ERR,"ReleaseRef failed in Close");
+                s = IOError(filename_, errno);
+                delete [] ref_count_;
+
+                // force close
+                ret_val=close(fd_);
+            }   // if
+        }   // if
+    }   // else
+
+    fd_ = -1;
+    ref_count_=NULL;
+    base_ = NULL;
+    limit_ = NULL;
+    return s;
   }
 
   virtual Status Flush() {
-    if (fflush_unlocked(file_) != 0) {
-      return IOError(filename_, errno);
-    }
     return Status::OK();
   }
 
-  Status SyncDirIfManifest() {
-    const char* f = filename_.c_str();
-    const char* sep = strrchr(f, '/');
-    Slice basename;
-    std::string dir;
-    if (sep == NULL) {
-      dir = ".";
-      basename = f;
-    } else {
-      dir = std::string(f, sep - f);
-      basename = sep + 1;
-    }
+  virtual Status Sync() {
     Status s;
-    if (basename.starts_with("MANIFEST")) {
-      int fd = open(dir.c_str(), O_RDONLY);
-      if (fd < 0) {
-        s = IOError(dir, errno);
-      } else {
-        if (fsync(fd) < 0 && errno != EINVAL) {
-          s = IOError(dir, errno);
-        }
-        close(fd);
+
+    if (pending_sync_) {
+      // Some unmapped data was not synced
+      pending_sync_ = false;
+      if (fdatasync(fd_) < 0) {
+        s = IOError(filename_, errno);
       }
     }
+
+    if (dst_ > last_sync_) {
+      // Find the beginnings of the pages that contain the first and last
+      // bytes to be synced.
+      size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
+      size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
+      last_sync_ = dst_;
+      if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
+        s = IOError(filename_, errno);
+      }
+    }
+
     return s;
   }
 
-  virtual Status Sync() {
-    // Ensure new files referred to by the manifest are in the filesystem.
-    Status s = SyncDirIfManifest();
-    if (!s.ok()) {
-      return s;
-    }
-    if (fflush_unlocked(file_) != 0 ||
-        fdatasync(fileno(file_)) != 0) {
-      s = Status::IOError(filename_, strerror(errno));
-    }
-    return s;
-  }
+  virtual void SetMetadataOffset(uint64_t Metadata)
+  {
+      // when global set, make entire file use FADV_WILLNEED,
+      //  so ignore this setting
+      if (!gFadviseWillNeed && 1!=metadata_offset_)
+          metadata_offset_=Metadata;
+  }   // SetMetadataOffset
+
+
+  // if std::shared_ptr was guaranteed thread safe everywhere
+  //  the following function would be best written differently
+  static int ReleaseRef(volatile uint64_t * Count, int File)
+  {
+      bool good;
+
+      good=true;
+      if (NULL!=Count)
+      {
+          int ret_val;
+
+          ret_val=dec_and_fetch(Count);
+          if (0==ret_val)
+          {
+              ret_val=ftruncate(File, *(Count +1));
+              if (0!=ret_val)
+              {
+                  syslog(LOG_ERR,"ReleaseRef ftruncate failed [%d, %m]", errno);
+                  gPerfCounters->Inc(ePerfBGWriteError);
+                  good=false;
+              }   // if
+
+              if (good)
+              {
+                  ret_val=close(File);
+                  if (0==ret_val)
+                  {
+                      gPerfCounters->Inc(ePerfRWFileClose);
+                  }   // if
+                  else
+                  {
+                      syslog(LOG_ERR,"ReleaseRef close failed [%d, %m]", errno);
+                      gPerfCounters->Inc(ePerfBGWriteError);
+                      good=false;
+                  }   // else
+
+              }   // if
+
+              if (good)
+                  delete [] Count;
+              else
+                  inc_and_fetch(Count); // try again.
+
+          }   // if
+      }   // if
+
+      return(good ? 0 : -1);
+
+  }   // static ReleaseRef
 
-  virtual std::string GetName() const { return filename_; }
 };
 
+
+// matthewv July 17, 2012 ... riak was overlapping activity on the
+//  same database directory due to the incorrect assumption that the
+//  code below worked within the riak process space.  The fix leads to a choice:
+// fcntl() only locks against external processes, not multiple locks from
+//  same process.  But it has worked great with NFS forever
+// flock() locks against both external processes and multiple locks from
+//  same process.  It does not with NFS until Linux 2.6.12 ... other OS may vary.
+//  SmartOS/Solaris do not appear to support flock() though there is a man page.
+// Pick the fcntl() or flock() below as appropriate for your environment / needs.
+
 static int LockOrUnlock(int fd, bool lock) {
+#ifndef LOCK_UN
+    // works with NFS, but fails if same process attempts second access to
+    //  db, i.e. makes second DB object to same directory
   errno = 0;
   struct flock f;
   memset(&f, 0, sizeof(f));
@@ -306,6 +512,10 @@ static int LockOrUnlock(int fd, bool lock) {
   f.l_start = 0;
   f.l_len = 0;        // Lock/unlock entire file
   return fcntl(fd, F_SETLK, &f);
+#else
+  // does NOT work with NFS, but DOES work within same process
+  return flock(fd, (lock ? LOCK_EX : LOCK_UN) | LOCK_NB);
+#endif
 }
 
 class PosixFileLock : public FileLock {
@@ -332,14 +542,12 @@ class PosixLockTable {
   }
 };
 
+static PosixLockTable gFileLocks;
+
 class PosixEnv : public Env {
  public:
   PosixEnv();
-  virtual ~PosixEnv() {
-    char msg[] = "Destroying Env::Default()\n";
-    fwrite(msg, 1, sizeof(msg), stderr);
-    abort();
-  }
+  virtual ~PosixEnv();
 
   virtual Status NewSequentialFile(const std::string& fname,
                                    SequentialFile** result) {
@@ -360,53 +568,84 @@ class PosixEnv : public Env {
     int fd = open(fname.c_str(), O_RDONLY);
     if (fd < 0) {
       s = IOError(fname, errno);
-    } else if (mmap_limit_.Acquire()) {
+#if 0
+      // going to let page cache tune the file
+      //  system reads instead of hoping to better
+      //  manage through memory mapped files.
+    } else if (sizeof(void*) >= 8) {
+      // Use mmap when virtual address-space is plentiful.
       uint64_t size;
       s = GetFileSize(fname, &size);
       if (s.ok()) {
         void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
         if (base != MAP_FAILED) {
-          *result = new PosixMmapReadableFile(fname, base, size, &mmap_limit_);
+            *result = new PosixMmapReadableFile(fname, base, size, fd);
         } else {
           s = IOError(fname, errno);
+          close(fd);
         }
       }
-      close(fd);
-      if (!s.ok()) {
-        mmap_limit_.Release();
-      }
+#endif
     } else {
-      *result = new PosixRandomAccessFile(fname, fd, &fd_limit_);
+      *result = new PosixRandomAccessFile(fname, fd);
     }
     return s;
   }
 
   virtual Status NewWritableFile(const std::string& fname,
-                                 WritableFile** result) {
+                                 WritableFile** result,
+                                 size_t map_size) {
     Status s;
-    FILE* f = fopen(fname.c_str(), "w");
-    if (f == NULL) {
+    const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+    if (fd < 0) {
       *result = NULL;
       s = IOError(fname, errno);
     } else {
-      *result = new PosixWritableFile(fname, f);
+      *result = new PosixMmapFile(fname, fd, page_size_, 0, false, map_size);
     }
     return s;
   }
 
   virtual Status NewAppendableFile(const std::string& fname,
-                                   WritableFile** result) {
+                                   WritableFile** result,
+                                   size_t map_size) {
     Status s;
-    FILE* f = fopen(fname.c_str(), "a");
-    if (f == NULL) {
+    const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644);
+    if (fd < 0) {
+      *result = NULL;
+      s = IOError(fname, errno);
+    } else
+    {
+      uint64_t size;
+      s = GetFileSize(fname, &size);
+      if (s.ok())
+      {
+          *result = new PosixMmapFile(fname, fd, page_size_, size, false, map_size);
+      }   // if
+      else
+      {
+          s = IOError(fname, errno);
+          close(fd);
+      }   // else
+    }   // else
+    return s;
+  }
+
+  virtual Status NewWriteOnlyFile(const std::string& fname,
+                                  WritableFile** result,
+                                  size_t map_size) {
+    Status s;
+    const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+    if (fd < 0) {
       *result = NULL;
       s = IOError(fname, errno);
     } else {
-      *result = new PosixWritableFile(fname, f);
+      *result = new PosixMmapFile(fname, fd, page_size_, 0, true, map_size);
     }
     return s;
   }
 
+
   virtual bool FileExists(const std::string& fname) {
     return access(fname.c_str(), F_OK) == 0;
   }
@@ -432,7 +671,7 @@ class PosixEnv : public Env {
       result = IOError(fname, errno);
     }
     return result;
-  }
+  };
 
   virtual Status CreateDir(const std::string& name) {
     Status result;
@@ -440,7 +679,7 @@ class PosixEnv : public Env {
       result = IOError(name, errno);
     }
     return result;
-  }
+  };
 
   virtual Status DeleteDir(const std::string& name) {
     Status result;
@@ -448,7 +687,7 @@ class PosixEnv : public Env {
       result = IOError(name, errno);
     }
     return result;
-  }
+  };
 
   virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
     Status s;
@@ -476,17 +715,18 @@ class PosixEnv : public Env {
     int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
     if (fd < 0) {
       result = IOError(fname, errno);
-    } else if (!locks_.Insert(fname)) {
+    } else if (!gFileLocks.Insert(fname)) {
       close(fd);
       result = Status::IOError("lock " + fname, "already held by process");
     } else if (LockOrUnlock(fd, true) == -1) {
       result = IOError("lock " + fname, errno);
       close(fd);
-      locks_.Remove(fname);
+      gFileLocks.Remove(fname);
     } else {
       PosixFileLock* my_lock = new PosixFileLock;
       my_lock->fd_ = fd;
       my_lock->name_ = fname;
+
       *lock = my_lock;
     }
     return result;
@@ -498,15 +738,18 @@ class PosixEnv : public Env {
     if (LockOrUnlock(my_lock->fd_, false) == -1) {
       result = IOError("unlock", errno);
     }
-    locks_.Remove(my_lock->name_);
+    gFileLocks.Remove(my_lock->name_);
     close(my_lock->fd_);
+
+    my_lock->fd_=-1;
+
     delete my_lock;
     return result;
   }
 
   virtual void Schedule(void (*function)(void*), void* arg);
 
-  virtual void StartThread(void (*function)(void* arg), void* arg);
+  virtual pthread_t StartThread(void (*function)(void* arg), void* arg);
 
   virtual Status GetTestDirectory(std::string* result) {
     const char* env = getenv("TEST_TMPDIR");
@@ -541,122 +784,110 @@ class PosixEnv : public Env {
   }
 
   virtual uint64_t NowMicros() {
+#if _POSIX_TIMERS >= 200801L
+    struct timespec ts;
+
+    // this is rumored to be faster that gettimeofday(),
+    //  and sometimes shift less ... someday use CLOCK_MONOTONIC_RAW
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000 + ts.tv_nsec/1000;
+#else
     struct timeval tv;
     gettimeofday(&tv, NULL);
     return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#endif
   }
 
   virtual void SleepForMicroseconds(int micros) {
-    usleep(micros);
-  }
+    struct timespec ts;
+    int ret_val;
+
+    if (0!=micros)
+    {
+        micros=(micros/clock_res_ +1)*clock_res_;
+        ts.tv_sec=micros/1000000;
+        ts.tv_nsec=(micros - ts.tv_sec*1000000) *1000;
+
+        do
+        {
+#if _POSIX_TIMERS >= 200801L
+            // later ... add test for CLOCK_MONOTONIC_RAW where supported (better)
+            ret_val=clock_nanosleep(CLOCK_MONOTONIC,0, &ts, &ts);
+#else
+            ret_val=nanosleep(&ts, &ts);
+#endif
+        } while(EINTR==ret_val && 0!=(ts.tv_sec+ts.tv_nsec));
+    }   // if
+  }  // SleepForMicroSeconds
+
+
+  virtual size_t RecoveryMmapSize(const struct Options * options) const
+    {
+      size_t map_size;
+
+      if (NULL!=options)
+      {
+        // large buffers, try for a little bit bigger than half hoping
+        //  for two writes ... not three
+        if (10*1024*1024 < options->write_buffer_size)
+            map_size=(options->write_buffer_size/6)*4;
+        else
+            map_size=(options->write_buffer_size*12)/10;  // integer multiply 1.2
+      } // if
+      else
+        map_size=2*1024*1024L;
+
+      return(map_size);
+    };
 
  private:
+
   void PthreadCall(const char* label, int result) {
     if (result != 0) {
       fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
-      abort();
+      exit(1);
     }
   }
 
-  // BGThread() is the body of the background thread
-  void BGThread();
-  static void* BGThreadWrapper(void* arg) {
-    reinterpret_cast<PosixEnv*>(arg)->BGThread();
-    return NULL;
-  }
-
+  size_t page_size_;
   pthread_mutex_t mu_;
   pthread_cond_t bgsignal_;
-  pthread_t bgthread_;
-  bool started_bgthread_;
+  int64_t clock_res_;
 
   // Entry per Schedule() call
-  struct BGItem { void* arg; void (*function)(void*); };
-  typedef std::deque<BGItem> BGQueue;
-  BGQueue queue_;
+  struct BGItem { void* arg; void (*function)(void*); int priority;};
 
-  PosixLockTable locks_;
-  Limiter mmap_limit_;
-  Limiter fd_limit_;
 };
 
-// Return the maximum number of concurrent mmaps.
-static int MaxMmaps() {
-  if (mmap_limit >= 0) {
-    return mmap_limit;
-  }
-  // Up to 4096 mmaps for 64-bit binaries; none for smaller pointer sizes.
-  mmap_limit = sizeof(void*) >= 8 ? 4096 : 0;
-  return mmap_limit;
-}
 
-// Return the maximum number of read-only files to keep open.
-static intptr_t MaxOpenFiles() {
-  if (open_read_only_file_limit >= 0) {
-    return open_read_only_file_limit;
-  }
-  struct rlimit rlim;
-  if (getrlimit(RLIMIT_NOFILE, &rlim)) {
-    // getrlimit failed, fallback to hard-coded default.
-    open_read_only_file_limit = 50;
-  } else if (rlim.rlim_cur == RLIM_INFINITY) {
-    open_read_only_file_limit = std::numeric_limits<int>::max();
-  } else {
-    // Allow use of 20% of available file descriptors for read-only files.
-    open_read_only_file_limit = rlim.rlim_cur / 5;
-  }
-  return open_read_only_file_limit;
-}
+PosixEnv::PosixEnv() : page_size_(getpagesize()),
+                       clock_res_(1)
+{
+
+#if _POSIX_TIMERS >= 200801L
+  struct timespec ts;
+  clock_getres(CLOCK_MONOTONIC, &ts);
+  clock_res_=ts.tv_sec*1000000+ts.tv_nsec/1000;
+  if (0==clock_res_)
+      ++clock_res_;
+#endif
 
-PosixEnv::PosixEnv()
-    : started_bgthread_(false),
-      mmap_limit_(MaxMmaps()),
-      fd_limit_(MaxOpenFiles()) {
   PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL));
   PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL));
 }
 
+
+PosixEnv::~PosixEnv()
+{
+}   // PosixEnf::~PosixEnv
+
 void PosixEnv::Schedule(void (*function)(void*), void* arg) {
-  PthreadCall("lock", pthread_mutex_lock(&mu_));
+    ThreadTask * task;
 
-  // Start background thread if necessary
-  if (!started_bgthread_) {
-    started_bgthread_ = true;
-    PthreadCall(
-        "create thread",
-        pthread_create(&bgthread_, NULL,  &PosixEnv::BGThreadWrapper, this));
-  }
-
-  // If the queue is currently empty, the background thread may currently be
-  // waiting.
-  if (queue_.empty()) {
-    PthreadCall("signal", pthread_cond_signal(&bgsignal_));
-  }
-
-  // Add to priority queue
-  queue_.push_back(BGItem());
-  queue_.back().function = function;
-  queue_.back().arg = arg;
-
-  PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+    task=new LegacyTask(function,arg);
+    gCompactionThreads->Submit(task, true);
 }
 
-void PosixEnv::BGThread() {
-  while (true) {
-    // Wait until there is an item that is ready to run
-    PthreadCall("lock", pthread_mutex_lock(&mu_));
-    while (queue_.empty()) {
-      PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
-    }
-
-    void (*function)(void*) = queue_.front().function;
-    void* arg = queue_.front().arg;
-    queue_.pop_front();
-
-    PthreadCall("unlock", pthread_mutex_unlock(&mu_));
-    (*function)(arg);
-  }
-}
 
 namespace {
 struct StartThreadState {
@@ -671,29 +902,185 @@ static void* StartThreadWrapper(void* arg) {
   return NULL;
 }
 
-void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
+pthread_t PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
   pthread_t t;
   StartThreadState* state = new StartThreadState;
   state->user_function = function;
   state->arg = arg;
   PthreadCall("start thread",
               pthread_create(&t, NULL,  &StartThreadWrapper, state));
+
+  return(t);
 }
 
+
+// Called by BGFileUnmapper which manages retries
+//    this was a new file:  unmap, hold in page cache
+int
+BGFileUnmapper(void * arg)
+{
+    BGCloseInfo * file_ptr;
+    bool err_flag;
+    int ret_val;
+
+    //
+    // Reminder:  this could get called multiple times for
+    //            same "arg" due to error retry
+    //
+
+    err_flag=false;
+    file_ptr=(BGCloseInfo *)arg;
+
+    // non-null implies this is a background job,
+    //  i.e. not on direct thread of compaction.
+    if (NULL!=file_ptr->ref_count_)
+        gPerfCounters->Inc(ePerfBGCloseUnmap);
+
+    if (NULL!=file_ptr->base_)
+    {
+        ret_val=munmap(file_ptr->base_, file_ptr->length_);
+        if (0==ret_val)
+        {
+            file_ptr->base_=NULL;
+        }   // if
+        else
+        {
+            syslog(LOG_ERR,"BGFileUnmapper2 munmap failed [%d, %m]", errno);
+            err_flag=true;
+        }  // else
+    }   // if
+
+#if defined(HAVE_FADVISE)
+    if (0==file_ptr->metadata_
+        || (file_ptr->offset_ + file_ptr->length_ < file_ptr->metadata_))
+    {
+        // must fdatasync for DONTNEED to work
+        ret_val=fdatasync(file_ptr->fd_);
+        if (0!=ret_val)
+        {
+            syslog(LOG_ERR,"BGFileUnmapper2 fdatasync failed on %d [%d, %m]", file_ptr->fd_, errno);
+            err_flag=true;
+        }  // if
+
+        ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED);
+        if (0!=ret_val)
+        {
+            syslog(LOG_ERR,"BGFileUnmapper2 posix_fadvise DONTNEED failed on %d [%d]", file_ptr->fd_, ret_val);
+            err_flag=true;
+        }  // if
+    }   // if
+    else
+    {
+        ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED);
+        if (0!=ret_val)
+        {
+            syslog(LOG_ERR,"BGFileUnmapper2 posix_fadvise WILLNEED failed on %d [%d]", file_ptr->fd_, ret_val);
+            err_flag=true;
+        }  // if
+    }   // else
+#endif
+
+    // release access to file, maybe close it
+    if (!err_flag)
+    {
+        ret_val=PosixMmapFile::ReleaseRef(file_ptr->ref_count_, file_ptr->fd_);
+        err_flag=(0!=ret_val);
+    }   // if
+
+    if (err_flag)
+        gPerfCounters->Inc(ePerfBGWriteError);
+
+    // routine called directly or via async thread, this
+    //  controls when to delete file_ptr object
+    if (!err_flag)
+    {
+        gPerfCounters->Inc(ePerfRWFileUnmap);
+        file_ptr->RefDec();
+    }   // if
+
+    return(err_flag ? -1 : 0);
+
+}   // BGFileUnmapper
+
+
+// Thread entry point, and retry loop
+void BGFileUnmapper2(void * arg)
+{
+    int retries, ret_val;
+
+    retries=0;
+    ret_val=0;
+
+    do
+    {
+        if (1<retries)
+            Env::Default()->SleepForMicroseconds(100000);
+
+        ret_val=BGFileUnmapper(arg);
+        ++retries;
+    } while(retries<3 && 0!=ret_val);
+
+    // release object's memory here
+    if (0!=ret_val)
+    {
+        BGCloseInfo * file_ptr;
+
+        file_ptr=(BGCloseInfo *)arg;
+        file_ptr->RefDec();
+    }   // if
+
+    return;
+
+}   // BGFileUnmapper2
+
+
+
 }  // namespace
 
+// how many blocks of 4 priority background threads/queues
+/// for riak, make sure this is an odd number (and especially not 4)
+#define THREAD_BLOCKS 1
+
+static bool HasSSE4_2();
+
 static pthread_once_t once = PTHREAD_ONCE_INIT;
 static Env* default_env;
-static void InitDefaultEnv() { default_env = new PosixEnv; }
+static volatile bool started=false;
+static void InitDefaultEnv()
+{
+    default_env=new PosixEnv;
 
-void EnvPosixTestHelper::SetReadOnlyFDLimit(int limit) {
-  assert(default_env == NULL);
-  open_read_only_file_limit = limit;
-}
+    ThrottleInit();
 
-void EnvPosixTestHelper::SetReadOnlyMMapLimit(int limit) {
-  assert(default_env == NULL);
-  mmap_limit = limit;
+    // force the loading of code for both filters in case they
+    //  are hidden in a shared library
+    const FilterPolicy * ptr;
+    ptr=NewBloomFilterPolicy(16);
+    delete ptr;
+    ptr=NewBloomFilterPolicy2(16);
+    delete ptr;
+
+    if (HasSSE4_2())
+        crc32c::SwitchToHardwareCRC();
+
+    PerformanceCounters::Init(false);
+
+    gImmThreads=new HotThreadPool(5, "ImmWrite",
+                                  ePerfBGImmDirect, ePerfBGImmQueued,
+                                  ePerfBGImmDequeued, ePerfBGImmWeighted);
+    gWriteThreads=new HotThreadPool(3, "RecoveryWrite",
+                                    ePerfBGUnmapDirect, ePerfBGUnmapQueued,
+                                    ePerfBGUnmapDequeued, ePerfBGUnmapWeighted);
+    gLevel0Threads=new HotThreadPool(3, "Level0Compact",
+                                     ePerfBGLevel0Direct, ePerfBGLevel0Queued,
+                                     ePerfBGLevel0Dequeued, ePerfBGLevel0Weighted);
+    // "2" is for Linux OS "nice", assumption is "1" nice might be
+    //   used by AAE hash trees in the future
+    gCompactionThreads=new HotThreadPool(3, "GeneralCompact",
+                                         ePerfBGCompactDirect, ePerfBGCompactQueued,
+                                         ePerfBGCompactDequeued, ePerfBGCompactWeighted, 2);
+
+    started=true;
 }
 
 Env* Env::Default() {
@@ -701,6 +1088,73 @@ Env* Env::Default() {
   return default_env;
 }
 
-}  // namespace leveldb
 
+void Env::Shutdown()
+{
+    if (started)
+    {
+        // prevent throttle from initiating new compactions
+        ThrottleStopThreads();
+    }   // if
+
+    DBListShutdown();
+
+    delete gImmThreads;
+    gImmThreads=NULL;
+
+    delete gWriteThreads;
+    gWriteThreads=NULL;
+
+    delete gLevel0Threads;
+    gLevel0Threads=NULL;
+
+    delete gCompactionThreads;
+    gCompactionThreads=NULL;
+
+    if (started)
+    {
+        // release throttle globals now that
+        //  background compaction threads done
+        ThrottleClose();
+
+        delete default_env;
+        default_env=NULL;
+    }   // if
+
+    ExpiryModule::ShutdownExpiryModule();
+
+    // wait until compaction threads complete before
+    //  releasing comparator object (else segfault possible)
+    ComparatorShutdown();
+
+    PerformanceCounters::Close(gPerfCounters);
+
+}   // Env::Shutdown
+
+
+static bool
+HasSSE4_2()
+{
+#if defined(__x86_64__)
+    uint64_t ecx;
+    ecx=0;
+
+    __asm__ __volatile__
+        ("mov %%rbx, %%rdi\n\t" /* 32bit PIC: don't clobber ebx */
+         "mov $1,%%rax\n\t"
+         "cpuid\n\t"
+         "mov %%rdi, %%rbx\n\t"
+         : "=c" (ecx)
+         :
+         : "%rax", "%rbx", "%rdx", "%rdi" );
+
+    return( 0 != (ecx & 1<<20));
+#else
+    return(false);
 #endif
+
+}   // HasSSE4_2
+
+
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/env_posix_test.cc b/src/leveldb/util/env_posix_test.cc
deleted file mode 100644
index 295f8ae44..000000000
--- a/src/leveldb/util/env_posix_test.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "leveldb/env.h"
-
-#include "port/port.h"
-#include "util/testharness.h"
-#include "util/env_posix_test_helper.h"
-
-namespace leveldb {
-
-static const int kDelayMicros = 100000;
-static const int kReadOnlyFileLimit = 4;
-static const int kMMapLimit = 4;
-
-class EnvPosixTest {
- public:
-  Env* env_;
-  EnvPosixTest() : env_(Env::Default()) { }
-
-  static void SetFileLimits(int read_only_file_limit, int mmap_limit) {
-    EnvPosixTestHelper::SetReadOnlyFDLimit(read_only_file_limit);
-    EnvPosixTestHelper::SetReadOnlyMMapLimit(mmap_limit);
-  }
-};
-
-TEST(EnvPosixTest, TestOpenOnRead) {
-  // Write some test data to a single file that will be opened |n| times.
-  std::string test_dir;
-  ASSERT_OK(env_->GetTestDirectory(&test_dir));
-  std::string test_file = test_dir + "/open_on_read.txt";
-
-  FILE* f = fopen(test_file.c_str(), "w");
-  ASSERT_TRUE(f != NULL);
-  const char kFileData[] = "abcdefghijklmnopqrstuvwxyz";
-  fputs(kFileData, f);
-  fclose(f);
-
-  // Open test file some number above the sum of the two limits to force
-  // open-on-read behavior of POSIX Env leveldb::RandomAccessFile.
-  const int kNumFiles = kReadOnlyFileLimit + kMMapLimit + 5;
-  leveldb::RandomAccessFile* files[kNumFiles] = {0};
-  for (int i = 0; i < kNumFiles; i++) {
-    ASSERT_OK(env_->NewRandomAccessFile(test_file, &files[i]));
-  }
-  char scratch;
-  Slice read_result;
-  for (int i = 0; i < kNumFiles; i++) {
-    ASSERT_OK(files[i]->Read(i, 1, &read_result, &scratch));
-    ASSERT_EQ(kFileData[i], read_result[0]);
-  }
-  for (int i = 0; i < kNumFiles; i++) {
-    delete files[i];
-  }
-  ASSERT_OK(env_->DeleteFile(test_file));
-}
-
-}  // namespace leveldb
-
-int main(int argc, char** argv) {
-  // All tests currently run with the same read-only file limits.
-  leveldb::EnvPosixTest::SetFileLimits(leveldb::kReadOnlyFileLimit,
-                                       leveldb::kMMapLimit);
-  return leveldb::test::RunAllTests();
-}
diff --git a/src/leveldb/util/env_posix_test_helper.h b/src/leveldb/util/env_posix_test_helper.h
deleted file mode 100644
index 038696059..000000000
--- a/src/leveldb/util/env_posix_test_helper.h
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2017 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#ifndef STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_
-#define STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_
-
-namespace leveldb {
-
-class EnvPosixTest;
-
-// A helper for the POSIX Env to facilitate testing.
-class EnvPosixTestHelper {
- private:
-  friend class EnvPosixTest;
-
-  // Set the maximum number of read-only files that will be opened.
-  // Must be called before creating an Env.
-  static void SetReadOnlyFDLimit(int limit);
-
-  // Set the maximum number of read-only files that will be mapped via mmap.
-  // Must be called before creating an Env.
-  static void SetReadOnlyMMapLimit(int limit);
-};
-
-}  // namespace leveldb
-
-#endif  // STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_
diff --git a/src/leveldb/util/env_test.cc b/src/leveldb/util/env_test.cc
index 839ae56a1..8091103cd 100644
--- a/src/leveldb/util/env_test.cc
+++ b/src/leveldb/util/env_test.cc
@@ -10,31 +10,30 @@
 namespace leveldb {
 
 static const int kDelayMicros = 100000;
-static const int kReadOnlyFileLimit = 4;
-static const int kMMapLimit = 4;
 
-class EnvTest {
+class EnvPosixTest {
  private:
   port::Mutex mu_;
   std::string events_;
 
  public:
   Env* env_;
-  EnvTest() : env_(Env::Default()) { }
+  EnvPosixTest() : env_(Env::Default()) { }
 };
 
 static void SetBool(void* ptr) {
   reinterpret_cast<port::AtomicPointer*>(ptr)->NoBarrier_Store(ptr);
 }
 
-TEST(EnvTest, RunImmediately) {
+TEST(EnvPosixTest, RunImmediately) {
   port::AtomicPointer called (NULL);
   env_->Schedule(&SetBool, &called);
-  env_->SleepForMicroseconds(kDelayMicros);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
   ASSERT_TRUE(called.NoBarrier_Load() != NULL);
 }
 
-TEST(EnvTest, RunMany) {
+#if 0 // test assumes single thread and queue. No long valid assumption
+TEST(EnvPosixTest, RunMany) {
   port::AtomicPointer last_id (NULL);
 
   struct CB {
@@ -61,10 +60,11 @@ TEST(EnvTest, RunMany) {
   env_->Schedule(&CB::Run, &cb3);
   env_->Schedule(&CB::Run, &cb4);
 
-  env_->SleepForMicroseconds(kDelayMicros);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
   void* cur = last_id.Acquire_Load();
   ASSERT_EQ(4, reinterpret_cast<uintptr_t>(cur));
 }
+#endif
 
 struct State {
   port::Mutex mu;
@@ -80,12 +80,14 @@ static void ThreadBody(void* arg) {
   s->mu.Unlock();
 }
 
-TEST(EnvTest, StartThread) {
+TEST(EnvPosixTest, StartThread) {
   State state;
+  pthread_t pid;
   state.val = 0;
   state.num_running = 3;
   for (int i = 0; i < 3; i++) {
-    env_->StartThread(&ThreadBody, &state);
+    pid=env_->StartThread(&ThreadBody, &state);
+    pthread_detach(pid);
   }
   while (true) {
     state.mu.Lock();
@@ -94,7 +96,7 @@ TEST(EnvTest, StartThread) {
     if (num == 0) {
       break;
     }
-    env_->SleepForMicroseconds(kDelayMicros);
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
   }
   ASSERT_EQ(state.val, 3);
 }
diff --git a/src/leveldb/util/env_win.cc b/src/leveldb/util/env_win.cc
deleted file mode 100644
index 81380216b..000000000
--- a/src/leveldb/util/env_win.cc
+++ /dev/null
@@ -1,901 +0,0 @@
-// This file contains source that originates from:
-// http://code.google.com/p/leveldbwin/source/browse/trunk/win32_impl_src/env_win32.h
-// http://code.google.com/p/leveldbwin/source/browse/trunk/win32_impl_src/port_win32.cc
-// Those files don't have any explicit license headers but the 
-// project (http://code.google.com/p/leveldbwin/) lists the 'New BSD License'
-// as the license.
-#if defined(LEVELDB_PLATFORM_WINDOWS)
-#include <map>
-
-
-#include "leveldb/env.h"
-
-#include "port/port.h"
-#include "leveldb/slice.h"
-#include "util/logging.h"
-
-#include <shlwapi.h>
-#include <process.h>
-#include <cstring>
-#include <stdio.h>
-#include <errno.h>
-#include <io.h>
-#include <algorithm>
-
-#ifdef max
-#undef max
-#endif
-
-#ifndef va_copy
-#define va_copy(d,s) ((d) = (s))
-#endif
-
-#if defined DeleteFile
-#undef DeleteFile
-#endif
-
-//Declarations
-namespace leveldb
-{
-
-namespace Win32
-{
-
-#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
-  TypeName(const TypeName&);               \
-  void operator=(const TypeName&)
-
-std::string GetCurrentDir();
-std::wstring GetCurrentDirW();
-
-static const std::string CurrentDir = GetCurrentDir();
-static const std::wstring CurrentDirW = GetCurrentDirW();
-
-std::string& ModifyPath(std::string& path);
-std::wstring& ModifyPath(std::wstring& path);
-
-std::string GetLastErrSz();
-std::wstring GetLastErrSzW();
-
-size_t GetPageSize();
-
-typedef void (*ScheduleProc)(void*) ;
-
-struct WorkItemWrapper
-{
-    WorkItemWrapper(ScheduleProc proc_,void* content_);
-    ScheduleProc proc;
-    void* pContent;
-};
-
-DWORD WINAPI WorkItemWrapperProc(LPVOID pContent);
-
-class Win32SequentialFile : public SequentialFile
-{
-public:
-    friend class Win32Env;
-    virtual ~Win32SequentialFile();
-    virtual Status Read(size_t n, Slice* result, char* scratch);
-    virtual Status Skip(uint64_t n);
-    BOOL isEnable();
-    virtual std::string GetName() const { return _filename; }
-private:
-    BOOL _Init();
-    void _CleanUp();
-    Win32SequentialFile(const std::string& fname);
-    std::string _filename;
-    ::HANDLE _hFile;
-    DISALLOW_COPY_AND_ASSIGN(Win32SequentialFile);
-};
-
-class Win32RandomAccessFile : public RandomAccessFile
-{
-public:
-    friend class Win32Env;
-    virtual ~Win32RandomAccessFile();
-    virtual Status Read(uint64_t offset, size_t n, Slice* result,char* scratch) const;
-    BOOL isEnable();
-    virtual std::string GetName() const { return _filename; }
-private:
-    BOOL _Init(LPCWSTR path);
-    void _CleanUp();
-    Win32RandomAccessFile(const std::string& fname);
-    HANDLE _hFile;
-    const std::string _filename;
-    DISALLOW_COPY_AND_ASSIGN(Win32RandomAccessFile);
-};
-
-class Win32WritableFile : public WritableFile
-{
-public:
-    Win32WritableFile(const std::string& fname, bool append);
-    ~Win32WritableFile();
-
-    virtual Status Append(const Slice& data);
-    virtual Status Close();
-    virtual Status Flush();
-    virtual Status Sync();
-    BOOL isEnable();
-    virtual std::string GetName() const { return filename_; }
-private:
-    std::string filename_;
-    ::HANDLE _hFile;
-};
-
-class Win32FileLock : public FileLock
-{
-public:
-    friend class Win32Env;
-    virtual ~Win32FileLock();
-    BOOL isEnable();
-private:
-    BOOL _Init(LPCWSTR path);
-    void _CleanUp();
-    Win32FileLock(const std::string& fname);
-    HANDLE _hFile;
-    std::string _filename;
-    DISALLOW_COPY_AND_ASSIGN(Win32FileLock);
-};
-
-class Win32Logger : public Logger
-{
-public: 
-    friend class Win32Env;
-    virtual ~Win32Logger();
-    virtual void Logv(const char* format, va_list ap);
-private:
-    explicit Win32Logger(WritableFile* pFile);
-    WritableFile* _pFileProxy;
-    DISALLOW_COPY_AND_ASSIGN(Win32Logger);
-};
-
-class Win32Env : public Env
-{
-public:
-    Win32Env();
-    virtual ~Win32Env();
-    virtual Status NewSequentialFile(const std::string& fname,
-        SequentialFile** result);
-
-    virtual Status NewRandomAccessFile(const std::string& fname,
-        RandomAccessFile** result);
-    virtual Status NewWritableFile(const std::string& fname,
-        WritableFile** result);
-    virtual Status NewAppendableFile(const std::string& fname,
-        WritableFile** result);
-
-    virtual bool FileExists(const std::string& fname);
-
-    virtual Status GetChildren(const std::string& dir,
-        std::vector<std::string>* result);
-
-    virtual Status DeleteFile(const std::string& fname);
-
-    virtual Status CreateDir(const std::string& dirname);
-
-    virtual Status DeleteDir(const std::string& dirname);
-
-    virtual Status GetFileSize(const std::string& fname, uint64_t* file_size);
-
-    virtual Status RenameFile(const std::string& src,
-        const std::string& target);
-
-    virtual Status LockFile(const std::string& fname, FileLock** lock);
-
-    virtual Status UnlockFile(FileLock* lock);
-
-    virtual void Schedule(
-        void (*function)(void* arg),
-        void* arg);
-
-    virtual void StartThread(void (*function)(void* arg), void* arg);
-
-    virtual Status GetTestDirectory(std::string* path);
-
-    //virtual void Logv(WritableFile* log, const char* format, va_list ap);
-
-    virtual Status NewLogger(const std::string& fname, Logger** result);
-
-    virtual uint64_t NowMicros();
-
-    virtual void SleepForMicroseconds(int micros);
-};
-
-void ToWidePath(const std::string& value, std::wstring& target) {
-	wchar_t buffer[MAX_PATH];
-	MultiByteToWideChar(CP_ACP, 0, value.c_str(), -1, buffer, MAX_PATH);
-	target = buffer;
-}
-
-void ToNarrowPath(const std::wstring& value, std::string& target) {
-	char buffer[MAX_PATH];
-	WideCharToMultiByte(CP_ACP, 0, value.c_str(), -1, buffer, MAX_PATH, NULL, NULL);
-	target = buffer;
-}
-
-std::string GetCurrentDir()
-{
-    CHAR path[MAX_PATH];
-    ::GetModuleFileNameA(::GetModuleHandleA(NULL),path,MAX_PATH);
-    *strrchr(path,'\\') = 0;
-    return std::string(path);
-}
-
-std::wstring GetCurrentDirW()
-{
-    WCHAR path[MAX_PATH];
-    ::GetModuleFileNameW(::GetModuleHandleW(NULL),path,MAX_PATH);
-    *wcsrchr(path,L'\\') = 0;
-    return std::wstring(path);
-}
-
-std::string& ModifyPath(std::string& path)
-{
-    if(path[0] == '/' || path[0] == '\\'){
-        path = CurrentDir + path;
-    }
-    std::replace(path.begin(),path.end(),'/','\\');
-
-    return path;
-}
-
-std::wstring& ModifyPath(std::wstring& path)
-{
-    if(path[0] == L'/' || path[0] == L'\\'){
-        path = CurrentDirW + path;
-    }
-    std::replace(path.begin(),path.end(),L'/',L'\\');
-    return path;
-}
-
-std::string GetLastErrSz()
-{
-    LPWSTR lpMsgBuf;
-    FormatMessageW( 
-        FORMAT_MESSAGE_ALLOCATE_BUFFER | 
-        FORMAT_MESSAGE_FROM_SYSTEM | 
-        FORMAT_MESSAGE_IGNORE_INSERTS,
-        NULL,
-        GetLastError(),
-        0, // Default language
-        (LPWSTR) &lpMsgBuf,
-        0,
-        NULL 
-        );
-    std::string Err;
-	ToNarrowPath(lpMsgBuf, Err); 
-    LocalFree( lpMsgBuf );
-    return Err;
-}
-
-std::wstring GetLastErrSzW()
-{
-    LPVOID lpMsgBuf;
-    FormatMessageW( 
-        FORMAT_MESSAGE_ALLOCATE_BUFFER | 
-        FORMAT_MESSAGE_FROM_SYSTEM | 
-        FORMAT_MESSAGE_IGNORE_INSERTS,
-        NULL,
-        GetLastError(),
-        0, // Default language
-        (LPWSTR) &lpMsgBuf,
-        0,
-        NULL 
-        );
-    std::wstring Err = (LPCWSTR)lpMsgBuf;
-    LocalFree(lpMsgBuf);
-    return Err;
-}
-
-WorkItemWrapper::WorkItemWrapper( ScheduleProc proc_,void* content_ ) :
-    proc(proc_),pContent(content_)
-{
-
-}
-
-DWORD WINAPI WorkItemWrapperProc(LPVOID pContent)
-{
-    WorkItemWrapper* item = static_cast<WorkItemWrapper*>(pContent);
-    ScheduleProc TempProc = item->proc;
-    void* arg = item->pContent;
-    delete item;
-    TempProc(arg);
-    return 0;
-}
-
-size_t GetPageSize()
-{
-    SYSTEM_INFO si;
-    GetSystemInfo(&si);
-    return std::max(si.dwPageSize,si.dwAllocationGranularity);
-}
-
-const size_t g_PageSize = GetPageSize();
-
-
-Win32SequentialFile::Win32SequentialFile( const std::string& fname ) :
-    _filename(fname),_hFile(NULL)
-{
-    _Init();
-}
-
-Win32SequentialFile::~Win32SequentialFile()
-{
-    _CleanUp();
-}
-
-Status Win32SequentialFile::Read( size_t n, Slice* result, char* scratch )
-{
-    Status sRet;
-    DWORD hasRead = 0;
-    if(_hFile && ReadFile(_hFile,scratch,n,&hasRead,NULL) ){
-        *result = Slice(scratch,hasRead);
-    } else {
-        sRet = Status::IOError(_filename, Win32::GetLastErrSz() );
-    }
-    return sRet;
-}
-
-Status Win32SequentialFile::Skip( uint64_t n )
-{
-    Status sRet;
-    LARGE_INTEGER Move,NowPointer;
-    Move.QuadPart = n;
-    if(!SetFilePointerEx(_hFile,Move,&NowPointer,FILE_CURRENT)){
-        sRet = Status::IOError(_filename,Win32::GetLastErrSz());
-    }
-    return sRet;
-}
-
-BOOL Win32SequentialFile::isEnable()
-{
-    return _hFile ? TRUE : FALSE;
-}
-
-BOOL Win32SequentialFile::_Init()
-{
-	std::wstring path;
-	ToWidePath(_filename, path);
-	_hFile = CreateFileW(path.c_str(),
-                         GENERIC_READ,
-                         FILE_SHARE_READ | FILE_SHARE_WRITE,
-                         NULL,
-                         OPEN_EXISTING,
-                         FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN,
-                         NULL);
-    if (_hFile == INVALID_HANDLE_VALUE)
-        _hFile = NULL;
-    return _hFile ? TRUE : FALSE;
-}
-
-void Win32SequentialFile::_CleanUp()
-{
-    if(_hFile){
-        CloseHandle(_hFile);
-        _hFile = NULL;
-    }
-}
-
-Win32RandomAccessFile::Win32RandomAccessFile( const std::string& fname ) :
-    _filename(fname),_hFile(NULL)
-{
-	std::wstring path;
-	ToWidePath(fname, path);
-    _Init( path.c_str() );
-}
-
-Win32RandomAccessFile::~Win32RandomAccessFile()
-{
-    _CleanUp();
-}
-
-Status Win32RandomAccessFile::Read(uint64_t offset,size_t n,Slice* result,char* scratch) const
-{
-    Status sRet;
-    OVERLAPPED ol = {0};
-    ZeroMemory(&ol,sizeof(ol));
-    ol.Offset = (DWORD)offset;
-    ol.OffsetHigh = (DWORD)(offset >> 32);
-    DWORD hasRead = 0;
-    if(!ReadFile(_hFile,scratch,n,&hasRead,&ol))
-        sRet = Status::IOError(_filename,Win32::GetLastErrSz());
-    else
-        *result = Slice(scratch,hasRead);
-    return sRet;
-}
-
-BOOL Win32RandomAccessFile::_Init( LPCWSTR path )
-{
-    BOOL bRet = FALSE;
-    if(!_hFile)
-        _hFile = ::CreateFileW(path,GENERIC_READ,FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_EXISTING,
-        FILE_ATTRIBUTE_NORMAL | FILE_FLAG_RANDOM_ACCESS,NULL);
-    if(!_hFile || _hFile == INVALID_HANDLE_VALUE )
-        _hFile = NULL;
-    else
-        bRet = TRUE;
-    return bRet;
-}
-
-BOOL Win32RandomAccessFile::isEnable()
-{
-    return _hFile ? TRUE : FALSE;
-}
-
-void Win32RandomAccessFile::_CleanUp()
-{
-    if(_hFile){
-        ::CloseHandle(_hFile);
-        _hFile = NULL;
-    }
-}
-
-Win32WritableFile::Win32WritableFile(const std::string& fname, bool append)
-    : filename_(fname)
-{
-    std::wstring path;
-    ToWidePath(fname, path);
-    // NewAppendableFile: append to an existing file, or create a new one
-    //     if none exists - this is OPEN_ALWAYS behavior, with
-    //     FILE_APPEND_DATA to avoid having to manually position the file
-    //     pointer at the end of the file.
-    // NewWritableFile: create a new file, delete if it exists - this is
-    //     CREATE_ALWAYS behavior. This file is used for writing only so
-    //     use GENERIC_WRITE.
-    _hFile = CreateFileW(path.c_str(),
-                         append ? FILE_APPEND_DATA : GENERIC_WRITE,
-                         FILE_SHARE_READ|FILE_SHARE_DELETE|FILE_SHARE_WRITE,
-                         NULL,
-                         append ? OPEN_ALWAYS : CREATE_ALWAYS,
-                         FILE_ATTRIBUTE_NORMAL,
-                         NULL);
-    // CreateFileW returns INVALID_HANDLE_VALUE in case of error, always check isEnable() before use
-}
-
-Win32WritableFile::~Win32WritableFile()
-{
-    if (_hFile != INVALID_HANDLE_VALUE)
-        Close();
-}
-
-Status Win32WritableFile::Append(const Slice& data)
-{
-    DWORD r = 0;
-    if (!WriteFile(_hFile, data.data(), data.size(), &r, NULL) || r != data.size()) {
-        return Status::IOError("Win32WritableFile.Append::WriteFile: "+filename_, Win32::GetLastErrSz());
-    }
-    return Status::OK();
-}
-
-Status Win32WritableFile::Close()
-{
-    if (!CloseHandle(_hFile)) {
-        return Status::IOError("Win32WritableFile.Close::CloseHandle: "+filename_, Win32::GetLastErrSz());
-    }
-    _hFile = INVALID_HANDLE_VALUE;
-    return Status::OK();
-}
-
-Status Win32WritableFile::Flush()
-{
-    // Nothing to do here, there are no application-side buffers
-    return Status::OK();
-}
-
-Status Win32WritableFile::Sync()
-{
-    if (!FlushFileBuffers(_hFile)) {
-        return Status::IOError("Win32WritableFile.Sync::FlushFileBuffers "+filename_, Win32::GetLastErrSz());
-    }
-    return Status::OK();
-}
-
-BOOL Win32WritableFile::isEnable()
-{
-    return _hFile != INVALID_HANDLE_VALUE;
-}
-
-Win32FileLock::Win32FileLock( const std::string& fname ) :
-    _hFile(NULL),_filename(fname)
-{
-	std::wstring path;
-	ToWidePath(fname, path);
-	_Init(path.c_str());
-}
-
-Win32FileLock::~Win32FileLock()
-{
-    _CleanUp();
-}
-
-BOOL Win32FileLock::_Init( LPCWSTR path )
-{
-    BOOL bRet = FALSE;
-    if(!_hFile)
-        _hFile = ::CreateFileW(path,0,0,NULL,CREATE_ALWAYS,FILE_ATTRIBUTE_NORMAL,NULL);
-    if(!_hFile || _hFile == INVALID_HANDLE_VALUE ){
-        _hFile = NULL;
-    }
-    else
-        bRet = TRUE;
-    return bRet;
-}
-
-void Win32FileLock::_CleanUp()
-{
-    ::CloseHandle(_hFile);
-    _hFile = NULL;
-}
-
-BOOL Win32FileLock::isEnable()
-{
-    return _hFile ? TRUE : FALSE;
-}
-
-Win32Logger::Win32Logger(WritableFile* pFile) : _pFileProxy(pFile)
-{
-    assert(_pFileProxy);
-}
-
-Win32Logger::~Win32Logger()
-{
-    if(_pFileProxy)
-        delete _pFileProxy;
-}
-
-void Win32Logger::Logv( const char* format, va_list ap )
-{
-    uint64_t thread_id = ::GetCurrentThreadId();
-
-    // We try twice: the first time with a fixed-size stack allocated buffer,
-    // and the second time with a much larger dynamically allocated buffer.
-    char buffer[500];
-    for (int iter = 0; iter < 2; iter++) {
-        char* base;
-        int bufsize;
-        if (iter == 0) {
-            bufsize = sizeof(buffer);
-            base = buffer;
-        } else {
-            bufsize = 30000;
-            base = new char[bufsize];
-        }
-        char* p = base;
-        char* limit = base + bufsize;
-
-        SYSTEMTIME st;
-        GetLocalTime(&st);
-        p += snprintf(p, limit - p,
-            "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
-            int(st.wYear),
-            int(st.wMonth),
-            int(st.wDay),
-            int(st.wHour),
-            int(st.wMinute),
-            int(st.wMinute),
-            int(st.wMilliseconds),
-            static_cast<long long unsigned int>(thread_id));
-
-        // Print the message
-        if (p < limit) {
-            va_list backup_ap;
-            va_copy(backup_ap, ap);
-            p += vsnprintf(p, limit - p, format, backup_ap);
-            va_end(backup_ap);
-        }
-
-        // Truncate to available space if necessary
-        if (p >= limit) {
-            if (iter == 0) {
-                continue;       // Try again with larger buffer
-            } else {
-                p = limit - 1;
-            }
-        }
-
-        // Add newline if necessary
-        if (p == base || p[-1] != '\n') {
-            *p++ = '\n';
-        }
-
-        assert(p <= limit);
-        DWORD hasWritten = 0;
-        if(_pFileProxy){
-            _pFileProxy->Append(Slice(base, p - base));
-            _pFileProxy->Flush();
-        }
-        if (base != buffer) {
-            delete[] base;
-        }
-        break;
-    }
-}
-
-bool Win32Env::FileExists(const std::string& fname)
-{
-	std::string path = fname;
-    std::wstring wpath;
-	ToWidePath(ModifyPath(path), wpath);
-    return ::PathFileExistsW(wpath.c_str()) ? true : false;
-}
-
-Status Win32Env::GetChildren(const std::string& dir, std::vector<std::string>* result)
-{
-    Status sRet;
-    ::WIN32_FIND_DATAW wfd;
-    std::string path = dir;
-    ModifyPath(path);
-    path += "\\*.*";
-	std::wstring wpath;
-	ToWidePath(path, wpath);
-
-	::HANDLE hFind = ::FindFirstFileW(wpath.c_str() ,&wfd);
-    if(hFind && hFind != INVALID_HANDLE_VALUE){
-        BOOL hasNext = TRUE;
-        std::string child;
-        while(hasNext){
-            ToNarrowPath(wfd.cFileName, child); 
-            if(child != ".." && child != ".")  {
-                result->push_back(child);
-            }
-            hasNext = ::FindNextFileW(hFind,&wfd);
-        }
-        ::FindClose(hFind);
-    }
-    else
-        sRet = Status::IOError(dir,"Could not get children.");
-    return sRet;
-}
-
-void Win32Env::SleepForMicroseconds( int micros )
-{
-    ::Sleep((micros + 999) /1000);
-}
-
-
-Status Win32Env::DeleteFile( const std::string& fname )
-{
-    Status sRet;
-    std::string path = fname;
-    std::wstring wpath;
-	ToWidePath(ModifyPath(path), wpath);
-
-    if(!::DeleteFileW(wpath.c_str())) {
-        sRet = Status::IOError(path, "Could not delete file.");
-    }
-    return sRet;
-}
-
-Status Win32Env::GetFileSize( const std::string& fname, uint64_t* file_size )
-{
-    Status sRet;
-    std::string path = fname;
-    std::wstring wpath;
-	ToWidePath(ModifyPath(path), wpath);
-
-    HANDLE file = ::CreateFileW(wpath.c_str(),
-        GENERIC_READ,FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,NULL);
-    LARGE_INTEGER li;
-    if(::GetFileSizeEx(file,&li)){
-        *file_size = (uint64_t)li.QuadPart;
-    }else
-        sRet = Status::IOError(path,"Could not get the file size.");
-    CloseHandle(file);
-    return sRet;
-}
-
-Status Win32Env::RenameFile( const std::string& src, const std::string& target )
-{
-    Status sRet;
-    std::string src_path = src;
-    std::wstring wsrc_path;
-	ToWidePath(ModifyPath(src_path), wsrc_path);
-	std::string target_path = target;
-    std::wstring wtarget_path;
-	ToWidePath(ModifyPath(target_path), wtarget_path);
-
-    if(!MoveFileW(wsrc_path.c_str(), wtarget_path.c_str() ) ){
-        DWORD err = GetLastError();
-        if(err == 0x000000b7){
-            if(!::DeleteFileW(wtarget_path.c_str() ) )
-                sRet = Status::IOError(src, "Could not rename file.");
-			else if(!::MoveFileW(wsrc_path.c_str(),
-                                 wtarget_path.c_str() ) )
-                sRet = Status::IOError(src, "Could not rename file.");    
-        }
-    }
-    return sRet;
-}
-
-Status Win32Env::LockFile( const std::string& fname, FileLock** lock )
-{
-    Status sRet;
-    std::string path = fname;
-    ModifyPath(path);
-    Win32FileLock* _lock = new Win32FileLock(path);
-    if(!_lock->isEnable()){
-        delete _lock;
-        *lock = NULL;
-        sRet = Status::IOError(path, "Could not lock file.");
-    }
-    else
-        *lock = _lock;
-    return sRet;
-}
-
-Status Win32Env::UnlockFile( FileLock* lock )
-{
-    Status sRet;
-    delete lock;
-    return sRet;
-}
-
-void Win32Env::Schedule( void (*function)(void* arg), void* arg )
-{
-    QueueUserWorkItem(Win32::WorkItemWrapperProc,
-                      new Win32::WorkItemWrapper(function,arg),
-                      WT_EXECUTEDEFAULT);
-}
-
-void Win32Env::StartThread( void (*function)(void* arg), void* arg )
-{
-    ::_beginthread(function,0,arg);
-}
-
-Status Win32Env::GetTestDirectory( std::string* path )
-{
-    Status sRet;
-    WCHAR TempPath[MAX_PATH];
-    ::GetTempPathW(MAX_PATH,TempPath);
-	ToNarrowPath(TempPath, *path);
-    path->append("leveldb\\test\\");
-    ModifyPath(*path);
-    return sRet;
-}
-
-uint64_t Win32Env::NowMicros()
-{
-#ifndef USE_VISTA_API
-#define GetTickCount64 GetTickCount
-#endif
-    return (uint64_t)(GetTickCount64()*1000);
-}
-
-static Status CreateDirInner( const std::string& dirname )
-{
-    Status sRet;
-    DWORD attr = ::GetFileAttributes(dirname.c_str());
-    if (attr == INVALID_FILE_ATTRIBUTES) { // doesn't exist:
-      std::size_t slash = dirname.find_last_of("\\");
-      if (slash != std::string::npos){
-	sRet = CreateDirInner(dirname.substr(0, slash));
-	if (!sRet.ok()) return sRet;
-      }
-      BOOL result = ::CreateDirectory(dirname.c_str(), NULL);
-      if (result == FALSE) {
-	sRet = Status::IOError(dirname, "Could not create directory.");
-	return sRet;
-      }
-    }
-    return sRet;
-}
-
-Status Win32Env::CreateDir( const std::string& dirname )
-{
-    std::string path = dirname;
-    if(path[path.length() - 1] != '\\'){
-        path += '\\';
-    }
-    ModifyPath(path);
-
-    return CreateDirInner(path);
-}
-
-Status Win32Env::DeleteDir( const std::string& dirname )
-{
-    Status sRet;
-    std::wstring path;
-	ToWidePath(dirname, path);
-    ModifyPath(path);
-    if(!::RemoveDirectoryW( path.c_str() ) ){
-        sRet = Status::IOError(dirname, "Could not delete directory.");
-    }
-    return sRet;
-}
-
-Status Win32Env::NewSequentialFile( const std::string& fname, SequentialFile** result )
-{
-    Status sRet;
-    std::string path = fname;
-    ModifyPath(path);
-    Win32SequentialFile* pFile = new Win32SequentialFile(path);
-    if(pFile->isEnable()){
-        *result = pFile;
-    }else {
-        delete pFile;
-        sRet = Status::IOError(path, Win32::GetLastErrSz());
-    }
-    return sRet;
-}
-
-Status Win32Env::NewRandomAccessFile( const std::string& fname, RandomAccessFile** result )
-{
-    Status sRet;
-    std::string path = fname;
-    Win32RandomAccessFile* pFile = new Win32RandomAccessFile(ModifyPath(path));
-    if(!pFile->isEnable()){
-        delete pFile;
-        *result = NULL;
-        sRet = Status::IOError(path, Win32::GetLastErrSz());
-    }else
-        *result = pFile;
-    return sRet;
-}
-
-Status Win32Env::NewLogger( const std::string& fname, Logger** result )
-{
-    Status sRet;
-    std::string path = fname;
-    // Logs are opened with write semantics, not with append semantics
-    // (see PosixEnv::NewLogger)
-    Win32WritableFile* pMapFile = new Win32WritableFile(ModifyPath(path), false);
-    if(!pMapFile->isEnable()){
-        delete pMapFile;
-        *result = NULL;
-        sRet = Status::IOError(path,"could not create a logger.");
-    }else
-        *result = new Win32Logger(pMapFile);
-    return sRet;
-}
-
-Status Win32Env::NewWritableFile( const std::string& fname, WritableFile** result )
-{
-    Status sRet;
-    std::string path = fname;
-    Win32WritableFile* pFile = new Win32WritableFile(ModifyPath(path), false);
-    if(!pFile->isEnable()){
-        *result = NULL;
-        sRet = Status::IOError(fname,Win32::GetLastErrSz());
-    }else
-        *result = pFile;
-    return sRet;
-}
-
-Status Win32Env::NewAppendableFile( const std::string& fname, WritableFile** result )
-{
-    Status sRet;
-    std::string path = fname;
-    Win32WritableFile* pFile = new Win32WritableFile(ModifyPath(path), true);
-    if(!pFile->isEnable()){
-        *result = NULL;
-        sRet = Status::IOError(fname,Win32::GetLastErrSz());
-    }else
-        *result = pFile;
-    return sRet;
-}
-
-Win32Env::Win32Env()
-{
-
-}
-
-Win32Env::~Win32Env()
-{
-
-}
-
-
-}  // Win32 namespace
-
-static port::OnceType once = LEVELDB_ONCE_INIT;
-static Env* default_env;
-static void InitDefaultEnv() { default_env = new Win32::Win32Env(); }
-
-Env* Env::Default() {
-  port::InitOnce(&once, InitDefaultEnv);
-  return default_env;
-}
-
-}  // namespace leveldb
-
-#endif // defined(LEVELDB_PLATFORM_WINDOWS)
diff --git a/src/leveldb/util/expiry_os.cc b/src/leveldb/util/expiry_os.cc
new file mode 100644
index 000000000..57aadcac3
--- /dev/null
+++ b/src/leveldb/util/expiry_os.cc
@@ -0,0 +1,408 @@
+// -------------------------------------------------------------------
+//
+// expiry_os.cc
+//
+// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <limits.h>
+
+#include "leveldb/perf_count.h"
+#include "leveldb/env.h"
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "util/expiry_os.h"
+#include "util/logging.h"
+#include "util/throttle.h"
+
+namespace leveldb {
+
+// sext key for Riak's meta data
+static const char * lRiakMetaDataKey=
+    {"\x10\x00\x00\x00\x02\x0c\xb6\xd9\x00\x08"};
+static const size_t lRiakMetaDataKeyLen=10;
+
+/**
+ * settings information that gets dumped to LOG upon
+ *  leveldb start
+ */
+void
+ExpiryModuleOS::Dump(
+    Logger * log) const
+{
+    Log(log,"  ExpiryModuleOS.expiry_enabled: %s", IsExpiryEnabled() ? "true" : "false");
+    Log(log,"  ExpiryModuleOS.expiry_minutes: %" PRIu64, GetExpiryMinutes());
+    Log(log,"ExpiryModuleOS.expiry_unlimited: %s", IsExpiryUnlimited() ? "true" : "false");
+    Log(log,"     ExpiryModuleOS.whole_files: %s", IsWholeFileExpiryEnabled() ? "true" : "false");
+
+    return;
+
+}   // ExpiryModuleOS::Dump
+
+
+/**
+ * db/write_batch.cc MemTableInserter() uses this to initialize
+ *   expiry info.
+ */
+bool
+ExpiryModuleOS::MemTableInserterCallback(
+    const Slice & Key,   // input: user's key about to be written
+    const Slice & Value, // input: user's value object
+    ValueType & ValType,   // input/output: key type. call might change
+    ExpiryTimeMicros & Expiry)   // input/output: 0 or specific expiry. call might change
+    const
+{
+    bool good(true);
+
+    // only update the expiry time if explicit type
+    //  without expiry, OR ExpiryMinutes set and not internal key
+    if ((kTypeValueWriteTime==ValType && 0==Expiry)
+        || (kTypeValue==ValType
+            && (0!=GetExpiryMinutes() || IsExpiryUnlimited())
+            && IsExpiryEnabled()
+            && (Key.size()<lRiakMetaDataKeyLen
+                || 0!=memcmp(lRiakMetaDataKey,Key.data(),lRiakMetaDataKeyLen))))
+    {
+        ValType=kTypeValueWriteTime;
+        Expiry=GenerateWriteTimeMicros(Key, Value);
+    }   // if
+
+    return(good);
+
+}   // ExpiryModuleOS::MemTableInserterCallback
+
+
+/**
+ * Use Basho's GetCachedTimeMicros() as write time source.
+ *  This clock returns microseconds since epoch, but
+ *  only updates every 60 seconds or so.
+ */
+uint64_t
+ExpiryModuleOS::GenerateWriteTimeMicros(
+    const Slice & Key,
+    const Slice & Value) const
+{
+
+    return(GetCachedTimeMicros());
+
+}  // ExpiryModuleOS::GenerateWriteTimeMicros()
+
+/**
+ * Returns true if key is expired.  False if key is NOT expired
+ *  (used by MemtableCallback() too).
+ *  Used within dbformat.cc, db_iter.cc, & version_set.cc
+ */
+bool ExpiryModuleOS::KeyRetirementCallback(
+    const ParsedInternalKey & Ikey) const
+{
+    bool is_expired(false);
+    uint64_t now_micros, expires_micros;
+
+    if (IsExpiryEnabled())
+    {
+        switch(Ikey.type)
+        {
+            case kTypeDeletion:
+            case kTypeValue:
+            default:
+                is_expired=false;
+                break;
+
+            case kTypeValueWriteTime:
+                if (0!=GetExpiryMinutes() && 0!=Ikey.expiry &&
+                    !IsExpiryUnlimited())
+                {
+                    now_micros=GetCachedTimeMicros();
+                    expires_micros=GetExpiryMinutes()*60*port::UINT64_ONE_SECOND_MICROS
+                        + Ikey.expiry;
+                    is_expired=(expires_micros<=now_micros);
+                }   // if
+                break;
+
+            case kTypeValueExplicitExpiry:
+                if (0!=Ikey.expiry)
+                {
+                    now_micros=GetCachedTimeMicros();
+                    is_expired=(Ikey.expiry<=now_micros);
+                }   // if
+                break;
+        }   // switch
+    }   // if
+
+    return(is_expired);
+
+}   // ExpiryModuleOS::KeyRetirementCallback
+
+
+/**
+ *  - Sets low/high date range for aged expiry.
+ *     (low for possible time series optimization)
+ *  - Sets high date range for explicit expiry.
+ *  - Increments delete counter for things already
+ *     expired (to aid in starting compaction for
+ *     keys tombstoning for higher levels).
+ *  (called from table/table_builder.cc)
+ */
+bool             // return value ignored
+ExpiryModuleOS::TableBuilderCallback(
+    const Slice & Key,
+    SstCounters & Counters) const
+{
+    bool good(true);
+    ExpiryTimeMicros expires_micros, temp;
+
+    if (IsExpiryKey(Key))
+        expires_micros=ExtractExpiry(Key);
+    else
+        expires_micros=0;
+
+    // make really high so that everything is less than it
+    if (1==Counters.Value(eSstCountKeys))
+        Counters.Set(eSstCountExpiry1, ULLONG_MAX);
+
+    // only updating counters.  do this even if
+    //  expiry disabled
+    switch(ExtractValueType(Key))
+    {
+        // exp_write_low set to smallest (earliest) write time
+        // exp_write_high set to largest (most recent) write time
+        case kTypeValueWriteTime:
+            temp=Counters.Value(eSstCountExpiry1);
+            if (expires_micros<temp)
+                Counters.Set(eSstCountExpiry1, expires_micros);
+            if (Counters.Value(eSstCountExpiry2)<expires_micros)
+                Counters.Set(eSstCountExpiry2, expires_micros);
+            // add to delete count if expired already
+            //   i.e. acting as a tombstone
+            if (MemTableCallback(Key))
+                Counters.Inc(eSstCountDeleteKey);
+            break;
+
+        case kTypeValueExplicitExpiry:
+            if (Counters.Value(eSstCountExpiry3)<expires_micros)
+                Counters.Set(eSstCountExpiry3, expires_micros);
+            // add to delete count if expired already
+            //   i.e. acting as a tombstone
+            if (MemTableCallback(Key))
+                Counters.Inc(eSstCountDeleteKey);
+            break;
+
+        // at least one non-expiry, exp_write_low gets zero
+        case kTypeValue:
+            Counters.Set(eSstCountExpiry1, 0);
+            break;
+
+        default:
+            break;
+    }   // switch
+
+    return(good);
+
+}   // ExpiryModuleOS::TableBuilderCallback
+
+
+/**
+ * Returns true if key is expired.  False if key is NOT expired
+ */
+bool ExpiryModuleOS::MemTableCallback(
+    const Slice & InternalKey) const
+{
+    bool is_expired(false), good;
+    ParsedInternalKey parsed;
+
+    good=ParseInternalKey(InternalKey, &parsed);
+
+    if (good)
+        is_expired=KeyRetirementCallback(parsed);
+
+    return(is_expired);
+
+}   // ExpiryModuleOS::MemTableCallback
+
+
+/**
+ * Returns true if at least one file on this level
+ *  is eligible for full file expiry
+ */
+bool ExpiryModuleOS::CompactionFinalizeCallback(
+    bool WantAll,                  // input: true - examine all expired files
+    const Version & Ver,           // input: database state for examination
+    int Level,                     // input: level to review for expiry
+    VersionEdit * Edit) const      // output: NULL or destination of delete list
+{
+    bool ret_flag(false);
+
+    // only test expiry_enable since it is "global" on/off switch.
+    //  other parameters might change if bucket level expiry uses
+    //  different ExpiryModuleOS object.
+    if (IsExpiryEnabled())
+    {
+        bool expired_file(false);
+        ExpiryTimeMicros now_micros;
+        const std::vector<FileMetaData*> & files(Ver.GetFileList(Level));
+        std::vector<FileMetaData*>::const_iterator it;
+
+        now_micros=GetCachedTimeMicros();
+        for (it=files.begin(); (!expired_file || WantAll) && files.end()!=it; ++it)
+        {
+            // First, is file eligible?
+            expired_file=IsFileExpired(*(*it), now_micros);
+
+            // identified an expired file, do any higher levels overlap
+            //  its key range?
+            if (expired_file)
+            {
+                int test;
+                Slice small, large;
+
+                for (test=Level+1;
+                     test<config::kNumLevels && expired_file;
+                     ++test)
+                {
+                    small=(*it)->smallest.user_key();
+                    large=(*it)->largest.user_key();
+                    expired_file=!Ver.OverlapInLevel(test, &small,
+                                                     &large);
+                }   // for
+                ret_flag=ret_flag || expired_file;
+            }   // if
+
+            // expired_file and no overlap? mark it for delete
+            if (expired_file && NULL!=Edit)
+            {
+                Edit->DeleteFile((*it)->level, (*it)->number);
+            }   // if
+        }   // for
+    }   // if
+
+    return(ret_flag);
+
+}   // ExpiryModuleOS::CompactionFinalizeCallback
+
+
+/**
+ * Review the metadata of one file to see if it is
+ *  eligible for file expiry
+ */
+bool
+ExpiryModuleOS::IsFileExpired(
+    const FileMetaData & SstFile,
+    ExpiryTimeMicros NowMicros) const
+{
+    bool expired_file;
+    ExpiryTimeMicros aged_micros;
+
+    aged_micros=NowMicros - GetExpiryMinutes()*60*port::UINT64_ONE_SECOND_MICROS;
+
+    // must test whole_file_expiry here since this could be
+    //  a bucket's ExpiryModuleOS object, not the default in Options
+    expired_file = (IsExpiryEnabled() && IsWholeFileExpiryEnabled());
+
+    //  - if exp_write_low is zero, game over -  contains non-expiry records
+    //  - if exp_write_high is below current aged time and aging enabled,
+    //       or no exp_write_high keys (is zero)
+    //  - highest explicit expiry (exp_explicit_high) is non-zero and below now
+    //  Note:  say file only contained deleted records:  ... still delete file
+    //      exp_write_low would be ULLONG_MAX, exp_write_high would be 0, exp_explicit_high would be zero
+    expired_file = expired_file && (0!=SstFile.exp_write_low)
+                   && (0!=SstFile.exp_write_high || 0!=SstFile.exp_explicit_high);
+    expired_file = expired_file && ((SstFile.exp_write_high<=aged_micros
+                                     && 0!=GetExpiryMinutes() && !IsExpiryUnlimited())
+                                    || 0==SstFile.exp_write_high);
+
+    expired_file = expired_file && (0==SstFile.exp_explicit_high
+                                    || (0!=SstFile.exp_explicit_high
+                                        && SstFile.exp_explicit_high<=NowMicros));
+
+    return(expired_file);
+
+}   // ExpiryModuleOS::IsFileExpired
+
+
+/**
+ * Riak specific routine to process whole file expiry.
+ *  Code here derived from DBImpl::CompactMemTable() in db/db_impl.cc
+ */
+Status
+DBImpl::BackgroundExpiry(
+    Compaction * Compact)
+{
+    Status s;
+    size_t count;
+
+    mutex_.AssertHeld();
+    assert(NULL != Compact && NULL!=options_.expiry_module.get());
+    assert(NULL != Compact->version());
+
+    if (NULL!=Compact && options_.ExpiryActivated())
+    {
+        VersionEdit edit;
+        int level(Compact->level());
+
+        // Compact holds a reference count to version()/input_version_
+        const Version* base = Compact->version();
+        options_.expiry_module->CompactionFinalizeCallback(true, *base, level,
+                                                           &edit);
+        count=edit.DeletedFileCount();
+
+        if (s.ok() && shutting_down_.Acquire_Load()) {
+            s = Status::IOError("Deleting DB during expiry compaction");
+        }
+
+        // push expired list to manifest
+        if (s.ok() && 0!=count)
+        {
+            s = versions_->LogAndApply(&edit, &mutex_);
+            if (s.ok())
+                gPerfCounters->Add(ePerfExpiredFiles, count);
+            else
+                s = Status::IOError("LogAndApply error during expiry compaction");
+        }   // if
+
+        // Commit to the new state
+        if (s.ok() && 0!=count)
+        {
+            // get rid of Compact now to potential free
+            //  input version's files
+            delete Compact;
+            Compact=NULL;
+
+            DeleteObsoleteFiles();
+
+            // release mutex when writing to log file
+            mutex_.Unlock();
+
+            Log(options_.info_log,
+                "Expired: %zd files from level %d",
+                count, level);
+            mutex_.Lock();
+        }   // if
+    }   // if
+
+    // convention in BackgroundCompaction() is to delete Compact here
+    delete Compact;
+
+    return s;
+
+}   // DBImpl:BackgroundExpiry
+
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/expiry_os.h b/src/leveldb/util/expiry_os.h
new file mode 100644
index 000000000..f4044b85e
--- /dev/null
+++ b/src/leveldb/util/expiry_os.h
@@ -0,0 +1,137 @@
+// -------------------------------------------------------------------
+//
+// expiry_os.h
+//
+// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#ifndef EXPIRY_OS_H
+#define EXPIRY_OS_H
+
+#include <vector>
+
+#include "leveldb/options.h"
+#include "leveldb/expiry.h"
+#include "leveldb/perf_count.h"
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+
+namespace leveldb
+{
+
+class ExpiryModuleOS : public ExpiryModule
+{
+public:
+    ExpiryModuleOS()
+        : expiry_enabled(false), expiry_minutes(0),
+        expiry_unlimited(false), whole_file_expiry(false)
+    {};
+
+    virtual ~ExpiryModuleOS() {};
+
+    // Print expiry options to LOG file
+    virtual void Dump(Logger * log) const;
+
+    // Quick test to allow manifest logic and such know if
+    //  extra expiry logic should be checked
+    virtual bool ExpiryActivated() const {return(expiry_enabled);};
+
+    // db/write_batch.cc MemTableInserter::Put() calls this.
+    // returns false on internal error
+    virtual bool MemTableInserterCallback(
+        const Slice & Key,   // input: user's key about to be written
+        const Slice & Value, // input: user's value object
+        ValueType & ValType,   // input/output: key type. call might change
+        ExpiryTimeMicros & Expiry) const;  // input/output: 0 or specific expiry. call might change
+
+    // db/dbformat.cc KeyRetirement::operator() calls this.
+    // db/version_set.cc SaveValue() calls this too.
+    // returns true if key is expired, returns false if key not expired
+    virtual bool KeyRetirementCallback(
+        const ParsedInternalKey & Ikey) const;  // input: key to examine for retirement
+
+    // table/table_builder.cc TableBuilder::Add() calls this.
+    // returns false on internal error
+    virtual bool TableBuilderCallback(
+        const Slice & key,       // input: internal key
+        SstCounters & counters) const; // input/output: counters for new sst table
+
+    // db/memtable.cc MemTable::Get() calls this.
+    // returns true if type/expiry is expired, returns false if not expired
+    virtual bool MemTableCallback(
+        const Slice & Key) const;    // input: leveldb internal key
+
+    // db/version_set.cc VersionSet::Finalize() calls this if no
+    //  other compaction selected for a level
+    // returns true if there is an expiry compaction eligible
+    virtual bool CompactionFinalizeCallback(
+        bool WantAll,                  // input: true - examine all expired files
+        const Version & Ver,           // input: database state for examination
+        int Level,                     // input: level to review for expiry
+        VersionEdit * Edit) const;     // output: NULL or destination of delete list
+
+    // utility to CompactionFinalizeCallback to review
+    //  characteristics of one SstFile to see if entirely expired
+    virtual bool IsFileExpired(const FileMetaData & SstFile, ExpiryTimeMicros Now) const;
+
+    // Accessors to option parameters
+    bool IsExpiryEnabled() const {return(expiry_enabled);};
+    void SetExpiryEnabled(bool Flag=true) {expiry_enabled=Flag;};
+
+    bool IsExpiryUnlimited() const {return(expiry_unlimited);};
+    void SetExpiryUnlimited(bool Flag=true) {expiry_unlimited=Flag;};
+
+    uint64_t GetExpiryMinutes() const {return(expiry_minutes);};
+    void SetExpiryMinutes(uint64_t Minutes) {expiry_minutes=Minutes; expiry_unlimited=false;};
+
+    bool IsWholeFileExpiryEnabled() const {return(whole_file_expiry);};
+    void SetWholeFileExpiryEnabled(bool Flag=true) {whole_file_expiry=Flag;};
+
+public:
+    // NOTE: option names below are intentionally public and lowercase with underscores.
+    //       This is to match style of options within include/leveldb/options.h.
+
+    // Riak specific option to enable/disable expiry features globally
+    //  true: expiry enabled
+    //  false: disabled (some expired keys may reappear)
+    bool expiry_enabled;
+
+    // Riak specific option giving number of minutes a stored key/value
+    // may stay within the database before automatic deletion.  Zero
+    // disables expiry by age feature.
+    uint64_t expiry_minutes;
+    bool expiry_unlimited;
+
+    // Riak specific option authorizing leveldb to eliminate entire
+    // files that contain expired data (delete files instead of
+    // removing expired data during compactions).
+    bool whole_file_expiry;
+
+protected:
+    // When "creating" write time, chose its source based upon
+    //  open source versus enterprise edition
+    virtual uint64_t GenerateWriteTimeMicros(const Slice & Key, const Slice & Value) const;
+
+
+};  // ExpiryModuleOS
+
+uint64_t CuttlefishDurationMinutes(const char * Buf);
+
+}  // namespace leveldb
+
+#endif // ifndef
diff --git a/src/leveldb/util/expiry_os_test.cc b/src/leveldb/util/expiry_os_test.cc
new file mode 100644
index 000000000..5505ef339
--- /dev/null
+++ b/src/leveldb/util/expiry_os_test.cc
@@ -0,0 +1,1659 @@
+// -------------------------------------------------------------------
+//
+// expiry_os_tests.cc
+//
+// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <limits.h>
+#include <algorithm>
+#include <memory>
+#include <string>
+
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+#include "leveldb/comparator.h"
+#include "leveldb/env.h"
+#include "leveldb/options.h"
+#include "leveldb/slice.h"
+#include "leveldb/write_batch.h"
+
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "port/port.h"
+#include "util/expiry_os.h"
+#include "util/mutexlock.h"
+#include "util/throttle.h"
+
+/**
+ * Execution routine
+ */
+int main(int argc, char** argv)
+{
+  return leveldb::test::RunAllTests();
+}
+
+
+namespace leveldb {
+
+// helper function to clean up heap objects
+static void ClearMetaArray(Version::FileMetaDataVector_t & ClearMe);
+
+
+/**
+ * Wrapper class for tests.  Holds working variables
+ * and helper functions.
+ */
+class ExpiryTester
+{
+public:
+    ExpiryTester()
+    {
+    };
+
+    ~ExpiryTester()
+    {
+    };
+};  // class ExpiryTester
+
+
+/**
+ * Validate option defaults
+ */
+TEST(ExpiryTester, Defaults)
+{
+    ExpiryModuleOS expiry;
+
+    ASSERT_EQ(expiry.IsExpiryEnabled(), false);
+    ASSERT_EQ(expiry.GetExpiryMinutes(), 0);
+    ASSERT_EQ(expiry.IsExpiryUnlimited(), false);
+    ASSERT_EQ(expiry.IsWholeFileExpiryEnabled(), false);
+    ASSERT_EQ(expiry.ExpiryActivated(), false);
+
+}   // test Defaults
+
+
+/**
+ * Validate MemTableInserterCallback
+ */
+TEST(ExpiryTester, MemTableInserterCallback)
+{
+    bool flag;
+    uint64_t before, after;
+    ExpiryModuleOS module;
+    ValueType type;
+    ExpiryTimeMicros expiry;
+    Slice key, value;
+
+    module.SetExpiryEnabled(true);
+    module.SetWholeFileExpiryEnabled(true);
+    ASSERT_EQ(module.ExpiryActivated(), true);
+
+    // deletion, do nothing
+    type=kTypeDeletion;
+    expiry=0;
+    flag=module.MemTableInserterCallback(key, value, type, expiry);
+    ASSERT_EQ(flag, true);
+    ASSERT_EQ(type, kTypeDeletion);
+    ASSERT_EQ(expiry, 0);
+
+    // plain value, needs expiry
+    type=kTypeValue;
+    expiry=0;
+    module.SetExpiryMinutes(30);
+    before=port::TimeMicros();
+    SetCachedTimeMicros(before);
+    flag=module.MemTableInserterCallback(key, value, type, expiry);
+    after=port::TimeMicros();
+    ASSERT_EQ(flag, true);
+    ASSERT_EQ(type, kTypeValueWriteTime);
+    ASSERT_TRUE(before <= expiry && expiry <=after && 0!=expiry);
+
+    // plain value, needs expiry
+    type=kTypeValue;
+    expiry=0;
+    module.SetExpiryUnlimited(true);
+    before=port::TimeMicros();
+    SetCachedTimeMicros(before);
+    flag=module.MemTableInserterCallback(key, value, type, expiry);
+    after=port::TimeMicros();
+    ASSERT_EQ(flag, true);
+    ASSERT_EQ(type, kTypeValueWriteTime);
+    ASSERT_TRUE(before <= expiry && expiry <=after && 0!=expiry);
+
+    // plain value, expiry disabled
+    type=kTypeValue;
+    expiry=0;
+    module.SetExpiryMinutes(0);
+    before=port::TimeMicros();
+    SetCachedTimeMicros(before);
+    flag=module.MemTableInserterCallback(key, value, type, expiry);
+    after=port::TimeMicros();
+    ASSERT_EQ(flag, true);
+    ASSERT_EQ(type, kTypeValue);
+    ASSERT_EQ(expiry, 0);
+
+    // write time value, needs expiry
+    type=kTypeValueWriteTime;
+    expiry=0;
+    module.SetExpiryMinutes(30);
+    before=port::TimeMicros();
+    SetCachedTimeMicros(before);
+    flag=module.MemTableInserterCallback(key, value, type, expiry);
+    after=port::TimeMicros();
+    ASSERT_EQ(flag, true);
+    ASSERT_EQ(type, kTypeValueWriteTime);
+    ASSERT_TRUE(before <= expiry && expiry <=after && 0!=expiry);
+
+    // write time value, expiry supplied (as if copied from another db)
+    type=kTypeValueWriteTime;
+    module.SetExpiryMinutes(30);
+    before=port::TimeMicros();
+    expiry=before - 1000;
+    SetCachedTimeMicros(before);
+    flag=module.MemTableInserterCallback(key, value, type, expiry);
+    after=port::TimeMicros();
+    ASSERT_EQ(flag, true);
+    ASSERT_EQ(type, kTypeValueWriteTime);
+    ASSERT_TRUE((before - 1000) == expiry && expiry <=after && 0!=expiry);
+
+    // explicit expiry, not changed
+    type=kTypeValueExplicitExpiry;
+    expiry=97531;
+    module.SetExpiryMinutes(30);
+    flag=module.MemTableInserterCallback(key, value, type, expiry);
+    ASSERT_EQ(flag, true);
+    ASSERT_EQ(type, kTypeValueExplicitExpiry);
+    ASSERT_EQ(expiry, 97531);
+
+}   // test MemTableInserterCallback
+
+
+/**
+ * Validate MemTableCallback
+ *   (supports KeyRetirementCallback in generic case)
+ */
+TEST(ExpiryTester, MemTableCallback)
+{
+    bool flag;
+    uint64_t before, after;
+    ExpiryModuleOS module;
+    ValueType type;
+    ExpiryTimeMicros expiry;
+    Slice key, value;
+
+    ASSERT_EQ(module.ExpiryActivated(), false);
+    module.SetExpiryEnabled(true);
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(5);
+    ASSERT_EQ(module.ExpiryActivated(), true);
+
+    before=port::TimeMicros();
+    SetCachedTimeMicros(before);
+
+    // deletion, do nothing
+    InternalKey key1("DeleteMeKey", 0, 0, kTypeDeletion);
+    flag=module.MemTableCallback(key1.internal_key());
+    ASSERT_EQ(flag, false);
+
+    // plain value, no expiry
+    InternalKey key2("PlainKey", 0, 0, kTypeValue);
+    flag=module.MemTableCallback(key2.internal_key());
+    ASSERT_EQ(flag, false);
+
+    // explicit, but time in the future
+    after=GetCachedTimeMicros() + 60*port::UINT64_ONE_SECOND_MICROS;
+    InternalKey key3("ExplicitKey", after, 0, kTypeValueExplicitExpiry);
+    flag=module.MemTableCallback(key3.internal_key());
+    ASSERT_EQ(flag, false);
+    // advance the clock
+    SetCachedTimeMicros(after + 60*port::UINT64_ONE_SECOND_MICROS);
+    flag=module.MemTableCallback(key3.internal_key());
+    ASSERT_EQ(flag, true);
+    // disable expiry
+    module.SetExpiryEnabled(false);
+    ASSERT_EQ(module.ExpiryActivated(), false);
+
+    flag=module.MemTableCallback(key3.internal_key());
+    ASSERT_EQ(flag, false);
+
+    // age expiry
+    module.SetExpiryEnabled(true);
+    ASSERT_EQ(module.ExpiryActivated(), true);
+    module.SetExpiryMinutes(2);
+    after=GetCachedTimeMicros();
+    InternalKey key4("AgeKey", after, 0, kTypeValueWriteTime);
+    flag=module.MemTableCallback(key4.internal_key());
+    ASSERT_EQ(flag, false);
+    // advance the clock
+    SetCachedTimeMicros(after + 60*port::UINT64_ONE_SECOND_MICROS);
+    flag=module.MemTableCallback(key4.internal_key());
+    ASSERT_EQ(flag, false);
+    SetCachedTimeMicros(after + 120*port::UINT64_ONE_SECOND_MICROS);
+    flag=module.MemTableCallback(key4.internal_key());
+    ASSERT_EQ(flag, true);
+    // disable expiry
+    module.SetExpiryEnabled(false);
+    flag=module.MemTableCallback(key4.internal_key());
+    ASSERT_EQ(flag, false);
+    // switch to unlimited
+    module.SetExpiryEnabled(true);
+    module.SetExpiryUnlimited(true);
+    flag=module.MemTableCallback(key4.internal_key());
+    ASSERT_EQ(flag, false);
+
+}   // test MemTableCallback
+
+
+/**
+ * Wrapper class to Version that allows manipulation
+ *  of internal objects for testing purposes
+ */
+class VersionTester : public Version
+{
+public:
+    VersionTester() : Version(&m_Vset), m_Icmp(m_Options.comparator),
+                      m_Vset("", &m_Options, NULL, &m_Icmp)  {};
+
+    void SetFileList(int Level, FileMetaDataVector_t & Files)
+        {files_[Level]=Files;};
+
+    Options m_Options;
+    InternalKeyComparator m_Icmp;
+    VersionSet m_Vset;
+};  // class VersionTester
+
+
+/**
+ * Validate CompactionFinalizeCallback's
+ *  identification of expired files
+ */
+
+TEST(ExpiryTester, CompactionFinalizeCallback1)
+{
+    bool flag;
+    uint64_t now, aged, temp_time;
+    std::vector<FileMetaData*> files;
+    FileMetaData * file_ptr;
+    ExpiryModuleOS module;
+    VersionTester ver;
+    int level;
+
+    ASSERT_EQ(ver.m_Options.ExpiryActivated(), false);
+
+    module.SetExpiryEnabled(true);
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(5);
+    level=config::kNumOverlapLevels;
+
+    now=port::TimeMicros();
+    SetCachedTimeMicros(now);
+
+    // put two files into the level, no expiry
+    file_ptr=new FileMetaData;
+    file_ptr->smallest.SetFrom(ParsedInternalKey("AA1", 0, 1, kTypeValue));
+    file_ptr->largest.SetFrom(ParsedInternalKey("CC1", 0, 2, kTypeValue));
+    files.push_back(file_ptr);
+
+    file_ptr=new FileMetaData;
+    file_ptr->smallest.SetFrom(ParsedInternalKey("DD1", 0, 3, kTypeValue));
+    file_ptr->largest.SetFrom(ParsedInternalKey("FF1", 0, 4, kTypeValue));
+    files.push_back(file_ptr);
+
+    // disable
+    module.SetExpiryEnabled(false);
+    module.SetWholeFileExpiryEnabled(false);
+    module.SetExpiryMinutes(0);
+    ver.SetFileList(level, files);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // enable and move clock
+    module.SetExpiryEnabled(true);
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(1);
+    SetCachedTimeMicros(now + 120*port::UINT64_ONE_SECOND_MICROS);
+    ver.SetFileList(level, files);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // add file only containing explicit
+    //  (explicit only shown in counts, not keys)
+    file_ptr=new FileMetaData;
+    file_ptr->smallest.SetFrom(ParsedInternalKey("GG1", 0, 5, kTypeValue));
+    file_ptr->largest.SetFrom(ParsedInternalKey("HH1", 0, 6, kTypeValue));
+    file_ptr->exp_write_low=ULLONG_MAX;  // sign of no aged expiry, or plain keys
+    file_ptr->exp_explicit_high=now + 60*port::UINT64_ONE_SECOND_MICROS;
+    files.push_back(file_ptr);
+
+    // disable
+    module.SetExpiryEnabled(false);
+    module.SetWholeFileExpiryEnabled(false);
+    module.SetExpiryMinutes(0);
+    ver.SetFileList(level, files);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // enable compaction expiry only
+    module.SetExpiryEnabled(true);
+    module.SetWholeFileExpiryEnabled(false);
+    module.SetExpiryMinutes(1);
+    ver.SetFileList(level, files);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // enable file expiry too
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(1);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+
+    // enable file, but not expiry minutes (disable)
+    //   ... but file without aged expiries or plain keys
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(0);
+    ver.SetFileList(level, files);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+
+    // enable file, minutes as unlimited
+    //   ... but file without aged expiries or plain keys
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryUnlimited(true);
+    ver.SetFileList(level, files);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+
+    // remove explicit
+    files.pop_back();
+    delete file_ptr;
+
+    // add file only containing aged
+    //  (aging only shown in counts, not keys)
+    file_ptr=new FileMetaData;
+    file_ptr->smallest.SetFrom(ParsedInternalKey("II1", 0, 7, kTypeValue));
+    file_ptr->largest.SetFrom(ParsedInternalKey("JJ1", 0, 8, kTypeValue));
+    file_ptr->exp_write_low=now - 60*port::UINT64_ONE_SECOND_MICROS;
+    file_ptr->exp_write_high=now + 60*port::UINT64_ONE_SECOND_MICROS;
+    files.push_back(file_ptr);
+
+    // disable
+    module.SetWholeFileExpiryEnabled(false);
+    module.SetExpiryMinutes(0);
+    ver.SetFileList(level, files);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // enable compaction only
+    module.SetWholeFileExpiryEnabled(false);
+    module.SetExpiryMinutes(1);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // enable file too
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(1);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+
+    // enable file, but not expiry minutes (disable)
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(0);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // enable file, but unlimited minutes
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryUnlimited(true);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // file_ptr at 1min, setting at 5 min
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(5);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // file_ptr at 1min, setting at 1m, clock at 30 seconds
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(1);
+    SetCachedTimeMicros(now + 30*port::UINT64_ONE_SECOND_MICROS);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // file_ptr at 1min, setting at 1m, clock at 1.5minutes
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(1);
+    SetCachedTimeMicros(now + 90*port::UINT64_ONE_SECOND_MICROS);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // file_ptr at 1min, setting at 1m, clock at 2minutes
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(1);
+    SetCachedTimeMicros(now + 120*port::UINT64_ONE_SECOND_MICROS);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+
+    // same settings, but show an explicit expiry too that has not
+    //  expired
+    file_ptr->exp_explicit_high=now +240*port::UINT64_ONE_SECOND_MICROS;
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // same settings, but show an explicit expiry has expired
+    //  expired
+    file_ptr->exp_explicit_high=now +90*port::UINT64_ONE_SECOND_MICROS;
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+
+    // bug 1 - thank you Paul Place
+    // try having the expired file first in the list, followed by non-expired files
+    std::vector<FileMetaData*> files1(files.size());
+    std::reverse_copy(files.begin(), files.end(), files1.begin());
+    ver.SetFileList(level, files1);
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    ver.SetFileList(level, files);
+
+    // same settings, explicit has expired, but not the aged
+    //  expired
+    file_ptr->exp_write_high=now +240*port::UINT64_ONE_SECOND_MICROS;
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, false);
+
+    // variations on Bug 1 test.  Put singleton expired file in
+    //  first, second, then third position.  Other two no expiry
+    files[0]->exp_write_low=ULLONG_MAX;  // sign of no aged expiry, or plain keys
+    files[0]->exp_write_high=0;
+    files[0]->exp_explicit_high=now +90*port::UINT64_ONE_SECOND_MICROS;
+    files[1]->exp_write_low=ULLONG_MAX;  // sign of no aged expiry, or plain keys
+    files[1]->exp_write_high=0;
+    files[1]->exp_explicit_high=0;
+    files[2]->exp_write_low=ULLONG_MAX;  // sign of no aged expiry, or plain keys
+    files[2]->exp_write_high=0;
+    files[2]->exp_explicit_high=0;
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    files[0]->exp_explicit_high=0;
+    files[1]->exp_explicit_high=now +90*port::UINT64_ONE_SECOND_MICROS;
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    files[1]->exp_explicit_high=0;
+    files[2]->exp_explicit_high=now +90*port::UINT64_ONE_SECOND_MICROS;
+    flag=module.CompactionFinalizeCallback(true, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+    flag=module.CompactionFinalizeCallback(false, ver, level, NULL);
+    ASSERT_EQ(flag, true);
+
+    // clean up phony files or Version destructor will crash
+    ClearMetaArray(files);
+    ver.SetFileList(level,files);
+
+}   // test CompactionFinalizeCallback
+
+
+/**
+ * Building static sets of file levels to increase visibility
+ */
+
+struct TestFileMetaData
+{
+    uint64_t m_Number;          // file number
+    const char * m_Smallest;
+    const char * m_Largest;
+    ExpiryTimeMicros m_Expiry1;              // minutes
+    ExpiryTimeMicros m_Expiry2;
+    ExpiryTimeMicros m_Expiry3;
+};
+
+
+static void
+ClearMetaArray(
+    Version::FileMetaDataVector_t & ClearMe)
+{
+    // clean up phony files or Version destructor will crash
+    std::vector<FileMetaData*>::iterator it;
+    for (it=ClearMe.begin(); ClearMe.end()!=it; ++it)
+        delete (*it);
+    ClearMe.clear();
+
+}   // ClearMetaArray
+
+
+static void
+CreateMetaArray(
+    Version::FileMetaDataVector_t & Output,
+    TestFileMetaData * Data,
+    size_t Count)
+{
+    size_t loop;
+    TestFileMetaData * cursor;
+    FileMetaData * file_ptr;
+    ExpiryTimeMicros now;
+
+    ClearMetaArray(Output);
+    now=GetCachedTimeMicros();
+
+    for (loop=0, cursor=Data; loop<Count; ++loop, ++cursor)
+    {
+        file_ptr=new FileMetaData;
+        file_ptr->number=cursor->m_Number;
+        file_ptr->smallest.SetFrom(ParsedInternalKey(cursor->m_Smallest, 0, cursor->m_Number, kTypeValue));
+        file_ptr->largest.SetFrom(ParsedInternalKey(cursor->m_Largest, 0, cursor->m_Number, kTypeValue));
+        if (0!=cursor->m_Expiry1)
+        {
+            if (ULLONG_MAX!=cursor->m_Expiry1)
+                file_ptr->exp_write_low=now + cursor->m_Expiry1*60000000;
+            else
+                file_ptr->exp_write_low=cursor->m_Expiry1;
+        }   // if
+
+        if (0!=cursor->m_Expiry2)
+            file_ptr->exp_write_high=now + cursor->m_Expiry2*60000000;
+
+        if (0!=cursor->m_Expiry3)
+            file_ptr->exp_explicit_high=now + cursor->m_Expiry3*60000000;
+
+        Output.push_back(file_ptr);
+    }   // for
+
+}   // CreateMetaArray
+
+
+/** case: two levels, no overlap, no expiry **/
+TestFileMetaData levelA[]=
+{
+    {100, "AA", "BA", 0, 0, 0},
+    {101, "LA", "NA", 0, 0, 0}
+};  // levelA
+
+TestFileMetaData levelB[]=
+{
+    {200, "CA", "DA", 0, 0, 0},
+    {201, "SA", "TA", 0, 0, 0}
+};  // levelB
+
+
+/** case: two levels, 100% overlap, both levels expired **/
+TestFileMetaData levelC[]=
+{
+    {200, "CA", "DA", 1, 3, 0},
+    {201, "SA", "TA", ULLONG_MAX, 0, 4}
+};  // levelC
+
+TestFileMetaData levelD[]=
+{
+    {200, "CA", "DA", 1, 2, 0},
+    {201, "SA", "TA", ULLONG_MAX, 0, 2}
+};  // levelD
+
+
+TEST(ExpiryTester, OverlapTests)
+{
+    bool flag;
+    Version::FileMetaDataVector_t level1, level2, level_clear, expired_files;
+    uint64_t now;
+    ExpiryModuleOS module;
+    VersionTester ver;
+    const int overlap0(0), overlap1(1), sorted0(3), sorted1(4);
+    VersionEdit edit;
+
+    module.SetExpiryEnabled(true);
+    module.SetWholeFileExpiryEnabled(true);
+    module.SetExpiryMinutes(2);
+
+    now=port::TimeMicros();
+    SetCachedTimeMicros(now);
+
+
+    /** case: two levels, no overlap, no expiry **/
+    CreateMetaArray(level1, levelA, 2);
+    CreateMetaArray(level2, levelB, 2);
+    ver.SetFileList(sorted0, level1);
+    ver.SetFileList(sorted1, level2);
+    flag=module.CompactionFinalizeCallback(true, ver, sorted0, &edit);
+    ASSERT_EQ(flag, false);
+    ASSERT_EQ(edit.DeletedFileCount(), 0);
+    ver.SetFileList(sorted0, level_clear);
+    ver.SetFileList(sorted1, level_clear);
+
+    ver.SetFileList(overlap0, level1);
+    ver.SetFileList(overlap1, level2);
+    flag=module.CompactionFinalizeCallback(true, ver, overlap0, &edit);
+    ASSERT_EQ(flag, false);
+    ASSERT_EQ(edit.DeletedFileCount(), 0);
+    ver.SetFileList(overlap0, level_clear);
+    ver.SetFileList(overlap1, level_clear);
+
+    ver.SetFileList(overlap0, level1);
+    ver.SetFileList(sorted1, level2);
+    flag=module.CompactionFinalizeCallback(true, ver, overlap0, &edit);
+    ASSERT_EQ(flag, false);
+    ASSERT_EQ(edit.DeletedFileCount(), 0);
+    ver.SetFileList(overlap0, level_clear);
+    ver.SetFileList(sorted1, level_clear);
+
+    /** case: two levels, 100% overlap, both levels expired **/
+    SetCachedTimeMicros(now);
+    CreateMetaArray(level1, levelC, 2);
+    CreateMetaArray(level2, levelD, 2);
+    SetCachedTimeMicros(now + 5*60000000);
+    ver.SetFileList(sorted0, level1);
+    ver.SetFileList(sorted1, level2);
+    flag=module.CompactionFinalizeCallback(true, ver, sorted0, &edit);
+    ASSERT_EQ(flag, false);
+    ASSERT_EQ(edit.DeletedFileCount(), 0);
+    flag=module.CompactionFinalizeCallback(true, ver, sorted1, &edit);
+    ASSERT_EQ(flag, true);
+    ASSERT_EQ(edit.DeletedFileCount(), 2);
+
+    // retest sorted1 with unlimited
+    module.SetExpiryUnlimited(true);
+    flag=module.CompactionFinalizeCallback(true, ver, sorted1, &edit);
+    ASSERT_EQ(flag, true);
+    ASSERT_EQ(edit.DeletedFileCount(), 2);
+
+    // cleanup
+    ver.SetFileList(sorted0, level_clear);
+    ver.SetFileList(sorted1, level_clear);
+
+    ClearMetaArray(level1);
+    ClearMetaArray(level2);
+
+}   // OverlapTests
+
+
+enum eExpiryType
+{
+    eEXPIRY_NONE=1,
+    eEXPIRY_AGED=2,
+    eEXPIRY_EXPLICIT=3
+};  // enum eExpiryType
+
+
+struct sExpiryTestKey
+{
+    const char * m_Key;   // string key
+    eExpiryType m_Type;   // type of expiry
+    int m_NowMinus;       // expiry time to set
+};
+
+
+struct sExpiryTestFile
+{
+    // File size is generated
+    int m_Number;
+    int m_Level;               // level for file in manifest
+    int m_LastValidState;      // in a "state" test, how long should this file be around
+    sExpiryTestKey m_Keys[3];  // low, middle, high key
+};
+
+
+/**
+ * Note:  constructor and destructor NOT called, this is
+ *        an interface class only
+ */
+
+class ExpDB : public DBImpl
+{
+public:
+    ExpDB(const Options& options, const std::string& dbname)
+        : DBImpl(options, dbname) {}
+
+
+
+    virtual ~ExpDB() {};
+
+    VersionSet * GetVersionSet() {return(versions_);};
+    const Options * GetOptions() {return(&options_);};
+
+    void OneCompaction()
+    {
+        MutexLock l(&mutex_);
+        MaybeScheduleCompaction();
+        while (IsCompactionScheduled())
+            bg_cv_.Wait();
+    };  // OneCompaction
+
+    void SetClock(uint64_t Time)
+        {SetCachedTimeMicros(Time);};
+
+    void ShiftClockMinutes(int Min)
+    {
+        uint64_t shift;
+
+        shift=Min * 60 * port::UINT64_ONE_SECOND_MICROS;
+        SetCachedTimeMicros(GetCachedTimeMicros() + shift);
+    };
+};  // class ExpDB
+
+
+class ExpTestModule : public ExpiryModuleOS
+{
+public:
+    ExpTestModule() : m_ExpiryAllow(0), m_AllowLevel(-1) {};
+
+    mutable int m_ExpiryAllow;
+    mutable int m_AllowLevel;
+
+    virtual bool CompactionFinalizeCallback(
+        bool WantAll, const Version & Ver, int Level,
+        VersionEdit * Edit) const
+    {
+        bool flag(false);
+
+        if (0!=m_ExpiryAllow && NULL==Edit)
+        {
+            flag=ExpiryModuleOS::CompactionFinalizeCallback(WantAll, Ver, Level, Edit);
+
+            if (flag)
+            {
+                m_AllowLevel=Level;
+                -- m_ExpiryAllow;
+            }   // if
+        }   // if
+        else if (-1!=m_AllowLevel && NULL!=Edit)
+        {
+            flag=ExpiryModuleOS::CompactionFinalizeCallback(WantAll, Ver, Level, Edit);
+
+            if (flag)
+            {
+                m_AllowLevel=-1;
+            }
+        }   // else if
+
+        return(flag);
+
+    }   // CoompactionFinalizeCallback
+};
+
+
+class ExpiryManifestTester
+{
+public:
+    ExpiryManifestTester()
+        : m_Good(false), m_DB(NULL), m_Env(Env::Default()),
+          m_BaseTime(port::TimeMicros()), m_Sequence(1)
+    {
+        m_DBName = test::TmpDir() + "/expiry";
+
+        // clean up previous execution
+        leveldb::DestroyDB(m_DBName, m_Options);
+
+        m_Options.create_if_missing=true;
+        m_Options.error_if_exists=false;
+
+        // Note: m_Options.expiry_module is a smart pointer.  It
+        //  owns the m_Expiry object and will automatically delete the
+        //  allocation.
+        m_Expiry=new ExpTestModule;
+        m_Options.expiry_module=m_Expiry;
+        m_Expiry->SetExpiryEnabled(true);
+
+        OpenTestDB();
+    };
+
+    ~ExpiryManifestTester()
+    {
+        // clean up
+        delete m_DB;
+        leveldb::DestroyDB(m_DBName, m_Options);
+    };
+
+    bool m_Good;
+    std::string m_DBName;
+    Options m_Options;
+    ExpTestModule * m_Expiry;
+    Env * m_Env;
+    ExpDB * m_DB;
+    uint64_t m_BaseTime;
+    SequenceNumber m_Sequence;
+
+    void OpenTestDB()
+    {
+        leveldb::Status status;
+
+        status=leveldb::DB::Open(m_Options, m_DBName, (DB**)&m_DB);
+
+        m_Good=status.ok();
+        ASSERT_OK(status);
+        m_DB->SetClock(m_BaseTime);
+    }   // OpenTestDB
+
+
+    void CreateKey(const sExpiryTestKey & Key, InternalKey & Output)
+    {
+        ExpiryTimeMicros expiry;
+        ValueType type;
+
+        switch(Key.m_Type)
+        {
+            case(eEXPIRY_NONE):
+                expiry=0;
+                type=kTypeValue;
+                break;
+
+            case(eEXPIRY_AGED):
+                expiry=m_BaseTime - Key.m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS;
+                type=kTypeValueWriteTime;
+                break;
+
+            case(eEXPIRY_EXPLICIT):
+                expiry=m_BaseTime + Key.m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS;
+                type=kTypeValueExplicitExpiry;
+                break;
+        }   // switch
+
+        ParsedInternalKey ikey(Key.m_Key, expiry, m_Sequence, type);
+
+        Output.SetFrom(ikey);
+        ++m_Sequence;
+    }   // CreateKey
+
+
+    void CreateFile(const sExpiryTestFile & File, VersionEdit & Edit)
+    {
+        std::string fname;
+        Status s;
+        WritableFile * outfile;
+        TableBuilder * builder;
+        InternalKey low_key, mid_key, high_key;
+        uint64_t count1, count2, count3, file_size;
+
+        fname = TableFileName(*m_DB->GetOptions(), File.m_Number, File.m_Level);
+        s = m_Env->NewWritableFile(fname, &outfile, gMapSize);
+        ASSERT_OK(s);
+        builder = new TableBuilder(*m_DB->GetOptions(), outfile);
+
+        CreateKey(File.m_Keys[0], low_key);
+        CreateKey(File.m_Keys[1], mid_key);
+        CreateKey(File.m_Keys[2], high_key);
+
+        builder->Add(low_key.internal_key(), "Value");
+        builder->Add(mid_key.internal_key(), "Value");
+        builder->Add(high_key.internal_key(), "Value");
+
+        s = builder->Finish();
+        ASSERT_OK(s);
+
+        count1=builder->GetExpiryWriteLow();
+        count2=builder->GetExpiryWriteHigh();
+        count3=builder->GetExpiryExplicitHigh();
+
+        s = outfile->Sync();
+        ASSERT_OK(s);
+        s = outfile->Close();
+        ASSERT_OK(s);
+
+        delete builder;
+        delete outfile;
+
+        m_Env->GetFileSize(fname, &file_size);
+
+        Edit.AddFile2(File.m_Level, File.m_Number, file_size,
+                       low_key, high_key,
+                       count1, count2, count3);
+    }    // CreateFile
+
+
+    void CreateManifest(const sExpiryTestFile * Files, size_t Count)
+    {
+        int loop;
+        const sExpiryTestFile * cursor;
+        VersionEdit edit;
+        port::Mutex mutex;
+        Status s;
+
+        m_Sequence=1;
+        for (cursor=Files, loop=0; loop<Count; ++loop, ++cursor)
+        {
+            CreateFile(*cursor, edit);
+        }   // for
+
+        mutex.Lock();
+        s=m_DB->GetVersionSet()->LogAndApply(&edit, &mutex);
+        mutex.Unlock();
+        ASSERT_OK(s);
+
+    }   // CreateManifest
+
+
+    void VerifyManifest(const sExpiryTestFile * Files, size_t Count)
+    {
+        const Version::FileMetaDataVector_t * file_list;
+        Version::FileMetaDataVector_t::const_iterator it;
+        int current_level, loop, loop1;
+        const sExpiryTestFile * cursor;
+        InternalKey low_key, mid_key, high_key;
+        uint64_t exp_write_low, exp_write_high, exp_explicit_high, expires;
+
+        // setup
+        current_level=config::kNumLevels;
+        file_list=NULL;
+        m_Sequence=1;
+
+        for (cursor=Files, loop=0; loop<Count; ++loop, ++cursor)
+        {
+            // get proper manifest level
+            if (cursor->m_Level!=current_level)
+            {
+                current_level=cursor->m_Level;
+                file_list=&m_DB->GetVersionSet()->current()->GetFileList(current_level);
+                it=file_list->begin();
+            }   // if
+
+            // not set by builder   ASSERT_EQ((*it)->num_entries, 3);
+            ASSERT_EQ((*it)->level, cursor->m_Level);
+
+            // same code as above, just basic verification
+            CreateKey(cursor->m_Keys[0], low_key);
+            CreateKey(cursor->m_Keys[1], mid_key); // need to keep sequence # correct
+            CreateKey(cursor->m_Keys[2], high_key);
+
+            ASSERT_TRUE(0==m_Options.comparator->Compare(low_key.internal_key(),
+                                                         (*it)->smallest.internal_key()));
+            ASSERT_TRUE(0==m_Options.comparator->Compare(high_key.internal_key(),
+                                                         (*it)->largest.internal_key()));
+
+            // create our idea of the expiry settings
+            exp_write_low=ULLONG_MAX;
+            exp_write_high=0;
+            exp_explicit_high=0;
+
+            for (loop1=0; loop1<3; ++loop1)
+            {
+                switch(cursor->m_Keys[loop1].m_Type)
+                {
+                    case eEXPIRY_NONE:
+                        exp_write_low=0;
+                        break;
+
+                    case eEXPIRY_AGED:
+                        expires=m_BaseTime - cursor->m_Keys[loop1].m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS;
+                        if (expires<exp_write_low)
+                            exp_write_low=expires;
+                        if (exp_write_high<expires)
+                            exp_write_high=expires;
+                        break;
+
+                    case eEXPIRY_EXPLICIT:
+                        expires=m_BaseTime + cursor->m_Keys[loop1].m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS;
+                        if (exp_explicit_high<expires)
+                            exp_explicit_high=expires;
+                        break;
+                }   // switch
+            }   // for
+
+            // test our idea against manifest's idea
+            ASSERT_EQ(exp_write_low, (*it)->exp_write_low);
+            ASSERT_EQ(exp_write_high, (*it)->exp_write_high);
+            ASSERT_EQ(exp_explicit_high, (*it)->exp_explicit_high);
+
+            // inc here since not initialized upon for loop entry
+            ++it;
+        }   // for
+
+        return;
+
+    }   // VerifyManifest
+
+    void VerifyFiles(const sExpiryTestFile * Files, size_t Count, int State)
+    {
+        int current_level, loop, loop1;
+        std::vector<std::string> file_names;
+        std::vector<std::string>::iterator f_it;
+
+        std::string dir_name, target;
+        const sExpiryTestFile * cursor;
+
+        current_level=-1;
+
+        for (cursor=Files, loop=0; loop<Count; ++loop, ++cursor)
+        {
+            if (cursor->m_Level!=current_level)
+            {
+                // should be no files left in list upon level change
+                //   (except "." and "..")
+                ASSERT_LE(file_names.size(), 2);
+                file_names.clear();
+
+                current_level=cursor->m_Level;
+                dir_name=MakeDirName2(*m_DB->GetOptions(), current_level, "sst");
+                m_Env->GetChildren(dir_name, &file_names);
+            }   // if
+
+            // is file still found on disk?
+            if (State <= cursor->m_LastValidState)
+            {
+                // -2 omits directory
+                target=TableFileName(*m_DB->GetOptions(), cursor->m_Number, -2);
+                target.erase(0,target.find_last_of('/')+1);
+                f_it=std::find(file_names.begin(), file_names.end(), target);
+                ASSERT_TRUE(file_names.end()!=f_it);
+                file_names.erase(f_it);
+            }   // if
+        }   // for
+
+        // verify last populated level was good
+        ASSERT_LE(file_names.size(), 2);
+
+        return;
+
+    }   // VerifyManifest
+
+
+    void VerifyKeys(const sExpiryTestKey * Key, size_t Count, int Minutes)
+    {
+        Iterator * it;
+        const sExpiryTestKey * cursor;
+        int loop;
+
+        it=m_DB->NewIterator(ReadOptions());
+        it->SeekToFirst();
+
+        for (cursor=Key, loop=0; loop<Count; ++cursor, ++loop)
+        {
+
+            if ( (eEXPIRY_EXPLICIT == cursor->m_Type && Minutes <= cursor->m_NowMinus)
+                 || (eEXPIRY_AGED == cursor->m_Type && Minutes<m_Expiry->GetExpiryMinutes()))
+            {
+                ASSERT_TRUE(it->Valid());
+                ASSERT_TRUE(0==strcmp(cursor->m_Key, it->key().ToString().c_str()));
+                it->Next();
+            }   // if
+        }   // for
+
+        delete it;
+
+        return;
+
+    }   // VerifyKeys
+
+
+};  // ExpiryManifestTester
+
+
+sExpiryTestFile Manifest1[]=
+{
+    {101, 6, 0, {{"02", eEXPIRY_NONE, 0}, {"05", eEXPIRY_NONE, 0}, {"07", eEXPIRY_NONE, 0}}},
+    {102, 6, 0, {{"12", eEXPIRY_NONE, 0}, {"15", eEXPIRY_AGED, 25}, {"17", eEXPIRY_AGED, 25}}},
+    {103, 6, 0, {{"22", eEXPIRY_AGED, 25}, {"25", eEXPIRY_EXPLICIT, 20}, {"27", eEXPIRY_EXPLICIT, 20}}},
+    {104, 6, 0, {{"32", eEXPIRY_AGED, 25}, {"35", eEXPIRY_AGED, 25}, {"37", eEXPIRY_NONE, 0}}},
+    {105, 6, 0, {{"42", eEXPIRY_AGED, 25}, {"45", eEXPIRY_NONE, 0}, {"47", eEXPIRY_AGED, 25}}},
+
+    {201, 5, 0, {{"03", eEXPIRY_AGED, 10}, {"05", eEXPIRY_AGED, 10}, {"06", eEXPIRY_AGED, 10}}},
+    {202, 5, 0, {{"11", eEXPIRY_NONE, 0}, {"15", eEXPIRY_EXPLICIT, 15}, {"18", eEXPIRY_EXPLICIT, 15}}},
+    {203, 5, 0, {{"21", eEXPIRY_EXPLICIT, 15}, {"25", eEXPIRY_EXPLICIT, 15}, {"29", eEXPIRY_AGED, 10}}},
+    {204, 5, 0, {{"34", eEXPIRY_EXPLICIT, 15}, {"35", eEXPIRY_EXPLICIT, 15}, {"39", eEXPIRY_NONE, 0}}},
+    {205, 5, 0, {{"44", eEXPIRY_EXPLICIT, 15}, {"45", eEXPIRY_NONE, 0}, {"46", eEXPIRY_EXPLICIT, 15}}},
+
+    {301, 4, 0, {{"03", eEXPIRY_EXPLICIT, 5}, {"05", eEXPIRY_EXPLICIT, 5}, {"06", eEXPIRY_EXPLICIT, 5}}},
+    {302, 4, 0, {{"11", eEXPIRY_NONE, 0}, {"15", eEXPIRY_AGED, 5}, {"18", eEXPIRY_EXPLICIT, 5}}},
+    {303, 4, 0, {{"21", eEXPIRY_EXPLICIT, 5}, {"25", eEXPIRY_AGED, 5}, {"29", eEXPIRY_EXPLICIT, 5}}},
+    {304, 4, 0, {{"34", eEXPIRY_EXPLICIT, 5}, {"35", eEXPIRY_AGED, 5}, {"39", eEXPIRY_NONE, 0}}},
+    {305, 4, 0, {{"44", eEXPIRY_AGED, 5}, {"45", eEXPIRY_NONE, 0}, {"46", eEXPIRY_EXPLICIT, 5}}}
+
+};  // Manifest1
+
+/**
+ * Does manifest create correctly?
+ */
+TEST(ExpiryManifestTester, Manifest1)
+{
+    size_t manifest_count;
+    Status s;
+
+    manifest_count=sizeof(Manifest1) / sizeof(Manifest1[0]);
+    CreateManifest(Manifest1, manifest_count);
+
+    // quick verify
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(6), 5);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(5), 5);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(4), 5);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(3), 0);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(2), 0);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(1), 0);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(0), 0);
+
+    // full verify
+    VerifyManifest(Manifest1, manifest_count);
+
+    // close, open, verify again
+    delete m_DB;
+    OpenTestDB();
+    VerifyManifest(Manifest1, manifest_count);
+
+    // close, repair, open, verify
+    delete m_DB;
+    s=RepairDB(m_DBName, m_Options);
+    ASSERT_OK(s);
+    OpenTestDB();
+    VerifyManifest(Manifest1, manifest_count);
+
+    return;
+};
+
+
+sExpiryTestFile Overlap1[]=
+{
+    // sorted levels
+    {101, 6, 5, {{"02", eEXPIRY_NONE, 0}, {"05", eEXPIRY_NONE, 0}, {"07", eEXPIRY_NONE, 0}}},
+    {102, 6, 2, {{"15", eEXPIRY_AGED, 25}, {"17", eEXPIRY_AGED, 25}, {"20", eEXPIRY_AGED, 25}}},
+
+    {201, 5, 5, {{"22", eEXPIRY_NONE, 0}, {"24", eEXPIRY_NONE, 0}, {"25", eEXPIRY_NONE, 0}}},
+
+    {301, 4, 5, {{"06", eEXPIRY_EXPLICIT, 5}, {"07", eEXPIRY_EXPLICIT, 5}, {"10", eEXPIRY_EXPLICIT, 5}}},
+    {302, 4, 0, {{"35", eEXPIRY_EXPLICIT, 5}, {"37", eEXPIRY_EXPLICIT, 5}, {"40", eEXPIRY_EXPLICIT, 5}}},
+
+    {401, 3, 5, {{"45", eEXPIRY_NONE, 0}, {"46", eEXPIRY_NONE, 0}, {"47", eEXPIRY_NONE, 0}}},
+
+    {450, 2, 3, {{"11", eEXPIRY_AGED, 25}, {"17", eEXPIRY_AGED, 25}, {"21", eEXPIRY_AGED, 25}}},
+
+    // Overlap levels
+    {501, 1, 5, {{"10", eEXPIRY_AGED, 25}, {"17", eEXPIRY_AGED, 25}, {"23", eEXPIRY_AGED, 25}}},
+    {502, 1, 5, {{"11", eEXPIRY_NONE, 0}, {"12", eEXPIRY_NONE, 0}, {"15", eEXPIRY_NONE, 0}}},
+    {503, 1, 1, {{"33", eEXPIRY_AGED, 25}, {"34", eEXPIRY_AGED, 25}, {"42", eEXPIRY_AGED, 25}}}
+
+
+};
+
+
+/*
+ * Test sequence that expired files get selected
+ */
+TEST(ExpiryManifestTester, Overlap1)
+{
+    size_t manifest_count;
+    Status s;
+
+    manifest_count=sizeof(Overlap1) / sizeof(Overlap1[0]);
+    CreateManifest(Overlap1, manifest_count);
+
+    // quick verify
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(6), 2);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(5), 1);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(4), 2);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(3), 1);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(2), 1);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(1), 3);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(0), 0);
+
+    // full verify
+    VerifyManifest(Overlap1, manifest_count);
+    VerifyFiles(Overlap1, manifest_count, 0);
+
+
+    // fully enable compaction expiry
+    m_Expiry->SetExpiryEnabled(false);
+    ASSERT_EQ(m_Options.ExpiryActivated(), false);
+    m_Expiry->SetExpiryEnabled(true);
+    m_Expiry->SetExpiryMinutes(60);
+    m_Expiry->SetWholeFileExpiryEnabled(true);
+    ASSERT_EQ(m_Options.ExpiryActivated(), true);
+
+    m_DB->ShiftClockMinutes(10);
+    m_Expiry->m_ExpiryAllow=1;
+    m_DB->OneCompaction();
+    VerifyFiles(Overlap1, manifest_count, 1);
+
+    // total shift now 30 min
+    m_DB->ShiftClockMinutes(30);
+    m_Expiry->m_ExpiryAllow=1;
+    m_DB->OneCompaction();
+    VerifyFiles(Overlap1, manifest_count, 2);
+
+    m_Expiry->m_ExpiryAllow=1;
+    m_DB->OneCompaction();
+    VerifyFiles(Overlap1, manifest_count, 3);
+
+    m_Expiry->m_ExpiryAllow=1;
+    m_DB->OneCompaction();
+    VerifyFiles(Overlap1, manifest_count, 4);
+
+    m_Expiry->m_ExpiryAllow=1;
+    m_DB->OneCompaction();
+    VerifyFiles(Overlap1, manifest_count, 5);
+
+    return;
+};
+
+
+/*
+ * Test compaction will find all without prompting
+ */
+TEST(ExpiryManifestTester, Overlap2)
+{
+    size_t manifest_count;
+    Status s;
+
+    manifest_count=sizeof(Overlap1) / sizeof(Overlap1[0]);
+    CreateManifest(Overlap1, manifest_count);
+
+    // quick verify
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(6), 2);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(5), 1);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(4), 2);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(3), 1);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(2), 1);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(1), 3);
+    ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(0), 0);
+
+    // full verify
+    VerifyManifest(Overlap1, manifest_count);
+    VerifyFiles(Overlap1, manifest_count, 0);
+
+    // enable compaction expiry
+    m_Expiry->SetExpiryEnabled(true);
+    m_Expiry->SetExpiryMinutes(60);
+    m_Expiry->SetWholeFileExpiryEnabled(true);
+    m_DB->ShiftClockMinutes(61);
+
+    m_Expiry->m_ExpiryAllow=10;
+    m_DB->OneCompaction();
+
+    // let multiple threads complete
+    /// sleep(1) required for Smart OS 1.8 buildbot
+    ///  then rased to sleep(2) for freebsd buildbot
+    sleep(2);
+    VerifyFiles(Overlap1, manifest_count, 5);
+
+    return;
+};
+
+
+sExpiryTestKey Compact1[]=
+{
+    {"01", eEXPIRY_AGED, 0},
+    {"02", eEXPIRY_EXPLICIT, 35},
+    {"03", eEXPIRY_AGED, 0},
+    {"04", eEXPIRY_EXPLICIT, 55},
+    {"05", eEXPIRY_AGED, 0},
+    {"06", eEXPIRY_EXPLICIT, 15},
+    {"07", eEXPIRY_AGED, 0},
+    {"08", eEXPIRY_EXPLICIT, 5},
+    {"09", eEXPIRY_AGED, 0},
+    {"10", eEXPIRY_EXPLICIT, 55},
+    {"11", eEXPIRY_AGED, 0},
+    {"12", eEXPIRY_EXPLICIT, 65},
+    {"13", eEXPIRY_AGED, 0}
+
+};
+
+
+/*
+ * Test expiry records get filtered during regular compaction
+ *  (and expiring all leads to file deletion)
+ */
+TEST(ExpiryManifestTester, Compact1)
+{
+    size_t key_count;
+    const sExpiryTestKey * Key;
+    Status s;
+    WriteBatch batch;
+    KeyMetaData meta;
+    int loop;
+    ExpiryTimeMicros expiry;
+    ValueType type;
+
+    // enable compaction expiry
+    m_Expiry->SetExpiryEnabled(true);
+    m_Expiry->SetExpiryMinutes(30);
+    m_Expiry->SetWholeFileExpiryEnabled(false);
+
+    key_count=sizeof(Compact1) / sizeof(Compact1[0]);
+
+    for (loop=0, Key=Compact1; loop<key_count; ++loop, ++Key)
+    {
+        switch(Key->m_Type)
+        {
+            case(eEXPIRY_NONE):
+                expiry=0;
+                type=kTypeValue;
+                break;
+
+            case(eEXPIRY_AGED):
+                expiry=m_BaseTime - Key->m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS;
+                type=kTypeValueWriteTime;
+                break;
+
+            case(eEXPIRY_EXPLICIT):
+                expiry=m_BaseTime + Key->m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS;
+                type=kTypeValueExplicitExpiry;
+                break;
+        }   // switch
+
+        meta.m_Type=type;
+        meta.m_Expiry=expiry;
+        s=m_DB->Put(WriteOptions(), Key->m_Key, "gig\'em", &meta);
+        ASSERT_OK(s);
+    }   // for
+
+    // load seem ok?
+    VerifyKeys(Compact1, key_count, 0);
+
+    // move write buffer to .sst file
+    //  (no expiry in buffer to .sst conversion)
+    m_DB->TEST_CompactMemTable();
+    VerifyKeys(Compact1, key_count, 0);
+
+    m_DB->ShiftClockMinutes(20);
+    m_DB->TEST_CompactRange(3, NULL, NULL);
+    VerifyKeys(Compact1, key_count, 20);
+
+    m_DB->ShiftClockMinutes(16);
+    m_DB->TEST_CompactRange(4, NULL, NULL);
+    VerifyKeys(Compact1, key_count, 36);
+
+    m_DB->ShiftClockMinutes(35);
+    m_DB->TEST_CompactRange(5, NULL, NULL);
+    VerifyKeys(Compact1, key_count, 71);
+
+}   // Compact1
+
+
+struct sExpiryDBObject
+{
+    const char * m_Key;   // string key
+    const char * m_Value; // string value
+    int m_NowMinus;       // expiry time to set
+};
+
+
+class ExpiryDBTester
+{
+public:
+    ExpiryDBTester()
+        : m_Good(false), m_DB(NULL),
+          m_BaseTime(port::TimeMicros())
+    {
+        m_DBName = test::TmpDir() + "/expiry";
+
+        // clean up previous execution
+        leveldb::DestroyDB(m_DBName, m_Options);
+
+        m_Options.create_if_missing=true;
+        m_Options.error_if_exists=false;
+
+        // Note: m_Options.expiry_module is a smart pointer.  It
+        //  owns the m_Expiry object and will automatically delete the
+        //  allocation.
+        m_Expiry=new leveldb::ExpiryModuleOS;
+        m_Options.expiry_module=m_Expiry;
+
+        OpenTestDB();
+    };
+
+    ~ExpiryDBTester()
+    {
+        // clean up
+        delete m_DB;
+        leveldb::DestroyDB(m_DBName, m_Options);
+    };
+
+    void OpenTestDB()
+    {
+        leveldb::Status status;
+
+        status=leveldb::DB::Open(m_Options, m_DBName, (DB**)&m_DB);
+
+        m_Good=status.ok();
+        ASSERT_OK(status);
+        m_DB->SetClock(m_BaseTime);
+    }   // OpenTestDB
+
+protected:
+    bool m_Good;
+    std::string m_DBName;
+    Options m_Options;
+    leveldb::ExpiryModuleOS * m_Expiry;
+    ExpDB * m_DB;
+    uint64_t m_BaseTime;
+
+};  // ExpiryDBTester
+
+
+sExpiryDBObject SimpleData[]=
+{
+    {"aa", "one", 0},
+    {"bb", "two", 0},
+    {"cc", "three", 0},
+    {"dd", "four", 0},
+    {"ee", "five", 0}
+};
+
+
+/*
+ * Do simple writes, see if data disappears
+ *
+ */
+TEST(ExpiryDBTester, Simple)
+{
+    size_t obj_count, loop;
+    Status s;
+    sExpiryDBObject * cursor;
+    std::string buffer;
+    std::auto_ptr<leveldb::Iterator> iterator;
+
+    // enable compaction expiry
+    m_Expiry->SetExpiryEnabled(true);
+    m_Expiry->SetExpiryMinutes(2);
+    m_Expiry->SetWholeFileExpiryEnabled(false);
+
+    obj_count=sizeof(SimpleData) / sizeof(SimpleData[0]);
+
+    // load data (now in memory buffer)
+    for (loop=0, cursor=SimpleData; loop<obj_count; ++loop, ++cursor)
+    {
+        s=m_DB->Put(WriteOptions(), cursor->m_Key, cursor->m_Value);
+        ASSERT_OK(s);
+    }   // for
+
+    // verify we can find it
+    for (loop=0, cursor=SimpleData; loop<obj_count; ++loop, ++cursor)
+    {
+        s=m_DB->Get(ReadOptions(), cursor->m_Key, &buffer);
+        ASSERT_OK(s);
+    }   // for
+
+    // verify we can walk it
+    iterator.reset(m_DB->NewIterator(ReadOptions()));
+    for (loop=0, iterator->SeekToFirst(); loop<obj_count; ++loop, iterator->Next())
+    {
+        ASSERT_EQ(iterator->Valid(), true);
+    }   // for
+    ASSERT_EQ(iterator->Valid(), false);
+
+    // expiry set to 2 min, so shift 10
+    m_DB->ShiftClockMinutes(10);
+
+    // all data gone?
+    for (loop=0, cursor=SimpleData; loop<obj_count; ++loop, ++cursor)
+    {
+        s=m_DB->Get(ReadOptions(), cursor->m_Key, &buffer);
+        ASSERT_TRUE(s.IsNotFound());
+    }   // for
+
+    // make it reappear
+    m_Expiry->SetExpiryUnlimited(true);
+    for (loop=0, cursor=SimpleData; loop<obj_count; ++loop, ++cursor)
+    {
+        s=m_DB->Get(ReadOptions(), cursor->m_Key, &buffer);
+        ASSERT_OK(s);
+    }   // for
+
+    m_Expiry->SetExpiryMinutes(2);
+    iterator.reset(m_DB->NewIterator(ReadOptions()));
+    iterator->SeekToFirst();
+    ASSERT_EQ(iterator->Valid(), false);
+
+    // force data from memory buffer to .sst file
+    //  (after shifting clock!!)
+    m_DB->SetClock(m_BaseTime);
+    m_DB->CompactRange(NULL, NULL);
+
+    // verify we can find it
+    for (loop=0, cursor=SimpleData; loop<obj_count; ++loop, ++cursor)
+    {
+        s=m_DB->Get(ReadOptions(), cursor->m_Key, &buffer);
+        ASSERT_OK(s);
+    }   // for
+
+    // verify we can walk it
+    iterator.reset(m_DB->NewIterator(ReadOptions()));
+    for (loop=0, iterator->SeekToFirst(); loop<obj_count; ++loop, iterator->Next())
+    {
+        ASSERT_EQ(iterator->Valid(), true);
+    }   // for
+    ASSERT_EQ(iterator->Valid(), false);
+
+    // expiry set to 2 min, so shift 10
+    m_DB->ShiftClockMinutes(10);
+
+    // all data gone?
+    for (loop=0, cursor=SimpleData; loop<obj_count; ++loop, ++cursor)
+    {
+        s=m_DB->Get(ReadOptions(), cursor->m_Key, &buffer);
+        ASSERT_TRUE(s.IsNotFound());
+    }   // for
+
+    iterator.reset(m_DB->NewIterator(ReadOptions()));
+    iterator->SeekToFirst();
+    ASSERT_EQ(iterator->Valid(), false);
+
+
+    // run compaction again with clock advanced
+    //  to physically remove records.  Then move
+    //  clock to starting time and prove records gone gone.
+    ///  (note that we "know" .sst file is on level 3)
+    m_DB->TEST_CompactRange(3, NULL, NULL);
+    m_DB->SetClock(m_BaseTime);
+
+    // all data gone?
+    for (loop=0, cursor=SimpleData; loop<obj_count; ++loop, ++cursor)
+    {
+        s=m_DB->Get(ReadOptions(), cursor->m_Key, &buffer);
+        ASSERT_TRUE(s.IsNotFound());
+    }   // for
+
+    iterator.reset(m_DB->NewIterator(ReadOptions()));
+    iterator->SeekToFirst();
+    ASSERT_EQ(iterator->Valid(), false);
+
+}   // ExpiryDBTester::Simple
+
+
+/**
+ * Riak uses a special key to mark a "feature upgrade".  That
+ *  key must never expire.
+ */
+// from riak_kv_eleveldb_backend.erl:  sext:encode({md,fixed_indexes}).
+static const char * MDKey=
+{"\x10\x00\x00\x00\x02\x0c\xb6\xd9\x00\x08\x0c\xb3\x5a\x6f\x16\x5b\x25\x7e\xd3\x6e\xb2\x59\x64\x16\x5b\x98\x08"};
+static const int MDKeyLen=27;
+
+// example Riak key:  sext:encode({o,{<<bob1>>,<<buck1>>,<<key0>>}).
+static const char * RiakKey=
+{"\x10\x00\x00\x00\x03\x0c\xb7\x80\x08\x10\x00\x00\x00\x02\x12\xb1\x5b\xec\x53\x10\x08\x12\xb1\x5d\x6c\x76\xb9\x88\x08\x12\xb5\xd9\x6f\x33\x10\x08"};
+static const int RiakKeyLen=36;
+
+TEST(ExpiryDBTester, MetaDataKey)
+{
+    Slice key_md(MDKey, MDKeyLen);
+    Slice key_riak(RiakKey, RiakKeyLen);
+    Slice no_value;
+    std::string return_value;
+    KeyMetaData meta;
+    Status s;
+
+    // enable expiry
+    m_Expiry->SetExpiryEnabled(true);
+    m_Expiry->SetExpiryMinutes(2);
+    m_Expiry->SetWholeFileExpiryEnabled(false);
+
+    // write special key that should not receive expiry
+    s=m_DB->Put(WriteOptions(), key_md, no_value);
+    ASSERT_OK(s);
+
+    // verify
+    s=m_DB->Get(ReadOptions(), key_md, &return_value, &meta);
+    ASSERT_OK(s);
+    ASSERT_EQ(meta.m_Type, kTypeValue);
+
+    // write a normal key that SHOULD get expiry
+    s=m_DB->Put(WriteOptions(), key_riak, no_value);
+    ASSERT_OK(s);
+
+    // verify
+    s=m_DB->Get(ReadOptions(), key_riak, &return_value, &meta);
+    ASSERT_OK(s);
+    ASSERT_EQ(meta.m_Type, kTypeValueWriteTime);
+
+}   // ExpiryDBTester, MetaDataKey
+
+
+
+}  // namespace leveldb
+
diff --git a/src/leveldb/util/flexcache.cc b/src/leveldb/util/flexcache.cc
new file mode 100644
index 000000000..d2aebd8d6
--- /dev/null
+++ b/src/leveldb/util/flexcache.cc
@@ -0,0 +1,129 @@
+// -------------------------------------------------------------------
+//
+// flexcache.cc
+//
+// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "util/db_list.h"
+#include "util/flexcache.h"
+
+namespace leveldb {
+
+
+// global cache control
+FlexCache gFlexCache;
+
+
+/**
+ * Initialize object
+ */
+FlexCache::FlexCache()
+    : m_TotalMemory(0)
+{
+    struct rlimit limit;
+    int ret_val;
+
+    // initialize total memory available based upon system data
+    ret_val=getrlimit(RLIMIT_DATA, &limit);
+
+    //  unsigned long caste to fix warning in smartos1.8, smartos 13.1, solaris10
+    if (0==ret_val && (unsigned long)RLIM_INFINITY!=limit.rlim_max)
+    {
+        // 2Gig is "small ram", Riak going to be tight
+       if (limit.rlim_max < flex::kRlimSizeIsSmall)
+            m_TotalMemory=flex::kRlimSmall;
+        else
+            m_TotalMemory=(limit.rlim_max - flex::kRlimLargeReserve) / 2;
+    }   // if
+
+    // create a default similar to Google's original,
+    //  but enough for 2 vnodes including Riak default buffer sizes
+    else
+    {
+        m_TotalMemory=flex::kDefaultMemory;
+    }   // else
+
+    return;
+
+}   // FlexCache::FlexCache
+
+
+/**
+ * Return current capacity limit for cache flavor indicated,
+ *  default is zero if unknown flavor.
+ */
+uint64_t
+FlexCache::GetDBCacheCapacity(
+    bool IsInternal)   //!< value describing cache attributes of caller
+{
+    uint64_t ret_val, shared_total;
+    size_t count, internal_count;
+
+    // get count of database by type
+    count=DBList()->GetDBCount(IsInternal);
+    if (IsInternal)
+        internal_count=count;
+    else
+        internal_count=DBList()->GetDBCount(true);
+
+    // what is total memory assigned to a type
+    if (IsInternal)
+        shared_total=(m_TotalMemory*2)/10;  // integer *.2
+    else if (0!=internal_count)
+        shared_total=(m_TotalMemory*8)/10;
+    else // no internal database
+        shared_total=m_TotalMemory;
+
+    // split up type specific aggregate to "per database" value
+    if (0!=count)
+        ret_val=shared_total / count;
+    else
+        ret_val=shared_total;
+
+    return(ret_val);
+
+}   // FlexCache::GetDBCacheCapacity
+
+
+/**
+ * Change the memory allocated to all caches, and actively resize
+ *  existing caches
+ */
+void
+FlexCache::SetTotalMemory(
+    uint64_t Total)    //!< new memory allocated to all caches
+{
+    // only review current allocation if new value is different
+    //  and not zero default
+    if (0!=Total && Total!=m_TotalMemory)
+    {
+        m_TotalMemory=Total;
+    }   // if
+
+    DBList()->ScanDBs(true, &DBImpl::ResizeCaches);
+    DBList()->ScanDBs(false, &DBImpl::ResizeCaches);
+
+    return;
+
+}   // FlexCache::SetTotalMemory
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/flexcache.h b/src/leveldb/util/flexcache.h
new file mode 100644
index 000000000..768693a35
--- /dev/null
+++ b/src/leveldb/util/flexcache.h
@@ -0,0 +1,72 @@
+// -------------------------------------------------------------------
+//
+// flexcache.h
+//
+// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include "util/cache2.h"
+
+#ifndef STORAGE_LEVELDB_INCLUDE_FLEXCACHE_H_
+#define STORAGE_LEVELDB_INCLUDE_FLEXCACHE_H_
+
+namespace leveldb
+{
+
+// Constants declared in style of db/dbformat.h
+namespace flex
+{
+
+   static const uint64_t kRlimSizeIsSmall = 2*1024*1024*1024ULL;  // above 2G is lots of ram
+   static const uint64_t kRlimSmall = 256*1024*1024ULL;
+   static const uint64_t kRlimLargeReserve = 1024*1024*1024ULL;
+   static const uint64_t kDefaultMemory = 340*1024*1024ULL;
+   static const uint64_t kMinimumDBMemory = 10*1024*1024ULL;
+
+}   // namespace flex
+
+/**
+ * FlexCache tunes file cache versus block cache versus number
+ *  of open databases
+ */
+
+class FlexCache
+{
+public:
+    FlexCache();
+
+    uint64_t GetDBCacheCapacity(bool IsInternalDB);
+
+    void SetTotalMemory(uint64_t Total);
+
+    void RecalculateAllocations() {SetTotalMemory(0);};
+
+    uint64_t GetTotalMemory() const {return(m_TotalMemory);};
+
+protected:
+
+    uint64_t m_TotalMemory; //!< complete memory assigned to all FlexCache clients
+
+};  // class FlexCache
+
+
+extern FlexCache gFlexCache;
+
+}  // namespace leveldb
+
+#endif   // STORAGE_LEVELDB_INCLUDE_FLEXCACHE_H_
diff --git a/src/leveldb/util/flexcache_test.cc b/src/leveldb/util/flexcache_test.cc
new file mode 100644
index 000000000..d4b49bd0e
--- /dev/null
+++ b/src/leveldb/util/flexcache_test.cc
@@ -0,0 +1,246 @@
+// -------------------------------------------------------------------
+//
+// flexcache_test.cc
+//
+// Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <string>
+
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/options.h"
+#include "leveldb/status.h"
+#include "util/db_list.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class FlexCacheTest { };
+
+TEST(FlexCacheTest, UserSizing) {
+    Options options;
+    DB * db[10];
+    Status st;
+    std::string dbname, value;
+    int loop;
+    char buffer[12];
+
+    options.create_if_missing=true;
+    options.filter_policy=NewBloomFilterPolicy2(16);
+    options.total_leveldb_mem=1000*1024*1024L;
+    options.write_buffer_size=45*1024*1024L;
+
+    // verify accounting with one database
+    dbname = test::TmpDir() + "/flexcache0";
+    st=DB::Open(options, dbname, &db[0]);
+    ASSERT_OK(st);
+    ASSERT_EQ(1, DBList()->GetDBCount(false));
+
+    db[0]->GetProperty("leveldb.block-cache", &value);
+    ASSERT_EQ(922742784L, atoi(value.c_str()));
+
+    db[0]->GetProperty("leveldb.file-cache", &value);
+    ASSERT_EQ(920645632L, atoi(value.c_str()));
+
+    // verify accounting with three databases
+    dbname = test::TmpDir() + "/flexcache1";
+    st=DB::Open(options, dbname, &db[1]);
+    ASSERT_OK(st);
+    dbname = test::TmpDir() + "/flexcache2";
+    st=DB::Open(options, dbname, &db[2]);
+    ASSERT_OK(st);
+    ASSERT_EQ(3, DBList()->GetDBCount(false));
+
+    db[0]->GetProperty("leveldb.block-cache", &value);
+    ASSERT_EQ(223692117L, atoi(value.c_str()));
+
+    db[0]->GetProperty("leveldb.file-cache", &value);
+    ASSERT_EQ(221594965L, atoi(value.c_str()));
+
+    db[1]->GetProperty("leveldb.block-cache", &value);
+    ASSERT_EQ(223692117L, atoi(value.c_str()));
+
+    db[1]->GetProperty("leveldb.file-cache", &value);
+    ASSERT_EQ(221594965L, atoi(value.c_str()));
+
+    db[2]->GetProperty("leveldb.block-cache", &value);
+    ASSERT_EQ(223692117L, atoi(value.c_str()));
+
+    db[2]->GetProperty("leveldb.file-cache", &value);
+    ASSERT_EQ(221594965L, atoi(value.c_str()));
+
+    // verify accounting after two databases go away
+    delete db[0];
+    delete db[2];
+
+    db[1]->GetProperty("leveldb.block-cache", &value);
+    ASSERT_EQ(922742784L, atoi(value.c_str()));
+
+    db[1]->GetProperty("leveldb.file-cache", &value);
+    ASSERT_EQ(920645632L, atoi(value.c_str()));
+
+    // rebuild from zero to ten databases, verify accounting
+    delete db[1];
+
+    options.total_leveldb_mem=3000*1024*1024L;
+    for(loop=0; loop<10; ++loop)
+    {
+        snprintf(buffer, sizeof(buffer), "/flexcache%u", loop);
+        dbname=test::TmpDir() + buffer;
+        st=DB::Open(options, dbname, &db[loop]);
+        ASSERT_OK(st);
+        ASSERT_EQ(loop+1, DBList()->GetDBCount(false));
+    }   // for
+
+    for(loop=0; loop<10; ++loop)
+    {
+        db[loop]->GetProperty("leveldb.block-cache", &value);
+        ASSERT_EQ(188739584l, atoi(value.c_str()));
+
+        db[loop]->GetProperty("leveldb.file-cache", &value);
+        ASSERT_EQ(186642432L, atoi(value.c_str()));
+    }   // for
+
+    for (loop=0; loop<10; ++loop)
+    {
+        delete db[loop];
+        snprintf(buffer, sizeof(buffer), "/flexcache%u", loop);
+        dbname=test::TmpDir() + buffer;
+        st=DestroyDB(dbname, options);
+        ASSERT_OK(st);
+    }   // for
+
+    delete options.filter_policy;
+    options.filter_policy=NULL;
+}
+
+TEST(FlexCacheTest, MixedSizing) {
+    Options options;
+    DB * db[10];
+    Status st;
+    std::string dbname, value;
+    int loop;
+    char buffer[12];
+
+    options.create_if_missing=true;
+    options.filter_policy=NewBloomFilterPolicy2(16);
+    options.total_leveldb_mem=1000*1024*1024L;
+    options.write_buffer_size=45*1024*1024L;
+
+    // verify accounting with one user & one internal
+    dbname = test::TmpDir() + "/flexcache0";
+    st=DB::Open(options, dbname, &db[0]);
+    ASSERT_OK(st);
+    ASSERT_EQ(1, DBList()->GetDBCount(false));
+    ASSERT_EQ(0, DBList()->GetDBCount(true));
+
+    db[0]->GetProperty("leveldb.block-cache", &value);
+    ASSERT_EQ(922742784l, atoi(value.c_str()));
+
+    db[0]->GetProperty("leveldb.file-cache", &value);
+    ASSERT_EQ(920645632L, atoi(value.c_str()));
+
+    // add internal
+    dbname = test::TmpDir() + "/flexcache1";
+    options.is_internal_db=true;
+    options.total_leveldb_mem=1600*1024*1024L;
+    st=DB::Open(options, dbname, &db[1]);
+    ASSERT_OK(st);
+    ASSERT_EQ(1, DBList()->GetDBCount(false));
+    ASSERT_EQ(1, DBList()->GetDBCount(true));
+
+    db[0]->GetProperty("leveldb.block-cache", &value);
+    ASSERT_EQ(1216344064l, atoi(value.c_str()));
+
+    db[0]->GetProperty("leveldb.file-cache", &value);
+    ASSERT_EQ(1214246912L, atoi(value.c_str()));
+
+    db[1]->GetProperty("leveldb.block-cache", &value);
+    ASSERT_EQ(209711104l, atoi(value.c_str()));
+
+    db[1]->GetProperty("leveldb.file-cache", &value);
+    ASSERT_EQ(207613952L, atoi(value.c_str()));
+
+    delete db[0];
+    ASSERT_EQ(0, DBList()->GetDBCount(false));
+    ASSERT_EQ(1, DBList()->GetDBCount(true));
+    db[1]->GetProperty("leveldb.block-cache", &value);
+    ASSERT_EQ(209711104L, atoi(value.c_str()));
+
+    db[1]->GetProperty("leveldb.file-cache", &value);
+    ASSERT_EQ(207613952L, atoi(value.c_str()));
+
+    delete db[1];
+
+
+    // rebuild from zero to ten databases, verify accounting
+    options.total_leveldb_mem=4000*1024*1024L;
+
+    for(loop=0; loop<10; ++loop)
+    {
+        options.is_internal_db=(1==(loop %2));
+        snprintf(buffer, sizeof(buffer), "/flexcache%u", loop);
+        dbname=test::TmpDir() + buffer;
+        st=DB::Open(options, dbname, &db[loop]);
+        ASSERT_OK(st);
+    }   // for
+
+    ASSERT_EQ(5, DBList()->GetDBCount(false));
+    ASSERT_EQ(5, DBList()->GetDBCount(true));
+
+    for(loop=0; loop<10; ++loop)
+    {
+        if (0==(loop %2))
+        {
+            db[loop]->GetProperty("leveldb.block-cache", &value);
+            ASSERT_EQ(545255424l, atoi(value.c_str()));
+
+            db[loop]->GetProperty("leveldb.file-cache", &value);
+            ASSERT_EQ(543158272L, atoi(value.c_str()));
+        }   // if
+        else
+        {
+            db[loop]->GetProperty("leveldb.block-cache", &value);
+            ASSERT_EQ(41938944l, atoi(value.c_str()));
+
+            db[loop]->GetProperty("leveldb.file-cache", &value);
+            ASSERT_EQ(39841792L, atoi(value.c_str()));
+        }   // else
+    }   // for
+
+    for (loop=0; loop<10; ++loop)
+    {
+        delete db[loop];
+        snprintf(buffer, sizeof(buffer), "/flexcache%u", loop);
+        dbname=test::TmpDir() + buffer;
+        st=DestroyDB(dbname, options);
+        ASSERT_OK(st);
+    }   // for
+
+    delete options.filter_policy;
+    options.filter_policy=NULL;
+}
+
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/util/hash.cc b/src/leveldb/util/hash.cc
index ed439ce7a..ba1818082 100644
--- a/src/leveldb/util/hash.cc
+++ b/src/leveldb/util/hash.cc
@@ -6,13 +6,6 @@
 #include "util/coding.h"
 #include "util/hash.h"
 
-// The FALLTHROUGH_INTENDED macro can be used to annotate implicit fall-through
-// between switch labels. The real definition should be provided externally.
-// This one is a fallback version for unsupported compilers.
-#ifndef FALLTHROUGH_INTENDED
-#define FALLTHROUGH_INTENDED do { } while (0)
-#endif
-
 namespace leveldb {
 
 uint32_t Hash(const char* data, size_t n, uint32_t seed) {
@@ -34,13 +27,13 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) {
   // Pick up remaining bytes
   switch (limit - data) {
     case 3:
-      h += static_cast<unsigned char>(data[2]) << 16;
-      FALLTHROUGH_INTENDED;
+      h += data[2] << 16;
+      // fall through
     case 2:
-      h += static_cast<unsigned char>(data[1]) << 8;
-      FALLTHROUGH_INTENDED;
+      h += data[1] << 8;
+      // fall through
     case 1:
-      h += static_cast<unsigned char>(data[0]);
+      h += data[0];
       h *= m;
       h ^= (h >> r);
       break;
diff --git a/src/leveldb/util/hash_test.cc b/src/leveldb/util/hash_test.cc
deleted file mode 100644
index eaa1c92c2..000000000
--- a/src/leveldb/util/hash_test.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "util/hash.h"
-#include "util/testharness.h"
-
-namespace leveldb {
-
-class HASH { };
-
-TEST(HASH, SignedUnsignedIssue) {
-  const unsigned char data1[1] = {0x62};
-  const unsigned char data2[2] = {0xc3, 0x97};
-  const unsigned char data3[3] = {0xe2, 0x99, 0xa5};
-  const unsigned char data4[4] = {0xe1, 0x80, 0xb9, 0x32};
-  const unsigned char data5[48] = {
-    0x01, 0xc0, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00,
-    0x14, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x04, 0x00,
-    0x00, 0x00, 0x00, 0x14,
-    0x00, 0x00, 0x00, 0x18,
-    0x28, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00,
-    0x02, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00,
-  };
-
-  ASSERT_EQ(Hash(0, 0, 0xbc9f1d34), 0xbc9f1d34);
-  ASSERT_EQ(
-      Hash(reinterpret_cast<const char*>(data1), sizeof(data1), 0xbc9f1d34),
-      0xef1345c4);
-  ASSERT_EQ(
-      Hash(reinterpret_cast<const char*>(data2), sizeof(data2), 0xbc9f1d34),
-      0x5b663814);
-  ASSERT_EQ(
-      Hash(reinterpret_cast<const char*>(data3), sizeof(data3), 0xbc9f1d34),
-      0x323c078f);
-  ASSERT_EQ(
-      Hash(reinterpret_cast<const char*>(data4), sizeof(data4), 0xbc9f1d34),
-      0xed21633a);
-  ASSERT_EQ(
-      Hash(reinterpret_cast<const char*>(data5), sizeof(data5), 0x12345678),
-      0xf333dabb);
-}
-
-}  // namespace leveldb
-
-int main(int argc, char** argv) {
-  return leveldb::test::RunAllTests();
-}
diff --git a/src/leveldb/util/hot_threads.cc b/src/leveldb/util/hot_threads.cc
new file mode 100644
index 000000000..95e13229d
--- /dev/null
+++ b/src/leveldb/util/hot_threads.cc
@@ -0,0 +1,351 @@
+// -------------------------------------------------------------------
+//
+// hot_threads.cc
+//
+// Copyright (c) 2011-2015 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+// HotThread is a subtle variation on the eleveldb_thread_pool.  Both
+//  represent a design pattern that is tested to perform better under
+//  the Erlang VM than other traditional designs.
+// -------------------------------------------------------------------
+
+#include <assert.h>
+#include <errno.h>
+#include <syslog.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "leveldb/atomics.h"
+#include "util/hot_threads.h"
+#include "util/thread_tasks.h"
+
+namespace leveldb {
+
+HotThreadPool * gImmThreads=NULL;
+HotThreadPool * gWriteThreads=NULL;
+HotThreadPool * gLevel0Threads=NULL;
+HotThreadPool * gCompactionThreads=NULL;
+
+
+
+void *ThreadStaticEntry(void *args)
+{
+    HotThread &tdata = *(HotThread *)args;
+
+    return(tdata.ThreadRoutine());
+
+}   // ThreadStaticEntry
+
+
+/**
+ * Worker threads:  worker threads have 3 states:
+ *  A. doing nothing, available to be claimed: m_Available=1
+ *  B. processing work passed by Erlang thread: m_Available=0, m_DirectWork=<non-null>
+ *  C. processing backlog queue of work: m_Available=0, m_DirectWork=NULL
+ */
+void *
+HotThread::ThreadRoutine()
+{
+    ThreadTask * submission;
+
+    submission=NULL;
+
+    port::SetCurrentThreadName(m_Pool.m_PoolName.c_str());
+#ifdef OS_LINUX
+    if (0!=m_Nice)
+    {
+        pid_t tid;
+        int ret_val;
+
+        tid = syscall(SYS_gettid);
+        if (-1!=(int)tid)
+        {
+            errno=0;
+            ret_val=getpriority(PRIO_PROCESS, tid);
+            // ret_val could be -1 legally, so double test
+            if (-1!=ret_val || 0==errno)
+                setpriority(PRIO_PROCESS, tid, ret_val+m_Nice);
+
+            assert((ret_val+m_Nice)==getpriority(PRIO_PROCESS, tid));
+        }   // if
+    }   // if
+#endif
+    while(!m_Pool.m_Shutdown)
+    {
+        // is work assigned yet?
+        //  check backlog work queue if not
+        if (NULL==submission)
+        {
+            // test non-blocking size for hint (much faster)
+            if (0!=m_Pool.m_WorkQueueAtomic)
+            {
+                // retest with locking
+                SpinLock lock(&m_Pool.m_QueueLock);
+
+                if (!m_Pool.m_WorkQueue.empty())
+                {
+                    submission=m_Pool.m_WorkQueue.front();
+                    m_Pool.m_WorkQueue.pop_front();
+                    dec_and_fetch(&m_Pool.m_WorkQueueAtomic);
+                    m_Pool.IncWorkDequeued();
+                    m_Pool.IncWorkWeighted(Env::Default()->NowMicros()
+                                           - submission->m_QueueStart);
+                }   // if
+            }   // if
+        }   // if
+
+
+        // a work item identified (direct or queue), work it!
+        //  then loop to test queue again
+        if (NULL!=submission)
+        {
+            // execute the job
+            (*submission)();
+            if (submission->resubmit())
+            {
+                submission->recycle();
+                m_Pool.Submit(submission);
+            }
+
+            submission->RefDec();
+
+            submission=NULL;
+        }   // if
+
+        // no work found, attempt to go into wait state
+        //  (but retest queue before sleep due to race condition)
+        else
+        {
+            MutexLock lock(&m_Mutex);
+
+            m_DirectWork=NULL; // safety
+
+            // only wait if we are really sure no work pending
+            if (0==m_Pool.m_WorkQueueAtomic)
+            {
+                // yes, thread going to wait. set available now.
+                m_Available=1;
+                m_Condition.Wait();
+            }    // if
+
+            m_Available=0;    // safety
+            submission=(ThreadTask *)m_DirectWork; // NULL is valid
+            m_DirectWork=NULL;// safety
+        }   // else
+    }   // while
+
+    return 0;
+
+}   // HotThread::ThreadRoutine
+
+
+
+
+HotThreadPool::HotThreadPool(
+    const size_t PoolSize,
+    const char * Name,
+    enum PerformanceCountersEnum Direct,
+    enum PerformanceCountersEnum Queued,
+    enum PerformanceCountersEnum Dequeued,
+    enum PerformanceCountersEnum Weighted,
+    int Nice)
+    : m_PoolName((Name?Name:"")),    // this crashes if Name is NULL ...but need it set now
+      m_Shutdown(false),
+      m_WorkQueueAtomic(0),
+      m_DirectCounter(Direct), m_QueuedCounter(Queued),
+      m_DequeuedCounter(Dequeued), m_WeightedCounter(Weighted)
+{
+    int ret_val;
+    size_t loop;
+    HotThread * hot_ptr;
+
+    ret_val=0;
+    for (loop=0; loop<PoolSize && 0==ret_val; ++loop)
+    {
+        hot_ptr=new HotThread(*this, Nice);
+
+        ret_val=pthread_create(&hot_ptr->m_ThreadId, NULL,  &ThreadStaticEntry, hot_ptr);
+        if (0==ret_val)
+            m_Threads.push_back(hot_ptr);
+        else
+            delete hot_ptr;
+    }   // for
+
+    m_Shutdown=(0!=ret_val);
+
+    return;
+
+}   // HotThreadPool::HotThreadPool
+
+
+HotThreadPool::~HotThreadPool()
+{
+    ThreadPool_t::iterator thread_it;
+    WorkQueue_t::iterator work_it;
+    // set flag
+    m_Shutdown=true;
+
+    // get all threads stopped
+    for (thread_it=m_Threads.begin(); m_Threads.end()!=thread_it; ++thread_it)
+    {
+        {
+            MutexLock lock(&(*thread_it)->m_Mutex);
+            (*thread_it)->m_Condition.SignalAll();
+        }   // lock
+
+        pthread_join((*thread_it)->m_ThreadId, NULL);
+        delete *thread_it;
+    }   // for
+
+    // release any objects hanging in work queue
+    for (work_it=m_WorkQueue.begin(); m_WorkQueue.end()!=work_it; ++work_it)
+    {
+        (*work_it)->RefDec();
+    }   // for
+
+    return;
+
+}   // HotThreadPool::~HotThreadPool
+
+
+bool                           // returns true if available worker thread found and claimed
+HotThreadPool::FindWaitingThread(
+    ThreadTask * work, // non-NULL to pass current work directly to a thread,
+                       // NULL to potentially nudge an available worker toward backlog queue
+    bool OkToQueue)
+{
+    bool ret_flag;
+    size_t start, index, pool_size;
+
+    ret_flag=false;
+
+    // pick "random" place in thread list.  hopefully
+    //  list size is prime number.
+    pool_size=m_Threads.size();
+    if (OkToQueue)
+        start=(size_t)pthread_self() % pool_size;
+    else
+        start=0;
+    index=start;
+
+    do
+    {
+        // perform quick test to see thread available
+        if (0!=m_Threads[index]->m_Available && !shutdown_pending())
+        {
+            // perform expensive compare and swap to potentially
+            //  claim worker thread (this is an exclusive claim to the worker)
+            ret_flag = compare_and_swap(&m_Threads[index]->m_Available, 1, 0);
+
+            // the compare/swap only succeeds if worker thread is sitting on
+            //  pthread_cond_wait ... or is about to be there but is holding
+            //  the mutex already
+            if (ret_flag)
+            {
+
+                // man page says mutex lock optional, experience in
+                //  this code says it is not.  using broadcast instead
+                //  of signal to cover one other race condition
+                //  that should never happen with single thread waiting.
+                MutexLock lock(&m_Threads[index]->m_Mutex);
+                m_Threads[index]->m_DirectWork=work;
+                m_Threads[index]->m_Condition.SignalAll();
+            }   // if
+        }   // if
+
+        index=(index+1)%pool_size;
+
+    } while(index!=start && !ret_flag && OkToQueue);
+
+    return(ret_flag);
+
+}   // FindWaitingThread
+
+
+bool
+HotThreadPool::Submit(
+    ThreadTask* item,
+    bool OkToQueue)
+{
+    bool ret_flag(false);
+
+    if (NULL!=item)
+    {
+        item->RefInc();
+
+        // do nothing if shutting down
+        if(shutdown_pending())
+        {
+            item->RefDec();
+            ret_flag=false;
+        }   // if
+
+        // try to give work to a waiting thread first
+        else if (FindWaitingThread(item, OkToQueue))
+        {
+            IncWorkDirect();
+            ret_flag=true;
+        }   // else if
+
+        else if (OkToQueue)
+        {
+            // hold mutex of only thread 0, this synchronizes this
+            //  thread and the first worker thread to ensure at least
+            //  one thread will eventually see the work item on the queue
+            //  before m_Condition.Wait()
+            {
+                item->m_QueueStart=Env::Default()->NowMicros();
+
+                MutexLock lock(&m_Threads[0]->m_Mutex);
+
+                // no waiting threads, put on backlog queue
+                {
+                    SpinLock lock(&m_QueueLock);
+                    inc_and_fetch(&m_WorkQueueAtomic);
+                    m_WorkQueue.push_back(item);
+                }
+            }   // mutex released
+
+            // to address race condition, thread might be waiting now
+            FindWaitingThread(NULL, true);
+
+            IncWorkQueued();
+            ret_flag=true;
+        }   // else if
+
+        // did not post to thread or queue
+        else
+        {
+            item->RefDec();
+            ret_flag=false;  // redundant, but safe
+        }   // else
+    }   // if
+
+    return(ret_flag);
+
+}   // HotThreadPool::Submit
+
+};  // namespace leveldb
diff --git a/src/leveldb/util/hot_threads.h b/src/leveldb/util/hot_threads.h
new file mode 100644
index 000000000..039e2506d
--- /dev/null
+++ b/src/leveldb/util/hot_threads.h
@@ -0,0 +1,141 @@
+// -------------------------------------------------------------------
+//
+// hot_threads.h
+//
+// Copyright (c) 2011-2015 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+// HotThread is a subtle variation on the eleveldb_thread_pool.  Both
+//  represent a design pattern that is tested to perform better under
+//  the Erlang VM than other traditional designs.
+// -------------------------------------------------------------------
+
+#ifndef STORAGE_LEVELDB_INCLUDE_HOT_THREADS_H_
+#define STORAGE_LEVELDB_INCLUDE_HOT_THREADS_H_
+
+#include <pthread.h>
+#include <semaphore.h>
+#include <deque>
+#include <vector>
+
+#include "leveldb/perf_count.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+
+namespace leveldb
+{
+
+// forward declare
+class ThreadTask;
+
+/**
+ * Meta / managment data related to a worker thread.
+ */
+struct HotThread
+{
+public:
+    pthread_t m_ThreadId;                //!< handle for this thread
+
+    volatile uint32_t m_Available;       //!< 1 if thread waiting, using standard type for atomic operation
+    class HotThreadPool & m_Pool;        //!< parent pool object
+    volatile ThreadTask * m_DirectWork;  //!< work passed direct to thread
+    int m_Nice;                          //!< amount to adjust sched priority
+
+    port::Mutex m_Mutex;             //!< mutex for condition variable
+    port::CondVar m_Condition;          //!< condition for thread waiting
+
+public:
+    HotThread(class HotThreadPool & Pool, int Nice)
+    : m_Available(0), m_Pool(Pool), m_DirectWork(NULL), m_Nice(Nice),
+        m_Condition(&m_Mutex)
+    {}   // HotThread
+
+    virtual ~HotThread() {};
+
+    // actual work loop
+    void * ThreadRoutine();
+
+private:
+    HotThread();                              // no default
+    HotThread(const HotThread &);             // no copy
+    HotThread & operator=(const HotThread&);  // no assign
+
+};  // class HotThread
+
+
+class HotThreadPool
+{
+public:
+    std::string m_PoolName;              //!< used to name threads for gdb / core
+    typedef std::deque<ThreadTask*> WorkQueue_t;
+    typedef std::vector<HotThread *>   ThreadPool_t;
+
+    volatile bool m_Shutdown;            //!< should we stop threads and shut down?
+
+    ThreadPool_t  m_Threads;             //!< pool of fast response workers
+
+    WorkQueue_t   m_WorkQueue;
+    port::Spin m_QueueLock;              //!< protects access to work_queue
+    volatile size_t m_WorkQueueAtomic;   //!< atomic size to parallel work_queue.size().
+
+    enum PerformanceCountersEnum m_DirectCounter;
+    enum PerformanceCountersEnum m_QueuedCounter;
+    enum PerformanceCountersEnum m_DequeuedCounter;
+    enum PerformanceCountersEnum m_WeightedCounter;
+
+public:
+    HotThreadPool(const size_t thread_pool_size, const char * Name,
+                  enum PerformanceCountersEnum Direct,
+                  enum PerformanceCountersEnum Queued,
+                  enum PerformanceCountersEnum Dequeued,
+                  enum PerformanceCountersEnum Weighted,
+                  int Nice=0);
+
+    virtual ~HotThreadPool();
+
+    static void *ThreadStart(void *args);
+
+    bool FindWaitingThread(ThreadTask * work, bool OkToQueue=true);
+
+    bool Submit(ThreadTask * item, bool OkToQueue=true);
+
+    size_t work_queue_size() const { return m_WorkQueue.size();}
+    bool shutdown_pending() const  { return m_Shutdown; }
+    leveldb::PerformanceCounters * perf() const {return(leveldb::gPerfCounters);};
+
+    void IncWorkDirect() {leveldb::gPerfCounters->Inc(m_DirectCounter);};
+    void IncWorkQueued() {leveldb::gPerfCounters->Inc(m_QueuedCounter);};
+    void IncWorkDequeued() {leveldb::gPerfCounters->Inc(m_DequeuedCounter);};
+    void IncWorkWeighted(uint64_t Count) {leveldb::gPerfCounters->Add(m_WeightedCounter, Count);};
+
+private:
+    HotThreadPool(const HotThreadPool &);             // nocopy
+    HotThreadPool& operator=(const HotThreadPool&);  // nocopyassign
+
+};  // class HotThreadPool
+
+extern HotThreadPool * gImmThreads;
+extern HotThreadPool * gWriteThreads;
+extern HotThreadPool * gLevel0Threads;
+extern HotThreadPool * gCompactionThreads;
+
+} // namespace leveldb
+
+
+#endif  // STORAGE_LEVELDB_INCLUDE_HOT_THREADS_H_
diff --git a/src/leveldb/util/hot_threads_test.cc b/src/leveldb/util/hot_threads_test.cc
new file mode 100644
index 000000000..b85c69e66
--- /dev/null
+++ b/src/leveldb/util/hot_threads_test.cc
@@ -0,0 +1,133 @@
+// -------------------------------------------------------------------
+//
+// hot_threads_test.cc
+//
+// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+#include "port/port.h"
+#include "util/hot_threads.h"
+#include "util/mutexlock.h"
+#include "util/thread_tasks.h"
+
+/**
+ * Execution routine
+ */
+int main(int argc, char** argv)
+{
+  return leveldb::test::RunAllTests();
+}
+
+
+namespace leveldb {
+
+// helper function to clean up heap objects
+static void ClearMetaArray(Version::FileMetaDataVector_t & ClearMe);
+
+
+/**
+ * Wrapper class for tests.  Holds working variables
+ * and helper functions.
+ */
+class HotThreadsTester
+{
+public:
+    HotThreadsTester()
+    {
+    };
+
+    ~HotThreadsTester()
+    {
+    };
+};  // class HotThreadsTester
+
+
+class RaceTask : public ThreadTask
+{
+public:
+    port::Mutex * m_Mutex;
+    port::CondVar * m_Condition;
+    volatile bool * m_ReadyFlag;
+
+    RaceTask() {};
+    virtual ~RaceTask() {};
+
+    virtual void operator()()
+    {
+        volatile bool flag;
+
+        // is other thread waiting yet
+        do
+        {
+            MutexLock lock(m_Mutex);
+            flag=*m_ReadyFlag;
+        } while(!flag);
+
+        {
+            MutexLock lock(m_Mutex);
+            *m_ReadyFlag=false;
+            m_Condition->SignalAll();
+        }
+    };  // operator()
+
+};  // class RaceTask
+
+/**
+ * Reproduce race condition where all threads go to
+ *  into m_Condition.Wait() without seeing new work item
+ *  on queue (valgrind helps make failed code fail).
+ */
+TEST(HotThreadsTester, RaceCondition)
+{
+    HotThreadPool pool(1, "RacePool", ePerfDebug0,ePerfDebug1,ePerfDebug2,ePerfDebug3);
+    port::Mutex race_mutex;
+    port::CondVar race_condition(&race_mutex);
+    int loop_count(0);
+    volatile bool ready_flag;
+    int loop;
+    RaceTask * task;
+
+    for (loop=0; loop<10000000; ++loop)
+    {
+        task=new RaceTask;
+        task->m_Mutex=&race_mutex;
+        task->m_Condition=&race_condition;
+        task->m_ReadyFlag=&ready_flag;
+
+        ready_flag=false;
+        pool.Submit(task,true);
+
+        {
+            MutexLock lock(&race_mutex);
+            ready_flag=true;
+            race_condition.Wait();
+        }
+    }   // for
+
+    printf("loop: %d\n",loop);
+}   // test
+
+
+
+
+}  // namespace leveldb
+
diff --git a/src/leveldb/util/logging.cc b/src/leveldb/util/logging.cc
index 6995d9021..a24501bca 100644
--- a/src/leveldb/util/logging.cc
+++ b/src/leveldb/util/logging.cc
@@ -45,14 +45,37 @@ std::string EscapeString(const Slice& value) {
   return r;
 }
 
+std::string
+HexString(const Slice& value)
+{
+  std::string str;
+  for (size_t i = 0; i < value.size(); i++) {
+    char c = value[i];
+    char buf[10];
+    snprintf(buf, sizeof(buf), "%02x",
+             static_cast<unsigned int>(c) & 0xff);
+    str.append(buf);
+  }  // for
+  return(str);
+}  // HexString
+
+bool ConsumeChar(Slice* in, char c) {
+  if (!in->empty() && (*in)[0] == c) {
+    in->remove_prefix(1);
+    return true;
+  } else {
+    return false;
+  }
+}
+
 bool ConsumeDecimalNumber(Slice* in, uint64_t* val) {
   uint64_t v = 0;
   int digits = 0;
   while (!in->empty()) {
-    unsigned char c = (*in)[0];
+    char c = (*in)[0];
     if (c >= '0' && c <= '9') {
       ++digits;
-      const unsigned int delta = (c - '0');
+      const uint64_t delta = (c - '0');
       static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0);
       if (v > kMaxUint64/10 ||
           (v == kMaxUint64/10 && delta > kMaxUint64%10)) {
diff --git a/src/leveldb/util/logging.h b/src/leveldb/util/logging.h
index 1b450d248..9a3c5b41e 100644
--- a/src/leveldb/util/logging.h
+++ b/src/leveldb/util/logging.h
@@ -32,6 +32,13 @@ extern std::string NumberToString(uint64_t num);
 // Escapes any non-printable characters found in "value".
 extern std::string EscapeString(const Slice& value);
 
+// Return human-readable hex string version of "value"
+extern std::string HexString(const Slice & value);
+
+// If *in starts with "c", advances *in past the first character and
+// returns true.  Otherwise, returns false.
+extern bool ConsumeChar(Slice* in, char c);
+
 // Parse a human-readable number from "*in" into *value.  On success,
 // advances "*in" past the consumed number and sets "*val" to the
 // numeric value.  Otherwise, returns false and leaves *in in an
diff --git a/src/leveldb/util/lz4.c b/src/leveldb/util/lz4.c
new file mode 100644
index 000000000..08cf6b5cd
--- /dev/null
+++ b/src/leveldb/util/lz4.c
@@ -0,0 +1,1516 @@
+/*
+   LZ4 - Fast LZ compression algorithm
+   Copyright (C) 2011-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - LZ4 source repository : https://github.com/Cyan4973/lz4
+   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+
+/**************************************
+*  Tuning parameters
+**************************************/
+/*
+ * HEAPMODE :
+ * Select how default compression functions will allocate memory for their hash table,
+ * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()).
+ */
+#define HEAPMODE 0
+
+/*
+ * ACCELERATION_DEFAULT :
+ * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0
+ */
+#define ACCELERATION_DEFAULT 1
+
+
+/**************************************
+*  CPU Feature Detection
+**************************************/
+/*
+ * LZ4_FORCE_SW_BITCOUNT
+ * Define this parameter if your target system or compiler does not support hardware bit count
+ */
+#if defined(_MSC_VER) && defined(_WIN32_WCE)   /* Visual Studio for Windows CE does not support Hardware bit count */
+#  define LZ4_FORCE_SW_BITCOUNT
+#endif
+
+
+/**************************************
+*  Includes
+**************************************/
+#include "lz4.h"
+
+
+/**************************************
+*  Compiler Options
+**************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4293)        /* disable: C4293: too large shift (32-bits) */
+#else
+#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
+#    if defined(__GNUC__) || defined(__clang__)
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif   /* __STDC_VERSION__ */
+#endif  /* _MSC_VER */
+
+/* LZ4_GCC_VERSION is defined into lz4.h */
+#if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
+#  define expect(expr,value)    (__builtin_expect ((expr),(value)) )
+#else
+#  define expect(expr,value)    (expr)
+#endif
+
+#define likely(expr)     expect((expr) != 0, 1)
+#define unlikely(expr)   expect((expr) != 0, 0)
+
+
+/**************************************
+*  Memory routines
+**************************************/
+#include <stdlib.h>   /* malloc, calloc, free */
+#define ALLOCATOR(n,s) calloc(n,s)
+#define FREEMEM        free
+#include <string.h>   /* memset, memcpy */
+#define MEM_INIT       memset
+
+
+/**************************************
+*  Basic Types
+**************************************/
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+#endif
+
+
+/**************************************
+*  Reading and writing into memory
+**************************************/
+#define STEPSIZE sizeof(size_t)
+
+static unsigned LZ4_64bits(void) { return sizeof(void*)==8; }
+
+static unsigned LZ4_isLittleEndian(void)
+{
+    const union { U32 i; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+
+static U16 LZ4_read16(const void* memPtr)
+{
+    U16 val16;
+    memcpy(&val16, memPtr, 2);
+    return val16;
+}
+
+static U16 LZ4_readLE16(const void* memPtr)
+{
+    if (LZ4_isLittleEndian())
+    {
+        return LZ4_read16(memPtr);
+    }
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)((U16)p[0] + (p[1]<<8));
+    }
+}
+
+static void LZ4_writeLE16(void* memPtr, U16 value)
+{
+    if (LZ4_isLittleEndian())
+    {
+        memcpy(memPtr, &value, 2);
+    }
+    else
+    {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE) value;
+        p[1] = (BYTE)(value>>8);
+    }
+}
+
+static U32 LZ4_read32(const void* memPtr)
+{
+    U32 val32;
+    memcpy(&val32, memPtr, 4);
+    return val32;
+}
+
+static U64 LZ4_read64(const void* memPtr)
+{
+    U64 val64;
+    memcpy(&val64, memPtr, 8);
+    return val64;
+}
+
+static size_t LZ4_read_ARCH(const void* p)
+{
+    if (LZ4_64bits())
+        return (size_t)LZ4_read64(p);
+    else
+        return (size_t)LZ4_read32(p);
+}
+
+
+static void LZ4_copy4(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 4); }
+
+static void LZ4_copy8(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 8); }
+
+/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */
+static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
+{
+    BYTE* d = (BYTE*)dstPtr;
+    const BYTE* s = (const BYTE*)srcPtr;
+    BYTE* e = (BYTE*)dstEnd;
+    do { LZ4_copy8(d,s); d+=8; s+=8; } while (d<e);
+}
+
+
+/**************************************
+*  Common Constants
+**************************************/
+#define MINMATCH 4
+
+#define COPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH+MINMATCH)
+static const int LZ4_minLength = (MFLIMIT+1);
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define ML_BITS  4
+#define ML_MASK  ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
+
+
+/**************************************
+*  Common Utils
+**************************************/
+#define LZ4_STATIC_ASSERT(c)    { enum { LZ4_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/**************************************
+*  Common functions
+**************************************/
+static unsigned LZ4_NbCommonBytes (register size_t val)
+{
+    if (LZ4_isLittleEndian())
+    {
+        if (LZ4_64bits())
+        {
+#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            unsigned long r = 0;
+            _BitScanForward64( &r, (U64)val );
+            return (int)(r>>3);
+#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            return (__builtin_ctzll((U64)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+#       endif
+        }
+        else /* 32 bits */
+        {
+#       if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            unsigned long r;
+            _BitScanForward( &r, (U32)val );
+            return (int)(r>>3);
+#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            return (__builtin_ctz((U32)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+#       endif
+        }
+    }
+    else   /* Big Endian CPU */
+    {
+        if (LZ4_64bits())
+        {
+#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            unsigned long r = 0;
+            _BitScanReverse64( &r, val );
+            return (unsigned)(r>>3);
+#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            return (__builtin_clzll((U64)val) >> 3);
+#       else
+            unsigned r;
+            if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
+            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+        }
+        else /* 32 bits */
+        {
+#       if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            unsigned long r = 0;
+            _BitScanReverse( &r, (unsigned long)val );
+            return (unsigned)(r>>3);
+#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            return (__builtin_clz((U32)val) >> 3);
+#       else
+            unsigned r;
+            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+        }
+    }
+}
+
+static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit)
+{
+    const BYTE* const pStart = pIn;
+
+    while (likely(pIn<pInLimit-(STEPSIZE-1)))
+    {
+        size_t diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
+        if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; }
+        pIn += LZ4_NbCommonBytes(diff);
+        return (unsigned)(pIn - pStart);
+    }
+
+    if (LZ4_64bits()) if ((pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; }
+    if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; }
+    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
+    return (unsigned)(pIn - pStart);
+}
+
+
+#ifndef LZ4_COMMONDEFS_ONLY
+/**************************************
+*  Local Constants
+**************************************/
+#define LZ4_HASHLOG   (LZ4_MEMORY_USAGE-2)
+#define HASHTABLESIZE (1 << LZ4_MEMORY_USAGE)
+#define HASH_SIZE_U32 (1 << LZ4_HASHLOG)       /* required as macro for static allocation */
+
+static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT-1));
+static const U32 LZ4_skipTrigger = 6;  /* Increase this value ==> compression run slower on incompressible data */
+
+
+/**************************************
+*  Local Structures and types
+**************************************/
+typedef struct {
+    U32 hashTable[HASH_SIZE_U32];
+    U32 currentOffset;
+    U32 initCheck;
+    const BYTE* dictionary;
+    BYTE* bufferStart;   /* obsolete, used for slideInputBuffer */
+    U32 dictSize;
+} LZ4_stream_t_internal;
+
+typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive;
+typedef enum { byPtr, byU32, byU16 } tableType_t;
+
+typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive;
+typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;
+
+typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
+typedef enum { full = 0, partial = 1 } earlyEnd_directive;
+
+
+/**************************************
+*  Local Utils
+**************************************/
+int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; }
+int LZ4_compressBound(int isize)  { return LZ4_COMPRESSBOUND(isize); }
+int LZ4_sizeofState() { return LZ4_STREAMSIZE; }
+
+
+
+/********************************
+*  Compression functions
+********************************/
+
+static U32 LZ4_hashSequence(U32 sequence, tableType_t const tableType)
+{
+    if (tableType == byU16)
+        return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1)));
+    else
+        return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG));
+}
+
+static const U64 prime5bytes = 889523592379ULL;
+static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType)
+{
+    const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG;
+    const U32 hashMask = (1<<hashLog) - 1;
+    return ((sequence * prime5bytes) >> (40 - hashLog)) & hashMask;
+}
+
+static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType)
+{
+    if (LZ4_64bits())
+        return LZ4_hashSequence64(sequence, tableType);
+    return LZ4_hashSequence((U32)sequence, tableType);
+}
+
+static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); }
+
+static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase)
+{
+    switch (tableType)
+    {
+    case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; }
+    case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; }
+    case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; }
+    }
+}
+
+static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    U32 h = LZ4_hashPosition(p, tableType);
+    LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase);
+}
+
+static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; }
+    if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; }
+    { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; }   /* default, to ensure a return */
+}
+
+static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    U32 h = LZ4_hashPosition(p, tableType);
+    return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase);
+}
+
+FORCE_INLINE int LZ4_compress_generic(
+                 void* const ctx,
+                 const char* const source,
+                 char* const dest,
+                 const int inputSize,
+                 const int maxOutputSize,
+                 const limitedOutput_directive outputLimited,
+                 const tableType_t tableType,
+                 const dict_directive dict,
+                 const dictIssue_directive dictIssue,
+                 const U32 acceleration)
+{
+    LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx;
+
+    const BYTE* ip = (const BYTE*) source;
+    const BYTE* base;
+    const BYTE* lowLimit;
+    const BYTE* const lowRefLimit = ip - dictPtr->dictSize;
+    const BYTE* const dictionary = dictPtr->dictionary;
+    const BYTE* const dictEnd = dictionary + dictPtr->dictSize;
+    const size_t dictDelta = dictEnd - (const BYTE*)source;
+    const BYTE* anchor = (const BYTE*) source;
+    const BYTE* const iend = ip + inputSize;
+    const BYTE* const mflimit = iend - MFLIMIT;
+    const BYTE* const matchlimit = iend - LASTLITERALS;
+
+    BYTE* op = (BYTE*) dest;
+    BYTE* const olimit = op + maxOutputSize;
+
+    U32 forwardH;
+    size_t refDelta=0;
+
+    /* Init conditions */
+    if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0;   /* Unsupported input size, too large (or negative) */
+    switch(dict)
+    {
+    case noDict:
+    default:
+        base = (const BYTE*)source;
+        lowLimit = (const BYTE*)source;
+        break;
+    case withPrefix64k:
+        base = (const BYTE*)source - dictPtr->currentOffset;
+        lowLimit = (const BYTE*)source - dictPtr->dictSize;
+        break;
+    case usingExtDict:
+        base = (const BYTE*)source - dictPtr->currentOffset;
+        lowLimit = (const BYTE*)source;
+        break;
+    }
+    if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0;   /* Size too large (not within 64K limit) */
+    if (inputSize<LZ4_minLength) goto _last_literals;                  /* Input too small, no compression (all literals) */
+
+    /* First Byte */
+    LZ4_putPosition(ip, ctx, tableType, base);
+    ip++; forwardH = LZ4_hashPosition(ip, tableType);
+
+    /* Main Loop */
+    for ( ; ; )
+    {
+        const BYTE* match;
+        BYTE* token;
+        {
+            const BYTE* forwardIp = ip;
+            unsigned step = 1;
+            unsigned searchMatchNb = acceleration << LZ4_skipTrigger;
+
+            /* Find a match */
+            do {
+                U32 h = forwardH;
+                ip = forwardIp;
+                forwardIp += step;
+                step = (searchMatchNb++ >> LZ4_skipTrigger);
+
+                if (unlikely(forwardIp > mflimit)) goto _last_literals;
+
+                match = LZ4_getPositionOnHash(h, ctx, tableType, base);
+                if (dict==usingExtDict)
+                {
+                    if (match<(const BYTE*)source)
+                    {
+                        refDelta = dictDelta;
+                        lowLimit = dictionary;
+                    }
+                    else
+                    {
+                        refDelta = 0;
+                        lowLimit = (const BYTE*)source;
+                    }
+                }
+                forwardH = LZ4_hashPosition(forwardIp, tableType);
+                LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
+
+            } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0)
+                || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip))
+                || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) );
+        }
+
+        /* Catch up */
+        while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; }
+
+        {
+            /* Encode Literal length */
+            unsigned litLength = (unsigned)(ip - anchor);
+            token = op++;
+            if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)))
+                return 0;   /* Check output limit */
+            if (litLength>=RUN_MASK)
+            {
+                int len = (int)litLength-RUN_MASK;
+                *token=(RUN_MASK<<ML_BITS);
+                for(; len >= 255 ; len-=255) *op++ = 255;
+                *op++ = (BYTE)len;
+            }
+            else *token = (BYTE)(litLength<<ML_BITS);
+
+            /* Copy Literals */
+            LZ4_wildCopy(op, anchor, op+litLength);
+            op+=litLength;
+        }
+
+_next_match:
+        /* Encode Offset */
+        LZ4_writeLE16(op, (U16)(ip-match)); op+=2;
+
+        /* Encode MatchLength */
+        {
+            unsigned matchLength;
+
+            if ((dict==usingExtDict) && (lowLimit==dictionary))
+            {
+                const BYTE* limit;
+                match += refDelta;
+                limit = ip + (dictEnd-match);
+                if (limit > matchlimit) limit = matchlimit;
+                matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, limit);
+                ip += MINMATCH + matchLength;
+                if (ip==limit)
+                {
+                    unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit);
+                    matchLength += more;
+                    ip += more;
+                }
+            }
+            else
+            {
+                matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);
+                ip += MINMATCH + matchLength;
+            }
+
+            if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit)))
+                return 0;    /* Check output limit */
+            if (matchLength>=ML_MASK)
+            {
+                *token += ML_MASK;
+                matchLength -= ML_MASK;
+                for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; }
+                if (matchLength >= 255) { matchLength-=255; *op++ = 255; }
+                *op++ = (BYTE)matchLength;
+            }
+            else *token += (BYTE)(matchLength);
+        }
+
+        anchor = ip;
+
+        /* Test end of chunk */
+        if (ip > mflimit) break;
+
+        /* Fill table */
+        LZ4_putPosition(ip-2, ctx, tableType, base);
+
+        /* Test next position */
+        match = LZ4_getPosition(ip, ctx, tableType, base);
+        if (dict==usingExtDict)
+        {
+            if (match<(const BYTE*)source)
+            {
+                refDelta = dictDelta;
+                lowLimit = dictionary;
+            }
+            else
+            {
+                refDelta = 0;
+                lowLimit = (const BYTE*)source;
+            }
+        }
+        LZ4_putPosition(ip, ctx, tableType, base);
+        if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1)
+            && (match+MAX_DISTANCE>=ip)
+            && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) )
+        { token=op++; *token=0; goto _next_match; }
+
+        /* Prepare next loop */
+        forwardH = LZ4_hashPosition(++ip, tableType);
+    }
+
+_last_literals:
+    /* Encode Last Literals */
+    {
+        const size_t lastRun = (size_t)(iend - anchor);
+        if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize))
+            return 0;   /* Check output limit */
+        if (lastRun >= RUN_MASK)
+        {
+            size_t accumulator = lastRun - RUN_MASK;
+            *op++ = RUN_MASK << ML_BITS;
+            for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
+            *op++ = (BYTE) accumulator;
+        }
+        else
+        {
+            *op++ = (BYTE)(lastRun<<ML_BITS);
+        }
+        memcpy(op, anchor, lastRun);
+        op += lastRun;
+    }
+
+    /* End */
+    return (int) (((char*)op)-dest);
+}
+
+
+int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+    LZ4_resetStream((LZ4_stream_t*)state);
+    if (acceleration < 1) acceleration = ACCELERATION_DEFAULT;
+
+    if (maxOutputSize >= LZ4_compressBound(inputSize))
+    {
+        if (inputSize < LZ4_64Klimit)
+            return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16,                        noDict, noDictIssue, acceleration);
+        else
+            return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
+    }
+    else
+    {
+        if (inputSize < LZ4_64Klimit)
+            return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16,                        noDict, noDictIssue, acceleration);
+        else
+            return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
+    }
+}
+
+
+int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+#if (HEAPMODE)
+    void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t));   /* malloc-calloc always properly aligned */
+#else
+    LZ4_stream_t ctx;
+    void* ctxPtr = &ctx;
+#endif
+
+    int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration);
+
+#if (HEAPMODE)
+    FREEMEM(ctxPtr);
+#endif
+    return result;
+}
+
+
+int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+    return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1);
+}
+
+
+/* hidden debug function */
+/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */
+int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+    LZ4_stream_t ctx;
+
+    LZ4_resetStream(&ctx);
+
+    if (inputSize < LZ4_64Klimit)
+        return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16,                        noDict, noDictIssue, acceleration);
+    else
+        return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
+}
+
+
+/********************************
+*  destSize variant
+********************************/
+
+static int LZ4_compress_destSize_generic(
+                       void* const ctx,
+                 const char* const src,
+                       char* const dst,
+                       int*  const srcSizePtr,
+                 const int targetDstSize,
+                 const tableType_t tableType)
+{
+    const BYTE* ip = (const BYTE*) src;
+    const BYTE* base = (const BYTE*) src;
+    const BYTE* lowLimit = (const BYTE*) src;
+    const BYTE* anchor = ip;
+    const BYTE* const iend = ip + *srcSizePtr;
+    const BYTE* const mflimit = iend - MFLIMIT;
+    const BYTE* const matchlimit = iend - LASTLITERALS;
+
+    BYTE* op = (BYTE*) dst;
+    BYTE* const oend = op + targetDstSize;
+    BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */;
+    BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */);
+    BYTE* const oMaxSeq = oMaxLit - 1 /* token */;
+
+    U32 forwardH;
+
+
+    /* Init conditions */
+    if (targetDstSize < 1) return 0;                                     /* Impossible to store anything */
+    if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0;            /* Unsupported input size, too large (or negative) */
+    if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0;   /* Size too large (not within 64K limit) */
+    if (*srcSizePtr<LZ4_minLength) goto _last_literals;                  /* Input too small, no compression (all literals) */
+
+    /* First Byte */
+    *srcSizePtr = 0;
+    LZ4_putPosition(ip, ctx, tableType, base);
+    ip++; forwardH = LZ4_hashPosition(ip, tableType);
+
+    /* Main Loop */
+    for ( ; ; )
+    {
+        const BYTE* match;
+        BYTE* token;
+        {
+            const BYTE* forwardIp = ip;
+            unsigned step = 1;
+            unsigned searchMatchNb = 1 << LZ4_skipTrigger;
+
+            /* Find a match */
+            do {
+                U32 h = forwardH;
+                ip = forwardIp;
+                forwardIp += step;
+                step = (searchMatchNb++ >> LZ4_skipTrigger);
+
+                if (unlikely(forwardIp > mflimit))
+                    goto _last_literals;
+
+                match = LZ4_getPositionOnHash(h, ctx, tableType, base);
+                forwardH = LZ4_hashPosition(forwardIp, tableType);
+                LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
+
+            } while ( ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip))
+                || (LZ4_read32(match) != LZ4_read32(ip)) );
+        }
+
+        /* Catch up */
+        while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; }
+
+        {
+            /* Encode Literal length */
+            unsigned litLength = (unsigned)(ip - anchor);
+            token = op++;
+            if (op + ((litLength+240)/255) + litLength > oMaxLit)
+            {
+                /* Not enough space for a last match */
+                op--;
+                goto _last_literals;
+            }
+            if (litLength>=RUN_MASK)
+            {
+                unsigned len = litLength - RUN_MASK;
+                *token=(RUN_MASK<<ML_BITS);
+                for(; len >= 255 ; len-=255) *op++ = 255;
+                *op++ = (BYTE)len;
+            }
+            else *token = (BYTE)(litLength<<ML_BITS);
+
+            /* Copy Literals */
+            LZ4_wildCopy(op, anchor, op+litLength);
+            op += litLength;
+        }
+
+_next_match:
+        /* Encode Offset */
+        LZ4_writeLE16(op, (U16)(ip-match)); op+=2;
+
+        /* Encode MatchLength */
+        {
+            size_t matchLength;
+
+            matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);
+
+            if (op + ((matchLength+240)/255) > oMaxMatch)
+            {
+                /* Match description too long : reduce it */
+                matchLength = (15-1) + (oMaxMatch-op) * 255;
+            }
+            //printf("offset %5i, matchLength%5i \n", (int)(ip-match), matchLength + MINMATCH);
+            ip += MINMATCH + matchLength;
+
+            if (matchLength>=ML_MASK)
+            {
+                *token += ML_MASK;
+                matchLength -= ML_MASK;
+                while (matchLength >= 255) { matchLength-=255; *op++ = 255; }
+                *op++ = (BYTE)matchLength;
+            }
+            else *token += (BYTE)(matchLength);
+        }
+
+        anchor = ip;
+
+        /* Test end of block */
+        if (ip > mflimit) break;
+        if (op > oMaxSeq) break;
+
+        /* Fill table */
+        LZ4_putPosition(ip-2, ctx, tableType, base);
+
+        /* Test next position */
+        match = LZ4_getPosition(ip, ctx, tableType, base);
+        LZ4_putPosition(ip, ctx, tableType, base);
+        if ( (match+MAX_DISTANCE>=ip)
+            && (LZ4_read32(match)==LZ4_read32(ip)) )
+        { token=op++; *token=0; goto _next_match; }
+
+        /* Prepare next loop */
+        forwardH = LZ4_hashPosition(++ip, tableType);
+    }
+
+_last_literals:
+    /* Encode Last Literals */
+    {
+        size_t lastRunSize = (size_t)(iend - anchor);
+        if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend)
+        {
+            /* adapt lastRunSize to fill 'dst' */
+            lastRunSize  = (oend-op) - 1;
+            lastRunSize -= (lastRunSize+240)/255;
+        }
+        ip = anchor + lastRunSize;
+
+        if (lastRunSize >= RUN_MASK)
+        {
+            size_t accumulator = lastRunSize - RUN_MASK;
+            *op++ = RUN_MASK << ML_BITS;
+            for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
+            *op++ = (BYTE) accumulator;
+        }
+        else
+        {
+            *op++ = (BYTE)(lastRunSize<<ML_BITS);
+        }
+        memcpy(op, anchor, lastRunSize);
+        op += lastRunSize;
+    }
+
+    /* End */
+    *srcSizePtr = (int) (((const char*)ip)-src);
+    return (int) (((char*)op)-dst);
+}
+
+
+static int LZ4_compress_destSize_extState (void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize)
+{
+    LZ4_resetStream((LZ4_stream_t*)state);
+
+    if (targetDstSize >= LZ4_compressBound(*srcSizePtr))   /* compression success is guaranteed */
+    {
+        return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1);
+    }
+    else
+    {
+        if (*srcSizePtr < LZ4_64Klimit)
+            return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16);
+        else
+            return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? byU32 : byPtr);
+    }
+}
+
+
+int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize)
+{
+#if (HEAPMODE)
+    void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t));   /* malloc-calloc always properly aligned */
+#else
+    LZ4_stream_t ctxBody;
+    void* ctx = &ctxBody;
+#endif
+
+    int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize);
+
+#if (HEAPMODE)
+    FREEMEM(ctx);
+#endif
+    return result;
+}
+
+
+
+/********************************
+*  Streaming functions
+********************************/
+
+LZ4_stream_t* LZ4_createStream(void)
+{
+    LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64);
+    LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal));    /* A compilation error here means LZ4_STREAMSIZE is not large enough */
+    LZ4_resetStream(lz4s);
+    return lz4s;
+}
+
+void LZ4_resetStream (LZ4_stream_t* LZ4_stream)
+{
+    MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t));
+}
+
+int LZ4_freeStream (LZ4_stream_t* LZ4_stream)
+{
+    FREEMEM(LZ4_stream);
+    return (0);
+}
+
+
+#define HASH_UNIT sizeof(size_t)
+int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize)
+{
+    LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
+    const BYTE* p = (const BYTE*)dictionary;
+    const BYTE* const dictEnd = p + dictSize;
+    const BYTE* base;
+
+    if ((dict->initCheck) || (dict->currentOffset > 1 GB))  /* Uninitialized structure, or reuse overflow */
+        LZ4_resetStream(LZ4_dict);
+
+    if (dictSize < (int)HASH_UNIT)
+    {
+        dict->dictionary = NULL;
+        dict->dictSize = 0;
+        return 0;
+    }
+
+    if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB;
+    dict->currentOffset += 64 KB;
+    base = p - dict->currentOffset;
+    dict->dictionary = p;
+    dict->dictSize = (U32)(dictEnd - p);
+    dict->currentOffset += dict->dictSize;
+
+    while (p <= dictEnd-HASH_UNIT)
+    {
+        LZ4_putPosition(p, dict->hashTable, byU32, base);
+        p+=3;
+    }
+
+    return dict->dictSize;
+}
+
+
+static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src)
+{
+    if ((LZ4_dict->currentOffset > 0x80000000) ||
+        ((size_t)LZ4_dict->currentOffset > (size_t)src))   /* address space overflow */
+    {
+        /* rescale hash table */
+        U32 delta = LZ4_dict->currentOffset - 64 KB;
+        const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize;
+        int i;
+        for (i=0; i<HASH_SIZE_U32; i++)
+        {
+            if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0;
+            else LZ4_dict->hashTable[i] -= delta;
+        }
+        LZ4_dict->currentOffset = 64 KB;
+        if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB;
+        LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize;
+    }
+}
+
+
+int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+    LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream;
+    const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
+
+    const BYTE* smallest = (const BYTE*) source;
+    if (streamPtr->initCheck) return 0;   /* Uninitialized structure detected */
+    if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd;
+    LZ4_renormDictT(streamPtr, smallest);
+    if (acceleration < 1) acceleration = ACCELERATION_DEFAULT;
+
+    /* Check overlapping input/dictionary space */
+    {
+        const BYTE* sourceEnd = (const BYTE*) source + inputSize;
+        if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd))
+        {
+            streamPtr->dictSize = (U32)(dictEnd - sourceEnd);
+            if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB;
+            if (streamPtr->dictSize < 4) streamPtr->dictSize = 0;
+            streamPtr->dictionary = dictEnd - streamPtr->dictSize;
+        }
+    }
+
+    /* prefix mode : source data follows dictionary */
+    if (dictEnd == (const BYTE*)source)
+    {
+        int result;
+        if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration);
+        else
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration);
+        streamPtr->dictSize += (U32)inputSize;
+        streamPtr->currentOffset += (U32)inputSize;
+        return result;
+    }
+
+    /* external dictionary mode */
+    {
+        int result;
+        if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration);
+        else
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration);
+        streamPtr->dictionary = (const BYTE*)source;
+        streamPtr->dictSize = (U32)inputSize;
+        streamPtr->currentOffset += (U32)inputSize;
+        return result;
+    }
+}
+
+
+/* Hidden debug function, to force external dictionary mode */
+int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize)
+{
+    LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict;
+    int result;
+    const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
+
+    const BYTE* smallest = dictEnd;
+    if (smallest > (const BYTE*) source) smallest = (const BYTE*) source;
+    LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest);
+
+    result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue, 1);
+
+    streamPtr->dictionary = (const BYTE*)source;
+    streamPtr->dictSize = (U32)inputSize;
+    streamPtr->currentOffset += (U32)inputSize;
+
+    return result;
+}
+
+
+int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize)
+{
+    LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
+    const BYTE* previousDictEnd = dict->dictionary + dict->dictSize;
+
+    if ((U32)dictSize > 64 KB) dictSize = 64 KB;   /* useless to define a dictionary > 64 KB */
+    if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize;
+
+    memmove(safeBuffer, previousDictEnd - dictSize, dictSize);
+
+    dict->dictionary = (const BYTE*)safeBuffer;
+    dict->dictSize = (U32)dictSize;
+
+    return dictSize;
+}
+
+
+
+/*******************************
+*  Decompression functions
+*******************************/
+/*
+ * This generic decompression function cover all use cases.
+ * It shall be instantiated several times, using different sets of directives
+ * Note that it is essential this generic function is really inlined,
+ * in order to remove useless branches during compilation optimization.
+ */
+FORCE_INLINE int LZ4_decompress_generic(
+                 const char* const source,
+                 char* const dest,
+                 int inputSize,
+                 int outputSize,         /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. */
+
+                 int endOnInput,         /* endOnOutputSize, endOnInputSize */
+                 int partialDecoding,    /* full, partial */
+                 int targetOutputSize,   /* only used if partialDecoding==partial */
+                 int dict,               /* noDict, withPrefix64k, usingExtDict */
+                 const BYTE* const lowPrefix,  /* == dest if dict == noDict */
+                 const BYTE* const dictStart,  /* only if dict==usingExtDict */
+                 const size_t dictSize         /* note : = 0 if noDict */
+                 )
+{
+    /* Local Variables */
+    const BYTE* ip = (const BYTE*) source;
+    const BYTE* const iend = ip + inputSize;
+
+    BYTE* op = (BYTE*) dest;
+    BYTE* const oend = op + outputSize;
+    BYTE* cpy;
+    BYTE* oexit = op + targetOutputSize;
+    const BYTE* const lowLimit = lowPrefix - dictSize;
+
+    const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize;
+    const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};
+    const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+
+    const int safeDecode = (endOnInput==endOnInputSize);
+    const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
+
+
+    /* Special cases */
+    if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT;                         /* targetOutputSize too high => decode everything */
+    if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1;  /* Empty output buffer */
+    if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1);
+
+
+    /* Main Loop */
+    while (1)
+    {
+        unsigned token;
+        size_t length;
+        const BYTE* match;
+
+        /* get literal length */
+        token = *ip++;
+        if ((length=(token>>ML_BITS)) == RUN_MASK)
+        {
+            unsigned s;
+            do
+            {
+                s = *ip++;
+                length += s;
+            }
+            while (likely((endOnInput)?ip<iend-RUN_MASK:1) && (s==255));
+            if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)(op))) goto _output_error;   /* overflow detection */
+            if ((safeDecode) && unlikely((size_t)(ip+length)<(size_t)(ip))) goto _output_error;   /* overflow detection */
+        }
+
+        /* copy literals */
+        cpy = op+length;
+        if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
+            || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
+        {
+            if (partialDecoding)
+            {
+                if (cpy > oend) goto _output_error;                           /* Error : write attempt beyond end of output buffer */
+                if ((endOnInput) && (ip+length > iend)) goto _output_error;   /* Error : read attempt beyond end of input buffer */
+            }
+            else
+            {
+                if ((!endOnInput) && (cpy != oend)) goto _output_error;       /* Error : block decoding must stop exactly there */
+                if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error;   /* Error : input must be consumed */
+            }
+            memcpy(op, ip, length);
+            ip += length;
+            op += length;
+            break;     /* Necessarily EOF, due to parsing restrictions */
+        }
+        LZ4_wildCopy(op, ip, cpy);
+        ip += length; op = cpy;
+
+        /* get offset */
+        match = cpy - LZ4_readLE16(ip); ip+=2;
+        if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error;   /* Error : offset outside destination buffer */
+
+        /* get matchlength */
+        length = token & ML_MASK;
+        if (length == ML_MASK)
+        {
+            unsigned s;
+            do
+            {
+                if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error;
+                s = *ip++;
+                length += s;
+            } while (s==255);
+            if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error;   /* overflow detection */
+        }
+        length += MINMATCH;
+
+        /* check external dictionary */
+        if ((dict==usingExtDict) && (match < lowPrefix))
+        {
+            if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error;   /* doesn't respect parsing restriction */
+
+            if (length <= (size_t)(lowPrefix-match))
+            {
+                /* match can be copied as a single segment from external dictionary */
+                match = dictEnd - (lowPrefix-match);
+                memmove(op, match, length); op += length;
+            }
+            else
+            {
+                /* match encompass external dictionary and current segment */
+                size_t copySize = (size_t)(lowPrefix-match);
+                memcpy(op, dictEnd - copySize, copySize);
+                op += copySize;
+                copySize = length - copySize;
+                if (copySize > (size_t)(op-lowPrefix))   /* overlap within current segment */
+                {
+                    BYTE* const endOfMatch = op + copySize;
+                    const BYTE* copyFrom = lowPrefix;
+                    while (op < endOfMatch) *op++ = *copyFrom++;
+                }
+                else
+                {
+                    memcpy(op, lowPrefix, copySize);
+                    op += copySize;
+                }
+            }
+            continue;
+        }
+
+        /* copy repeated sequence */
+        cpy = op + length;
+        if (unlikely((op-match)<8))
+        {
+            const size_t dec64 = dec64table[op-match];
+            op[0] = match[0];
+            op[1] = match[1];
+            op[2] = match[2];
+            op[3] = match[3];
+            match += dec32table[op-match];
+            LZ4_copy4(op+4, match);
+            op += 8; match -= dec64;
+        } else { LZ4_copy8(op, match); op+=8; match+=8; }
+
+        if (unlikely(cpy>oend-12))
+        {
+            if (cpy > oend-LASTLITERALS) goto _output_error;    /* Error : last LASTLITERALS bytes must be literals */
+            if (op < oend-8)
+            {
+                LZ4_wildCopy(op, match, oend-8);
+                match += (oend-8) - op;
+                op = oend-8;
+            }
+            while (op<cpy) *op++ = *match++;
+        }
+        else
+            LZ4_wildCopy(op, match, cpy);
+        op=cpy;   /* correction */
+    }
+
+    /* end of decoding */
+    if (endOnInput)
+       return (int) (((char*)op)-dest);     /* Nb of output bytes decoded */
+    else
+       return (int) (((const char*)ip)-source);   /* Nb of input bytes read */
+
+    /* Overflow error detected */
+_output_error:
+    return (int) (-(((const char*)ip)-source))-1;
+}
+
+
+int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, full, 0, noDict, (BYTE*)dest, NULL, 0);
+}
+
+int LZ4_decompress_safe_partial(const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, partial, targetOutputSize, noDict, (BYTE*)dest, NULL, 0);
+}
+
+int LZ4_decompress_fast(const char* source, char* dest, int originalSize)
+{
+    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)(dest - 64 KB), NULL, 64 KB);
+}
+
+
+/* streaming decompression functions */
+
+typedef struct
+{
+    const BYTE* externalDict;
+    size_t extDictSize;
+    const BYTE* prefixEnd;
+    size_t prefixSize;
+} LZ4_streamDecode_t_internal;
+
+/*
+ * If you prefer dynamic allocation methods,
+ * LZ4_createStreamDecode()
+ * provides a pointer (void*) towards an initialized LZ4_streamDecode_t structure.
+ */
+LZ4_streamDecode_t* LZ4_createStreamDecode(void)
+{
+    LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOCATOR(1, sizeof(LZ4_streamDecode_t));
+    return lz4s;
+}
+
+int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream)
+{
+    FREEMEM(LZ4_stream);
+    return 0;
+}
+
+/*
+ * LZ4_setStreamDecode
+ * Use this function to instruct where to find the dictionary
+ * This function is not necessary if previous data is still available where it was decoded.
+ * Loading a size of 0 is allowed (same effect as no dictionary).
+ * Return : 1 if OK, 0 if error
+ */
+int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    lz4sd->prefixSize = (size_t) dictSize;
+    lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize;
+    lz4sd->externalDict = NULL;
+    lz4sd->extDictSize  = 0;
+    return 1;
+}
+
+/*
+*_continue() :
+    These decoding functions allow decompression of multiple blocks in "streaming" mode.
+    Previously decoded blocks must still be available at the memory position where they were decoded.
+    If it's not possible, save the relevant part of decoded data into a safe buffer,
+    and indicate where it stands using LZ4_setStreamDecode()
+*/
+int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    int result;
+
+    if (lz4sd->prefixEnd == (BYTE*)dest)
+    {
+        result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
+                                        endOnInputSize, full, 0,
+                                        usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize += result;
+        lz4sd->prefixEnd  += result;
+    }
+    else
+    {
+        lz4sd->extDictSize = lz4sd->prefixSize;
+        lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize;
+        result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
+                                        endOnInputSize, full, 0,
+                                        usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize = result;
+        lz4sd->prefixEnd  = (BYTE*)dest + result;
+    }
+
+    return result;
+}
+
+int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    int result;
+
+    if (lz4sd->prefixEnd == (BYTE*)dest)
+    {
+        result = LZ4_decompress_generic(source, dest, 0, originalSize,
+                                        endOnOutputSize, full, 0,
+                                        usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize += originalSize;
+        lz4sd->prefixEnd  += originalSize;
+    }
+    else
+    {
+        lz4sd->extDictSize = lz4sd->prefixSize;
+        lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize;
+        result = LZ4_decompress_generic(source, dest, 0, originalSize,
+                                        endOnOutputSize, full, 0,
+                                        usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize = originalSize;
+        lz4sd->prefixEnd  = (BYTE*)dest + originalSize;
+    }
+
+    return result;
+}
+
+
+/*
+Advanced decoding functions :
+*_usingDict() :
+    These decoding functions work the same as "_continue" ones,
+    the dictionary must be explicitly provided within parameters
+*/
+
+FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize)
+{
+    if (dictSize==0)
+        return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0);
+    if (dictStart+dictSize == dest)
+    {
+        if (dictSize >= (int)(64 KB - 1))
+            return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0);
+        return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0);
+    }
+    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
+}
+
+int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
+{
+    return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize);
+}
+
+int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize)
+{
+    return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize);
+}
+
+/* debug function */
+int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
+}
+
+
+/***************************************************
+*  Obsolete Functions
+***************************************************/
+/* obsolete compression functions */
+int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); }
+int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); }
+int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); }
+int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); }
+int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); }
+int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); }
+
+/*
+These function names are deprecated and should no longer be used.
+They are only provided here for compatibility with older user programs.
+- LZ4_uncompress is totally equivalent to LZ4_decompress_fast
+- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe
+*/
+int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); }
+int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
+
+
+/* Obsolete Streaming functions */
+
+int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; }
+
+static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base)
+{
+    MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE);
+    lz4ds->bufferStart = base;
+}
+
+int LZ4_resetStreamState(void* state, char* inputBuffer)
+{
+    if ((((size_t)state) & 3) != 0) return 1;   /* Error : pointer is not aligned on 4-bytes boundary */
+    LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer);
+    return 0;
+}
+
+void* LZ4_create (char* inputBuffer)
+{
+    void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64);
+    LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer);
+    return lz4ds;
+}
+
+char* LZ4_slideInputBuffer (void* LZ4_Data)
+{
+    LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data;
+    int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB);
+    return (char*)(ctx->bufferStart + dictSize);
+}
+
+/* Obsolete streaming decompression functions */
+
+int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB);
+}
+
+int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize)
+{
+    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB);
+}
+
+#endif   /* LZ4_COMMONDEFS_ONLY */
+
diff --git a/src/leveldb/util/lz4.h b/src/leveldb/util/lz4.h
new file mode 100644
index 000000000..99c6ebb03
--- /dev/null
+++ b/src/leveldb/util/lz4.h
@@ -0,0 +1,360 @@
+/*
+   LZ4 - Fast LZ compression algorithm
+   Header File
+   Copyright (C) 2011-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - LZ4 source repository : https://github.com/Cyan4973/lz4
+   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * lz4.h provides block compression functions, and gives full buffer control to programmer.
+ * If you need to generate inter-operable compressed data (respecting LZ4 frame specification),
+ * and can let the library handle its own memory, please use lz4frame.h instead.
+*/
+
+/**************************************
+*  Version
+**************************************/
+#define LZ4_VERSION_MAJOR    1    /* for breaking interface changes  */
+#define LZ4_VERSION_MINOR    7    /* for new (non-breaking) interface capabilities */
+#define LZ4_VERSION_RELEASE  0    /* for tweaks, bug-fixes, or development */
+#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
+int LZ4_versionNumber (void);
+
+/**************************************
+*  Tuning parameter
+**************************************/
+/*
+ * LZ4_MEMORY_USAGE :
+ * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+ * Increasing memory usage improves compression ratio
+ * Reduced memory usage can improve speed, due to cache effect
+ * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
+ */
+#define LZ4_MEMORY_USAGE 14
+
+
+/**************************************
+*  Simple Functions
+**************************************/
+
+int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize);
+int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize);
+
+/*
+LZ4_compress_default() :
+    Compresses 'sourceSize' bytes from buffer 'source'
+    into already allocated 'dest' buffer of size 'maxDestSize'.
+    Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize).
+    It also runs faster, so it's a recommended setting.
+    If the function cannot compress 'source' into a more limited 'dest' budget,
+    compression stops *immediately*, and the function result is zero.
+    As a consequence, 'dest' content is not valid.
+    This function never writes outside 'dest' buffer, nor read outside 'source' buffer.
+        sourceSize  : Max supported value is LZ4_MAX_INPUT_VALUE
+        maxDestSize : full or partial size of buffer 'dest' (which must be already allocated)
+        return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize)
+              or 0 if compression fails
+
+LZ4_decompress_safe() :
+    compressedSize : is the precise full size of the compressed block.
+    maxDecompressedSize : is the size of destination buffer, which must be already allocated.
+    return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize)
+             If destination buffer is not large enough, decoding will stop and output an error code (<0).
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             This function is protected against buffer overflow exploits, including malicious data packets.
+             It never writes outside output buffer, nor reads outside input buffer.
+*/
+
+
+/**************************************
+*  Advanced Functions
+**************************************/
+#define LZ4_MAX_INPUT_SIZE        0x7E000000   /* 2 113 929 216 bytes */
+#define LZ4_COMPRESSBOUND(isize)  ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
+
+/*
+LZ4_compressBound() :
+    Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible)
+    This function is primarily useful for memory allocation purposes (destination buffer size).
+    Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example).
+    Note that LZ4_compress_default() compress faster when dest buffer size is >= LZ4_compressBound(srcSize)
+        inputSize  : max supported value is LZ4_MAX_INPUT_SIZE
+        return : maximum output size in a "worst case" scenario
+              or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE)
+*/
+int LZ4_compressBound(int inputSize);
+
+/*
+LZ4_compress_fast() :
+    Same as LZ4_compress_default(), but allows to select an "acceleration" factor.
+    The larger the acceleration value, the faster the algorithm, but also the lesser the compression.
+    It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed.
+    An acceleration value of "1" is the same as regular LZ4_compress_default()
+    Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1.
+*/
+int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration);
+
+
+/*
+LZ4_compress_fast_extState() :
+    Same compression function, just using an externally allocated memory space to store compression state.
+    Use LZ4_sizeofState() to know how much memory must be allocated,
+    and allocate it on 8-bytes boundaries (using malloc() typically).
+    Then, provide it as 'void* state' to compression function.
+*/
+int LZ4_sizeofState(void);
+int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration);
+
+
+/*
+LZ4_compress_destSize() :
+    Reverse the logic, by compressing as much data as possible from 'source' buffer
+    into already allocated buffer 'dest' of size 'targetDestSize'.
+    This function either compresses the entire 'source' content into 'dest' if it's large enough,
+    or fill 'dest' buffer completely with as much data as possible from 'source'.
+        *sourceSizePtr : will be modified to indicate how many bytes where read from 'source' to fill 'dest'.
+                         New value is necessarily <= old value.
+        return : Nb bytes written into 'dest' (necessarily <= targetDestSize)
+              or 0 if compression fails
+*/
+int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize);
+
+
+/*
+LZ4_decompress_fast() :
+    originalSize : is the original and therefore uncompressed size
+    return : the number of bytes read from the source buffer (in other words, the compressed size)
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes.
+    note : This function fully respect memory boundaries for properly formed compressed data.
+           It is a bit faster than LZ4_decompress_safe().
+           However, it does not provide any protection against intentionally modified data stream (malicious input).
+           Use this function in trusted environment only (data to decode comes from a trusted source).
+*/
+int LZ4_decompress_fast (const char* source, char* dest, int originalSize);
+
+/*
+LZ4_decompress_safe_partial() :
+    This function decompress a compressed block of size 'compressedSize' at position 'source'
+    into destination buffer 'dest' of size 'maxDecompressedSize'.
+    The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached,
+    reducing decompression time.
+    return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize)
+       Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller.
+             Always control how many bytes were decoded.
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets
+*/
+int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize);
+
+
+/***********************************************
+*  Streaming Compression Functions
+***********************************************/
+#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4)
+#define LZ4_STREAMSIZE     (LZ4_STREAMSIZE_U64 * sizeof(long long))
+/*
+ * LZ4_stream_t
+ * information structure to track an LZ4 stream.
+ * important : init this structure content before first use !
+ * note : only allocated directly the structure if you are statically linking LZ4
+ *        If you are using liblz4 as a DLL, please use below construction methods instead.
+ */
+typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t;
+
+/*
+ * LZ4_resetStream
+ * Use this function to init an allocated LZ4_stream_t structure
+ */
+void LZ4_resetStream (LZ4_stream_t* streamPtr);
+
+/*
+ * LZ4_createStream will allocate and initialize an LZ4_stream_t structure
+ * LZ4_freeStream releases its memory.
+ * In the context of a DLL (liblz4), please use these methods rather than the static struct.
+ * They are more future proof, in case of a change of LZ4_stream_t size.
+ */
+LZ4_stream_t* LZ4_createStream(void);
+int           LZ4_freeStream (LZ4_stream_t* streamPtr);
+
+/*
+ * LZ4_loadDict
+ * Use this function to load a static dictionary into LZ4_stream.
+ * Any previous data will be forgotten, only 'dictionary' will remain in memory.
+ * Loading a size of 0 is allowed.
+ * Return : dictionary size, in bytes (necessarily <= 64 KB)
+ */
+int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
+
+/*
+ * LZ4_compress_fast_continue
+ * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio.
+ * Important : Previous data blocks are assumed to still be present and unmodified !
+ * 'dst' buffer must be already allocated.
+ * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster.
+ * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero.
+ */
+int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration);
+
+/*
+ * LZ4_saveDict
+ * If previously compressed data block is not guaranteed to remain available at its memory location
+ * save it into a safer place (char* safeBuffer)
+ * Note : you don't need to call LZ4_loadDict() afterwards,
+ *        dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue()
+ * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error
+ */
+int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize);
+
+
+/************************************************
+*  Streaming Decompression Functions
+************************************************/
+
+#define LZ4_STREAMDECODESIZE_U64  4
+#define LZ4_STREAMDECODESIZE     (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long))
+typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t;
+/*
+ * LZ4_streamDecode_t
+ * information structure to track an LZ4 stream.
+ * init this structure content using LZ4_setStreamDecode or memset() before first use !
+ *
+ * In the context of a DLL (liblz4) please prefer usage of construction methods below.
+ * They are more future proof, in case of a change of LZ4_streamDecode_t size in the future.
+ * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure
+ * LZ4_freeStreamDecode releases its memory.
+ */
+LZ4_streamDecode_t* LZ4_createStreamDecode(void);
+int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
+
+/*
+ * LZ4_setStreamDecode
+ * Use this function to instruct where to find the dictionary.
+ * Setting a size of 0 is allowed (same effect as reset).
+ * Return : 1 if OK, 0 if error
+ */
+int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
+
+/*
+*_continue() :
+    These decoding functions allow decompression of multiple blocks in "streaming" mode.
+    Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB)
+    In the case of a ring buffers, decoding buffer must be either :
+    - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions)
+      In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB).
+    - Larger than encoding buffer, by a minimum of maxBlockSize more bytes.
+      maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block.
+      In which case, encoding and decoding buffers do not need to be synchronized,
+      and encoding ring buffer can have any size, including small ones ( < 64 KB).
+    - _At least_ 64 KB + 8 bytes + maxBlockSize.
+      In which case, encoding and decoding buffers do not need to be synchronized,
+      and encoding ring buffer can have any size, including larger than decoding buffer.
+    Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer,
+    and indicate where it is saved using LZ4_setStreamDecode()
+*/
+int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize);
+int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize);
+
+
+/*
+Advanced decoding functions :
+*_usingDict() :
+    These decoding functions work the same as
+    a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue()
+    They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure.
+*/
+int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize);
+int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize);
+
+
+
+/**************************************
+*  Obsolete Functions
+**************************************/
+/* Deprecate Warnings */
+/* Should these warnings messages be a problem,
+   it is generally possible to disable them,
+   with -Wno-deprecated-declarations for gcc
+   or _CRT_SECURE_NO_WARNINGS in Visual for example.
+   You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */
+#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK
+#  define LZ4_DEPRECATE_WARNING_DEFBLOCK
+#  define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#  if (LZ4_GCC_VERSION >= 405) || defined(__clang__)
+#    define LZ4_DEPRECATED(message) __attribute__((deprecated(message)))
+#  elif (LZ4_GCC_VERSION >= 301)
+#    define LZ4_DEPRECATED(message) __attribute__((deprecated))
+#  elif defined(_MSC_VER)
+#    define LZ4_DEPRECATED(message) __declspec(deprecated(message))
+#  else
+#    pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler")
+#    define LZ4_DEPRECATED(message)
+#  endif
+#endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */
+
+/* Obsolete compression functions */
+/* These functions are planned to start generate warnings by r131 approximately */
+int LZ4_compress               (const char* source, char* dest, int sourceSize);
+int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize);
+int LZ4_compress_withState               (void* state, const char* source, char* dest, int inputSize);
+int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
+int LZ4_compress_continue                (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize);
+int LZ4_compress_limitedOutput_continue  (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/* Obsolete decompression functions */
+/* These function names are completely deprecated and must no longer be used.
+   They are only provided here for compatibility with older programs.
+    - LZ4_uncompress is the same as LZ4_decompress_fast
+    - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe
+   These function prototypes are now disabled; uncomment them only if you really need them.
+   It is highly recommended to stop using these prototypes and migrate to maintained ones */
+/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */
+/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */
+
+/* Obsolete streaming functions; use new streaming interface whenever possible */
+LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer);
+LZ4_DEPRECATED("use LZ4_createStream() instead") int   LZ4_sizeofStreamState(void);
+LZ4_DEPRECATED("use LZ4_resetStream() instead")  int   LZ4_resetStreamState(void* state, char* inputBuffer);
+LZ4_DEPRECATED("use LZ4_saveDict() instead")     char* LZ4_slideInputBuffer (void* state);
+
+/* Obsolete streaming decoding functions */
+LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize);
+LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/src/leveldb/util/murmurhash.cc b/src/leveldb/util/murmurhash.cc
new file mode 100644
index 000000000..2c650d8bd
--- /dev/null
+++ b/src/leveldb/util/murmurhash.cc
@@ -0,0 +1,178 @@
+/*
+  Murmurhash from http://sites.google.com/site/murmurhash/
+
+  All code is released to the public domain. For business purposes, Murmurhash is
+  under the MIT license.
+*/
+#include "murmurhash.h"
+
+#if defined(__x86_64__)
+
+// -------------------------------------------------------------------
+//
+// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
+// and endian-ness issues if used across multiple platforms.
+//
+// 64-bit hash for 64-bit platforms
+
+uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed )
+{
+    const uint64_t m = 0xc6a4a7935bd1e995;
+    const int r = 47;
+
+    uint64_t h = seed ^ (len * m);
+
+    const uint64_t * data = (const uint64_t *)key;
+    const uint64_t * end = data + (len/8);
+
+    while(data != end)
+    {
+        uint64_t k = *data++;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        h ^= k;
+        h *= m;
+    }
+
+    const unsigned char * data2 = (const unsigned char*)data;
+
+    switch(len & 7)
+    {
+    case 7: h ^= ((uint64_t)data2[6]) << 48;
+    case 6: h ^= ((uint64_t)data2[5]) << 40;
+    case 5: h ^= ((uint64_t)data2[4]) << 32;
+    case 4: h ^= ((uint64_t)data2[3]) << 24;
+    case 3: h ^= ((uint64_t)data2[2]) << 16;
+    case 2: h ^= ((uint64_t)data2[1]) << 8;
+    case 1: h ^= ((uint64_t)data2[0]);
+        h *= m;
+    };
+
+    h ^= h >> r;
+    h *= m;
+    h ^= h >> r;
+
+    return h;
+}
+
+#elif defined(__i386__)
+
+// -------------------------------------------------------------------
+//
+// Note - This code makes a few assumptions about how your machine behaves -
+//
+// 1. We can read a 4-byte value from any address without crashing
+// 2. sizeof(int) == 4
+//
+// And it has a few limitations -
+//
+// 1. It will not work incrementally.
+// 2. It will not produce the same results on little-endian and big-endian
+//    machines.
+
+unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
+{
+    // 'm' and 'r' are mixing constants generated offline.
+    // They're not really 'magic', they just happen to work well.
+
+    const unsigned int m = 0x5bd1e995;
+    const int r = 24;
+
+    // Initialize the hash to a 'random' value
+
+    unsigned int h = seed ^ len;
+
+    // Mix 4 bytes at a time into the hash
+
+    const unsigned char * data = (const unsigned char *)key;
+
+    while(len >= 4)
+    {
+        unsigned int k = *(unsigned int *)data;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        h *= m;
+        h ^= k;
+
+        data += 4;
+        len -= 4;
+    }
+
+    // Handle the last few bytes of the input array
+
+    switch(len)
+    {
+    case 3: h ^= data[2] << 16;
+    case 2: h ^= data[1] << 8;
+    case 1: h ^= data[0];
+        h *= m;
+    };
+
+    // Do a few final mixes of the hash to ensure the last few
+    // bytes are well-incorporated.
+
+    h ^= h >> 13;
+    h *= m;
+    h ^= h >> 15;
+
+    return h;
+}
+
+#else
+
+// -------------------------------------------------------------------
+//
+// Same as MurmurHash2, but endian- and alignment-neutral.
+// Half the speed though, alas.
+
+unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed )
+{
+    const unsigned int m = 0x5bd1e995;
+    const int r = 24;
+
+    unsigned int h = seed ^ len;
+
+    const unsigned char * data = (const unsigned char *)key;
+
+    while(len >= 4)
+    {
+        unsigned int k;
+
+        k  = data[0];
+        k |= data[1] << 8;
+        k |= data[2] << 16;
+        k |= data[3] << 24;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        h *= m;
+        h ^= k;
+
+        data += 4;
+        len -= 4;
+    }
+
+    switch(len)
+    {
+    case 3: h ^= data[2] << 16;
+    case 2: h ^= data[1] << 8;
+    case 1: h ^= data[0];
+        h *= m;
+    };
+
+    h ^= h >> 13;
+    h *= m;
+    h ^= h >> 15;
+
+    return h;
+}
+
+#endif
diff --git a/src/leveldb/util/murmurhash.h b/src/leveldb/util/murmurhash.h
new file mode 100644
index 000000000..1f476b664
--- /dev/null
+++ b/src/leveldb/util/murmurhash.h
@@ -0,0 +1,32 @@
+/*
+  Murmurhash from http://sites.google.com/site/murmurhash/
+
+  All code is released to the public domain. For business purposes, Murmurhash is
+  under the MIT license.
+*/
+#ifndef MURMURHASH_H
+#define MURMURHASH_H
+
+#include <stdint.h>
+
+#if defined(__x86_64__)
+#define MURMUR_HASH MurmurHash64A
+uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHash64A
+typedef uint64_t murmur_t;
+
+#elif defined(__i386__)
+#define MURMUR_HASH MurmurHash2
+unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHash2
+typedef unsigned int murmur_t;
+
+#else
+#define MURMUR_HASH MurmurHashNeutral2
+unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHashNeutral2
+typedef unsigned int murmur_t;
+
+#endif
+
+#endif /* MURMURHASH_H */
diff --git a/src/leveldb/util/mutexlock.h b/src/leveldb/util/mutexlock.h
index 1ff5a9efa..20dcf6f00 100644
--- a/src/leveldb/util/mutexlock.h
+++ b/src/leveldb/util/mutexlock.h
@@ -6,7 +6,6 @@
 #define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_
 
 #include "port/port.h"
-#include "port/thread_annotations.h"
 
 namespace leveldb {
 
@@ -20,13 +19,12 @@ namespace leveldb {
 //     ... some complex code, possibly with multiple return paths ...
 //   }
 
-class SCOPED_LOCKABLE MutexLock {
+class MutexLock {
  public:
-  explicit MutexLock(port::Mutex *mu) EXCLUSIVE_LOCK_FUNCTION(mu)
-      : mu_(mu)  {
+  explicit MutexLock(port::Mutex *mu) : mu_(mu) {
     this->mu_->Lock();
   }
-  ~MutexLock() UNLOCK_FUNCTION() { this->mu_->Unlock(); }
+  ~MutexLock() { this->mu_->Unlock(); }
 
  private:
   port::Mutex *const mu_;
@@ -35,6 +33,51 @@ class SCOPED_LOCKABLE MutexLock {
   void operator=(const MutexLock&);
 };
 
+
+class SpinLock {
+ public:
+  explicit SpinLock(port::Spin *sp) : sp_(sp) {
+    this->sp_->Lock();
+  }
+  ~SpinLock() { this->sp_->Unlock(); }
+
+ private:
+  port::Spin *const sp_;
+  // No copying allowed
+  SpinLock(const SpinLock&);
+  void operator=(const SpinLock&);
+};
+
+
+class ReadLock {
+ public:
+  explicit ReadLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->ReadLock();
+  }
+  ~ReadLock() { this->mu_->Unlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  ReadLock(const ReadLock&);
+  void operator=(const ReadLock&);
+};
+
+
+class WriteLock {
+ public:
+  explicit WriteLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->WriteLock();
+  }
+  ~WriteLock() { this->mu_->Unlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  WriteLock(const WriteLock&);
+  void operator=(const WriteLock&);
+};
+
 }  // namespace leveldb
 
 
diff --git a/src/leveldb/util/options.cc b/src/leveldb/util/options.cc
index b5e622761..c02635f2d 100644
--- a/src/leveldb/util/options.cc
+++ b/src/leveldb/util/options.cc
@@ -2,10 +2,26 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
 #include "leveldb/options.h"
 
 #include "leveldb/comparator.h"
 #include "leveldb/env.h"
+#include "leveldb/expiry.h"
+#include "leveldb/filter_policy.h"
+#include "util/cache2.h"
+#include "util/crc32c.h"
+
+#include "leveldb/expiry.h"
+
+#if !defined(LEVELDB_VSN)
+#define LEVELDB_VSN develop
+#endif
+
+#define XSTR(x) #x
+#define STR(x) XSTR(x)
 
 namespace leveldb {
 
@@ -14,17 +30,72 @@ Options::Options()
       create_if_missing(false),
       error_if_exists(false),
       paranoid_checks(false),
+      verify_compactions(true),
       env(Env::Default()),
       info_log(NULL),
-      write_buffer_size(4<<20),
+      write_buffer_size(60<<20),
       max_open_files(1000),
       block_cache(NULL),
       block_size(4096),
+      block_size_steps(16),
       block_restart_interval(16),
-      max_file_size(2<<20),
-      compression(kSnappyCompression),
-      reuse_logs(false),
-      filter_policy(NULL) {
+      compression(kLZ4Compression),
+      filter_policy(NULL),
+      is_repair(false),
+      is_internal_db(false),
+      total_leveldb_mem(2684354560ll),
+      block_cache_threshold(32<<20),
+      limited_developer_mem(false),
+      mmap_size(0),
+      delete_threshold(1000),
+      fadvise_willneed(false),
+      tiered_slow_level(0),
+      cache_object_warming(true)
+{
+
 }
 
+
+void
+Options::Dump(
+    Logger * log) const
+{
+    Log(log,"                       Version: %s %s", STR(LEVELDB_VSN), CompileOptionsString());
+    Log(log,"            Options.comparator: %s", comparator->Name());
+    Log(log,"     Options.create_if_missing: %d", create_if_missing);
+    Log(log,"       Options.error_if_exists: %d", error_if_exists);
+    Log(log,"       Options.paranoid_checks: %d", paranoid_checks);
+    Log(log,"    Options.verify_compactions: %d", verify_compactions);
+    Log(log,"                   Options.env: %p", env);
+    Log(log,"              Options.info_log: %p", info_log);
+    Log(log,"     Options.write_buffer_size: %zd", write_buffer_size);
+    Log(log,"        Options.max_open_files: %d", max_open_files);
+    Log(log,"           Options.block_cache: %p", block_cache);
+    Log(log,"            Options.block_size: %zd", block_size);
+    Log(log,"      Options.block_size_steps: %d", block_size_steps);
+    Log(log,"Options.block_restart_interval: %d", block_restart_interval);
+    Log(log,"           Options.compression: %d", compression);
+    Log(log,"         Options.filter_policy: %s", filter_policy == NULL ? "NULL" : filter_policy->Name());
+    Log(log,"             Options.is_repair: %s", is_repair ? "true" : "false");
+    Log(log,"        Options.is_internal_db: %s", is_internal_db ? "true" : "false");
+    Log(log,"     Options.total_leveldb_mem: %" PRIu64, total_leveldb_mem);
+    Log(log," Options.block_cache_threshold: %" PRIu64, block_cache_threshold);
+    Log(log," Options.limited_developer_mem: %s", limited_developer_mem ? "true" : "false");
+    Log(log,"             Options.mmap_size: %" PRIu64, mmap_size);
+    Log(log,"      Options.delete_threshold: %" PRIu64, delete_threshold);
+    Log(log,"      Options.fadvise_willneed: %s", fadvise_willneed ? "true" : "false");
+    Log(log,"     Options.tiered_slow_level: %d", tiered_slow_level);
+    Log(log,"    Options.tiered_fast_prefix: %s", tiered_fast_prefix.c_str());
+    Log(log,"    Options.tiered_slow_prefix: %s", tiered_slow_prefix.c_str());
+    Log(log,"                        crc32c: %s", crc32c::IsHardwareCRC() ? "hardware" : "software");
+    Log(log,"  Options.cache_object_warming: %s", cache_object_warming ? "true" : "false");
+    Log(log,"       Options.ExpiryActivated: %s", ExpiryActivated() ? "true" : "false");
+
+    if (NULL!=expiry_module.get())
+        expiry_module->Dump(log);
+    else
+        Log(log,"         Options.expiry_module: NULL");
+
+}   // Options::Dump
+
 }  // namespace leveldb
diff --git a/src/leveldb/util/perf_count.cc b/src/leveldb/util/perf_count.cc
new file mode 100644
index 000000000..a97efe6d7
--- /dev/null
+++ b/src/leveldb/util/perf_count.cc
@@ -0,0 +1,664 @@
+// -------------------------------------------------------------------
+//
+// perf_count.cc:  performance counters LevelDB
+//
+// Copyright (c) 2012-2016 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <limits.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <syslog.h>
+#include <memory.h>
+#include <errno.h>
+
+#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
+#include "leveldb/perf_count.h"
+#endif
+
+#include "leveldb/atomics.h"
+#include "util/coding.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
+#ifdef OS_SOLARIS
+#  include <atomic.h>
+#endif
+
+
+namespace leveldb
+{
+
+// always have something active in gPerfCounters, eliminates
+//  need to test for "is shared object attached yet"
+static PerformanceCounters LocalStartupCounters;
+PerformanceCounters * gPerfCounters(&LocalStartupCounters);
+
+    SstCounters::SstCounters()
+        : m_IsReadOnly(false),
+          m_Version(eSstCountVersion),
+          m_CounterSize(eSstCountEnumSize)
+    {
+        memset(m_Counter, 0, sizeof(m_Counter));
+
+        m_Counter[eSstCountKeySmallest]=ULLONG_MAX;
+        m_Counter[eSstCountValueSmallest]=ULLONG_MAX;
+
+        return;
+
+    };  // SstCounters::SstCounters
+
+
+    void
+    SstCounters::EncodeTo(
+        std::string & Dst) const
+    {
+        unsigned loop;
+
+        PutVarint32(&Dst, m_Version);
+        PutVarint32(&Dst, m_CounterSize);
+
+        for(loop=0; loop<eSstCountEnumSize; ++loop)
+            PutVarint64(&Dst, m_Counter[loop]);
+    }   // SstCounters::EncodeTo
+
+
+    Status
+    SstCounters::DecodeFrom(
+        const Slice& src)
+    {
+        Status ret_status;
+        Slice cursor;
+        bool good;
+        int loop;
+
+        cursor=src;
+        m_IsReadOnly=true;
+        good=GetVarint32(&cursor, &m_Version);
+        good=good && (m_Version<=eSstCountVersion);
+
+        // all lesser number of stats to be read
+        good=good && GetVarint32(&cursor, &m_CounterSize);
+        if (good && eSstCountEnumSize < m_CounterSize)
+            m_CounterSize=eSstCountEnumSize;
+
+        for (loop=0; good && loop<eSstCountEnumSize; ++loop)
+        {
+            good=GetVarint64(&cursor, &m_Counter[loop]);
+        }   // for
+
+        // if (!good) change ret_status to bad
+
+        return(ret_status);
+
+    }   // SstCounters::DecodeFrom
+
+
+    uint64_t
+    SstCounters::Inc(
+        unsigned Index)
+    {
+        uint64_t ret_val;
+
+        ret_val=0;
+        if (!m_IsReadOnly && Index<m_CounterSize)
+        {
+            ++m_Counter[Index];
+            ret_val=m_Counter[Index];
+        }   // if
+
+        return(ret_val);
+    }   // SstCounters::Inc
+
+
+    uint64_t
+    SstCounters::Add(
+        unsigned Index,
+        uint64_t Amount)
+    {
+        uint64_t ret_val;
+
+        ret_val=0;
+        if (!m_IsReadOnly && Index<m_CounterSize)
+        {
+            m_Counter[Index]+=Amount;
+            ret_val=m_Counter[Index];
+        }   // if
+
+        return(ret_val);
+    }   // SstCounters::Add
+
+
+    uint64_t
+    SstCounters::Value(
+        unsigned Index) const
+    {
+        uint64_t ret_val;
+
+        ret_val=0;
+        if (Index<m_CounterSize)
+        {
+            ret_val=m_Counter[Index];
+        }   // if
+
+        return(ret_val);
+    }   // SstCounters::Value
+
+
+    void
+    SstCounters::Set(
+        unsigned Index,
+        uint64_t Value)
+    {
+        if (Index<m_CounterSize)
+        {
+            m_Counter[Index]=Value;
+        }   // if
+
+        return;
+    }   // SstCounters::Set
+
+
+    void
+    SstCounters::Dump() const
+    {
+        unsigned loop;
+
+        printf("SstCounters:\n");
+        printf("   m_IsReadOnly: %u\n", m_IsReadOnly);
+        printf("      m_Version: %u\n", m_Version);
+        printf("  m_CounterSize: %u\n", m_CounterSize);
+        for (loop=0; loop<m_CounterSize; ++loop)
+            printf("    Counter[%2u]: %" PRIu64 "\n", loop, m_Counter[loop]);
+
+        return;
+
+    }   // SstCounters::Dump
+
+
+    // only used for local static objects, not shared memory objects
+    PerformanceCounters::PerformanceCounters()
+    {
+        m_Version=ePerfVersion;
+        m_CounterSize=ePerfCountEnumSize;
+        // cast away "volatile"
+        memset((void*)m_Counter, 0, sizeof(m_Counter));
+
+        return;
+
+    }   // PerformanceCounters::PerformanceCounters
+
+
+    PerformanceCounters *
+    PerformanceCounters::Init(
+        bool IsReadOnly)
+    {
+        PerformanceCounters * ret_ptr;
+        bool should_create, good;
+        int ret_val, id;
+        struct shmid_ds shm_info;
+        size_t open_size;
+
+        ret_ptr=NULL;
+        memset(&shm_info, 0, sizeof(shm_info));
+        good=true;
+        open_size=sizeof(PerformanceCounters);
+
+        // first id attempt, minimal request
+        id=shmget(ePerfKey, 0, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+        if (-1!=id)
+            ret_val=shmctl(id, IPC_STAT, &shm_info);
+        else
+            ret_val=-1;
+
+        // does the shared memory already exists (and of proper size if writing)
+        should_create=(0!=ret_val || (shm_info.shm_segsz < sizeof(PerformanceCounters))) && !IsReadOnly;
+
+        // should old shared memory be deleted?
+        if (should_create && 0==ret_val)
+        {
+            ret_val=shmctl(id, IPC_RMID, &shm_info);
+            good=(0==ret_val);
+            if (0!=ret_val)
+                syslog(LOG_ERR, "shmctl IPC_RMID failed [%d, %m]", errno);
+        }   // if
+
+        // else open the size that exists
+        else if (0==ret_val)
+        {
+            open_size=shm_info.shm_segsz;
+        }   // else if
+
+        // attempt to attach/create to shared memory instance
+        if (good)
+        {
+            int flags;
+
+            if (IsReadOnly)
+                flags = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+            else
+                flags = IPC_CREAT | S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+
+            m_PerfSharedId=shmget(ePerfKey, open_size, flags);
+            good=(-1!=m_PerfSharedId);
+        }   // if
+
+        // map shared memory instance
+        if (good)
+        {
+            ret_ptr=(PerformanceCounters *)shmat(m_PerfSharedId, NULL, (IsReadOnly ? SHM_RDONLY : 0));
+            if ((void*)-1 != ret_ptr)
+            {
+                // initialize?
+                if (should_create || ePerfVersion!=ret_ptr->m_Version)
+                {
+                    if (!IsReadOnly)
+                    {
+                        memset(ret_ptr, 0, sizeof(PerformanceCounters));
+                        ret_ptr->m_Version=ePerfVersion;
+                        ret_ptr->m_CounterSize=ePerfCountEnumSize;
+                    }   // if
+
+                    // bad version match to existing segment
+                    else
+                    {
+                        good=false;
+                        errno=EINVAL;
+                    }   // else
+                }   // if
+            }   // if
+            else
+            {
+                good=false;
+                syslog(LOG_ERR, "shmat failed [%d, %m]", errno);
+            }   // else
+
+            if (good)
+            {
+                // make this available process wide
+                gPerfCounters=ret_ptr;
+            }   // if
+            else
+            {
+                ret_ptr=NULL;
+                m_LastError=errno;
+            }   // else
+        }   // if
+        else
+        {
+            m_LastError=errno;
+            ret_ptr=NULL;
+        }   // else
+
+        return(ret_ptr);
+
+    };  // PerformanceCounters::Init
+
+
+    int
+    PerformanceCounters::Close(
+        PerformanceCounters * Counts)
+    {
+        int ret_val;
+
+        if (NULL!=Counts && &LocalStartupCounters != Counts)
+        {
+            // keep gPerf valid
+            if (gPerfCounters==Counts)
+                gPerfCounters=&LocalStartupCounters;
+
+            ret_val=shmdt(Counts);
+            if (0!=ret_val)
+                ret_val=errno;
+        }   // if
+        else
+        {
+            ret_val=EINVAL;
+        }   // else
+
+        return(ret_val);
+    }   // PerformanceCounters::Close
+
+
+    uint64_t
+    PerformanceCounters::Inc(
+        unsigned Index)
+    {
+        uint64_t ret_val;
+
+        ret_val=0;
+        if (Index<m_CounterSize
+            && (!gPerfCountersDisabled || !m_PerfCounterAttr[Index].m_PerfDiscretionary))
+        {
+            volatile uint64_t * val_ptr;
+
+            val_ptr=&m_Counter[Index];
+
+# if ULONG_MAX != 4294967295UL
+            inc_and_fetch(val_ptr);
+#else
+            // hack fest for 64 bit semi-atomic on 32bit machine
+            uint32_t ret_32, * ptr_32;
+
+            ptr_32=(uint32_t *)&val_ptr;
+            ret_32=inc_and_fetch(ptr_32);
+            if (0==ret_32)
+            {
+                ++ptr_32;
+                inc_and_fetch(ptr_32);
+            }   // if
+#endif
+            ret_val=*val_ptr;
+        }   // if
+
+        return(ret_val);
+    }   // PerformanceCounters::Inc
+
+
+    uint64_t
+    PerformanceCounters::Dec(
+        unsigned Index)
+    {
+        uint64_t ret_val;
+
+        ret_val=0;
+        if (Index<m_CounterSize
+            && (!gPerfCountersDisabled || !m_PerfCounterAttr[Index].m_PerfDiscretionary))
+        {
+            volatile uint64_t * val_ptr;
+
+            val_ptr=&m_Counter[Index];
+
+# if ULONG_MAX != 4294967295UL
+            dec_and_fetch(val_ptr);
+#else
+            // hack fest for 64 bit semi-atomic on 32bit machine
+            uint32_t ret_32, * ptr_32;
+
+            ptr_32=(uint32_t *)&val_ptr;
+            ret_32=dec_and_fetch(ptr_32);
+            if (0xFFFFFFFF==ret_32)
+            {
+                ++ptr_32;
+                dec_and_fetch(ptr_32);
+            }   // if
+#endif
+            ret_val=*val_ptr;
+        }   // if
+
+        return(ret_val);
+    }   // PerformanceCounters::Dec
+
+
+    uint64_t
+    PerformanceCounters::Add(
+        unsigned Index,
+        uint64_t Amount)
+    {
+        uint64_t ret_val;
+
+        ret_val=0;
+        if (Index<m_CounterSize
+            && (!gPerfCountersDisabled || !m_PerfCounterAttr[Index].m_PerfDiscretionary))
+        {
+            volatile uint64_t * val_ptr;
+
+            val_ptr=&m_Counter[Index];
+
+# if ULONG_MAX != 4294967295UL
+            ret_val=add_and_fetch(val_ptr, Amount);
+#else
+            // hack fest for 64 bit semi-atomic on 32bit machine
+            uint32_t old_32, ret_32, * ptr_32;
+
+            ptr_32=(uint32_t *)&val_ptr;
+            old_32=*ptr_32;
+            ret_32=add_and_fetch(ptr_32, (uint32_t)Amount);
+            if (ret_32<old_32)
+            {
+                ++ptr_32;
+                add_and_fetch(ptr_32, (uint32_t)1);
+            }   // if
+
+            ret_val=*val_ptr;
+#endif
+        }   // if
+
+        return(ret_val);
+    }   // PerformanceCounters::Add
+
+
+    uint64_t
+    PerformanceCounters::Value(
+        unsigned Index) const
+    {
+        uint64_t ret_val;
+
+        ret_val=0;
+        if (Index<m_CounterSize)
+        {
+            ret_val=m_Counter[Index];
+        }   // if
+
+        return(ret_val);
+    }   // SstCounters::Value
+
+
+    void
+    PerformanceCounters::Set(
+        unsigned Index,
+        uint64_t Amount)
+    {
+        if (Index<m_CounterSize
+            && (!gPerfCountersDisabled || !m_PerfCounterAttr[Index].m_PerfDiscretionary))
+        {
+            volatile uint64_t * val_ptr;
+
+            val_ptr=&m_Counter[Index];
+
+            *val_ptr=Amount;
+        }   // if
+
+        return;
+    }   // PerformanceCounters::Set
+
+
+    volatile const uint64_t *
+    PerformanceCounters::GetPtr(
+        unsigned Index) const
+    {
+        const volatile uint64_t * ret_ptr;
+
+        if (Index<m_CounterSize)
+            ret_ptr=&m_Counter[Index];
+        else
+            ret_ptr=&m_BogusCounter;
+
+        return(ret_ptr);
+
+    }   // PerformanceCounters::GetPtr
+
+
+    const char *
+    PerformanceCounters::GetNamePtr(
+        unsigned Index)
+    {
+        const char * ret_ptr;
+
+        if (Index<ePerfCountEnumSize)
+            ret_ptr=m_PerfCounterAttr[Index].m_PerfCounterName;
+        else
+            ret_ptr="???";
+
+        return(ret_ptr);
+
+    }   // PerformanceCounters::GetPtr
+
+
+
+    volatile bool gPerfCountersDisabled=true;
+    int PerformanceCounters::m_PerfSharedId=-1;
+    int PerformanceCounters::m_LastError=0;
+    volatile uint64_t PerformanceCounters::m_BogusCounter=0;
+    const PerfCounterAttributes PerformanceCounters::m_PerfCounterAttr[]=
+    {
+        {"ROFileOpen", true},
+        {"ROFileClose", true},
+        {"ROFileUnmap", true},
+        {"RWFileOpen", true},
+        {"RWFileClose", true},
+        {"RWFileUnmap", true},
+        {"ApiOpen", true},
+        {"ApiGet", true},
+        {"ApiWrite", true},
+        {"WriteSleep", true},
+        {"WriteWaitImm", false},
+        {"WriteWaitLevel0", false},
+        {"WriteNewMem", true},
+        {"WriteError", false},
+        {"WriteNoWait", true},
+        {"GetMem", true},
+        {"GetImm", true},
+        {"GetVersion", true},
+        {"SearchLevel[0]", true},
+        {"SearchLevel[1]", true},
+        {"SearchLevel[2]", true},
+        {"SearchLevel[3]", true},
+        {"SearchLevel[4]", true},
+        {"SearchLevel[5]", true},
+        {"SearchLevel[6]", true},
+        {"TableCached", true},
+        {"TableOpened", true},
+        {"TableGet", true},
+        {"BGCloseUnmap", true},
+        {"BGCompactImm", true},
+        {"BGNormal", true},
+        {"BGCompactLevel0", true},
+        {"BlockFiltered", true},
+        {"BlockFilterFalse", true},
+        {"BlockCached", true},
+        {"BlockRead", true},
+        {"BlockFilterRead", true},
+        {"BlockValidGet", true},
+        {"Debug[0]", true},
+        {"Debug[1]", true},
+        {"Debug[2]", true},
+        {"Debug[3]", true},
+        {"Debug[4]", true},
+        {"ReadBlockError", false},
+        {"DBIterNew", true},
+        {"DBIterNext", true},
+        {"DBIterPrev", true},
+        {"DBIterSeek", true},
+        {"DBIterSeekFirst", true},
+        {"DBIterSeekLast", true},
+        {"DBIterDelete", true},
+        {"eleveldbDirect", true},
+        {"eleveldbQueued", true},
+        {"eleveldbDequeued", true},
+        {"elevelRefCreate", true},
+        {"elevelRefDelete", true},
+        {"ThrottleGauge", true},
+        {"ThrottleCounter", true},
+        {"ThrottleMicros0", true},
+        {"ThrottleKeys0", true},
+        {"ThrottleBacklog0", true},
+        {"ThrottleCompacts0", true},
+        {"ThrottleMicros1", true},
+        {"ThrottleKeys1", true},
+        {"ThrottleBacklog1", true},
+        {"ThrottleCompacts1", true},
+        {"BGWriteError", false},
+        {"ThrottleWait", true},
+        {"ThreadError", false},
+        {"BGImmDirect", true},
+        {"BGImmQueued", true},
+        {"BGImmDequeued", true},
+        {"BGImmWeighted", true},
+        {"BGUnmapDirect", true},
+        {"BGUnmapQueued", true},
+        {"BGUnmapDequeued", true},
+        {"BGUnmapWeighted", true},
+        {"BGLevel0Direct", true},
+        {"BGLevel0Queued", true},
+        {"BGLevel0Dequeued", true},
+        {"BGLevel0Weighted", true},
+        {"BGCompactDirect", true},
+        {"BGCompactQueued", true},
+        {"BGCompactDequeued", true},
+        {"BGCompactWeighted", true},
+        {"FileCacheInsert", true},
+        {"FileCacheRemove", true},
+        {"BlockCacheInsert", true},
+        {"BlockCacheRemove", true},
+        {"ApiDelete", true},
+        {"BGMove", true},
+        {"BGMoveFail", false},
+        {"ThrottleUnadjusted", true},
+        {"eleveldbWeighted", true},
+        {"ExpiredKeys", true},
+        {"ExpiredFiles", true},
+        {"SyslogWrite", false},
+        {"BackupStarted", false},
+        {"BackupError", false},
+        {"PropCacheHit", true},
+        {"PropCacheMiss", true},
+        {"PropCacheError", false},
+    };
+
+
+    int
+    PerformanceCounters::LookupCounter(
+        const char * Name)
+    {
+        int index,loop;
+
+        index=-1;
+
+        if (NULL!=Name && '\0'!=*Name)
+        {
+            for (loop=0; loop<ePerfCountEnumSize && -1==index; ++loop)
+            {
+                if (0==strcmp(m_PerfCounterAttr[loop].m_PerfCounterName, Name))
+                    index=loop;
+            }   // loop
+        }   // if
+
+        return(index);
+    };
+
+    void
+    PerformanceCounters::Dump()
+    {
+        int loop;
+
+        printf(" m_Version: %u\n", m_Version);
+        printf(" m_CounterSize: %u\n", m_CounterSize);
+
+        for (loop=0; loop<ePerfCountEnumSize; ++loop)
+        {
+            printf("  %s: %" PRIu64 "\n",
+                   m_PerfCounterAttr[loop].m_PerfCounterName, m_Counter[loop]);
+        }   // loop
+    };  // Dump
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/perf_count_test.cc b/src/leveldb/util/perf_count_test.cc
new file mode 100644
index 000000000..b181ea891
--- /dev/null
+++ b/src/leveldb/util/perf_count_test.cc
@@ -0,0 +1,197 @@
+// testharness used is 
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// -------------------------------------------------------------------
+//
+// perf_count_test.cc:  unit tests for LevelDB performance counters
+//
+// Copyright (c) 2012-2013 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <errno.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "leveldb/perf_count.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class PerfTest 
+{
+public:
+    static PerfTest* current_;
+
+    PerfTest()
+    {
+        current_ = this;
+    }
+
+    ~PerfTest() {};
+
+    bool
+    DeleteShm(key_t Key)
+    {
+        int ret_val, id;
+
+        id=shmget(Key, 0, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+        if (-1!=id)
+            ret_val=shmctl(id, IPC_RMID, NULL);
+        else
+            ret_val=-1;
+
+        return(0==ret_val);
+    }
+
+
+    bool
+    CreateShm(key_t Key, size_t Size)
+    {
+        int ret_val;
+
+        ret_val=shmget(Key, Size, IPC_CREAT | S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+        return(-1!=ret_val);
+    }
+
+
+    void *
+    MapShm(key_t Key)
+    {
+        int id;
+        void * ret_ptr;
+
+        id=shmget(Key, 0, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+        if (-1!=id)
+            ret_ptr=shmat(id, NULL, 0);
+        else
+            ret_ptr=NULL;
+
+        return(ret_ptr);
+    }
+
+
+
+    size_t
+    GetShmSize(key_t Key)
+    {
+        int ret_val, id;
+        struct shmid_ds shm_info;
+
+        memset(&shm_info, 0, sizeof(shm_info));
+        id=shmget(Key, 0, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+        if (-1!=id)
+        {
+            ret_val=shmctl(id, IPC_STAT, &shm_info);
+
+            if (0!=ret_val)
+                shm_info.shm_segsz=0;
+        }   // if
+        return(shm_info.shm_segsz);
+    }
+
+};  // class PerfTest
+
+
+PerfTest* PerfTest::current_;
+
+
+TEST(PerfTest, CreateNew) 
+{
+    PerformanceCounters * perf_ptr;
+
+    // clear any existing shm
+    DeleteShm(ePerfKey);
+
+    // open for write, will create
+    perf_ptr=PerformanceCounters::Init(false);
+    ASSERT_NE(perf_ptr, (void*)NULL);
+    ASSERT_EQ(sizeof(PerformanceCounters), GetShmSize(ePerfKey));
+
+    // close and reopen for read
+    ASSERT_EQ(0, PerformanceCounters::Close(perf_ptr));
+    
+    perf_ptr=PerformanceCounters::Init(true);
+    ASSERT_NE(perf_ptr, (void*)NULL);
+    ASSERT_EQ(sizeof(PerformanceCounters), GetShmSize(ePerfKey));
+    ASSERT_EQ(0, PerformanceCounters::Close(perf_ptr));
+
+    // cleanup
+    ASSERT_EQ(true, DeleteShm(ePerfKey));
+
+    return;
+
+}   // CreateNew
+
+
+TEST(PerfTest, SizeUpgrade)
+{
+    PerformanceCounters * perf_ptr;
+
+    // clear any existing shm
+    DeleteShm(ePerfKey);
+
+    // Riak 1.2 was 536 bytes
+    ASSERT_NE(536, sizeof(PerformanceCounters));
+    ASSERT_EQ(true, CreateShm(ePerfKey, 536));
+    ASSERT_EQ(536, GetShmSize(ePerfKey));
+
+    // open for write, will recreate to current size
+    perf_ptr=PerformanceCounters::Init(false);
+    ASSERT_NE(perf_ptr, (void*)NULL);
+    ASSERT_EQ(sizeof(PerformanceCounters), GetShmSize(ePerfKey));
+
+    // cleanup
+    ASSERT_EQ(true, DeleteShm(ePerfKey));
+
+    return;
+}   // SizeUpgrade
+
+TEST(PerfTest, ReadLarger)
+{
+    PerformanceCounters * perf_ptr;
+
+    // clear any existing shm
+    DeleteShm(ePerfKey);
+
+    // create a new larger than today segment
+    ASSERT_EQ(true, CreateShm(ePerfKey, sizeof(PerformanceCounters)+64));
+    perf_ptr=(PerformanceCounters *)MapShm(ePerfKey);
+    ASSERT_NE(perf_ptr, (void*)NULL);
+    memset(perf_ptr, 0, sizeof(PerformanceCounters)+64);
+    perf_ptr->SetVersion(ePerfVersion, ePerfCountEnumSize+8);
+    shmdt(perf_ptr);
+
+    // open for read
+    perf_ptr=PerformanceCounters::Init(false);
+    ASSERT_NE(perf_ptr, (void*)NULL);
+
+    // cleanup
+    ASSERT_EQ(true, DeleteShm(ePerfKey));
+
+    return;
+}   // ReadLarger
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/util/posix_logger.h b/src/leveldb/util/posix_logger.h
index c063c2b7c..9dea1d325 100644
--- a/src/leveldb/util/posix_logger.h
+++ b/src/leveldb/util/posix_logger.h
@@ -3,16 +3,16 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 //
 // Logger implementation that can be shared by all environments
-// where enough Posix functionality is available.
+// where enough posix functionality is available.
 
 #ifndef STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_
 #define STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_
 
-#include <algorithm>
 #include <stdio.h>
 #include <sys/time.h>
 #include <time.h>
 #include "leveldb/env.h"
+#include "util/mutexlock.h"
 
 namespace leveldb {
 
@@ -20,11 +20,23 @@ class PosixLogger : public Logger {
  private:
   FILE* file_;
   uint64_t (*gettid_)();  // Return the thread id for the current thread
+
  public:
   PosixLogger(FILE* f, uint64_t (*gettid)()) : file_(f), gettid_(gettid) { }
   virtual ~PosixLogger() {
     fclose(file_);
   }
+  virtual long LogSize()
+      {
+          long ret_val;
+
+          // if ftell() gives error, return zero
+          //  to match default class' "does not exist" response
+          ret_val=ftell(file_);
+          if (-1==ret_val)
+              ret_val=0;
+          return(ret_val);
+      };
   virtual void Logv(const char* format, va_list ap) {
     const uint64_t thread_id = (*gettid_)();
 
diff --git a/src/leveldb/util/prop_cache.cc b/src/leveldb/util/prop_cache.cc
new file mode 100644
index 000000000..a36fea04c
--- /dev/null
+++ b/src/leveldb/util/prop_cache.cc
@@ -0,0 +1,341 @@
+// -------------------------------------------------------------------
+//
+// prop_cache.cc
+//
+// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "port/port.h"
+#include "util/prop_cache.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/throttle.h"
+
+namespace leveldb {
+
+/**
+ * lPropCacheLock and lPropCache exist to address race condition
+ *  where Erlang respond to an information request after telling
+ *  leveldb to shutdown.
+ */
+static port::Spin lPropCacheLock;
+static PropertyCachePtr_t lPropCache;
+
+/**
+ * Create the cache.  Called only once upon
+ * leveldb initialization
+ */
+void
+PropertyCache::InitPropertyCache(
+    EleveldbRouter_t Router)
+{
+    ShutdownPropertyCache();
+    lPropCache = new PropertyCache(Router);
+
+    return;
+
+}   // PropertyCache
+
+
+void
+PropertyCache::ShutdownPropertyCache()
+{
+
+    SpinLock l(&lPropCacheLock);
+    lPropCache.reset();
+
+}  // PropertyCache::ShutdownPropertyCache
+
+
+/**
+ * Unit test support.  Allows use of derived versions
+ *  of PropertyCache that easy testing
+ */
+void
+PropertyCache::SetGlobalPropertyCache(
+    PropertyCache * NewGlobal)
+{
+    // (creates infinite loop) ShutdownPropertyCache();
+    lPropCache = NewGlobal;
+
+    return;
+
+}   // PropertyCache::SetGlobalPropertyCache
+
+
+/**
+ * Unit test support.  Allows use of derived versions
+ *  of PropertyCache that easy testing
+ */
+Cache &
+PropertyCache::GetCache()
+{
+
+    return(*lPropCache->GetCachePtr());
+
+}   // PropertyCache::GetCache
+
+
+/**
+ * Unit test support.  Destroy current cache, start new ond
+ */
+void
+PropertyCache::Flush()
+{
+    PropertyCachePtr_t ptr;
+
+    // stablize the object by locking it and
+    //  getting a reference count.  Flush while
+    //  holding lock to keep others away
+    //  ... anyone already using the object may segfault
+    //      this is so dangerous ... only for testing
+    {
+        SpinLock l(&lPropCacheLock);
+        ptr=lPropCache;
+
+        if (NULL!=ptr.get())
+            ptr->FlushInternal();
+    }
+
+}   // PropertyCache::Flush
+
+
+/**
+ * Construct property cache object (likely singleton)
+ */
+PropertyCache::PropertyCache(
+    EleveldbRouter_t Router)
+    : m_Cache(NULL), m_Router(Router),
+      m_Cond(&m_Mutex)
+{
+    m_Cache = NewLRUCache2(GetCacheLimit());
+
+}   // PopertyCache::PropertyCache
+
+
+PropertyCache::~PropertyCache()
+{
+    delete m_Cache;
+    m_Cache=NULL;
+}   // PropertyCache::~PropertyCache
+
+
+/**
+ * used by unit & integration tests, must protect against
+ *  background AAE operation requests
+ */
+void
+PropertyCache::FlushInternal()
+{
+    delete m_Cache;
+    m_Cache = NewLRUCache2(GetCacheLimit());
+
+    return;
+
+}   // PropertyCache::FlushInternal
+
+
+/**
+ * Retrieve property from cache if available,
+ *  else call out to Riak to get properties
+ */
+Cache::Handle *
+PropertyCache::Lookup(
+    const Slice & CompositeBucket)
+{
+    Cache::Handle * ret_handle(NULL);
+    PropertyCachePtr_t ptr;
+
+    // race condition ... lPropCache going away as ptr assigned
+    //   (unlikely here, but seen in Insert)
+    {
+        SpinLock l(&lPropCacheLock);
+        ptr=lPropCache;
+    }   // lock
+
+    if (NULL!=ptr.get())
+    {
+        ret_handle=ptr->LookupInternal(CompositeBucket);
+    }   // if
+
+    return(ret_handle);
+
+}   // PropertyCache::Lookup
+
+
+/**
+ * Test if global cache is running,
+ *  does NOT imply it will stay valid
+ */
+bool
+PropertyCache::Valid()
+{
+    PropertyCachePtr_t ptr;
+    bool ret_flag(false);
+
+    // race condition ... lPropCache going away as ptr assigned
+    //   (unlikely here, but seen in Insert)
+    {
+        SpinLock l(&lPropCacheLock);
+        ptr=lPropCache;
+    }   // lock
+
+    if (NULL!=ptr.get())
+    {
+        ret_flag=(NULL!=ptr->m_Cache);
+    }   // if
+
+    return(ret_flag);
+
+}   // PropertyCache::Valid
+
+
+/**
+ * Retrieve property from cache if available,
+ *  else call out to Riak to get properties
+ */
+Cache::Handle *
+PropertyCache::LookupInternal(
+    const Slice & CompositeBucket)
+{
+    Cache::Handle * ret_handle(NULL);
+
+    if (NULL!=m_Cache)
+    {
+        ret_handle=m_Cache->Lookup(CompositeBucket);
+
+        // force a reread of properties every 5 minutes
+        if (NULL!=ret_handle)
+        {
+            uint64_t now;
+            ExpiryModule * mod_ptr;
+
+            now=GetCachedTimeMicros();
+            mod_ptr=(ExpiryModule *)m_Cache->Value(ret_handle);
+
+            // some unit tests of mod_ptr of NULL
+            if (NULL!=mod_ptr && 0!=mod_ptr->ExpiryModuleExpiryMicros()
+                && mod_ptr->ExpiryModuleExpiryMicros()<now)
+            {
+                m_Cache->Release(ret_handle);
+                m_Cache->Erase(CompositeBucket);
+                ret_handle=NULL;
+            }   // if
+        }   // if
+
+        // not waiting in the cache already.  Request info
+        if (NULL==ret_handle && NULL!=m_Router)
+        {
+            // call to Riak required
+            ret_handle=LookupWait(CompositeBucket);
+            gPerfCounters->Inc(ePerfPropCacheMiss);
+        }   // if
+        else if (NULL!=ret_handle)
+        {
+            // cached or no router
+            gPerfCounters->Inc(ePerfPropCacheHit);
+        }   // else if
+    }   // if
+
+    // never supposed to be missing if property cache in play
+    if (NULL==ret_handle)
+        gPerfCounters->Inc(ePerfPropCacheError);
+
+    return(ret_handle);
+
+}   // PropertyCache::LookupInternal
+
+
+/**
+ * Callback function used when Cache drops an object
+ *  to make room for another due to cache size being exceeded
+ */
+static void
+DeleteProperty(
+    const Slice& key,
+    void* value)
+{
+    ExpiryModuleOS * expiry;
+
+    expiry=(ExpiryModuleOS *)value;
+
+    delete expiry;
+}   // static DeleteProperty
+
+
+/**
+ * (static) Add / Overwrite key in property cache.  Manage handle
+ *  on caller's behalf
+ */
+bool
+PropertyCache::Insert(
+    const Slice & CompositeBucket,
+    void * Props,
+    Cache::Handle ** OutputPtr)
+{
+    PropertyCachePtr_t ptr;
+    bool ret_flag(false);
+    Cache::Handle * ret_handle(NULL);
+
+    // race condition ... lPropCache going away as ptr assigned
+    {
+        SpinLock l(&lPropCacheLock);
+        ptr=lPropCache;
+    }   // lock
+
+    if (NULL!=ptr.get() && NULL!=ptr->GetCachePtr())
+    {
+        ret_handle=ptr->InsertInternal(CompositeBucket, Props);
+
+        if (NULL!=OutputPtr)
+            *OutputPtr=ret_handle;
+        else if (NULL!=ret_handle)
+            GetCache().Release(ret_handle);
+
+        ret_flag=(NULL!=ret_handle);
+    }   // if
+
+    return(ret_flag);
+
+}   // PropertyCache::Insert
+
+
+Cache::Handle *
+PropertyCache::InsertInternal(
+    const Slice & CompositeBucket,
+    void * Props)
+{
+    assert(NULL!=m_Cache);
+
+    Cache::Handle * ret_handle(NULL);
+
+    {
+        MutexLock lock(&m_Mutex);
+
+        ret_handle=m_Cache->Insert(CompositeBucket, Props, 1, DeleteProperty);
+        m_Cond.SignalAll();
+    }
+
+    return(ret_handle);
+
+}   // PropertyCache::InsertInternal
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/prop_cache.h b/src/leveldb/util/prop_cache.h
new file mode 100644
index 000000000..9ea31f5f1
--- /dev/null
+++ b/src/leveldb/util/prop_cache.h
@@ -0,0 +1,219 @@
+// -------------------------------------------------------------------
+//
+// prop_cache.h
+//
+// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#ifndef PROP_CACHE_H
+#define PROP_CACHE_H
+
+#include "leveldb/cache.h"
+#include "util/expiry_os.h"
+#include "util/refobject_base.h"
+#include "port/port.h"
+
+
+namespace leveldb
+{
+
+class PropertyCache : public RefObjectBase
+{
+public:
+    /**
+     * static functions are API for production usage
+     */
+
+    // create global cache object
+    static void InitPropertyCache(EleveldbRouter_t Router);
+
+    // release global cache object
+    static void ShutdownPropertyCache();
+
+    // unit test support
+    static void SetGlobalPropertyCache(PropertyCache * NewCache);
+
+    // static lookup, usually from CachePtr
+    static Cache::Handle * Lookup(const Slice & CompositeBucket);
+
+    // static insert, usually from eleveldb::property_cache()
+    static bool Insert(const Slice & CompositeBucket, void * Props, Cache::Handle ** OutputPtr);
+
+    // static retrieval of active cache
+    static Cache & GetCache();
+
+    // for unit tests, "flush" cache
+    static void Flush();
+
+    // test if cache is running (so OS builds know to ignore)
+    static bool Valid();
+
+    // virtual destructor to facilitate unit tests
+    virtual ~PropertyCache();
+
+protected:
+    /**
+     * protected functions are API for unit tests.  The static functions
+     *  route program flow to these.
+     */
+
+    // only allow creation from InitPropertyCache() or unit tests
+    PropertyCache(EleveldbRouter_t);
+
+    // accessor to m_Cache pointer (really bad if NULL m_Cache)
+    Cache * GetCachePtr() {return(m_Cache);};
+
+    // unit & integration test support to get rid of current cache entries
+    void FlushInternal();
+
+    // internal equivalent to static Lookup() function
+    Cache::Handle * LookupInternal(const Slice & CompositeBucket);
+
+    // internal routine to launch lookup request via eleveldb router, then wait
+    Cache::Handle * LookupWait(const Slice & CompositeBucket);
+
+    // internal routine to insert object and signal condition variable
+    Cache::Handle * InsertInternal(const Slice & CompositeBucket, void * Props);
+
+    // 1000 is number of cache entries.  Just pulled
+    //  that number out of the air.
+    // virtual for unit test to override
+    virtual int GetCacheLimit() const {return(1000);}
+
+    Cache * m_Cache;
+    EleveldbRouter_t m_Router;
+    port::Mutex m_Mutex;
+    port::CondVar m_Cond;
+
+// The follow explicitly disable use of default constructor, copy constructor,
+//  and assignment operator.
+private:
+    PropertyCache();
+    PropertyCache(const PropertyCache &);
+    PropertyCache operator=(const PropertyCache &);
+
+}; // class PropertyCache
+
+
+/**
+ * This temple wraps the entire property cache
+ */
+typedef RefPtr<PropertyCache> PropertyCachePtr_t;
+
+
+/**
+ * This template wraps an object in property cache
+ *  to insure it is properly released.
+ *  Makes calls to static functions of PropertyCache.
+ */
+template<typename Object> class CachePtr
+{
+    /****************************************************************
+    *  Member objects
+    ****************************************************************/
+public:
+
+protected:
+    Cache::Handle * m_Ptr;            // NULL or object in cache
+
+private:
+
+    /****************************************************************
+    *  Member functions
+    ****************************************************************/
+public:
+    CachePtr() : m_Ptr(NULL) {};
+
+    ~CachePtr() {Release();};
+
+    // unprotected if GetCache is NULL
+    void Release()
+    {
+        if (NULL!=m_Ptr)
+            PropertyCache::GetCache().Release(m_Ptr);
+        m_Ptr=NULL;
+    };
+
+    CachePtr & operator=(Cache::Handle * Hand) {reset(Hand);};
+
+    void reset(Cache::Handle * Hand=NULL)
+    {
+        if (m_Ptr!=Hand)
+        {
+            Release();
+            m_Ptr=Hand;
+        }   // if
+    }
+
+
+    Object * get()
+    {return(PropertyCache::Valid()
+            ? (Object *)PropertyCache::GetCache().Value(m_Ptr)
+            : NULL);};
+
+    // unprotected if GetCache is NULL
+    const Object * get() const
+    {return(PropertyCache::Valid()
+            ? (const Object *)PropertyCache::GetCache().Value(m_Ptr)
+            : NULL);};
+
+    Object * operator->() {return(get());};
+    const Object * operator->() const {return(get());};
+
+    Object & operator*() {return(*get());};
+    const Object & operator*() const {return(*get());};
+
+    bool Lookup(const Slice & Key)
+    {
+        Release();
+        m_Ptr=PropertyCache::Lookup(Key);
+        return(NULL!=m_Ptr);
+    };
+
+    bool Insert(const Slice & Key, Object * Value)
+    {
+        bool ret_flag(false);
+        Release();
+        ret_flag=PropertyCache::Insert(Key, (void *)Value, &m_Ptr);
+        return(ret_flag);
+    };
+
+    // unprotected if GetCache is NULL
+    void Erase(const Slice & Key)
+    {
+        Release();
+        if (PropertyCache::Valid())
+            PropertyCache::GetCache().Erase(Key);
+        return;
+    };
+
+protected:
+
+private:
+    CachePtr(const CachePtr &);
+    CachePtr & operator=(const CachePtr &);
+
+};  // template CachePtr
+
+
+typedef CachePtr<ExpiryModuleOS> ExpiryPropPtr_t;
+
+
+}  // namespace leveldb
+
+#endif // ifndef
diff --git a/src/leveldb/util/random.h b/src/leveldb/util/random.h
index ddd51b1c7..07538242e 100644
--- a/src/leveldb/util/random.h
+++ b/src/leveldb/util/random.h
@@ -16,12 +16,7 @@ class Random {
  private:
   uint32_t seed_;
  public:
-  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) {
-    // Avoid bad seeds.
-    if (seed_ == 0 || seed_ == 2147483647L) {
-      seed_ = 1;
-    }
-  }
+  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { }
   uint32_t Next() {
     static const uint32_t M = 2147483647L;   // 2^31-1
     static const uint64_t A = 16807;  // bits 14, 8, 7, 5, 2, 1, 0
diff --git a/src/leveldb/util/refobject_base.h b/src/leveldb/util/refobject_base.h
new file mode 100644
index 000000000..782e860b4
--- /dev/null
+++ b/src/leveldb/util/refobject_base.h
@@ -0,0 +1,192 @@
+// -------------------------------------------------------------------
+//
+// refobject_base.h
+//
+// Copyright (c) 2015 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+//  Base class for reference-counted types; refactored from
+//  eleveldb/c_src/refobjects.h and leveldb/util/thread_tasks.h
+// -------------------------------------------------------------------
+
+#ifndef LEVELDB_INCLUDE_REFOBJECT_BASE_H_
+#define LEVELDB_INCLUDE_REFOBJECT_BASE_H_
+
+#include <stdint.h>
+
+#include "port/port.h"
+#include "leveldb/atomics.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+/**
+ * Base class for reference-counted types
+ *
+ * A user of a reference-counted object makes the reference explicit by
+ * calling the RefInc() method, which increments the internal reference
+ * counter in a thread safe manner. When the user of the object is done
+ * with the object, it releases the reference by calling the RefDec()
+ * method, which decrements the internal counter in a thread safe manner.
+ * When the reference counter reaches 0, the RefDec() method deletes
+ * the current object by executing a "delete this" statement.
+ *
+ * Note that the because RefDec() executes "delete this" when the reference
+ * count reaches 0, the reference-counted object must be allocated on the
+ * heap.
+ */
+class RefObjectBase
+{
+    // force this private so everyone is using memory fenced GetRefCount
+ private:
+    volatile uint32_t m_RefCount;
+
+ public:
+    RefObjectBase() : m_RefCount(0) {}
+    virtual ~RefObjectBase() {}
+
+    virtual uint32_t RefInc() {return(inc_and_fetch(&m_RefCount));}
+
+    virtual uint32_t RefDec()
+    {
+        uint32_t current_refs;
+
+        current_refs=dec_and_fetch(&m_RefCount);
+        if (0==current_refs) {
+            delete this;
+        }
+
+        return(current_refs);
+    }   // RefDec
+
+    // some derived objects might need other cleanup before delete (see ErlRefObject)
+    virtual uint32_t RefDecNoDelete() {return(dec_and_fetch(&m_RefCount));};
+
+    // establish memory fence via atomic operation call
+    virtual uint32_t GetRefCount() {return(add_and_fetch(&m_RefCount, (uint32_t)0));};
+
+ private:
+    // hide the copy ctor and assignment operator (not implemented)
+    RefObjectBase(const RefObjectBase&);
+    RefObjectBase& operator=(const RefObjectBase&);
+};
+
+
+template<typename Object> class RefPtr
+{
+    /****************************************************************
+    *  Member objects
+    ****************************************************************/
+public:
+
+protected:
+    port::Spin m_Spin;
+    Object * m_Ptr;            // NULL or object being reference counted
+
+private:
+
+    /****************************************************************
+    *  Member functions
+    ****************************************************************/
+public:
+    RefPtr() : m_Ptr(NULL) {};
+
+    virtual ~RefPtr() {RefDecrement();};
+
+    RefPtr(const RefPtr & rhs) : m_Ptr(NULL) {reset(rhs.m_Ptr);};
+    RefPtr(Object * Ptr) : m_Ptr(NULL) {reset(Ptr);};
+    RefPtr(Object & Obj) : m_Ptr(NULL) {reset(&Obj);};
+
+//    RefPtr & operator=(const Object & rhs) {reset(rhs.m_Ptr); return(*this);};
+    RefPtr & operator=(Object & rhs) {reset(&rhs); return(*this);};
+    RefPtr & operator=(Object * Ptr) {reset(Ptr); return(*this);};
+    RefPtr & operator=(RefPtr & RPtr) {reset(RPtr.m_Ptr); return(*this);};
+    RefPtr & operator=(const RefPtr & RPtr) {reset(RPtr.m_Ptr); return(*this);};
+
+    bool operator==(const Object & Obj) const {return(m_Ptr==&Obj);};
+    bool operator!=(const Object & Obj) const {return(m_Ptr!=&Obj);};
+    operator void*() {return(m_Ptr);};
+
+    // stl like functions
+    void assign(Object * Ptr) {reset(Ptr);};
+
+    void reset(Object * ObjectPtr=NULL)
+    {
+        SpinLock l(&m_Spin);
+        Object * old_ptr;
+
+        // increment new before decrement old in case
+        //  there are any side effects / contained / circular objects
+        old_ptr=m_Ptr;
+        m_Ptr=ObjectPtr;
+
+        if (NULL!=m_Ptr)
+        {
+            RefIncrement();
+        }   // if
+        // swap back for the moment
+        if (NULL!=old_ptr)
+        {
+            m_Ptr=old_ptr;
+            RefDecrement();
+        }   // if
+
+        // final pointer
+        m_Ptr=ObjectPtr;
+    }
+
+    Object * get() {return(m_Ptr);};
+
+    const Object * get() const {return(m_Ptr);};
+
+    Object * operator->() {return(m_Ptr);};
+    const Object * operator->() const {return(m_Ptr);};
+
+    Object & operator*() {return(*get());};
+    const Object & operator*() const {return(*get());};
+
+    bool operator<(const RefPtr & rhs) const
+    {return(*get()<*rhs.get());};
+
+protected:
+    // reduce reference count, delete if 0
+    void RefDecrement()
+    {
+        if (NULL!=m_Ptr)
+        {
+            m_Ptr->RefDec();
+            m_Ptr=NULL;
+        }   // if
+    };
+
+    void RefIncrement()
+    {
+        if (NULL!=m_Ptr)
+            m_Ptr->RefInc();
+    };
+
+private:
+
+
+};  // template RefPtr
+
+
+} // namespace leveldb
+
+#endif  // LEVELDB_INCLUDE_REFOBJECT_BASE_H_
diff --git a/src/leveldb/util/testharness.cc b/src/leveldb/util/testharness.cc
index 402fab34d..be8ebfd7d 100644
--- a/src/leveldb/util/testharness.cc
+++ b/src/leveldb/util/testharness.cc
@@ -38,7 +38,7 @@ int RunAllTests() {
 
   int num = 0;
   if (tests != NULL) {
-    for (size_t i = 0; i < tests->size(); i++) {
+    for (int i = 0; i < tests->size(); i++) {
       const Test& t = (*tests)[i];
       if (matcher != NULL) {
         std::string name = t.base;
@@ -54,6 +54,11 @@ int RunAllTests() {
     }
   }
   fprintf(stderr, "==== PASSED %d tests\n", num);
+
+  // cleanup memory for valgrind
+  leveldb::Env::Shutdown();
+  delete tests;
+
   return 0;
 }
 
diff --git a/src/leveldb/util/testharness.h b/src/leveldb/util/testharness.h
index da4fe68bb..70ae51158 100644
--- a/src/leveldb/util/testharness.h
+++ b/src/leveldb/util/testharness.h
@@ -74,6 +74,14 @@ class Tester {
     return *this;
   }
 
+  Tester& IsNotOk(const Status& s) {
+    if (s.ok()) {
+      ss_ << "Test needed to fail.";
+      ok_ = false;
+    }
+    return *this;
+  }
+
 #define BINARY_OP(name,op)                              \
   template <class X, class Y>                           \
   Tester& name(const X& x, const Y& y) {                \
@@ -103,7 +111,9 @@ class Tester {
 };
 
 #define ASSERT_TRUE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is((c), #c)
+#define ASSERT_FALSE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is(!(c), #c)
 #define ASSERT_OK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsOk((s))
+#define ASSERT_NOTOK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsNotOk((s))
 #define ASSERT_EQ(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsEq((a),(b))
 #define ASSERT_NE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsNe((a),(b))
 #define ASSERT_GE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGe((a),(b))
diff --git a/src/leveldb/util/testutil.cc b/src/leveldb/util/testutil.cc
index bee56bf75..538d09516 100644
--- a/src/leveldb/util/testutil.cc
+++ b/src/leveldb/util/testutil.cc
@@ -32,7 +32,7 @@ std::string RandomKey(Random* rnd, int len) {
 
 
 extern Slice CompressibleString(Random* rnd, double compressed_fraction,
-                                size_t len, std::string* dst) {
+                                int len, std::string* dst) {
   int raw = static_cast<int>(len * compressed_fraction);
   if (raw < 1) raw = 1;
   std::string raw_data;
diff --git a/src/leveldb/util/testutil.h b/src/leveldb/util/testutil.h
index d7e458370..e84323b71 100644
--- a/src/leveldb/util/testutil.h
+++ b/src/leveldb/util/testutil.h
@@ -24,7 +24,7 @@ extern std::string RandomKey(Random* rnd, int len);
 // "N*compressed_fraction" bytes and return a Slice that references
 // the generated data.
 extern Slice CompressibleString(Random* rnd, double compressed_fraction,
-                                size_t len, std::string* dst);
+                                int len, std::string* dst);
 
 // A wrapper that allows injection of errors.
 class ErrorEnv : public EnvWrapper {
@@ -37,23 +37,13 @@ class ErrorEnv : public EnvWrapper {
                num_writable_file_errors_(0) { }
 
   virtual Status NewWritableFile(const std::string& fname,
-                                 WritableFile** result) {
+                                 WritableFile** result, size_t map_size) {
     if (writable_file_error_) {
       ++num_writable_file_errors_;
       *result = NULL;
       return Status::IOError(fname, "fake error");
     }
-    return target()->NewWritableFile(fname, result);
-  }
-
-  virtual Status NewAppendableFile(const std::string& fname,
-                                   WritableFile** result) {
-    if (writable_file_error_) {
-      ++num_writable_file_errors_;
-      *result = NULL;
-      return Status::IOError(fname, "fake error");
-    }
-    return target()->NewAppendableFile(fname, result);
+    return target()->NewWritableFile(fname, result, map_size);
   }
 };
 
diff --git a/src/leveldb/util/thread_tasks.cc b/src/leveldb/util/thread_tasks.cc
new file mode 100644
index 000000000..d17813246
--- /dev/null
+++ b/src/leveldb/util/thread_tasks.cc
@@ -0,0 +1,63 @@
+// -------------------------------------------------------------------
+//
+// thread_tasks.cc
+//
+// Copyright (c) 2015 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include "util/db_list.h"
+#include "util/hot_threads.h"
+#include "util/thread_tasks.h"
+
+namespace leveldb {
+
+void
+CompactionTask::operator()()
+{
+    m_DBImpl->BackgroundCall2(m_Compaction);
+    m_Compaction=NULL;
+
+    // look for grooming compactions in other databases.
+    // MUST submit to different pool, or will seldom work.
+    if (0==gCompactionThreads->m_WorkQueueAtomic)
+    {
+        ThreadTask * task=new GroomingPollTask;
+
+        // this sequence could be a race condition, and that is ok.
+        // Race is when this thread is the grooming thread and
+        // it deschedules for the entire time of the GroomingPollTasks'
+        // scan.  oh well.  not critical.
+        gWriteThreads->Submit(task, true);
+    }   // if
+}   // CompactionTask::operator()()
+
+
+void
+GroomingPollTask::operator()()
+{
+    // if there is no current backlog ... see if
+    //  databases have grooming opportunity waiting
+    // "false" only scan user databases, not internal
+    if (0==gCompactionThreads->m_WorkQueueAtomic)
+        DBList()->ScanDBs(false, &DBImpl::CheckAvailableCompactions);
+    if (0==gCompactionThreads->m_WorkQueueAtomic)
+        DBList()->ScanDBs(true, &DBImpl::CheckAvailableCompactions);
+
+}   // GroomingPollTask::operator()
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/thread_tasks.h b/src/leveldb/util/thread_tasks.h
new file mode 100644
index 000000000..1971a4b36
--- /dev/null
+++ b/src/leveldb/util/thread_tasks.h
@@ -0,0 +1,185 @@
+// -------------------------------------------------------------------
+//
+// thread_tasks.h
+//
+// Copyright (c) 2011-2015 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+//  Modeled after eleveldb's workitems.h/.cc
+// -------------------------------------------------------------------
+
+
+#ifndef STORAGE_LEVELDB_INCLUDE_THREAD_TASKS_H_
+#define STORAGE_LEVELDB_INCLUDE_THREAD_TASKS_H_
+
+#include <stdint.h>
+
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "leveldb/atomics.h"
+#include "refobject_base.h"
+
+namespace leveldb {
+
+
+/**
+ * Virtual base class for leveldb background tasks
+ */
+class ThreadTask : public RefObjectBase
+{
+ protected:
+    bool m_ResubmitWork;          //!< true if this work item is loaded for prefetch
+
+ public:
+    uint64_t m_QueueStart;        //!< NowMicros() time placed on work queue
+
+ public:
+    ThreadTask() : m_ResubmitWork(false), m_QueueStart(0) {}
+
+    virtual ~ThreadTask() {}
+
+    // this is the derived object's task routine
+    virtual void operator()() = 0;
+
+    // methods used by the thread pool to potentially reuse this task object
+    bool resubmit() const {return(m_ResubmitWork);}
+    virtual void recycle() {}
+
+ private:
+    ThreadTask(const ThreadTask &);
+    ThreadTask & operator=(const ThreadTask &);
+
+};  // class ThreadTask
+
+
+/**
+ * Background write of imm buffer to Level-0 file
+ */
+
+class ImmWriteTask : public ThreadTask
+{
+protected:
+    DBImpl * m_DBImpl;
+
+public:
+    explicit ImmWriteTask(DBImpl * Db)
+        : m_DBImpl(Db) {};
+
+    virtual ~ImmWriteTask() {};
+
+    virtual void operator()() {m_DBImpl->BackgroundImmCompactCall();};
+
+private:
+    ImmWriteTask();
+    ImmWriteTask(const ImmWriteTask &);
+    ImmWriteTask & operator=(const ImmWriteTask &);
+
+};  // class ImmWriteTask
+
+
+/**
+ * Background compaction
+ */
+
+class CompactionTask : public ThreadTask
+{
+protected:
+    DBImpl * m_DBImpl;
+    Compaction * m_Compaction;
+
+public:
+    CompactionTask(DBImpl * Db, Compaction * Compact)
+        : m_DBImpl(Db), m_Compaction(Compact) {};
+
+    virtual ~CompactionTask() {delete m_Compaction;};
+
+    virtual void operator()();
+
+private:
+    CompactionTask();
+    CompactionTask(const CompactionTask &);
+    CompactionTask & operator=(const CompactionTask &);
+
+};  // class CompactionTask
+
+
+/**
+ * Poll all databases for grooming opportunities
+ */
+
+class GroomingPollTask : public ThreadTask
+{
+protected:
+
+public:
+    GroomingPollTask() {};
+
+    virtual ~GroomingPollTask() {};
+
+    virtual void operator()();
+
+private:
+    GroomingPollTask(const GroomingPollTask &);
+    GroomingPollTask & operator=(const GroomingPollTask &);
+
+};  // class GroomingPollTask
+
+
+/**
+ * Original env_posix.cc task
+ */
+
+class LegacyTask : public ThreadTask
+{
+protected:
+    void (*m_Function)(void*);
+    void * m_Arg;
+
+public:
+    LegacyTask(void (*Function)(void*), void * Arg)
+        : m_Function(Function), m_Arg(Arg) {};
+
+    virtual ~LegacyTask() {};
+
+    virtual void operator()()
+    {
+        (*m_Function)(m_Arg);
+    };
+
+private:
+    LegacyTask();
+    LegacyTask(const LegacyTask &);
+    LegacyTask & operator=(const LegacyTask &);
+
+};  // class LegacyTask
+
+
+/**
+ * Riak Enterprise Edition's hot backup entry point
+ *
+ *  Called every 60 seconds to test for external hot backup trigger
+ *   (initiates backup if trigger seen)
+ */
+
+void CheckHotBackupTrigger();
+
+} // namespace leveldb
+
+
+#endif  // STORAGE_LEVELDB_INCLUDE_THREAD_TASKS_H_
diff --git a/src/leveldb/util/throttle.cc b/src/leveldb/util/throttle.cc
new file mode 100644
index 000000000..25fd53199
--- /dev/null
+++ b/src/leveldb/util/throttle.cc
@@ -0,0 +1,392 @@
+// -------------------------------------------------------------------
+//
+// throttle.cc
+//
+// Copyright (c) 2011-2017 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "leveldb/perf_count.h"
+#include "leveldb/env.h"
+
+#include "db/db_impl.h"
+#include "util/cache2.h"
+#include "util/db_list.h"
+#include "util/flexcache.h"
+#include "util/hot_threads.h"
+#include "util/thread_tasks.h"
+#include "util/throttle.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
+namespace leveldb {
+
+// mutex and condition variable objects for use in the code below
+port::Mutex* gThrottleMutex=NULL;
+port::CondVar* gThrottleCond=NULL;
+
+// current time, on roughly a 60 second scale
+//  (used to reduce number of OS calls for expiry)
+uint64_t gCurrentTime=0;
+
+#define THROTTLE_SECONDS 60
+#define THROTTLE_TIME THROTTLE_SECONDS*1000000
+#define THROTTLE_INTERVALS 63
+// following is a heristic value, determined by trial and error.
+//  its job is slow down the rate of change in the current throttle.
+//  do not want sudden changes in one or two intervals to swing
+//  the throttle value wildly.  Goal is a nice, even throttle value.
+#define THROTTLE_SCALING 17
+
+struct ThrottleData_t
+{
+    uint64_t m_Micros;
+    uint64_t m_Keys;
+    uint64_t m_Backlog;
+    uint64_t m_Compactions;
+};
+
+// this array stores compaction statistics used in throttle calculation.
+//  Index 0 of this array accumulates the current minute's compaction data for level 0.
+//  Index 1 accumulates accumulates current minute's compaction
+//  statistics for all other levels.  Remaining intervals contain
+//  most recent interval statistics for last hour.
+ThrottleData_t gThrottleData[THROTTLE_INTERVALS];
+
+uint64_t gThrottleRate, gUnadjustedThrottleRate;
+
+static volatile bool gThrottleRunning=false;
+static pthread_t gThrottleThreadId;
+
+static void * ThrottleThread(void * arg);
+
+
+void
+ThrottleInit()
+{
+    gThrottleMutex = new port::Mutex;
+    gThrottleCond = new port::CondVar(gThrottleMutex);
+
+    memset(&gThrottleData, 0, sizeof(gThrottleData));
+    gThrottleRate=0;
+    gUnadjustedThrottleRate=0;
+
+    // addresses race condition during fast start/stop
+    {
+        MutexLock lock(gThrottleMutex);
+
+        pthread_create(&gThrottleThreadId, NULL,  &ThrottleThread, NULL);
+
+        while(!gThrottleRunning)
+            gThrottleCond->Wait();
+    }   // mutex
+
+    return;
+
+}   // ThrottleInit
+
+
+static void *
+ThrottleThread(
+    void * /*arg*/)
+{
+    uint64_t tot_micros, tot_keys, tot_backlog, tot_compact;
+    int replace_idx, loop, ret_val;
+    uint64_t new_throttle, new_unadjusted;
+    time_t now_seconds, cache_expire;
+    struct timespec wait_time;
+
+    replace_idx=2;
+    now_seconds=0;
+    cache_expire=0;
+    new_unadjusted=1;
+
+    // addresses race condition during fast start/stop
+    {
+        MutexLock lock(gThrottleMutex);
+        gThrottleRunning=true;
+        gThrottleCond->Signal();
+    }   // mutex
+
+    while(gThrottleRunning)
+    {
+        // update our global clock, not intended to be a precise
+        //  60 second interval.
+        gCurrentTime=port::TimeMicros();
+
+        //
+        // This is code polls for existance of /etc/riak/perf_counters and sets
+        //  the global gPerfCountersDisabled accordingly.
+        //  Sure, there should be a better place for this code.  But fits here nicely today.
+        //
+        ret_val=access("/etc/riak/perf_counters", F_OK);
+        gPerfCountersDisabled=(-1==ret_val);
+
+        //
+        // start actual throttle work
+        //
+        {
+            // lock gThrottleMutex while we update gThrottleData and
+            // wait on gThrottleCond
+            MutexLock lock(gThrottleMutex);
+
+            // sleep 1 minute
+#if _POSIX_TIMERS >= 200801L
+            clock_gettime(CLOCK_REALTIME, &wait_time);
+#else
+            struct timeval tv;
+            gettimeofday(&tv, NULL);
+            wait_time.tv_sec=tv.tv_sec;
+            wait_time.tv_nsec=tv.tv_usec*1000;
+#endif
+
+            now_seconds=wait_time.tv_sec;
+            wait_time.tv_sec+=THROTTLE_SECONDS;
+            if (gThrottleRunning) { // test in case of race at shutdown
+                gThrottleCond->Wait(&wait_time);
+            }
+            gThrottleData[replace_idx]=gThrottleData[1];
+            gThrottleData[replace_idx].m_Backlog=0;
+            memset(&gThrottleData[1], 0, sizeof(gThrottleData[1]));
+        } // unlock gThrottleMutex
+
+        tot_micros=0;
+        tot_keys=0;
+        tot_backlog=0;
+        tot_compact=0;
+
+        // this could be faster by keeping running totals and
+        //  subtracting [replace_idx] before copying [0] into it,
+        //  then adding new [replace_idx].  But that needs more
+        //  time for testing.
+        for (loop=2; loop<THROTTLE_INTERVALS; ++loop)
+        {
+            tot_micros+=gThrottleData[loop].m_Micros;
+            tot_keys+=gThrottleData[loop].m_Keys;
+            tot_backlog+=gThrottleData[loop].m_Backlog;
+            tot_compact+=gThrottleData[loop].m_Compactions;
+        }   // for
+
+        // lock gThrottleMutex while we update gThrottleData
+        {
+            MutexLock lock(gThrottleMutex);
+
+            // capture current state of level-0 and other levels' backlog
+            gThrottleData[replace_idx].m_Backlog=gCompactionThreads->m_WorkQueueAtomic;
+            gPerfCounters->Add(ePerfThrottleBacklog1, gThrottleData[replace_idx].m_Backlog);
+
+            gThrottleData[0].m_Backlog=gLevel0Threads->m_WorkQueueAtomic;
+            gPerfCounters->Add(ePerfThrottleBacklog0, gThrottleData[0].m_Backlog);
+
+            // non-level0 data available?
+            if (0!=tot_keys)
+            {
+                if (0==tot_compact)
+                    tot_compact=1;
+
+                // average write time for level 1+ compactions per key
+                //   times the average number of tasks waiting
+                //   ( the *100 stuff is to exploit fractional data in integers )
+                new_throttle=((tot_micros*100) / tot_keys)
+                    * ((tot_backlog*100) / tot_compact);
+
+                new_throttle /= 10000;  // remove *100 stuff
+                //new_throttle /= gCompactionThreads->m_Threads.size();      // number of general compaction threads
+
+                if (0==new_throttle)
+                    new_throttle=1;     // throttle must have an effect
+
+                new_unadjusted=(tot_micros*100) / tot_keys;
+                new_unadjusted /= 100;
+                if (0==new_unadjusted)
+                    new_unadjusted=1;
+            }   // if
+
+            // attempt to most recent level0
+            //  (only use most recent level0 until level1+ data becomes available,
+            //   useful on restart of heavily loaded server)
+            else if (0!=gThrottleData[0].m_Keys && 0!=gThrottleData[0].m_Compactions)
+            {
+                new_throttle=(gThrottleData[0].m_Micros / gThrottleData[0].m_Keys)
+                    * (gThrottleData[0].m_Backlog / gThrottleData[0].m_Compactions);
+
+                new_unadjusted=(gThrottleData[0].m_Micros / gThrottleData[0].m_Keys);
+                if (0==new_unadjusted)
+                    new_unadjusted=1;
+            }   // else if
+            else
+            {
+                new_throttle=1;
+            }   // else
+
+            // change the throttle slowly
+            //  (+1 & +2 keep throttle moving toward goal when difference new and
+            //   old is less than THROTTLE_SCALING)
+            int temp_rate;
+
+            temp_rate=gThrottleRate;
+            if (temp_rate < new_throttle)
+                temp_rate+=(new_throttle - temp_rate)/THROTTLE_SCALING +1;
+            else
+                temp_rate-=(temp_rate - new_throttle)/THROTTLE_SCALING +2;
+
+            // +2 can make this go negative
+            if (temp_rate<1)
+                temp_rate=1;   // throttle must always have an effect
+
+            gThrottleRate=temp_rate;
+            gUnadjustedThrottleRate=new_unadjusted;
+
+	    // Log(NULL, "ThrottleRate %" PRIu64 ", UnadjustedThrottleRate %" PRIu64, gThrottleRate, gUnadjustedThrottleRate);
+
+            gPerfCounters->Set(ePerfThrottleGauge, gThrottleRate);
+            gPerfCounters->Add(ePerfThrottleCounter, gThrottleRate*THROTTLE_SECONDS);
+            gPerfCounters->Set(ePerfThrottleUnadjusted, gUnadjustedThrottleRate);
+
+            // prepare for next interval
+            memset(&gThrottleData[0], 0, sizeof(gThrottleData[0]));
+        } // unlock gThrottleMutex
+
+        ++replace_idx;
+        if (THROTTLE_INTERVALS==replace_idx)
+            replace_idx=2;
+
+        //
+        // This is code to manage / flush the flexcache's old file cache entries.
+        //  Sure, there should be a better place for this code.  But fits here nicely today.
+        //
+        if (cache_expire < now_seconds)
+        {
+            cache_expire = now_seconds + 60*60;  // hard coded to one hour for now
+            DBList()->ScanDBs(true,  &DBImpl::PurgeExpiredFileCache);
+            DBList()->ScanDBs(false, &DBImpl::PurgeExpiredFileCache);
+        }   // if
+
+        //
+        // This is a second non-throttle task added to this one minute loop.  Pattern forming.
+        //  See if hot backup wants to initiate.
+        //
+	CheckHotBackupTrigger();
+
+        // nudge compaction logic of potential grooming
+        if (0==gCompactionThreads->m_WorkQueueAtomic)  // user databases
+            DBList()->ScanDBs(false, &DBImpl::CheckAvailableCompactions);
+        if (0==gCompactionThreads->m_WorkQueueAtomic)  // internal databases
+            DBList()->ScanDBs(true,  &DBImpl::CheckAvailableCompactions);
+
+    }   // while
+
+    return(NULL);
+
+}   // ThrottleThread
+
+
+void SetThrottleWriteRate(uint64_t Micros, uint64_t Keys, bool IsLevel0)
+{
+    if (IsLevel0)
+    {
+        // lock gThrottleMutex while we update gThrottleData
+        {
+            MutexLock lock(gThrottleMutex);
+
+            gThrottleData[0].m_Micros+=Micros;
+            gThrottleData[0].m_Keys+=Keys;
+            gThrottleData[0].m_Backlog=0;
+            gThrottleData[0].m_Compactions+=1;
+        } // unlock gThrottleMutex
+
+        gPerfCounters->Add(ePerfThrottleMicros0, Micros);
+        gPerfCounters->Add(ePerfThrottleKeys0, Keys);
+        gPerfCounters->Inc(ePerfThrottleCompacts0);
+    }   // if
+
+    else
+    {
+        // lock gThrottleMutex while we update gThrottleData
+        {
+            MutexLock lock(gThrottleMutex);
+
+            gThrottleData[1].m_Micros+=Micros;
+            gThrottleData[1].m_Keys+=Keys;
+            gThrottleData[1].m_Backlog=0;
+            gThrottleData[1].m_Compactions+=1;
+        } // unlock gThrottleMutex
+
+        gPerfCounters->Add(ePerfThrottleMicros1, Micros);
+        gPerfCounters->Add(ePerfThrottleKeys1, Keys);
+        gPerfCounters->Inc(ePerfThrottleCompacts1);
+    }   // else
+
+    return;
+};
+
+uint64_t GetThrottleWriteRate() {return(gThrottleRate);};
+uint64_t GetUnadjustedThrottleWriteRate() {return(gUnadjustedThrottleRate);};
+
+// clock_gettime but only updated once every 60 seconds (roughly)
+uint64_t GetCachedTimeMicros() {return(gCurrentTime);};
+void SetCachedTimeMicros(uint64_t Time) {gCurrentTime=Time;};
+/**
+ * ThrottleStopThreads() is the first step in a two step shutdown.
+ * This stops the 1 minute throttle calculation loop that also
+ * can initiate leveldb compaction actions.  Background compaction
+ * threads should stop between these two steps.
+ */
+void ThrottleStopThreads()
+{
+    if (gThrottleRunning)
+    {
+        gThrottleRunning=false;
+
+        // lock gThrottleMutex so that we can signal gThrottleCond
+        {
+            MutexLock lock(gThrottleMutex);
+            gThrottleCond->Signal();
+        } // unlock gThrottleMutex
+
+        pthread_join(gThrottleThreadId, NULL);
+    }   // if
+
+    return;
+
+}   // ThrottleShutdown
+
+/**
+ * ThrottleClose is the second step in a two step shutdown of
+ *  throttle.  The intent is for background compaction threads
+ *  to stop between these two steps.
+ */
+void ThrottleClose()
+{
+    // safety check
+    if (gThrottleRunning)
+        ThrottleStopThreads();
+
+    delete gThrottleCond;
+    gThrottleCond = NULL;
+
+    delete gThrottleMutex;
+    gThrottleMutex = NULL;
+
+    return;
+}   // ThrottleShutdown
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/throttle.h b/src/leveldb/util/throttle.h
new file mode 100644
index 000000000..2a06fd6a9
--- /dev/null
+++ b/src/leveldb/util/throttle.h
@@ -0,0 +1,47 @@
+// -------------------------------------------------------------------
+//
+// throttle.h
+//
+// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License.  You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <pthread.h>
+
+
+namespace leveldb
+{
+
+void ThrottleInit();
+
+void SetThrottleWriteRate(uint64_t Micros, uint64_t Keys, bool IsLevel0);
+
+uint64_t GetThrottleWriteRate();
+uint64_t GetUnadjustedThrottleWriteRate();
+
+// clock_gettime but only updated once every 60 seconds (roughly)
+//  (SetCachedTimeMicros() intended for unit tests)
+uint64_t GetCachedTimeMicros();
+void SetCachedTimeMicros(uint64_t);
+
+// step 1 in two step shutdown
+void ThrottleStopThreads();
+
+// step 2 in two step shutdown
+void ThrottleClose();
+
+}  // namespace leveldb