diff --git a/src/Makefile.am b/src/Makefile.am index e88e21a60..02be7e443 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -475,7 +475,6 @@ lbrycrdd_LDADD = \ $(LIBBITCOIN_CONSENSUS) \ $(LIBBITCOIN_CRYPTO) \ $(LIBLEVELDB) \ - $(LIBLEVELDB_SSE42) \ $(LIBMEMENV) \ $(LIBSECP256K1) @@ -573,7 +572,7 @@ $(top_srcdir)/$(subdir)/config/bitcoin-config.h.in: $(am__configure_deps) clean-local: -$(MAKE) -C secp256k1 clean -$(MAKE) -C univalue clean - -rm -f leveldb/*/*.gcda leveldb/*/*.gcno leveldb/helpers/memenv/*.gcda leveldb/helpers/memenv/*.gcno + -$(MAKE) -C leveldb clean -rm -f config.h -rm -rf test/__pycache__ diff --git a/src/Makefile.bench.include b/src/Makefile.bench.include index 0462ce04f..68b813675 100644 --- a/src/Makefile.bench.include +++ b/src/Makefile.bench.include @@ -42,7 +42,6 @@ bench_bench_bitcoin_LDADD = \ $(LIBBITCOIN_CONSENSUS) \ $(LIBBITCOIN_CRYPTO) \ $(LIBLEVELDB) \ - $(LIBLEVELDB_SSE42) \ $(LIBMEMENV) \ $(LIBSECP256K1) \ $(LIBUNIVALUE) diff --git a/src/Makefile.leveldb.include b/src/Makefile.leveldb.include index 833f3d2a1..25ea1a355 100644 --- a/src/Makefile.leveldb.include +++ b/src/Makefile.leveldb.include @@ -2,148 +2,23 @@ # Distributed under the MIT software license, see the accompanying # file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+SUBDIRS = leveldb + LIBLEVELDB_INT = leveldb/libleveldb.a LIBMEMENV_INT = leveldb/libmemenv.a -LIBLEVELDB_SSE42_INT = leveldb/libleveldb_sse42.a EXTRA_LIBRARIES += $(LIBLEVELDB_INT) EXTRA_LIBRARIES += $(LIBMEMENV_INT) -EXTRA_LIBRARIES += $(LIBLEVELDB_SSE42_INT) LIBLEVELDB += $(LIBLEVELDB_INT) LIBMEMENV += $(LIBMEMENV_INT) -LIBLEVELDB_SSE42 = $(LIBLEVELDB_SSE42_INT) LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/include LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/helpers/memenv +LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb -LEVELDB_CPPFLAGS_INT = -LEVELDB_CPPFLAGS_INT += -I$(srcdir)/leveldb -LEVELDB_CPPFLAGS_INT += $(LEVELDB_TARGET_FLAGS) -LEVELDB_CPPFLAGS_INT += -DLEVELDB_ATOMIC_PRESENT -LEVELDB_CPPFLAGS_INT += -D__STDC_LIMIT_MACROS +leveldb/libleveldb.a: + $(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb libleveldb.a -if TARGET_WINDOWS -LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1 -else -LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_POSIX -endif - -leveldb_libleveldb_a_CPPFLAGS = $(AM_CPPFLAGS) $(LEVELDB_CPPFLAGS_INT) $(LEVELDB_CPPFLAGS) -leveldb_libleveldb_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS) - -leveldb_libleveldb_a_SOURCES= -leveldb_libleveldb_a_SOURCES += leveldb/port/atomic_pointer.h -leveldb_libleveldb_a_SOURCES += leveldb/port/port_example.h -leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.h -leveldb_libleveldb_a_SOURCES += leveldb/port/win/stdint.h -leveldb_libleveldb_a_SOURCES += leveldb/port/port.h -leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.h -leveldb_libleveldb_a_SOURCES += leveldb/port/thread_annotations.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/db.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/options.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/comparator.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/filter_policy.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/slice.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table_builder.h 
-leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/env.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/c.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/iterator.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/cache.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/dumpfile.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/write_batch.h -leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/status.h -leveldb_libleveldb_a_SOURCES += leveldb/db/log_format.h -leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.h -leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.h -leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch_internal.h -leveldb_libleveldb_a_SOURCES += leveldb/db/filename.h -leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.h -leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.h -leveldb_libleveldb_a_SOURCES += leveldb/db/builder.h -leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.h -leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.h -leveldb_libleveldb_a_SOURCES += leveldb/db/skiplist.h -leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.h -leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.h -leveldb_libleveldb_a_SOURCES += leveldb/db/snapshot.h -leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.h -leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.h -leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.h -leveldb_libleveldb_a_SOURCES += leveldb/table/block.h -leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.h -leveldb_libleveldb_a_SOURCES += leveldb/table/merger.h -leveldb_libleveldb_a_SOURCES += leveldb/table/format.h -leveldb_libleveldb_a_SOURCES += leveldb/table/iterator_wrapper.h -leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.h -leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix_test_helper.h -leveldb_libleveldb_a_SOURCES += leveldb/util/arena.h 
-leveldb_libleveldb_a_SOURCES += leveldb/util/random.h -leveldb_libleveldb_a_SOURCES += leveldb/util/posix_logger.h -leveldb_libleveldb_a_SOURCES += leveldb/util/hash.h -leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.h -leveldb_libleveldb_a_SOURCES += leveldb/util/coding.h -leveldb_libleveldb_a_SOURCES += leveldb/util/testutil.h -leveldb_libleveldb_a_SOURCES += leveldb/util/mutexlock.h -leveldb_libleveldb_a_SOURCES += leveldb/util/logging.h -leveldb_libleveldb_a_SOURCES += leveldb/util/testharness.h - -leveldb_libleveldb_a_SOURCES += leveldb/db/builder.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/c.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/dumpfile.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/filename.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/repair.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.cc -leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch.cc -leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.cc -leveldb_libleveldb_a_SOURCES += leveldb/table/block.cc -leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.cc -leveldb_libleveldb_a_SOURCES += leveldb/table/format.cc -leveldb_libleveldb_a_SOURCES += leveldb/table/iterator.cc -leveldb_libleveldb_a_SOURCES += leveldb/table/merger.cc -leveldb_libleveldb_a_SOURCES += leveldb/table/table_builder.cc -leveldb_libleveldb_a_SOURCES += leveldb/table/table.cc -leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/arena.cc -leveldb_libleveldb_a_SOURCES += 
leveldb/util/bloom.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/cache.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/coding.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/comparator.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/env.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/filter_policy.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/hash.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/logging.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/options.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/status.cc - -if TARGET_WINDOWS -leveldb_libleveldb_a_SOURCES += leveldb/util/env_win.cc -leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.cc -else -leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.cc -endif - -leveldb_libmemenv_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS) -leveldb_libmemenv_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS) -leveldb_libmemenv_a_SOURCES = leveldb/helpers/memenv/memenv.cc -leveldb_libmemenv_a_SOURCES += leveldb/helpers/memenv/memenv.h - -leveldb_libleveldb_sse42_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS) -leveldb_libleveldb_sse42_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS) -if ENABLE_HWCRC32 -leveldb_libleveldb_sse42_a_CPPFLAGS += -DLEVELDB_PLATFORM_POSIX_SSE -leveldb_libleveldb_sse42_a_CXXFLAGS += $(SSE42_CXXFLAGS) -endif -leveldb_libleveldb_sse42_a_SOURCES = leveldb/port/port_posix_sse.cc +leveldb/libmemenv.a: leveldb/libleveldb.a + $(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb libmemenv.a diff --git a/src/Makefile.qt.include b/src/Makefile.qt.include index 98371539a..74971f6ac 100644 --- a/src/Makefile.qt.include +++ b/src/Makefile.qt.include @@ -408,7 +408,7 @@ endif if ENABLE_ZMQ qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS) endif -qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) 
$(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \ +qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBMEMENV) \ $(BOOST_LIBS) $(QT_LIBS) $(QT_DBUS_LIBS) $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \ $(EVENT_PTHREADS_LIBS) $(EVENT_LIBS) qt_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS) diff --git a/src/Makefile.qttest.include b/src/Makefile.qttest.include index 450e9faf7..616e44284 100644 --- a/src/Makefile.qttest.include +++ b/src/Makefile.qttest.include @@ -63,7 +63,7 @@ if ENABLE_ZMQ qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS) endif qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) \ - $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \ + $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \ $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \ $(EVENT_PTHREADS_LIBS) $(EVENT_LIBS) qt_test_test_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS) diff --git a/src/Makefile.test.include b/src/Makefile.test.include index 0e18a3ba8..4f0ca5f06 100644 --- a/src/Makefile.test.include +++ b/src/Makefile.test.include @@ -122,7 +122,7 @@ test_test_lbrycrd_LDADD += $(LIBBITCOIN_WALLET) endif test_test_lbrycrd_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \ - $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS) + $(LIBLEVELDB) 
$(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS) test_test_lbrycrd_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS) test_test_lbrycrd_LDADD += $(LIBBITCOIN_CONSENSUS) $(BDB_LIBS) $(CRYPTO_LIBS) $(ICU_LIBS) $(MINIUPNPC_LIBS) diff --git a/src/claimtrie.cpp b/src/claimtrie.cpp index 0ddf7bad5..8fb414376 100644 --- a/src/claimtrie.cpp +++ b/src/claimtrie.cpp @@ -597,7 +597,7 @@ bool CClaimTrieCacheBase::flush() base->nNextHeight = nNextHeight; if (!nodesToAddOrUpdate.empty() && (LogAcceptCategory(BCLog::CLAIMS) || LogAcceptCategory(BCLog::BENCH))) { - LogPrintf("TrieCache size: %zu nodes on block %d, batch writes %zu bytes.\n", - nodesToAddOrUpdate.height(), nNextHeight, batch.SizeEstimate()); + LogPrintf("TrieCache size: %zu nodes on block %d, batch writes %zu bytes, db memory usage %zu.\n", + nodesToAddOrUpdate.height(), nNextHeight, batch.SizeEstimate(), base->db->DynamicMemoryUsage()); } auto ret = base->db->WriteBatch(batch); clear(); diff --git a/src/dbwrapper.cpp b/src/dbwrapper.cpp index dbbf9c877..8b9c5d906 100644 --- a/src/dbwrapper.cpp +++ b/src/dbwrapper.cpp @@ -97,11 +97,45 @@ static void SetMaxOpenFiles(leveldb::Options *options) { options->max_open_files, default_open_files); } +class CappedLenCache: public leveldb::Cache { + leveldb::Cache* inner; + std::size_t maxKeyLen; +public: + CappedLenCache(std::size_t capacity, std::size_t maxKeyLen) + : inner(leveldb::NewLRUCache(capacity)), maxKeyLen(maxKeyLen) {} + + ~CappedLenCache() override { delete inner; } + + Handle* Insert(const leveldb::Slice& key, void* value, size_t charge, + void (*deleter)(const leveldb::Slice& key, void* value)) override { + if (key.size() <= maxKeyLen) + return inner->Insert(key, value, charge, deleter); + deleter(key, value); + return nullptr; + } + + Handle* Lookup(const leveldb::Slice& key) override { return inner->Lookup(key); } + void Release(Handle* handle) override { return inner->Release(handle); } + void* Value(Handle* handle) override { return inner->Value(handle); } + void Erase(const leveldb::Slice& key) override 
{return inner->Erase(key); } + uint64_t NewId() override { return inner->NewId(); } +}; + static leveldb::Options GetOptions(size_t nCacheSize) { leveldb::Options options; - auto write_cache = std::min(nCacheSize / 4, size_t(16) << 20U); // cap write_cache at 16MB (4x default) + + options.filter_policy=leveldb::NewBloomFilterPolicy2(16); + options.write_buffer_size=60 * 1024 * 1024; + options.total_leveldb_mem=2500ULL * 1024ULL * 1024ULL; + options.env=leveldb::Env::Default(); + options.compression = leveldb::kNoCompression; + options.info_log = new CBitcoinLevelDBLogger(); + return options; + + auto write_cache = std::min(nCacheSize / 4, size_t(4 * 1024 * 1024)); // cap write_cache at 4MB (default) options.block_cache = leveldb::NewLRUCache(nCacheSize - write_cache * 2); + // options.block_cache = new CappedLenCache(nCacheSize - write_cache * 2, 6); options.write_buffer_size = write_cache; // up to two write buffers may be held in memory simultaneously options.filter_policy = leveldb::NewBloomFilterPolicy(10); options.compression = leveldb::kNoCompression; @@ -112,6 +146,7 @@ static leveldb::Options GetOptions(size_t nCacheSize) options.paranoid_checks = true; } SetMaxOpenFiles(&options); + options.max_open_files = 30000; return options; } diff --git a/src/dbwrapper.h b/src/dbwrapper.h index c20b64bc7..687760c08 100644 --- a/src/dbwrapper.h +++ b/src/dbwrapper.h @@ -81,7 +81,7 @@ public: ssValue.Xor(dbwrapper_private::GetObfuscateKey(parent)); leveldb::Slice slValue(ssValue.data(), ssValue.size()); - batch.Put(slKey, slValue); + batch.Put(slKey, slValue, nullptr); // LevelDB serializes writes as: // - byte: header // - varint: key length (1 byte up to 127B, 2 bytes up to 16383B, ...) 
diff --git a/src/leveldb/.gitignore b/src/leveldb/.gitignore deleted file mode 100644 index 71d87a4ee..000000000 --- a/src/leveldb/.gitignore +++ /dev/null @@ -1,13 +0,0 @@ -build_config.mk -*.a -*.o -*.dylib* -*.so -*.so.* -*_test -db_bench -leveldbutil -Release -Debug -Benchmark -vs2010.* diff --git a/src/leveldb/AUTHORS b/src/leveldb/AUTHORS index 2439d7a45..27a9407e5 100644 --- a/src/leveldb/AUTHORS +++ b/src/leveldb/AUTHORS @@ -6,7 +6,3 @@ Google Inc. # Initial version authors: Jeffrey Dean Sanjay Ghemawat - -# Partial list of contributors: -Kevin Regan -Johan Bilien diff --git a/src/leveldb/BASHO_RELEASES b/src/leveldb/BASHO_RELEASES new file mode 100644 index 000000000..56726135d --- /dev/null +++ b/src/leveldb/BASHO_RELEASES @@ -0,0 +1,72 @@ +github.com tag 2.0.34 - February 15, 2017 +----------------------------------------- +mv-hot-backup2: - correct MakeTieredDbname() within db/filename.cc + for case where dbname input is blank and fast/slow + already populated in options. Corrects issue + with hot backup in non-tiered storage situations + +github.com tag 2.0.33 - November 21, 2016 +----------------------------------------- +mv-bucket-expiry: - partial branch to enable X-Riak-Meta-Expiry-Base-Seconds + property within enterprise edition + +--- no 2.0.32 tag on leveldb --- + +github.com tag 2.0.31 - November 1, 2016 +---------------------------------------- + - version shipped with Riak 2.2 +mv-no-md-expiry: - Riak specific + - never convert a key prefix of sext:encoded "{md" to expiry + - update sst_scan for dumping Riak formated keys +mv-tuning8: - rework penalty rules in version_set.cc UpdatePenalty() + - add unit test framework for UpdatePenalty() + +github.com tag 2.0.30 - October 11, 2016 +---------------------------------------- +mv-delayed-bloom: - when opening an .sst table file, only load + bloom filter on second Get() operation. Saves time. 
+ - correct VersionSet::Finalize() logic for level 1 when + when level 2 is above desired size + - move hot backup to Riak ee build + +github.com tag 2.0.29 - September 13, 2016 +------------------------------------------ +mv-expiry-manifest: only switch to expiry enabled manifest format + if expiry function enabled. Eases downgrade + during early Riak releases containing expiry + +github.com tag 2.0.28 - September 6, 2016 +----------------------------------------- +mv-hot-backup: add externally triggered hot backup feature + +github.com tag 2.0.27 - August 22, 2016 +--------------------------------------- +mv-mem-fences: fix iterator double delete bug in eleveldb and + build better memory fenced operations for referenced count objects. + +github.com tag 2.0.26 - August 21, 2016 +--------------------------------------- +mv-expiry-iter-bug: DBImpl::NewIterator() was not setting the new expiry parameter. + +github.com tag 2.0.25 - August 10, 2016 +--------------------------------------- +Make LZ4 the default compression instead of Snappy. + +github.com tag 2.0.24 - August 2, 2016 +-------------------------------------- +mv-expiry: open source expiry. Supports one expiry policy for all databases. + +github.com tag 2.0.23 - July 20, 2016 +------------------------------------- +mv-no-semaphore: remove semaphore controlled thread in hot_threads.cc. Instead use + use mutex of thread 0 (only one thread's mutex) to address know race condition. + +github.com tag 2.0.22 - June 22, 2016 +------------------------------------- +no change: iterator fix in eleveldb + +github.com tag 2.0.21 - June 16, 2016 +------------------------------------- +branch mv-iterator-hot-threads: correct condition where eleveldb MoveTask + could hang an iterator. 
(https://github.com/basho/leveldb/wiki/mv-iterator-hot-threads) + diff --git a/src/leveldb/CONTRIBUTING.md b/src/leveldb/CONTRIBUTING.md deleted file mode 100644 index cd600ff46..000000000 --- a/src/leveldb/CONTRIBUTING.md +++ /dev/null @@ -1,36 +0,0 @@ -# Contributing - -We'd love to accept your code patches! However, before we can take them, we -have to jump a couple of legal hurdles. - -## Contributor License Agreements - -Please fill out either the individual or corporate Contributor License -Agreement as appropriate. - -* If you are an individual writing original source code and you're sure you -own the intellectual property, then sign an [individual CLA](https://developers.google.com/open-source/cla/individual). -* If you work for a company that wants to allow you to contribute your work, -then sign a [corporate CLA](https://developers.google.com/open-source/cla/corporate). - -Follow either of the two links above to access the appropriate CLA and -instructions for how to sign and return it. - -## Submitting a Patch - -1. Sign the contributors license agreement above. -2. Decide which code you want to submit. A submission should be a set of changes -that addresses one issue in the [issue tracker](https://github.com/google/leveldb/issues). -Please don't mix more than one logical change per submission, because it makes -the history hard to follow. If you want to make a change -(e.g. add a sample or feature) that doesn't have a corresponding issue in the -issue tracker, please create one. -3. **Submitting**: When you are ready to submit, send us a Pull Request. Be -sure to include the issue number you fixed and the name you used to sign -the CLA. - -## Writing Code ## - -If your contribution contains code, please make sure that it follows -[the style guide](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml). -Otherwise we will have to ask you to make changes, and that's no fun for anyone. 
diff --git a/src/leveldb/Makefile b/src/leveldb/Makefile index f7cc7d736..dbe1d7bf3 100644 --- a/src/leveldb/Makefile +++ b/src/leveldb/Makefile @@ -2,423 +2,219 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. See the AUTHORS file for names of contributors. +# Inherit some settings from environment variables, if available +INSTALL_PATH ?= $(CURDIR) + #----------------------------------------------- # Uncomment exactly one of the lines labelled (A), (B), and (C) below # to switch between compilation modes. +# NOTE: targets "debug" and "prof" provide same functionality +# NOTE 2: -DNDEBUG disables assert() statements within C code, +# i.e. no assert()s in production code -# (A) Production use (optimized mode) -OPT ?= -O2 -DNDEBUG -# (B) Debug mode, w/ full line-level debugging symbols -# OPT ?= -g2 -# (C) Profiling mode: opt, but w/debugging symbols -# OPT ?= -O2 -g2 -DNDEBUG +OPT ?= -O2 -g -DNDEBUG # (A) Production use (optimized mode) +# OPT ?= -g2 # (B) Debug mode, w/ full line-level debugging symbols +# OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols #----------------------------------------------- # detect what platform we're building on -$(shell CC="$(CC)" CXX="$(CXX)" TARGET_OS="$(TARGET_OS)" \ - ./build_detect_platform build_config.mk ./) +ifeq ($(wildcard build_config.mk),) +$(shell ./build_detect_platform build_config.mk) +endif # this file is generated by the previous line to set build flags and sources include build_config.mk -TESTS = \ - db/autocompact_test \ - db/c_test \ - db/corruption_test \ - db/db_test \ - db/dbformat_test \ - db/fault_injection_test \ - db/filename_test \ - db/log_test \ - db/recovery_test \ - db/skiplist_test \ - db/version_edit_test \ - db/version_set_test \ - db/write_batch_test \ - helpers/memenv/memenv_test \ - issues/issue178_test \ - issues/issue200_test \ - table/filter_block_test \ - table/table_test \ - util/arena_test \ - util/bloom_test 
\ - util/cache_test \ - util/coding_test \ - util/crc32c_test \ - util/env_posix_test \ - util/env_test \ - util/hash_test - -UTILS = \ - db/db_bench \ - db/leveldbutil - -# Put the object files in a subdirectory, but the application at the top of the object dir. -PROGNAMES := $(notdir $(TESTS) $(UTILS)) - -# On Linux may need libkyotocabinet-dev for dependency. -BENCHMARKS = \ - doc/bench/db_bench_sqlite3 \ - doc/bench/db_bench_tree_db - CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) LDFLAGS += $(PLATFORM_LDFLAGS) -LIBS += $(PLATFORM_LIBS) -SIMULATOR_OUTDIR=out-ios-x86 -DEVICE_OUTDIR=out-ios-arm +LIBOBJECTS := $(SOURCES:.cc=.o) +LIBOBJECTS += util/lz4.o +MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o) +DEPEND := $(SOURCES:.cc=.d) -ifeq ($(PLATFORM), IOS) -# Note: iOS should probably be using libtool, not ar. -AR=xcrun ar -SIMULATORSDK=$(shell xcrun -sdk iphonesimulator --show-sdk-path) -DEVICESDK=$(shell xcrun -sdk iphoneos --show-sdk-path) -DEVICE_CFLAGS = -isysroot "$(DEVICESDK)" -arch armv6 -arch armv7 -arch armv7s -arch arm64 -SIMULATOR_CFLAGS = -isysroot "$(SIMULATORSDK)" -arch i686 -arch x86_64 -STATIC_OUTDIR=out-ios-universal +TESTUTIL = ./util/testutil.o +TESTHARNESS = ./util/testharness.o $(TESTUTIL) + +TESTS := $(sort $(notdir $(basename $(TEST_SOURCES)))) + +TOOLS = \ + leveldb_repair \ + perf_dump \ + sst_rewrite \ + sst_scan + +PROGRAMS = db_bench $(TESTS) $(TOOLS) +BENCHMARKS = db_bench_sqlite3 db_bench_tree_db + +LIBRARY = libleveldb.a +MEMENVLIBRARY = libmemenv.a + +# +# static link leveldb to tools to simplify platform usage (if Linux) +# +ifeq ($(PLATFORM),OS_LINUX) +LEVEL_LDFLAGS := -L . -Wl,-non_shared -lleveldb -Wl,-call_shared else -STATIC_OUTDIR=out-static -SHARED_OUTDIR=out-shared -STATIC_PROGRAMS := $(addprefix $(STATIC_OUTDIR)/, $(PROGNAMES)) -SHARED_PROGRAMS := $(addprefix $(SHARED_OUTDIR)/, db_bench) +LEVEL_LDFLAGS := -L . 
-lleveldb endif -STATIC_LIBOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(SOURCES:.cc=.o)) -STATIC_MEMENVOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o)) - -DEVICE_LIBOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(SOURCES:.cc=.o)) -DEVICE_MEMENVOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o)) - -SIMULATOR_LIBOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(SOURCES:.cc=.o)) -SIMULATOR_MEMENVOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o)) - -SHARED_LIBOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(SOURCES:.cc=.o)) -SHARED_MEMENVOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o)) - -TESTUTIL := $(STATIC_OUTDIR)/util/testutil.o -TESTHARNESS := $(STATIC_OUTDIR)/util/testharness.o $(TESTUTIL) - -STATIC_TESTOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(TESTS))) -STATIC_UTILOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(UTILS))) -STATIC_ALLOBJS := $(STATIC_LIBOBJECTS) $(STATIC_MEMENVOBJECTS) $(STATIC_TESTOBJS) $(STATIC_UTILOBJS) $(TESTHARNESS) -DEVICE_ALLOBJS := $(DEVICE_LIBOBJECTS) $(DEVICE_MEMENVOBJECTS) -SIMULATOR_ALLOBJS := $(SIMULATOR_LIBOBJECTS) $(SIMULATOR_MEMENVOBJECTS) - default: all # Should we build shared libraries? ifneq ($(PLATFORM_SHARED_EXT),) -# Many leveldb test apps use non-exported API's. Only build a subset for testing. -SHARED_ALLOBJS := $(SHARED_LIBOBJECTS) $(SHARED_MEMENVOBJECTS) $(TESTHARNESS) - ifneq ($(PLATFORM_SHARED_VERSIONED),true) -SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT) -SHARED_LIB2 = $(SHARED_LIB1) -SHARED_LIB3 = $(SHARED_LIB1) -SHARED_LIBS = $(SHARED_LIB1) -SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a +SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) +SHARED2 = $(SHARED1) +SHARED3 = $(SHARED1) +SHARED = $(SHARED1) else # Update db.h if you change these. 
-SHARED_VERSION_MAJOR = 1 -SHARED_VERSION_MINOR = 20 -SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT) -SHARED_LIB2 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR) -SHARED_LIB3 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR).$(SHARED_VERSION_MINOR) -SHARED_LIBS = $(SHARED_OUTDIR)/$(SHARED_LIB1) $(SHARED_OUTDIR)/$(SHARED_LIB2) $(SHARED_OUTDIR)/$(SHARED_LIB3) -$(SHARED_OUTDIR)/$(SHARED_LIB1): $(SHARED_OUTDIR)/$(SHARED_LIB3) - ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB1) -$(SHARED_OUTDIR)/$(SHARED_LIB2): $(SHARED_OUTDIR)/$(SHARED_LIB3) - ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB2) -SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a +SHARED_MAJOR = 1 +SHARED_MINOR = 9 +SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) +SHARED2 = $(SHARED1).$(SHARED_MAJOR) +SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) +SHARED = $(SHARED1) $(SHARED2) $(SHARED3) +$(SHARED1): $(SHARED3) + ln -fs $(SHARED3) $(SHARED1) +$(SHARED2): $(SHARED3) + ln -fs $(SHARED3) $(SHARED2) endif -$(SHARED_OUTDIR)/$(SHARED_LIB3): $(SHARED_LIBOBJECTS) - $(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED_LIB2) $(SHARED_LIBOBJECTS) -o $(SHARED_OUTDIR)/$(SHARED_LIB3) $(LIBS) +$(SHARED3): $(LIBOBJECTS) + $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LIBOBJECTS) -o $(SHARED3) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) endif # PLATFORM_SHARED_EXT -all: $(SHARED_LIBS) $(SHARED_PROGRAMS) $(STATIC_OUTDIR)/libleveldb.a $(STATIC_OUTDIR)/libmemenv.a $(STATIC_PROGRAMS) +all: $(SHARED) $(LIBRARY) -check: $(STATIC_PROGRAMS) - for t in $(notdir $(TESTS)); do echo "***** Running $$t"; $(STATIC_OUTDIR)/$$t || exit 1; done +test check: all $(PROGRAMS) $(TESTS) + for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done -clean: - -rm -rf out-static out-shared out-ios-x86 out-ios-arm out-ios-universal - -rm -f build_config.mk - -rm -rf ios-x86 ios-arm +tools: all $(TOOLS) -$(STATIC_OUTDIR): - mkdir $@ - -$(STATIC_OUTDIR)/db: | $(STATIC_OUTDIR) - mkdir $@ - -$(STATIC_OUTDIR)/helpers/memenv: | $(STATIC_OUTDIR) 
- mkdir -p $@ - -$(STATIC_OUTDIR)/port: | $(STATIC_OUTDIR) - mkdir $@ - -$(STATIC_OUTDIR)/table: | $(STATIC_OUTDIR) - mkdir $@ - -$(STATIC_OUTDIR)/util: | $(STATIC_OUTDIR) - mkdir $@ - -.PHONY: STATIC_OBJDIRS -STATIC_OBJDIRS: \ - $(STATIC_OUTDIR)/db \ - $(STATIC_OUTDIR)/port \ - $(STATIC_OUTDIR)/table \ - $(STATIC_OUTDIR)/util \ - $(STATIC_OUTDIR)/helpers/memenv - -$(SHARED_OUTDIR): - mkdir $@ - -$(SHARED_OUTDIR)/db: | $(SHARED_OUTDIR) - mkdir $@ - -$(SHARED_OUTDIR)/helpers/memenv: | $(SHARED_OUTDIR) - mkdir -p $@ - -$(SHARED_OUTDIR)/port: | $(SHARED_OUTDIR) - mkdir $@ - -$(SHARED_OUTDIR)/table: | $(SHARED_OUTDIR) - mkdir $@ - -$(SHARED_OUTDIR)/util: | $(SHARED_OUTDIR) - mkdir $@ - -.PHONY: SHARED_OBJDIRS -SHARED_OBJDIRS: \ - $(SHARED_OUTDIR)/db \ - $(SHARED_OUTDIR)/port \ - $(SHARED_OUTDIR)/table \ - $(SHARED_OUTDIR)/util \ - $(SHARED_OUTDIR)/helpers/memenv - -$(DEVICE_OUTDIR): - mkdir $@ - -$(DEVICE_OUTDIR)/db: | $(DEVICE_OUTDIR) - mkdir $@ - -$(DEVICE_OUTDIR)/helpers/memenv: | $(DEVICE_OUTDIR) - mkdir -p $@ - -$(DEVICE_OUTDIR)/port: | $(DEVICE_OUTDIR) - mkdir $@ - -$(DEVICE_OUTDIR)/table: | $(DEVICE_OUTDIR) - mkdir $@ - -$(DEVICE_OUTDIR)/util: | $(DEVICE_OUTDIR) - mkdir $@ - -.PHONY: DEVICE_OBJDIRS -DEVICE_OBJDIRS: \ - $(DEVICE_OUTDIR)/db \ - $(DEVICE_OUTDIR)/port \ - $(DEVICE_OUTDIR)/table \ - $(DEVICE_OUTDIR)/util \ - $(DEVICE_OUTDIR)/helpers/memenv - -$(SIMULATOR_OUTDIR): - mkdir $@ - -$(SIMULATOR_OUTDIR)/db: | $(SIMULATOR_OUTDIR) - mkdir $@ - -$(SIMULATOR_OUTDIR)/helpers/memenv: | $(SIMULATOR_OUTDIR) - mkdir -p $@ - -$(SIMULATOR_OUTDIR)/port: | $(SIMULATOR_OUTDIR) - mkdir $@ - -$(SIMULATOR_OUTDIR)/table: | $(SIMULATOR_OUTDIR) - mkdir $@ - -$(SIMULATOR_OUTDIR)/util: | $(SIMULATOR_OUTDIR) - mkdir $@ - -.PHONY: SIMULATOR_OBJDIRS -SIMULATOR_OBJDIRS: \ - $(SIMULATOR_OUTDIR)/db \ - $(SIMULATOR_OUTDIR)/port \ - $(SIMULATOR_OUTDIR)/table \ - $(SIMULATOR_OUTDIR)/util \ - $(SIMULATOR_OUTDIR)/helpers/memenv - -$(STATIC_ALLOBJS): | STATIC_OBJDIRS -$(DEVICE_ALLOBJS): | 
DEVICE_OBJDIRS -$(SIMULATOR_ALLOBJS): | SIMULATOR_OBJDIRS -$(SHARED_ALLOBJS): | SHARED_OBJDIRS - -ifeq ($(PLATFORM), IOS) -$(DEVICE_OUTDIR)/libleveldb.a: $(DEVICE_LIBOBJECTS) - rm -f $@ - $(AR) -rs $@ $(DEVICE_LIBOBJECTS) - -$(SIMULATOR_OUTDIR)/libleveldb.a: $(SIMULATOR_LIBOBJECTS) - rm -f $@ - $(AR) -rs $@ $(SIMULATOR_LIBOBJECTS) - -$(DEVICE_OUTDIR)/libmemenv.a: $(DEVICE_MEMENVOBJECTS) - rm -f $@ - $(AR) -rs $@ $(DEVICE_MEMENVOBJECTS) - -$(SIMULATOR_OUTDIR)/libmemenv.a: $(SIMULATOR_MEMENVOBJECTS) - rm -f $@ - $(AR) -rs $@ $(SIMULATOR_MEMENVOBJECTS) - -# For iOS, create universal object libraries to be used on both the simulator and -# a device. -$(STATIC_OUTDIR)/libleveldb.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a - lipo -create $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a -output $@ - -$(STATIC_OUTDIR)/libmemenv.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a - lipo -create $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a -output $@ -else -$(STATIC_OUTDIR)/libleveldb.a:$(STATIC_LIBOBJECTS) - rm -f $@ - $(AR) -rs $@ $(STATIC_LIBOBJECTS) - -$(STATIC_OUTDIR)/libmemenv.a:$(STATIC_MEMENVOBJECTS) - rm -f $@ - $(AR) -rs $@ $(STATIC_MEMENVOBJECTS) +# +# command line targets: debug and prof +# just like +ifneq ($(filter debug,$(MAKECMDGOALS)),) +OPT := -g2 # (B) Debug mode, w/ full line-level debugging symbols +debug: all endif -$(SHARED_MEMENVLIB):$(SHARED_MEMENVOBJECTS) +ifneq ($(filter prof,$(MAKECMDGOALS)),) +OPT := -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols +prof: all +endif + + +clean: + -rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) */*.o */*/*.o */*.d */*/*.d ios-x86/*/*.o ios-arm/*/*.o build_config.mk include/leveldb/ldb_config.h + -rm -rf ios-x86/* ios-arm/* *.dSYM + + +$(LIBRARY): $(LIBOBJECTS) rm -f $@ - $(AR) -rs $@ $(SHARED_MEMENVOBJECTS) + $(AR) -rs $@ $(LIBOBJECTS) -$(STATIC_OUTDIR)/db_bench:db/db_bench.cc 
$(STATIC_LIBOBJECTS) $(TESTUTIL) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ $(LIBS) +# +# all tools, programs, and tests depend upon the static library +$(TESTS) $(PROGRAMS) $(TOOLS) : $(LIBRARY) -$(STATIC_OUTDIR)/db_bench_sqlite3:doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) - $(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lsqlite3 $(LIBS) +# +# all tests depend upon the test harness +$(TESTS) : $(TESTHARNESS) -$(STATIC_OUTDIR)/db_bench_tree_db:doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) - $(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lkyotocabinet $(LIBS) +# +# tools, programs, and tests will compile to the root directory +# but their .cc source file will be in one of the following subdirectories +vpath %.cc db:table:util:leveldb_ee:leveldb_os -$(STATIC_OUTDIR)/leveldbutil:db/leveldbutil.cc $(STATIC_LIBOBJECTS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/leveldbutil.cc $(STATIC_LIBOBJECTS) -o $@ $(LIBS) +# special case for c_test +vpath %.c db -$(STATIC_OUTDIR)/arena_test:util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +db_bench: db/db_bench.o $(LIBRARY) $(TESTUTIL) + $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTUTIL) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS) -$(STATIC_OUTDIR)/autocompact_test:db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBRARY) $(TESTUTIL) -$(STATIC_OUTDIR)/bloom_test:util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBRARY) $(TESTUTIL) 
-$(STATIC_OUTDIR)/c_test:$(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) -$(STATIC_OUTDIR)/cache_test:util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +# +# build line taken from lz4 makefile +# +util/lz4.o: util/lz4.c util/lz4.h + $(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -O3 -std=c99 -Wall -Wextra -Wundef -Wshadow -Wcast-qual -Wcast-align -Wstrict-prototypes -pedantic -DLZ4_VERSION=\"r130\" -c util/lz4.c -o util/lz4.o -$(STATIC_OUTDIR)/coding_test:util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +# +# memory env +# +$(MEMENVLIBRARY) : $(MEMENVOBJECTS) + rm -f $@ + $(AR) -rs $@ $(MEMENVOBJECTS) -$(STATIC_OUTDIR)/corruption_test:db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) + $(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS) -$(STATIC_OUTDIR)/crc32c_test:util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +# +# IOS build +# +ifeq ($(PLATFORM), IOS) +# For iOS, create universal object files to be used on both the simulator and +# a device. 
+PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms +SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer +DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer +IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString) -$(STATIC_OUTDIR)/db_test:db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +.cc.o: + mkdir -p ios-x86/$(dir $@) + $(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ + mkdir -p ios-arm/$(dir $@) + $(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ + lipo ios-x86/$@ ios-arm/$@ -create -output $@ -$(STATIC_OUTDIR)/dbformat_test:db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +.c.o: + mkdir -p ios-x86/$(dir $@) + $(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ + mkdir -p ios-arm/$(dir $@) + $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ + lipo ios-x86/$@ ios-arm/$@ -create -output $@ -$(STATIC_OUTDIR)/env_posix_test:util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/env_test:util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/fault_injection_test:db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/fault_injection_test.cc 
$(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/filename_test:db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/filter_block_test:table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/hash_test:util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/issue178_test:issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/issue200_test:issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/log_test:db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/recovery_test:db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/table_test:table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/skiplist_test:db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/version_edit_test:db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - 
-$(STATIC_OUTDIR)/version_set_test:db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/write_batch_test:db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) $(CXXFLAGS) db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) - -$(STATIC_OUTDIR)/memenv_test:$(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS) - $(XCRUN) $(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS) -o $@ $(LIBS) - -$(SHARED_OUTDIR)/db_bench:$(SHARED_OUTDIR)/db/db_bench.o $(SHARED_LIBS) $(TESTUTIL) - $(XCRUN) $(CXX) $(LDFLAGS) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SHARED_OUTDIR)/db/db_bench.o $(TESTUTIL) $(SHARED_OUTDIR)/$(SHARED_LIB3) -o $@ $(LIBS) - -.PHONY: run-shared -run-shared: $(SHARED_OUTDIR)/db_bench - LD_LIBRARY_PATH=$(SHARED_OUTDIR) $(SHARED_OUTDIR)/db_bench - -$(SIMULATOR_OUTDIR)/%.o: %.cc - xcrun -sdk iphonesimulator $(CXX) $(CXXFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@ - -$(DEVICE_OUTDIR)/%.o: %.cc - xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) $(DEVICE_CFLAGS) -c $< -o $@ - -$(SIMULATOR_OUTDIR)/%.o: %.c - xcrun -sdk iphonesimulator $(CC) $(CFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@ - -$(DEVICE_OUTDIR)/%.o: %.c - xcrun -sdk iphoneos $(CC) $(CFLAGS) $(DEVICE_CFLAGS) -c $< -o $@ - -$(STATIC_OUTDIR)/%.o: %.cc - $(CXX) $(CXXFLAGS) -c $< -o $@ - -$(STATIC_OUTDIR)/%.o: %.c - $(CC) $(CFLAGS) -c $< -o $@ - -$(SHARED_OUTDIR)/%.o: %.cc +else +# +# build for everything NOT IOS +# +.cc.o: $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@ -$(SHARED_OUTDIR)/%.o: %.c +.c.o: $(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@ -$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc - $(CXX) $(CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@ +## @echo -- Creating dependency 
file for $< +%.d: %.cc + $(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM -E -MT $(basename $@).d -MT $(basename $@).o -MF $@ $< + @echo $(basename $@).o: $(basename $@).d >>$@ -$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc - $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@ +# generic build for command line tests +%: %.cc + $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS) + +%: db/%.c + $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS) + +# for tools, omits test harness +%: tools/%.cc + $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS) + +endif + +# +# load dependency files +# +ifeq ($(filter tar clean allclean distclean,$(MAKECMDGOALS)),) +-include $(DEPEND) +endif diff --git a/src/leveldb/README b/src/leveldb/README new file mode 100644 index 000000000..6a3677406 --- /dev/null +++ b/src/leveldb/README @@ -0,0 +1,83 @@ +leveldb: A key-value store +Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) + +The original Google README is now README.GOOGLE. + +** Introduction + +This repository contains the Google source code as modified to benefit +the Riak environment. The typical Riak environment has two attributes +that necessitate leveldb adjustments, both in options and code: + +- production servers: Riak often runs in heavy Internet environments: + servers with many CPU cores, lots of memory, and 24x7 disk activity. + Basho's leveldb takes advantage of the environment by adding + hardware CRC calculation, increasing Bloom filter accuracy, and + defaulting to integrity checking enabled. + +- multiple databases open: Riak opens 8 to 128 databases + simultaneously. Google's leveldb supports this, but its background + compaction thread can fall behind. leveldb will "stall" new user + writes whenever the compaction thread gets too far behind. 
Basho's + leveldb modification include multiple thread blocks that each + contain prioritized threads for specific compaction activities. + +Details for Basho's customizations exist in the leveldb wiki: + + http://github.com/basho/leveldb/wiki + + +** Branch pattern + +This repository follows the Basho standard for branch management +as of November 28, 2013. The standard is found here: + +https://github.com/basho/riak/wiki/Basho-repository-management + +In summary, the "develop" branch contains the most recently reviewed +engineering work. The "master" branch contains the most recently +released work, i.e. distributed as part of a Riak release. + + +** Basic options needed + +Those wishing to truly savor the benefits of Basho's modifications +need to initialize a new leveldb::Options structure similar to the +following before each call to leveldb::DB::Open: + + leveldb::Options * options; + + options=new Leveldb::Options; + + options.filter_policy=leveldb::NewBloomFilterPolicy2(16); + options.write_buffer_size=62914560; // 60Mbytes + options.total_leveldb_mem=2684354560; // 2.5Gbytes (details below) + options.env=leveldb::Env::Default(); + + +** Memory plan + +Basho's leveldb dramatically departed from Google's original internal +memory allotment plan with Riak 2.0. Basho's leveldb uses a methodology +called flexcache. The technical details are here: + + https://github.com/basho/leveldb/wiki/mv-flexcache + +The key points are: + +- options.total_leveldb_mem is an allocation for the entire process, + not a single database + +- giving different values to options.total_leveldb_mem on subsequent Open + calls causes memory to rearrange to current value across all databases + +- recommended minimum for Basho's leveldb is 340Mbytes per database. + +- performance improves rapidly from 340Mbytes to 2.5Gbytes per database (3.0Gbytes + if using Riak's active anti-entropy). Even more is nice, but not as helpful. 
+ +- never assign more than 75% of available RAM to total_leveldb_mem. There is + too much unaccounted memory overhead (worse if you use tcmalloc library). + +- options.max_open_files and options.block_cache should not be used. + diff --git a/src/leveldb/README.GOOGLE b/src/leveldb/README.GOOGLE new file mode 100644 index 000000000..3618adeee --- /dev/null +++ b/src/leveldb/README.GOOGLE @@ -0,0 +1,51 @@ +leveldb: A key-value store +Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) + +The code under this directory implements a system for maintaining a +persistent key/value store. + +See doc/index.html for more explanation. +See doc/impl.html for a brief overview of the implementation. + +The public interface is in include/*.h. Callers should not include or +rely on the details of any other header files in this package. Those +internal APIs may be changed without warning. + +Guide to header files: + +include/db.h + Main interface to the DB: Start here + +include/options.h + Control over the behavior of an entire database, and also + control over the behavior of individual reads and writes. + +include/comparator.h + Abstraction for user-specified comparison function. If you want + just bytewise comparison of keys, you can use the default comparator, + but clients can write their own comparator implementations if they + want custom ordering (e.g. to handle different character + encodings, etc.) + +include/iterator.h + Interface for iterating over data. You can get an iterator + from a DB object. + +include/write_batch.h + Interface for atomically applying multiple updates to a database. + +include/slice.h + A simple module for maintaining a pointer and a length into some + other byte array. + +include/status.h + Status is returned from many of the public interfaces and is used + to report success and various kinds of errors. + +include/env.h + Abstraction of the OS environment. 
A posix implementation of + this interface is in util/env_posix.cc + +include/table.h +include/table_builder.h + Lower-level modules that most clients probably won't use directly diff --git a/src/leveldb/README.md b/src/leveldb/README.md deleted file mode 100644 index a010c5085..000000000 --- a/src/leveldb/README.md +++ /dev/null @@ -1,174 +0,0 @@ -**LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.** - -[![Build Status](https://travis-ci.org/google/leveldb.svg?branch=master)](https://travis-ci.org/google/leveldb) - -Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) - -# Features - * Keys and values are arbitrary byte arrays. - * Data is stored sorted by key. - * Callers can provide a custom comparison function to override the sort order. - * The basic operations are `Put(key,value)`, `Get(key)`, `Delete(key)`. - * Multiple changes can be made in one atomic batch. - * Users can create a transient snapshot to get a consistent view of data. - * Forward and backward iteration is supported over the data. - * Data is automatically compressed using the [Snappy compression library](http://google.github.io/snappy/). - * External activity (file system operations etc.) is relayed through a virtual interface so users can customize the operating system interactions. - -# Documentation - [LevelDB library documentation](https://github.com/google/leveldb/blob/master/doc/index.md) is online and bundled with the source code. - - -# Limitations - * This is not a SQL database. It does not have a relational data model, it does not support SQL queries, and it has no support for indexes. - * Only a single process (possibly multi-threaded) can access a particular database at a time. - * There is no client-server support builtin to the library. An application that needs such support will have to wrap their own server around the library. 
- -# Contributing to the leveldb Project -The leveldb project welcomes contributions. leveldb's primary goal is to be -a reliable and fast key/value store. Changes that are in line with the -features/limitations outlined above, and meet the requirements below, -will be considered. - -Contribution requirements: - -1. **POSIX only**. We _generally_ will only accept changes that are both - compiled, and tested on a POSIX platform - usually Linux. Very small - changes will sometimes be accepted, but consider that more of an - exception than the rule. - -2. **Stable API**. We strive very hard to maintain a stable API. Changes that - require changes for projects using leveldb _might_ be rejected without - sufficient benefit to the project. - -3. **Tests**: All changes must be accompanied by a new (or changed) test, or - a sufficient explanation as to why a new (or changed) test is not required. - -## Submitting a Pull Request -Before any pull request will be accepted the author must first sign a -Contributor License Agreement (CLA) at https://cla.developers.google.com/. - -In order to keep the commit timeline linear -[squash](https://git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Squashing-Commits) -your changes down to a single commit and [rebase](https://git-scm.com/docs/git-rebase) -on google/leveldb/master. This keeps the commit timeline linear and more easily sync'ed -with the internal repository at Google. More information at GitHub's -[About Git rebase](https://help.github.com/articles/about-git-rebase/) page. - -# Performance - -Here is a performance report (with explanations) from the run of the -included db_bench program. The results are somewhat noisy, but should -be enough to get a ballpark performance estimate. - -## Setup - -We use a database with a million entries. Each entry has a 16 byte -key, and a 100 byte value. Values used by the benchmark compress to -about half their original size. 
- - LevelDB: version 1.1 - Date: Sun May 1 12:11:26 2011 - CPU: 4 x Intel(R) Core(TM)2 Quad CPU Q6600 @ 2.40GHz - CPUCache: 4096 KB - Keys: 16 bytes each - Values: 100 bytes each (50 bytes after compression) - Entries: 1000000 - Raw Size: 110.6 MB (estimated) - File Size: 62.9 MB (estimated) - -## Write performance - -The "fill" benchmarks create a brand new database, in either -sequential, or random order. The "fillsync" benchmark flushes data -from the operating system to the disk after every operation; the other -write operations leave the data sitting in the operating system buffer -cache for a while. The "overwrite" benchmark does random writes that -update existing keys in the database. - - fillseq : 1.765 micros/op; 62.7 MB/s - fillsync : 268.409 micros/op; 0.4 MB/s (10000 ops) - fillrandom : 2.460 micros/op; 45.0 MB/s - overwrite : 2.380 micros/op; 46.5 MB/s - -Each "op" above corresponds to a write of a single key/value pair. -I.e., a random write benchmark goes at approximately 400,000 writes per second. - -Each "fillsync" operation costs much less (0.3 millisecond) -than a disk seek (typically 10 milliseconds). We suspect that this is -because the hard disk itself is buffering the update in its memory and -responding before the data has been written to the platter. This may -or may not be safe based on whether or not the hard disk has enough -power to save its memory in the event of a power failure. - -## Read performance - -We list the performance of reading sequentially in both the forward -and reverse direction, and also the performance of a random lookup. -Note that the database created by the benchmark is quite small. -Therefore the report characterizes the performance of leveldb when the -working set fits in memory. The cost of reading a piece of data that -is not present in the operating system buffer cache will be dominated -by the one or two disk seeks needed to fetch the data from disk. 
-Write performance will be mostly unaffected by whether or not the -working set fits in memory. - - readrandom : 16.677 micros/op; (approximately 60,000 reads per second) - readseq : 0.476 micros/op; 232.3 MB/s - readreverse : 0.724 micros/op; 152.9 MB/s - -LevelDB compacts its underlying storage data in the background to -improve read performance. The results listed above were done -immediately after a lot of random writes. The results after -compactions (which are usually triggered automatically) are better. - - readrandom : 11.602 micros/op; (approximately 85,000 reads per second) - readseq : 0.423 micros/op; 261.8 MB/s - readreverse : 0.663 micros/op; 166.9 MB/s - -Some of the high cost of reads comes from repeated decompression of blocks -read from disk. If we supply enough cache to the leveldb so it can hold the -uncompressed blocks in memory, the read performance improves again: - - readrandom : 9.775 micros/op; (approximately 100,000 reads per second before compaction) - readrandom : 5.215 micros/op; (approximately 190,000 reads per second after compaction) - -## Repository contents - -See [doc/index.md](doc/index.md) for more explanation. See -[doc/impl.md](doc/impl.md) for a brief overview of the implementation. - -The public interface is in include/*.h. Callers should not include or -rely on the details of any other header files in this package. Those -internal APIs may be changed without warning. - -Guide to header files: - -* **include/db.h**: Main interface to the DB: Start here - -* **include/options.h**: Control over the behavior of an entire database, -and also control over the behavior of individual reads and writes. - -* **include/comparator.h**: Abstraction for user-specified comparison function. -If you want just bytewise comparison of keys, you can use the default -comparator, but clients can write their own comparator implementations if they -want custom ordering (e.g. to handle different character encodings, etc.) 
- -* **include/iterator.h**: Interface for iterating over data. You can get -an iterator from a DB object. - -* **include/write_batch.h**: Interface for atomically applying multiple -updates to a database. - -* **include/slice.h**: A simple module for maintaining a pointer and a -length into some other byte array. - -* **include/status.h**: Status is returned from many of the public interfaces -and is used to report success and various kinds of errors. - -* **include/env.h**: -Abstraction of the OS environment. A posix implementation of this interface is -in util/env_posix.cc - -* **include/table.h, include/table_builder.h**: Lower-level modules that most -clients probably won't use directly diff --git a/src/leveldb/TODO b/src/leveldb/TODO index e603c0713..9130b6a9f 100644 --- a/src/leveldb/TODO +++ b/src/leveldb/TODO @@ -7,7 +7,6 @@ db within [start_key..end_key]? For Chrome, deletion of obsolete object stores, etc. can be done in the background anyway, so probably not that important. -- There have been requests for MultiGet. After a range is completely deleted, what gets rid of the corresponding files if we do no future changes to that range. Make diff --git a/src/leveldb/WINDOWS.md b/src/leveldb/WINDOWS.md deleted file mode 100644 index 5b76c2448..000000000 --- a/src/leveldb/WINDOWS.md +++ /dev/null @@ -1,39 +0,0 @@ -# Building LevelDB On Windows - -## Prereqs - -Install the [Windows Software Development Kit version 7.1](http://www.microsoft.com/downloads/dlx/en-us/listdetailsview.aspx?FamilyID=6b6c21d2-2006-4afa-9702-529fa782d63b). - -Download and extract the [Snappy source distribution](http://snappy.googlecode.com/files/snappy-1.0.5.tar.gz) - -1. Open the "Windows SDK 7.1 Command Prompt" : - Start Menu -> "Microsoft Windows SDK v7.1" > "Windows SDK 7.1 Command Prompt" -2. 
Change the directory to the leveldb project - -## Building the Static lib - -* 32 bit Version - - setenv /x86 - msbuild.exe /p:Configuration=Release /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5 - -* 64 bit Version - - setenv /x64 - msbuild.exe /p:Configuration=Release /p:Platform=x64 /p:Snappy=..\snappy-1.0.5 - - -## Building and Running the Benchmark app - -* 32 bit Version - - setenv /x86 - msbuild.exe /p:Configuration=Benchmark /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5 - Benchmark\leveldb.exe - -* 64 bit Version - - setenv /x64 - msbuild.exe /p:Configuration=Benchmark /p:Platform=x64 /p:Snappy=..\snappy-1.0.5 - x64\Benchmark\leveldb.exe - diff --git a/src/leveldb/build_detect_platform b/src/leveldb/build_detect_platform index 4a9471590..0f231fc1d 100755 --- a/src/leveldb/build_detect_platform +++ b/src/leveldb/build_detect_platform @@ -7,11 +7,8 @@ # CC C Compiler path # CXX C++ Compiler path # PLATFORM_LDFLAGS Linker flags -# PLATFORM_LIBS Libraries flags # PLATFORM_SHARED_EXT Extension for shared libraries # PLATFORM_SHARED_LDFLAGS Flags for building shared library -# This flag is embedded just before the name -# of the shared library without intervening spaces # PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library # PLATFORM_CCFLAGS C compiler flags # PLATFORM_CXXFLAGS C++ compiler flags. 
Will contain: @@ -20,15 +17,14 @@ # # The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following: # -# -DLEVELDB_ATOMIC_PRESENT if is present +# -DLEVELDB_CSTDATOMIC_PRESENT if is present # -DLEVELDB_PLATFORM_POSIX for Posix-based platforms # -DSNAPPY if the Snappy library is present # OUTPUT=$1 -PREFIX=$2 -if test -z "$OUTPUT" || test -z "$PREFIX"; then - echo "usage: $0 " >&2 +if test -z "$OUTPUT"; then + echo "usage: $0 " >&2 exit 1 fi @@ -44,10 +40,6 @@ if test -z "$CXX"; then CXX=g++ fi -if test -z "$TMPDIR"; then - TMPDIR=/tmp -fi - # Detect OS if test -z "$TARGET_OS"; then TARGET_OS=`uname -s` @@ -58,119 +50,77 @@ CROSS_COMPILE= PLATFORM_CCFLAGS= PLATFORM_CXXFLAGS= PLATFORM_LDFLAGS= -PLATFORM_LIBS= -PLATFORM_SHARED_EXT="so" +PLATFORM_SHARED_EXT= PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" PLATFORM_SHARED_VERSIONED=true -PLATFORM_SSEFLAGS= -MEMCMP_FLAG= -if [ "$CXX" = "g++" ]; then - # Use libc's memcmp instead of GCC's memcmp. This results in ~40% - # performance improvement on readrandom under gcc 4.4.3 on Linux/x86. - MEMCMP_FLAG="-fno-builtin-memcmp" +if test -n "$LEVELDB_VSN"; then + VERSION_FLAGS="$VERSION_FLAGS -DLEVELDB_VSN=\"$LEVELDB_VSN\"" fi +# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp case "$TARGET_OS" in - CYGWIN_*) - PLATFORM=OS_LINUX - COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN" - PLATFORM_LDFLAGS="-lpthread" - PORT_FILE=port/port_posix.cc - PORT_SSE_FILE=port/port_posix_sse.cc - ;; Darwin) PLATFORM=OS_MACOSX - COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX" - PLATFORM_SHARED_EXT=dylib - [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd` - PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/" + oIFS="$IFS"; IFS=. 
+ set `uname -r` + IFS="$oIFS" + if [ "$1" -ge 13 ]; then + # assume clang compiler + COMMON_FLAGS="-mmacosx-version-min=10.8 -DOS_MACOSX -stdlib=libc++" + PLATFORM_LDFLAGS="-mmacosx-version-min=10.8" + else + COMMON_FLAGS="-fno-builtin-memcmp -DOS_MACOSX" + fi + PLATFORM_SHARED_EXT= + PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " PORT_FILE=port/port_posix.cc - PORT_SSE_FILE=port/port_posix_sse.cc ;; Linux) PLATFORM=OS_LINUX - COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX" - PLATFORM_LDFLAGS="-pthread" + COMMON_FLAGS="-fno-builtin-memcmp -pthread -DOS_LINUX" + PLATFORM_LDFLAGS="-pthread -lrt" PORT_FILE=port/port_posix.cc - PORT_SSE_FILE=port/port_posix_sse.cc ;; SunOS) PLATFORM=OS_SOLARIS - COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS" - PLATFORM_LIBS="-lpthread -lrt" + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS -m64" + PLATFORM_LDFLAGS="-lpthread -lrt" + PLATFORM_SHARED_EXT= PORT_FILE=port/port_posix.cc - PORT_SSE_FILE=port/port_posix_sse.cc ;; FreeBSD) + CC=cc + CXX=c++ PLATFORM=OS_FREEBSD - COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD" - PLATFORM_LIBS="-lpthread" - PORT_FILE=port/port_posix.cc - PORT_SSE_FILE=port/port_posix_sse.cc - ;; - GNU/kFreeBSD) - PLATFORM=OS_KFREEBSD - COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_KFREEBSD" - PLATFORM_LIBS="-lpthread" + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD" + PLATFORM_LDFLAGS="-lpthread" PORT_FILE=port/port_posix.cc ;; NetBSD) PLATFORM=OS_NETBSD - COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD" - PLATFORM_LIBS="-lpthread -lgcc_s" + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD" + PLATFORM_LDFLAGS="-lpthread -lgcc_s" PORT_FILE=port/port_posix.cc - PORT_SSE_FILE=port/port_posix_sse.cc ;; OpenBSD) PLATFORM=OS_OPENBSD - COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD" + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD" PLATFORM_LDFLAGS="-pthread" PORT_FILE=port/port_posix.cc - PORT_SSE_FILE=port/port_posix_sse.cc ;; DragonFly) 
PLATFORM=OS_DRAGONFLYBSD - COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD" - PLATFORM_LIBS="-lpthread" + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD" + PLATFORM_LDFLAGS="-lpthread" PORT_FILE=port/port_posix.cc - PORT_SSE_FILE=port/port_posix_sse.cc ;; OS_ANDROID_CROSSCOMPILE) PLATFORM=OS_ANDROID - COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX" + COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX" PLATFORM_LDFLAGS="" # All pthread features are in the Android C library PORT_FILE=port/port_posix.cc - PORT_SSE_FILE=port/port_posix_sse.cc - CROSS_COMPILE=true - ;; - HP-UX) - PLATFORM=OS_HPUX - COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX" - PLATFORM_LDFLAGS="-pthread" - PORT_FILE=port/port_posix.cc - PORT_SSE_FILE=port/port_posix_sse.cc - # man ld: +h internal_name - PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl," - ;; - IOS) - PLATFORM=IOS - COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX" - [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd` - PORT_FILE=port/port_posix.cc - PORT_SSE_FILE=port/port_posix_sse.cc - PLATFORM_SHARED_EXT= - PLATFORM_SHARED_LDFLAGS= - PLATFORM_SHARED_CFLAGS= - PLATFORM_SHARED_VERSIONED= - ;; - OS_WINDOWS_CROSSCOMPILE | NATIVE_WINDOWS) - PLATFORM=OS_WINDOWS - COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_WINDOWS -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1" - PLATFORM_SOURCES="util/env_win.cc" - PLATFORM_LIBS="-lshlwapi" - PORT_FILE=port/port_win.cc CROSS_COMPILE=true ;; *) @@ -182,78 +132,106 @@ esac # except for the test and benchmark files. By default, find will output a list # of all files matching either rule, so we need to append -print to make the # prune take effect. 
-DIRS="$PREFIX/db $PREFIX/util $PREFIX/table" - +if [ -f leveldb_ee/README.md ]; then +DIRS="util db table leveldb_ee" +else +DIRS="util db table leveldb_os" +fi set -f # temporarily disable globbing so that our patterns aren't expanded PRUNE_TEST="-name *test*.cc -prune" PRUNE_BENCH="-name *_bench.cc -prune" -PRUNE_TOOL="-name leveldbutil.cc -prune" -PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_TOOL -o -name '*.cc' -print | sort | sed "s,^$PREFIX/,," | tr "\n" " "` - +PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "` +TESTS=`find $DIRS -name '*_test.c*' -print | sort | tr "\n" " "` set +f # re-enable globbing # The sources consist of the portable files, plus the platform-specific port # file. -echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT +echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT +echo "TEST_SOURCES=$TESTS" >>$OUTPUT if [ "$CROSS_COMPILE" = "true" ]; then # Cross-compiling; do not try any compilation tests. true else - CXXOUTPUT="${TMPDIR}/leveldb_build_detect_platform-cxx.$$" - - # If -std=c++0x works, use as fallback for when memory barriers - # are not available. - $CXX $CXXFLAGS -std=c++0x -x c++ - -o $CXXOUTPUT 2>/dev/null < + # If -std=c++0x works, use . Otherwise use port_posix.h. + $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null < int main() {} EOF if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_ATOMIC_PRESENT" + COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT" PLATFORM_CXXFLAGS="-std=c++0x" else COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX" fi + # Test whether Snappy library is installed + # http://code.google.com/p/snappy/ + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY" + if [ "$PLATFORM" = "OS_LINUX" ]; then + # Basho: switching to static snappy library to make tools more portable + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -Wl,-non_shared -lsnappy -Wl,-call_shared" + else + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy" + fi + fi + # Test whether tcmalloc is available - $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -ltcmalloc 2>/dev/null </dev/null </dev/null - - # Test if gcc SSE 4.2 is supported - $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null </dev/null fi -# Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them. -if [ -n "$PLATFORM_SSEFLAGS" ]; then - PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE" -fi - -PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" -PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" +PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS $VERSION_FLAGS" +PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS $VERSION_FLAGS" echo "CC=$CC" >> $OUTPUT echo "CXX=$CXX" >> $OUTPUT echo "PLATFORM=$PLATFORM" >> $OUTPUT echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT -echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT -echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT + +# +# Basho extension to place -D variable in include/leveldb/ldb_config.h +# + +LDB_CONFIG="include/leveldb/ldb_config.h" + +# Delete existing output, if it exists +rm -f $LDB_CONFIG + +write_config_h() +{ + for param in $@ + do + prefix=$(expr -- $param : "\(..\)") + if [ X$prefix = "X-D" ] + then + echo "" >>$LDB_CONFIG + echo "#ifndef $(expr -- $param : '..\(.*\)')" >>$LDB_CONFIG 
+ echo " #define $(expr -- $param : '..\(.*\)')" >>$LDB_CONFIG + echo "#endif" >>$LDB_CONFIG + fi + done +} + +echo "/** This file is generated by build_detect_platform." >$LDB_CONFIG +echo " * It saves the state of compile flags. This benefits the reuse" >>$LDB_CONFIG +echo " * of internal include files outside of a leveldb build." >>$LDB_CONFIG +echo " */" >>$LDB_CONFIG + +write_config_h $COMMON_FLAGS diff --git a/src/leveldb/db/autocompact_test.cc b/src/leveldb/db/autocompact_test.cc deleted file mode 100644 index d20a2362c..000000000 --- a/src/leveldb/db/autocompact_test.cc +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2013 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/db.h" -#include "db/db_impl.h" -#include "leveldb/cache.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -class AutoCompactTest { - public: - std::string dbname_; - Cache* tiny_cache_; - Options options_; - DB* db_; - - AutoCompactTest() { - dbname_ = test::TmpDir() + "/autocompact_test"; - tiny_cache_ = NewLRUCache(100); - options_.block_cache = tiny_cache_; - DestroyDB(dbname_, options_); - options_.create_if_missing = true; - options_.compression = kNoCompression; - ASSERT_OK(DB::Open(options_, dbname_, &db_)); - } - - ~AutoCompactTest() { - delete db_; - DestroyDB(dbname_, Options()); - delete tiny_cache_; - } - - std::string Key(int i) { - char buf[100]; - snprintf(buf, sizeof(buf), "key%06d", i); - return std::string(buf); - } - - uint64_t Size(const Slice& start, const Slice& limit) { - Range r(start, limit); - uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); - return size; - } - - void DoReads(int n); -}; - -static const int kValueSize = 200 * 1024; -static const int kTotalSize = 100 * 1024 * 1024; -static const int kCount = kTotalSize / kValueSize; - -// Read through the 
first n keys repeatedly and check that they get -// compacted (verified by checking the size of the key space). -void AutoCompactTest::DoReads(int n) { - std::string value(kValueSize, 'x'); - DBImpl* dbi = reinterpret_cast(db_); - - // Fill database - for (int i = 0; i < kCount; i++) { - ASSERT_OK(db_->Put(WriteOptions(), Key(i), value)); - } - ASSERT_OK(dbi->TEST_CompactMemTable()); - - // Delete everything - for (int i = 0; i < kCount; i++) { - ASSERT_OK(db_->Delete(WriteOptions(), Key(i))); - } - ASSERT_OK(dbi->TEST_CompactMemTable()); - - // Get initial measurement of the space we will be reading. - const int64_t initial_size = Size(Key(0), Key(n)); - const int64_t initial_other_size = Size(Key(n), Key(kCount)); - - // Read until size drops significantly. - std::string limit_key = Key(n); - for (int read = 0; true; read++) { - ASSERT_LT(read, 100) << "Taking too long to compact"; - Iterator* iter = db_->NewIterator(ReadOptions()); - for (iter->SeekToFirst(); - iter->Valid() && iter->key().ToString() < limit_key; - iter->Next()) { - // Drop data - } - delete iter; - // Wait a little bit to allow any triggered compactions to complete. - Env::Default()->SleepForMicroseconds(1000000); - uint64_t size = Size(Key(0), Key(n)); - fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n", - read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0); - if (size <= initial_size/10) { - break; - } - } - - // Verify that the size of the key space not touched by the reads - // is pretty much unchanged. 
- const int64_t final_other_size = Size(Key(n), Key(kCount)); - ASSERT_LE(final_other_size, initial_other_size + 1048576); - ASSERT_GE(final_other_size, initial_other_size/5 - 1048576); -} - -TEST(AutoCompactTest, ReadAll) { - DoReads(kCount); -} - -TEST(AutoCompactTest, ReadHalf) { - DoReads(kCount/2); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/db/builder.cc b/src/leveldb/db/builder.cc index f41988219..4ac60f488 100644 --- a/src/leveldb/db/builder.cc +++ b/src/leveldb/db/builder.cc @@ -2,12 +2,16 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#define __STDC_FORMAT_MACROS +#include + #include "db/builder.h" #include "db/filename.h" #include "db/dbformat.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "db/version_set.h" #include "leveldb/db.h" #include "leveldb/env.h" #include "leveldb/iterator.h" @@ -17,27 +21,51 @@ namespace leveldb { Status BuildTable(const std::string& dbname, Env* env, const Options& options, + const Comparator * user_comparator, TableCache* table_cache, Iterator* iter, - FileMetaData* meta) { + FileMetaData* meta, + SequenceNumber smallest_snapshot) { Status s; + size_t keys_seen, keys_retired; + + keys_seen=0; + keys_retired=0; + meta->file_size = 0; iter->SeekToFirst(); - std::string fname = TableFileName(dbname, meta->number); + KeyRetirement retire(user_comparator, smallest_snapshot, &options); + + std::string fname = TableFileName(options, meta->number, meta->level); if (iter->Valid()) { WritableFile* file; - s = env->NewWritableFile(fname, &file); + + s = env->NewWritableFile(fname, &file, + env->RecoveryMmapSize(&options)); if (!s.ok()) { return s; } + // tune fadvise to keep all of this lower level file in page cache + // (compaction of unsorted files causes severe cache misses) + file->SetMetadataOffset(1); + 
TableBuilder* builder = new TableBuilder(options, file); meta->smallest.DecodeFrom(iter->key()); for (; iter->Valid(); iter->Next()) { + ++keys_seen; Slice key = iter->key(); - meta->largest.DecodeFrom(key); - builder->Add(key, iter->value()); + if (!retire(key)) + { + meta->largest.DecodeFrom(key); + builder->Add(key, iter->value()); + ++meta->num_entries; + } // if + else + { + ++keys_retired; + } // else } // Finish and check for builder errors @@ -45,6 +73,9 @@ Status BuildTable(const std::string& dbname, s = builder->Finish(); if (s.ok()) { meta->file_size = builder->FileSize(); + meta->exp_write_low = builder->GetExpiryWriteLow(); + meta->exp_write_high = builder->GetExpiryWriteHigh(); + meta->exp_explicit_high = builder->GetExpiryExplicitHigh(); assert(meta->file_size > 0); } } else { @@ -64,10 +95,20 @@ Status BuildTable(const std::string& dbname, if (s.ok()) { // Verify that the table is usable + Table * table_ptr; Iterator* it = table_cache->NewIterator(ReadOptions(), meta->number, - meta->file_size); + meta->file_size, + meta->level, + &table_ptr); s = it->status(); + + // Riak specific: bloom filter is no longer read by default, + // force read on highly used overlapped table files + if (s.ok() && VersionSet::IsLevelOverlapped(meta->level)) + table_ptr->ReadFilter(); + + // table_ptr is owned by it and therefore invalidated by this delete delete it; } } @@ -79,6 +120,11 @@ Status BuildTable(const std::string& dbname, if (s.ok() && meta->file_size > 0) { // Keep it + if (0!=keys_retired) + { + Log(options.info_log, "Level-0 table #%" PRIu64 ": %zd keys seen, %zd keys retired, %zd keys expired", + meta->number, keys_seen, retire.GetDroppedCount(), retire.GetExpiredCount()); + } // if } else { env->DeleteFile(fname); } diff --git a/src/leveldb/db/builder.h b/src/leveldb/db/builder.h index 62431fcf4..712924f8b 100644 --- a/src/leveldb/db/builder.h +++ b/src/leveldb/db/builder.h @@ -6,6 +6,7 @@ #define STORAGE_LEVELDB_DB_BUILDER_H_ #include 
"leveldb/status.h" +#include "db/dbformat.h" namespace leveldb { @@ -25,9 +26,11 @@ class VersionEdit; extern Status BuildTable(const std::string& dbname, Env* env, const Options& options, + const Comparator * user_comparator, TableCache* table_cache, Iterator* iter, - FileMetaData* meta); + FileMetaData* meta, + SequenceNumber smallest_snapshot); } // namespace leveldb diff --git a/src/leveldb/db/c.cc b/src/leveldb/db/c.cc index 08ff0ad90..36066ffe0 100644 --- a/src/leveldb/db/c.cc +++ b/src/leveldb/db/c.cc @@ -6,6 +6,7 @@ #include #include +#include #include "leveldb/cache.h" #include "leveldb/comparator.h" #include "leveldb/db.h" @@ -40,6 +41,8 @@ using leveldb::Status; using leveldb::WritableFile; using leveldb::WriteBatch; using leveldb::WriteOptions; +using leveldb::KeyMetaData; +using leveldb::ValueType; extern "C" { @@ -49,6 +52,7 @@ struct leveldb_writebatch_t { WriteBatch rep; }; struct leveldb_snapshot_t { const Snapshot* rep; }; struct leveldb_readoptions_t { ReadOptions rep; }; struct leveldb_writeoptions_t { WriteOptions rep; }; +struct leveldb_keymetadata_t { KeyMetaData rep; }; struct leveldb_options_t { Options rep; }; struct leveldb_cache_t { Cache* rep; }; struct leveldb_seqfile_t { SequentialFile* rep; }; @@ -173,8 +177,19 @@ void leveldb_put( const char* key, size_t keylen, const char* val, size_t vallen, char** errptr) { + return(leveldb_put2(db, options, key, keylen, val, vallen, errptr, NULL)); +} + +void leveldb_put2( + leveldb_t* db, + const leveldb_writeoptions_t* options, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr, + const leveldb_keymetadata_t * metadata) { SaveError(errptr, - db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen))); + db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen), + (NULL==metadata ? 
NULL : &metadata->rep))); } void leveldb_delete( @@ -200,9 +215,21 @@ char* leveldb_get( const char* key, size_t keylen, size_t* vallen, char** errptr) { + + return(leveldb_get2(db, options, key, keylen, vallen, errptr, NULL)); +} + +char* leveldb_get2( + leveldb_t* db, + const leveldb_readoptions_t* options, + const char* key, size_t keylen, + size_t* vallen, + char** errptr, + leveldb_keymetadata_t * metadata) { char* result = NULL; std::string tmp; - Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp); + Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp, + (NULL==metadata ? NULL : &metadata->rep)); if (s.ok()) { *vallen = tmp.size(); result = CopyString(tmp); @@ -330,6 +357,15 @@ const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) { return s.data(); } +const void leveldb_iter_keymetadata(const leveldb_iterator_t* iter, + leveldb_keymetadata_t * meta) +{ + if (NULL!=iter && NULL!=meta) + { + meta->rep=iter->rep->keymetadata(); + } // if +} + void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) { SaveError(errptr, iter->rep->status()); } @@ -350,7 +386,16 @@ void leveldb_writebatch_put( leveldb_writebatch_t* b, const char* key, size_t klen, const char* val, size_t vlen) { - b->rep.Put(Slice(key, klen), Slice(val, vlen)); + leveldb_writebatch_put2(b, key, klen, val, vlen,NULL); +} + +void leveldb_writebatch_put2( + leveldb_writebatch_t* b, + const char* key, size_t klen, + const char* val, size_t vlen, + const leveldb_keymetadata_t * metadata) { + b->rep.Put(Slice(key, klen), Slice(val, vlen), + (NULL==metadata ? 
NULL : &metadata->rep)); } void leveldb_writebatch_delete( @@ -362,15 +407,20 @@ void leveldb_writebatch_delete( void leveldb_writebatch_iterate( leveldb_writebatch_t* b, void* state, - void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen, + const int & type, const uint64_t & expiry), void (*deleted)(void*, const char* k, size_t klen)) { class H : public WriteBatch::Handler { public: void* state_; - void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen); + void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen, + const int & type, const uint64_t & expiry); void (*deleted_)(void*, const char* k, size_t klen); - virtual void Put(const Slice& key, const Slice& value) { - (*put_)(state_, key.data(), key.size(), value.data(), value.size()); + virtual void Put(const Slice& key, const Slice& value, + const leveldb::ValueType & type, + const leveldb::ExpiryTimeMicros & expiry) + { + (*put_)(state_, key.data(), key.size(), value.data(), value.size(), (int)type, (uint64_t)expiry); } virtual void Delete(const Slice& key) { (*deleted_)(state_, key.data(), key.size()); @@ -418,6 +468,11 @@ void leveldb_options_set_paranoid_checks( opt->rep.paranoid_checks = v; } +void leveldb_options_set_verify_compactions( + leveldb_options_t* opt, unsigned char v) { + opt->rep.verify_compactions = v; +} + void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) { opt->rep.env = (env ? 
env->rep : NULL); } @@ -450,6 +505,10 @@ void leveldb_options_set_compression(leveldb_options_t* opt, int t) { opt->rep.compression = static_cast(t); } +void leveldb_options_set_total_leveldb_mem(leveldb_options_t* opt, size_t s) { + opt->rep.total_leveldb_mem = s; +} + leveldb_comparator_t* leveldb_comparator_create( void* state, void (*destructor)(void*), @@ -580,8 +639,18 @@ void leveldb_env_destroy(leveldb_env_t* env) { delete env; } +void leveldb_env_shutdown() { + Env::Shutdown(); +} + +/** + * CAUTION: this call is only for char * objects returned by + * functions like leveldb_get and leveldb_property_value. + * Also used to release errptr strings. + */ void leveldb_free(void* ptr) { - free(ptr); + if (NULL!=ptr) + free(ptr); } int leveldb_major_version() { diff --git a/src/leveldb/db/c_test.c b/src/leveldb/db/c_test.c index 7cd5ee020..637ba9311 100644 --- a/src/leveldb/db/c_test.c +++ b/src/leveldb/db/c_test.c @@ -3,6 +3,8 @@ found in the LICENSE file. See the AUTHORS file for names of contributors. */ #include "leveldb/c.h" +#include "leveldb/options.h" +#include "port/port.h" #include #include @@ -11,8 +13,13 @@ #include #include +using leveldb::ValueType; + +struct leveldb_keymetadata_t { leveldb::KeyMetaData rep; }; + const char* phase = ""; static char dbname[200]; +static leveldb::ExpiryTimeMicros gStartTime; static void StartPhase(const char* name) { fprintf(stderr, "=== Test %s\n", name); @@ -33,7 +40,7 @@ static const char* GetTempDir(void) { } #define CheckCondition(cond) \ - if (!(cond)) { \ + if (!(cond)) { \ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \ abort(); \ } @@ -49,7 +56,7 @@ static void CheckEqual(const char* expected, const char* v, size_t n) { fprintf(stderr, "%s: expected '%s', got '%s'\n", phase, (expected ? expected : "(null)"), - (v ? v : "(null")); + (v ? 
v : "(null)")); abort(); } } @@ -112,6 +119,117 @@ static void CheckDel(void* ptr, const char* k, size_t klen) { (*state)++; } +// (expiry enabled) +static void CheckGet2( + leveldb_t* db, + const leveldb_readoptions_t* options, + const char* key, + const char* expected, + ValueType type, + uint64_t expiry) { + char* err = NULL; + size_t val_len; + char* val; + leveldb_keymetadata_t meta; + + val = leveldb_get2(db, options, key, strlen(key), &val_len, &err, &meta); + CheckNoError(err); + CheckEqual(expected, val, val_len); + CheckCondition(type==meta.rep.m_Type); + if (0==expiry && leveldb::kTypeValueWriteTime==type) + { + leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros(); + CheckCondition(gStartTime<=meta.rep.m_Expiry && meta.rep.m_Expiry<=now); + } // if + else + {CheckCondition(expiry==meta.rep.m_Expiry);} + + Free(&val); +} + +// (expiry enabled) +static void CheckIter2(leveldb_iterator_t* iter, + const char* key, const char* val, + const leveldb::KeyMetaData & meta) { + size_t len; + const char* str; + leveldb_keymetadata_t it_meta; + + str = leveldb_iter_key(iter, &len); + CheckEqual(key, str, len); + str = leveldb_iter_value(iter, &len); + CheckEqual(val, str, len); + + leveldb_iter_keymetadata(iter, &it_meta); + CheckCondition(meta.m_Type==it_meta.rep.m_Type); + if (0==meta.m_Expiry && leveldb::kTypeValueWriteTime==meta.m_Type) + { + leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros(); + CheckCondition(gStartTime<=it_meta.rep.m_Expiry && it_meta.rep.m_Expiry<=now); + } // if + else + {CheckCondition(meta.m_Expiry==it_meta.rep.m_Expiry);} + +} + +// Callback from leveldb_writebatch_iterate() +// (expiry enabled) +struct CheckPut2Data +{ + const char * m_Key; + const char * m_Value; + ValueType m_Type; + uint64_t m_Expiry; +}; + +static struct CheckPut2Data gCheckPut2Data[]= +{ + {"foo","hello_put2",leveldb::kTypeValue,0}, + {"box","c_put2",leveldb::kTypeValue,0}, + {"disney","cartoon_put2",leveldb::kTypeValueWriteTime, 0}, + 
{"money","lotsof_put2",leveldb::kTypeValueWriteTime, 9988776655}, + {"time","ismoney_put2",leveldb::kTypeValueExplicitExpiry, 221199887766} +}; + +static struct CheckPut2Data gCheckPut2ItrData[]= +{ + {"bar","b",leveldb::kTypeValue,0}, + {"box","c",leveldb::kTypeValue,0}, + {"bar","",leveldb::kTypeDeletion,0}, + {"mom","texas",leveldb::kTypeValueWriteTime,0}, + {"dad","poland",leveldb::kTypeValueExplicitExpiry,22446688} + }; + +static void CheckPut2(void* ptr, + const char* k, size_t klen, + const char* v, size_t vlen, + const int & type_int, + const uint64_t & expiry) { + int* state = (int*) ptr; + CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0]))); + struct CheckPut2Data * test; + + test=&gCheckPut2ItrData[*state]; + CheckEqual(test->m_Key, k, klen); + CheckEqual(test->m_Value, v, vlen); + CheckCondition((int)test->m_Type==type_int); + if (leveldb::kTypeValueWriteTime!=test->m_Type) + {CheckCondition((uint64_t)test->m_Expiry==expiry);} + (*state)++; +} + +// Callback from leveldb_writebatch_iterate() +// (expiry enabled) +static void CheckDel2(void* ptr, const char* k, size_t klen) { + int* state = (int*) ptr; + CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0]))); + struct CheckPut2Data * test; + + test=&gCheckPut2ItrData[*state]; + CheckEqual(test->m_Key, k, klen); + (*state)++; +} + static void CmpDestroy(void* arg) { } static int CmpCompare(void* arg, const char* a, size_t alen, @@ -141,7 +259,7 @@ static char* FilterCreate( int num_keys, size_t* filter_length) { *filter_length = 4; - char* result = malloc(4); + char* result = (char*)malloc(4); memcpy(result, "fake", 4); return result; } @@ -167,6 +285,7 @@ int main(int argc, char** argv) { CheckCondition(leveldb_major_version() >= 1); CheckCondition(leveldb_minor_version() >= 1); + gStartTime=leveldb::port::TimeMicros(); snprintf(dbname, sizeof(dbname), "%s/leveldb_c_test-%d", @@ -207,12 +326,6 @@ int main(int argc, char** argv) { CheckCondition(err 
!= NULL); Free(&err); - StartPhase("leveldb_free"); - db = leveldb_open(options, dbname, &err); - CheckCondition(err != NULL); - leveldb_free(err); - err = NULL; - StartPhase("open"); leveldb_options_set_create_if_missing(options, 1); db = leveldb_open(options, dbname, &err); @@ -234,42 +347,74 @@ int main(int argc, char** argv) { StartPhase("writebatch"); { + leveldb_keymetadata_t meta; leveldb_writebatch_t* wb = leveldb_writebatch_create(); leveldb_writebatch_put(wb, "foo", 3, "a", 1); leveldb_writebatch_clear(wb); leveldb_writebatch_put(wb, "bar", 3, "b", 1); leveldb_writebatch_put(wb, "box", 3, "c", 1); leveldb_writebatch_delete(wb, "bar", 3); + meta.rep.m_Type=leveldb::kTypeValueWriteTime; + meta.rep.m_Expiry=0; + leveldb_writebatch_put2(wb, "mom", 3, "texas", 5, &meta); + meta.rep.m_Type=leveldb::kTypeValueExplicitExpiry; + meta.rep.m_Expiry=22446688; + leveldb_writebatch_put2(wb, "dad", 3, "poland", 6, &meta); leveldb_write(db, woptions, wb, &err); CheckNoError(err); CheckGet(db, roptions, "foo", "hello"); CheckGet(db, roptions, "bar", NULL); CheckGet(db, roptions, "box", "c"); + CheckGet2(db, roptions, "dad", "poland", leveldb::kTypeValueExplicitExpiry, 22446688); + CheckGet2(db, roptions, "mom", "texas", leveldb::kTypeValueWriteTime, 0); int pos = 0; - leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); - CheckCondition(pos == 3); + leveldb_writebatch_iterate(wb, &pos, CheckPut2, CheckDel2); + CheckCondition(pos == 5); leveldb_writebatch_destroy(wb); } + // reminder: keymetadata not supported on backward iteration StartPhase("iter"); { + leveldb::KeyMetaData meta; leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions); CheckCondition(!leveldb_iter_valid(iter)); leveldb_iter_seek_to_first(iter); CheckCondition(leveldb_iter_valid(iter)); CheckIter(iter, "box", "c"); + meta.m_Type=leveldb::kTypeValue; + meta.m_Expiry=0; + CheckIter2(iter, "box", "c", meta); + + meta.m_Type=leveldb::kTypeValueExplicitExpiry; + meta.m_Expiry=22446688; + 
leveldb_iter_next(iter); + CheckIter2(iter, "dad", "poland", meta); leveldb_iter_next(iter); CheckIter(iter, "foo", "hello"); leveldb_iter_prev(iter); + CheckIter(iter, "dad", "poland"); + leveldb_iter_prev(iter); CheckIter(iter, "box", "c"); leveldb_iter_prev(iter); CheckCondition(!leveldb_iter_valid(iter)); leveldb_iter_seek_to_last(iter); - CheckIter(iter, "foo", "hello"); + CheckIter(iter, "mom", "texas"); leveldb_iter_seek(iter, "b", 1); CheckIter(iter, "box", "c"); leveldb_iter_get_error(iter, &err); CheckNoError(err); + + meta.m_Type=leveldb::kTypeValue; + meta.m_Expiry=0; + CheckIter2(iter, "box", "c", meta); + leveldb_iter_seek(iter, "m", 1); + meta.m_Type=leveldb::kTypeValueWriteTime; + meta.m_Expiry=0; + CheckIter2(iter, "mom", "texas", meta); + leveldb_iter_get_error(iter, &err); + CheckNoError(err); + leveldb_iter_destroy(iter); } @@ -335,6 +480,70 @@ int main(int argc, char** argv) { leveldb_options_set_error_if_exists(options, 1); } + StartPhase("put expiry"); + { + leveldb_keymetadata_t meta; + int loop, count; + + count = sizeof(gCheckPut2Data) / sizeof(gCheckPut2Data[0]); + + for (loop=0; loopm_Key); + vlen=strlen(test->m_Value); + meta.rep.m_Type=test->m_Type; + meta.rep.m_Expiry=test->m_Expiry; + + leveldb_put2(db, woptions, test->m_Key, klen, + test->m_Value, vlen, &err, + &meta); + CheckNoError(err); + } // for + + // testing memtable right now + for (loop=0; loopm_Key); + vlen=strlen(test->m_Value); + + CheckGet2(db, roptions, test->m_Key, test->m_Value, + test->m_Type, test->m_Expiry); + } // for + + // close and open to force memory table into .sst upon open + leveldb_close(db); + leveldb_options_set_error_if_exists(options, 0); + db = leveldb_open(options, dbname, &err); + CheckNoError(err); + + // now testing get from a level-0 .sst file + for (loop=0; loopm_Key); + vlen=strlen(test->m_Value); + + CheckGet2(db, roptions, test->m_Key, test->m_Value, + test->m_Type, test->m_Expiry); + } // for + } + + // + // This screws up "options" for 
real database work. execute last. StartPhase("filter"); for (run = 0; run < 2; run++) { // First run uses custom filter, second run uses bloom filter @@ -376,6 +585,8 @@ int main(int argc, char** argv) { leveldb_filterpolicy_destroy(policy); } + + StartPhase("cleanup"); leveldb_close(db); leveldb_options_destroy(options); @@ -386,5 +597,7 @@ int main(int argc, char** argv) { leveldb_env_destroy(env); fprintf(stderr, "PASS\n"); + + leveldb_env_shutdown(); return 0; } diff --git a/src/leveldb/db/corruption_test.cc b/src/leveldb/db/corruption_test.cc index 37a484d25..3b40b1c96 100644 --- a/src/leveldb/db/corruption_test.cc +++ b/src/leveldb/db/corruption_test.cc @@ -35,8 +35,8 @@ class CorruptionTest { CorruptionTest() { tiny_cache_ = NewLRUCache(100); options_.env = &env_; - options_.block_cache = tiny_cache_; - dbname_ = test::TmpDir() + "/corruption_test"; + dbname_ = test::TmpDir() + "/db_test"; + dbname_ = MakeTieredDbname(dbname_, options_); DestroyDB(dbname_, options_); db_ = NULL; @@ -51,14 +51,17 @@ class CorruptionTest { delete tiny_cache_; } - Status TryReopen() { + Status TryReopen(Options* options = NULL) { delete db_; db_ = NULL; - return DB::Open(options_, dbname_, &db_); + Options opt = (options ? *options : options_); + opt.env = &env_; + opt.block_cache = tiny_cache_; + return DB::Open(opt, dbname_, &db_); } - void Reopen() { - ASSERT_OK(TryReopen()); + void Reopen(Options* options = NULL) { + ASSERT_OK(TryReopen(options)); } void RepairDB() { @@ -75,13 +78,7 @@ class CorruptionTest { Slice key = Key(i, &key_space); batch.Clear(); batch.Put(key, Value(i, &value_space)); - WriteOptions options; - // Corrupt() doesn't work without this sync on windows; stat reports 0 for - // the file size. 
- if (i == n - 1) { - options.sync = true; - } - ASSERT_OK(db_->Write(options, &batch)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); } } @@ -96,10 +93,6 @@ class CorruptionTest { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { uint64_t key; Slice in(iter->key()); - if (in == "" || in == "~") { - // Ignore boundary keys. - continue; - } if (!ConsumeDecimalNumber(&in, &key) || !in.empty() || key < next_expected) { @@ -123,19 +116,26 @@ class CorruptionTest { ASSERT_GE(max_expected, correct); } - void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt, int level=0) { // Pick file to corrupt std::vector filenames; - ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + std::string dirname; + if (leveldb::kTableFile!=filetype) + dirname=dbname_; + else + dirname=MakeDirName2(options_, level, "sst"); + + ASSERT_OK(env_.GetChildren(dirname, &filenames)); + uint64_t number; FileType type; std::string fname; int picked_number = -1; - for (size_t i = 0; i < filenames.size(); i++) { + for (int i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type) && type == filetype && int(number) > picked_number) { // Pick latest file - fname = dbname_ + "/" + filenames[i]; + fname = dirname + "/" + filenames[i]; picked_number = number; } } @@ -222,12 +222,14 @@ TEST(CorruptionTest, NewFileErrorDuringWrite) { const int num = 3 + (Options().write_buffer_size / kValueSize); std::string value_storage; Status s; - for (int i = 0; s.ok() && i < num; i++) { + for (int i = 0; + s.ok() && i < num && 0==env_.num_writable_file_errors_; + i++) { WriteBatch batch; batch.Put("a", Value(100, &value_storage)); s = db_->Write(WriteOptions(), &batch); } - ASSERT_TRUE(!s.ok()); +// ASSERT_TRUE(!s.ok()); Background write thread will never report this ASSERT_GE(env_.num_writable_file_errors_, 1); env_.writable_file_error_ = false; Reopen(); @@ -240,34 +242,18 @@ TEST(CorruptionTest, 
TableFile) { dbi->TEST_CompactRange(0, NULL, NULL); dbi->TEST_CompactRange(1, NULL, NULL); - Corrupt(kTableFile, 100, 1); - Check(90, 99); -} - -TEST(CorruptionTest, TableFileRepair) { - options_.block_size = 2 * kValueSize; // Limit scope of corruption - options_.paranoid_checks = true; - Reopen(); - Build(100); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, NULL, NULL); - dbi->TEST_CompactRange(1, NULL, NULL); - - Corrupt(kTableFile, 100, 1); - RepairDB(); - Reopen(); + Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel); Check(95, 99); } TEST(CorruptionTest, TableFileIndexData) { - Build(10000); // Enough to build multiple Tables + Build(100000); // Enough to build multiple Tables DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); - Corrupt(kTableFile, -2000, 500); + Corrupt(kTableFile, -2000, 500, config::kMaxMemCompactLevel); Reopen(); - Check(5000, 9999); + Check(50000, 99999); } TEST(CorruptionTest, MissingDescriptor) { @@ -319,10 +305,10 @@ TEST(CorruptionTest, CompactionInputError) { Build(10); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); - const int last = config::kMaxMemCompactLevel; + const int last = config::kMaxMemCompactLevel; // Riak does not "move" files ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last))); - Corrupt(kTableFile, 100, 1); + Corrupt(kTableFile, 100, 1, last); Check(5, 9); // Force compactions by writing lots of values @@ -331,31 +317,50 @@ TEST(CorruptionTest, CompactionInputError) { } TEST(CorruptionTest, CompactionInputErrorParanoid) { - options_.paranoid_checks = true; - options_.write_buffer_size = 512 << 10; - Reopen(); + Options options; + options.paranoid_checks = true; + options.write_buffer_size = 1048576; + Reopen(&options); + + int current_corruption=Property("leveldb.ReadBlockError"); DBImpl* dbi = reinterpret_cast(db_); - // Make multiple inputs so we need to compact. 
- for (int i = 0; i < 2; i++) { - Build(10); + // Fill levels >= 1 so memtable compaction outputs to level 1 + // matthewv 1/10/14 - what does "levels" have to do with this, + // switching to compaction trigger. + // 7/10/14 - compaction starts between 4 and 6 files ... assume 4 and 1 move + // (will make a new, descriptive constant for 4) + for (int level = Property("leveldb.num-files-at-level0")+1; + level < config::kL0_GroomingTrigger; level++) { + dbi->Put(WriteOptions(), "", "begin"); + dbi->Put(WriteOptions(), "~", "end"); dbi->TEST_CompactMemTable(); - Corrupt(kTableFile, 100, 1); - env_.SleepForMicroseconds(100000); } - dbi->CompactRange(NULL, NULL); - // Write must fail because of corrupted table + Build(10); + dbi->TEST_CompactMemTable(); + ASSERT_TRUE(1 < Property("leveldb.num-files-at-level0")); + + Corrupt(kTableFile, 100, 1, 0); + Check(5, 9); + + // Write must eventually fail because of corrupted table + Status s; std::string tmp1, tmp2; - Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2)); - ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; + for (int i = 0; i < 10000 && s.ok(); i++) { + s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); + } + if (s.ok()) + ASSERT_NE(current_corruption, Property("leveldb.ReadBlockError")) << "no ReadBlockError seen"; + else + ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; } TEST(CorruptionTest, UnrelatedKeys) { Build(10); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); - Corrupt(kTableFile, 100, 1); + Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel); std::string tmp1, tmp2; ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); diff --git a/src/leveldb/db/db_bench.cc b/src/leveldb/db/db_bench.cc index 3ad19a512..644cf479c 100644 --- a/src/leveldb/db/db_bench.cc +++ b/src/leveldb/db/db_bench.cc @@ -33,7 +33,6 @@ // readmissing -- read N missing keys in random order // readhot -- read N times in 
random order from 1% section of DB // seekrandom -- N random seeks -// open -- cost of opening a DB // crc32c -- repeated crc32c of 4K of data // acquireload -- load N*1000 times // Meta operations: @@ -84,14 +83,6 @@ static bool FLAGS_histogram = false; // (initialized to default value by "main") static int FLAGS_write_buffer_size = 0; -// Number of bytes written to each file. -// (initialized to default value by "main") -static int FLAGS_max_file_size = 0; - -// Approximate size of user data packed per block (before compression. -// (initialized to default value by "main") -static int FLAGS_block_size = 0; - // Number of bytes to use as a cache of uncompressed data. // Negative means use default settings. static int FLAGS_cache_size = -1; @@ -103,21 +94,26 @@ static int FLAGS_open_files = 0; // Negative means use default settings. static int FLAGS_bloom_bits = -1; +// Riak bloom adaptation +static int FLAGS_bloom2_bits = -1; + +// Riak param for total memory allocation (flex_cache) +static uint64_t FLAGS_leveldb_memory = -1; + +// Riak param for compression setting +static int FLAGS_compression = 2; + // If true, do not destroy the existing database. If you set this // flag and also specify a benchmark that wants a fresh database, that // benchmark will fail. static bool FLAGS_use_existing_db = false; -// If true, reuse existing log/MANIFEST files when re-opening a database. -static bool FLAGS_reuse_logs = false; - // Use the db with the following name. static const char* FLAGS_db = NULL; namespace leveldb { namespace { -leveldb::Env* g_env = NULL; // Helper for quickly generating random data. 
class RandomGenerator { @@ -141,7 +137,7 @@ class RandomGenerator { pos_ = 0; } - Slice Generate(size_t len) { + Slice Generate(int len) { if (pos_ + len > data_.size()) { pos_ = 0; assert(len < data_.size()); @@ -151,19 +147,17 @@ class RandomGenerator { } }; -#if defined(__linux) static Slice TrimSpace(Slice s) { - size_t start = 0; + int start = 0; while (start < s.size() && isspace(s[start])) { start++; } - size_t limit = s.size(); + int limit = s.size(); while (limit > start && isspace(s[limit-1])) { limit--; } return Slice(s.data() + start, limit - start); } -#endif static void AppendWithSpace(std::string* str, Slice msg) { if (msg.empty()) return; @@ -195,7 +189,7 @@ class Stats { done_ = 0; bytes_ = 0; seconds_ = 0; - start_ = g_env->NowMicros(); + start_ = Env::Default()->NowMicros(); finish_ = start_; message_.clear(); } @@ -213,7 +207,7 @@ class Stats { } void Stop() { - finish_ = g_env->NowMicros(); + finish_ = Env::Default()->NowMicros(); seconds_ = (finish_ - start_) * 1e-6; } @@ -223,7 +217,7 @@ class Stats { void FinishedSingleOp() { if (FLAGS_histogram) { - double now = g_env->NowMicros(); + double now = Env::Default()->NowMicros(); double micros = now - last_op_finish_; hist_.Add(micros); if (micros > 20000) { @@ -405,7 +399,7 @@ class Benchmark { : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), filter_policy_(FLAGS_bloom_bits >= 0 ? NewBloomFilterPolicy(FLAGS_bloom_bits) - : NULL), + : (FLAGS_bloom2_bits >=0 ? NewBloomFilterPolicy2(FLAGS_bloom2_bits) : NULL)), db_(NULL), num_(FLAGS_num), value_size_(FLAGS_value_size), @@ -413,10 +407,10 @@ class Benchmark { reads_(FLAGS_reads < 0 ? 
FLAGS_num : FLAGS_reads), heap_counter_(0) { std::vector files; - g_env->GetChildren(FLAGS_db, &files); - for (size_t i = 0; i < files.size(); i++) { + Env::Default()->GetChildren(FLAGS_db, &files); + for (int i = 0; i < files.size(); i++) { if (Slice(files[i]).starts_with("heap-")) { - g_env->DeleteFile(std::string(FLAGS_db) + "/" + files[i]); + Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]); } } if (!FLAGS_use_existing_db) { @@ -446,7 +440,7 @@ class Benchmark { benchmarks = sep + 1; } - // Reset parameters that may be overridden below + // Reset parameters that may be overriddden bwlow num_ = FLAGS_num; reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads); value_size_ = FLAGS_value_size; @@ -457,11 +451,7 @@ class Benchmark { bool fresh_db = false; int num_threads = FLAGS_threads; - if (name == Slice("open")) { - method = &Benchmark::OpenBench; - num_ /= 10000; - if (num_ < 1) num_ = 1; - } else if (name == Slice("fillseq")) { + if (name == Slice("fillseq")) { fresh_db = true; method = &Benchmark::WriteSeq; } else if (name == Slice("fillbatch")) { @@ -553,6 +543,7 @@ class Benchmark { SharedState* shared; ThreadState* thread; void (Benchmark::*method)(ThreadState*); + pthread_t thread_id; }; static void ThreadBody(void* v) { @@ -598,7 +589,8 @@ class Benchmark { arg[i].shared = &shared; arg[i].thread = new ThreadState(i); arg[i].thread->shared = &shared; - g_env->StartThread(ThreadBody, &arg[i]); + arg[i].thread_id=Env::Default()->StartThread(ThreadBody, &arg[i]); + pthread_detach(arg[i].thread_id); } shared.mu.Lock(); @@ -709,15 +701,12 @@ class Benchmark { void Open() { assert(db_ == NULL); Options options; - options.env = g_env; options.create_if_missing = !FLAGS_use_existing_db; options.block_cache = cache_; options.write_buffer_size = FLAGS_write_buffer_size; - options.max_file_size = FLAGS_max_file_size; - options.block_size = FLAGS_block_size; - options.max_open_files = FLAGS_open_files; options.filter_policy = filter_policy_; - 
options.reuse_logs = FLAGS_reuse_logs; + options.compression = (leveldb::CompressionType)FLAGS_compression; + options.total_leveldb_mem = FLAGS_leveldb_memory; Status s = DB::Open(options, FLAGS_db, &db_); if (!s.ok()) { fprintf(stderr, "open error: %s\n", s.ToString().c_str()); @@ -725,14 +714,6 @@ class Benchmark { } } - void OpenBench(ThreadState* thread) { - for (int i = 0; i < num_; i++) { - delete db_; - Open(); - thread->stats.FinishedSingleOp(); - } - } - void WriteSeq(ThreadState* thread) { DoWrite(thread, true); } @@ -842,6 +823,7 @@ class Benchmark { void SeekRandom(ThreadState* thread) { ReadOptions options; + std::string value; int found = 0; for (int i = 0; i < reads_; i++) { Iterator* iter = db_->NewIterator(options); @@ -937,7 +919,7 @@ class Benchmark { char fname[100]; snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_); WritableFile* file; - Status s = g_env->NewWritableFile(fname, &file); + Status s = Env::Default()->NewWritableFile(fname, &file, 2<<20); if (!s.ok()) { fprintf(stderr, "%s\n", s.ToString().c_str()); return; @@ -946,7 +928,7 @@ class Benchmark { delete file; if (!ok) { fprintf(stderr, "heap profiling not supported\n"); - g_env->DeleteFile(fname); + Env::Default()->DeleteFile(fname); } } }; @@ -955,14 +937,14 @@ class Benchmark { int main(int argc, char** argv) { FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; - FLAGS_max_file_size = leveldb::Options().max_file_size; - FLAGS_block_size = leveldb::Options().block_size; FLAGS_open_files = leveldb::Options().max_open_files; + FLAGS_leveldb_memory = 25000000000LL; std::string default_db_path; for (int i = 1; i < argc; i++) { double d; int n; + uint64_t u; char junk; if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); @@ -974,9 +956,6 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_use_existing_db = 
n; - } else if (sscanf(argv[i], "--reuse_logs=%d%c", &n, &junk) == 1 && - (n == 0 || n == 1)) { - FLAGS_reuse_logs = n; } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { FLAGS_num = n; } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) { @@ -987,16 +966,18 @@ int main(int argc, char** argv) { FLAGS_value_size = n; } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { FLAGS_write_buffer_size = n; - } else if (sscanf(argv[i], "--max_file_size=%d%c", &n, &junk) == 1) { - FLAGS_max_file_size = n; - } else if (sscanf(argv[i], "--block_size=%d%c", &n, &junk) == 1) { - FLAGS_block_size = n; } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { FLAGS_cache_size = n; } else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) { FLAGS_bloom_bits = n; + } else if (sscanf(argv[i], "--bloom_bits2=%d%c", &n, &junk) == 1) { + FLAGS_bloom2_bits = n; + } else if (sscanf(argv[i], "--leveldb_memory=%d%c", &n, &junk) == 1) { + FLAGS_leveldb_memory = n * 1024 * 1024LL; } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) { FLAGS_open_files = n; + } else if (sscanf(argv[i], "--compression=%d%c", &n, &junk) == 1) { + FLAGS_compression = n; } else if (strncmp(argv[i], "--db=", 5) == 0) { FLAGS_db = argv[i] + 5; } else { @@ -1005,16 +986,20 @@ int main(int argc, char** argv) { } } - leveldb::g_env = leveldb::Env::Default(); - // Choose a location for the test database if none given with --db= if (FLAGS_db == NULL) { - leveldb::g_env->GetTestDirectory(&default_db_path); + leveldb::Env::Default()->GetTestDirectory(&default_db_path); default_db_path += "/dbbench"; FLAGS_db = default_db_path.c_str(); } - leveldb::Benchmark benchmark; - benchmark.Run(); + // benchmark class needs to destruct before Shutdown call + { + leveldb::Benchmark benchmark; + benchmark.Run(); + } + + leveldb::Env::Shutdown(); + return 0; } diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc index 3bb58e560..9c0f0b555 100644 --- 
a/src/leveldb/db/db_impl.cc +++ b/src/leveldb/db/db_impl.cc @@ -4,11 +4,16 @@ #include "db/db_impl.h" +#include #include +#include +#include +#include #include #include #include #include +#include #include #include "db/builder.h" #include "db/db_iter.h" @@ -29,14 +34,21 @@ #include "table/block.h" #include "table/merger.h" #include "table/two_level_iterator.h" +#include "util/db_list.h" #include "util/coding.h" +#include "util/flexcache.h" +#include "util/hot_threads.h" #include "util/logging.h" #include "util/mutexlock.h" +#include "util/thread_tasks.h" +#include "util/throttle.h" +#include "leveldb/perf_count.h" + +#define __STDC_FORMAT_MACROS +#include namespace leveldb { -const int kNumNonTableCacheFiles = 10; - // Information kept for every waiting writer struct DBImpl::Writer { Status status; @@ -62,6 +74,9 @@ struct DBImpl::CompactionState { uint64_t number; uint64_t file_size; InternalKey smallest, largest; + uint64_t exp_write_low, exp_write_high, exp_explicit_high; + + Output() : number(0), file_size(0), exp_write_low(ULLONG_MAX), exp_write_high(0), exp_explicit_high(0) {} }; std::vector outputs; @@ -70,6 +85,7 @@ struct DBImpl::CompactionState { TableBuilder* builder; uint64_t total_bytes; + uint64_t num_entries; Output* current_output() { return &outputs[outputs.size()-1]; } @@ -77,86 +93,150 @@ struct DBImpl::CompactionState { : compaction(c), outfile(NULL), builder(NULL), - total_bytes(0) { + total_bytes(0), + num_entries(0) { } }; +Value::~Value() {} + +class StringValue : public Value { + public: + explicit StringValue(std::string& val) : value_(val) {} + ~StringValue() {} + + StringValue& assign(const char* data, size_t size) { + value_.assign(data, size); + return *this; + } + + private: + std::string& value_; +}; + // Fix user-supplied options to be reasonable template static void ClipToRange(T* ptr, V minvalue, V maxvalue) { if (static_cast(*ptr) > maxvalue) *ptr = maxvalue; if (static_cast(*ptr) < minvalue) *ptr = minvalue; } + Options 
SanitizeOptions(const std::string& dbname, const InternalKeyComparator* icmp, const InternalFilterPolicy* ipolicy, - const Options& src) { + const Options& src, + Cache * block_cache) { + std::string tiered_dbname; Options result = src; result.comparator = icmp; result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL; - ClipToRange(&result.max_open_files, 64 + kNumNonTableCacheFiles, 50000); - ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); - ClipToRange(&result.max_file_size, 1<<20, 1<<30); - ClipToRange(&result.block_size, 1<<10, 4<<20); + ClipToRange(&result.max_open_files, 20, 50000); + ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); + ClipToRange(&result.block_size, 1<<10, 4<<20); + + // alternate means to change gMapSize ... more generic + if (0!=src.mmap_size) + gMapSize=src.mmap_size; + + // reduce buffer sizes if limited_developer_mem is true + if (src.limited_developer_mem) + { + if (0==src.mmap_size) + gMapSize=2*1024*1024L; + if (gMapSize < result.write_buffer_size) // let unit tests be smaller + result.write_buffer_size=gMapSize; + } // if + + // Validate tiered storage options + tiered_dbname=MakeTieredDbname(dbname, result); + if (result.info_log == NULL) { // Open a log file in the same directory as the db - src.env->CreateDir(dbname); // In case it does not exist - src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname)); - Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log); + src.env->CreateDir(tiered_dbname); // In case it does not exist + src.env->RenameFile(InfoLogFileName(tiered_dbname), OldInfoLogFileName(tiered_dbname)); + Status s = src.env->NewLogger(InfoLogFileName(tiered_dbname), &result.info_log); if (!s.ok()) { // No place suitable for logging result.info_log = NULL; } } + if (result.block_cache == NULL) { - result.block_cache = NewLRUCache(8 << 20); + result.block_cache = block_cache; } + + // remove anything expiry if this is an internal database + if 
(result.is_internal_db) + result.expiry_module.reset(); + else if (NULL!=result.expiry_module.get()) + result.expiry_module.get()->NoteUserExpirySettings(); + return result; } -DBImpl::DBImpl(const Options& raw_options, const std::string& dbname) - : env_(raw_options.env), - internal_comparator_(raw_options.comparator), - internal_filter_policy_(raw_options.filter_policy), - options_(SanitizeOptions(dbname, &internal_comparator_, - &internal_filter_policy_, raw_options)), - owns_info_log_(options_.info_log != raw_options.info_log), - owns_cache_(options_.block_cache != raw_options.block_cache), - dbname_(dbname), +DBImpl::DBImpl(const Options& options, const std::string& dbname) + : double_cache(options), + env_(options.env), + internal_comparator_(options.comparator), + internal_filter_policy_(options.filter_policy), + options_(SanitizeOptions( + dbname, &internal_comparator_, &internal_filter_policy_, + options, block_cache())), + owns_info_log_(options_.info_log != options.info_log), + owns_cache_(options_.block_cache != options.block_cache), + dbname_(options_.tiered_fast_prefix), db_lock_(NULL), shutting_down_(NULL), bg_cv_(&mutex_), - mem_(NULL), + mem_(new MemTable(internal_comparator_)), imm_(NULL), logfile_(NULL), logfile_number_(0), log_(NULL), - seed_(0), tmp_batch_(new WriteBatch), - bg_compaction_scheduled_(false), - manual_compaction_(NULL) { + manual_compaction_(NULL), + throttle_end(0), + running_compactions_(0), + block_size_changed_(0), last_low_mem_(0), + hotbackup_pending_(false) +{ + current_block_size_=options_.block_size; + + mem_->Ref(); has_imm_.Release_Store(NULL); - // Reserve ten files or so for other uses and give the rest to TableCache. 
- const int table_cache_size = options_.max_open_files - kNumNonTableCacheFiles; - table_cache_ = new TableCache(dbname_, &options_, table_cache_size); + table_cache_ = new TableCache(dbname_, &options_, file_cache(), double_cache); versions_ = new VersionSet(dbname_, &options_, table_cache_, &internal_comparator_); + + // switch global for everyone ... tacky implementation for now + gFadviseWillNeed=options_.fadvise_willneed; + + // CAUTION: all object initialization must be completed + // before the AddDB and SetTotalMemory calls. + DBList()->AddDB(this, options_.is_internal_db); + gFlexCache.SetTotalMemory(options_.total_leveldb_mem); + + options_.Dump(options_.info_log); + Log(options_.info_log," File cache size: %zd", double_cache.GetCapacity(true)); + Log(options_.info_log," Block cache size: %zd", double_cache.GetCapacity(false)); } DBImpl::~DBImpl() { + DBList()->ReleaseDB(this, options_.is_internal_db); + // Wait for background work to finish mutex_.Lock(); shutting_down_.Release_Store(this); // Any non-NULL value is ok - while (bg_compaction_scheduled_) { + while (IsCompactionScheduled()) { bg_cv_.Wait(); } mutex_.Unlock(); - if (db_lock_ != NULL) { - env_->UnlockFile(db_lock_); - } + // make sure flex cache knows this db is gone + // (must follow ReleaseDB() call ... 
see above) + gFlexCache.RecalculateAllocations(); delete versions_; if (mem_ != NULL) mem_->Unref(); @@ -164,13 +244,17 @@ DBImpl::~DBImpl() { delete tmp_batch_; delete log_; delete logfile_; + + if (options_.cache_object_warming) + table_cache_->SaveOpenFileList(); + delete table_cache_; if (owns_info_log_) { delete options_.info_log; } - if (owns_cache_) { - delete options_.block_cache; + if (db_lock_ != NULL) { + env_->UnlockFile(db_lock_); } } @@ -183,14 +267,14 @@ Status DBImpl::NewDB() { const std::string manifest = DescriptorFileName(dbname_, 1); WritableFile* file; - Status s = env_->NewWritableFile(manifest, &file); + Status s = env_->NewWritableFile(manifest, &file, 4*1024L); if (!s.ok()) { return s; } { log::Writer log(file); std::string record; - new_db.EncodeTo(&record); + new_db.EncodeTo(&record, options_.ExpiryActivated()); s = log.AddRecord(record); if (s.ok()) { s = file->Close(); @@ -203,6 +287,7 @@ Status DBImpl::NewDB() { } else { env_->DeleteFile(manifest); } + return s; } @@ -215,69 +300,120 @@ void DBImpl::MaybeIgnoreError(Status* s) const { } } -void DBImpl::DeleteObsoleteFiles() { - if (!bg_error_.ok()) { - // After a background error, we don't know whether a new version may - // or may not have been committed, so we cannot safely garbage collect. 
- return; - } +void DBImpl::DeleteObsoleteFiles() +{ + // Only run this routine when down to one + // simultaneous compaction + if (RunningCompactionCount()<2) + { + // each caller has mutex, we need to release it + // since this disk activity can take a while + mutex_.AssertHeld(); - // Make a set of all of the live files - std::set live = pending_outputs_; - versions_->AddLiveFiles(&live); + // Make a set of all of the live files + std::set live = pending_outputs_; + versions_->AddLiveFiles(&live); - std::vector filenames; - env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose - uint64_t number; - FileType type; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type)) { - bool keep = true; - switch (type) { - case kLogFile: - keep = ((number >= versions_->LogNumber()) || - (number == versions_->PrevLogNumber())); - break; - case kDescriptorFile: - // Keep my manifest file, and any newer incarnations' - // (in case there is a race that allows other incarnations) - keep = (number >= versions_->ManifestFileNumber()); - break; - case kTableFile: - keep = (live.find(number) != live.end()); - break; - case kTempFile: - // Any temp files that are currently being written to must - // be recorded in pending_outputs_, which is inserted into "live" - keep = (live.find(number) != live.end()); - break; - case kCurrentFile: - case kDBLockFile: - case kInfoLogFile: - keep = true; - break; - } + // prune the database root directory + std::vector filenames; + env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose + for (size_t i = 0; i < filenames.size(); i++) { + KeepOrDelete(filenames[i], -1, live); + } // for - if (!keep) { - if (type == kTableFile) { - table_cache_->Evict(number); - } - Log(options_.info_log, "Delete type=%d #%lld\n", - int(type), - static_cast(number)); - env_->DeleteFile(dbname_ + "/" + filenames[i]); - } - } - } + // prune the table file directories + for (int level=0; 
levelGetChildren(dirname, &filenames); // Ignoring errors on purpose + for (size_t i = 0; i < filenames.size(); i++) { + KeepOrDelete(filenames[i], level, live); + } // for + } // for + } // if } -Status DBImpl::Recover(VersionEdit* edit, bool *save_manifest) { +void +DBImpl::KeepOrDelete( + const std::string & Filename, + int Level, + const std::set & Live) +{ + uint64_t number; + FileType type; + bool keep = true; + + if (ParseFileName(Filename, &number, &type)) + { + switch (type) + { + case kLogFile: + keep = ((number >= versions_->LogNumber()) || + (number == versions_->PrevLogNumber())); + break; + + case kDescriptorFile: + // Keep my manifest file, and any newer incarnations' + // (in case there is a race that allows other incarnations) + keep = (number >= versions_->ManifestFileNumber()); + break; + + case kTableFile: + keep = (Live.find(number) != Live.end()); + break; + + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "Live" + keep = (Live.find(number) != Live.end()); + break; + + case kCurrentFile: + case kDBLockFile: + case kInfoLogFile: + case kCacheWarming: + keep = true; + break; + } // switch + + if (!keep) + { + if (type == kTableFile) { + // temporary hard coding of extra overlapped + // levels + table_cache_->Evict(number, (Level(number)); + + if (-1!=Level) + { + std::string file; + + file=TableFileName(options_, number, Level); + env_->DeleteFile(file); + } // if + else + { + env_->DeleteFile(dbname_ + "/" + Filename); + } // else + } // if + } // if +} // DBImpl::KeepOrDelete + + +Status DBImpl::Recover(VersionEdit* edit) { mutex_.AssertHeld(); // Ignore error from CreateDir since the creation of the DB is // committed only when the descriptor is created, and this directory // may already exist from a previous failed creation attempt. 
- env_->CreateDir(dbname_); + env_->CreateDir(options_.tiered_fast_prefix); + env_->CreateDir(options_.tiered_slow_prefix); assert(db_lock_ == NULL); Status s = env_->LockFile(LockFileName(dbname_), &db_lock_); if (!s.ok()) { @@ -301,69 +437,155 @@ Status DBImpl::Recover(VersionEdit* edit, bool *save_manifest) { } } - s = versions_->Recover(save_manifest); - if (!s.ok()) { - return s; - } - SequenceNumber max_sequence(0); + // read manifest + s = versions_->Recover(); - // Recover from all newer log files than the ones named in the - // descriptor (new log files may have been added by the previous - // incarnation without registering them in the descriptor). - // - // Note that PrevLogNumber() is no longer used, but we pay - // attention to it in case we are recovering a database - // produced by an older version of leveldb. - const uint64_t min_log = versions_->LogNumber(); - const uint64_t prev_log = versions_->PrevLogNumber(); - std::vector filenames; - s = env_->GetChildren(dbname_, &filenames); - if (!s.ok()) { - return s; - } - std::set expected; - versions_->AddLiveFiles(&expected); - uint64_t number; - FileType type; - std::vector logs; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type)) { - expected.erase(number); - if (type == kLogFile && ((number >= min_log) || (number == prev_log))) - logs.push_back(number); - } - } - if (!expected.empty()) { - char buf[50]; - snprintf(buf, sizeof(buf), "%d missing files; e.g.", - static_cast(expected.size())); - return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin()))); - } + // Verify Riak 1.3 directory structure created and ready + if (s.ok() && !TestForLevelDirectories(env_, options_, versions_->current())) + { + int level; + std::string old_name, new_name; - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - for (size_t i = 0; i < logs.size(); i++) { - s = RecoverLogFile(logs[i], (i == logs.size() - 1), 
save_manifest, edit, - &max_sequence); + if (options_.create_if_missing) + { + // move files from old heirarchy to new + s=MakeLevelDirectories(env_, options_); + if (s.ok()) + { + for (level=0; level & level_files(versions_->current()->GetFileList(level)); + std::vector::const_iterator it; + + for (it=level_files.begin(); level_files.end()!=it && s.ok(); ++it) + { + new_name=TableFileName(options_, (*it)->number, level); + + // test for partial completion + if (!env_->FileExists(new_name.c_str())) + { + old_name=TableFileName(options_, (*it)->number, -2); + s=env_->RenameFile(old_name, new_name); + } // if + } // for + } // for + } // if + else + return s; + } // if + else + { + return Status::InvalidArgument( + dbname_, "level directories do not exist (create_if_missing is false)"); + } // else + } // if + + + if (s.ok()) { + SequenceNumber max_sequence(0); + + // Recover from all newer log files than the ones named in the + // descriptor (new log files may have been added by the previous + // incarnation without registering them in the descriptor). + // + // Note that PrevLogNumber() is no longer used, but we pay + // attention to it in case we are recovering a database + // produced by an older version of leveldb. + const uint64_t min_log = versions_->LogNumber(); + const uint64_t prev_log = versions_->PrevLogNumber(); + std::vector filenames; + s = env_->GetChildren(dbname_, &filenames); if (!s.ok()) { return s; } + uint64_t number; + FileType type; + std::vector logs; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) + && type == kLogFile + && ((number >= min_log) || (number == prev_log))) { + logs.push_back(number); + } + } - // The previous incarnation may not have written any MANIFEST - // records after allocating this log number. So we manually - // update the file number allocation counter in VersionSet. 
- versions_->MarkFileNumberUsed(logs[i]); + // Recover in the order in which the logs were generated + std::sort(logs.begin(), logs.end()); + for (size_t i = 0; i < logs.size() && s.ok(); i++) { + s = RecoverLogFile(logs[i], edit, &max_sequence); + + // The previous incarnation may not have written any MANIFEST + // records after allocating this log number. So we manually + // update the file number allocation counter in VersionSet. + versions_->MarkFileNumberUsed(logs[i]); + } + + if (s.ok()) { + if (versions_->LastSequence() < max_sequence) { + versions_->SetLastSequence(max_sequence); + } + } } - if (versions_->LastSequence() < max_sequence) { - versions_->SetLastSequence(max_sequence); - } - - return Status::OK(); + return s; } -Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log, - bool* save_manifest, VersionEdit* edit, + +void DBImpl::CheckCompactionState() +{ + mutex_.AssertHeld(); + bool log_flag, need_compaction; + + // Verify Riak 1.4 level sizing, run compactions to fix as necessary + // (also recompacts hard repair of all files to level 0) + + log_flag=false; + need_compaction=false; + + // loop on pending background compactions + // reminder: mutex_ is held + do + { + int level; + + // wait out executing compaction (Wait gives mutex to compactions) + if (IsCompactionScheduled()) + bg_cv_.Wait(); + + for (level=0, need_compaction=false; + levelIsLevelOverlapped(level) + && config::kL0_SlowdownWritesTrigger<=versions_->NumLevelFiles(level)) + { + need_compaction=true; + MaybeScheduleCompaction(); + if (!log_flag) + { + log_flag=true; + Log(options_.info_log, "Cleanup compactions started ... DB::Open paused"); + } // if + } //if + } // for + + } while(IsCompactionScheduled() && need_compaction); + + if (log_flag) + Log(options_.info_log, "Cleanup compactions completed ... 
DB::Open continuing"); + + // prior code only called this function instead of CheckCompactionState + // (duplicates original Google functionality) + else + MaybeScheduleCompaction(); + + return; + +} // DBImpl::CheckCompactionState() + + +Status DBImpl::RecoverLogFile(uint64_t log_number, + VersionEdit* edit, SequenceNumber* max_sequence) { struct LogReporter : public log::Reader::Reporter { Env* env; @@ -395,7 +617,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log, reporter.info_log = options_.info_log; reporter.fname = fname.c_str(); reporter.status = (options_.paranoid_checks ? &status : NULL); - // We intentionally make log::Reader do checksumming even if + // We intentially make log::Reader do checksumming even if // paranoid_checks==false so that corruptions cause entire commits // to be skipped instead of propagating bad information (like overly // large sequence numbers). @@ -408,13 +630,12 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log, std::string scratch; Slice record; WriteBatch batch; - int compactions = 0; MemTable* mem = NULL; while (reader.ReadRecord(&record, &scratch) && status.ok()) { if (record.size() < 12) { reporter.Corruption( - record.size(), Status::Corruption("log record too small", fname)); + record.size(), Status::Corruption("log record too small")); continue; } WriteBatchInternal::SetContents(&batch, record); @@ -423,7 +644,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log, mem = new MemTable(internal_comparator_); mem->Ref(); } - status = WriteBatchInternal::InsertInto(&batch, mem); + status = WriteBatchInternal::InsertInto(&batch, mem, &options_); MaybeIgnoreError(&status); if (!status.ok()) { break; @@ -436,77 +657,68 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log, } if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { - compactions++; - *save_manifest = true; status = WriteLevel0Table(mem, edit, NULL); - mem->Unref(); - mem = NULL; if 
(!status.ok()) { // Reflect errors immediately so that conditions like full // file-systems cause the DB::Open() to fail. break; } + mem->Unref(); + mem = NULL; } } + if (status.ok() && mem != NULL) { + status = WriteLevel0Table(mem, edit, NULL); + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + } + + if (mem != NULL) mem->Unref(); delete file; - - // See if we should keep reusing the last log file. - if (status.ok() && options_.reuse_logs && last_log && compactions == 0) { - assert(logfile_ == NULL); - assert(log_ == NULL); - assert(mem_ == NULL); - uint64_t lfile_size; - if (env_->GetFileSize(fname, &lfile_size).ok() && - env_->NewAppendableFile(fname, &logfile_).ok()) { - Log(options_.info_log, "Reusing old log %s \n", fname.c_str()); - log_ = new log::Writer(logfile_, lfile_size); - logfile_number_ = log_number; - if (mem != NULL) { - mem_ = mem; - mem = NULL; - } else { - // mem can be NULL if lognum exists but was empty. - mem_ = new MemTable(internal_comparator_); - mem_->Ref(); - } - } - } - - if (mem != NULL) { - // mem did not get reused; compact it. 
- if (status.ok()) { - *save_manifest = true; - status = WriteLevel0Table(mem, edit, NULL); - } - mem->Unref(); - } - return status; } -Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, +Status DBImpl::WriteLevel0Table(volatile MemTable* mem, VersionEdit* edit, Version* base) { mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; meta.number = versions_->NewFileNumber(); + meta.level = 0; pending_outputs_.insert(meta.number); - Iterator* iter = mem->NewIterator(); - Log(options_.info_log, "Level-0 table #%llu: started", - (unsigned long long) meta.number); + Iterator* iter = ((MemTable *)mem)->NewIterator(); + SequenceNumber smallest_snapshot; + + if (snapshots_.empty()) { + smallest_snapshot = versions_->LastSequence(); + } else { + smallest_snapshot = snapshots_.oldest()->number_; + } Status s; { + Options local_options; + mutex_.Unlock(); - s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); + Log(options_.info_log, "Level-0 table #%llu: started", + (unsigned long long) meta.number); + + // want the data slammed to disk as fast as possible, + // no compression for level 0. 
+ local_options=options_; + // matthewv Nov 2, 2016 local_options.compression=kNoCompression; + local_options.block_size=current_block_size_; + s = BuildTable(dbname_, env_, local_options, user_comparator(), table_cache_, iter, &meta, smallest_snapshot); + + Log(options_.info_log, "Level-0 table #%llu: %llu bytes, %llu keys %s", + (unsigned long long) meta.number, + (unsigned long long) meta.file_size, + (unsigned long long) meta.num_entries, + s.ToString().c_str()); mutex_.Lock(); } - Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s", - (unsigned long long) meta.number, - (unsigned long long) meta.file_size, - s.ToString().c_str()); delete iter; pending_outputs_.erase(meta.number); @@ -518,20 +730,75 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, const Slice min_user_key = meta.smallest.user_key(); const Slice max_user_key = meta.largest.user_key(); if (base != NULL) { - level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); + int level_limit; + if (0!=options_.tiered_slow_level && (options_.tiered_slow_level-1)(config::kMaxMemCompactLevel)) + level_limit=options_.tiered_slow_level-1; + else + level_limit=config::kMaxMemCompactLevel; + + // remember, mutex is held so safe to push file into a non-compacting level + level = base->PickLevelForMemTableOutput(min_user_key, max_user_key, level_limit); + if (versions_->IsCompactionSubmitted(level) || !versions_->NeighborCompactionsQuiet(level)) + level=0; + + if (0!=level) + { + Status move_s; + std::string old_name, new_name; + + old_name=TableFileName(options_, meta.number, 0); + new_name=TableFileName(options_, meta.number, level); + move_s=env_->RenameFile(old_name, new_name); + + if (move_s.ok()) + { + // builder already added file to table_cache with 2 references and + // marked as level 0 (used by cache warming) ... 
going to remove from cache + // and add again correctly + table_cache_->Evict(meta.number, true); + meta.level=level; + + // sadly, we must hold the mutex during this file open + // since operating in non-overlapped level + Iterator* it=table_cache_->NewIterator(ReadOptions(), + meta.number, + meta.file_size, + meta.level); + delete it; + + // argh! logging while holding mutex ... cannot release + Log(options_.info_log, "Level-0 table #%llu: moved to level %d", + (unsigned long long) meta.number, + level); + } // if + else + { + level=0; + } // else + } // if } - edit->AddFile(level, meta.number, meta.file_size, - meta.smallest, meta.largest); + + if (s.ok()) + edit->AddFile2(level, meta.number, meta.file_size, + meta.smallest, meta.largest, + meta.exp_write_low, meta.exp_write_high, meta.exp_explicit_high); } CompactionStats stats; stats.micros = env_->NowMicros() - start_micros; stats.bytes_written = meta.file_size; stats_[level].Add(stats); + + // Riak adds extra reference to file, must remove it + // in this race condition upon close + if (s.ok() && shutting_down_.Acquire_Load()) { + table_cache_->Evict(meta.number, versions_->IsLevelOverlapped(level)); + } + return s; } -void DBImpl::CompactMemTable() { +Status DBImpl::CompactMemTable() { mutex_.AssertHeld(); assert(imm_ != NULL); @@ -559,9 +826,9 @@ void DBImpl::CompactMemTable() { imm_ = NULL; has_imm_.Release_Store(NULL); DeleteObsoleteFiles(); - } else { - RecordBackgroundError(s); } + + return s; } void DBImpl::CompactRange(const Slice* begin, const Slice* end) { @@ -575,7 +842,7 @@ void DBImpl::CompactRange(const Slice* begin, const Slice* end) { } } } - TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap + CompactMemTableSynchronous(); // TODO(sanjay): Skip if memtable does not overlap for (int level = 0; level < max_level_with_files; level++) { TEST_CompactRange(level, begin, end); } @@ -593,32 +860,40 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* 
end) { if (begin == NULL) { manual.begin = NULL; } else { - begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek); + begin_storage = InternalKey(*begin, 0, kMaxSequenceNumber, kValueTypeForSeek); manual.begin = &begin_storage; } if (end == NULL) { manual.end = NULL; } else { - end_storage = InternalKey(*end, 0, static_cast(0)); + end_storage = InternalKey(*end, 0, 0, static_cast(0)); manual.end = &end_storage; } MutexLock l(&mutex_); - while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { - if (manual_compaction_ == NULL) { // Idle - manual_compaction_ = &manual; - MaybeScheduleCompaction(); - } else { // Running either my compaction or another compaction. + while (!manual.done) { + while (manual_compaction_ != NULL || IsCompactionScheduled()) { + bg_cv_.Wait(); + } + manual_compaction_ = &manual; + MaybeScheduleCompaction(); + while (manual_compaction_ == &manual) { bg_cv_.Wait(); } } - if (manual_compaction_ == &manual) { - // Cancel my manual compaction since we aborted early for some reason. - manual_compaction_ = NULL; - } } +/** + * This "test" routine was used in one production location, + * then two with addition of hot backup. Inappropriate for + * TEST_ prefix if used in production. 
+ */ Status DBImpl::TEST_CompactMemTable() { + return(CompactMemTableSynchronous()); +} // TEST_CompactMemTable + + +Status DBImpl::CompactMemTableSynchronous() { // NULL batch means just wait for earlier writes to be done Status s = Write(WriteOptions(), NULL); if (s.ok()) { @@ -634,68 +909,168 @@ Status DBImpl::TEST_CompactMemTable() { return s; } -void DBImpl::RecordBackgroundError(const Status& s) { - mutex_.AssertHeld(); - if (bg_error_.ok()) { - bg_error_ = s; - bg_cv_.SignalAll(); - } -} - void DBImpl::MaybeScheduleCompaction() { mutex_.AssertHeld(); - if (bg_compaction_scheduled_) { - // Already scheduled - } else if (shutting_down_.Acquire_Load()) { - // DB is being deleted; no more background compactions - } else if (!bg_error_.ok()) { - // Already got an error; no more changes - } else if (imm_ == NULL && - manual_compaction_ == NULL && - !versions_->NeedsCompaction()) { - // No work to be done - } else { - bg_compaction_scheduled_ = true; - env_->Schedule(&DBImpl::BGWork, this); - } + + if (!shutting_down_.Acquire_Load()) + { + if (NULL==manual_compaction_) + { + // ask versions_ to schedule work to hot threads + versions_->PickCompaction(this); + } // if + + else if (!versions_->IsCompactionSubmitted(manual_compaction_->level)) + { + // support manual compaction under hot threads + versions_->SetCompactionSubmitted(manual_compaction_->level); + ThreadTask * task=new CompactionTask(this, NULL); + gCompactionThreads->Submit(task, true); + } // else if + } // if } -void DBImpl::BGWork(void* db) { - reinterpret_cast(db)->BackgroundCall(); -} -void DBImpl::BackgroundCall() { +void DBImpl::BackgroundCall2( + Compaction * Compact) { MutexLock l(&mutex_); - assert(bg_compaction_scheduled_); - if (shutting_down_.Acquire_Load()) { - // No more background work when shutting down. - } else if (!bg_error_.ok()) { - // No more background work after a background error. 
- } else { - BackgroundCompaction(); - } + int level, type; + assert(IsCompactionScheduled()); - bg_compaction_scheduled_ = false; + type=kNormalCompaction; + ++running_compactions_; + if (NULL!=Compact) + { + level=Compact->level(); + type=Compact->GetCompactionType(); + } // if + else if (NULL!=manual_compaction_) + level=manual_compaction_->level; + else + level=0; + + if (0==level) + gPerfCounters->Inc(ePerfBGCompactLevel0); + else + gPerfCounters->Inc(ePerfBGNormal); + + versions_->SetCompactionRunning(level); + + if (!shutting_down_.Acquire_Load()) { + Status s; + + switch(type) + { + case kNormalCompaction: + s = BackgroundCompaction(Compact); + break; + + case kExpiryFileCompaction: + s = BackgroundExpiry(Compact); + break; + + default: + assert(0); + break; + } // switch + + if (!s.ok() && !shutting_down_.Acquire_Load()) { + // Wait a little bit before retrying background compaction in + // case this is an environmental problem and we do not want to + // chew up resources for failed compactions for the duration of + // the problem. + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + Log(options_.info_log, "Waiting after background compaction error: %s", + s.ToString().c_str()); + env_->SleepForMicroseconds(1000000); + mutex_.Lock(); + } + } + else + { + delete Compact; + } // else + + --running_compactions_; + versions_->SetCompactionDone(level, env_->NowMicros()); // Previous compaction may have produced too many files in a level, // so reschedule another compaction if needed. 
- MaybeScheduleCompaction(); + if (!options_.is_repair) + MaybeScheduleCompaction(); + bg_cv_.SignalAll(); + +} + + +void +DBImpl::BackgroundImmCompactCall() { + MutexLock l(&mutex_); + assert(NULL != imm_); + Status s; + + ++running_compactions_; + gPerfCounters->Inc(ePerfBGCompactImm); + + if (!shutting_down_.Acquire_Load()) { + s = CompactMemTable(); + if (!s.ok() && !shutting_down_.Acquire_Load()) { + // Wait a little bit before retrying background compaction in + // case this is an environmental problem and we do not want to + // chew up resources for failed compactions for the duration of + // the problem. + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + Log(options_.info_log, "Waiting after background imm compaction error: %s", + s.ToString().c_str()); + env_->SleepForMicroseconds(1000000); + mutex_.Lock(); + } + } + + --running_compactions_; + + // Previous compaction may have produced too many files in a level, + // so reschedule another compaction if needed. + if (!options_.is_repair) + MaybeScheduleCompaction(); + + // shutdown is waiting for this imm_ to clear + if (shutting_down_.Acquire_Load()) { + + // must abandon data in memory ... 
hope recovery log works + if (NULL!=imm_) + imm_->Unref(); + imm_ = NULL; + has_imm_.Release_Store(NULL); + } // if + + // retry imm compaction if failed and not shutting down + else if (!s.ok()) + { + ThreadTask * task=new ImmWriteTask(this); + gImmThreads->Submit(task, true); + } // else + bg_cv_.SignalAll(); } -void DBImpl::BackgroundCompaction() { + +Status DBImpl::BackgroundCompaction( + Compaction * Compact) { + Status status; + bool do_compact(true); + mutex_.AssertHeld(); - if (imm_ != NULL) { - CompactMemTable(); - return; - } - - Compaction* c; + Compaction* c(Compact); bool is_manual = (manual_compaction_ != NULL); InternalKey manual_end; - if (is_manual) { - ManualCompaction* m = manual_compaction_; + if (NULL!=c) { + // do nothing in this work block + } else if (is_manual) { + ManualCompaction* m = (ManualCompaction *) manual_compaction_; c = versions_->CompactRange(m->level, m->begin, m->end); m->done = (c == NULL); if (c != NULL) { @@ -708,36 +1083,58 @@ void DBImpl::BackgroundCompaction() { (m->end ? m->end->DebugString().c_str() : "(end)"), (m->done ? 
"(end)" : manual_end.DebugString().c_str())); } else { - c = versions_->PickCompaction(); + // c = versions_->PickCompaction(); } - Status status; + if (c == NULL) { // Nothing to do - } else if (!is_manual && c->IsTrivialMove()) { + do_compact=false; + } else if (!is_manual && c->IsTrivialMove() + && (c->level()+1)!=(int)options_.tiered_slow_level) { // Move file to next level assert(c->num_input_files(0) == 1); + std::string old_name, new_name; FileMetaData* f = c->input(0, 0); - c->edit()->DeleteFile(c->level(), f->number); - c->edit()->AddFile(c->level() + 1, f->number, f->file_size, - f->smallest, f->largest); - status = versions_->LogAndApply(c->edit(), &mutex_); - if (!status.ok()) { - RecordBackgroundError(status); - } - VersionSet::LevelSummaryStorage tmp; - Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", - static_cast(f->number), - c->level() + 1, - static_cast(f->file_size), - status.ToString().c_str(), - versions_->LevelSummary(&tmp)); - } else { + + old_name=TableFileName(options_, f->number, c->level()); + new_name=TableFileName(options_, f->number, c->level() +1); + status=env_->RenameFile(old_name, new_name); + + if (status.ok()) + { + gPerfCounters->Inc(ePerfBGMove); + do_compact=false; + c->edit()->DeleteFile(c->level(), f->number); + c->edit()->AddFile2(c->level() + 1, f->number, f->file_size, + f->smallest, f->largest, + f->exp_write_low, f->exp_write_high, f->exp_explicit_high); + status = versions_->LogAndApply(c->edit(), &mutex_); + DeleteObsoleteFiles(); + + // if LogAndApply fails, should file be renamed back to original spot? + VersionSet::LevelSummaryStorage tmp; + Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", + static_cast(f->number), + c->level() + 1, + static_cast(f->file_size), + status.ToString().c_str(), + versions_->LevelSummary(&tmp)); + + // no time, no keys ... just make the call so that one compaction + // gets posted against potential backlog ... 
extremely important + // to write throttle logic. + SetThrottleWriteRate(0, 0, (0 == c->level())); + } // if + else { + // retry as compaction instead of move + do_compact=true; // redundant but safe + gPerfCounters->Inc(ePerfBGMoveFail); + } // else + } + if (do_compact) { CompactionState* compact = new CompactionState(c); status = DoCompactionWork(compact); - if (!status.ok()) { - RecordBackgroundError(status); - } CleanupCompaction(compact); c->ReleaseInputs(); DeleteObsoleteFiles(); @@ -751,10 +1148,13 @@ void DBImpl::BackgroundCompaction() { } else { Log(options_.info_log, "Compaction error: %s", status.ToString().c_str()); + if (options_.paranoid_checks && bg_error_.ok()) { + bg_error_ = status; + } } if (is_manual) { - ManualCompaction* m = manual_compaction_; + ManualCompaction* m = (ManualCompaction *)manual_compaction_; if (!status.ok()) { m->done = true; } @@ -766,6 +1166,8 @@ void DBImpl::BackgroundCompaction() { } manual_compaction_ = NULL; } + + return status; } void DBImpl::CleanupCompaction(CompactionState* compact) { @@ -785,10 +1187,14 @@ void DBImpl::CleanupCompaction(CompactionState* compact) { delete compact; } -Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { +Status DBImpl::OpenCompactionOutputFile( + CompactionState* compact, + size_t sample_value_size) { assert(compact != NULL); assert(compact->builder == NULL); uint64_t file_number; + bool pagecache_flag; + { mutex_.Lock(); file_number = versions_->NewFileNumber(); @@ -798,18 +1204,230 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { out.smallest.Clear(); out.largest.Clear(); compact->outputs.push_back(out); + pagecache_flag=Send2PageCache(compact); mutex_.Unlock(); } // Make the output file - std::string fname = TableFileName(dbname_, file_number); - Status s = env_->NewWritableFile(fname, &compact->outfile); + std::string fname = TableFileName(options_, file_number, compact->compaction->level()+1); + Status s = env_->NewWritableFile(fname, 
&compact->outfile, gMapSize); if (s.ok()) { - compact->builder = new TableBuilder(options_, compact->outfile); - } + Options options; + options=options_; + options.block_size=current_block_size_; + + // consider larger block size if option enabled (block_size_steps!=0) + // and low on file cache space + if (0!=options.block_size_steps) + { + uint64_t now; + + now=env_->NowMicros(); + + if (!double_cache.GetPlentySpace()) + { + // keep track of last time there was lack of space. + // use info in block below to revert block_size + last_low_mem_=now; + + // do not make changes often, a multi file compaction + // could raise more than one step (5 min) + if (block_size_changed_+(5*60*1000000L) < now) + { + size_t old_size=current_block_size_; + + options.block_size=MaybeRaiseBlockSize(*compact->compaction, sample_value_size); + + // did size change? + if (options.block_size!=old_size) + { + block_size_changed_=now; + } // if + } // if + + } // if + + // has system's memory been ok for a while now + else if (last_low_mem_+double_cache.GetFileTimeout()*1000000L < now) + { + // reset size to original, data could have been deleted and/or old + // files no longer need cache space + current_block_size_=options_.block_size; + } // else if + + } // if + + // force call to CalcInputState to set IsCompressible + compact->compaction->CalcInputStats(*table_cache_); + + // do not attempt compression if data known to not compress + if (kSnappyCompression==options.compression && !compact->compaction->IsCompressible()) + { + options.compression=kNoCompressionAutomated; + Log(options.info_log, "kNoCompressionAutomated"); + } // if + + + // tune fadvise to keep as much of the file data in RAM as + // reasonably possible + if (pagecache_flag) + compact->outfile->SetMetadataOffset(1); + compact->builder = new TableBuilder(options, compact->outfile); + } // if + return s; } + +bool +DBImpl::Send2PageCache( + CompactionState* compact) +{ + bool ret_flag; + + mutex_.AssertHeld(); + + // tune 
fadvise to keep all of the lower level file in page cache + // (compaction of unsorted files causes severe cache misses) + if (versions_->IsLevelOverlapped(compact->compaction->level())) +// if (0==compact->compaction->level()) + { + ret_flag=true; + } // if + + // look at current RAM availability to decide whether or not to keep + // file data in page cache + else + { + size_t avail_block; + int64_t lower_levels; + int level; + + // current block cache size without PageCache estimation + avail_block=double_cache.GetCapacity(false, false); + + lower_levels=0; + for (level=0; level<=compact->compaction->level(); ++level) + lower_levels+=versions_->NumLevelBytes(level); + + // does the block cache's unadjusted size exceed higher + // volatility file sizes in lower levels? + ret_flag=(lower_levels<=(int64_t)avail_block); + } // else + + return(ret_flag); + +} // DbImpl::Send2PageCache + +size_t +DBImpl::MaybeRaiseBlockSize( + Compaction & CompactionStuff, + size_t SampleValueSize) +{ + size_t new_block_size, tot_user_data, tot_index_keys, avg_value_size, + avg_key_size, avg_block_size; + + // start with most recent dynamic sizing + new_block_size=current_block_size_; + + // + // 1. Get estimates for key values. Zero implies unable to estimate + // (as the formula is tuned, some of the values become unused ... apologies + CompactionStuff.CalcInputStats(*table_cache_); + tot_user_data=CompactionStuff.TotalUserDataSize(); + tot_index_keys=CompactionStuff.TotalIndexKeys(); + avg_value_size=CompactionStuff.AverageValueSize(); + avg_key_size=CompactionStuff.AverageKeySize(); + avg_block_size=CompactionStuff.AverageBlockSize(); + + // CalcInputStats does not have second source for avg_value_size. + // Use size of next key. 
+ if (0==avg_value_size) + avg_value_size=SampleValueSize; + + Log(options_.info_log, + "Block stats used %zd user data, %zd index keys, %zd avg value, %zd avg key, %zd avg block", + tot_user_data, tot_index_keys, avg_value_size, avg_key_size, avg_block_size); + + // + // 2. Define boundaries of block size steps. Calculate + // "next step" + // + if (0!=tot_user_data && 0!=tot_index_keys && 0!=avg_value_size + && 0!=avg_key_size && 0!=avg_block_size) + { + size_t high_size, low_size, cur_size, increment, file_data_size, keys_per_file; + + // 2a. Highest block size: + // (sqrt()/sqrt() stuff is from first derivative to minimize + // total read size of one block plus file metadata) + + // limited by keys or filesize? (pretend metadata is zero, i love pretend games) + file_data_size=versions_->MaxFileSizeForLevel(CompactionStuff.level()); + keys_per_file=file_data_size / avg_value_size; + + if (300000 < keys_per_file) + { + keys_per_file = 300000; + file_data_size = avg_value_size * keys_per_file; + } // if + + // cast to double inside sqrt() is required for Solaris 13 + high_size=(size_t)((double)file_data_size / (sqrt((double)file_data_size)/sqrt((double)avg_key_size))); + + // 2b. Lowest block size: largest of given block size or average value size + // because large values are one block + if (avg_value_size < options_.block_size) + low_size=options_.block_size; + else + low_size=avg_value_size; + + // 2c. 
Current block size: compaction can skew numbers in files + // without counters, use current dynamic block_size in that case + if (options_.block_size < avg_block_size) + cur_size=avg_block_size; + else + cur_size=current_block_size_; + + // safety check values to eliminate negatives + if (low_size <= high_size) + { + size_t cur_step; + + increment=(high_size - low_size)/options_.block_size_steps; + + // adjust old, too low stuff + if (low_size < cur_size) + cur_step=(cur_size - low_size)/increment; + else + cur_step=0; + + // move to next step, but not over the top step + if (cur_step < (size_t)options_.block_size_steps) + ++cur_step; + else + cur_step=options_.block_size_steps; + + // + // 3. Set new block size to next higher step + // + new_block_size=low_size + increment * cur_step; + + Log(options_.info_log, + "Block size selected %zd block size, %zd cur, %zd low, %zd high, %zd inc, %zd step", + new_block_size, cur_size, low_size, high_size, increment, cur_step); + + // This is not thread safe, but not worthy of mutex either + if (current_block_size_ < new_block_size) + current_block_size_ = new_block_size; + } // if + } // if + + return(new_block_size); + +} // DBImpl::MaybeRaiseBlockSize + + Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, Iterator* input) { assert(compact != NULL); @@ -830,6 +1448,10 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, const uint64_t current_bytes = compact->builder->FileSize(); compact->current_output()->file_size = current_bytes; compact->total_bytes += current_bytes; + compact->num_entries += compact->builder->NumEntries(); + compact->current_output()->exp_write_low = compact->builder->GetExpiryWriteLow(); + compact->current_output()->exp_write_high = compact->builder->GetExpiryWriteHigh(); + compact->current_output()->exp_explicit_high = compact->builder->GetExpiryExplicitHigh(); delete compact->builder; compact->builder = NULL; @@ -845,16 +1467,25 @@ Status 
DBImpl::FinishCompactionOutputFile(CompactionState* compact, if (s.ok() && current_entries > 0) { // Verify that the table is usable + Table * table_ptr; Iterator* iter = table_cache_->NewIterator(ReadOptions(), output_number, - current_bytes); + current_bytes, + compact->compaction->level()+1, + &table_ptr); s = iter->status(); + // Riak specific: bloom filter is no longer read by default, + // force read on highly used overlapped table files + if (s.ok() && VersionSet::IsLevelOverlapped(compact->compaction->level()+1)) + table_ptr->ReadFilter(); + + // table_ptr invalidated by this delete delete iter; + if (s.ok()) { Log(options_.info_log, - "Generated table #%llu@%d: %lld keys, %lld bytes", + "Generated table #%llu: %lld keys, %lld bytes", (unsigned long long) output_number, - compact->compaction->level(), (unsigned long long) current_entries, (unsigned long long) current_bytes); } @@ -865,34 +1496,31 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, Status DBImpl::InstallCompactionResults(CompactionState* compact) { mutex_.AssertHeld(); + + mutex_.Unlock(); + // release lock while writing Log entry, could stall Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), compact->compaction->level() + 1, static_cast(compact->total_bytes)); + mutex_.Lock(); // Add compaction outputs compact->compaction->AddInputDeletions(compact->compaction->edit()); const int level = compact->compaction->level(); for (size_t i = 0; i < compact->outputs.size(); i++) { const CompactionState::Output& out = compact->outputs[i]; - compact->compaction->edit()->AddFile( + compact->compaction->edit()->AddFile2( level + 1, - out.number, out.file_size, out.smallest, out.largest); + out.number, out.file_size, out.smallest, out.largest, + out.exp_write_low, out.exp_write_high, out.exp_explicit_high); } return 
versions_->LogAndApply(compact->compaction->edit(), &mutex_); } Status DBImpl::DoCompactionWork(CompactionState* compact) { - const uint64_t start_micros = env_->NowMicros(); - int64_t imm_micros = 0; // Micros spent doing imm_ compactions - - Log(options_.info_log, "Compacting %d@%d + %d@%d files", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->level() + 1); assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); assert(compact->builder == NULL); @@ -906,29 +1534,28 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { // Release mutex while we're actually doing the compaction work mutex_.Unlock(); + Log(options_.info_log, "Compacting %d@%d + %d@%d files", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->level() + 1); + + bool is_level0_compaction=(0 == compact->compaction->level()); + + const uint64_t start_micros = env_->NowMicros(); + Iterator* input = versions_->MakeInputIterator(compact->compaction); input->SeekToFirst(); Status status; - ParsedInternalKey ikey; - std::string current_user_key; - bool has_current_user_key = false; - SequenceNumber last_sequence_for_key = kMaxSequenceNumber; - for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { - // Prioritize immutable compaction work - if (has_imm_.NoBarrier_Load() != NULL) { - const uint64_t imm_start = env_->NowMicros(); - mutex_.Lock(); - if (imm_ != NULL) { - CompactMemTable(); - bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary - } - mutex_.Unlock(); - imm_micros += (env_->NowMicros() - imm_start); - } + KeyRetirement retire(user_comparator(), compact->smallest_snapshot, &options_, compact->compaction); + + for (; input->Valid() && !shutting_down_.Acquire_Load(); ) + { Slice key = input->key(); - if (compact->compaction->ShouldStopBefore(key) && - compact->builder != NULL) { + if 
(compact->builder != NULL + && compact->compaction->ShouldStopBefore(key, compact->builder->NumEntries())) { + status = FinishCompactionOutputFile(compact, input); if (!status.ok()) { break; @@ -936,54 +1563,12 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } // Handle key/value, add to state, etc. - bool drop = false; - if (!ParseInternalKey(key, &ikey)) { - // Do not hide error keys - current_user_key.clear(); - has_current_user_key = false; - last_sequence_for_key = kMaxSequenceNumber; - } else { - if (!has_current_user_key || - user_comparator()->Compare(ikey.user_key, - Slice(current_user_key)) != 0) { - // First occurrence of this user key - current_user_key.assign(ikey.user_key.data(), ikey.user_key.size()); - has_current_user_key = true; - last_sequence_for_key = kMaxSequenceNumber; - } - - if (last_sequence_for_key <= compact->smallest_snapshot) { - // Hidden by an newer entry for same user key - drop = true; // (A) - } else if (ikey.type == kTypeDeletion && - ikey.sequence <= compact->smallest_snapshot && - compact->compaction->IsBaseLevelForKey(ikey.user_key)) { - // For this user key: - // (1) there is no data in higher levels - // (2) data in lower levels will have larger sequence numbers - // (3) data in layers that are being compacted here and have - // smaller sequence numbers will be dropped in the next - // few iterations of this loop (by rule (A) above). - // Therefore this deletion marker is obsolete and can be dropped. 
- drop = true; - } - - last_sequence_for_key = ikey.sequence; - } -#if 0 - Log(options_.info_log, - " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " - "%d smallest_snapshot: %d", - ikey.user_key.ToString().c_str(), - (int)ikey.sequence, ikey.type, kTypeValue, drop, - compact->compaction->IsBaseLevelForKey(ikey.user_key), - (int)last_sequence_for_key, (int)compact->smallest_snapshot); -#endif + bool drop = retire(key); if (!drop) { // Open output file if necessary if (compact->builder == NULL) { - status = OpenCompactionOutputFile(compact); + status = OpenCompactionOutputFile(compact, input->value().size() + key.size()); if (!status.ok()) { break; } @@ -1009,6 +1594,17 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { if (status.ok() && shutting_down_.Acquire_Load()) { status = Status::IOError("Deleting DB during compaction"); +#if 0 // validating this block is redundant (eleveldb issue #110) + // cleanup Riak modification that adds extra reference + // to overlap levels files. 
+ if (compact->compaction->level() < config::kNumOverlapLevels) + { + for (size_t i = 0; i < compact->outputs.size(); i++) { + const CompactionState::Output& out = compact->outputs[i]; + versions_->GetTableCache()->Evict(out.number, true); + } // for + } // if +#endif } if (status.ok() && compact->builder != NULL) { status = FinishCompactionOutputFile(compact, input); @@ -1020,7 +1616,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { input = NULL; CompactionStats stats; - stats.micros = env_->NowMicros() - start_micros - imm_micros; + stats.micros = env_->NowMicros() - start_micros; for (int which = 0; which < 2; which++) { for (int i = 0; i < compact->compaction->num_input_files(which); i++) { stats.bytes_read += compact->compaction->input(which, i)->file_size; @@ -1030,27 +1626,31 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { stats.bytes_written += compact->outputs[i].file_size; } + // write log before taking mutex_ + VersionSet::LevelSummaryStorage tmp; + Log(options_.info_log, + "compacted to: %s", versions_->LevelSummary(&tmp)); + mutex_.Lock(); stats_[compact->compaction->level() + 1].Add(stats); if (status.ok()) { + if (0!=compact->num_entries) + SetThrottleWriteRate((env_->NowMicros() - start_micros), + compact->num_entries, is_level0_compaction); status = InstallCompactionResults(compact); } - if (!status.ok()) { - RecordBackgroundError(status); - } - VersionSet::LevelSummaryStorage tmp; - Log(options_.info_log, - "compacted to: %s", versions_->LevelSummary(&tmp)); + return status; } + namespace { struct IterState { port::Mutex* mu; Version* version; MemTable* mem; - MemTable* imm; + volatile MemTable* imm; }; static void CleanupIteratorState(void* arg1, void* arg2) { @@ -1065,8 +1665,7 @@ static void CleanupIteratorState(void* arg1, void* arg2) { } // namespace Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, - SequenceNumber* latest_snapshot, - uint32_t* seed) { + SequenceNumber* latest_snapshot) { 
IterState* cleanup = new IterState; mutex_.Lock(); *latest_snapshot = versions_->LastSequence(); @@ -1076,7 +1675,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, list.push_back(mem_->NewIterator()); mem_->Ref(); if (imm_ != NULL) { - list.push_back(imm_->NewIterator()); + list.push_back(((MemTable *)imm_)->NewIterator()); imm_->Ref(); } versions_->current()->AddIterators(options, &list); @@ -1090,15 +1689,13 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, cleanup->version = versions_->current(); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL); - *seed = ++seed_; mutex_.Unlock(); return internal_iter; } Iterator* DBImpl::TEST_NewInternalIterator() { SequenceNumber ignored; - uint32_t ignored_seed; - return NewInternalIterator(ReadOptions(), &ignored, &ignored_seed); + return NewInternalIterator(ReadOptions(), &ignored); } int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { @@ -1108,7 +1705,16 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { Status DBImpl::Get(const ReadOptions& options, const Slice& key, - std::string* value) { + std::string* value, + KeyMetaData * meta) { + StringValue stringvalue(*value); + return DBImpl::Get(options, key, &stringvalue, meta); +} + +Status DBImpl::Get(const ReadOptions& options, + const Slice& key, + Value* value, + KeyMetaData * meta) { Status s; MutexLock l(&mutex_); SequenceNumber snapshot; @@ -1119,7 +1725,7 @@ Status DBImpl::Get(const ReadOptions& options, } MemTable* mem = mem_; - MemTable* imm = imm_; + volatile MemTable* imm = imm_; Version* current = versions_->current(); mem->Ref(); if (imm != NULL) imm->Ref(); @@ -1132,44 +1738,44 @@ Status DBImpl::Get(const ReadOptions& options, { mutex_.Unlock(); // First look in the memtable, then in the immutable memtable (if any). 
- LookupKey lkey(key, snapshot); - if (mem->Get(lkey, value, &s)) { + LookupKey lkey(key, snapshot, meta); + if (mem->Get(lkey, value, &s, &options_)) { // Done - } else if (imm != NULL && imm->Get(lkey, value, &s)) { + gPerfCounters->Inc(ePerfGetMem); + } else if (imm != NULL && ((MemTable *)imm)->Get(lkey, value, &s, &options_)) { // Done + gPerfCounters->Inc(ePerfGetImm); } else { s = current->Get(options, lkey, value, &stats); have_stat_update = true; + gPerfCounters->Inc(ePerfGetVersion); } mutex_.Lock(); } if (have_stat_update && current->UpdateStats(stats)) { - MaybeScheduleCompaction(); + // no compactions initiated by reads, takes too long + // MaybeScheduleCompaction(); } mem->Unref(); if (imm != NULL) imm->Unref(); current->Unref(); + + gPerfCounters->Inc(ePerfApiGet); + return s; } Iterator* DBImpl::NewIterator(const ReadOptions& options) { SequenceNumber latest_snapshot; - uint32_t seed; - Iterator* iter = NewInternalIterator(options, &latest_snapshot, &seed); + Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); + gPerfCounters->Inc(ePerfIterNew); return NewDBIterator( - this, user_comparator(), iter, + &dbname_, env_, user_comparator(), internal_iter, (options.snapshot != NULL ? 
reinterpret_cast(options.snapshot)->number_ : latest_snapshot), - seed); -} - -void DBImpl::RecordReadSample(Slice key) { - MutexLock l(&mutex_); - if (versions_->current()->RecordReadSample(key)) { - MaybeScheduleCompaction(); - } + options_.expiry_module.get()); } const Snapshot* DBImpl::GetSnapshot() { @@ -1183,8 +1789,8 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { } // Convenience methods -Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) { - return DB::Put(o, key, val); +Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val, const KeyMetaData * meta) { + return DB::Put(o, key, val, meta); } Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { @@ -1192,22 +1798,27 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { } Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { + Status status; + int throttle(0); + Writer w(&mutex_); w.batch = my_batch; w.sync = options.sync; w.done = false; + { // place mutex_ within a block + // not changing tabs to ease compare to Google sources MutexLock l(&mutex_); writers_.push_back(&w); while (!w.done && &w != writers_.front()) { w.cv.Wait(); } if (w.done) { - return w.status; + return w.status; // skips throttle ... maintenance unfriendly coding, bastards } // May temporarily unlock and wait. 
- Status status = MakeRoomForWrite(my_batch == NULL); + status = MakeRoomForWrite(my_batch == NULL); uint64_t last_sequence = versions_->LastSequence(); Writer* last_writer = &w; if (status.ok() && my_batch != NULL) { // NULL batch is for compactions @@ -1222,23 +1833,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { { mutex_.Unlock(); status = log_->AddRecord(WriteBatchInternal::Contents(updates)); - bool sync_error = false; if (status.ok() && options.sync) { status = logfile_->Sync(); - if (!status.ok()) { - sync_error = true; - } } if (status.ok()) { - status = WriteBatchInternal::InsertInto(updates, mem_); + status = WriteBatchInternal::InsertInto(updates, mem_, &options_); } mutex_.Lock(); - if (sync_error) { - // The state of the log file is indeterminate: the log record we - // just added may or may not show up when the DB is re-opened. - // So we force the DB into a mode where all future writes fail. - RecordBackgroundError(status); - } } if (updates == tmp_batch_) tmp_batch_->Clear(); @@ -1261,12 +1862,75 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { writers_.front()->cv.Signal(); } + gPerfCounters->Inc(ePerfApiWrite); + + // protect use of versions_ ... 
still within scope of mutex_ lock + throttle=versions_->WriteThrottleUsec(IsCompactionScheduled()); + } // release MutexLock l(&mutex_) + + + // throttle on exit to reduce possible reordering + if (0!=throttle) + { + uint64_t now, remaining_wait, new_end, batch_wait; + int batch_count; + + /// slowing each call down sequentially + MutexLock l(&throttle_mutex_); + + // server may have been busy since previous write, + // use only the remaining time as throttle + now=env_->NowMicros(); + + if (now < throttle_end) + { + + remaining_wait=throttle_end - now; + env_->SleepForMicroseconds(remaining_wait); + new_end=now+remaining_wait+throttle; + + gPerfCounters->Add(ePerfThrottleWait, remaining_wait); + } // if + else + { + remaining_wait=0; + new_end=now + throttle; + } // else + + // throttle is per key write, how many in batch? + // (do not use batch count on internal db because of impact to AAE) + batch_count=(!options_.is_internal_db && NULL!=my_batch ? WriteBatchInternal::Count(my_batch) : 1); + if (0 < batch_count) // unclear if Count() could return zero + --batch_count; + batch_wait=throttle * batch_count; + + // only wait on batch if extends beyond potential wait period + if (now + remaining_wait < throttle_end + batch_wait) + { + remaining_wait=throttle_end + batch_wait - (now + remaining_wait); + env_->SleepForMicroseconds(remaining_wait); + new_end +=remaining_wait; + + gPerfCounters->Add(ePerfThrottleWait, remaining_wait); + } // if + + throttle_end=new_end; + } // if + + // throttle not needed, kill off old wait time + else if (0!=throttle_end) + { + throttle_end=0; + } // else if + return status; } // REQUIRES: Writer list must be non-empty // REQUIRES: First writer must have a non-NULL batch +// REQUIRES: mutex_ is held WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { + mutex_.AssertHeld(); assert(!writers_.empty()); Writer* first = writers_.front(); WriteBatch* result = first->batch; @@ -1299,7 +1963,7 @@ WriteBatch* 
DBImpl::BuildBatchGroup(Writer** last_writer) { break; } - // Append to *result + // Append to *reuslt if (result == first->batch) { // Switch to temporary batch instead of disturbing caller's batch result = tmp_batch_; @@ -1320,14 +1984,16 @@ Status DBImpl::MakeRoomForWrite(bool force) { assert(!writers_.empty()); bool allow_delay = !force; Status s; + while (true) { if (!bg_error_.ok()) { // Yield previous error + gPerfCounters->Inc(ePerfWriteError); s = bg_error_; break; } else if ( allow_delay && - versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) { + versions_->NumLevelFiles(0) >= (int)config::kL0_SlowdownWritesTrigger) { // We are getting close to hitting a hard limit on the number of // L0 files. Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each @@ -1335,42 +2001,59 @@ Status DBImpl::MakeRoomForWrite(bool force) { // this delay hands over some CPU to the compaction thread in // case it is sharing the same core as the writer. mutex_.Unlock(); +#if 0 // see if this impacts smoothing or helps (but keep the counts) + // (original Google code left for reference) env_->SleepForMicroseconds(1000); +#endif allow_delay = false; // Do not delay a single write more than once + gPerfCounters->Inc(ePerfWriteSleep); mutex_.Lock(); } else if (!force && (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { // There is room in current memtable + gPerfCounters->Inc(ePerfWriteNoWait); break; } else if (imm_ != NULL) { // We have filled up the current memtable, but the previous // one is still being compacted, so we wait. 
- Log(options_.info_log, "Current memtable full; waiting...\n"); - bg_cv_.Wait(); + Log(options_.info_log, "waiting 2...\n"); + gPerfCounters->Inc(ePerfWriteWaitImm); + MaybeScheduleCompaction(); + if (!shutting_down_.Acquire_Load()) + bg_cv_.Wait(); + Log(options_.info_log, "running 2...\n"); } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) { // There are too many level-0 files. - Log(options_.info_log, "Too many L0 files; waiting...\n"); - bg_cv_.Wait(); + Log(options_.info_log, "waiting...\n"); + gPerfCounters->Inc(ePerfWriteWaitLevel0); + MaybeScheduleCompaction(); + if (!shutting_down_.Acquire_Load()) + bg_cv_.Wait(); + Log(options_.info_log, "running...\n"); } else { // Attempt to switch to a new memtable and trigger compaction of old assert(versions_->PrevLogNumber() == 0); uint64_t new_log_number = versions_->NewFileNumber(); - WritableFile* lfile = NULL; - s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); + + gPerfCounters->Inc(ePerfWriteNewMem); + s = NewRecoveryLog(new_log_number); + if (!s.ok()) { // Avoid chewing through file number space in a tight loop. versions_->ReuseFileNumber(new_log_number); break; } - delete log_; - delete logfile_; - logfile_ = lfile; - logfile_number_ = new_log_number; - log_ = new log::Writer(lfile); + imm_ = mem_; - has_imm_.Release_Store(imm_); + has_imm_.Release_Store((MemTable*)imm_); + if (NULL!=imm_) + { + ThreadTask * task=new ImmWriteTask(this); + gImmThreads->Submit(task, true); + } mem_ = new MemTable(internal_comparator_); mem_->Ref(); + force = false; // Do not force another compaction if have room MaybeScheduleCompaction(); } @@ -1378,6 +2061,35 @@ Status DBImpl::MakeRoomForWrite(bool force) { return s; } + +// the following steps existed in two places, DB::Open() and +// DBImpl::MakeRoomForWrite(). This lead to a bug in Basho's +// tiered storage feature. Unifying the code. 
+Status DBImpl::NewRecoveryLog( + uint64_t NewLogNumber) +{ + mutex_.AssertHeld(); + Status s; + WritableFile * lfile(NULL); + + s = env_->NewWriteOnlyFile(LogFileName(dbname_, NewLogNumber), &lfile, + options_.env->RecoveryMmapSize(&options_)); + if (s.ok()) + { + // close any existing + delete log_; + delete logfile_; + + logfile_ = lfile; + logfile_number_ = NewLogNumber; + log_ = new log::Writer(lfile); + } // if + + return(s); + +} // DBImpl::NewRecoveryLog + + bool DBImpl::GetProperty(const Slice& property, std::string* value) { value->clear(); @@ -1391,11 +2103,11 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { in.remove_prefix(strlen("num-files-at-level")); uint64_t level; bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); - if (!ok || level >= config::kNumLevels) { + if (!ok || level >= (uint64_t)config::kNumLevels) { return false; } else { char buf[100]; - snprintf(buf, sizeof(buf), "%d", + snprintf(buf, sizeof(buf), "%zd", versions_->NumLevelFiles(static_cast(level))); *value = buf; return true; @@ -1427,19 +2139,34 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } else if (in == "sstables") { *value = versions_->current()->DebugString(); return true; - } else if (in == "approximate-memory-usage") { - size_t total_usage = options_.block_cache->TotalCharge(); - if (mem_) { - total_usage += mem_->ApproximateMemoryUsage(); - } - if (imm_) { - total_usage += imm_->ApproximateMemoryUsage(); - } + } else if (in == "total-bytes") { char buf[50]; - snprintf(buf, sizeof(buf), "%llu", - static_cast(total_usage)); + uint64_t total = 0; + for (int level = 0; level < config::kNumLevels; level++) { + total += versions_->NumLevelBytes(level); + } + snprintf(buf, sizeof(buf), "%" PRIu64, total); value->append(buf); return true; + } else if (in == "file-cache") { + char buf[50]; + snprintf(buf, sizeof(buf), "%zd", double_cache.GetCapacity(true)); + value->append(buf); + return true; + } else if (in == 
"block-cache") { + char buf[50]; + snprintf(buf, sizeof(buf), "%zd", double_cache.GetCapacity(false)); + value->append(buf); + return true; + } else if (-1!=gPerfCounters->LookupCounter(in.ToString().c_str())) { + + char buf[66]; + int index; + + index=gPerfCounters->LookupCounter(in.ToString().c_str()); + snprintf(buf, sizeof(buf), "%" PRIu64 , gPerfCounters->Value(index)); + value->append(buf); + return(true); } return false; @@ -1458,8 +2185,8 @@ void DBImpl::GetApproximateSizes( for (int i = 0; i < n; i++) { // Convert user_key into a corresponding internal key. - InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k1(range[i].start, 0, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range[i].limit, 0, kMaxSequenceNumber, kValueTypeForSeek); uint64_t start = versions_->ApproximateOffsetOf(v, k1); uint64_t limit = versions_->ApproximateOffsetOf(v, k2); sizes[i] = (limit >= start ? 
limit - start : 0); @@ -1473,15 +2200,21 @@ void DBImpl::GetApproximateSizes( // Default implementations of convenience methods that subclasses of DB // can call if they wish -Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { +Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value, + const KeyMetaData * meta) { WriteBatch batch; - batch.Put(key, value); + batch.Put(key, value, meta); return Write(opt, &batch); } Status DB::Delete(const WriteOptions& opt, const Slice& key) { WriteBatch batch; batch.Delete(key); + + // Negate the count to "ApiWrite" + gPerfCounters->Dec(ePerfApiWrite); + gPerfCounters->Inc(ePerfApiDelete); + return Write(opt, &batch); } @@ -1494,40 +2227,47 @@ Status DB::Open(const Options& options, const std::string& dbname, DBImpl* impl = new DBImpl(options, dbname); impl->mutex_.Lock(); VersionEdit edit; - // Recover handles create_if_missing, error_if_exists - bool save_manifest = false; - Status s = impl->Recover(&edit, &save_manifest); - if (s.ok() && impl->mem_ == NULL) { - // Create new log and a corresponding memtable. + Status s; + + // WARNING: only use impl and impl->options_ from this point. + // Things like tiered storage change the meanings + + // 4 level0 files at 2Mbytes and 2Mbytes of block cache + // (but first level1 file is likely to thrash) + // ... 
this value is AFTER write_buffer and 40M for recovery log and LOG + //if (!options.limited_developer_mem && impl->GetCacheCapacity() < flex::kMinimumDBMemory) + // s=Status::InvalidArgument("Less than 10Mbytes per database/vnode"); + + if (s.ok()) + s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists + + if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); - WritableFile* lfile; - s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), - &lfile); + + s = impl->NewRecoveryLog(new_log_number); + if (s.ok()) { edit.SetLogNumber(new_log_number); - impl->logfile_ = lfile; - impl->logfile_number_ = new_log_number; - impl->log_ = new log::Writer(lfile); - impl->mem_ = new MemTable(impl->internal_comparator_); - impl->mem_->Ref(); + s = impl->versions_->LogAndApply(&edit, &impl->mutex_); + } + if (s.ok()) { + impl->DeleteObsoleteFiles(); + impl->CheckCompactionState(); } } - if (s.ok() && save_manifest) { - edit.SetPrevLogNumber(0); // No older logs needed after recovery. 
- edit.SetLogNumber(impl->logfile_number_); - s = impl->versions_->LogAndApply(&edit, &impl->mutex_); - } - if (s.ok()) { - impl->DeleteObsoleteFiles(); - impl->MaybeScheduleCompaction(); - } + + if (impl->options_.cache_object_warming) + impl->table_cache_->PreloadTableCache(); + impl->mutex_.Unlock(); if (s.ok()) { - assert(impl->mem_ != NULL); *dbptr = impl; } else { delete impl; } + + gPerfCounters->Inc(ePerfApiOpen); + return s; } @@ -1537,22 +2277,50 @@ Snapshot::~Snapshot() { Status DestroyDB(const std::string& dbname, const Options& options) { Env* env = options.env; std::vector filenames; + Options options_tiered; + std::string dbname_tiered; + + options_tiered=options; + dbname_tiered=MakeTieredDbname(dbname, options_tiered); + // Ignore error in case directory does not exist - env->GetChildren(dbname, &filenames); + env->GetChildren(dbname_tiered, &filenames); if (filenames.empty()) { return Status::OK(); } FileLock* lock; - const std::string lockname = LockFileName(dbname); + const std::string lockname = LockFileName(dbname_tiered); Status result = env->LockFile(lockname, &lock); if (result.ok()) { uint64_t number; FileType type; + + // prune the table file directories + for (int level=0; levelGetChildren(dirname, &filenames); // Ignoring errors on purpose + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + Status del = env->DeleteFile(dirname + "/" + filenames[i]); + if (result.ok() && !del.ok()) { + result = del; + } // if + } // if + } // for + env->DeleteDir(dirname); + } // for + + filenames.clear(); + env->GetChildren(dbname_tiered, &filenames); for (size_t i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type) && type != kDBLockFile) { // Lock file will be deleted at end - Status del = env->DeleteFile(dbname + "/" + filenames[i]); + Status del = env->DeleteFile(dbname_tiered + "/" + filenames[i]); if (result.ok() && !del.ok()) { result = del; } @@ -1560,9 +2328,89 
@@ Status DestroyDB(const std::string& dbname, const Options& options) { } env->UnlockFile(lock); // Ignore error since state is already gone env->DeleteFile(lockname); - env->DeleteDir(dbname); // Ignore error in case dir contains other files + env->DeleteDir(options.tiered_fast_prefix); // Ignore error in case dir contains other files + env->DeleteDir(options.tiered_slow_prefix); // Ignore error in case dir contains other files } return result; } + +Status DB::VerifyLevels() {return(Status::InvalidArgument("is_repair not set in Options before database opened"));}; + +// Riak specific repair +Status +DBImpl::VerifyLevels() +{ + Status result; + + // did they remember to open the db with flag set in options + if (options_.is_repair) + { + InternalKey begin, end; + bool overlap_found; + int level; + Version * ver; + + overlap_found=false; + level=0; + + do + { + // get a copy of current version + { + MutexLock l(&mutex_); + ver = versions_->current(); + ver->Ref(); + } + + // level is input and output (acts as cursor to progress) + // begin and end are outputs of function + overlap_found=ver->VerifyLevels(level, begin, end); + ver->Unref(); + + if (overlap_found) + { + Slice s_begin, s_end; + + s_begin=begin.user_key(); + s_end=end.user_key(); + TEST_CompactRange(level, &s_begin, &s_end); + } // if + + } while(overlap_found); + + } // if + else + { + result=Status::InvalidArgument("is_repair not set in Options before database opened"); + } // else + + return(result); + +} // VerifyLevels + +void DB::CheckAvailableCompactions() {return;}; + +// Used internally for inter-database notification +// of potential grooming timeslot availability. 
+void +DBImpl::CheckAvailableCompactions() +{ + MutexLock l(&mutex_); + MaybeScheduleCompaction(); + + return; +} // CheckAvailableCompactions + + +bool +DBImpl::IsCompactionScheduled() +{ + mutex_.AssertHeld(); + bool flag(false); + for (int level=0; level< config::kNumLevels && !flag; ++level) + flag=versions_->IsCompactionSubmitted(level); + return(flag || NULL!=imm_ || hotbackup_pending_); +} // DBImpl::IsCompactionScheduled + } // namespace leveldb diff --git a/src/leveldb/db/db_impl.h b/src/leveldb/db/db_impl.h index 8ff323e72..5e3976a31 100644 --- a/src/leveldb/db/db_impl.h +++ b/src/leveldb/db/db_impl.h @@ -13,7 +13,7 @@ #include "leveldb/db.h" #include "leveldb/env.h" #include "port/port.h" -#include "port/thread_annotations.h" +#include "util/cache2.h" namespace leveldb { @@ -29,26 +29,37 @@ class DBImpl : public DB { virtual ~DBImpl(); // Implementations of the DB interface - virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); + virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value, const KeyMetaData * meta=NULL); virtual Status Delete(const WriteOptions&, const Slice& key); virtual Status Write(const WriteOptions& options, WriteBatch* updates); virtual Status Get(const ReadOptions& options, const Slice& key, - std::string* value); + std::string* value, + KeyMetaData * meta=NULL); + virtual Status Get(const ReadOptions& options, + const Slice& key, + Value* value, + KeyMetaData * meta=NULL); virtual Iterator* NewIterator(const ReadOptions&); virtual const Snapshot* GetSnapshot(); virtual void ReleaseSnapshot(const Snapshot* snapshot); virtual bool GetProperty(const Slice& property, std::string* value); virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); virtual void CompactRange(const Slice* begin, const Slice* end); + virtual Status VerifyLevels(); + virtual void CheckAvailableCompactions(); + virtual Logger* GetLogger() const { return options_.info_log; } // Extra methods (for 
testing) that are not in the public DB interface + const Options & GetOptions() const { return options_; }; + // Compact any files in the named level that overlap [*begin,*end] void TEST_CompactRange(int level, const Slice* begin, const Slice* end); - // Force current memtable contents to be compacted. - Status TEST_CompactMemTable(); + // Force current memtable contents to be compacted, waits for completion + Status CompactMemTableSynchronous(); + Status TEST_CompactMemTable(); // wraps CompactMemTableSynchronous (historical) // Return an internal iterator over the current state of the database. // The keys of this iterator are internal keys (see format.h). @@ -59,64 +70,82 @@ class DBImpl : public DB { // file at a level >= 1. int64_t TEST_MaxNextLevelOverlappingBytes(); - // Record a sample of bytes read at the specified internal key. - // Samples are taken approximately once every config::kReadBytesPeriod - // bytes. - void RecordReadSample(Slice key); + // These are routines that DBListImpl calls across all open databases + void ResizeCaches() {double_cache.ResizeCaches();}; + size_t GetCacheCapacity() {return(double_cache.GetCapacity(false));} + void PurgeExpiredFileCache() {double_cache.PurgeExpiredFiles();}; - private: + // in util/hot_backup.cc + void HotBackup(); + bool PurgeWriteBuffer(); + bool WriteBackupManifest(); + bool CreateBackupLinks(Version * Version, Options & BackupOptions); + bool CopyLOGSegment(long FileEnd); + void HotBackupComplete(); + + void BackgroundCall2(Compaction * Compact); + void BackgroundImmCompactCall(); + bool IsCompactionScheduled(); + uint32_t RunningCompactionCount() {mutex_.AssertHeld(); return(running_compactions_);}; + + protected: friend class DB; struct CompactionState; struct Writer; Iterator* NewInternalIterator(const ReadOptions&, - SequenceNumber* latest_snapshot, - uint32_t* seed); + SequenceNumber* latest_snapshot); Status NewDB(); // Recover the descriptor from persistent storage. 
May do a significant // amount of work to recover recently logged updates. Any changes to // be made to the descriptor are added to *edit. - Status Recover(VersionEdit* edit, bool* save_manifest) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); + Status Recover(VersionEdit* edit); + + // Riak routine: pause DB::Open if too many compactions + // stacked up immediately. Happens in some repairs and + // some Riak upgrades + void CheckCompactionState(); void MaybeIgnoreError(Status* s) const; // Delete any unneeded files and stale in-memory entries. void DeleteObsoleteFiles(); + void KeepOrDelete(const std::string & Filename, int level, const std::set & Live); // Compact the in-memory write buffer to disk. Switches to a new // log-file/memtable and writes a new descriptor iff successful. - // Errors are recorded in bg_error_. - void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_); + Status CompactMemTable(); - Status RecoverLogFile(uint64_t log_number, bool last_log, bool* save_manifest, - VersionEdit* edit, SequenceNumber* max_sequence) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); + Status RecoverLogFile(uint64_t log_number, + VersionEdit* edit, + SequenceNumber* max_sequence); - Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); + Status WriteLevel0Table(volatile MemTable* mem, VersionEdit* edit, Version* base); + + Status MakeRoomForWrite(bool force /* TRUE forces memtable rotation to disk (for testing) */); + Status NewRecoveryLog(uint64_t NewLogNumber); - Status MakeRoomForWrite(bool force /* compact even if there is room? 
*/) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); WriteBatch* BuildBatchGroup(Writer** last_writer); - void RecordBackgroundError(const Status& s); + void MaybeScheduleCompaction(); - void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_); - static void BGWork(void* db); - void BackgroundCall(); - void BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_); - void CleanupCompaction(CompactionState* compact) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - Status DoCompactionWork(CompactionState* compact) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); + Status BackgroundCompaction(Compaction * Compact=NULL); + Status BackgroundExpiry(Compaction * Compact=NULL); - Status OpenCompactionOutputFile(CompactionState* compact); + void CleanupCompaction(CompactionState* compact); + Status DoCompactionWork(CompactionState* compact); + int64_t PrioritizeWork(bool IsLevel0); + + Status OpenCompactionOutputFile(CompactionState* compact, size_t sample_value_size); + bool Send2PageCache(CompactionState * compact); + size_t MaybeRaiseBlockSize(Compaction & CompactionStuff, size_t SampleValueSize); Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); - Status InstallCompactionResults(CompactionState* compact) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); + Status InstallCompactionResults(CompactionState* compact); + + // initialized before options so its block_cache is available + class DoubleCache double_cache; // Constant after construction Env* const env_; @@ -130,20 +159,22 @@ class DBImpl : public DB { // table_cache_ provides its own synchronization TableCache* table_cache_; + // Lock over the persistent DB state. Non-NULL iff successfully acquired. 
FileLock* db_lock_; // State below is protected by mutex_ port::Mutex mutex_; + port::Mutex throttle_mutex_; // used by write throttle to force sequential waits on callers port::AtomicPointer shutting_down_; + port::CondVar bg_cv_; // Signalled when background work finishes MemTable* mem_; - MemTable* imm_; // Memtable being compacted + volatile MemTable* imm_; // Memtable being compacted port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_ WritableFile* logfile_; uint64_t logfile_number_; log::Writer* log_; - uint32_t seed_; // For sampling. // Queue of writers. std::deque writers_; @@ -155,9 +186,6 @@ class DBImpl : public DB { // part of ongoing compactions. std::set pending_outputs_; - // Has a background compaction been scheduled or is running? - bool bg_compaction_scheduled_; - // Information for a manual compaction struct ManualCompaction { int level; @@ -166,7 +194,7 @@ class DBImpl : public DB { const InternalKey* end; // NULL means end of key range InternalKey tmp_storage; // Used to keep track of compaction progress }; - ManualCompaction* manual_compaction_; + volatile ManualCompaction* manual_compaction_; VersionSet* versions_; @@ -190,6 +218,18 @@ class DBImpl : public DB { }; CompactionStats stats_[config::kNumLevels]; + volatile uint64_t throttle_end; + volatile uint32_t running_compactions_; + volatile size_t current_block_size_; // last dynamic block size computed + volatile uint64_t block_size_changed_; // NowMicros() when block size computed + volatile uint64_t last_low_mem_; // NowMicros() when low memory last seen + + // accessor to new, dynamic block_cache + Cache * block_cache() {return(double_cache.GetBlockCache());}; + Cache * file_cache() {return(double_cache.GetFileCache());}; + + volatile bool hotbackup_pending_; + // No copying allowed DBImpl(const DBImpl&); void operator=(const DBImpl&); @@ -204,7 +244,8 @@ class DBImpl : public DB { extern Options SanitizeOptions(const std::string& db, const InternalKeyComparator* 
icmp, const InternalFilterPolicy* ipolicy, - const Options& src); + const Options& src, + Cache * block_cache); } // namespace leveldb diff --git a/src/leveldb/db/db_iter.cc b/src/leveldb/db/db_iter.cc index 3b2035e9e..3ef3b2b2e 100644 --- a/src/leveldb/db/db_iter.cc +++ b/src/leveldb/db/db_iter.cc @@ -5,14 +5,14 @@ #include "db/db_iter.h" #include "db/filename.h" -#include "db/db_impl.h" #include "db/dbformat.h" #include "leveldb/env.h" +#include "leveldb/expiry.h" #include "leveldb/iterator.h" +#include "leveldb/perf_count.h" #include "port/port.h" #include "util/logging.h" #include "util/mutexlock.h" -#include "util/random.h" namespace leveldb { @@ -48,18 +48,20 @@ class DBIter: public Iterator { kReverse }; - DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s, - uint32_t seed) - : db_(db), + DBIter(const std::string* dbname, Env* env, + const Comparator* cmp, Iterator* iter, SequenceNumber s, + const ExpiryModule * expiry) + : dbname_(dbname), + env_(env), user_comparator_(cmp), iter_(iter), sequence_(s), direction_(kForward), valid_(false), - rnd_(seed), - bytes_counter_(RandomPeriod()) { + expiry_(expiry) { } virtual ~DBIter() { + gPerfCounters->Inc(ePerfIterDelete); delete iter_; } virtual bool Valid() const { return valid_; } @@ -71,6 +73,26 @@ class DBIter: public Iterator { assert(valid_); return (direction_ == kForward) ? iter_->value() : saved_value_; } + // Riak specific: if a database iterator, returns key meta data + // REQUIRES: Valid() and forward iteration + // (reverse iteration is possible, just needs code) + virtual KeyMetaData & keymetadata() const + { + assert(valid_ && kForward==direction_); + if (kForward==direction_) + { + ParsedInternalKey parsed; + // this initialization clears a warning. ParsedInternalKey says + // it is not initializing for performance reasons ... 
oh well + parsed.type=kTypeValue; parsed.sequence=0; parsed.expiry=0; + ParseInternalKey(iter_->key(), &parsed); + keymetadata_.m_Type=parsed.type; + keymetadata_.m_Sequence=parsed.sequence; + keymetadata_.m_Expiry=parsed.expiry; + } + return(keymetadata_); + } + virtual Status status() const { if (status_.ok()) { return iter_->status(); @@ -103,12 +125,8 @@ class DBIter: public Iterator { } } - // Pick next gap with average value of config::kReadBytesPeriod. - ssize_t RandomPeriod() { - return rnd_.Uniform(2*config::kReadBytesPeriod); - } - - DBImpl* db_; + const std::string* const dbname_; + Env* const env_; const Comparator* const user_comparator_; Iterator* const iter_; SequenceNumber const sequence_; @@ -118,9 +136,7 @@ class DBIter: public Iterator { std::string saved_value_; // == current raw value when direction_==kReverse Direction direction_; bool valid_; - - Random rnd_; - ssize_t bytes_counter_; + const ExpiryModule * expiry_; // No copying allowed DBIter(const DBIter&); @@ -128,14 +144,7 @@ class DBIter: public Iterator { }; inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { - Slice k = iter_->key(); - ssize_t n = k.size() + iter_->value().size(); - bytes_counter_ -= n; - while (bytes_counter_ < 0) { - bytes_counter_ += RandomPeriod(); - db_->RecordReadSample(k); - } - if (!ParseInternalKey(k, ikey)) { + if (!ParseInternalKey(iter_->key(), ikey)) { status_ = Status::Corruption("corrupted internal key in DBIter"); return false; } else { @@ -146,6 +155,7 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { void DBIter::Next() { assert(valid_); + gPerfCounters->Inc(ePerfIterNext); if (direction_ == kReverse) { // Switch directions? direction_ = kForward; // iter_ is pointing just before the entries for this->key(), @@ -161,13 +171,12 @@ void DBIter::Next() { saved_key_.clear(); return; } - // saved_key_ already contains the key to skip past. - } else { - // Store in saved_key_ the current key so we skip it below. 
- SaveKey(ExtractUserKey(iter_->key()), &saved_key_); } - FindNextUserEntry(true, &saved_key_); + // Temporarily use saved_key_ as storage for key to skip. + std::string* skip = &saved_key_; + SaveKey(ExtractUserKey(iter_->key()), skip); + FindNextUserEntry(true, skip); } void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { @@ -177,6 +186,9 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { do { ParsedInternalKey ikey; if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if (IsExpiryKey(ikey.type) && NULL!=expiry_ + && expiry_->KeyRetirementCallback(ikey)) + ikey.type=kTypeDeletion; switch (ikey.type) { case kTypeDeletion: // Arrange to skip all upcoming entries for this key since @@ -184,6 +196,9 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { SaveKey(ikey.user_key, skip); skipping = true; break; + + case kTypeValueWriteTime: + case kTypeValueExplicitExpiry: case kTypeValue: if (skipping && user_comparator_->Compare(ikey.user_key, *skip) <= 0) { @@ -205,6 +220,7 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { void DBIter::Prev() { assert(valid_); + gPerfCounters->Inc(ePerfIterPrev); if (direction_ == kForward) { // Switch directions? // iter_ is pointing at the current entry. Scan backwards until // the key changes so we can use the normal reverse scanning code. 
@@ -242,6 +258,10 @@ void DBIter::FindPrevUserEntry() { // We encountered a non-deleted value in entries for previous keys, break; } + if (IsExpiryKey(ikey.type) && NULL!=expiry_ + && expiry_->KeyRetirementCallback(ikey)) + ikey.type=kTypeDeletion; + value_type = ikey.type; if (value_type == kTypeDeletion) { saved_key_.clear(); @@ -272,11 +292,12 @@ void DBIter::FindPrevUserEntry() { } void DBIter::Seek(const Slice& target) { + gPerfCounters->Inc(ePerfIterSeek); direction_ = kForward; ClearSavedValue(); saved_key_.clear(); AppendInternalKey( - &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); + &saved_key_, ParsedInternalKey(target, 0, sequence_, kValueTypeForSeek)); iter_->Seek(saved_key_); if (iter_->Valid()) { FindNextUserEntry(false, &saved_key_ /* temporary storage */); @@ -286,6 +307,7 @@ void DBIter::Seek(const Slice& target) { } void DBIter::SeekToFirst() { + gPerfCounters->Inc(ePerfIterSeekFirst); direction_ = kForward; ClearSavedValue(); iter_->SeekToFirst(); @@ -297,6 +319,7 @@ void DBIter::SeekToFirst() { } void DBIter::SeekToLast() { + gPerfCounters->Inc(ePerfIterSeekLast); direction_ = kReverse; ClearSavedValue(); iter_->SeekToLast(); @@ -306,12 +329,13 @@ void DBIter::SeekToLast() { } // anonymous namespace Iterator* NewDBIterator( - DBImpl* db, + const std::string* dbname, + Env* env, const Comparator* user_key_comparator, Iterator* internal_iter, - SequenceNumber sequence, - uint32_t seed) { - return new DBIter(db, user_key_comparator, internal_iter, sequence, seed); + const SequenceNumber& sequence, + const ExpiryModule * expiry) { + return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence, expiry); } } // namespace leveldb diff --git a/src/leveldb/db/db_iter.h b/src/leveldb/db/db_iter.h index 04927e937..c3f40469f 100644 --- a/src/leveldb/db/db_iter.h +++ b/src/leveldb/db/db_iter.h @@ -7,21 +7,21 @@ #include #include "leveldb/db.h" +#include "leveldb/expiry.h" #include "db/dbformat.h" namespace leveldb { 
-class DBImpl; - // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified "sequence" number // into appropriate user keys. extern Iterator* NewDBIterator( - DBImpl* db, + const std::string* dbname, + Env* env, const Comparator* user_key_comparator, Iterator* internal_iter, - SequenceNumber sequence, - uint32_t seed); + const SequenceNumber& sequence, + const ExpiryModule * expiry=NULL); } // namespace leveldb diff --git a/src/leveldb/db/db_test.cc b/src/leveldb/db/db_test.cc index a0b08bc19..0916673b4 100644 --- a/src/leveldb/db/db_test.cc +++ b/src/leveldb/db/db_test.cc @@ -33,11 +33,8 @@ class AtomicCounter { public: AtomicCounter() : count_(0) { } void Increment() { - IncrementBy(1); - } - void IncrementBy(int count) { MutexLock l(&mu_); - count_ += count; + count_++; } int Read() { MutexLock l(&mu_); @@ -48,20 +45,13 @@ class AtomicCounter { count_ = 0; } }; - -void DelayMilliseconds(int millis) { - Env::Default()->SleepForMicroseconds(millis * 1000); -} } // Special Env used to delay background operations class SpecialEnv : public EnvWrapper { public: - // sstable/log Sync() calls are blocked while this pointer is non-NULL. - port::AtomicPointer delay_data_sync_; - - // sstable/log Sync() calls return an error. - port::AtomicPointer data_sync_error_; + // sstable Sync() calls are blocked while this pointer is non-NULL. + port::AtomicPointer delay_sstable_sync_; // Simulate no-space errors while this pointer is non-NULL. 
port::AtomicPointer no_space_; @@ -69,37 +59,30 @@ class SpecialEnv : public EnvWrapper { // Simulate non-writable file system while this pointer is non-NULL port::AtomicPointer non_writable_; - // Force sync of manifest files to fail while this pointer is non-NULL - port::AtomicPointer manifest_sync_error_; - - // Force write to manifest files to fail while this pointer is non-NULL - port::AtomicPointer manifest_write_error_; - bool count_random_reads_; AtomicCounter random_read_counter_; + AtomicCounter sleep_counter_; + explicit SpecialEnv(Env* base) : EnvWrapper(base) { - delay_data_sync_.Release_Store(NULL); - data_sync_error_.Release_Store(NULL); + delay_sstable_sync_.Release_Store(NULL); no_space_.Release_Store(NULL); non_writable_.Release_Store(NULL); count_random_reads_ = false; - manifest_sync_error_.Release_Store(NULL); - manifest_write_error_.Release_Store(NULL); } - Status NewWritableFile(const std::string& f, WritableFile** r) { - class DataFile : public WritableFile { + Status NewWritableFile(const std::string& f, WritableFile** r, size_t map_size) { + class SSTableFile : public WritableFile { private: SpecialEnv* env_; WritableFile* base_; public: - DataFile(SpecialEnv* env, WritableFile* base) + SSTableFile(SpecialEnv* env, WritableFile* base) : env_(env), base_(base) { } - ~DataFile() { delete base_; } + ~SSTableFile() { delete base_; } Status Append(const Slice& data) { if (env_->no_space_.Acquire_Load() != NULL) { // Drop writes on the floor @@ -111,51 +94,21 @@ class SpecialEnv : public EnvWrapper { Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { - if (env_->data_sync_error_.Acquire_Load() != NULL) { - return Status::IOError("simulated data sync error"); - } - while (env_->delay_data_sync_.Acquire_Load() != NULL) { - DelayMilliseconds(100); + while (env_->delay_sstable_sync_.Acquire_Load() != NULL) { + env_->SleepForMicroseconds(100000); } return base_->Sync(); } }; - class ManifestFile : 
public WritableFile { - private: - SpecialEnv* env_; - WritableFile* base_; - public: - ManifestFile(SpecialEnv* env, WritableFile* b) : env_(env), base_(b) { } - ~ManifestFile() { delete base_; } - Status Append(const Slice& data) { - if (env_->manifest_write_error_.Acquire_Load() != NULL) { - return Status::IOError("simulated writer error"); - } else { - return base_->Append(data); - } - } - Status Close() { return base_->Close(); } - Status Flush() { return base_->Flush(); } - Status Sync() { - if (env_->manifest_sync_error_.Acquire_Load() != NULL) { - return Status::IOError("simulated sync error"); - } else { - return base_->Sync(); - } - } - }; if (non_writable_.Acquire_Load() != NULL) { return Status::IOError("simulated write error"); } - Status s = target()->NewWritableFile(f, r); + Status s = target()->NewWritableFile(f, r, 2<<20); if (s.ok()) { - if (strstr(f.c_str(), ".ldb") != NULL || - strstr(f.c_str(), ".log") != NULL) { - *r = new DataFile(this, *r); - } else if (strstr(f.c_str(), "MANIFEST") != NULL) { - *r = new ManifestFile(this, *r); + if (strstr(f.c_str(), ".sst") != NULL) { + *r = new SSTableFile(this, *r); } } return s; @@ -184,6 +137,11 @@ class SpecialEnv : public EnvWrapper { } return s; } + + virtual void SleepForMicroseconds(int micros) { + sleep_counter_.Increment(); + target()->SleepForMicroseconds(micros); + } }; class DBTest { @@ -193,7 +151,6 @@ class DBTest { // Sequence of option configurations to try enum OptionConfig { kDefault, - kReuse, kFilter, kUncompressed, kEnd @@ -209,7 +166,7 @@ class DBTest { DBTest() : option_config_(kDefault), env_(new SpecialEnv(Env::Default())) { - filter_policy_ = NewBloomFilterPolicy(10); + filter_policy_ = NewBloomFilterPolicy2(16); dbname_ = test::TmpDir() + "/db_test"; DestroyDB(dbname_, Options()); db_ = NULL; @@ -238,11 +195,7 @@ class DBTest { // Return the current option configuration. 
Options CurrentOptions() { Options options; - options.reuse_logs = false; switch (option_config_) { - case kReuse: - options.reuse_logs = true; - break; case kFilter: options.filter_policy = filter_policy_; break; @@ -290,6 +243,23 @@ class DBTest { return DB::Open(opts, dbname_, &db_); } + Status DoubleOpen(Options* options = NULL) { + DB * db_fail; + delete db_; + db_ = NULL; + Options opts, opts2; + if (options != NULL) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + DB::Open(opts, dbname_, &db_); + return DB::Open(opts2, dbname_, &db_fail); + } + Status Put(const std::string& k, const std::string& v) { return db_->Put(WriteOptions(), k, v); } @@ -311,6 +281,20 @@ class DBTest { return result; } + std::string GetNoCache(const std::string& k, const Snapshot* snapshot = NULL) { + ReadOptions options; + options.snapshot = snapshot; + options.fill_cache=false; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + // Return a string that contains all key,value pairs in order, // formatted like "(k1->v1)(k2->v2)". 
std::string Contents() { @@ -326,7 +310,7 @@ class DBTest { } // Check reverse iteration results are the reverse of forward results - size_t matched = 0; + int matched = 0; for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { ASSERT_LT(matched, forward.size()); ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]); @@ -340,7 +324,7 @@ class DBTest { std::string AllEntriesFor(const Slice& user_key) { Iterator* iter = dbfull()->TEST_NewInternalIterator(); - InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); + InternalKey target(user_key, 0, kMaxSequenceNumber, kTypeValue); iter->Seek(target.Encode()); std::string result; if (!iter->status().ok()) { @@ -361,6 +345,8 @@ class DBTest { } first = false; switch (ikey.type) { + case kTypeValueWriteTime: + case kTypeValueExplicitExpiry: case kTypeValue: result += iter->value().ToString(); break; @@ -474,38 +460,6 @@ class DBTest { } return result; } - - bool DeleteAnSSTFile() { - std::vector filenames; - ASSERT_OK(env_->GetChildren(dbname_, &filenames)); - uint64_t number; - FileType type; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) { - ASSERT_OK(env_->DeleteFile(TableFileName(dbname_, number))); - return true; - } - } - return false; - } - - // Returns number of files renamed. 
- int RenameLDBToSST() { - std::vector filenames; - ASSERT_OK(env_->GetChildren(dbname_, &filenames)); - uint64_t number; - FileType type; - int files_renamed = 0; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) { - const std::string from = TableFileName(dbname_, number); - const std::string to = SSTTableFileName(dbname_, number); - ASSERT_OK(env_->RenameFile(from, to)); - files_renamed++; - } - } - return files_renamed; - } }; TEST(DBTest, Empty) { @@ -515,6 +469,11 @@ TEST(DBTest, Empty) { } while (ChangeOptions()); } +TEST(DBTest, DoubleOpen) +{ + ASSERT_NOTOK(DoubleOpen()); +} + TEST(DBTest, ReadWrite) { do { ASSERT_OK(Put("foo", "v1")); @@ -547,11 +506,11 @@ TEST(DBTest, GetFromImmutableLayer) { ASSERT_OK(Put("foo", "v1")); ASSERT_EQ("v1", Get("foo")); - env_->delay_data_sync_.Release_Store(env_); // Block sync calls + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls Put("k1", std::string(100000, 'x')); // Fill memtable Put("k2", std::string(100000, 'y')); // Trigger compaction ASSERT_EQ("v1", Get("foo")); - env_->delay_data_sync_.Release_Store(NULL); // Release sync calls + env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls } while (ChangeOptions()); } @@ -563,17 +522,6 @@ TEST(DBTest, GetFromVersions) { } while (ChangeOptions()); } -TEST(DBTest, GetMemUsage) { - do { - ASSERT_OK(Put("foo", "v1")); - std::string val; - ASSERT_TRUE(db_->GetProperty("leveldb.approximate-memory-usage", &val)); - int mem_usage = atoi(val.c_str()); - ASSERT_GT(mem_usage, 0); - ASSERT_LT(mem_usage, 5*1024*1024); - } while (ChangeOptions()); -} - TEST(DBTest, GetSnapshot) { do { // Try with both a short key and a long key @@ -634,6 +582,9 @@ TEST(DBTest, GetPicksCorrectFile) { } while (ChangeOptions()); } +#if 0 +// riak does not execute compaction due to reads + TEST(DBTest, GetEncountersEmptyLevel) { do { // Arrange for the following to happen: @@ -642,7 +593,7 @@ 
TEST(DBTest, GetEncountersEmptyLevel) { // * sstable B in level 2 // Then do enough Get() calls to arrange for an automatic compaction // of sstable A. A bug would cause the compaction to be marked as - // occurring at level 1 (instead of the correct level 0). + // occuring at level 1 (instead of the correct level 0). // Step 1: First place sstables in levels 0 and 2 int compaction_count = 0; @@ -667,11 +618,12 @@ TEST(DBTest, GetEncountersEmptyLevel) { } // Step 4: Wait for compaction to finish - DelayMilliseconds(1000); + env_->SleepForMicroseconds(1000000); ASSERT_EQ(NumTableFilesAtLevel(0), 0); } while (ChangeOptions()); } +#endif TEST(DBTest, IterEmpty) { Iterator* iter = db_->NewIterator(ReadOptions()); @@ -996,7 +948,8 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { dbfull()->TEST_CompactRange(0, NULL, NULL); ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 1); +// not riak ASSERT_GT(NumTableFilesAtLevel(1), 1); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); // yes riak for (int i = 0; i < 80; i++) { ASSERT_EQ(Get(Key(i)), values[i]); } @@ -1010,7 +963,8 @@ TEST(DBTest, RepeatedWritesToSameKey) { // We must have at most one file per level except for level-0, // which may have up to kL0_StopWritesTrigger files. - const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger; + // ... basho adds *2 since level-1 is now overlapped too + const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger*2; Random rnd(301); std::string value = RandomString(&rnd, 2 * options.write_buffer_size); @@ -1054,11 +1008,13 @@ TEST(DBTest, SparseMerge) { // Compactions should not cause us to create a situation where // a file overlaps too much data at the next level. - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + // 07/10/14 matthewv - we overlap first two levels. 
sparse test not appropriate there, + // and we set overlaps into 100s of megabytes as "normal" +// ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); dbfull()->TEST_CompactRange(0, NULL, NULL); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); +// ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); dbfull()->TEST_CompactRange(1, NULL, NULL); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); +// ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); } static bool Between(uint64_t val, uint64_t low, uint64_t high) { @@ -1096,14 +1052,6 @@ TEST(DBTest, ApproximateSizes) { // 0 because GetApproximateSizes() does not account for memtable space ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); - if (options.reuse_logs) { - // Recovery will reuse memtable, and GetApproximateSizes() does not - // account for memtable usage; - Reopen(&options); - ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); - continue; - } - // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { Reopen(&options); @@ -1147,11 +1095,6 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); - if (options.reuse_logs) { - // Need to force a memtable compaction since recovery does not do so. 
- ASSERT_OK(dbfull()->TEST_CompactMemTable()); - } - // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { Reopen(&options); @@ -1223,7 +1166,7 @@ TEST(DBTest, Snapshot) { ASSERT_EQ("v4", Get("foo")); } while (ChangeOptions()); } - +#if 0 // trouble under Riak due to assumed file sizes TEST(DBTest, HiddenValuesAreRemoved) { do { Random rnd(301); @@ -1254,7 +1197,7 @@ TEST(DBTest, HiddenValuesAreRemoved) { ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); } while (ChangeOptions()); } - +#endif TEST(DBTest, DeletionMarkers1) { Put("foo", "v1"); ASSERT_OK(dbfull()->TEST_CompactMemTable()); @@ -1271,13 +1214,14 @@ TEST(DBTest, DeletionMarkers1) { Delete("foo"); Put("foo", "v2"); ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_CompactMemTable()); // stays at level 0 + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // riak 1.3, DEL merged out by BuildTable Slice z("z"); - dbfull()->TEST_CompactRange(last-2, NULL, &z); + dbfull()->TEST_CompactRange(0, NULL, &z); + dbfull()->TEST_CompactRange(1, NULL, &z); // DEL eliminated, but v1 remains because we aren't compacting that level // (DEL can be eliminated because v2 hides v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // Riak 1.4 has merged to level 1 dbfull()->TEST_CompactRange(last-1, NULL, NULL); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). 
@@ -1289,39 +1233,47 @@ TEST(DBTest, DeletionMarkers2) { ASSERT_OK(dbfull()->TEST_CompactMemTable()); const int last = config::kMaxMemCompactLevel; ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level + dbfull()->TEST_CompactRange(0, NULL, NULL); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level + ASSERT_EQ(NumTableFilesAtLevel(last-1), 0); // Place a table at level last-1 to prevent merging with preceding mutation Put("a", "begin"); Put("z", "end"); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(last), 1); + dbfull()->TEST_CompactMemTable(); // goes to last-1 ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); Delete("foo"); ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 + ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level 0 ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last-2, NULL, NULL); + dbfull()->TEST_CompactRange(0, NULL, NULL); // Riak overlaps level 1 // DEL kept: "last" file overlaps ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last-1, NULL, NULL); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). + dbfull()->TEST_CompactRange(1, NULL, NULL); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + + dbfull()->TEST_CompactRange(2, NULL, NULL); ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); } TEST(DBTest, OverlapInLevel0) { do { - ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config"; + ASSERT_EQ(config::kMaxMemCompactLevel, 3) << "Fix test to match config"; // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. 
ASSERT_OK(Put("100", "v100")); ASSERT_OK(Put("999", "v999")); dbfull()->TEST_CompactMemTable(); + dbfull()->TEST_CompactRange(0, NULL, NULL); + dbfull()->TEST_CompactRange(1, NULL, NULL); ASSERT_OK(Delete("100")); ASSERT_OK(Delete("999")); dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("0,1,1", FilesPerLevel()); + dbfull()->TEST_CompactRange(0, NULL, NULL); + ASSERT_EQ("0,0,1,1", FilesPerLevel()); // Make files spanning the following ranges in level-0: // files[0] 200 .. 900 @@ -1334,7 +1286,7 @@ TEST(DBTest, OverlapInLevel0) { ASSERT_OK(Put("600", "v600")); ASSERT_OK(Put("900", "v900")); dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("2,1,1", FilesPerLevel()); + ASSERT_EQ("2,0,1,1", FilesPerLevel()); // Compact away the placeholder files we created initially dbfull()->TEST_CompactRange(1, NULL, NULL); @@ -1364,7 +1316,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_a) { Reopen(); Reopen(); ASSERT_EQ("(a->v)", Contents()); - DelayMilliseconds(1000); // Wait for compaction to finish + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish ASSERT_EQ("(a->v)", Contents()); } @@ -1380,7 +1332,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) { Put("",""); Reopen(); Put("",""); - DelayMilliseconds(1000); // Wait for compaction to finish + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish Reopen(); Put("d","dv"); Reopen(); @@ -1390,7 +1342,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) { Delete("b"); Reopen(); ASSERT_EQ("(->)(c->cv)", Contents()); - DelayMilliseconds(1000); // Wait for compaction to finish + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish ASSERT_EQ("(->)(c->cv)", Contents()); } @@ -1473,37 +1425,37 @@ TEST(DBTest, CustomComparator) { } TEST(DBTest, ManualCompaction) { - ASSERT_EQ(config::kMaxMemCompactLevel, 2) + ASSERT_EQ(config::kMaxMemCompactLevel, 3) << "Need to update this test to match kMaxMemCompactLevel"; MakeTables(3, "p", "q"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + ASSERT_EQ("1,0,1,1", 
FilesPerLevel()); // Compaction range falls before files Compact("", "c"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + ASSERT_EQ("0,1,1,1", FilesPerLevel()); // Compaction range falls after files Compact("r", "z"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + ASSERT_EQ("0,1,1,1", FilesPerLevel()); // Compaction range overlaps files Compact("p1", "p9"); - ASSERT_EQ("0,0,1", FilesPerLevel()); + ASSERT_EQ("0,0,0,1", FilesPerLevel()); // Populate a different range MakeTables(3, "c", "e"); - ASSERT_EQ("1,1,2", FilesPerLevel()); + ASSERT_EQ("1,0,1,2", FilesPerLevel()); // Compact just the new range Compact("b", "f"); - ASSERT_EQ("0,0,2", FilesPerLevel()); + ASSERT_EQ("0,0,0,2", FilesPerLevel()); // Compact all MakeTables(1, "a", "z"); - ASSERT_EQ("0,1,2", FilesPerLevel()); + ASSERT_EQ("0,0,1,2", FilesPerLevel()); db_->CompactRange(NULL, NULL); - ASSERT_EQ("0,0,1", FilesPerLevel()); + ASSERT_EQ("0,0,0,1", FilesPerLevel()); } TEST(DBTest, DBOpen_Options) { @@ -1545,12 +1497,6 @@ TEST(DBTest, DBOpen_Options) { db = NULL; } -TEST(DBTest, Locking) { - DB* db2 = NULL; - Status s = DB::Open(CurrentOptions(), dbname_, &db2); - ASSERT_TRUE(!s.ok()) << "Locking did not prevent re-opening db"; -} - // Check that number of files does not grow when we are out of space TEST(DBTest, NoSpace) { Options options = CurrentOptions(); @@ -1562,15 +1508,19 @@ TEST(DBTest, NoSpace) { Compact("a", "z"); const int num_files = CountFiles(); env_->no_space_.Release_Store(env_); // Force out-of-space errors - for (int i = 0; i < 10; i++) { + env_->sleep_counter_.Reset(); + for (int i = 0; i < 5; i++) { for (int level = 0; level < config::kNumLevels-1; level++) { dbfull()->TEST_CompactRange(level, NULL, NULL); } } env_->no_space_.Release_Store(NULL); ASSERT_LT(CountFiles(), num_files + 3); -} + // Check that compaction attempts slept after errors + ASSERT_GE(env_->sleep_counter_.Read(), 5); +} +#if 0 TEST(DBTest, NonWritableFileSystem) { Options options = CurrentOptions(); options.write_buffer_size = 1000; 
@@ -1584,119 +1534,13 @@ TEST(DBTest, NonWritableFileSystem) { fprintf(stderr, "iter %d; errors %d\n", i, errors); if (!Put("foo", big).ok()) { errors++; - DelayMilliseconds(100); + env_->SleepForMicroseconds(100000); } } ASSERT_GT(errors, 0); env_->non_writable_.Release_Store(NULL); } - -TEST(DBTest, WriteSyncError) { - // Check that log sync errors cause the DB to disallow future writes. - - // (a) Cause log sync calls to fail - Options options = CurrentOptions(); - options.env = env_; - Reopen(&options); - env_->data_sync_error_.Release_Store(env_); - - // (b) Normal write should succeed - WriteOptions w; - ASSERT_OK(db_->Put(w, "k1", "v1")); - ASSERT_EQ("v1", Get("k1")); - - // (c) Do a sync write; should fail - w.sync = true; - ASSERT_TRUE(!db_->Put(w, "k2", "v2").ok()); - ASSERT_EQ("v1", Get("k1")); - ASSERT_EQ("NOT_FOUND", Get("k2")); - - // (d) make sync behave normally - env_->data_sync_error_.Release_Store(NULL); - - // (e) Do a non-sync write; should fail - w.sync = false; - ASSERT_TRUE(!db_->Put(w, "k3", "v3").ok()); - ASSERT_EQ("v1", Get("k1")); - ASSERT_EQ("NOT_FOUND", Get("k2")); - ASSERT_EQ("NOT_FOUND", Get("k3")); -} - -TEST(DBTest, ManifestWriteError) { - // Test for the following problem: - // (a) Compaction produces file F - // (b) Log record containing F is written to MANIFEST file, but Sync() fails - // (c) GC deletes F - // (d) After reopening DB, reads fail since deleted F is named in log record - - // We iterate twice. In the second iteration, everything is the - // same except the log record never makes it to the MANIFEST file. - for (int iter = 0; iter < 2; iter++) { - port::AtomicPointer* error_type = (iter == 0) - ? 
&env_->manifest_sync_error_ - : &env_->manifest_write_error_; - - // Insert foo=>bar mapping - Options options = CurrentOptions(); - options.env = env_; - options.create_if_missing = true; - options.error_if_exists = false; - DestroyAndReopen(&options); - ASSERT_OK(Put("foo", "bar")); - ASSERT_EQ("bar", Get("foo")); - - // Memtable compaction (will succeed) - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("bar", Get("foo")); - const int last = config::kMaxMemCompactLevel; - ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level - - // Merging compaction (will fail) - error_type->Release_Store(env_); - dbfull()->TEST_CompactRange(last, NULL, NULL); // Should fail - ASSERT_EQ("bar", Get("foo")); - - // Recovery: should not lose data - error_type->Release_Store(NULL); - Reopen(&options); - ASSERT_EQ("bar", Get("foo")); - } -} - -TEST(DBTest, MissingSSTFile) { - ASSERT_OK(Put("foo", "bar")); - ASSERT_EQ("bar", Get("foo")); - - // Dump the memtable to disk. - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("bar", Get("foo")); - - Close(); - ASSERT_TRUE(DeleteAnSSTFile()); - Options options = CurrentOptions(); - options.paranoid_checks = true; - Status s = TryReopen(&options); - ASSERT_TRUE(!s.ok()); - ASSERT_TRUE(s.ToString().find("issing") != std::string::npos) - << s.ToString(); -} - -TEST(DBTest, StillReadSST) { - ASSERT_OK(Put("foo", "bar")); - ASSERT_EQ("bar", Get("foo")); - - // Dump the memtable to disk. 
- dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("bar", Get("foo")); - Close(); - ASSERT_GT(RenameLDBToSST(), 0); - Options options = CurrentOptions(); - options.paranoid_checks = true; - Status s = TryReopen(&options); - ASSERT_TRUE(s.ok()); - ASSERT_EQ("bar", Get("foo")); -} - +#endif TEST(DBTest, FilesDeletedAfterCompaction) { ASSERT_OK(Put("foo", "v2")); Compact("a", "z"); @@ -1713,7 +1557,7 @@ TEST(DBTest, BloomFilter) { Options options = CurrentOptions(); options.env = env_; options.block_cache = NewLRUCache(0); // Prevent cache hits - options.filter_policy = NewBloomFilterPolicy(10); + options.filter_policy = NewBloomFilterPolicy2(16); Reopen(&options); // Populate multiple layers @@ -1728,12 +1572,12 @@ TEST(DBTest, BloomFilter) { dbfull()->TEST_CompactMemTable(); // Prevent auto compactions triggered by seeks - env_->delay_data_sync_.Release_Store(env_); + env_->delay_sstable_sync_.Release_Store(env_); // Lookup present keys. Should rarely read from small sstable. env_->random_read_counter_.Reset(); for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i), Get(Key(i))); + ASSERT_EQ(Key(i), GetNoCache(Key(i))); } int reads = env_->random_read_counter_.Read(); fprintf(stderr, "%d present => %d reads\n", N, reads); @@ -1743,13 +1587,13 @@ TEST(DBTest, BloomFilter) { // Lookup present keys. Should rarely read from either sstable. 
env_->random_read_counter_.Reset(); for (int i = 0; i < N; i++) { - ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing")); + ASSERT_EQ("NOT_FOUND", GetNoCache(Key(i) + ".missing")); } reads = env_->random_read_counter_.Read(); fprintf(stderr, "%d missing => %d reads\n", N, reads); ASSERT_LE(reads, 3*N/100); - env_->delay_data_sync_.Release_Store(NULL); + env_->delay_sstable_sync_.Release_Store(NULL); Close(); delete options.block_cache; delete options.filter_policy; @@ -1809,7 +1653,7 @@ static void MTThreadBody(void* arg) { ASSERT_EQ(k, key); ASSERT_GE(w, 0); ASSERT_LT(w, kNumThreads); - ASSERT_LE(static_cast(c), reinterpret_cast( + ASSERT_LE(c, reinterpret_cast( t->state->counter[w].Acquire_Load())); } } @@ -1834,27 +1678,35 @@ TEST(DBTest, MultiThreaded) { // Start threads MTThread thread[kNumThreads]; + pthread_t tid; for (int id = 0; id < kNumThreads; id++) { thread[id].state = &mt; thread[id].id = id; - env_->StartThread(MTThreadBody, &thread[id]); + tid=env_->StartThread(MTThreadBody, &thread[id]); + pthread_detach(tid); } // Let them run for a while - DelayMilliseconds(kTestSeconds * 1000); + env_->SleepForMicroseconds(kTestSeconds * 1000000); // Stop the threads and wait for them to finish mt.stop.Release_Store(&mt); for (int id = 0; id < kNumThreads; id++) { while (mt.thread_done[id].Acquire_Load() == NULL) { - DelayMilliseconds(100); + env_->SleepForMicroseconds(100000); } } } while (ChangeOptions()); } namespace { -typedef std::map KVMap; +struct KVEntry +{ + std::string m_Value; + KeyMetaData m_Meta; +}; + +typedef std::map KVMap; } class ModelDB: public DB { @@ -1866,14 +1718,21 @@ class ModelDB: public DB { explicit ModelDB(const Options& options): options_(options) { } ~ModelDB() { } - virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { - return DB::Put(o, k, v); + virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v, const KeyMetaData * meta=NULL) { + return DB::Put(o, k, v, meta); } virtual Status 
Delete(const WriteOptions& o, const Slice& key) { return DB::Delete(o, key); } virtual Status Get(const ReadOptions& options, - const Slice& key, std::string* value) { + const Slice& key, std::string* value, + KeyMetaData * meta = NULL) { + assert(false); // Not implemented + return Status::NotFound(key); + } + virtual Status Get(const ReadOptions& options, + const Slice& key, Value* value, + KeyMetaData * meta = NULL) { assert(false); // Not implemented return Status::NotFound(key); } @@ -1901,8 +1760,13 @@ class ModelDB: public DB { class Handler : public WriteBatch::Handler { public: KVMap* map_; - virtual void Put(const Slice& key, const Slice& value) { - (*map_)[key.ToString()] = value.ToString(); + virtual void Put(const Slice& key, const Slice& value, + const ValueType & type, const ExpiryTimeMicros & expiry) { + KVEntry ent; + ent.m_Value=value.ToString(); + ent.m_Meta.m_Type=type; + ent.m_Meta.m_Expiry=expiry; + (*map_)[key.ToString()] = ent; } virtual void Delete(const Slice& key) { map_->erase(key.ToString()); @@ -1948,7 +1812,7 @@ class ModelDB: public DB { virtual void Next() { ++iter_; } virtual void Prev() { --iter_; } virtual Slice key() const { return iter_->first; } - virtual Slice value() const { return iter_->second; } + virtual Slice value() const { return iter_->second.m_Value; } virtual Status status() const { return Status::OK(); } private: const KVMap* const map_; @@ -2085,6 +1949,44 @@ TEST(DBTest, Randomized) { } while (ChangeOptions()); } + +class SimpleBugs +{ + // need a class for the test harness +}; + + +TEST(SimpleBugs, TieredRecoveryLog) +{ + // DB::Open created first recovery log directly + // which lead to it NOT being in tiered storage location. 
+ // nope std::string dbname = test::TmpDir() + "/leveldb_nontiered"; + std::string dbname = "leveldb"; + std::string fastname = test::TmpDir() + "/leveldb_fast"; + std::string slowname = test::TmpDir() + "/leveldb_slow"; + std::string combined; + + DB* db = NULL; + Options opts; + + opts.tiered_slow_level = 4; + opts.tiered_fast_prefix = fastname; + opts.tiered_slow_prefix = slowname; + opts.create_if_missing = true; + + Env::Default()->CreateDir(fastname); + Env::Default()->CreateDir(slowname); + + Status s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != NULL); + + delete db; + DestroyDB(dbname, opts); + +} // TieredRecoveryLog + + std::string MakeKey(unsigned int num) { char buf[30]; snprintf(buf, sizeof(buf), "%016u", num); @@ -2113,14 +2015,13 @@ void BM_LogAndApply(int iters, int num_base_files) { InternalKeyComparator cmp(BytewiseComparator()); Options options; VersionSet vset(dbname, &options, NULL, &cmp); - bool save_manifest; - ASSERT_OK(vset.Recover(&save_manifest)); + ASSERT_OK(vset.Recover()); VersionEdit vbase; uint64_t fnum = 1; for (int i = 0; i < num_base_files; i++) { - InternalKey start(MakeKey(2*fnum), 1, kTypeValue); - InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); - vbase.AddFile(2, fnum++, 1 /* file size */, start, limit); + InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue); + InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion); + vbase.AddFile2(2, fnum++, 1 /* file size */, start, limit, 0,0,0); } ASSERT_OK(vset.LogAndApply(&vbase, &mu)); @@ -2129,9 +2030,9 @@ void BM_LogAndApply(int iters, int num_base_files) { for (int i = 0; i < iters; i++) { VersionEdit vedit; vedit.DeleteFile(2, fnum); - InternalKey start(MakeKey(2*fnum), 1, kTypeValue); - InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); - vedit.AddFile(2, fnum++, 1 /* file size */, start, limit); + InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue); + InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion); + vedit.AddFile2(2, fnum++, 1 
/* file size */, start, limit, 0,0,0); vset.LogAndApply(&vedit, &mu); } uint64_t stop_micros = env->NowMicros(); diff --git a/src/leveldb/db/dbformat.cc b/src/leveldb/db/dbformat.cc index 20a7ca446..6d44ea114 100644 --- a/src/leveldb/db/dbformat.cc +++ b/src/leveldb/db/dbformat.cc @@ -3,7 +3,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include +//#include "leveldb/expiry.h" #include "db/dbformat.h" +#include "db/version_set.h" #include "port/port.h" #include "util/coding.h" @@ -11,26 +13,66 @@ namespace leveldb { static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { assert(seq <= kMaxSequenceNumber); - assert(t <= kValueTypeForSeek); + // assert(t <= kValueTypeForSeek); requires revisit once expiry live + assert(t <= kTypeValueExplicitExpiry); // temp replacement for above return (seq << 8) | t; } void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { result->append(key.user_key.data(), key.user_key.size()); + if (IsExpiryKey(key.type)) + PutFixed64(result, key.expiry); PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); } std::string ParsedInternalKey::DebugString() const { char buf[50]; - snprintf(buf, sizeof(buf), "' @ %llu : %d", - (unsigned long long) sequence, - int(type)); + if (IsExpiryKey(type)) + snprintf(buf, sizeof(buf), "' @ %llu %llu : %d", + (unsigned long long) expiry, + (unsigned long long) sequence, + int(type)); + else + snprintf(buf, sizeof(buf), "' @ %llu : %d", + (unsigned long long) sequence, + int(type)); std::string result = "'"; - result += EscapeString(user_key.ToString()); + result += HexString(user_key.ToString()); result += buf; return result; } +std::string ParsedInternalKey::DebugStringHex() const { + char buf[50]; + if (IsExpiryKey(type)) + snprintf(buf, sizeof(buf), "' @ %llu %llu : %d", + (unsigned long long) expiry, + (unsigned long long) sequence, + int(type)); + else + snprintf(buf, sizeof(buf), "' @ %llu : %d", + (unsigned long long) sequence, 
+ int(type)); + std::string result = "'"; + result += HexString(user_key); + result += buf; + return result; +} + + +const char * KeyTypeString(ValueType val_type) { + const char * ret_ptr; + switch(val_type) + { + case kTypeDeletion: ret_ptr="kTypeDelete"; break; + case kTypeValue: ret_ptr="kTypeValue"; break; + case kTypeValueWriteTime: ret_ptr="kTypeValueWriteTime"; break; + case kTypeValueExplicitExpiry: ret_ptr="kTypeValueExplicitExpiry"; break; + default: ret_ptr="(unknown ValueType)"; break; + } // switch + return(ret_ptr); +} + std::string InternalKey::DebugString() const { std::string result; ParsedInternalKey parsed; @@ -54,8 +96,10 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { // decreasing type (though sequence# should be enough to disambiguate) int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); if (r == 0) { - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); + uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + if (IsExpiryKey((ValueType)*(unsigned char *)&anum)) *(unsigned char*)&anum=(unsigned char)kTypeValue; + if (IsExpiryKey((ValueType)*(unsigned char *)&bnum)) *(unsigned char*)&bnum=(unsigned char)kTypeValue; if (anum > bnum) { r = -1; } else if (anum < bnum) { @@ -118,7 +162,8 @@ bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const { return user_policy_->KeyMayMatch(ExtractUserKey(key), f); } -LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { + LookupKey::LookupKey(const Slice& user_key, SequenceNumber s, KeyMetaData * meta) { + meta_=meta; size_t usize = user_key.size(); size_t needed = usize + 13; // A conservative estimate char* dst; @@ -137,4 +182,109 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { end_ = dst; } + +KeyRetirement::KeyRetirement( + const 
Comparator * Comparator, + SequenceNumber SmallestSnapshot, + const Options * Opts, + Compaction * const Compaction) + : has_current_user_key(false), last_sequence_for_key(kMaxSequenceNumber), + user_comparator(Comparator), smallest_snapshot(SmallestSnapshot), + options(Opts), compaction(Compaction), + valid(false), dropped(0), expired(0) +{ + // NULL is ok for compaction + valid=(NULL!=user_comparator); + + return; +} // KeyRetirement::KeyRetirement + + +KeyRetirement::~KeyRetirement() +{ + if (0!=expired) + gPerfCounters->Add(ePerfExpiredKeys, expired); +} // KeyRetirement::~KeyRetirement + + +bool +KeyRetirement::operator()( + Slice & key) +{ + ParsedInternalKey ikey; + bool drop = false, expire_flag; + + if (valid) + { + if (!ParseInternalKey(key, &ikey)) + { + // Do not hide error keys + current_user_key.clear(); + has_current_user_key = false; + last_sequence_for_key = kMaxSequenceNumber; + } // else + else + { + if (!has_current_user_key || + user_comparator->Compare(ikey.user_key, + Slice(current_user_key)) != 0) + { + // First occurrence of this user key + current_user_key.assign(ikey.user_key.data(), ikey.user_key.size()); + has_current_user_key = true; + last_sequence_for_key = kMaxSequenceNumber; + } // if + + if (last_sequence_for_key <= smallest_snapshot) + { + // Hidden by an newer entry for same user key + drop = true; // (A) + } // if + + else + { + expire_flag=false; + if (NULL!=options && options->ExpiryActivated()) + expire_flag=options->expiry_module->KeyRetirementCallback(ikey); + + if ((ikey.type == kTypeDeletion || expire_flag) + && ikey.sequence <= smallest_snapshot + && NULL!=compaction // mem to level0 ignores this test + && compaction->IsBaseLevelForKey(ikey.user_key)) + { + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger sequence numbers + // (3) data in layers that are being compacted here and have + // smaller sequence numbers will be dropped in the next + // few 
iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + drop = true; + + if (expire_flag) + ++expired; + else + ++dropped; + } // if + } // else + + last_sequence_for_key = ikey.sequence; + } // else + } // if + +#if 0 + // needs clean up to be used again + Log(options_.info_log, + " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " + "%d smallest_snapshot: %d", + ikey.user_key.ToString().c_str(), + (int)ikey.sequence, ikey.type, kTypeValue, drop, + compact->compaction->IsBaseLevelForKey(ikey.user_key), + (int)last_sequence_for_key, (int)compact->smallest_snapshot); +#endif + return(drop); + +} // KeyRetirement::operator(Slice & ) + + } // namespace leveldb diff --git a/src/leveldb/db/dbformat.h b/src/leveldb/db/dbformat.h index ea897b13c..ec3c80c98 100644 --- a/src/leveldb/db/dbformat.h +++ b/src/leveldb/db/dbformat.h @@ -2,13 +2,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_LEVELDB_DB_DBFORMAT_H_ -#define STORAGE_LEVELDB_DB_DBFORMAT_H_ +#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ +#define STORAGE_LEVELDB_DB_FORMAT_H_ #include #include "leveldb/comparator.h" #include "leveldb/db.h" #include "leveldb/filter_policy.h" +#include "leveldb/options.h" #include "leveldb/slice.h" #include "leveldb/table_builder.h" #include "util/coding.h" @@ -16,19 +17,33 @@ namespace leveldb { +class Compaction; + // Grouping of constants. We may want to make some of these // parameters set via options. namespace config { static const int kNumLevels = 7; +static const int kNumOverlapLevels = 2; // Level-0 compaction is started when we hit this many files. 
-static const int kL0_CompactionTrigger = 4; +// Google: static const size_t kL0_CompactionTrigger = 4; +static const size_t kL0_CompactionTrigger = 6; + +// Level-0 (any overlapped level) number of files where a grooming +// compaction could start +static const size_t kL0_GroomingTrigger = 4; +static const size_t kL0_GroomingTrigger10min = 2; +static const size_t kL0_GroomingTrigger20min = 1; + +// ... time limits in microseconds +static const size_t kL0_Grooming10minMicros = 10 * 60 * 1000000; +static const size_t kL0_Grooming20minMicros = 20 * 60 * 1000000; // Soft limit on number of level-0 files. We slow down writes at this point. -static const int kL0_SlowdownWritesTrigger = 8; +static const size_t kL0_SlowdownWritesTrigger = 8; // Maximum number of level-0 files. We stop writes at this point. -static const int kL0_StopWritesTrigger = 12; +static const size_t kL0_StopWritesTrigger = 12; // Maximum level to which a new compacted memtable is pushed if it // does not create overlap. We try to push to level 2 to avoid the @@ -36,31 +51,28 @@ static const int kL0_StopWritesTrigger = 12; // expensive manifest file operations. We do not push all the way to // the largest level since that can generate a lot of wasted disk // space if the same key space is being repeatedly overwritten. -static const int kMaxMemCompactLevel = 2; - -// Approximate gap in bytes between samples of data read during iteration. -static const int kReadBytesPeriod = 1048576; +// Basho: push to kNumOverlapLevels +1 ... beyond "landing level" +static const unsigned kMaxMemCompactLevel = kNumOverlapLevels+1; } // namespace config class InternalKey; -// Value types encoded as the last component of internal keys. -// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk -// data structures. 
-enum ValueType { - kTypeDeletion = 0x0, - kTypeValue = 0x1 -}; // kValueTypeForSeek defines the ValueType that should be passed when // constructing a ParsedInternalKey object for seeking to a particular // sequence number (since we sort sequence numbers in decreasing order // and the value type is embedded as the low 8 bits in the sequence // number in internal keys, we need to use the highest-numbered // ValueType, not the lowest). +// Riak note: kValueTypeForSeek is placed within temporary keys +// for comparisons. Using kTypeValueExplicitExpiry would +// force more code changes to increase internal key size. +// But kValueTypeForSeek is redundant with the sequence number for +// disambiguation. Therefore going for easiest path and NOT changing. static const ValueType kValueTypeForSeek = kTypeValue; typedef uint64_t SequenceNumber; +typedef uint64_t ExpiryTimeMicros; // We leave eight bits empty at the bottom so a type and sequence# // can be packed together into 64-bits. @@ -69,20 +81,17 @@ static const SequenceNumber kMaxSequenceNumber = struct ParsedInternalKey { Slice user_key; + ExpiryTimeMicros expiry; SequenceNumber sequence; ValueType type; ParsedInternalKey() { } // Intentionally left uninitialized (for speed) - ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) - : user_key(u), sequence(seq), type(t) { } + ParsedInternalKey(const Slice& u, const ExpiryTimeMicros & exp, const SequenceNumber& seq, ValueType t) + : user_key(u), expiry(exp), sequence(seq), type(t) { } std::string DebugString() const; + std::string DebugStringHex() const; }; -// Return the length of the encoding of "key". -inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { - return key.user_key.size() + 8; -} - // Append the serialization of "key" to *result.
extern void AppendInternalKey(std::string* result, const ParsedInternalKey& key); @@ -94,20 +103,76 @@ extern void AppendInternalKey(std::string* result, extern bool ParseInternalKey(const Slice& internal_key, ParsedInternalKey* result); -// Returns the user key portion of an internal key. -inline Slice ExtractUserKey(const Slice& internal_key) { - assert(internal_key.size() >= 8); - return Slice(internal_key.data(), internal_key.size() - 8); -} - inline ValueType ExtractValueType(const Slice& internal_key) { assert(internal_key.size() >= 8); const size_t n = internal_key.size(); - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); - unsigned char c = num & 0xff; + unsigned char c = DecodeLeastFixed64(internal_key.data() + n - sizeof(SequenceNumber)); return static_cast(c); } +inline size_t KeySuffixSize(ValueType val_type) { + size_t ret_val; + switch(val_type) + { + case kTypeDeletion: + case kTypeValue: + ret_val=sizeof(SequenceNumber); + break; + + case kTypeValueWriteTime: + case kTypeValueExplicitExpiry: + ret_val=sizeof(SequenceNumber) + sizeof(ExpiryTimeMicros); + break; + + default: + // assert(0); cannot use because bloom filter block's name is passed as internal key + ret_val=sizeof(SequenceNumber); + break; + } // switch + return(ret_val); +} + +const char * KeyTypeString(ValueType val_type); + +inline size_t KeySuffixSize(const Slice & internal_key) { + return(KeySuffixSize(ExtractValueType(internal_key))); +} + +// Returns the user key portion of an internal key. +inline Slice ExtractUserKey(const Slice& internal_key) { + assert(internal_key.size() >= 8); + return Slice(internal_key.data(), internal_key.size() - KeySuffixSize(internal_key)); +} + +// Returns the sequence number with ValueType removed +inline SequenceNumber ExtractSequenceNumber(const Slice& internal_key) { + assert(internal_key.size() >= 8); + return(DecodeFixed64(internal_key.data() + internal_key.size() - 8)>>8); +} + +// Return the length of the encoding of "key". 
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { + return key.user_key.size() + KeySuffixSize(key.type); +} + +// Riak: is this an expiry key, i.e. one that contains the extra ExpiryTime field? +inline bool IsExpiryKey(ValueType val_type) { + return(kTypeValueWriteTime==val_type || kTypeValueExplicitExpiry==val_type); +} + +// Riak: is this an expiry key, i.e. one that contains the extra ExpiryTime field? +inline bool IsExpiryKey(const Slice & internal_key) { + return(internal_key.size()>=KeySuffixSize(kTypeValueWriteTime) + && IsExpiryKey(ExtractValueType(internal_key))); +} + +// Riak: extracts expiry value +inline ExpiryTimeMicros ExtractExpiry(const Slice& internal_key) { + assert(internal_key.size() >= KeySuffixSize(kTypeValueWriteTime)); + assert(IsExpiryKey(internal_key)); + return(DecodeFixed64(internal_key.data() + internal_key.size() - KeySuffixSize(kTypeValueWriteTime))); +} + // A comparator for internal keys that uses a specified comparator for // the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator : public Comparator { @@ -129,7 +194,7 @@ class InternalKeyComparator : public Comparator { // Filter policy wrapper that converts from internal keys to user keys class InternalFilterPolicy : public FilterPolicy { - private: + protected: const FilterPolicy* const user_policy_; public: explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { } @@ -138,6 +203,12 @@ class InternalFilterPolicy : public FilterPolicy { virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const; }; +class InternalFilterPolicy2 : public InternalFilterPolicy { + public: + explicit InternalFilterPolicy2(const FilterPolicy* p) : InternalFilterPolicy(p) { } + virtual ~InternalFilterPolicy2() {delete user_policy_;}; +}; + // Modules in this directory should keep internal keys wrapped inside // the following class instead of plain strings so that we do not // incorrectly use string comparisons instead of an InternalKeyComparator. @@ -146,8 +217,8 @@ class InternalKey { std::string rep_; public: InternalKey() { } // Leave rep_ as empty to indicate it is invalid - InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { - AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); + InternalKey(const Slice& user_key, ExpiryTimeMicros exp, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(user_key, exp, s, t)); } void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } @@ -157,6 +228,7 @@ class InternalKey { } Slice user_key() const { return ExtractUserKey(rep_); } + Slice internal_key() const { return Slice(rep_); } void SetFrom(const ParsedInternalKey& p) { rep_.clear(); @@ -181,8 +253,12 @@ inline bool ParseInternalKey(const Slice& internal_key, unsigned char c = num & 0xff; result->sequence = num >> 8; result->type = static_cast(c); - result->user_key = Slice(internal_key.data(), n - 8); - return (c <= static_cast(kTypeValue)); + if (IsExpiryKey((ValueType)c)) + 
result->expiry=DecodeFixed64(internal_key.data() + n - KeySuffixSize((ValueType)c)); + else + result->expiry=0; + result->user_key = Slice(internal_key.data(), n - KeySuffixSize((ValueType)c)); + return (c <= static_cast(kTypeValueExplicitExpiry)); } // A helper class useful for DBImpl::Get() @@ -190,7 +266,7 @@ class LookupKey { public: // Initialize *this for looking up user_key at a snapshot with // the specified sequence number. - LookupKey(const Slice& user_key, SequenceNumber sequence); + LookupKey(const Slice& user_key, SequenceNumber sequence, KeyMetaData * meta=NULL); ~LookupKey(); @@ -201,12 +277,38 @@ class LookupKey { Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } // Return the user key - Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } + Slice user_key() const + { return Slice(kstart_, end_ - kstart_ - KeySuffixSize(internal_key())); } + + // did requestor have metadata object? + bool WantsKeyMetaData() const {return(NULL!=meta_);}; + + void SetKeyMetaData(ValueType type, SequenceNumber seq, ExpiryTimeMicros expiry) const + {if (NULL!=meta_) + { + meta_->m_Type=type; + meta_->m_Sequence=seq; + meta_->m_Expiry=expiry; + } // if + }; + + void SetKeyMetaData(const ParsedInternalKey & pi_key) const + {if (NULL!=meta_) + { + meta_->m_Type=pi_key.type; + meta_->m_Sequence=pi_key.sequence; + meta_->m_Expiry=pi_key.expiry; + } // if + }; + + void SetKeyMetaData(const KeyMetaData & meta) const + {if (NULL!=meta_) *meta_=meta;}; private: // We construct a char array of the form: // klength varint32 <-- start_ // userkey char[klength] <-- kstart_ + // optional uint64 // tag uint64 // <-- end_ // The array is a suitable MemTable key. 
@@ -216,6 +318,9 @@ class LookupKey { const char* end_; char space_[200]; // Avoid allocation for short keys + // allow code that finds the key to place metadata here, even if 'const' + mutable KeyMetaData * meta_; + // No copying allowed LookupKey(const LookupKey&); void operator=(const LookupKey&); @@ -223,8 +328,47 @@ class LookupKey { inline LookupKey::~LookupKey() { if (start_ != space_) delete[] start_; -} +}; + + +// this class was constructed from code with DBImpl::DoCompactionWork (db_impl.cc) +// so it could be shared within BuildTable (and thus reduce Level 0 bloating) +class KeyRetirement +{ +protected: + // "state" from previous key reviewed + std::string current_user_key; + bool has_current_user_key; + SequenceNumber last_sequence_for_key; + + // database values needed for processing + const Comparator * user_comparator; + SequenceNumber smallest_snapshot; + const Options * options; + Compaction * const compaction; + + bool valid; + size_t dropped; // tombstone or old version dropped + size_t expired; // expired dropped + +public: + KeyRetirement(const Comparator * UserComparator, SequenceNumber SmallestSnapshot, + const Options * Opts, Compaction * const Compaction=NULL); + + virtual ~KeyRetirement(); + + bool operator()(Slice & key); + + size_t GetDroppedCount() const {return(dropped);}; + size_t GetExpiredCount() const {return(expired);}; + +private: + KeyRetirement(); + KeyRetirement(const KeyRetirement &); + const KeyRetirement & operator=(const KeyRetirement &); + +}; // class KeyRetirement } // namespace leveldb -#endif // STORAGE_LEVELDB_DB_DBFORMAT_H_ +#endif // STORAGE_LEVELDB_DB_FORMAT_H_ diff --git a/src/leveldb/db/dbformat_test.cc b/src/leveldb/db/dbformat_test.cc index 5d82f5d31..3ad1cd647 100644 --- a/src/leveldb/db/dbformat_test.cc +++ b/src/leveldb/db/dbformat_test.cc @@ -9,10 +9,11 @@ namespace leveldb { static std::string IKey(const std::string& user_key, + ExpiryTimeMicros exp, uint64_t seq, ValueType vt) { std::string encoded; - 
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); + AppendInternalKey(&encoded, ParsedInternalKey(user_key, exp, seq, vt)); return encoded; } @@ -29,12 +30,13 @@ static std::string ShortSuccessor(const std::string& s) { } static void TestKey(const std::string& key, + ExpiryTimeMicros exp, uint64_t seq, ValueType vt) { - std::string encoded = IKey(key, seq, vt); + std::string encoded = IKey(key, exp, seq, vt); Slice in(encoded); - ParsedInternalKey decoded("", 0, kTypeValue); + ParsedInternalKey decoded("", 0, 0, kTypeValue); ASSERT_TRUE(ParseInternalKey(in, &decoded)); ASSERT_EQ(key, decoded.user_key.ToString()); @@ -56,53 +58,53 @@ TEST(FormatTest, InternalKey_EncodeDecode) { }; for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { - TestKey(keys[k], seq[s], kTypeValue); - TestKey("hello", 1, kTypeDeletion); + TestKey(keys[k], 0, seq[s], kTypeValue); + TestKey("hello", 0, 1, kTypeDeletion); } } } TEST(FormatTest, InternalKeyShortSeparator) { // When user keys are same - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 99, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 101, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeDeletion))); + ASSERT_EQ(IKey("foo", 0, 100, kTypeValue), + Shorten(IKey("foo", 0, 100, kTypeValue), + IKey("foo", 0, 99, kTypeValue))); + ASSERT_EQ(IKey("foo", 0, 100, kTypeValue), + Shorten(IKey("foo", 0, 100, kTypeValue), + IKey("foo", 0, 101, kTypeValue))); + ASSERT_EQ(IKey("foo", 0, 100, kTypeValue), + Shorten(IKey("foo", 0, 100, kTypeValue), + IKey("foo", 0, 100, kTypeValue))); + ASSERT_EQ(IKey("foo", 0, 100, kTypeValue), + Shorten(IKey("foo", 0, 100, kTypeValue), + 
IKey("foo", 0, 100, kTypeDeletion))); // When user keys are misordered - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("bar", 99, kTypeValue))); + ASSERT_EQ(IKey("foo", 0, 100, kTypeValue), + Shorten(IKey("foo", 0, 100, kTypeValue), + IKey("bar", 0, 99, kTypeValue))); // When user keys are different, but correctly ordered - ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), - Shorten(IKey("foo", 100, kTypeValue), - IKey("hello", 200, kTypeValue))); + ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("foo", 0, 100, kTypeValue), + IKey("hello", 0, 200, kTypeValue))); // When start user key is prefix of limit user key - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foobar", 200, kTypeValue))); + ASSERT_EQ(IKey("foo", 0, 100, kTypeValue), + Shorten(IKey("foo", 0, 100, kTypeValue), + IKey("foobar", 0, 200, kTypeValue))); // When limit user key is prefix of start user key - ASSERT_EQ(IKey("foobar", 100, kTypeValue), - Shorten(IKey("foobar", 100, kTypeValue), - IKey("foo", 200, kTypeValue))); + ASSERT_EQ(IKey("foobar", 0, 100, kTypeValue), + Shorten(IKey("foobar", 0, 100, kTypeValue), + IKey("foo", 0, 200, kTypeValue))); } TEST(FormatTest, InternalKeyShortestSuccessor) { - ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), - ShortSuccessor(IKey("foo", 100, kTypeValue))); - ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), - ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); + ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek), + ShortSuccessor(IKey("foo", 0, 100, kTypeValue))); + ASSERT_EQ(IKey("\xff\xff", 0, 100, kTypeValue), + ShortSuccessor(IKey("\xff\xff", 0, 100, kTypeValue))); } } // namespace leveldb diff --git a/src/leveldb/db/fault_injection_test.cc b/src/leveldb/db/fault_injection_test.cc deleted file mode 100644 index 875dfe81e..000000000 --- a/src/leveldb/db/fault_injection_test.cc +++ /dev/null @@ -1,554 +0,0 @@ -// Copyright 
2014 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// This test uses a custom Env to keep track of the state of a filesystem as of -// the last "sync". It then checks for data loss errors by purposely dropping -// file data (or entire files) not protected by a "sync". - -#include "leveldb/db.h" - -#include -#include -#include "db/db_impl.h" -#include "db/filename.h" -#include "db/log_format.h" -#include "db/version_set.h" -#include "leveldb/cache.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "leveldb/write_batch.h" -#include "util/logging.h" -#include "util/mutexlock.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -static const int kValueSize = 1000; -static const int kMaxNumValues = 2000; -static const size_t kNumIterations = 3; - -class FaultInjectionTestEnv; - -namespace { - -// Assume a filename, and not a directory name like "/foo/bar/" -static std::string GetDirName(const std::string filename) { - size_t found = filename.find_last_of("/\\"); - if (found == std::string::npos) { - return ""; - } else { - return filename.substr(0, found); - } -} - -Status SyncDir(const std::string& dir) { - // As this is a test it isn't required to *actually* sync this directory. - return Status::OK(); -} - -// A basic file truncation function suitable for this test. 
-Status Truncate(const std::string& filename, uint64_t length) { - leveldb::Env* env = leveldb::Env::Default(); - - SequentialFile* orig_file; - Status s = env->NewSequentialFile(filename, &orig_file); - if (!s.ok()) - return s; - - char* scratch = new char[length]; - leveldb::Slice result; - s = orig_file->Read(length, &result, scratch); - delete orig_file; - if (s.ok()) { - std::string tmp_name = GetDirName(filename) + "/truncate.tmp"; - WritableFile* tmp_file; - s = env->NewWritableFile(tmp_name, &tmp_file); - if (s.ok()) { - s = tmp_file->Append(result); - delete tmp_file; - if (s.ok()) { - s = env->RenameFile(tmp_name, filename); - } else { - env->DeleteFile(tmp_name); - } - } - } - - delete[] scratch; - - return s; -} - -struct FileState { - std::string filename_; - ssize_t pos_; - ssize_t pos_at_last_sync_; - ssize_t pos_at_last_flush_; - - FileState(const std::string& filename) - : filename_(filename), - pos_(-1), - pos_at_last_sync_(-1), - pos_at_last_flush_(-1) { } - - FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {} - - bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; } - - Status DropUnsyncedData() const; -}; - -} // anonymous namespace - -// A wrapper around WritableFile which informs another Env whenever this file -// is written to or sync'ed. 
-class TestWritableFile : public WritableFile { - public: - TestWritableFile(const FileState& state, - WritableFile* f, - FaultInjectionTestEnv* env); - virtual ~TestWritableFile(); - virtual Status Append(const Slice& data); - virtual Status Close(); - virtual Status Flush(); - virtual Status Sync(); - - private: - FileState state_; - WritableFile* target_; - bool writable_file_opened_; - FaultInjectionTestEnv* env_; - - Status SyncParent(); -}; - -class FaultInjectionTestEnv : public EnvWrapper { - public: - FaultInjectionTestEnv() : EnvWrapper(Env::Default()), filesystem_active_(true) {} - virtual ~FaultInjectionTestEnv() { } - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result); - virtual Status NewAppendableFile(const std::string& fname, - WritableFile** result); - virtual Status DeleteFile(const std::string& f); - virtual Status RenameFile(const std::string& s, const std::string& t); - - void WritableFileClosed(const FileState& state); - Status DropUnsyncedFileData(); - Status DeleteFilesCreatedAfterLastDirSync(); - void DirWasSynced(); - bool IsFileCreatedSinceLastDirSync(const std::string& filename); - void ResetState(); - void UntrackFile(const std::string& f); - // Setting the filesystem to inactive is the test equivalent to simulating a - // system reset. Setting to inactive will freeze our saved filesystem state so - // that it will stop being recorded. It can then be reset back to the state at - // the time of the reset. 
- bool IsFilesystemActive() const { return filesystem_active_; } - void SetFilesystemActive(bool active) { filesystem_active_ = active; } - - private: - port::Mutex mutex_; - std::map db_file_state_; - std::set new_files_since_last_dir_sync_; - bool filesystem_active_; // Record flushes, syncs, writes -}; - -TestWritableFile::TestWritableFile(const FileState& state, - WritableFile* f, - FaultInjectionTestEnv* env) - : state_(state), - target_(f), - writable_file_opened_(true), - env_(env) { - assert(f != NULL); -} - -TestWritableFile::~TestWritableFile() { - if (writable_file_opened_) { - Close(); - } - delete target_; -} - -Status TestWritableFile::Append(const Slice& data) { - Status s = target_->Append(data); - if (s.ok() && env_->IsFilesystemActive()) { - state_.pos_ += data.size(); - } - return s; -} - -Status TestWritableFile::Close() { - writable_file_opened_ = false; - Status s = target_->Close(); - if (s.ok()) { - env_->WritableFileClosed(state_); - } - return s; -} - -Status TestWritableFile::Flush() { - Status s = target_->Flush(); - if (s.ok() && env_->IsFilesystemActive()) { - state_.pos_at_last_flush_ = state_.pos_; - } - return s; -} - -Status TestWritableFile::SyncParent() { - Status s = SyncDir(GetDirName(state_.filename_)); - if (s.ok()) { - env_->DirWasSynced(); - } - return s; -} - -Status TestWritableFile::Sync() { - if (!env_->IsFilesystemActive()) { - return Status::OK(); - } - // Ensure new files referred to by the manifest are in the filesystem. 
- Status s = target_->Sync(); - if (s.ok()) { - state_.pos_at_last_sync_ = state_.pos_; - } - if (env_->IsFileCreatedSinceLastDirSync(state_.filename_)) { - Status ps = SyncParent(); - if (s.ok() && !ps.ok()) { - s = ps; - } - } - return s; -} - -Status FaultInjectionTestEnv::NewWritableFile(const std::string& fname, - WritableFile** result) { - WritableFile* actual_writable_file; - Status s = target()->NewWritableFile(fname, &actual_writable_file); - if (s.ok()) { - FileState state(fname); - state.pos_ = 0; - *result = new TestWritableFile(state, actual_writable_file, this); - // NewWritableFile doesn't append to files, so if the same file is - // opened again then it will be truncated - so forget our saved - // state. - UntrackFile(fname); - MutexLock l(&mutex_); - new_files_since_last_dir_sync_.insert(fname); - } - return s; -} - -Status FaultInjectionTestEnv::NewAppendableFile(const std::string& fname, - WritableFile** result) { - WritableFile* actual_writable_file; - Status s = target()->NewAppendableFile(fname, &actual_writable_file); - if (s.ok()) { - FileState state(fname); - state.pos_ = 0; - { - MutexLock l(&mutex_); - if (db_file_state_.count(fname) == 0) { - new_files_since_last_dir_sync_.insert(fname); - } else { - state = db_file_state_[fname]; - } - } - *result = new TestWritableFile(state, actual_writable_file, this); - } - return s; -} - -Status FaultInjectionTestEnv::DropUnsyncedFileData() { - Status s; - MutexLock l(&mutex_); - for (std::map::const_iterator it = - db_file_state_.begin(); - s.ok() && it != db_file_state_.end(); ++it) { - const FileState& state = it->second; - if (!state.IsFullySynced()) { - s = state.DropUnsyncedData(); - } - } - return s; -} - -void FaultInjectionTestEnv::DirWasSynced() { - MutexLock l(&mutex_); - new_files_since_last_dir_sync_.clear(); -} - -bool FaultInjectionTestEnv::IsFileCreatedSinceLastDirSync( - const std::string& filename) { - MutexLock l(&mutex_); - return new_files_since_last_dir_sync_.find(filename) != 
- new_files_since_last_dir_sync_.end(); -} - -void FaultInjectionTestEnv::UntrackFile(const std::string& f) { - MutexLock l(&mutex_); - db_file_state_.erase(f); - new_files_since_last_dir_sync_.erase(f); -} - -Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { - Status s = EnvWrapper::DeleteFile(f); - ASSERT_OK(s); - if (s.ok()) { - UntrackFile(f); - } - return s; -} - -Status FaultInjectionTestEnv::RenameFile(const std::string& s, - const std::string& t) { - Status ret = EnvWrapper::RenameFile(s, t); - - if (ret.ok()) { - MutexLock l(&mutex_); - if (db_file_state_.find(s) != db_file_state_.end()) { - db_file_state_[t] = db_file_state_[s]; - db_file_state_.erase(s); - } - - if (new_files_since_last_dir_sync_.erase(s) != 0) { - assert(new_files_since_last_dir_sync_.find(t) == - new_files_since_last_dir_sync_.end()); - new_files_since_last_dir_sync_.insert(t); - } - } - - return ret; -} - -void FaultInjectionTestEnv::ResetState() { - // Since we are not destroying the database, the existing files - // should keep their recorded synced/flushed state. Therefore - // we do not reset db_file_state_ and new_files_since_last_dir_sync_. - MutexLock l(&mutex_); - SetFilesystemActive(true); -} - -Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() { - // Because DeleteFile access this container make a copy to avoid deadlock - mutex_.Lock(); - std::set new_files(new_files_since_last_dir_sync_.begin(), - new_files_since_last_dir_sync_.end()); - mutex_.Unlock(); - Status s; - std::set::const_iterator it; - for (it = new_files.begin(); s.ok() && it != new_files.end(); ++it) { - s = DeleteFile(*it); - } - return s; -} - -void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) { - MutexLock l(&mutex_); - db_file_state_[state.filename_] = state; -} - -Status FileState::DropUnsyncedData() const { - ssize_t sync_pos = pos_at_last_sync_ == -1 ? 
0 : pos_at_last_sync_; - return Truncate(filename_, sync_pos); -} - -class FaultInjectionTest { - public: - enum ExpectedVerifResult { VAL_EXPECT_NO_ERROR, VAL_EXPECT_ERROR }; - enum ResetMethod { RESET_DROP_UNSYNCED_DATA, RESET_DELETE_UNSYNCED_FILES }; - - FaultInjectionTestEnv* env_; - std::string dbname_; - Cache* tiny_cache_; - Options options_; - DB* db_; - - FaultInjectionTest() - : env_(new FaultInjectionTestEnv), - tiny_cache_(NewLRUCache(100)), - db_(NULL) { - dbname_ = test::TmpDir() + "/fault_test"; - DestroyDB(dbname_, Options()); // Destroy any db from earlier run - options_.reuse_logs = true; - options_.env = env_; - options_.paranoid_checks = true; - options_.block_cache = tiny_cache_; - options_.create_if_missing = true; - } - - ~FaultInjectionTest() { - CloseDB(); - DestroyDB(dbname_, Options()); - delete tiny_cache_; - delete env_; - } - - void ReuseLogs(bool reuse) { - options_.reuse_logs = reuse; - } - - void Build(int start_idx, int num_vals) { - std::string key_space, value_space; - WriteBatch batch; - for (int i = start_idx; i < start_idx + num_vals; i++) { - Slice key = Key(i, &key_space); - batch.Clear(); - batch.Put(key, Value(i, &value_space)); - WriteOptions options; - ASSERT_OK(db_->Write(options, &batch)); - } - } - - Status ReadValue(int i, std::string* val) const { - std::string key_space, value_space; - Slice key = Key(i, &key_space); - Value(i, &value_space); - ReadOptions options; - return db_->Get(options, key, val); - } - - Status Verify(int start_idx, int num_vals, - ExpectedVerifResult expected) const { - std::string val; - std::string value_space; - Status s; - for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) { - Value(i, &value_space); - s = ReadValue(i, &val); - if (expected == VAL_EXPECT_NO_ERROR) { - if (s.ok()) { - ASSERT_EQ(value_space, val); - } - } else if (s.ok()) { - fprintf(stderr, "Expected an error at %d, but was OK\n", i); - s = Status::IOError(dbname_, "Expected value error:"); - } else { - s = 
Status::OK(); // An expected error - } - } - return s; - } - - // Return the ith key - Slice Key(int i, std::string* storage) const { - char buf[100]; - snprintf(buf, sizeof(buf), "%016d", i); - storage->assign(buf, strlen(buf)); - return Slice(*storage); - } - - // Return the value to associate with the specified key - Slice Value(int k, std::string* storage) const { - Random r(k); - return test::RandomString(&r, kValueSize, storage); - } - - Status OpenDB() { - delete db_; - db_ = NULL; - env_->ResetState(); - return DB::Open(options_, dbname_, &db_); - } - - void CloseDB() { - delete db_; - db_ = NULL; - } - - void DeleteAllData() { - Iterator* iter = db_->NewIterator(ReadOptions()); - WriteOptions options; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(db_->Delete(WriteOptions(), iter->key())); - } - - delete iter; - } - - void ResetDBState(ResetMethod reset_method) { - switch (reset_method) { - case RESET_DROP_UNSYNCED_DATA: - ASSERT_OK(env_->DropUnsyncedFileData()); - break; - case RESET_DELETE_UNSYNCED_FILES: - ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); - break; - default: - assert(false); - } - } - - void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) { - DeleteAllData(); - Build(0, num_pre_sync); - db_->CompactRange(NULL, NULL); - Build(num_pre_sync, num_post_sync); - } - - void PartialCompactTestReopenWithFault(ResetMethod reset_method, - int num_pre_sync, - int num_post_sync) { - env_->SetFilesystemActive(false); - CloseDB(); - ResetDBState(reset_method); - ASSERT_OK(OpenDB()); - ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::VAL_EXPECT_NO_ERROR)); - ASSERT_OK(Verify(num_pre_sync, num_post_sync, FaultInjectionTest::VAL_EXPECT_ERROR)); - } - - void NoWriteTestPreFault() { - } - - void NoWriteTestReopenWithFault(ResetMethod reset_method) { - CloseDB(); - ResetDBState(reset_method); - ASSERT_OK(OpenDB()); - } - - void DoTest() { - Random rnd(0); - ASSERT_OK(OpenDB()); - for (size_t idx = 0; idx < 
kNumIterations; idx++) { - int num_pre_sync = rnd.Uniform(kMaxNumValues); - int num_post_sync = rnd.Uniform(kMaxNumValues); - - PartialCompactTestPreFault(num_pre_sync, num_post_sync); - PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA, - num_pre_sync, - num_post_sync); - - NoWriteTestPreFault(); - NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA); - - PartialCompactTestPreFault(num_pre_sync, num_post_sync); - // No new files created so we expect all values since no files will be - // dropped. - PartialCompactTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES, - num_pre_sync + num_post_sync, - 0); - - NoWriteTestPreFault(); - NoWriteTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES); - } - } -}; - -TEST(FaultInjectionTest, FaultTestNoLogReuse) { - ReuseLogs(false); - DoTest(); -} - -TEST(FaultInjectionTest, FaultTestWithLogReuse) { - ReuseLogs(true); - DoTest(); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/db/filename.cc b/src/leveldb/db/filename.cc index da32946d9..bb90e6560 100644 --- a/src/leveldb/db/filename.cc +++ b/src/leveldb/db/filename.cc @@ -4,9 +4,14 @@ #include #include +#include +#include +#include #include "db/filename.h" #include "db/dbformat.h" +#include "db/version_set.h" #include "leveldb/env.h" +#include "leveldb/status.h" #include "util/logging.h" namespace leveldb { @@ -24,19 +29,50 @@ static std::string MakeFileName(const std::string& name, uint64_t number, return name + buf; } +static std::string MakeFileName2(const Options & options, uint64_t number, + int level, const char* suffix) { + char buf[100]; + if (0<=level) + snprintf(buf, sizeof(buf), "/%s_%-d/%06llu.%s", + suffix, level, + static_cast(number), + suffix); + else if (-1==level) + snprintf(buf, sizeof(buf), "/%s/%06llu.%s", + suffix, + static_cast(number), + suffix); + else if (-2==level) + snprintf(buf, sizeof(buf), "/%06llu.%s", + static_cast(number), + suffix); + + 
return((level<(int)options.tiered_slow_level ? + options.tiered_fast_prefix : options.tiered_slow_prefix) + buf); +} + +std::string MakeDirName2(const Options & options, + int level, const char* suffix) { + char buf[100]; + if (-1!=level) + snprintf(buf, sizeof(buf), "/%s_%-d", + suffix, level); + else + snprintf(buf, sizeof(buf), "/%s", + suffix); + + return((level<(int)options.tiered_slow_level ? + options.tiered_fast_prefix : options.tiered_slow_prefix) + buf); +} + std::string LogFileName(const std::string& name, uint64_t number) { assert(number > 0); return MakeFileName(name, number, "log"); } -std::string TableFileName(const std::string& name, uint64_t number) { +std::string TableFileName(const Options & options, uint64_t number, int level) { assert(number > 0); - return MakeFileName(name, number, "ldb"); -} - -std::string SSTTableFileName(const std::string& name, uint64_t number) { - assert(number > 0); - return MakeFileName(name, number, "sst"); + return MakeFileName2(options, number, level, "sst"); } std::string DescriptorFileName(const std::string& dbname, uint64_t number) { @@ -69,6 +105,36 @@ std::string OldInfoLogFileName(const std::string& dbname) { return dbname + "/LOG.old"; } +// +std::string CowFileName(const std::string& dbname) { + return dbname + "/COW"; +} + + +// Append appropriate "backup" string to input path +std::string BackupPath(const std::string& dbname, int backup_num) { + std::string dirname; + + char buf[100]; + if (0 != backup_num) + snprintf(buf, sizeof(buf), "/backup.%-d", backup_num); + else + snprintf(buf, sizeof(buf), "/backup"); + + return(dbname + buf); +} + + +// update tiered_fast_prefix and tiered_slow_prefix members of +// given Options object to point to desired backup path +bool SetBackupPaths(Options & options, int backup_num) { + + options.tiered_fast_prefix = BackupPath(options.tiered_fast_prefix, backup_num); + options.tiered_slow_prefix = BackupPath(options.tiered_slow_prefix, backup_num); + + return(true); +} + 
// Owned filenames have the form: // dbname/CURRENT @@ -76,7 +142,8 @@ std::string OldInfoLogFileName(const std::string& dbname) { // dbname/LOG // dbname/LOG.old // dbname/MANIFEST-[0-9]+ -// dbname/[0-9]+.(log|sst|ldb) +// dbname/[0-9]+.(log|sst) +// dbname/COW bool ParseFileName(const std::string& fname, uint64_t* number, FileType* type) { @@ -84,6 +151,9 @@ bool ParseFileName(const std::string& fname, if (rest == "CURRENT") { *number = 0; *type = kCurrentFile; + } else if (rest == "COW") { + *number = 0; + *type = kCacheWarming; } else if (rest == "LOCK") { *number = 0; *type = kDBLockFile; @@ -111,7 +181,7 @@ bool ParseFileName(const std::string& fname, Slice suffix = rest; if (suffix == Slice(".log")) { *type = kLogFile; - } else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) { + } else if (suffix == Slice(".sst")) { *type = kTableFile; } else if (suffix == Slice(".dbtmp")) { *type = kTempFile; @@ -141,4 +211,99 @@ Status SetCurrentFile(Env* env, const std::string& dbname, return s; } + +Status +MakeLevelDirectories(Env * env, const Options & options) +{ + Status ret_stat; + int level; + std::string dirname; + + for (level=0; levelCreateDir(dirname.c_str()); + } // for + + return(ret_stat); + +} // MakeLevelDirectories + + +bool +TestForLevelDirectories( + Env * env, + const Options & options, + Version * version) +{ + bool ret_flag, again; + int level; + std::string dirname; + + ret_flag=true; + again=true; + + // walk backwards, fault will be in higher levels if partial conversion + for (level=config::kNumLevels-1; 0<=level && again; --level) + { + again=false; + + // does directory exist + dirname=MakeDirName2(options, level, "sst"); + ret_flag=env->FileExists(dirname.c_str()); + + // do all files exist in level + if (ret_flag) + { + const std::vector & level_files(version->GetFileList(level)); + std::vector::const_iterator it; + std::string table_name; + Status s; + + for (it=level_files.begin(); level_files.end()!=it && ret_flag; ++it) + { + 
table_name=TableFileName(options, (*it)->number, level); + ret_flag=env->FileExists(table_name.c_str()); + } // for + + again=ret_flag && 0==level_files.size(); + } // if + } // for + + return(ret_flag); + +} // TestForLevelDirectories + +std::string // replacement dbname ... potentially tiered +MakeTieredDbname( + const std::string & dbname, // input ... original dbname from DBImpl constructor + Options & options) // input/output ... writable Options, tiered values changed +{ + // case for "", used with internal calls to DestroyDB + if (0==dbname.size() && 0!=options.tiered_fast_prefix.size()) + { + // do NOTHING ... options already initialized + } // if + else if (0<(int)options.tiered_slow_level && (int)options.tiered_slow_level #include +#include "leveldb/options.h" #include "leveldb/slice.h" #include "leveldb/status.h" #include "port/port.h" @@ -16,6 +17,7 @@ namespace leveldb { class Env; +class Version; enum FileType { kLogFile, @@ -24,9 +26,24 @@ enum FileType { kDescriptorFile, kCurrentFile, kTempFile, - kInfoLogFile // Either the current one, or an old one + kInfoLogFile, // Either the current one, or an old one + kCacheWarming }; +// Riak specific routine to help create sst_? subdirectory names +std::string MakeDirName2(const Options & options, + int level, const char* suffix); + +// Riak specific routine to help create sst_? subdirectories +Status MakeLevelDirectories(Env * env, const Options & options); + +// Riak specific routine to test if sst_? subdirectories exist +bool TestForLevelDirectories(Env * env, const Options & options, class Version *); + +// Riak specific routine to standardize conversion of dbname and +// Options' tiered directories (options parameter is MODIFIED) +std::string MakeTieredDbname(const std::string &dbname, Options & options_rw); + // Return the name of the log file with the specified number // in the db named by "dbname". The result will be prefixed with // "dbname". 
@@ -35,12 +52,8 @@ extern std::string LogFileName(const std::string& dbname, uint64_t number); // Return the name of the sstable with the specified number // in the db named by "dbname". The result will be prefixed with // "dbname". -extern std::string TableFileName(const std::string& dbname, uint64_t number); - -// Return the legacy file name for an sstable with the specified number -// in the db named by "dbname". The result will be prefixed with -// "dbname". -extern std::string SSTTableFileName(const std::string& dbname, uint64_t number); +extern std::string TableFileName(const Options & options, uint64_t number, + int level); // Return the name of the descriptor file for the db named by // "dbname" and the specified incarnation number. The result will be @@ -67,10 +80,21 @@ extern std::string InfoLogFileName(const std::string& dbname); // Return the name of the old info log file for "dbname". extern std::string OldInfoLogFileName(const std::string& dbname); +// Return the name of the cache object file for the db named by +// "dbname". The result will be prefixed with "dbname". +extern std::string CowFileName(const std::string& dbname); + +// Append appropriate "backup" string to input path +extern std::string BackupPath(const std::string& dbname, int backup_num); + +// update tiered_fast_prefix and tiered_slow_prefix members of +// given Options object to point to backup path +extern bool SetBackupPaths(Options & options, int backup_num); + // If filename is a leveldb file, store the type of the file in *type. // The number encoded in the filename is stored in *number. If the // filename was successfully parsed, returns true. Else return false. 
-extern bool ParseFileName(const std::string& filename, +extern bool ParseFileName(const std::string& tiered_filename, uint64_t* number, FileType* type); diff --git a/src/leveldb/db/filename_test.cc b/src/leveldb/db/filename_test.cc index a32556dea..a075f9b71 100644 --- a/src/leveldb/db/filename_test.cc +++ b/src/leveldb/db/filename_test.cc @@ -27,7 +27,6 @@ TEST(FileNameTest, Parse) { { "100.log", 100, kLogFile }, { "0.log", 0, kLogFile }, { "0.sst", 0, kTableFile }, - { "0.ldb", 0, kTableFile }, { "CURRENT", 0, kCurrentFile }, { "LOCK", 0, kDBLockFile }, { "MANIFEST-2", 2, kDescriptorFile }, @@ -71,13 +70,14 @@ TEST(FileNameTest, Parse) { for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { std::string f = errors[i]; ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; - } + }; } TEST(FileNameTest, Construction) { uint64_t number; FileType type; std::string fname; + Options options; fname = CurrentFileName("foo"); ASSERT_EQ("foo/", std::string(fname.data(), 4)); @@ -97,12 +97,40 @@ TEST(FileNameTest, Construction) { ASSERT_EQ(192, number); ASSERT_EQ(kLogFile, type); - fname = TableFileName("bar", 200); + options.tiered_fast_prefix="bar"; + options.tiered_slow_prefix="bar"; + fname = TableFileName(options, 200, 1); ASSERT_EQ("bar/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ("sst_1/", std::string(fname.substr(4,6))); + ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type)); ASSERT_EQ(200, number); ASSERT_EQ(kTableFile, type); + fname = TableFileName(options, 400, 4); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_EQ("sst_4/", std::string(fname.substr(4,6))); + ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type)); + ASSERT_EQ(400, number); + ASSERT_EQ(kTableFile, type); + + options.tiered_slow_level=4; + options.tiered_fast_prefix="fast"; + options.tiered_slow_prefix="slow"; + fname = TableFileName(options, 500, 3); + ASSERT_EQ("fast/", 
std::string(fname.data(), 5)); + ASSERT_EQ("sst_3/", std::string(fname.substr(5,6))); + ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type)); + ASSERT_EQ(500, number); + ASSERT_EQ(kTableFile, type); + + fname = TableFileName(options, 600, 4); + ASSERT_EQ("slow/", std::string(fname.data(), 5)); + ASSERT_EQ("sst_4/", std::string(fname.substr(5,6))); + ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type)); + ASSERT_EQ(600, number); + ASSERT_EQ(kTableFile, type); + + fname = DescriptorFileName("bar", 100); ASSERT_EQ("bar/", std::string(fname.data(), 4)); ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); @@ -114,6 +142,48 @@ TEST(FileNameTest, Construction) { ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ(999, number); ASSERT_EQ(kTempFile, type); + + fname = CowFileName("/what/goes/moo"); + ASSERT_EQ("/what/goes/moo/COW", fname); + + fname = BackupPath("/var/db/riak/data/leveldb/0",0); + ASSERT_EQ("/var/db/riak/data/leveldb/0/backup", fname); + + fname = BackupPath("/var/db/riak/data/leveldb/0",1); + ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.1", fname); + + fname = BackupPath("/var/db/riak/data/leveldb/0",5); + ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.5", fname); + + options.tiered_slow_level=4; + options.tiered_fast_prefix="fast"; + options.tiered_slow_prefix="slow"; + fname = SetBackupPaths(options,0); + ASSERT_EQ("fast/backup", options.tiered_fast_prefix); + ASSERT_EQ("slow/backup", options.tiered_slow_prefix); + + options.tiered_slow_level=4; + options.tiered_fast_prefix="fast"; + options.tiered_slow_prefix="slow"; + fname = SetBackupPaths(options,3); + ASSERT_EQ("fast/backup.3", options.tiered_fast_prefix); + ASSERT_EQ("slow/backup.3", options.tiered_slow_prefix); + + + options.tiered_slow_level=4; + options.tiered_fast_prefix="//mnt/fast"; + options.tiered_slow_prefix="//mnt/slow"; + fname=MakeTieredDbname("riak/data/leveldb", options); + ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname); + 
ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix); + ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix); + + // special case with no dbname given, should have no changes + fname=MakeTieredDbname("", options); + ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname); + ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix); + ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix); + } } // namespace leveldb diff --git a/src/leveldb/db/leveldbutil.cc b/src/leveldb/db/leveldbutil.cc deleted file mode 100644 index d06d64d64..000000000 --- a/src/leveldb/db/leveldbutil.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include "leveldb/dumpfile.h" -#include "leveldb/env.h" -#include "leveldb/status.h" - -namespace leveldb { -namespace { - -class StdoutPrinter : public WritableFile { - public: - virtual Status Append(const Slice& data) { - fwrite(data.data(), 1, data.size(), stdout); - return Status::OK(); - } - virtual Status Close() { return Status::OK(); } - virtual Status Flush() { return Status::OK(); } - virtual Status Sync() { return Status::OK(); } - virtual std::string GetName() const { return "[stdout]"; } -}; - -bool HandleDumpCommand(Env* env, char** files, int num) { - StdoutPrinter printer; - bool ok = true; - for (int i = 0; i < num; i++) { - Status s = DumpFile(env, files[i], &printer); - if (!s.ok()) { - fprintf(stderr, "%s\n", s.ToString().c_str()); - ok = false; - } - } - return ok; -} - -} // namespace -} // namespace leveldb - -static void Usage() { - fprintf( - stderr, - "Usage: leveldbutil command...\n" - " dump files... 
-- dump contents of specified files\n" - ); -} - -int main(int argc, char** argv) { - leveldb::Env* env = leveldb::Env::Default(); - bool ok = true; - if (argc < 2) { - Usage(); - ok = false; - } else { - std::string command = argv[1]; - if (command == "dump") { - ok = leveldb::HandleDumpCommand(env, argv+2, argc-2); - } else { - Usage(); - ok = false; - } - } - return (ok ? 0 : 1); -} diff --git a/src/leveldb/db/log_format.h b/src/leveldb/db/log_format.h index 356e69fca..2690cb978 100644 --- a/src/leveldb/db/log_format.h +++ b/src/leveldb/db/log_format.h @@ -3,7 +3,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. // // Log format information shared by reader and writer. -// See ../doc/log_format.md for more detail. +// See ../doc/log_format.txt for more detail. #ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_ #define STORAGE_LEVELDB_DB_LOG_FORMAT_H_ @@ -26,8 +26,8 @@ static const int kMaxRecordType = kLastType; static const int kBlockSize = 32768; -// Header is checksum (4 bytes), length (2 bytes), type (1 byte). -static const int kHeaderSize = 4 + 2 + 1; +// Header is checksum (4 bytes), type (1 byte), length (2 bytes). 
+static const int kHeaderSize = 4 + 1 + 2; } // namespace log } // namespace leveldb diff --git a/src/leveldb/db/log_reader.cc b/src/leveldb/db/log_reader.cc index 8b6ad136d..ddd620246 100644 --- a/src/leveldb/db/log_reader.cc +++ b/src/leveldb/db/log_reader.cc @@ -25,8 +25,7 @@ Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum, eof_(false), last_record_offset_(0), end_of_buffer_offset_(0), - initial_offset_(initial_offset), - resyncing_(initial_offset > 0) { + initial_offset_(initial_offset) { } Reader::~Reader() { @@ -73,25 +72,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) { Slice fragment; while (true) { + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); const unsigned int record_type = ReadPhysicalRecord(&fragment); - - // ReadPhysicalRecord may have only had an empty trailer remaining in its - // internal buffer. Calculate the offset of the next physical record now - // that it has returned, properly accounting for its header size. - uint64_t physical_record_offset = - end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size(); - - if (resyncing_) { - if (record_type == kMiddleType) { - continue; - } else if (record_type == kLastType) { - resyncing_ = false; - continue; - } else { - resyncing_ = false; - } - } - switch (record_type) { case kFullType: if (in_fragmented_record) { @@ -151,9 +133,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) { case kEof: if (in_fragmented_record) { - // This can be caused by the writer dying immediately after - // writing a physical record but before completing the next; don't - // treat it as a corruption, just ignore the entire logical record. 
+ ReportCorruption(scratch->size(), "partial record without end(3)"); scratch->clear(); } return false; @@ -185,20 +165,20 @@ uint64_t Reader::LastRecordOffset() { return last_record_offset_; } -void Reader::ReportCorruption(uint64_t bytes, const char* reason) { - ReportDrop(bytes, Status::Corruption(reason, file_->GetName())); +void Reader::ReportCorruption(size_t bytes, const char* reason) { + ReportDrop(bytes, Status::Corruption(reason)); } -void Reader::ReportDrop(uint64_t bytes, const Status& reason) { +void Reader::ReportDrop(size_t bytes, const Status& reason) { if (reporter_ != NULL && end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) { - reporter_->Corruption(static_cast(bytes), reason); + reporter_->Corruption(bytes, reason); } } unsigned int Reader::ReadPhysicalRecord(Slice* result) { while (true) { - if (buffer_.size() < kHeaderSize) { + if (buffer_.size() < (size_t)kHeaderSize) { if (!eof_) { // Last read was a full read, so this is a trailer to skip buffer_.clear(); @@ -209,16 +189,17 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { ReportDrop(kBlockSize, status); eof_ = true; return kEof; - } else if (buffer_.size() < kBlockSize) { + } else if (buffer_.size() < (size_t)kBlockSize) { eof_ = true; } continue; + } else if (buffer_.size() == 0) { + // End of file + return kEof; } else { - // Note that if buffer_ is non-empty, we have a truncated header at the - // end of the file, which can be caused by the writer crashing in the - // middle of writing the header. Instead of considering this an error, - // just report EOF. 
+ size_t drop_size = buffer_.size(); buffer_.clear(); + ReportCorruption(drop_size, "truncated record at end of file"); return kEof; } } @@ -232,14 +213,8 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { if (kHeaderSize + length > buffer_.size()) { size_t drop_size = buffer_.size(); buffer_.clear(); - if (!eof_) { - ReportCorruption(drop_size, "bad record length"); - return kBadRecord; - } - // If the end of the file has been reached without reading |length| bytes - // of payload, assume the writer died in the middle of writing the record. - // Don't report a corruption. - return kEof; + ReportCorruption(drop_size, "bad record length"); + return kBadRecord; } if (type == kZeroType && length == 0) { diff --git a/src/leveldb/db/log_reader.h b/src/leveldb/db/log_reader.h index 8389d61f8..82d4bee68 100644 --- a/src/leveldb/db/log_reader.h +++ b/src/leveldb/db/log_reader.h @@ -73,11 +73,6 @@ class Reader { // Offset at which to start looking for the first record to return uint64_t const initial_offset_; - // True if we are resynchronizing after a seek (initial_offset_ > 0). In - // particular, a run of kMiddleType and kLastType records can be silently - // skipped in this mode - bool resyncing_; - // Extend record types with the following special values enum { kEof = kMaxRecordType + 1, @@ -99,8 +94,8 @@ class Reader { // Reports dropped bytes to the reporter. // buffer_ must be updated to remove the dropped bytes prior to invocation. 
- void ReportCorruption(uint64_t bytes, const char* reason); - void ReportDrop(uint64_t bytes, const Status& reason); + void ReportCorruption(size_t bytes, const char* reason); + void ReportDrop(size_t bytes, const Status& reason); // No copying allowed Reader(const Reader&); diff --git a/src/leveldb/db/log_test.cc b/src/leveldb/db/log_test.cc index 48a592865..4c5cf8757 100644 --- a/src/leveldb/db/log_test.cc +++ b/src/leveldb/db/log_test.cc @@ -79,7 +79,7 @@ class LogTest { virtual Status Skip(uint64_t n) { if (n > contents_.size()) { contents_.clear(); - return Status::NotFound("in-memory file skipped past end"); + return Status::NotFound("in-memory file skipepd past end"); } contents_.remove_prefix(n); @@ -104,34 +104,23 @@ class LogTest { StringSource source_; ReportCollector report_; bool reading_; - Writer* writer_; - Reader* reader_; + Writer writer_; + Reader reader_; // Record metadata for testing initial offset functionality static size_t initial_offset_record_sizes_[]; static uint64_t initial_offset_last_record_offsets_[]; - static int num_initial_offset_records_; public: LogTest() : reading_(false), - writer_(new Writer(&dest_)), - reader_(new Reader(&source_, &report_, true/*checksum*/, - 0/*initial_offset*/)) { - } - - ~LogTest() { - delete writer_; - delete reader_; - } - - void ReopenForAppend() { - delete writer_; - writer_ = new Writer(&dest_, dest_.contents_.size()); + writer_(&dest_), + reader_(&source_, &report_, true/*checksum*/, + 0/*initial_offset*/) { } void Write(const std::string& msg) { ASSERT_TRUE(!reading_) << "Write() after starting to read"; - writer_->AddRecord(Slice(msg)); + writer_.AddRecord(Slice(msg)); } size_t WrittenBytes() const { @@ -145,7 +134,7 @@ class LogTest { } std::string scratch; Slice record; - if (reader_->ReadRecord(&record, &scratch)) { + if (reader_.ReadRecord(&record, &scratch)) { return record.ToString(); } else { return "EOF"; @@ -193,18 +182,13 @@ class LogTest { } void WriteInitialOffsetLog() { - for (int i 
= 0; i < num_initial_offset_records_; i++) { + for (int i = 0; i < 4; i++) { std::string record(initial_offset_record_sizes_[i], static_cast('a' + i)); Write(record); } } - void StartReadingAt(uint64_t initial_offset) { - delete reader_; - reader_ = new Reader(&source_, &report_, true/*checksum*/, initial_offset); - } - void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { WriteInitialOffsetLog(); reading_ = true; @@ -224,48 +208,32 @@ class LogTest { source_.contents_ = Slice(dest_.contents_); Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/, initial_offset); - - // Read all records from expected_record_offset through the last one. - ASSERT_LT(expected_record_offset, num_initial_offset_records_); - for (; expected_record_offset < num_initial_offset_records_; - ++expected_record_offset) { - Slice record; - std::string scratch; - ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); - ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], - record.size()); - ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], - offset_reader->LastRecordOffset()); - ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); - } + Slice record; + std::string scratch; + ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); + ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], + record.size()); + ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], + offset_reader->LastRecordOffset()); + ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); delete offset_reader; } + }; size_t LogTest::initial_offset_record_sizes_[] = {10000, // Two sizable records in first block 10000, 2 * log::kBlockSize - 1000, // Span three blocks - 1, - 13716, // Consume all but two bytes of block 3. - log::kBlockSize - kHeaderSize, // Consume the entirety of block 4. 
- }; + 1}; uint64_t LogTest::initial_offset_last_record_offsets_[] = {0, kHeaderSize + 10000, 2 * (kHeaderSize + 10000), 2 * (kHeaderSize + 10000) + - (2 * log::kBlockSize - 1000) + 3 * kHeaderSize, - 2 * (kHeaderSize + 10000) + - (2 * log::kBlockSize - 1000) + 3 * kHeaderSize - + kHeaderSize + 1, - 3 * log::kBlockSize, - }; + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize}; -// LogTest::initial_offset_last_record_offsets_ must be defined before this. -int LogTest::num_initial_offset_records_ = - sizeof(LogTest::initial_offset_last_record_offsets_)/sizeof(uint64_t); TEST(LogTest, Empty) { ASSERT_EQ("EOF", Read()); @@ -350,15 +318,6 @@ TEST(LogTest, AlignedEof) { ASSERT_EQ("EOF", Read()); } -TEST(LogTest, OpenForAppend) { - Write("hello"); - ReopenForAppend(); - Write("world"); - ASSERT_EQ("hello", Read()); - ASSERT_EQ("world", Read()); - ASSERT_EQ("EOF", Read()); -} - TEST(LogTest, RandomRead) { const int N = 500; Random write_rnd(301); @@ -392,32 +351,20 @@ TEST(LogTest, BadRecordType) { ASSERT_EQ("OK", MatchError("unknown record type")); } -TEST(LogTest, TruncatedTrailingRecordIsIgnored) { +TEST(LogTest, TruncatedTrailingRecord) { Write("foo"); ShrinkSize(4); // Drop all payload as well as a header byte ASSERT_EQ("EOF", Read()); - // Truncated last record is ignored, not treated as an error. - ASSERT_EQ(0, DroppedBytes()); - ASSERT_EQ("", ReportMessage()); + ASSERT_EQ(kHeaderSize - 1, DroppedBytes()); + ASSERT_EQ("OK", MatchError("truncated record at end of file")); } TEST(LogTest, BadLength) { - const int kPayloadSize = kBlockSize - kHeaderSize; - Write(BigString("bar", kPayloadSize)); - Write("foo"); - // Least significant size byte is stored in header[4]. 
- IncrementByte(4, 1); - ASSERT_EQ("foo", Read()); - ASSERT_EQ(kBlockSize, DroppedBytes()); - ASSERT_EQ("OK", MatchError("bad record length")); -} - -TEST(LogTest, BadLengthAtEndIsIgnored) { Write("foo"); ShrinkSize(1); ASSERT_EQ("EOF", Read()); - ASSERT_EQ(0, DroppedBytes()); - ASSERT_EQ("", ReportMessage()); + ASSERT_EQ(kHeaderSize + 2, DroppedBytes()); + ASSERT_EQ("OK", MatchError("bad record length")); } TEST(LogTest, ChecksumMismatch) { @@ -468,40 +415,6 @@ TEST(LogTest, UnexpectedFirstType) { ASSERT_EQ("OK", MatchError("partial record without end")); } -TEST(LogTest, MissingLastIsIgnored) { - Write(BigString("bar", kBlockSize)); - // Remove the LAST block, including header. - ShrinkSize(14); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ("", ReportMessage()); - ASSERT_EQ(0, DroppedBytes()); -} - -TEST(LogTest, PartialLastIsIgnored) { - Write(BigString("bar", kBlockSize)); - // Cause a bad record length in the LAST block. - ShrinkSize(1); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ("", ReportMessage()); - ASSERT_EQ(0, DroppedBytes()); -} - -TEST(LogTest, SkipIntoMultiRecord) { - // Consider a fragmented record: - // first(R1), middle(R1), last(R1), first(R2) - // If initial_offset points to a record after first(R1) but before first(R2) - // incomplete fragment errors are not actual errors, and must be suppressed - // until a new first or full record is encountered. 
- Write(BigString("foo", 3*kBlockSize)); - Write("correct"); - StartReadingAt(kBlockSize); - - ASSERT_EQ("correct", Read()); - ASSERT_EQ("", ReportMessage()); - ASSERT_EQ(0, DroppedBytes()); - ASSERT_EQ("EOF", Read()); -} - TEST(LogTest, ErrorJoinsRecords) { // Consider two fragmented records: // first(R1) last(R1) first(R2) last(R2) @@ -520,7 +433,7 @@ TEST(LogTest, ErrorJoinsRecords) { ASSERT_EQ("correct", Read()); ASSERT_EQ("EOF", Read()); - const size_t dropped = DroppedBytes(); + const int dropped = DroppedBytes(); ASSERT_LE(dropped, 2*kBlockSize + 100); ASSERT_GE(dropped, 2*kBlockSize); } @@ -571,10 +484,6 @@ TEST(LogTest, ReadFourthStart) { 3); } -TEST(LogTest, ReadInitialOffsetIntoBlockPadding) { - CheckInitialOffsetRecord(3 * log::kBlockSize - 3, 5); -} - TEST(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); } diff --git a/src/leveldb/db/log_writer.cc b/src/leveldb/db/log_writer.cc index 74a03270d..18c7bd837 100644 --- a/src/leveldb/db/log_writer.cc +++ b/src/leveldb/db/log_writer.cc @@ -12,22 +12,13 @@ namespace leveldb { namespace log { -static void InitTypeCrc(uint32_t* type_crc) { - for (int i = 0; i <= kMaxRecordType; i++) { - char t = static_cast(i); - type_crc[i] = crc32c::Value(&t, 1); - } -} - Writer::Writer(WritableFile* dest) : dest_(dest), block_offset_(0) { - InitTypeCrc(type_crc_); -} - -Writer::Writer(WritableFile* dest, uint64_t dest_length) - : dest_(dest), block_offset_(dest_length % kBlockSize) { - InitTypeCrc(type_crc_); + for (int i = 0; i <= kMaxRecordType; i++) { + char t = static_cast(i); + type_crc_[i] = crc32c::Value(&t, 1); + } } Writer::~Writer() { @@ -83,7 +74,7 @@ Status Writer::AddRecord(const Slice& slice) { Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { assert(n <= 0xffff); // Must fit in two bytes - assert(block_offset_ + kHeaderSize + n <= kBlockSize); + assert(block_offset_ + kHeaderSize + (int)n <= kBlockSize); // Format the header char buf[kHeaderSize]; diff --git 
a/src/leveldb/db/log_writer.h b/src/leveldb/db/log_writer.h index 9e7cc4705..c6ad7a4ff 100644 --- a/src/leveldb/db/log_writer.h +++ b/src/leveldb/db/log_writer.h @@ -9,11 +9,10 @@ #include "db/log_format.h" #include "leveldb/slice.h" #include "leveldb/status.h" +#include "leveldb/env.h" namespace leveldb { -class WritableFile; - namespace log { class Writer { @@ -22,16 +21,12 @@ class Writer { // "*dest" must be initially empty. // "*dest" must remain live while this Writer is in use. explicit Writer(WritableFile* dest); - - // Create a writer that will append data to "*dest". - // "*dest" must have initial length "dest_length". - // "*dest" must remain live while this Writer is in use. - Writer(WritableFile* dest, uint64_t dest_length); - ~Writer(); Status AddRecord(const Slice& slice); + void Close() {delete dest_; dest_=NULL;}; + private: WritableFile* dest_; int block_offset_; // Current offset in block diff --git a/src/leveldb/db/memtable.cc b/src/leveldb/db/memtable.cc index 287afdbdc..965c9d9c0 100644 --- a/src/leveldb/db/memtable.cc +++ b/src/leveldb/db/memtable.cc @@ -6,6 +6,7 @@ #include "db/dbformat.h" #include "leveldb/comparator.h" #include "leveldb/env.h" +#include "leveldb/expiry.h" #include "leveldb/iterator.h" #include "util/coding.h" @@ -63,6 +64,8 @@ class MemTableIterator: public Iterator { Slice key_slice = GetLengthPrefixedSlice(iter_.key()); return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); } + virtual KeyMetaData & keymetadata() const + {MemTable::DecodeKeyMetaData(iter_.key(), keymetadata_); return(keymetadata_);}; virtual Status status() const { return Status::OK(); } @@ -81,7 +84,8 @@ Iterator* MemTable::NewIterator() { void MemTable::Add(SequenceNumber s, ValueType type, const Slice& key, - const Slice& value) { + const Slice& value, + const ExpiryTimeMicros & expiry) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() // key bytes : char[internal_key.size()] @@ -89,7 +93,7 @@ void 
MemTable::Add(SequenceNumber s, ValueType type, // value bytes : char[value.size()] size_t key_size = key.size(); size_t val_size = value.size(); - size_t internal_key_size = key_size + 8; + size_t internal_key_size = key_size + KeySuffixSize(type); const size_t encoded_len = VarintLength(internal_key_size) + internal_key_size + VarintLength(val_size) + val_size; @@ -97,15 +101,22 @@ void MemTable::Add(SequenceNumber s, ValueType type, char* p = EncodeVarint32(buf, internal_key_size); memcpy(p, key.data(), key_size); p += key_size; + if (IsExpiryKey(type)) + { + EncodeFixed64(p, expiry); + p+=8; + } EncodeFixed64(p, (s << 8) | type); p += 8; p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); - assert(p + val_size == buf + encoded_len); + assert((size_t)((p + val_size) - buf) == encoded_len); table_.Insert(buf); } -bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) { +bool MemTable::Get(const LookupKey& key, Value* value, Status* s, + const Options * options) { + bool ret_flag(false); Slice memkey = key.memtable_key(); Table::Iterator iter(&table_); iter.Seek(memkey.data()); @@ -113,6 +124,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) { // entry format is: // klength varint32 // userkey char[klength] + // optional uint64 // tag uint64 // vlength varint32 // value char[vlength] @@ -122,24 +134,66 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) { const char* entry = iter.key(); uint32_t key_length; const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length); + Slice internal_key(key_ptr, key_length); if (comparator_.comparator.user_comparator()->Compare( - Slice(key_ptr, key_length - 8), + ExtractUserKey(internal_key), key.user_key()) == 0) { // Correct user key - const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); - switch (static_cast(tag & 0xff)) { - case kTypeValue: { + KeyMetaData meta; + DecodeKeyMetaData(entry, meta); + + switch (meta.m_Type) { + case 
kTypeValueWriteTime: + case kTypeValueExplicitExpiry: + { + bool expired=false; + if (NULL!=options && options->ExpiryActivated()) + expired=options->expiry_module->MemTableCallback(internal_key); + if (expired) + { + // like kTypeDeletion + *s = Status::NotFound(Slice()); + ret_flag=true; + break; + } // if + //otherwise fall into kTypeValue code + } // case + + case kTypeValue: + { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); value->assign(v.data(), v.size()); - return true; + ret_flag=true; + break; } case kTypeDeletion: *s = Status::NotFound(Slice()); - return true; - } + ret_flag=true; + break; + } // switch + + // only unpack metadata if requested + if (key.WantsKeyMetaData()) + key.SetKeyMetaData(meta); } } - return false; + return ret_flag; } +// this is a static function +void MemTable::DecodeKeyMetaData( + const char * key, + KeyMetaData & meta) +{ + Slice key_slice = GetLengthPrefixedSlice(key); + + meta.m_Type=ExtractValueType(key_slice); + meta.m_Sequence=ExtractSequenceNumber(key_slice); + if (IsExpiryKey(meta.m_Type)) + meta.m_Expiry=ExtractExpiry(key_slice); + else + meta.m_Expiry=0; + +} // DecodeKeyMetaData + } // namespace leveldb diff --git a/src/leveldb/db/memtable.h b/src/leveldb/db/memtable.h index 9f41567cd..ff0e98220 100644 --- a/src/leveldb/db/memtable.h +++ b/src/leveldb/db/memtable.h @@ -24,10 +24,10 @@ class MemTable { explicit MemTable(const InternalKeyComparator& comparator); // Increase reference count. - void Ref() { ++refs_; } + void Ref() volatile { ++refs_; } // Drop reference count. Delete if no more references exist. - void Unref() { + void Unref() volatile { --refs_; assert(refs_ >= 0); if (refs_ <= 0) { @@ -36,7 +36,10 @@ class MemTable { } // Returns an estimate of the number of bytes of data in use by this - // data structure. It is safe to call when MemTable is being modified. + // data structure. + // + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. 
size_t ApproximateMemoryUsage(); // Return an iterator that yields the contents of the memtable. @@ -52,13 +55,17 @@ class MemTable { // Typically value will be empty if type==kTypeDeletion. void Add(SequenceNumber seq, ValueType type, const Slice& key, - const Slice& value); + const Slice& value, + const ExpiryTimeMicros& expiry=0); // If memtable contains a value for key, store it in *value and return true. // If memtable contains a deletion for key, store a NotFound() error // in *status and return true. // Else, return false. - bool Get(const LookupKey& key, std::string* value, Status* s); + bool Get(const LookupKey& key, Value* value, Status* s, const Options * options); + + // parse keymetadata from skiplist key string + static void DecodeKeyMetaData(const char * key, KeyMetaData & meta); private: ~MemTable(); // Private since only Unref() should be used to delete it @@ -69,7 +76,7 @@ class MemTable { int operator()(const char* a, const char* b) const; }; friend class MemTableIterator; - friend class MemTableBackwardIterator; + friend class MemTableBackwardIterator; // does not exist typedef SkipList Table; diff --git a/src/leveldb/db/penalty_test.cc b/src/leveldb/db/penalty_test.cc new file mode 100644 index 000000000..fc28ae887 --- /dev/null +++ b/src/leveldb/db/penalty_test.cc @@ -0,0 +1,248 @@ +// ------------------------------------------------------------------- +// +// penalty_test.cc +// +// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + + +#include "util/testharness.h" +#include "util/testutil.h" + +#include "leveldb/comparator.h" + +#include "db/version_set.h" + +/** + * Execution routine + */ +int main(int argc, char** argv) +{ + return leveldb::test::RunAllTests(); +} + + +namespace leveldb { + +class TestVersion : public Version +{ +public: + TestVersion() + : Version(NULL) + { + int loop; + + for (loop=0; loop & GetFileList(int level) const + { + m_FalseVector.clear(); + m_FalseVector.push_back(&m_FalseFile[level]); + return(m_FalseVector); + }; + + mutable std::vector m_FalseVector; + mutable FileMetaData m_FalseFile[config::kNumLevels]; + + size_t m_LevelFileCount[config::kNumLevels]; + +}; // class TestVersion + +/** + * Wrapper class for tests. Holds working variables + * and helper functions. + */ +class PenaltyTester : public VersionSet +{ +public: + PenaltyTester() + : m_IntCompare(m_Options.comparator), VersionSet("", &m_Options, NULL, &m_IntCompare) + { + }; + + ~PenaltyTester() + { + }; + + Options m_Options; + InternalKeyComparator m_IntCompare; + +}; // class PenaltyTester + + + /******************* + * Form note: + * using ASSERT_TRUE(0==version.WritePenalty()); + * instead of ASSERT_EQ / ASSERT_NE because WritePenalty + * returns a volatile int, which older compilers believe is + * not an equivalent type to a constant. RedHat 5, Solaris, + * and SmartOS were giving grief. 
+ *******************/ + +/** + * Debug 1 + */ +#if 0 +TEST(PenaltyTester, Debug1) +{ + TestVersion version; + int penalty; + + m_Options.write_buffer_size=46416847; + + version.m_FalseFile[2].file_size=1075676398; + version.m_LevelFileCount[1]=1; + + UpdatePenalty(&version); + + ASSERT_TRUE(0==version.WritePenalty()); + +} // test Debug1 +#endif + + +/** + * No penalty scenarios + */ +TEST(PenaltyTester, NoPenalty) +{ + TestVersion version; + int level; + + m_Options.write_buffer_size=46416847; + + // nothing + UpdatePenalty(&version); + ASSERT_TRUE(0==version.WritePenalty()); + + /** + * Level 0 + * (overlapped level, penalty is count based) + */ + // no penalty + version.m_LevelFileCount[0]=config::kL0_CompactionTrigger; + UpdatePenalty(&version); + ASSERT_TRUE(0==version.WritePenalty()); + + version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger; + UpdatePenalty(&version); + ASSERT_TRUE(0==version.WritePenalty()); + +#if 0 // needs rewrite to be time based + // threshold reached ... some penalty + version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger+1; + UpdatePenalty(&version); + ASSERT_TRUE(0!=version.WritePenalty()); + + // clean up + version.m_LevelFileCount[0]=0; + + /** + * Level 1 + * (overlapped level, penalty is count based) + */ + // no penalty + version.m_LevelFileCount[1]=config::kL0_CompactionTrigger; + UpdatePenalty(&version); + ASSERT_TRUE(0==version.WritePenalty()); + + version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger; + UpdatePenalty(&version); + ASSERT_TRUE(0==version.WritePenalty()); + + // threshold reached ... 
some penalty + version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger+1; + UpdatePenalty(&version); + ASSERT_TRUE(0!=version.WritePenalty()); + + // clean up + version.m_LevelFileCount[1]=0; + + /** + * Level 2 + * (landing level, penalty size based) + */ + // no penalty + version.m_FalseFile[2].file_size=0; + UpdatePenalty(&version); + ASSERT_TRUE(0==version.WritePenalty()); + + version.m_FalseFile[2].file_size=VersionSet::DesiredBytesForLevel(2); + UpdatePenalty(&version); + ASSERT_TRUE(0==version.WritePenalty()); + + version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1; + UpdatePenalty(&version); + ASSERT_TRUE(0==version.WritePenalty()); + + version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2); + UpdatePenalty(&version); + ASSERT_TRUE(0!=version.WritePenalty()); + + // interaction rule with level 1 + version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1; + version.m_LevelFileCount[1]=config::kL0_CompactionTrigger/2; + UpdatePenalty(&version); + ASSERT_TRUE(0!=version.WritePenalty()); + + // clean up + version.m_LevelFileCount[1]=0; + version.m_FalseFile[2].file_size=0; + + /** + * Level 3+ + * (landing level, penalty size based) + */ + for (level=3; level(db_); } - Env* env() const { return env_; } - - bool CanAppend() { - WritableFile* tmp; - Status s = env_->NewAppendableFile(CurrentFileName(dbname_), &tmp); - delete tmp; - if (s.IsNotSupportedError()) { - return false; - } else { - return true; - } - } - - void Close() { - delete db_; - db_ = NULL; - } - - void Open(Options* options = NULL) { - Close(); - Options opts; - if (options != NULL) { - opts = *options; - } else { - opts.reuse_logs = true; // TODO(sanjay): test both ways - opts.create_if_missing = true; - } - if (opts.env == NULL) { - opts.env = env_; - } - ASSERT_OK(DB::Open(opts, dbname_, &db_)); - ASSERT_EQ(1, NumLogs()); - } - - Status Put(const std::string& k, const std::string& v) { - return db_->Put(WriteOptions(), k, v); - } - - std::string 
Get(const std::string& k, const Snapshot* snapshot = NULL) { - std::string result; - Status s = db_->Get(ReadOptions(), k, &result); - if (s.IsNotFound()) { - result = "NOT_FOUND"; - } else if (!s.ok()) { - result = s.ToString(); - } - return result; - } - - std::string ManifestFileName() { - std::string current; - ASSERT_OK(ReadFileToString(env_, CurrentFileName(dbname_), ¤t)); - size_t len = current.size(); - if (len > 0 && current[len-1] == '\n') { - current.resize(len - 1); - } - return dbname_ + "/" + current; - } - - std::string LogName(uint64_t number) { - return LogFileName(dbname_, number); - } - - size_t DeleteLogFiles() { - std::vector logs = GetFiles(kLogFile); - for (size_t i = 0; i < logs.size(); i++) { - ASSERT_OK(env_->DeleteFile(LogName(logs[i]))) << LogName(logs[i]); - } - return logs.size(); - } - - uint64_t FirstLogFile() { - return GetFiles(kLogFile)[0]; - } - - std::vector GetFiles(FileType t) { - std::vector filenames; - ASSERT_OK(env_->GetChildren(dbname_, &filenames)); - std::vector result; - for (size_t i = 0; i < filenames.size(); i++) { - uint64_t number; - FileType type; - if (ParseFileName(filenames[i], &number, &type) && type == t) { - result.push_back(number); - } - } - return result; - } - - int NumLogs() { - return GetFiles(kLogFile).size(); - } - - int NumTables() { - return GetFiles(kTableFile).size(); - } - - uint64_t FileSize(const std::string& fname) { - uint64_t result; - ASSERT_OK(env_->GetFileSize(fname, &result)) << fname; - return result; - } - - void CompactMemTable() { - dbfull()->TEST_CompactMemTable(); - } - - // Directly construct a log file that sets key to val. 
- void MakeLogFile(uint64_t lognum, SequenceNumber seq, Slice key, Slice val) { - std::string fname = LogFileName(dbname_, lognum); - WritableFile* file; - ASSERT_OK(env_->NewWritableFile(fname, &file)); - log::Writer writer(file); - WriteBatch batch; - batch.Put(key, val); - WriteBatchInternal::SetSequence(&batch, seq); - ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch))); - ASSERT_OK(file->Flush()); - delete file; - } - - private: - std::string dbname_; - Env* env_; - DB* db_; -}; - -TEST(RecoveryTest, ManifestReused) { - if (!CanAppend()) { - fprintf(stderr, "skipping test because env does not support appending\n"); - return; - } - ASSERT_OK(Put("foo", "bar")); - Close(); - std::string old_manifest = ManifestFileName(); - Open(); - ASSERT_EQ(old_manifest, ManifestFileName()); - ASSERT_EQ("bar", Get("foo")); - Open(); - ASSERT_EQ(old_manifest, ManifestFileName()); - ASSERT_EQ("bar", Get("foo")); -} - -TEST(RecoveryTest, LargeManifestCompacted) { - if (!CanAppend()) { - fprintf(stderr, "skipping test because env does not support appending\n"); - return; - } - ASSERT_OK(Put("foo", "bar")); - Close(); - std::string old_manifest = ManifestFileName(); - - // Pad with zeroes to make manifest file very big. 
- { - uint64_t len = FileSize(old_manifest); - WritableFile* file; - ASSERT_OK(env()->NewAppendableFile(old_manifest, &file)); - std::string zeroes(3*1048576 - static_cast(len), 0); - ASSERT_OK(file->Append(zeroes)); - ASSERT_OK(file->Flush()); - delete file; - } - - Open(); - std::string new_manifest = ManifestFileName(); - ASSERT_NE(old_manifest, new_manifest); - ASSERT_GT(10000, FileSize(new_manifest)); - ASSERT_EQ("bar", Get("foo")); - - Open(); - ASSERT_EQ(new_manifest, ManifestFileName()); - ASSERT_EQ("bar", Get("foo")); -} - -TEST(RecoveryTest, NoLogFiles) { - ASSERT_OK(Put("foo", "bar")); - ASSERT_EQ(1, DeleteLogFiles()); - Open(); - ASSERT_EQ("NOT_FOUND", Get("foo")); - Open(); - ASSERT_EQ("NOT_FOUND", Get("foo")); -} - -TEST(RecoveryTest, LogFileReuse) { - if (!CanAppend()) { - fprintf(stderr, "skipping test because env does not support appending\n"); - return; - } - for (int i = 0; i < 2; i++) { - ASSERT_OK(Put("foo", "bar")); - if (i == 0) { - // Compact to ensure current log is empty - CompactMemTable(); - } - Close(); - ASSERT_EQ(1, NumLogs()); - uint64_t number = FirstLogFile(); - if (i == 0) { - ASSERT_EQ(0, FileSize(LogName(number))); - } else { - ASSERT_LT(0, FileSize(LogName(number))); - } - Open(); - ASSERT_EQ(1, NumLogs()); - ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file"; - ASSERT_EQ("bar", Get("foo")); - Open(); - ASSERT_EQ(1, NumLogs()); - ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file"; - ASSERT_EQ("bar", Get("foo")); - } -} - -TEST(RecoveryTest, MultipleMemTables) { - // Make a large log. - const int kNum = 1000; - for (int i = 0; i < kNum; i++) { - char buf[100]; - snprintf(buf, sizeof(buf), "%050d", i); - ASSERT_OK(Put(buf, buf)); - } - ASSERT_EQ(0, NumTables()); - Close(); - ASSERT_EQ(0, NumTables()); - ASSERT_EQ(1, NumLogs()); - uint64_t old_log_file = FirstLogFile(); - - // Force creation of multiple memtables by reducing the write buffer size. 
- Options opt; - opt.reuse_logs = true; - opt.write_buffer_size = (kNum*100) / 2; - Open(&opt); - ASSERT_LE(2, NumTables()); - ASSERT_EQ(1, NumLogs()); - ASSERT_NE(old_log_file, FirstLogFile()) << "must not reuse log"; - for (int i = 0; i < kNum; i++) { - char buf[100]; - snprintf(buf, sizeof(buf), "%050d", i); - ASSERT_EQ(buf, Get(buf)); - } -} - -TEST(RecoveryTest, MultipleLogFiles) { - ASSERT_OK(Put("foo", "bar")); - Close(); - ASSERT_EQ(1, NumLogs()); - - // Make a bunch of uncompacted log files. - uint64_t old_log = FirstLogFile(); - MakeLogFile(old_log+1, 1000, "hello", "world"); - MakeLogFile(old_log+2, 1001, "hi", "there"); - MakeLogFile(old_log+3, 1002, "foo", "bar2"); - - // Recover and check that all log files were processed. - Open(); - ASSERT_LE(1, NumTables()); - ASSERT_EQ(1, NumLogs()); - uint64_t new_log = FirstLogFile(); - ASSERT_LE(old_log+3, new_log); - ASSERT_EQ("bar2", Get("foo")); - ASSERT_EQ("world", Get("hello")); - ASSERT_EQ("there", Get("hi")); - - // Test that previous recovery produced recoverable state. - Open(); - ASSERT_LE(1, NumTables()); - ASSERT_EQ(1, NumLogs()); - if (CanAppend()) { - ASSERT_EQ(new_log, FirstLogFile()); - } - ASSERT_EQ("bar2", Get("foo")); - ASSERT_EQ("world", Get("hello")); - ASSERT_EQ("there", Get("hi")); - - // Check that introducing an older log file does not cause it to be re-read. 
- Close(); - MakeLogFile(old_log+1, 2000, "hello", "stale write"); - Open(); - ASSERT_LE(1, NumTables()); - ASSERT_EQ(1, NumLogs()); - if (CanAppend()) { - ASSERT_EQ(new_log, FirstLogFile()); - } - ASSERT_EQ("bar2", Get("foo")); - ASSERT_EQ("world", Get("hello")); - ASSERT_EQ("there", Get("hi")); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/db/repair.cc b/src/leveldb/db/repair.cc index 7281e3d34..b1c1bc2c2 100644 --- a/src/leveldb/db/repair.cc +++ b/src/leveldb/db/repair.cc @@ -45,49 +45,113 @@ namespace { class Repairer { public: Repairer(const std::string& dbname, const Options& options) - : dbname_(dbname), + : double_cache_(options), + options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options, double_cache_.GetBlockCache())), + org_options_(options), + dbname_(options_.tiered_fast_prefix), + org_dbname_(dbname), env_(options.env), icmp_(options.comparator), ipolicy_(options.filter_policy), - options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), owns_info_log_(options_.info_log != options.info_log), - owns_cache_(options_.block_cache != options.block_cache), - next_file_number_(1) { + db_lock_(NULL), + next_file_number_(1) + { // TableCache can be small since we expect each table to be opened once. 
- table_cache_ = new TableCache(dbname_, &options_, 10); + table_cache_ = new TableCache(dbname_, &options_, double_cache_.GetFileCache(), double_cache_); + } ~Repairer() { - delete table_cache_; if (owns_info_log_) { delete options_.info_log; } - if (owns_cache_) { - delete options_.block_cache; - } +// if (owns_cache_) { +// delete options_.block_cache; +// } + + // must remove second ref counter that keeps overlapped files locked + // table cache + bool is_overlap; + for (int level = 0; level < config::kNumLevels; level++) { + { + is_overlap=(level < leveldb::config::kNumOverlapLevels); + for (size_t i = 0; i < table_numbers_[level].size(); i++) { + table_cache_->Evict(table_numbers_[level][i], is_overlap); + } // for + } // if + } // for + + delete table_cache_; } Status Run() { - Status status = FindFiles(); + Status status; + + status = env_->LockFile(LockFileName(dbname_), &db_lock_); + + if (status.ok()) + status = MakeLevelDirectories(env_, options_); + if (status.ok()) { - ConvertLogFilesToTables(); - ExtractMetaData(); - status = WriteDescriptor(); - } - if (status.ok()) { - unsigned long long bytes = 0; - for (size_t i = 0; i < tables_.size(); i++) { - bytes += tables_[i].meta.file_size; + status = FindFiles(); + if (status.ok()) { + ConvertLogFilesToTables(); + ExtractMetaData(); + status = WriteDescriptor(); + } + if (status.ok()) { + unsigned long long bytes = 0; + unsigned long long files = 0; + + // calculate size for log information + for (int level=0; level * table_ptr; + std::vector::const_iterator i; + + table_ptr=&tables_[level]; + files+=table_ptr->size(); + + for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) { + bytes += i->meta.file_size; + } + } // for + + Log(options_.info_log, + "**** Repaired leveldb %s; " + "recovered %d files; %llu bytes. " + "Some data may have been lost. 
" + "****", + dbname_.c_str(), + static_cast(files), + bytes); + } + if (db_lock_ != NULL) { + env_->UnlockFile(db_lock_); } - Log(options_.info_log, - "**** Repaired leveldb %s; " - "recovered %d files; %llu bytes. " - "Some data may have been lost. " - "****", - dbname_.c_str(), - static_cast(tables_.size()), - bytes); } + + // perform Riak specific scan for overlapping .sst files + // within a level + if (status.ok()) + { + leveldb::DB * db_ptr; + Options options; + + db_ptr=NULL; + options=org_options_; +// options.block_cache=NULL; // not reusing for fear of edge cases + options.is_repair=true; + options.error_if_exists=false; + status=leveldb::DB::Open(options, org_dbname_, &db_ptr); + + if (status.ok()) + status=db_ptr->VerifyLevels(); + + delete db_ptr; + + } // if return status; } @@ -97,34 +161,36 @@ class Repairer { SequenceNumber max_sequence; }; - std::string const dbname_; + DoubleCache double_cache_; + Options const options_, org_options_; + std::string const dbname_, org_dbname_; Env* const env_; InternalKeyComparator const icmp_; InternalFilterPolicy const ipolicy_; - Options const options_; bool owns_info_log_; - bool owns_cache_; + FileLock* db_lock_; TableCache* table_cache_; VersionEdit edit_; std::vector manifests_; - std::vector table_numbers_; + std::vector table_numbers_[config::kNumLevels]; std::vector logs_; - std::vector tables_; + std::vector tables_[config::kNumLevels]; uint64_t next_file_number_; - Status FindFiles() { + Status FindFiles() + { std::vector filenames; + uint64_t number; + FileType type; + int level; + + // base directory Status status = env_->GetChildren(dbname_, &filenames); if (!status.ok()) { return status; } - if (filenames.empty()) { - return Status::IOError(dbname_, "repair found no files"); - } - uint64_t number; - FileType type; for (size_t i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type)) { if (type == kDescriptorFile) { @@ -136,13 +202,38 @@ class Repairer { if (type == 
kLogFile) { logs_.push_back(number); } else if (type == kTableFile) { - table_numbers_.push_back(number); + table_numbers_[0].push_back(number); } else { // Ignore other files - } - } + } // else + } // else + } // if + } // for + + for (level=0; level < config::kNumLevels; ++level) + { + std::string dirname; + + filenames.clear(); + dirname=MakeDirName2(options_, level, "sst"); + Status status = env_->GetChildren(dirname, &filenames); + if (!status.ok()) { + return status; } - } + + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + if (number + 1 > next_file_number_) { + next_file_number_ = number + 1; + } + + if (type == kTableFile) { + table_numbers_[level].push_back(number); + } + } // if + } // for + } // for + return status; } @@ -186,7 +277,7 @@ class Repairer { reporter.env = env_; reporter.info_log = options_.info_log; reporter.lognum = log; - // We intentionally make log::Reader do checksumming so that + // We intentially make log::Reader do checksumming so that // corruptions cause entire commits to be skipped instead of // propagating bad information (like overly large sequence // numbers). @@ -203,11 +294,11 @@ class Repairer { while (reader.ReadRecord(&record, &scratch)) { if (record.size() < 12) { reporter.Corruption( - record.size(), Status::Corruption("log record too small", logname)); + record.size(), Status::Corruption("log record too small")); continue; } WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto(&batch, mem); + status = WriteBatchInternal::InsertInto(&batch, mem, &options_); if (status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { @@ -223,14 +314,15 @@ class Repairer { // since ExtractMetaData() will also generate edits. 
FileMetaData meta; meta.number = next_file_number_++; + meta.level = 0; Iterator* iter = mem->NewIterator(); - status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); + status = BuildTable(dbname_, env_, options_, icmp_.user_comparator(), table_cache_, iter, &meta, 0); delete iter; mem->Unref(); mem = NULL; if (status.ok()) { if (meta.file_size > 0) { - table_numbers_.push_back(meta.number); + table_numbers_[0].push_back(meta.number); } } Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", @@ -242,168 +334,128 @@ class Repairer { } void ExtractMetaData() { - for (size_t i = 0; i < table_numbers_.size(); i++) { - ScanTable(table_numbers_[i]); - } - } + for (int level=0; level < config::kNumLevels; ++level) + { + std::vector * number_ptr; + std::vector::const_iterator i; - Iterator* NewTableIterator(const FileMetaData& meta) { - // Same as compaction iterators: if paranoid_checks are on, turn - // on checksum verification. - ReadOptions r; - r.verify_checksums = options_.paranoid_checks; - return table_cache_->NewIterator(r, meta.number, meta.file_size); - } - - void ScanTable(uint64_t number) { - TableInfo t; - t.meta.number = number; - std::string fname = TableFileName(dbname_, number); - Status status = env_->GetFileSize(fname, &t.meta.file_size); - if (!status.ok()) { - // Try alternate file name. 
- fname = SSTTableFileName(dbname_, number); - Status s2 = env_->GetFileSize(fname, &t.meta.file_size); - if (s2.ok()) { - status = Status::OK(); + number_ptr=&table_numbers_[level]; + for (i = number_ptr->begin(); number_ptr->end()!= i; ++i) { + TableInfo t; + t.meta.number = *i; + t.meta.level = level; + Status status = ScanTable(&t); + if (!status.ok()) + { + std::string fname = TableFileName(options_, t.meta.number, t.meta.level); + Log(options_.info_log, "Table #%llu: ignoring %s", + (unsigned long long) t.meta.number, + status.ToString().c_str()); + ArchiveFile(fname, true); + } else { + tables_[level].push_back(t); + } } } - if (!status.ok()) { - ArchiveFile(TableFileName(dbname_, number)); - ArchiveFile(SSTTableFileName(dbname_, number)); - Log(options_.info_log, "Table #%llu: dropped: %s", - (unsigned long long) t.meta.number, - status.ToString().c_str()); - return; - } + } - // Extract metadata by scanning through table. + Status ScanTable(TableInfo* t) { + Table * table_ptr; + SstCounters counters; + std::string fname = TableFileName(options_, t->meta.number, t->meta.level); int counter = 0; - Iterator* iter = NewTableIterator(t.meta); - bool empty = true; - ParsedInternalKey parsed; - t.max_sequence = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - Slice key = iter->key(); - if (!ParseInternalKey(key, &parsed)) { - Log(options_.info_log, "Table #%llu: unparsable key %s", - (unsigned long long) t.meta.number, - EscapeString(key).c_str()); - continue; - } + Status status = env_->GetFileSize(fname, &t->meta.file_size); + if (status.ok()) { + Iterator* iter = table_cache_->NewIterator( + ReadOptions(), t->meta.number, t->meta.file_size, t->meta.level, &table_ptr); + bool empty = true; + ParsedInternalKey parsed; + t->max_sequence = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (!ParseInternalKey(key, &parsed)) { + Log(options_.info_log, "Table #%llu: unparsable key %s", + (unsigned long long) 
t->meta.number, + EscapeString(key).c_str()); + continue; + } - counter++; - if (empty) { - empty = false; - t.meta.smallest.DecodeFrom(key); + counter++; + if (empty) { + empty = false; + t->meta.smallest.DecodeFrom(key); + } + t->meta.largest.DecodeFrom(key); + if (parsed.sequence > t->max_sequence) { + t->max_sequence = parsed.sequence; + } } - t.meta.largest.DecodeFrom(key); - if (parsed.sequence > t.max_sequence) { - t.max_sequence = parsed.sequence; + if (!iter->status().ok()) { + status = iter->status(); } + else { + counters=table_ptr->GetSstCounters(); + t->meta.exp_write_low=counters.Value(eSstCountExpiry1); + t->meta.exp_write_high=counters.Value(eSstCountExpiry2); + t->meta.exp_explicit_high=counters.Value(eSstCountExpiry3); + } + delete iter; } - if (!iter->status().ok()) { - status = iter->status(); - } - delete iter; Log(options_.info_log, "Table #%llu: %d entries %s", - (unsigned long long) t.meta.number, + (unsigned long long) t->meta.number, counter, status.ToString().c_str()); - - if (status.ok()) { - tables_.push_back(t); - } else { - RepairTable(fname, t); // RepairTable archives input file. - } - } - - void RepairTable(const std::string& src, TableInfo t) { - // We will copy src contents to a new table and then rename the - // new table over the source. - - // Create builder. - std::string copy = TableFileName(dbname_, next_file_number_++); - WritableFile* file; - Status s = env_->NewWritableFile(copy, &file); - if (!s.ok()) { - return; - } - TableBuilder* builder = new TableBuilder(options_, file); - - // Copy data. 
- Iterator* iter = NewTableIterator(t.meta); - int counter = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - builder->Add(iter->key(), iter->value()); - counter++; - } - delete iter; - - ArchiveFile(src); - if (counter == 0) { - builder->Abandon(); // Nothing to save - } else { - s = builder->Finish(); - if (s.ok()) { - t.meta.file_size = builder->FileSize(); - } - } - delete builder; - builder = NULL; - - if (s.ok()) { - s = file->Close(); - } - delete file; - file = NULL; - - if (counter > 0 && s.ok()) { - std::string orig = TableFileName(dbname_, t.meta.number); - s = env_->RenameFile(copy, orig); - if (s.ok()) { - Log(options_.info_log, "Table #%llu: %d entries repaired", - (unsigned long long) t.meta.number, counter); - tables_.push_back(t); - } - } - if (!s.ok()) { - env_->DeleteFile(copy); - } + return status; } Status WriteDescriptor() { std::string tmp = TempFileName(dbname_, 1); WritableFile* file; - Status status = env_->NewWritableFile(tmp, &file); + Status status = env_->NewWritableFile(tmp, &file, 4096); if (!status.ok()) { return status; } SequenceNumber max_sequence = 0; - for (size_t i = 0; i < tables_.size(); i++) { - if (max_sequence < tables_[i].max_sequence) { - max_sequence = tables_[i].max_sequence; - } - } + for (int level=0; level * table_ptr; + std::vector::const_iterator i; + + table_ptr=&tables_[level]; + + for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) { + if (max_sequence < i->max_sequence) { + max_sequence = i->max_sequence; + } + } // for + } // for edit_.SetComparatorName(icmp_.user_comparator()->Name()); edit_.SetLogNumber(0); edit_.SetNextFile(next_file_number_); edit_.SetLastSequence(max_sequence); - for (size_t i = 0; i < tables_.size(); i++) { - // TODO(opt): separate out into multiple levels - const TableInfo& t = tables_[i]; - edit_.AddFile(0, t.meta.number, t.meta.file_size, - t.meta.smallest, t.meta.largest); - } + for (int level=0; level * table_ptr; + std::vector::const_iterator i; + + 
table_ptr=&tables_[level]; + + for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) { + edit_.AddFile2(level, i->meta.number, i->meta.file_size, + i->meta.smallest, i->meta.largest, + i->meta.exp_write_low, i->meta.exp_write_high, i->meta.exp_explicit_high); + + } // for + } // for //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); { log::Writer log(file); std::string record; - edit_.EncodeTo(&record); + edit_.EncodeTo(&record); // manifest format is default for release, options_ often incomplete status = log.AddRecord(record); } if (status.ok()) { @@ -431,21 +483,33 @@ class Repairer { return status; } - void ArchiveFile(const std::string& fname) { + void ArchiveFile(const std::string& fname, bool two_levels=false) { // Move into another directory. E.g., for // dir/foo // rename to // dir/lost/foo - const char* slash = strrchr(fname.c_str(), '/'); + std::string::size_type slash, slash2; + + slash=fname.rfind('/'); + if (two_levels && std::string::npos!=slash && 0CreateDir(new_dir); // Ignore error std::string new_file = new_dir; new_file.append("/"); - new_file.append((slash == NULL) ? fname.c_str() : slash + 1); + new_file.append((std::string::npos!=slash) ? fname.substr(slash+1) : fname); Status s = env_->RenameFile(fname, new_file); Log(options_.info_log, "Archiving %s: %s\n", fname.c_str(), s.ToString().c_str()); diff --git a/src/leveldb/db/skiplist.h b/src/leveldb/db/skiplist.h index 8bd77764d..2ad4c6642 100644 --- a/src/leveldb/db/skiplist.h +++ b/src/leveldb/db/skiplist.h @@ -1,10 +1,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_ -#define STORAGE_LEVELDB_DB_SKIPLIST_H_ - +// // Thread safety // ------------- // @@ -55,6 +52,12 @@ class SkipList { // Returns true iff an entry that compares equal to key is in the list. 
bool Contains(const Key& key) const; + // Returns true if all inserts have been sequentially increasing; + // else this SkipList has had keys inserted in non-sequential order + bool InSequentialInsertMode() const { + return sequentialInsertMode_; + } + // Iteration over the contents of a skip list class Iterator { public: @@ -94,8 +97,22 @@ class SkipList { // Intentionally copyable }; + protected: + // Checks the structure of this SkipList object, ensuring the keys are + // properly ordered + // + // This is protected since it is intended for use by unit tests; if a lock + // is used to protect Insert(), then it should be used to protect this + // method as well + bool Valid() const; + + // Disables the sequential insert optimizations (used in performance testing) + void DisableSequentialInsertMode() { + sequentialInsertMode_ = false; + } + private: - enum { kMaxHeight = 12 }; + enum { kMaxHeight = 17 }; // Immutable after construction Comparator const compare_; @@ -115,6 +132,18 @@ class SkipList { // Read/written only by Insert(). Random rnd_; + // Points to the last node in the list; modified only by Insert() + Node* tail_; + + // Pointers to the nodes previous to the tail node; have max_height_ entries + Node* tailPrev_[kMaxHeight]; + + // The height of the tail_ node + int tailHeight_; + + // We track the tail node until we have a non-sequential insert + bool sequentialInsertMode_; + Node* NewNode(const Key& key, int height); int RandomHeight(); bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } @@ -129,6 +158,11 @@ class SkipList { // node at "level" for every level in [0..max_height_-1]. 
Node* FindGreaterOrEqual(const Key& key, Node** prev) const; + // Similar to FindGreaterOrEqual() except it uses the barrier-free + // variant of Next(); this is used only by Insert() and it + // checks the tail_ pointer in case we're doing a sequential insert + Node* NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const; + // Return the latest node with a key < key. // Return head_ if there is no such node. Node* FindLessThan(const Key& key) const; @@ -280,6 +314,54 @@ typename SkipList::Node* SkipList::FindGreaterOr } } +template +typename SkipList::Node* +SkipList::NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const { + int level = GetMaxHeight() - 1; + + // If we have only seen sequential inserts up to this point, we can use + // the tail_ node + if ( sequentialInsertMode_ ) { + if (tail_ == NULL) { + // The list is currently empty, so the node being inserted + // will be the new tail_ + assert(level == 0); + if (prev != NULL) prev[0] = head_; + return NULL; + } + else if (KeyIsAfterNode(key, tail_)) { + // The new key must be inserted after the current tail_ node + if (prev != NULL) { + int i; + for (i = 0; i < tailHeight_; ++i) { + prev[i] = tail_; + } + for (/*continue with i*/; i <= level; ++i) { + prev[i] = tailPrev_[i]; + } + } + return NULL; + } + } + + Node* x = head_; + while (true) { + Node* next = x->NoBarrier_Next(level); + if (KeyIsAfterNode(key, next)) { + // Keep searching in this list + x = next; + } else { + if (prev != NULL) prev[level] = x; + if (level == 0) { + return next; + } else { + // Switch to next list + level--; + } + } + } +} + template typename SkipList::Node* SkipList::FindLessThan(const Key& key) const { @@ -327,25 +409,41 @@ SkipList::SkipList(Comparator cmp, Arena* arena) arena_(arena), head_(NewNode(0 /* any key will do */, kMaxHeight)), max_height_(reinterpret_cast(1)), - rnd_(0xdeadbeef) { + rnd_(0xdeadbeef), + tail_(NULL), + tailHeight_(0), + sequentialInsertMode_(true) { for (int i = 0; i < 
kMaxHeight; i++) { head_->SetNext(i, NULL); + tailPrev_[i] = NULL; } } template void SkipList::Insert(const Key& key) { - // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual() + // We use a barrier-free variant of FindGreaterOrEqual() // here since Insert() is externally synchronized. Node* prev[kMaxHeight]; - Node* x = FindGreaterOrEqual(key, prev); + Node* x = NoBarrier_FindGreaterOrEqual(key, prev); + + // If we're still in sequential-insert mode, check if the new node is being + // inserted at the end of the list, which is indicated by x being NULL + if (sequentialInsertMode_) { + if (x != NULL) { + // we have a non-sequential (AKA random) insert, so stop maintaining + // the tail bookkeeping overhead + sequentialInsertMode_ = false; + } + } // Our data structure does not allow duplicate insertion assert(x == NULL || !Equal(key, x->key)); - int height = RandomHeight(); + int i, height = RandomHeight(); if (height > GetMaxHeight()) { - for (int i = GetMaxHeight(); i < height; i++) { + // We are extending max_height_ which means we need to fill in the blanks + // in prev[] that were not filled in by NoBarrier_FindGreaterOrEqual() + for (i = GetMaxHeight(); i < height; ++i) { prev[i] = head_; } //fprintf(stderr, "Change height from %d to %d\n", max_height_, height); @@ -361,12 +459,37 @@ void SkipList::Insert(const Key& key) { } x = NewNode(key, height); - for (int i = 0; i < height; i++) { + for (i = 0; i < height; ++i) { // NoBarrier_SetNext() suffices since we will add a barrier when // we publish a pointer to "x" in prev[i]. x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i)); prev[i]->SetNext(i, x); } + + // Do we need to update our tail_ pointer? 
+ if (sequentialInsertMode_) { + Node* prevTail = tail_; + int prevTailHeight = tailHeight_; + + tail_ = x; + tailHeight_ = height; + + // We also need to update our tailPrev_ pointers; first we capture + // the nodes already pointing to the new tail_ + for (i = 0; i < height; ++i) { + tailPrev_[i] = prev[i]; + } + + // If the previous tail node was taller than the new tail node, then + // the prev pointers above the current tail node's height (up to the + // height of the previous tail node) are simply the previous tail node + for (/*continue with i*/; i < prevTailHeight; ++i) { + tailPrev_[i] = prevTail; + } + + // NOTE: any prev pointers above prevTailHeight (up to max_height_) were + // already set in tailPrev_ by previous calls to this method + } } template @@ -379,6 +502,115 @@ bool SkipList::Contains(const Key& key) const { } } -} // namespace leveldb +template +bool SkipList::Valid() const +{ + // Note that we can use barrier-free overloads in this method since it is + // protected by the same lock as Insert(). -#endif // STORAGE_LEVELDB_DB_SKIPLIST_H_ + // Ensure that the list is properly sorted; use an iterator for this check + const Key* pPrevKey = NULL; + typename SkipList::Iterator iter(this); + for ( iter.SeekToFirst(); iter.Valid(); iter.Next() ) { + if ( pPrevKey != NULL ) { + if ( compare_( *pPrevKey, iter.key() ) >= 0 ) { + return false; + } + } + pPrevKey = &iter.key(); + } + + // Now walk the linked list at each level and ensure it's sorted. Also track + // how many nodes we see at each level; the number of nodes in the linked + // list at level n must not be larger than the number of nodes at level n-1. 
+ std::vector nodeCounts( GetMaxHeight() ); + int level; + for ( level = GetMaxHeight() - 1; level >= 0; --level ) { + int nodeCount = 0; + pPrevKey = NULL; + for ( Node* pNode = head_->NoBarrier_Next( level ); + pNode != NULL; + pNode = pNode->NoBarrier_Next( level ) ) { + ++nodeCount; + if ( pPrevKey != NULL ) { + if ( compare_( *pPrevKey, pNode->key ) >= 0 ) { + return false; + } + } + pPrevKey = &pNode->key; + } + nodeCounts[ level ] = nodeCount; + } + + // Ensure the node counts do not increase as we move up the levels + int prevNodeCount = nodeCounts[0]; + for ( level = 1; level < GetMaxHeight(); ++level ) { + int currentNodeCount = nodeCounts[ level ]; + if ( currentNodeCount > prevNodeCount ) { + return false; + } + prevNodeCount = currentNodeCount; + } + + // Ensure that tail_ points to the last node + if ( sequentialInsertMode_ ) { + if ( tail_ == NULL ) { + // tail_ is not set, so the list must be empty + if ( tailPrev_[0] != NULL || head_->NoBarrier_Next(0) != NULL ) { + return false; + } + } + else { + // we have a tail_ node; first ensure that its prev pointer actually + // points to it + if ( tailPrev_[0] == NULL || tailPrev_[0]->NoBarrier_Next(0) != tail_ ) { + return false; + } + if ( compare_( tailPrev_[0]->key, tail_->key ) >= 0 ) { + return false; + } + + // now check the rest of the pointers in tailPrev_; up to tailHeight_, + // the next pointer of the node in tailPrev_ should point to tail_; after + // that, the next pointer should be NULL + for ( level = 1; level < GetMaxHeight(); ++level ) { + Node* tailPrev = tailPrev_[ level ]; + if ( tailPrev == NULL ) { + return false; + } + if ( level < tailHeight_ ) { + if ( tailPrev->NoBarrier_Next( level ) != tail_ ) { + return false; + } + if ( compare_( tailPrev->key, tail_->key ) >= 0 ) { + return false; + } + } + else { + if ( tailPrev->NoBarrier_Next( level ) != NULL ) { + return false; + } + } + } + + // the remainder of the tailPrev_ pointers (above max_height_) + // should be NULL + for ( 
/*continue with level*/; level < kMaxHeight; ++level ) { + if ( tailPrev_[ level ] != NULL ) { + return false; + } + } + + // now ensure that FindLast() returns tail_ + Node* lastNode = FindLast(); + if ( lastNode != tail_ ) { + return false; + } + } + } + + // if we get here, all is good + return true; +} + +} // namespace leveldb diff --git a/src/leveldb/db/skiplist_test.cc b/src/leveldb/db/skiplist_test.cc index aee1461e1..c8643071c 100644 --- a/src/leveldb/db/skiplist_test.cc +++ b/src/leveldb/db/skiplist_test.cc @@ -2,11 +2,15 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#define __STDC_FORMAT_MACROS +#include + #include "db/skiplist.h" #include #include "leveldb/env.h" #include "util/arena.h" #include "util/hash.h" +#include "util/mutexlock.h" #include "util/random.h" #include "util/testharness.h" @@ -26,15 +30,29 @@ struct Comparator { } }; +template +class SkipListTest : public SkipList +{ + public: + SkipListTest(Comparator cmp, Arena* arena) : SkipList(cmp, arena) {} + + // check the validity of this SkipList object by calling the Valid() method + // in the base class + bool Valid() const { return SkipList::Valid(); } + + void DisableSequentialInsertMode() { SkipList::DisableSequentialInsertMode(); } +}; + class SkipTest { }; TEST(SkipTest, Empty) { Arena arena; Comparator cmp; - SkipList list(cmp, &arena); + SkipListTest list(cmp, &arena); ASSERT_TRUE(!list.Contains(10)); + ASSERT_TRUE(list.Valid()); - SkipList::Iterator iter(&list); + SkipListTest::Iterator iter(&list); ASSERT_TRUE(!iter.Valid()); iter.SeekToFirst(); ASSERT_TRUE(!iter.Valid()); @@ -51,13 +69,14 @@ TEST(SkipTest, InsertAndLookup) { std::set keys; Arena arena; Comparator cmp; - SkipList list(cmp, &arena); + SkipListTest list(cmp, &arena); for (int i = 0; i < N; i++) { Key key = rnd.Next() % R; if (keys.insert(key).second) { list.Insert(key); } } + ASSERT_TRUE(list.Valid()); for 
(int i = 0; i < R; i++) { if (list.Contains(i)) { @@ -69,7 +88,7 @@ TEST(SkipTest, InsertAndLookup) { // Simple iterator tests { - SkipList::Iterator iter(&list); + SkipListTest::Iterator iter(&list); ASSERT_TRUE(!iter.Valid()); iter.Seek(0); @@ -87,7 +106,7 @@ TEST(SkipTest, InsertAndLookup) { // Forward iteration test for (int i = 0; i < R; i++) { - SkipList::Iterator iter(&list); + SkipListTest::Iterator iter(&list); iter.Seek(i); // Compare against model iterator @@ -107,7 +126,7 @@ TEST(SkipTest, InsertAndLookup) { // Backward iteration test { - SkipList::Iterator iter(&list); + SkipListTest::Iterator iter(&list); iter.SeekToLast(); // Compare against model iterator @@ -250,7 +269,7 @@ class ConcurrentTest { // Note that generation 0 is never inserted, so it is ok if // <*,0,*> is missing. ASSERT_TRUE((gen(pos) == 0) || - (gen(pos) > static_cast(initial_state.Get(key(pos)))) + (gen(pos) > initial_state.Get(key(pos))) ) << "key: " << key(pos) << "; gen: " << gen(pos) << "; initgen: " @@ -313,18 +332,16 @@ class TestState { state_cv_(&mu_) {} void Wait(ReaderState s) { - mu_.Lock(); + MutexLock lock(&mu_); while (state_ != s) { state_cv_.Wait(); } - mu_.Unlock(); } void Change(ReaderState s) { - mu_.Lock(); + MutexLock lock(&mu_); state_ = s; state_cv_.Signal(); - mu_.Unlock(); } private: @@ -371,6 +388,211 @@ TEST(SkipTest, Concurrent3) { RunConcurrent(3); } TEST(SkipTest, Concurrent4) { RunConcurrent(4); } TEST(SkipTest, Concurrent5) { RunConcurrent(5); } +static void +RunSequentialInsert( + const int NumKeys, + bool AcquireLock, + bool ReverseInsert, + bool SequentialInsertModeEnabled ) +{ + const int loopCount = 5; // repeat the whole process this many times and average the time spent + std::vector timeSpent; + + port::Mutex mutex; + Env* env = Env::Default(); + + fprintf( stderr, + "Sequentially inserting %d keys in %s order,\n" + " seqential insert mode is initially %sabled,\n" + " %sacquiring a lock for each insert (averaging over %d runs)\n", + NumKeys, 
ReverseInsert ? "reverse" : "forward", + SequentialInsertModeEnabled ? "en" : "dis", + AcquireLock ? "" : "not ", loopCount ); + + int k; + for ( k = 0; k < loopCount; ++k ) { + int j; + Arena arena; + Comparator cmp; + SkipListTest list( cmp, &arena ); + + // initially the SkipList should be in sequential mode + ASSERT_TRUE( list.InSequentialInsertMode() ); + + // were we instructed to disable sequential insert mode? + if ( !SequentialInsertModeEnabled ) { + list.DisableSequentialInsertMode(); + ASSERT_TRUE( !list.InSequentialInsertMode() ); + } + + uint64_t start = env->NowMicros(); + for ( j = 0; j < NumKeys; ++j ) { + Key key = ReverseInsert ? NumKeys - 1 - j : j; + + if ( AcquireLock ) mutex.Lock(); + list.Insert( key ); + if ( AcquireLock ) mutex.Unlock(); + } + uint64_t stop = env->NowMicros(); + timeSpent.push_back( stop - start ); + //fprintf( stderr, " Time for run %d: %llu\n", k, timeSpent[k] ); + + // if SequentialInsertModeEnabled is true, the SkipList should still be + // in sequential mode iff ReverseInsert is false + if ( SequentialInsertModeEnabled ) { + ASSERT_TRUE( list.InSequentialInsertMode() != ReverseInsert ); + } + else { + ASSERT_TRUE( !list.InSequentialInsertMode() ); + } + + // ensure the SkipLlist is properly sorted + if ( AcquireLock ) mutex.Lock(); + ASSERT_TRUE( list.Valid() ); + if ( AcquireLock ) mutex.Unlock(); + + // ensure the SkipList contains all the keys we inserted + for ( j = 0; j < NumKeys; ++j ) { + ASSERT_TRUE( list.Contains( j ) ); + } + } + + // throw out the low and high times and average the rest + uint64_t totalTime, lowTime, highTime; + totalTime = lowTime = highTime = timeSpent[0]; + for ( k = 1; k < loopCount; ++k ) { + uint64_t currentTime = timeSpent[k]; + totalTime += currentTime; + if ( lowTime > currentTime ) lowTime = currentTime; + if ( highTime < currentTime ) highTime = currentTime; + } + + totalTime -= (lowTime + highTime); + + uint64_t averageTime = (totalTime / (loopCount - 2)); + double timePerKey = 
(double)averageTime / (double)NumKeys; + fprintf( stderr, " Average insertion time: %" PRIu64 " (%f/key)\n", averageTime, timePerKey ); +} + +TEST(SkipTest, SequentialInsert_NoLock_ForwardInsert) +{ + int numKeys = 100000; + bool acquireLock = false; + bool reverseInsert = false; + bool sequentialInsertModeEnabled = true; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); + + sequentialInsertModeEnabled = false; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); +} + +TEST(SkipTest, SequentialInsert_Lock_ForwardInsert) +{ + int numKeys = 100000; + bool acquireLock = true; + bool reverseInsert = false; + bool sequentialInsertModeEnabled = true; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); + + sequentialInsertModeEnabled = false; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); +} + +TEST(SkipTest, SequentialInsert_NoLock_ReverseInsert) +{ + int numKeys = 100000; + bool acquireLock = false; + bool reverseInsert = true; + bool sequentialInsertModeEnabled = true; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); +} + +TEST(SkipTest, SequentialInsert_Lock_ReverseInsert) +{ + int numKeys = 100000; + bool acquireLock = true; + bool reverseInsert = true; + bool sequentialInsertModeEnabled = true; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); +} + +TEST(SkipTest, SequentialInsert_IncreasingNumberOfInserts) +{ + // test with increasing numbers of keys, with sequential-insert mode both + // enabled and disabled; we're looking to see if per-key insertion times + // trend upward as the number of keys increases + int numKeys = 10000; + bool acquireLock = false; + bool reverseInsert = false; + bool sequentialInsertModeEnabled = true; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); + + 
sequentialInsertModeEnabled = false; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); + + numKeys = 100000; + sequentialInsertModeEnabled = true; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); + + sequentialInsertModeEnabled = false; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); + + numKeys = 1000000; + sequentialInsertModeEnabled = true; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); + + sequentialInsertModeEnabled = false; + RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled ); +} + +TEST(SkipTest, SequentialInsert_MixedInsertionModes) +{ + // start inserting sequentially, then switch to non-sequential inserts, + // ensuring all works as intended + int j, numSequentialKeys = 100000, numNonSequentialKeys = 100000; + int totalNumKeys = numSequentialKeys + numNonSequentialKeys; + Arena arena; + Comparator cmp; + SkipListTest list( cmp, &arena ); + + // initially the SkipList should be in sequential mode + ASSERT_TRUE( list.InSequentialInsertMode() ); + + // start inserting at key=1; when we insert 0 below, the list should switch + // out of sequential insert mode + for ( j = 1; j < numSequentialKeys; ++j ) { + list.Insert( j ); + } + + // the SkipList should still be in sequential mode + ASSERT_TRUE( list.InSequentialInsertMode() ); + ASSERT_TRUE( list.Valid() ); + + list.Insert( 0 ); + ASSERT_TRUE( !list.InSequentialInsertMode() ); + ASSERT_TRUE( list.Valid() ); + + // now insert the remaining keys in non-sequential order (they're not + // random, but that doesn't matter here; just ensure we switch to + // non-sequential mode and that all continues to work) + for ( j = 0; j < numNonSequentialKeys; j += 2 ) { + int key = totalNumKeys - j - 1; + list.Insert( key ); + } + for ( j = 0; j < numNonSequentialKeys; j += 2 ) { + int key = numSequentialKeys + j; + 
list.Insert( key ); + } + + ASSERT_TRUE( !list.InSequentialInsertMode() ); + ASSERT_TRUE( list.Valid() ); + + // ensure the SkipList contains all the keys we inserted + for ( j = 0; j < totalNumKeys; ++j ) { + ASSERT_TRUE( list.Contains( j ) ); + } +} + } // namespace leveldb int main(int argc, char** argv) { diff --git a/src/leveldb/db/snapshot.h b/src/leveldb/db/snapshot.h index 6ed413c42..e7f8fd2c3 100644 --- a/src/leveldb/db/snapshot.h +++ b/src/leveldb/db/snapshot.h @@ -5,7 +5,6 @@ #ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_ #define STORAGE_LEVELDB_DB_SNAPSHOT_H_ -#include "db/dbformat.h" #include "leveldb/db.h" namespace leveldb { diff --git a/src/leveldb/db/table_cache.cc b/src/leveldb/db/table_cache.cc index e3d82cd3e..34b03c7aa 100644 --- a/src/leveldb/db/table_cache.cc +++ b/src/leveldb/db/table_cache.cc @@ -5,22 +5,26 @@ #include "db/table_cache.h" #include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/version_edit.h" #include "leveldb/env.h" #include "leveldb/table.h" #include "util/coding.h" +#include "leveldb/perf_count.h" namespace leveldb { -struct TableAndFile { - RandomAccessFile* file; - Table* table; -}; - static void DeleteEntry(const Slice& key, void* value) { TableAndFile* tf = reinterpret_cast(value); - delete tf->table; - delete tf->file; - delete tf; + if (0==dec_and_fetch(&tf->user_count)) + { + if (NULL!=tf->doublecache) + tf->doublecache->SubFileSize(tf->table->GetFileSize()); + delete tf->table; + delete tf->file; + delete tf; + } // if } static void UnrefEntry(void* arg1, void* arg2) { @@ -31,37 +35,38 @@ static void UnrefEntry(void* arg1, void* arg2) { TableCache::TableCache(const std::string& dbname, const Options* options, - int entries) + Cache * file_cache, + DoubleCache & doublecache) : env_(options->env), dbname_(dbname), options_(options), - cache_(NewLRUCache(entries)) { + cache_(file_cache), + doublecache_(doublecache) +{ } TableCache::~TableCache() { - delete cache_; } -Status 
TableCache::FindTable(uint64_t file_number, uint64_t file_size, - Cache::Handle** handle) { +Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, int level, + Cache::Handle** handle, bool is_compaction, + bool for_iterator) { Status s; char buf[sizeof(file_number)]; EncodeFixed64(buf, file_number); Slice key(buf, sizeof(buf)); *handle = cache_->Lookup(key); if (*handle == NULL) { - std::string fname = TableFileName(dbname_, file_number); + std::string fname = TableFileName(*options_, file_number, level); RandomAccessFile* file = NULL; Table* table = NULL; s = env_->NewRandomAccessFile(fname, &file); - if (!s.ok()) { - std::string old_fname = SSTTableFileName(dbname_, file_number); - if (env_->NewRandomAccessFile(old_fname, &file).ok()) { - s = Status::OK(); - } - } if (s.ok()) { s = Table::Open(*options_, file, file_size, &table); + + // Riak: support opportunity to manage Linux page cache + if (is_compaction) + file->SetForCompaction(file_size); } if (!s.ok()) { @@ -73,22 +78,74 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, TableAndFile* tf = new TableAndFile; tf->file = file; tf->table = table; - *handle = cache_->Insert(key, tf, 1, &DeleteEntry); + tf->doublecache = &doublecache_; + tf->file_number = file_number; + tf->level = level; + + *handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry); + gPerfCounters->Inc(ePerfTableOpened); + doublecache_.AddFileSize(table->GetFileSize()); + + // temporary hardcoding to match number of levels defined as + // overlapped in version_set.cc + if (levelAddref(*handle); } } + else + { + Table *table = reinterpret_cast(cache_->Value(*handle))->table; + + // this is NOT first access, see if bloom filter can load now + if (!for_iterator && table->ReadFilter()) + { + // TableAndFile now going to be present in two cache entries + // 1. 
retrieve old entry within file cache + TableAndFile* tf = reinterpret_cast(cache_->Value(*handle)); + inc_and_fetch(&tf->user_count); + + // 2. must clean file size, do not want double count + if (NULL!=tf->doublecache) + tf->doublecache->SubFileSize(tf->table->GetFileSize()); + + // 3. release current reference (and possible special overlap reference) + cache_->Release(*handle); + if (tf->levelRelease(*handle); + + // 4. create second table cache entry using TableObjectSize that now includes + // bloom filter size + *handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry); + + // 5. set double reference if an overlapped file (prevents from being flushed) + if (levelAddref(*handle); + } // if + + // for Linux, let fadvise start precaching + if (is_compaction) + { + RandomAccessFile *file = reinterpret_cast(cache_->Value(*handle))->file; + file->SetForCompaction(file_size); + } // if + + gPerfCounters->Inc(ePerfTableCached); + } // else return s; } Iterator* TableCache::NewIterator(const ReadOptions& options, uint64_t file_number, uint64_t file_size, + int level, Table** tableptr) { if (tableptr != NULL) { *tableptr = NULL; } Cache::Handle* handle = NULL; - Status s = FindTable(file_number, file_size, &handle); + Status s = FindTable(file_number, file_size, level, &handle, options.IsCompaction(), true); + if (!s.ok()) { return NewErrorIterator(s); } @@ -105,11 +162,13 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Status TableCache::Get(const ReadOptions& options, uint64_t file_number, uint64_t file_size, + int level, const Slice& k, void* arg, - void (*saver)(void*, const Slice&, const Slice&)) { + bool (*saver)(void*, const Slice&, const Slice&)) { Cache::Handle* handle = NULL; - Status s = FindTable(file_number, file_size, &handle); + Status s = FindTable(file_number, file_size, level, &handle); + if (s.ok()) { Table* t = reinterpret_cast(cache_->Value(handle))->table; s = t->InternalGet(options, k, arg, saver); @@ -118,10 
+177,60 @@ Status TableCache::Get(const ReadOptions& options, return s; } -void TableCache::Evict(uint64_t file_number) { +void TableCache::Evict(uint64_t file_number, bool is_overlapped) { char buf[sizeof(file_number)]; EncodeFixed64(buf, file_number); + + // overlapped files have extra reference to prevent their purge, + // release that reference now + if (is_overlapped) + { + Cache::Handle *handle; + + // the Lookup call adds a reference too, back out both + handle=cache_->Lookup(Slice(buf, sizeof(buf))); + + // with multiple background threads, file might already be + // evicted + if (NULL!=handle) + { + cache_->Release(handle); // release for Lookup() call just made + cache_->Release(handle); // release for extra reference + } // if + } // if + cache_->Erase(Slice(buf, sizeof(buf))); } +/** + * Riak specific routine to return table statistic ONLY if table metadata + * already within cache ... otherwise return 0. + */ +uint64_t +TableCache::GetStatisticValue( + uint64_t file_number, + unsigned Index) +{ + uint64_t ret_val; + char buf[sizeof(file_number)]; + Cache::Handle *handle; + + ret_val=0; + EncodeFixed64(buf, file_number); + Slice key(buf, sizeof(buf)); + handle = cache_->Lookup(key); + + if (NULL != handle) + { + TableAndFile * tf; + + tf=reinterpret_cast(cache_->Value(handle)); + ret_val=tf->table->GetSstCounters().Value(Index); + cache_->Release(handle); + } // if + + return(ret_val); + +} // TableCache::GetStatisticValue + } // namespace leveldb diff --git a/src/leveldb/db/table_cache.h b/src/leveldb/db/table_cache.h index 8cf4aaf12..8f77c58dd 100644 --- a/src/leveldb/db/table_cache.h +++ b/src/leveldb/db/table_cache.h @@ -13,6 +13,7 @@ #include "leveldb/cache.h" #include "leveldb/table.h" #include "port/port.h" +#include "util/cache2.h" namespace leveldb { @@ -20,8 +21,10 @@ class Env; class TableCache { public: - TableCache(const std::string& dbname, const Options* options, int entries); - ~TableCache(); + // clean up note: file_cache is redundant 
to GetFileCache available from doublecache + TableCache(const std::string& dbname, const Options* options, Cache * file_cache, + DoubleCache & doublecache); + virtual ~TableCache(); // Return an iterator for the specified file number (the corresponding // file length must be exactly "file_size" bytes). If "tableptr" is @@ -33,6 +36,7 @@ class TableCache { Iterator* NewIterator(const ReadOptions& options, uint64_t file_number, uint64_t file_size, + int level, Table** tableptr = NULL); // If a seek to internal key "k" in specified file finds an entry, @@ -40,22 +44,65 @@ class TableCache { Status Get(const ReadOptions& options, uint64_t file_number, uint64_t file_size, + int level, const Slice& k, void* arg, - void (*handle_result)(void*, const Slice&, const Slice&)); + bool (*handle_result)(void*, const Slice&, const Slice&)); // Evict any entry for the specified file number - void Evict(uint64_t file_number); + void Evict(uint64_t file_number, bool is_overlapped); - private: + // Riak specific: return table statistic ONLY if table in cache, otherwise zero + uint64_t GetStatisticValue(uint64_t file_number, unsigned Index); + + + // access for testing tools, not for public access + Status TEST_FindTable(uint64_t file_number, uint64_t file_size, int level, Cache::Handle** handle) + {return( FindTable(file_number, file_size, level, handle));}; + + Cache* TEST_GetInternalCache() {return(cache_);}; + + void Release(Cache::Handle * handle) {cache_->Release(handle);}; + + // routine called if Options::cache_object_warming is true. + // Writes list of all file names currently in file cache to disk. + Status SaveOpenFileList(); + + // routine called if Options::cache_object_warming is true. + // Reads file created by SaveOpenFileList() and attempts to open + // every file. 
+ Status PreloadTableCache(); + + // was private, now protected to allow easy unit test overrides + protected: Env* const env_; const std::string dbname_; const Options* options_; - Cache* cache_; + Cache * cache_; + DoubleCache & doublecache_; - Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**); + // virtual to enable unit test overrides + virtual Status FindTable(uint64_t file_number, uint64_t file_size, int level, + Cache::Handle**, bool is_compaction=false, + bool for_iterator=false); }; + +struct TableAndFile { + RandomAccessFile* file; + Table* table; + DoubleCache * doublecache; + uint64_t file_number; // saved for cache object warming + int level; // saved for cache object warming + volatile uint32_t user_count; + + TableAndFile() + : file(NULL), table(NULL), doublecache(NULL), + file_number(0), level(0), user_count(1) + {}; +}; + + } // namespace leveldb #endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_ diff --git a/src/leveldb/db/version_edit.cc b/src/leveldb/db/version_edit.cc index f10a2d58b..17b565679 100644 --- a/src/leveldb/db/version_edit.cc +++ b/src/leveldb/db/version_edit.cc @@ -9,20 +9,6 @@ namespace leveldb { -// Tag numbers for serialized VersionEdit. These numbers are written to -// disk and should not be changed. -enum Tag { - kComparator = 1, - kLogNumber = 2, - kNextFileNumber = 3, - kLastSequence = 4, - kCompactPointer = 5, - kDeletedFile = 6, - kNewFile = 7, - // 8 was used for large value refs - kPrevLogNumber = 9 -}; - void VersionEdit::Clear() { comparator_.clear(); log_number_ = 0; @@ -34,11 +20,21 @@ void VersionEdit::Clear() { has_prev_log_number_ = false; has_next_file_number_ = false; has_last_sequence_ = false; + has_f1_files_ = false; + has_f2_files_ = false; + deleted_files_.clear(); new_files_.clear(); } -void VersionEdit::EncodeTo(std::string* dst) const { +/** + * EncodeTo serializes the VersionEdit object + * to the "dst" string parameter. 
"format2" flag + * indicates whether serialization should use original + * Google format for file objects (false) or Basho's updated + * file2 format for expiry enabled file objects (true) + */ +void VersionEdit::EncodeTo(std::string* dst, bool format2) const { if (has_comparator_) { PutVarint32(dst, kComparator); PutLengthPrefixedSlice(dst, comparator_); @@ -76,12 +72,21 @@ void VersionEdit::EncodeTo(std::string* dst) const { for (size_t i = 0; i < new_files_.size(); i++) { const FileMetaData& f = new_files_[i].second; - PutVarint32(dst, kNewFile); + if (format2) + PutVarint32(dst, kNewFile2); + else + PutVarint32(dst, kNewFile); PutVarint32(dst, new_files_[i].first); // level PutVarint64(dst, f.number); PutVarint64(dst, f.file_size); PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode()); + if (format2) + { + PutVarint64(dst, f.exp_write_low); + PutVarint64(dst, f.exp_write_high); + PutVarint64(dst, f.exp_explicit_high); + } } } @@ -98,7 +103,7 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) { static bool GetLevel(Slice* input, int* level) { uint32_t v; if (GetVarint32(input, &v) && - v < config::kNumLevels) { + v < (unsigned)config::kNumLevels) { *level = v; return true; } else { @@ -185,13 +190,34 @@ Status VersionEdit::DecodeFrom(const Slice& src) { GetVarint64(&input, &f.number) && GetVarint64(&input, &f.file_size) && GetInternalKey(&input, &f.smallest) && - GetInternalKey(&input, &f.largest)) { + GetInternalKey(&input, &f.largest)) + { + has_f1_files_ = true; + f.level=level; new_files_.push_back(std::make_pair(level, f)); } else { msg = "new-file entry"; } break; + case kNewFile2: + if (GetLevel(&input, &level) && + GetVarint64(&input, &f.number) && + GetVarint64(&input, &f.file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest) && + GetVarint64(&input, &f.exp_write_low) && + GetVarint64(&input, &f.exp_write_high) && + GetVarint64(&input, &f.exp_explicit_high)) + { 
+ has_f2_files_ = true; + f.level=level; + new_files_.push_back(std::make_pair(level, f)); + } else { + msg = "new-file2 entry"; + } + break; + default: msg = "unknown tag"; break; @@ -258,6 +284,12 @@ std::string VersionEdit::DebugString() const { r.append(f.smallest.DebugString()); r.append(" .. "); r.append(f.largest.DebugString()); + r.append(" "); + AppendNumberTo(&r, f.exp_write_low); + r.append(" "); + AppendNumberTo(&r, f.exp_write_high); + r.append(" "); + AppendNumberTo(&r, f.exp_explicit_high); } r.append("\n}\n"); return r; diff --git a/src/leveldb/db/version_edit.h b/src/leveldb/db/version_edit.h index eaef77b32..ba0c8f8ae 100644 --- a/src/leveldb/db/version_edit.h +++ b/src/leveldb/db/version_edit.h @@ -16,15 +16,41 @@ class VersionSet; struct FileMetaData { int refs; - int allowed_seeks; // Seeks allowed until compaction +// int allowed_seeks; // Seeks allowed until compaction uint64_t number; uint64_t file_size; // File size in bytes + uint64_t num_entries; // count of values in .sst file, only valid during table build InternalKey smallest; // Smallest internal key served by table InternalKey largest; // Largest internal key served by table + int level; + ExpiryTimeMicros exp_write_low; // oldest write time in file: + // 0 - non-expiry keys exist too + // ULLONG_MAX - no write time expiry & no plain keys + ExpiryTimeMicros exp_write_high; // most recent write time in file + ExpiryTimeMicros exp_explicit_high; // most recent/furthest into future explicit expiry - FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { } + FileMetaData() + : refs(0), /*allowed_seeks(1 << 30),*/ file_size(0), + num_entries(0), level(-1), exp_write_low(0), exp_write_high(0), exp_explicit_high(0) + { } }; + +class FileMetaDataPtrCompare +{ +protected: + const Comparator * comparator_; + +public: + explicit FileMetaDataPtrCompare(const Comparator * Comparer) + : comparator_(Comparer) {}; + + bool operator() (const FileMetaData * file1, const FileMetaData * file2) 
const + { + return(comparator_->Compare(file1->smallest.user_key(), file2->smallest.user_key()) < 0); + } +}; // class FileMetaDataPtrCompare + class VersionEdit { public: VersionEdit() { Clear(); } @@ -59,6 +85,7 @@ class VersionEdit { // Add the specified file at the specified number. // REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: "smallest" and "largest" are smallest and largest keys in file +#if 0 void AddFile(int level, uint64_t file, uint64_t file_size, const InternalKey& smallest, @@ -68,6 +95,27 @@ class VersionEdit { f.file_size = file_size; f.smallest = smallest; f.largest = largest; + f.level = level; + new_files_.push_back(std::make_pair(level, f)); + } +#endif + + void AddFile2(int level, uint64_t file, + uint64_t file_size, + const InternalKey& smallest, + const InternalKey& largest, + uint64_t exp_write_low, + uint64_t exp_write_high, + uint64_t exp_explicit_high) { + FileMetaData f; + f.number = file; + f.file_size = file_size; + f.smallest = smallest; + f.largest = largest; + f.level = level; + f.exp_write_low = exp_write_low; + f.exp_write_high = exp_write_high; + f.exp_explicit_high = exp_explicit_high; new_files_.push_back(std::make_pair(level, f)); } @@ -75,16 +123,37 @@ class VersionEdit { void DeleteFile(int level, uint64_t file) { deleted_files_.insert(std::make_pair(level, file)); } + size_t DeletedFileCount() const {return(deleted_files_.size());}; - void EncodeTo(std::string* dst) const; + void EncodeTo(std::string* dst, bool format2=true) const; Status DecodeFrom(const Slice& src); + // unit test access to validate file entries' format types + bool HasF1Files() const {return(has_f1_files_);}; + bool HasF2Files() const {return(has_f2_files_);}; + std::string DebugString() const; +// Tag numbers for serialized VersionEdit. These numbers are written to +// disk and should not be changed. 
+enum Tag { + kComparator = 1, + kLogNumber = 2, + kNextFileNumber = 3, + kLastSequence = 4, + kCompactPointer = 5, + kDeletedFile = 6, + kNewFile = 7, + // 8 was used for large value refs + kPrevLogNumber = 9, + kFileCacheObject = 10, + kNewFile2 = 11 // expiry capable file +}; + private: friend class VersionSet; - typedef std::set< std::pair > DeletedFileSet; + USED_BY_NESTED_FRIEND2(typedef std::set< std::pair > DeletedFileSet) std::string comparator_; uint64_t log_number_; @@ -96,10 +165,13 @@ class VersionEdit { bool has_prev_log_number_; bool has_next_file_number_; bool has_last_sequence_; + // following should be mutually exclusive, but tested independently to be sure + bool has_f1_files_; // manifest uses format 1 (for unit tests) + bool has_f2_files_; // manifest uses format 2 (for unit tests) - std::vector< std::pair > compact_pointers_; - DeletedFileSet deleted_files_; - std::vector< std::pair > new_files_; + USED_BY_NESTED_FRIEND2(std::vector< std::pair > compact_pointers_) + USED_BY_NESTED_FRIEND(DeletedFileSet deleted_files_) + USED_BY_NESTED_FRIEND2(std::vector< std::pair > new_files_) }; } // namespace leveldb diff --git a/src/leveldb/db/version_edit_test.cc b/src/leveldb/db/version_edit_test.cc index 280310b49..bd2c9a31c 100644 --- a/src/leveldb/db/version_edit_test.cc +++ b/src/leveldb/db/version_edit_test.cc @@ -7,14 +7,22 @@ namespace leveldb { -static void TestEncodeDecode(const VersionEdit& edit) { +static void TestEncodeDecode( + const VersionEdit& edit, + bool format2=false) { std::string encoded, encoded2; - edit.EncodeTo(&encoded); + edit.EncodeTo(&encoded,format2); VersionEdit parsed; Status s = parsed.DecodeFrom(encoded); ASSERT_TRUE(s.ok()) << s.ToString(); - parsed.EncodeTo(&encoded2); + parsed.EncodeTo(&encoded2,format2); ASSERT_EQ(encoded, encoded2); + + if (parsed.HasF1Files() || parsed.HasF2Files()) + { + ASSERT_EQ(parsed.HasF1Files(), !format2); + ASSERT_EQ(parsed.HasF2Files(), format2); + } // if } class VersionEditTest { }; @@ 
-25,11 +33,12 @@ TEST(VersionEditTest, EncodeDecode) { VersionEdit edit; for (int i = 0; i < 4; i++) { TestEncodeDecode(edit); - edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, - InternalKey("foo", kBig + 500 + i, kTypeValue), - InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); + edit.AddFile2(3, kBig + 300 + i, kBig + 400 + i, + InternalKey("foo", 0, kBig + 500 + i, kTypeValue), + InternalKey("zoo", 0, kBig + 600 + i, kTypeDeletion), + 0,0,0); edit.DeleteFile(4, kBig + 700 + i); - edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); + edit.SetCompactPointer(i, InternalKey("x", 0, kBig + 900 + i, kTypeValue)); } edit.SetComparatorName("foo"); @@ -39,6 +48,29 @@ TEST(VersionEditTest, EncodeDecode) { TestEncodeDecode(edit); } +TEST(VersionEditTest, EncodeDecodeExpiry) { + static const uint64_t kBig = 1ull << 25; + + VersionEdit edit; + for (int i = 0; i < 4; i++) { + TestEncodeDecode(edit, false); // only testing for s.ok() + edit.AddFile2(3, kBig + 300 + i, kBig + 400 + i, + InternalKey("foo", 700+i, kBig + 500 + i, kTypeValueExplicitExpiry), + InternalKey("zoo", 800+i, kBig + 600 + i, kTypeDeletion), + 10203040, + 123456789, + 987654321); + edit.DeleteFile(4, kBig + 700 + i); + edit.SetCompactPointer(i, InternalKey("x", 0, kBig + 900 + i, kTypeValue)); + } + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + TestEncodeDecode(edit, true); +} + } // namespace leveldb int main(int argc, char** argv) { diff --git a/src/leveldb/db/version_set.cc b/src/leveldb/db/version_set.cc index 2cb6d80ed..4a35306cc 100644 --- a/src/leveldb/db/version_set.cc +++ b/src/leveldb/db/version_set.cc @@ -12,48 +12,65 @@ #include "db/memtable.h" #include "db/table_cache.h" #include "leveldb/env.h" +#include "leveldb/expiry.h" #include "leveldb/table_builder.h" +#include "table/block.h" #include "table/merger.h" #include "table/two_level_iterator.h" #include "util/coding.h" +#include 
"util/db_list.h" +#include "util/hot_threads.h" #include "util/logging.h" +#include "util/mutexlock.h" +#include "util/thread_tasks.h" +#include "leveldb/perf_count.h" namespace leveldb { -static size_t TargetFileSize(const Options* options) { - return options->max_file_size; -} +// branch mv-level-work1, March 2013 +// +// Notes: +// +static struct +{ + uint64_t m_TargetFileSize; //!< mostly useless + uint64_t m_MaxGrandParentOverlapBytes; //!< needs tuning, but not essential + //!< since moves eliminated + int64_t m_ExpandedCompactionByteSizeLimit; //!< needs tuning -// Maximum bytes of overlaps in grandparent (i.e., level+2) before we -// stop building a single file in a level->level+1 compaction. -static int64_t MaxGrandParentOverlapBytes(const Options* options) { - return 10 * TargetFileSize(options); -} + // next two ignored if m_OverlappedFiles is true + uint64_t m_MaxBytesForLevel; //!< start write throttle above this + uint64_t m_DesiredBytesForLevel; //!< compact into next level until this -// Maximum number of bytes in all compacted files. We avoid expanding -// the lower level file set of a compaction if it would make the -// total compaction cover more than this many bytes. -static int64_t ExpandedCompactionByteSizeLimit(const Options* options) { - return 25 * TargetFileSize(options); -} + uint64_t m_MaxFileSizeForLevel; //!< google really applies this + //!< to file size of NEXT level + bool m_OverlappedFiles; //!< false means sst files are sorted + //!< and do not overlap +} gLevelTraits[config::kNumLevels]= -static double MaxBytesForLevel(const Options* options, int level) { - // Note: the result for level zero is not really used since we set - // the level-0 compaction threshold based on number of files. +// level-0 and level-1 create .sst table files that have overlapping key spaces. +// The compaction selection logic within VersionSet::Finalize() selects based +// upon file count, not accumulated file size. 
Write throttle is harsh if too +// many files accumulate. Timed grooming (if activated) adjusts the file +// count threshold by time since last compaction. +// level-2 is the "landing zone" / first sorted level. Try to keep it clear, +// hence the low m_DesiredBytes for level. +// level-2+: VersionSet::Finalize() selects compaction files when the +// total bytes for level exceeds m_DesiredBytesForLevel. Write throttle +// starts when total bytes exceeds m_MaxFileSizeForLevel. - // Result for both level-0 and level-1 - double result = 10. * 1048576.0; - while (level > 1) { - result *= 10; - level--; - } - return result; -} +// WARNING: m_OverlappedFiles flags need to match config::kNumOverlapFiles ... until unified +{ + {10485760, 262144000, 57671680, 209715200, 0, 420000000, true}, + {10485760, 82914560, 57671680, 419430400, 0, 209715200, true}, + {10485760, 314572800, 57671680, 3082813440, 200000000, 314572800, false}, + {10485760, 419430400, 57671680, 6442450944ULL, 4294967296ULL, 419430400, false}, + {10485760, 524288000, 57671680, 128849018880ULL, 85899345920ULL, 524288000, false}, + {10485760, 629145600, 57671680, 2576980377600ULL, 1717986918400ULL, 629145600, false}, + {10485760, 734003200, 57671680, 51539607552000ULL, 34359738368000ULL, 734003200, false} +}; -static uint64_t MaxFileSizeForLevel(const Options* options, int level) { - // We could vary per level to reduce number of files? 
- return TargetFileSize(options); -} +/// ULL above needed to compile on OSX 10.7.3 static int64_t TotalFileSize(const std::vector& files) { int64_t sum = 0; @@ -76,7 +93,12 @@ Version::~Version() { FileMetaData* f = files_[level][i]; assert(f->refs > 0); f->refs--; + if (f->refs <= 0) { + // clear Riak's double reference of overlapped files + if (vset_->IsLevelOverlapped(level)) + vset_->GetTableCache()->Evict(f->number, true); + delete f; } } @@ -143,7 +165,7 @@ bool SomeFileOverlapsRange( uint32_t index = 0; if (smallest_user_key != NULL) { // Find the earliest possible internal key for smallest_user_key - InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek); + InternalKey small(*smallest_user_key, 0, kMaxSequenceNumber, kValueTypeForSeek); index = FindFile(icmp, files, small.Encode()); } @@ -198,6 +220,7 @@ class Version::LevelFileNumIterator : public Iterator { assert(Valid()); EncodeFixed64(value_buf_, (*flist_)[index_]->number); EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size); + EncodeFixed32(value_buf_+16, (*flist_)[index_]->level); return Slice(value_buf_, sizeof(value_buf_)); } virtual Status status() const { return Status::OK(); } @@ -206,21 +229,22 @@ class Version::LevelFileNumIterator : public Iterator { const std::vector* const flist_; uint32_t index_; - // Backing store for value(). Holds the file number and size. - mutable char value_buf_[16]; + // Backing store for value(). Holds the file number and size (and level). 
+ mutable char value_buf_[20]; }; static Iterator* GetFileIterator(void* arg, const ReadOptions& options, const Slice& file_value) { TableCache* cache = reinterpret_cast(arg); - if (file_value.size() != 16) { + if (file_value.size() != 20) { return NewErrorIterator( Status::Corruption("FileReader invoked with unexpected value")); } else { return cache->NewIterator(options, DecodeFixed64(file_value.data()), - DecodeFixed64(file_value.data() + 8)); + DecodeFixed64(file_value.data() + 8), + DecodeFixed32(file_value.data() + 16)); } } @@ -233,22 +257,35 @@ Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, void Version::AddIterators(const ReadOptions& options, std::vector* iters) { - // Merge all level zero files together since they may overlap - for (size_t i = 0; i < files_[0].size(); i++) { - iters->push_back( - vset_->table_cache_->NewIterator( - options, files_[0][i]->number, files_[0][i]->file_size)); - } - // For levels > 0, we can use a concatenating iterator that sequentially - // walks through the non-overlapping files in the level, opening them - // lazily. - for (int level = 1; level < config::kNumLevels; level++) { - if (!files_[level].empty()) { - iters->push_back(NewConcatenatingIterator(options, level)); - } - } -} + int level; + + for (level=0; level < config::kNumLevels; ++level) + { + if (gLevelTraits[level].m_OverlappedFiles) + { + // Merge all level files together since they may overlap + for (size_t i = 0; i < files_[level].size(); i++) + { + iters->push_back( + vset_->table_cache_->NewIterator( + options, files_[level][i]->number, files_[level][i]->file_size, level)); + } // for + } // if + + else + { + // For sorted levels, we can use a concatenating iterator that sequentially + // walks through the non-overlapping files in the level, opening them + // lazily. 
+ if (!files_[level].empty()) + { + iters->push_back(NewConcatenatingIterator(options, level)); + } // if + } // else + } // for +} // Version::NewConcatenatingIterator + // Callback from TableCache::Get() namespace { @@ -261,77 +298,42 @@ enum SaverState { struct Saver { SaverState state; const Comparator* ucmp; + const Options* options; Slice user_key; - std::string* value; + Value* value; + const LookupKey * lookup; }; } -static void SaveValue(void* arg, const Slice& ikey, const Slice& v) { +static bool SaveValue(void* arg, const Slice& ikey, const Slice& v) { + bool match=false; + bool expired=false; Saver* s = reinterpret_cast(arg); ParsedInternalKey parsed_key; if (!ParseInternalKey(ikey, &parsed_key)) { s->state = kCorrupt; } else { if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { - s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted; + match=true; + if (NULL!=s->options && s->options->ExpiryActivated()) + expired=s->options->expiry_module->KeyRetirementCallback(parsed_key); + s->state = (parsed_key.type != kTypeDeletion && !expired) ? kFound : kDeleted; if (s->state == kFound) { s->value->assign(v.data(), v.size()); } + if (NULL!=s->lookup) + s->lookup->SetKeyMetaData(parsed_key); } } + return(match); } static bool NewestFirst(FileMetaData* a, FileMetaData* b) { return a->number > b->number; } -void Version::ForEachOverlapping(Slice user_key, Slice internal_key, - void* arg, - bool (*func)(void*, int, FileMetaData*)) { - // TODO(sanjay): Change Version::Get() to use this function. - const Comparator* ucmp = vset_->icmp_.user_comparator(); - - // Search level-0 in order from newest to oldest. 
- std::vector tmp; - tmp.reserve(files_[0].size()); - for (uint32_t i = 0; i < files_[0].size(); i++) { - FileMetaData* f = files_[0][i]; - if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 && - ucmp->Compare(user_key, f->largest.user_key()) <= 0) { - tmp.push_back(f); - } - } - if (!tmp.empty()) { - std::sort(tmp.begin(), tmp.end(), NewestFirst); - for (uint32_t i = 0; i < tmp.size(); i++) { - if (!(*func)(arg, 0, tmp[i])) { - return; - } - } - } - - // Search other levels. - for (int level = 1; level < config::kNumLevels; level++) { - size_t num_files = files_[level].size(); - if (num_files == 0) continue; - - // Binary search to find earliest index whose largest key >= internal_key. - uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key); - if (index < num_files) { - FileMetaData* f = files_[level][index]; - if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) { - // All of "f" is past any data for user_key - } else { - if (!(*func)(arg, level, f)) { - return; - } - } - } - } -} - Status Version::Get(const ReadOptions& options, const LookupKey& k, - std::string* value, + Value* value, GetStats* stats) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); @@ -354,8 +356,8 @@ Status Version::Get(const ReadOptions& options, // Get the list of files to search in this level FileMetaData* const* files = &files_[level][0]; - if (level == 0) { - // Level-0 files may overlap each other. Find all files that + if (gLevelTraits[level].m_OverlappedFiles) { + // Level files may overlap each other. Find all files that // overlap user_key and process them in order from newest to oldest. 
tmp.reserve(num_files); for (uint32_t i = 0; i < num_files; i++) { @@ -389,6 +391,9 @@ Status Version::Get(const ReadOptions& options, } } + if (0!=num_files) + gPerfCounters->Add(ePerfSearchLevel0 + level, num_files); + for (uint32_t i = 0; i < num_files; ++i) { if (last_file_read != NULL && stats->seek_file == NULL) { // We have had more than one seek for this read. Charge the 1st file. @@ -403,9 +408,11 @@ Status Version::Get(const ReadOptions& options, Saver saver; saver.state = kNotFound; saver.ucmp = ucmp; + saver.options = vset_->options_; saver.user_key = user_key; saver.value = value; - s = vset_->table_cache_->Get(options, f->number, f->file_size, + saver.lookup = &k; + s = vset_->table_cache_->Get(options, f->number, f->file_size, level, ikey, &saver, SaveValue); if (!s.ok()) { return s; @@ -429,6 +436,7 @@ Status Version::Get(const ReadOptions& options, } bool Version::UpdateStats(const GetStats& stats) { +#if 0 FileMetaData* f = stats.seek_file; if (f != NULL) { f->allowed_seeks--; @@ -438,44 +446,7 @@ bool Version::UpdateStats(const GetStats& stats) { return true; } } - return false; -} - -bool Version::RecordReadSample(Slice internal_key) { - ParsedInternalKey ikey; - if (!ParseInternalKey(internal_key, &ikey)) { - return false; - } - - struct State { - GetStats stats; // Holds first matching file - int matches; - - static bool Match(void* arg, int level, FileMetaData* f) { - State* state = reinterpret_cast(arg); - state->matches++; - if (state->matches == 1) { - // Remember first match. - state->stats.seek_file = f; - state->stats.seek_file_level = level; - } - // We can stop iterating once we have a second match. - return state->matches < 2; - } - }; - - State state; - state.matches = 0; - ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match); - - // Must have at least two matches since we want to merge across - // files. But what if we have a single file that contains many - // overwrites and deletions? 
Should we have another mechanism for - // finding such files? - if (state.matches >= 2) { - // 1MB cost is about 1 seek (see comment in Builder::Apply). - return UpdateStats(state.stats); - } +#endif return false; } @@ -494,36 +465,43 @@ void Version::Unref() { bool Version::OverlapInLevel(int level, const Slice* smallest_user_key, - const Slice* largest_user_key) { - return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level], + const Slice* largest_user_key) const { + return SomeFileOverlapsRange(vset_->icmp_, + !gLevelTraits[level].m_OverlappedFiles, + files_[level], smallest_user_key, largest_user_key); } int Version::PickLevelForMemTableOutput( const Slice& smallest_user_key, - const Slice& largest_user_key) { + const Slice& largest_user_key, + const int level_limit) { int level = 0; + +// test if level 1 m_OverlappedFiles is false, proceded only then if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) { // Push to next level if there is no overlap in next level, // and the #bytes overlapping in the level after that are limited. - InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey limit(largest_user_key, 0, static_cast(0)); + InternalKey start(smallest_user_key, 0, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey limit(largest_user_key, 0, 0, static_cast(0)); std::vector overlaps; - while (level < config::kMaxMemCompactLevel) { + while (level < level_limit) { if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } - if (level + 2 < config::kNumLevels) { - // Check that file does not overlap too many grandparent bytes. 
- GetOverlappingInputs(level + 2, &start, &limit, &overlaps); - const int64_t sum = TotalFileSize(overlaps); - if (sum > MaxGrandParentOverlapBytes(vset_->options_)) { - break; - } + GetOverlappingInputs(level + 2, &start, &limit, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > gLevelTraits[level].m_MaxGrandParentOverlapBytes) { + break; } level++; } + // do not waste a move into an overlapped level, breaks + // different performance improvement + if (gLevelTraits[level].m_OverlappedFiles) + level=0; } + return level; } @@ -533,44 +511,89 @@ void Version::GetOverlappingInputs( const InternalKey* begin, const InternalKey* end, std::vector* inputs) { - assert(level >= 0); - assert(level < config::kNumLevels); inputs->clear(); Slice user_begin, user_end; + + // overlap takes everything + bool test_inputs(!gLevelTraits[level].m_OverlappedFiles); + if (begin != NULL) { - user_begin = begin->user_key(); + user_begin = begin->user_key(); } if (end != NULL) { - user_end = end->user_key(); + user_end = end->user_key(); } + const Comparator* user_cmp = vset_->icmp_.user_comparator(); for (size_t i = 0; i < files_[level].size(); ) { FileMetaData* f = files_[level][i++]; const Slice file_start = f->smallest.user_key(); const Slice file_limit = f->largest.user_key(); - if (begin != NULL && user_cmp->Compare(file_limit, user_begin) < 0) { + if (test_inputs && begin != NULL && user_cmp->Compare(file_limit, user_begin) < 0) { // "f" is completely before specified range; skip it - } else if (end != NULL && user_cmp->Compare(file_start, user_end) > 0) { + } else if (test_inputs && end != NULL && user_cmp->Compare(file_start, user_end) > 0) { // "f" is completely after specified range; skip it } else { inputs->push_back(f); - if (level == 0) { - // Level-0 files may overlap each other. So check if the newly - // added file has expanded the range. If so, restart search. 
- if (begin != NULL && user_cmp->Compare(file_start, user_begin) < 0) { - user_begin = file_start; - inputs->clear(); - i = 0; - } else if (end != NULL && user_cmp->Compare(file_limit, user_end) > 0) { - user_end = file_limit; - inputs->clear(); - i = 0; - } - } } } } + +bool +Version::VerifyLevels( + int & level, // input / output for current level to inspect + InternalKey & begin, // output of lowest key in first overlapped file + InternalKey & end) // output of highest key in first overlapped file +{ + bool overlap_found; + const Comparator* user_cmp; + + overlap_found=false; + user_cmp = vset_->icmp_.user_comparator(); + + do + { + // test only levels that do not expect overlapped .sst files + if (!gLevelTraits[level].m_OverlappedFiles && 1& files = files_[level]; + size_t inner, outer; + + for (outer=0; outerlargest.user_key(); + + for (inner=outer+1; innersmallest.user_key(); + + // do files overlap? assumes vector sorted by "start" + if (user_cmp->Compare(inner_start, outer_limit) <= 0) + { + overlap_found=true; + begin=outer_meta->smallest; + end=outer_meta->largest; + } // if + } // for + } // for + } // if + + // current level is clean, move to next + if (!overlap_found) + ++level; + + // stopping before the last level. that needs much + // more support code ... later project + } while(!overlap_found && (level+1)new_files_[i].second); f->refs = 1; +#if 0 // We arrange to automatically compact this file after // a certain number of seeks. Let's assume: // (1) One seek costs 10ms @@ -701,6 +725,7 @@ class VersionSet::Builder { // of data before triggering a compaction. 
f->allowed_seeks = (f->file_size / 16384); if (f->allowed_seeks < 100) f->allowed_seeks = 100; +#endif levels_[level].deleted_files.erase(f->number); levels_[level].added_files->insert(f); @@ -740,11 +765,12 @@ class VersionSet::Builder { #ifndef NDEBUG // Make sure there is no overlap in levels > 0 - if (level > 0) { + if (!gLevelTraits[level].m_OverlappedFiles) { for (uint32_t i = 1; i < v->files_[level].size(); i++) { const InternalKey& prev_end = v->files_[level][i-1]->largest; const InternalKey& this_begin = v->files_[level][i]->smallest; - if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) { + if (vset_->icmp_.Compare(prev_end, this_begin) >= 0 + && !vset_->options_->is_repair) { fprintf(stderr, "overlapping ranges in same level %s vs. %s\n", prev_end.DebugString().c_str(), this_begin.DebugString().c_str()); @@ -761,7 +787,8 @@ class VersionSet::Builder { // File is deleted: do nothing } else { std::vector* files = &v->files_[level]; - if (level > 0 && !files->empty()) { + if (!gLevelTraits[level].m_OverlappedFiles && !files->empty() + && !vset_->options_->is_repair) { // Must not overlap assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest, f->smallest) < 0); @@ -789,11 +816,17 @@ VersionSet::VersionSet(const std::string& dbname, descriptor_file_(NULL), descriptor_log_(NULL), dummy_versions_(this), - current_(NULL) { + current_(NULL), + last_penalty_minutes_(0), + prev_write_penalty_(0) +{ AppendVersion(new Version(this)); } VersionSet::~VersionSet() { + // must remove second ref counter that keeps overlapped files locked + // table cache + current_->Unref(); assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty delete descriptor_log_; @@ -838,7 +871,6 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { builder.Apply(edit); builder.SaveTo(v); } - Finalize(v); // Initialize new descriptor log file if necessary by creating // a temporary file that contains a snapshot of the current version. 
@@ -850,45 +882,54 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { assert(descriptor_file_ == NULL); new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); edit->SetNextFile(next_file_number_); - s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); + s = env_->NewWritableFile(new_manifest_file, &descriptor_file_, 4*1024L); if (s.ok()) { descriptor_log_ = new log::Writer(descriptor_file_); s = WriteSnapshot(descriptor_log_); } } - // Unlock during expensive MANIFEST log write - { - mu->Unlock(); - - // Write new record to MANIFEST log - if (s.ok()) { - std::string record; - edit->EncodeTo(&record); - s = descriptor_log_->AddRecord(record); - if (s.ok()) { - s = descriptor_file_->Sync(); - } - if (!s.ok()) { - Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); - } - } - - // If we just created a new descriptor file, install it by writing a - // new CURRENT file that points to it. - if (s.ok() && !new_manifest_file.empty()) { - s = SetCurrentFile(env_, dbname_, manifest_file_number_); - } - - mu->Lock(); - } - // Install the new version + // matthewv Oct 2013 - this used to be after the MANIFEST write + // but overlapping compactions allow for a file to get lost + // if first does not post to version completely. if (s.ok()) { AppendVersion(v); log_number_ = edit->log_number_; prev_log_number_ = edit->prev_log_number_; - } else { + + // Unlock during expensive MANIFEST log write + { + mu->Unlock(); + + // but only one writer at a time + { + MutexLock lock(&manifest_mutex_); + // Write new record to MANIFEST log + if (s.ok()) { + std::string record; + edit->EncodeTo(&record, options_->ExpiryActivated()); + s = descriptor_log_->AddRecord(record); + if (s.ok()) { + s = descriptor_file_->Sync(); + } + } + + // If we just created a new descriptor file, install it by writing a + // new CURRENT file that points to it. 
+ if (s.ok() && !new_manifest_file.empty()) { + s = SetCurrentFile(env_, dbname_, manifest_file_number_); + } + } // manifest_lock_ + + mu->Lock(); + } + } + + // this used to be "else" clause to if(s.ok) + // moved on Oct 2013 + else + { delete v; if (!new_manifest_file.empty()) { delete descriptor_log_; @@ -902,7 +943,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { return s; } -Status VersionSet::Recover(bool *save_manifest) { +Status VersionSet::Recover() { struct LogReporter : public log::Reader::Reporter { Status* status; virtual void Corruption(size_t bytes, const Status& s) { @@ -951,7 +992,7 @@ Status VersionSet::Recover(bool *save_manifest) { if (edit.has_comparator_ && edit.comparator_ != icmp_.user_comparator()->Name()) { s = Status::InvalidArgument( - edit.comparator_ + " does not match existing comparator ", + edit.comparator_ + "does not match existing comparator ", icmp_.user_comparator()->Name()); } } @@ -1005,99 +1046,368 @@ Status VersionSet::Recover(bool *save_manifest) { Version* v = new Version(this); builder.SaveTo(v); // Install recovered version - Finalize(v); AppendVersion(v); manifest_file_number_ = next_file; next_file_number_ = next_file + 1; last_sequence_ = last_sequence; log_number_ = log_number; prev_log_number_ = prev_log_number; - - // See if we can reuse the existing MANIFEST file. 
- if (ReuseManifest(dscname, current)) { - // No need to save new manifest - } else { - *save_manifest = true; - } } return s; } -bool VersionSet::ReuseManifest(const std::string& dscname, - const std::string& dscbase) { - if (!options_->reuse_logs) { - return false; - } - FileType manifest_type; - uint64_t manifest_number; - uint64_t manifest_size; - if (!ParseFileName(dscbase, &manifest_number, &manifest_type) || - manifest_type != kDescriptorFile || - !env_->GetFileSize(dscname, &manifest_size).ok() || - // Make new compacted MANIFEST if old one is too big - manifest_size >= TargetFileSize(options_)) { - return false; - } - - assert(descriptor_file_ == NULL); - assert(descriptor_log_ == NULL); - Status r = env_->NewAppendableFile(dscname, &descriptor_file_); - if (!r.ok()) { - Log(options_->info_log, "Reuse MANIFEST: %s\n", r.ToString().c_str()); - assert(descriptor_file_ == NULL); - return false; - } - - Log(options_->info_log, "Reusing MANIFEST %s\n", dscname.c_str()); - descriptor_log_ = new log::Writer(descriptor_file_, manifest_size); - manifest_file_number_ = manifest_number; - return true; -} - void VersionSet::MarkFileNumberUsed(uint64_t number) { if (next_file_number_ <= number) { next_file_number_ = number + 1; } } -void VersionSet::Finalize(Version* v) { - // Precomputed best level for next compaction - int best_level = -1; - double best_score = -1; - for (int level = 0; level < config::kNumLevels-1; level++) { - double score; - if (level == 0) { - // We treat level-0 specially by bounding the number of files - // instead of number of bytes for two reasons: - // - // (1) With larger write-buffer sizes, it is nice not to do too - // many level-0 compactions. - // - // (2) The files in level-0 are merged on every read and - // therefore we wish to avoid too many files when the individual - // file size is small (perhaps because of a small write-buffer - // setting, or very high compression ratios, or lots of - // overwrites/deletions). 
- score = v->files_[level].size() / - static_cast(config::kL0_CompactionTrigger); - } else { - // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]); - score = - static_cast(level_bytes) / MaxBytesForLevel(options_, level); - } +bool +VersionSet::NeighborCompactionsQuiet(int level) +{ + uint64_t parent_level_bytes(0); - if (score > best_score) { - best_level = level; - best_score = score; - } - } + if (level < config::kNumLevels-1) + parent_level_bytes = TotalFileSize(current_->files_[level+1]); + + // not an overlapped level and must not have compactions + // scheduled on either level below or level above + return((0==level || !m_CompactionStatus[level-1].m_Submitted) + && !gLevelTraits[level].m_OverlappedFiles + && (level==config::kNumLevels-1 + || (!m_CompactionStatus[level+1].m_Submitted + && parent_level_bytes<=((gLevelTraits[level+1].m_MaxBytesForLevel + +gLevelTraits[level+1].m_DesiredBytesForLevel)/2)))); +} // VersionSet::NeighborCompactionsQuiet + + +bool +VersionSet::Finalize(Version* v) +{ + // Riak: looking for first compaction needed in level order + int best_level = -1; + double best_score = -1; + bool compaction_found; + bool is_grooming, no_move, expire_file; + uint64_t micros_now; + + compaction_found=false; + is_grooming=false; + no_move=false; + expire_file=false; + micros_now=env_->NowMicros(); + + // Note: level kNumLevels-1 only examined for whole file expiry + for (int level = v->compaction_level_+1; level < config::kNumLevels && !compaction_found; ++level) + { + bool compact_ok; + double score(0); + uint64_t parent_level_bytes(0); + + is_grooming=false; + // is this level eligible for compaction consideration? 
+ compact_ok=!m_CompactionStatus[level].m_Submitted; + + // not already scheduled for compaction + if (compact_ok) + { + if (level < (config::kNumLevels-1)) + parent_level_bytes = TotalFileSize(v->files_[level+1]); + + // is overlapped and so is next level + if (gLevelTraits[level].m_OverlappedFiles && gLevelTraits[level+1].m_OverlappedFiles) + { + // good ... stop consideration + } // if + + // overlapped and next level is not compacting + else if (gLevelTraits[level].m_OverlappedFiles && !m_CompactionStatus[level+1].m_Submitted + && (parent_level_bytes<=gLevelTraits[level+1].m_DesiredBytesForLevel + || config::kL0_CompactionTrigger <= v->files_[level].size())) + { + // good ... stop consideration + } // else if + + else + { + // must not have compactions scheduled on neither level below nor level above + compact_ok=NeighborCompactionsQuiet(level); + } // else + } // if + + // consider this level + if (compact_ok) + { + size_t grooming_trigger; + uint64_t elapsed_micros; + + // some platforms use gettimeofday() which can move backward + if ( m_CompactionStatus[level].m_LastCompaction < micros_now + && 0 != m_CompactionStatus[level].m_LastCompaction) + elapsed_micros=micros_now - m_CompactionStatus[level].m_LastCompaction; + else + elapsed_micros=0; + + // reevaluating timed grooming ... seems to crush caching + // this disables the code but leaves it in place for future + // reuse after block cache flushing impact addressed + elapsed_micros=0; + + // which grooming trigger point? based upon how long + // since last compaction on this level + // - less than 10 minutes? + if (elapsed_micros < config::kL0_Grooming10minMicros) + grooming_trigger=config::kL0_GroomingTrigger; + + // - less than 20 minutes? 
+ else if (elapsed_micros < config::kL0_Grooming20minMicros) + grooming_trigger=config::kL0_GroomingTrigger10min; + + // - more than 20 minutes + else + grooming_trigger=config::kL0_GroomingTrigger20min; + + if (gLevelTraits[level].m_OverlappedFiles) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + score=0; + + // score of 1 at compaction trigger, incrementing for each thereafter + if ( config::kL0_CompactionTrigger <= v->files_[level].size()) + score += v->files_[level].size() - config::kL0_CompactionTrigger +1; + + is_grooming=false; + + // early overlapped compaction + // only occurs if no other compactions running on groomer thread + // (no grooming if landing level is still overloaded) + if (0==score && grooming_trigger<=v->files_[level].size() + && 2GetDBCount(false) // for non-Riak use cases, helps throughput + && (uint64_t)TotalFileSize(v->files_[config::kNumOverlapLevels]) + < gLevelTraits[config::kNumOverlapLevels].m_DesiredBytesForLevel) + { + // secondary test, don't push too much to next Overlap too soon + if (!gLevelTraits[level+1].m_OverlappedFiles + || v->files_[level+1].size()<=config::kL0_CompactionTrigger) + { + score=1; + is_grooming=true; + } // if + } // if + } // if + + // highest level, kNumLevels-1, only considered for expiry not compaction + else if (level < config::kNumLevels-1) { + // Compute the ratio of current size to size limit. 
+ const uint64_t level_bytes = TotalFileSize(v->files_[level]); + score = static_cast(level_bytes) / gLevelTraits[level].m_DesiredBytesForLevel; + is_grooming=(level_bytes < gLevelTraits[level].m_MaxFileSizeForLevel); + + // force landing level to not be grooming ... ever + if (gLevelTraits[level-1].m_OverlappedFiles) + is_grooming=false; + + // within size constraints, are there any deletes worthy of consideration + // (must not do this on overlapped levels. causes huge throughput problems + // on heavy loads) + if (score < 1 && 0!=options_->delete_threshold) + { + Version::FileMetaDataVector_t::iterator it; + + for (it=v->files_[level].begin(); + v->files_[level].end()!=it && !compaction_found; + ++it) + { + // if number of tombstones in stats exceeds threshold, + // we have a compaction candidate + if (options_->delete_threshold <= GetTableCache()->GetStatisticValue((*it)->number, eSstCountDeleteKey)) + { + compaction_found=true; + best_level=level; + best_score=0; + v->file_to_compact_=*it; + v->file_to_compact_level_=level; + is_grooming=true; + no_move=true; + } + } // for + } // if + } // else + + // this code block is old, should be rewritten + if (1<=score) + { + best_level = level; + best_score = score; + compaction_found=true; + } // if + + // finally test for expiry if no compaction candidates + if (!compaction_found && options_->ExpiryActivated()) + { + compaction_found=options_->expiry_module->CompactionFinalizeCallback(false, + *v, + level, + NULL); + + if (compaction_found) + { + best_level=level; + best_score=0; + is_grooming=false; + no_move=true; + expire_file=true; + v->file_to_compact_level_=level; + } // if + } // if + } // if + } // for + + // set (almost) all at once to ensure + // no hold over from prior Finalize() call on this version. 
+ // (could rewrite cleaner by doing reset of these at top of function) + v->compaction_level_ = best_level; + v->compaction_score_ = best_score; + v->compaction_grooming_ = is_grooming; + v->compaction_no_move_ = no_move; + v->compaction_expirefile_ = expire_file; + + return(compaction_found); + +} // VersionSet::Finalize + + +/** + * UpdatePenalty was previous part of Finalize(). It is now + * an independent routine dedicated to setting the penalty + * value used within the WriteThrottle calculations. + * + * Penalty is an estimate of how many compactions/keys of work + * are overdue. + */ +void +VersionSet::UpdatePenalty( + Version* v) +{ + int penalty=0; + + for (int level = 0; level < config::kNumLevels-1; ++level) + { + int loop, count, value; + + value=0; + count=0; + + if (gLevelTraits[level].m_OverlappedFiles) + { + + // compute penalty for write throttle if too many Level-0 files accumulating + if (config::kL0_SlowdownWritesTrigger < v->NumFiles(level)) + { + // assume each overlapped file represents another pass at same key + // and we are "close" on compaction backlog + if ( v->NumFiles(level) < config::kL0_SlowdownWritesTrigger) + { + // this code block will not execute due both "if"s using same values now + value = 1; + count = 0; + } // if + + // no longer estimating work, now trying to throw on the breaks + // to keep leveldb from stalling + else + { + count=(v->NumFiles(level) - config::kL0_SlowdownWritesTrigger); + + // level 0 has own thread pool and will stall writes, + // heavy penalty + if (0==level) + { // non-linear penalty + value=2; + } // if + else + { // slightly less penalty + value=1; + } // else + } // else + } // if + } // if + else + { + const uint64_t level_bytes = TotalFileSize(v->GetFileList(level)); + + // how dire is the situation + count=(int)(static_cast(level_bytes) / gLevelTraits[level].m_MaxBytesForLevel); + + if (0write_buffer_size; + value+=1; + } // if + + // this penalty is about reducing write amplification, its + // 
side effect is to also improve compaction performance across + // the level 1 to 2 to 3 boundry. + else if (config::kNumOverlapLevels==level + && gLevelTraits[level].m_DesiredBytesForLevel < level_bytes) + { + // this approximates the number of compactions needed, no other penalty + value=(int)(static_cast(level_bytes-gLevelTraits[level].m_DesiredBytesForLevel) / options_->write_buffer_size); + + // how urgent is the need to clear this level before next flood + // (negative value is ignored) + count= v->NumFiles(level-1) - (config::kL0_CompactionTrigger/2); + + // only throttle if backlog on the horizon + if (count < 0) + value=0; + } // else if + + } // else + + penalty+=value; + + } // for + + // put a ceiling on the value + if (1000write_penalty_=prev_write_penalty_; + + return; + +} // VersionSet::UpdatePenalty - v->compaction_level_ = best_level; - v->compaction_score_ = best_score; -} Status VersionSet::WriteSnapshot(log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? 
@@ -1120,21 +1430,46 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { const std::vector& files = current_->files_[level]; for (size_t i = 0; i < files.size(); i++) { const FileMetaData* f = files[i]; - edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest); + edit.AddFile2(level, f->number, f->file_size, f->smallest, f->largest, + f->exp_write_low, f->exp_write_high, f->exp_explicit_high); } } std::string record; - edit.EncodeTo(&record); + edit.EncodeTo(&record, options_->ExpiryActivated()); return log->AddRecord(record); } -int VersionSet::NumLevelFiles(int level) const { +size_t VersionSet::NumLevelFiles(int level) const { assert(level >= 0); assert(level < config::kNumLevels); return current_->files_[level].size(); } +bool VersionSet::IsLevelOverlapped(int level) { + assert(level >= 0); + assert(level < config::kNumLevels); + return(gLevelTraits[level].m_OverlappedFiles); +} + +uint64_t VersionSet::DesiredBytesForLevel(int level) { + assert(level >= 0); + assert(level < config::kNumLevels); + return(gLevelTraits[level].m_DesiredBytesForLevel); +} + +uint64_t VersionSet::MaxBytesForLevel(int level) { + assert(level >= 0); + assert(level < config::kNumLevels); + return(gLevelTraits[level].m_MaxBytesForLevel); +} + +uint64_t VersionSet::MaxFileSizeForLevel(int level) { + assert(level >= 0); + assert(level < config::kNumLevels); + return(gLevelTraits[level].m_MaxFileSizeForLevel); +} + const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { // Update code if kNumLevels changes assert(config::kNumLevels == 7); @@ -1150,6 +1485,22 @@ const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { return scratch->buffer; } +const char* VersionSet::CompactionSummary(LevelSummaryStorage* scratch) const { + // Update code if kNumLevels changes + assert(config::kNumLevels == 7); + snprintf(scratch->buffer, sizeof(scratch->buffer), + "files[ %d,%d %d,%d %d,%d %d,%d %d,%d %d,%d %d,%d ]", + 
m_CompactionStatus[0].m_Submitted, m_CompactionStatus[0].m_Running, + m_CompactionStatus[1].m_Submitted, m_CompactionStatus[1].m_Running, + m_CompactionStatus[2].m_Submitted, m_CompactionStatus[2].m_Running, + m_CompactionStatus[3].m_Submitted, m_CompactionStatus[3].m_Running, + m_CompactionStatus[4].m_Submitted, m_CompactionStatus[4].m_Running, + m_CompactionStatus[5].m_Submitted, m_CompactionStatus[5].m_Running, + m_CompactionStatus[6].m_Submitted, m_CompactionStatus[6].m_Running); + + return scratch->buffer; +} + uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; for (int level = 0; level < config::kNumLevels; level++) { @@ -1160,8 +1511,8 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { result += files[i]->file_size; } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { // Entire file is after "ikey", so ignore - if (level > 0) { - // Files other than level 0 are sorted by meta->smallest, so + if (!gLevelTraits[level].m_OverlappedFiles) { + // Non-overlapped files are sorted by meta->smallest, so // no further files in this level will contain data for // "ikey". break; @@ -1171,7 +1522,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { // approximate offset of "ikey" within the table. 
Table* tableptr; Iterator* iter = table_cache_->NewIterator( - ReadOptions(), files[i]->number, files[i]->file_size, &tableptr); + ReadOptions(), files[i]->number, files[i]->file_size, level, &tableptr); if (tableptr != NULL) { result += tableptr->ApproximateOffsetOf(ikey.Encode()); } @@ -1257,22 +1608,35 @@ void VersionSet::GetRange2(const std::vector& inputs1, Iterator* VersionSet::MakeInputIterator(Compaction* c) { ReadOptions options; - options.verify_checksums = options_->paranoid_checks; + options.verify_checksums = options_->verify_compactions; options.fill_cache = false; + options.is_compaction = true; + options.info_log = options_->info_log; + options.dbname = dbname_; + options.env = env_; + + int which_limit, space; // Level-0 files have to be merged together. For other levels, // we will make a concatenating iterator per level. // TODO(opt): use concatenating iterator for level-0 if there is no overlap - const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2); + // (during a repair, all levels use merge iterator as a precaution) + if (!options_->is_repair) + space = (gLevelTraits[c->level()].m_OverlappedFiles ? c->inputs_[0].size() + 1 : 2); + else + space = c->inputs_[0].size() + c->inputs_[1].size(); + Iterator** list = new Iterator*[space]; int num = 0; - for (int which = 0; which < 2; which++) { + + which_limit=gLevelTraits[c->level()+1].m_OverlappedFiles ? 
1 : 2; + for (int which = 0; which < which_limit; which++) { if (!c->inputs_[which].empty()) { - if (c->level() + which == 0) { + if (gLevelTraits[c->level() + which].m_OverlappedFiles || options_->is_repair) { const std::vector& files = c->inputs_[which]; for (size_t i = 0; i < files.size(); i++) { list[num++] = table_cache_->NewIterator( - options, files[i]->number, files[i]->file_size); + options, files[i]->number, files[i]->file_size, c->level() + which); } } else { // Create concatenating iterator for the files from this level @@ -1288,58 +1652,126 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { return result; } -Compaction* VersionSet::PickCompaction() { + +/** + * PickCompactions() directly feeds hot_thread pools as of October 2013 + */ +void +VersionSet::PickCompaction( + class DBImpl * db_impl) +{ Compaction* c; int level; - // We prefer compactions triggered by too much data in a level over - // the compactions triggered by seeks. - const bool size_compaction = (current_->compaction_score_ >= 1); - const bool seek_compaction = (current_->file_to_compact_ != NULL); - if (size_compaction) { - level = current_->compaction_level_; - assert(level >= 0); - assert(level+1 < config::kNumLevels); - c = new Compaction(options_, level); + // perform this once per call ... 
since Finalize now loops + UpdatePenalty(current_); - // Pick the first file that comes after compact_pointer_[level] - for (size_t i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (compact_pointer_[level].empty() || - icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { - c->inputs_[0].push_back(f); - break; + // submit a work object for every valid compaction needed + current_->compaction_level_=-1; + while(Finalize(current_)) + { + bool submit_flag; + + Log(options_->info_log,"Finalize level: %d, grooming %d", + current_->compaction_level_, current_->compaction_grooming_); + + c=NULL; + + // We prefer compactions triggered by too much data in a level over + // the compactions triggered by seeks. (Riak redefines "seeks" to + // "files containing delete tombstones") + const bool size_compaction = (current_->compaction_score_ >= 1); + const bool seek_compaction = (current_->file_to_compact_ != NULL); + if (size_compaction) + { + level = current_->compaction_level_; + assert(level >= 0); + assert(level+1 < config::kNumLevels); + + c = new Compaction(level); + + // Pick the first file that comes after compact_pointer_[level] + for (size_t i = 0; i < current_->files_[level].size(); i++) { + FileMetaData* f = current_->files_[level][i]; + if (compact_pointer_[level].empty() || + icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { + c->inputs_[0].push_back(f); + break; + } + } + if (c->inputs_[0].empty()) { + // Wrap-around to the beginning of the key space + c->inputs_[0].push_back(current_->files_[level][0]); + } + } else if (seek_compaction) { + level = current_->file_to_compact_level_; + c = new Compaction(level); + c->inputs_[0].push_back(current_->file_to_compact_); + } else if (current_->compaction_expirefile_) { + level = current_->file_to_compact_level_; + c = new Compaction(level); + c->compaction_type_=kExpiryFileCompaction; + } else { + return; } - } - if 
(c->inputs_[0].empty()) { - // Wrap-around to the beginning of the key space - c->inputs_[0].push_back(current_->files_[level][0]); - } - } else if (seek_compaction) { - level = current_->file_to_compact_level_; - c = new Compaction(options_, level); - c->inputs_[0].push_back(current_->file_to_compact_); - } else { - return NULL; - } - c->input_version_ = current_; - c->input_version_->Ref(); + c->input_version_ = current_; + c->input_version_->Ref(); + c->no_move_ = current_->compaction_no_move_; - // Files in level 0 may overlap each other, so pick up all overlapping ones - if (level == 0) { - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - // Note that the next call will discard the file we placed in - // c->inputs_[0] earlier and replace it with an overlapping set - // which will include the picked file. - current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); - assert(!c->inputs_[0].empty()); - } + // set submitted as race defense + m_CompactionStatus[level].m_Submitted=true; - SetupOtherInputs(c); + if (!current_->compaction_expirefile_) + { + // m_OverlappedFiles==true levels have files that + // may overlap each other, so pick up all overlapping ones + if (gLevelTraits[level].m_OverlappedFiles) { + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. 
+ current_->GetOverlappingInputs(level, &smallest, &largest, &c->inputs_[0]); + assert(!c->inputs_[0].empty()); - return c; + // this can get into tens of thousands after a repair + // keep it sane + size_t max_open_files=100; // previously an options_ member variable + if (max_open_files < c->inputs_[0].size()) + { + std::nth_element(c->inputs_[0].begin(), + c->inputs_[0].begin()+max_open_files-1, + c->inputs_[0].end(),FileMetaDataPtrCompare(options_->comparator)); + c->inputs_[0].erase(c->inputs_[0].begin()+max_open_files, + c->inputs_[0].end()); + } // if + } // if + + SetupOtherInputs(c); + + ThreadTask * task=new CompactionTask(db_impl, c); + + if (0==level) + submit_flag=gLevel0Threads->Submit(task, !current_->compaction_grooming_); + else + submit_flag=gCompactionThreads->Submit(task, !current_->compaction_grooming_); + } // if + + // expiry compaction + else + { + ThreadTask * task=new CompactionTask(db_impl, c); + submit_flag=gCompactionThreads->Submit(task, true); + } // else + + // set/reset submitted based upon truth of queuing + // (ref counting will auto delete task rejected) + m_CompactionStatus[level].m_Submitted=submit_flag; + + } // while + + return; } void VersionSet::SetupOtherInputs(Compaction* c) { @@ -1347,53 +1779,79 @@ void VersionSet::SetupOtherInputs(Compaction* c) { InternalKey smallest, largest; GetRange(c->inputs_[0], &smallest, &largest); - current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]); + if (!gLevelTraits[level+1].m_OverlappedFiles) + { + current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]); - // Get entire range covered by compaction - InternalKey all_start, all_limit; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + // Get entire range covered by compaction + InternalKey all_start, all_limit; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - // See if we can grow the number of inputs in "level" without - // changing the number of "level+1" 
files we pick up. - if (!c->inputs_[1].empty()) { - std::vector expanded0; - current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0); - const int64_t inputs0_size = TotalFileSize(c->inputs_[0]); - const int64_t inputs1_size = TotalFileSize(c->inputs_[1]); - const int64_t expanded0_size = TotalFileSize(expanded0); - if (expanded0.size() > c->inputs_[0].size() && - inputs1_size + expanded0_size < - ExpandedCompactionByteSizeLimit(options_)) { - InternalKey new_start, new_limit; - GetRange(expanded0, &new_start, &new_limit); - std::vector expanded1; - current_->GetOverlappingInputs(level+1, &new_start, &new_limit, - &expanded1); - if (expanded1.size() == c->inputs_[1].size()) { - Log(options_->info_log, - "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n", - level, - int(c->inputs_[0].size()), - int(c->inputs_[1].size()), - long(inputs0_size), long(inputs1_size), - int(expanded0.size()), - int(expanded1.size()), - long(expanded0_size), long(inputs1_size)); - smallest = new_start; - largest = new_limit; - c->inputs_[0] = expanded0; - c->inputs_[1] = expanded1; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + // See if we can grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. 
+ if (!c->inputs_[1].empty()) { + std::vector expanded0; + current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0); + //const int64_t inputs0_size = TotalFileSize(c->inputs_[0]); + const int64_t inputs1_size = TotalFileSize(c->inputs_[1]); + const int64_t expanded0_size = TotalFileSize(expanded0); + if (expanded0.size() > c->inputs_[0].size() && + inputs1_size + expanded0_size < gLevelTraits[level].m_ExpandedCompactionByteSizeLimit) { + InternalKey new_start, new_limit; + GetRange(expanded0, &new_start, &new_limit); + std::vector expanded1; + current_->GetOverlappingInputs(level+1, &new_start, &new_limit, + &expanded1); + if (expanded1.size() == c->inputs_[1].size()) { +#if 0 // mutex_ held + Log(options_->info_log, + "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n", + level, + int(c->inputs_[0].size()), + int(c->inputs_[1].size()), + long(inputs0_size), long(inputs1_size), + int(expanded0.size()), + int(expanded1.size()), + long(expanded0_size), long(inputs1_size)); +#endif + smallest = new_start; + largest = new_limit; + c->inputs_[0] = expanded0; + c->inputs_[1] = expanded1; + GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + } + } } - } - } - // Compute the set of grandparent files that overlap this compaction - // (parent == level+1; grandparent == level+2) - if (level + 2 < config::kNumLevels) { - current_->GetOverlappingInputs(level + 2, &all_start, &all_limit, - &c->grandparents_); - } + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (level + 2 < config::kNumLevels) { + current_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); + } + } // if +#if 1 + // compacting into an overlapped layer + else + { + // if this is NOT a repair (or panic) situation, take all files + // to reduce write amplification + if (c->inputs_[0].size()<=config::kL0_StopWritesTrigger + && 
c->inputs_[0].size()!=current_->files_[level].size()) + { + c->inputs_[0].clear(); + c->inputs_[0].reserve(current_->files_[level].size()); + + for (size_t i = 0; i < current_->files_[level].size(); ++i ) + { + FileMetaData* f = current_->files_[level][i]; + c->inputs_[0].push_back(f); + } // for + + GetRange(c->inputs_[0], &smallest, &largest); + } // if + } // else +#endif if (false) { Log(options_->info_log, "Compacting %d '%s' .. '%s'", @@ -1421,23 +1879,18 @@ Compaction* VersionSet::CompactRange( } // Avoid compacting too much in one shot in case the range is large. - // But we cannot do this for level-0 since level-0 files can overlap - // and we must not pick one file and drop another older file if the - // two files overlap. - if (level > 0) { - const uint64_t limit = MaxFileSizeForLevel(options_, level); - uint64_t total = 0; - for (size_t i = 0; i < inputs.size(); i++) { - uint64_t s = inputs[i]->file_size; - total += s; - if (total >= limit) { - inputs.resize(i + 1); - break; - } + const uint64_t limit = gLevelTraits[level].m_MaxFileSizeForLevel; + uint64_t total = 0; + for (size_t i = 0; i < inputs.size(); i++) { + uint64_t s = inputs[i]->file_size; + total += s; + if (total >= limit) { + inputs.resize(i + 1); + break; } } - Compaction* c = new Compaction(options_, level); + Compaction* c = new Compaction(level); c->input_version_ = current_; c->input_version_->Ref(); c->inputs_[0] = inputs; @@ -1445,13 +1898,21 @@ Compaction* VersionSet::CompactRange( return c; } -Compaction::Compaction(const Options* options, int level) + +Compaction::Compaction(int level) : level_(level), - max_output_file_size_(MaxFileSizeForLevel(options, level)), + max_output_file_size_(gLevelTraits[level].m_MaxFileSizeForLevel), input_version_(NULL), + compaction_type_(kNormalCompaction), grandparent_index_(0), seen_key_(false), - overlapped_bytes_(0) { + overlapped_bytes_(0), + tot_user_data_(0), tot_index_keys_(0), + avg_value_size_(0), avg_key_size_(0), avg_block_size_(0), + 
compressible_(true), + stats_done_(false), + no_move_(false) + { for (int i = 0; i < config::kNumLevels; i++) { level_ptrs_[i] = 0; } @@ -1464,13 +1925,24 @@ Compaction::~Compaction() { } bool Compaction::IsTrivialMove() const { - const VersionSet* vset = input_version_->vset_; // Avoid a move if there is lots of overlapping grandparent data. // Otherwise, the move could create a parent file that will require // a very expensive merge later on. - return (num_input_files(0) == 1 && num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= - MaxGrandParentOverlapBytes(vset->options_)); +#if 1 + return (!gLevelTraits[level_].m_OverlappedFiles && + IsMoveOk() && + num_input_files(0) == 1 && + num_input_files(1) == 0 && + (uint64_t)TotalFileSize(grandparents_) <= gLevelTraits[level_].m_MaxGrandParentOverlapBytes); +#else + // removed this functionality when creating gLevelTraits[].m_OverlappedFiles + // flag. "Move" was intented by Google to delay compaction by moving small + // files in-between non-overlapping sorted files. New concept is to delay + // all compactions by creating larger log files before starting to thrash + // disk by maintaining smaller sorted files. Less thrash -> higher throughput + return(false); +#endif + } void Compaction::AddInputDeletions(VersionEdit* edit) { @@ -1482,47 +1954,76 @@ void Compaction::AddInputDeletions(VersionEdit* edit) { } bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - // Maybe use binary search to find right entry instead of linear search? 
- const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { - const std::vector& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; + bool ret_flag; + + ret_flag=true; + + if (gLevelTraits[level_].m_OverlappedFiles + || gLevelTraits[level_+1].m_OverlappedFiles) + { + ret_flag=false; + } // if + else + { + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } } - break; - } - level_ptrs_[lvl]++; - } - } - return true; + } // else + + return ret_flag; } -bool Compaction::ShouldStopBefore(const Slice& internal_key) { - const VersionSet* vset = input_version_->vset_; - // Scan to find earliest grandparent file that contains key. 
- const InternalKeyComparator* icmp = &vset->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(internal_key, - grandparents_[grandparent_index_]->largest.Encode()) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - grandparent_index_++; - } - seen_key_ = true; +bool Compaction::ShouldStopBefore(const Slice& internal_key, size_t key_count) { - if (overlapped_bytes_ > MaxGrandParentOverlapBytes(vset->options_)) { - // Too much overlap for current output; start new output + bool ret_flag(false); + + // This is a look ahead to see how costly this key will make the subsequent compaction + // of this new file to the next higher level. Start a new file if the cost is high. + if (!gLevelTraits[level()+1].m_OverlappedFiles) + { + // Scan to find earliest grandparent file that contains key. + const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > gLevelTraits[level_].m_MaxGrandParentOverlapBytes) { + // Too much overlap for current output; start new output + ret_flag=true; + } // if + + // Second consideration: sorted files need to keep the bloom filter size controlled + // to meet file open speed goals + else + { + ret_flag=(300000number, eSstCountBlocks) + >tables.GetStatisticValue(fmd->number, eSstCountCompressAborted)) + || 0==tables.GetStatisticValue(fmd->number, eSstCountBlocks); + + // block sizing algorithm + temp=0; + temp_cnt=0; + user_est=0; + idx_est=0; + + // get and hold handle to cache entry + s=tables.TEST_FindTable(fmd->number, fmd->file_size, fmd->level, &handle); + + if (s.ok()) + { + // 1. 
total size of all blocks before compaction + temp=tables.GetStatisticValue(fmd->number, eSstCountBlockSize); + + // estimate size when counter does not exist + if (0==temp) + { + TableAndFile * tf; + + tf=reinterpret_cast(tables.TEST_GetInternalCache()->Value(handle)); + if (tf->table->TableObjectSize() < fmd->file_size) + temp=fmd->file_size - tf->table->TableObjectSize(); + } // if + + user_est=temp; + tot_user_data_+=temp; + + // 2. total keys in the indexes + temp=tables.GetStatisticValue(fmd->number, eSstCountIndexKeys); + + // estimate total when counter does not exist + if (0==temp) + { + TableAndFile * tf; + Block * index_block; + + tf=reinterpret_cast(tables.TEST_GetInternalCache()->Value(handle)); + index_block=tf->table->TEST_GetIndexBlock(); + temp=index_block->NumRestarts(); + } // if + + idx_est=temp; + tot_index_keys_+=temp; + + // 3. average size of values in input set + // (value is really size of value plus size of key) + temp=tables.GetStatisticValue(fmd->number, eSstCountValueSize); + temp+=tables.GetStatisticValue(fmd->number, eSstCountKeySize); + temp_cnt=tables.GetStatisticValue(fmd->number, eSstCountKeys); + + // estimate total when counter does not exist + if (0==temp || 0==temp_cnt) + { + // no way to estimate total key count + // (ok, could try from bloom filter size ... but likely no + // bloom filter if no stats) + temp=0; + temp_cnt=0; + } // if + + avg_value_size_+=temp; + value_count+=temp_cnt; + + // 4. average key size + temp=tables.GetStatisticValue(fmd->number, eSstCountKeySize); + temp_cnt=tables.GetStatisticValue(fmd->number, eSstCountKeys); + + // estimate total when counter does not exist + if (0==temp || 0==temp_cnt) + { + // no way to estimate total key count + // (ok, could try from bloom filter size ... but likely no + // bloom filter if no stats) + temp=0; + temp_cnt=0; + } // if + + avg_key_size_+=temp; + key_count+=temp_cnt; + + // 5. 
block key size + temp=tables.GetStatisticValue(fmd->number, eSstCountBlockSizeUsed); + temp_cnt=tables.GetStatisticValue(fmd->number, eSstCountBlocks); + temp*=temp_cnt; + + // estimate total when counter does not exist + if (0==temp || 0==temp_cnt) + { + temp=user_est; + temp_cnt=idx_est; + } // if + + avg_block_size_+=temp; + block_count+=temp_cnt; + + // cleanup + tables.Release(handle); + } // if + } // for + + // compute averages + if (0!=value_count) + avg_value_size_/=value_count; + else + avg_value_size_=0; + + if (0!=key_count) + avg_key_size_/=key_count; + else + avg_key_size_=0; + + if (0!=block_count) + avg_block_size_/=block_count; + else + avg_block_size_=0; + + // only want to do this once per compaction + stats_done_=true; + } // if + + return; + +} // Compaction::CalcInputStats + + +} // namespace leveldb diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h index 7935a965a..477c3d8d4 100644 --- a/src/leveldb/db/version_set.h +++ b/src/leveldb/db/version_set.h @@ -21,7 +21,9 @@ #include "db/dbformat.h" #include "db/version_edit.h" #include "port/port.h" -#include "port/thread_annotations.h" +#include "leveldb/atomics.h" +#include "leveldb/env.h" +#include "util/throttle.h" namespace leveldb { @@ -70,7 +72,7 @@ class Version { FileMetaData* seek_file; int seek_file_level; }; - Status Get(const ReadOptions&, const LookupKey& key, std::string* val, + Status Get(const ReadOptions&, const LookupKey& key, Value* val, GetStats* stats); // Adds "stats" into the current state. Returns true if a new @@ -78,12 +80,6 @@ class Version { // REQUIRES: lock is held bool UpdateStats(const GetStats& stats); - // Record a sample of bytes read at the specified internal key. - // Samples are taken approximately once every config::kReadBytesPeriod - // bytes. Returns true if a new compaction may need to be triggered. 
- // REQUIRES: lock is held - bool RecordReadSample(Slice key); - // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); @@ -101,43 +97,47 @@ class Version { // largest_user_key==NULL represents a key largest than all keys in the DB. bool OverlapInLevel(int level, const Slice* smallest_user_key, - const Slice* largest_user_key); + const Slice* largest_user_key) const; // Return the level at which we should place a new memtable compaction // result that covers the range [smallest_user_key,largest_user_key]. int PickLevelForMemTableOutput(const Slice& smallest_user_key, - const Slice& largest_user_key); + const Slice& largest_user_key, + const int level_limit); - int NumFiles(int level) const { return files_[level].size(); } + virtual size_t NumFiles(int level) const { return files_[level].size(); } + + const VersionSet * GetVersionSet() const { return vset_; } + + typedef std::vector FileMetaDataVector_t; + + virtual const std::vector & GetFileList(int level) const {return files_[level];}; + + volatile int WritePenalty() const {return write_penalty_; } + + // Riak specific repair routine + bool VerifyLevels(int & level, InternalKey & begin, InternalKey & end); // Return a human readable string that describes this version's contents. std::string DebugString() const; - private: +protected: friend class Compaction; friend class VersionSet; class LevelFileNumIterator; Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; - // Call func(arg, level, f) for every file that overlaps user_key in - // order from newest to oldest. If an invocation of func returns - // false, makes no more calls. - // - // REQUIRES: user portion of internal_key == user_key. 
- void ForEachOverlapping(Slice user_key, Slice internal_key, - void* arg, - bool (*func)(void*, int, FileMetaData*)); - VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version // List of files per level - std::vector files_[config::kNumLevels]; + USED_BY_NESTED_FRIEND(std::vector files_[config::kNumLevels];) - // Next file to compact based on seek stats. + protected: + // Next file to compact based on seek stats (or Riak delete test) FileMetaData* file_to_compact_; int file_to_compact_level_; @@ -146,17 +146,29 @@ class Version { // are initialized by Finalize(). double compaction_score_; int compaction_level_; + bool compaction_grooming_; + bool compaction_no_move_; + bool compaction_expirefile_; + volatile int write_penalty_; + protected: + // make the ctor/dtor protected, so that a unit test can subclass explicit Version(VersionSet* vset) : vset_(vset), next_(this), prev_(this), refs_(0), file_to_compact_(NULL), file_to_compact_level_(-1), compaction_score_(-1), - compaction_level_(-1) { + compaction_level_(-1), + compaction_grooming_(false), + compaction_no_move_(false), + compaction_expirefile_(false), + write_penalty_(0) + { } - ~Version(); + virtual ~Version(); +private: // No copying allowed Version(const Version&); void operator=(const Version&); @@ -175,11 +187,10 @@ class VersionSet { // current version. Will release *mu while actually writing to the file. // REQUIRES: *mu is held on entry. // REQUIRES: no other thread concurrently calls LogAndApply() - Status LogAndApply(VersionEdit* edit, port::Mutex* mu) - EXCLUSIVE_LOCKS_REQUIRED(mu); + Status LogAndApply(VersionEdit* edit, port::Mutex* mu); // Recover the last saved descriptor from persistent storage. - Status Recover(bool *save_manifest); + Status Recover(); // Return the current version. 
Version* current() const { return current_; } @@ -188,19 +199,29 @@ class VersionSet { uint64_t ManifestFileNumber() const { return manifest_file_number_; } // Allocate and return a new file number - uint64_t NewFileNumber() { return next_file_number_++; } + // (-1 is to "duplicate" old post-increment logic while maintaining + // some threading integrity ... next_file_number_ used naked a bunch) + uint64_t NewFileNumber() { return(inc_and_fetch(&next_file_number_) -1); } // Arrange to reuse "file_number" unless a newer file number has // already been allocated. // REQUIRES: "file_number" was returned by a call to NewFileNumber(). + // (disabled due to threading concerns ... and desire NOT to use mutex, matthewv) void ReuseFileNumber(uint64_t file_number) { - if (next_file_number_ == file_number + 1) { - next_file_number_ = file_number; - } +// if (next_file_number_ == file_number + 1) { +// next_file_number_ = file_number; +// } } // Return the number of Table files at the specified level. - int NumLevelFiles(int level) const; + size_t NumLevelFiles(int level) const; + + // is the specified level overlapped (or if false->sorted) + static bool IsLevelOverlapped(int level); + + static uint64_t DesiredBytesForLevel(int level); + static uint64_t MaxBytesForLevel(int level); + static uint64_t MaxFileSizeForLevel(int level); // Return the combined file size of all files at the specified level. int64_t NumLevelBytes(int level) const; @@ -224,11 +245,36 @@ class VersionSet { // being compacted, or zero if there is no such log file. 
uint64_t PrevLogNumber() const { return prev_log_number_; } + int WriteThrottleUsec(bool active_compaction) + { + uint64_t penalty, throttle; + int ret_val; + + penalty=current_->write_penalty_; + throttle=GetThrottleWriteRate(); + + ret_val=0; + if (0==penalty && 1!=throttle) + ret_val=(int)throttle; + else if (0!=penalty) + { + if (1==throttle) + throttle=GetUnadjustedThrottleWriteRate(); + ret_val=(int)penalty * throttle; + } // else if + + return(ret_val); + } + + // Pick level and inputs for a new compaction. // Returns NULL if there is no compaction to be done. // Otherwise returns a pointer to a heap-allocated object that // describes the compaction. Caller should delete the result. - Compaction* PickCompaction(); + // + // Riak October 2013: Pick Compaction now posts work directly + // to hot_thread pools + void PickCompaction(class DBImpl * db_impl); // Return a compaction object for compacting the range [begin,end] in // the specified level. Returns NULL if there is nothing in that @@ -267,16 +313,42 @@ class VersionSet { char buffer[100]; }; const char* LevelSummary(LevelSummaryStorage* scratch) const; + const char* CompactionSummary(LevelSummaryStorage* scratch) const; - private: + TableCache* GetTableCache() {return(table_cache_);}; + + const Options * GetOptions() const {return(options_);}; + + bool IsCompactionSubmitted(int level) + {return(m_CompactionStatus[level].m_Submitted);} + + void SetCompactionSubmitted(int level) + {m_CompactionStatus[level].m_Submitted=true;} + + void SetCompactionRunning(int level) + {m_CompactionStatus[level].m_Running=true;} + + void SetCompactionDone(int level, uint64_t Now) + { m_CompactionStatus[level].m_Running=false; + m_CompactionStatus[level].m_Submitted=false; + // must set both source and destination. otherwise + // destination might immediately decide it needs a + // timed grooming too ... 
defeating idea to spreadout the groomings + m_CompactionStatus[level].m_LastCompaction=Now; + if ((level+1)& inputs, InternalKey* smallest, @@ -299,7 +371,7 @@ class VersionSet { const Options* const options_; TableCache* const table_cache_; const InternalKeyComparator icmp_; - uint64_t next_file_number_; + volatile uint64_t next_file_number_; uint64_t manifest_file_number_; uint64_t last_sequence_; uint64_t log_number_; @@ -315,11 +387,44 @@ class VersionSet { // Either an empty string, or a valid InternalKey. std::string compact_pointer_[config::kNumLevels]; + // Riak allows multiple compaction threads, this mutex allows + // only one to write to manifest at a time. Only used in LogAndApply + port::Mutex manifest_mutex_; + + volatile uint64_t last_penalty_minutes_; + volatile int prev_write_penalty_; + + + + struct CompactionStatus_s + { + bool m_Submitted; //!< level submitted to hot thread pool + bool m_Running; //!< thread actually running compaction + uint64_t m_LastCompaction; //! inputs_[2]; // The two sets of inputs - // State used to check for number of overlapping grandparent files + // State used to check for number of of overlapping grandparent files // (parent == level_ + 1, grandparent == level_ + 2) std::vector grandparents_; size_t grandparent_index_; // Index in grandparent_starts_ bool seen_key_; // Some output key has been seen - int64_t overlapped_bytes_; // Bytes of overlap between current output + uint64_t overlapped_bytes_; // Bytes of overlap between current output // and grandparent files // State for implementing IsBaseLevelForKey @@ -391,6 +514,16 @@ class Compaction { // higher level than the ones involved in this compaction (i.e. for // all L >= level_ + 2). 
size_t level_ptrs_[config::kNumLevels]; + + // Riak specific: output statistics from CalcInputStats + size_t tot_user_data_; + size_t tot_index_keys_; + size_t avg_value_size_; + size_t avg_key_size_; + size_t avg_block_size_; + bool compressible_; + bool stats_done_; + bool no_move_; }; } // namespace leveldb diff --git a/src/leveldb/db/version_set_test.cc b/src/leveldb/db/version_set_test.cc index 501e34d13..aa36b4ee7 100644 --- a/src/leveldb/db/version_set_test.cc +++ b/src/leveldb/db/version_set_test.cc @@ -27,13 +27,13 @@ class FindFileTest { SequenceNumber largest_seq = 100) { FileMetaData* f = new FileMetaData; f->number = files_.size() + 1; - f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); - f->largest = InternalKey(largest, largest_seq, kTypeValue); + f->smallest = InternalKey(smallest, 0, smallest_seq, kTypeValue); + f->largest = InternalKey(largest, 0, largest_seq, kTypeValue); files_.push_back(f); } int Find(const char* key) { - InternalKey target(key, 100, kTypeValue); + InternalKey target(key, 0, 100, kTypeValue); InternalKeyComparator cmp(BytewiseComparator()); return FindFile(cmp, files_, target.Encode()); } diff --git a/src/leveldb/db/write_batch.cc b/src/leveldb/db/write_batch.cc index 33f4a4257..116e717a9 100644 --- a/src/leveldb/db/write_batch.cc +++ b/src/leveldb/db/write_batch.cc @@ -13,13 +13,17 @@ // len: varint32 // data: uint8[len] -#include "leveldb/write_batch.h" +#include #include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/expiry.h" +#include "leveldb/write_batch.h" #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "util/coding.h" +#include "util/throttle.h" namespace leveldb { @@ -47,16 +51,17 @@ Status WriteBatch::Iterate(Handler* handler) const { input.remove_prefix(kHeader); Slice key, value; + ExpiryTimeMicros expiry; int found = 0; while (!input.empty()) { found++; - char tag = input[0]; + ValueType tag = (ValueType)input[0]; input.remove_prefix(1); 
switch (tag) { case kTypeValue: if (GetLengthPrefixedSlice(&input, &key) && GetLengthPrefixedSlice(&input, &value)) { - handler->Put(key, value); + handler->Put(key, value, kTypeValue, 0); } else { return Status::Corruption("bad WriteBatch Put"); } @@ -68,6 +73,16 @@ Status WriteBatch::Iterate(Handler* handler) const { return Status::Corruption("bad WriteBatch Delete"); } break; + case kTypeValueWriteTime: + case kTypeValueExplicitExpiry: + if (GetLengthPrefixedSlice(&input, &key) && + GetVarint64(&input, &expiry) && + GetLengthPrefixedSlice(&input, &value)) { + handler->Put(key, value, tag, expiry); + } else { + return Status::Corruption("bad WriteBatch Expiry"); + } + break; default: return Status::Corruption("unknown WriteBatch tag"); } @@ -95,10 +110,20 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { EncodeFixed64(&b->rep_[0], seq); } -void WriteBatch::Put(const Slice& key, const Slice& value) { +void WriteBatch::Put(const Slice& key, const Slice& value, const KeyMetaData * meta) { + KeyMetaData local_meta; WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast(kTypeValue)); + if (NULL!=meta) + local_meta=*meta; + rep_.push_back(static_cast(local_meta.m_Type)); PutLengthPrefixedSlice(&rep_, key); + if (kTypeValueExplicitExpiry==local_meta.m_Type + || kTypeValueWriteTime==local_meta.m_Type) + { + if (kTypeValueWriteTime==local_meta.m_Type && 0==local_meta.m_Expiry) + local_meta.m_Expiry=GetCachedTimeMicros(); + PutVarint64(&rep_, local_meta.m_Expiry); + } // if PutLengthPrefixedSlice(&rep_, value); } @@ -113,23 +138,33 @@ class MemTableInserter : public WriteBatch::Handler { public: SequenceNumber sequence_; MemTable* mem_; + const Options * options_; - virtual void Put(const Slice& key, const Slice& value) { - mem_->Add(sequence_, kTypeValue, key, value); + MemTableInserter() : mem_(NULL), options_(NULL) {}; + + virtual void Put(const Slice& key, const Slice& value, const ValueType 
&type, const ExpiryTimeMicros &expiry) { + ValueType type_use(type); + ExpiryTimeMicros expiry_use(expiry); + + if (NULL!=options_ && options_->ExpiryActivated()) + options_->expiry_module->MemTableInserterCallback(key, value, type_use, expiry_use); + mem_->Add(sequence_, (ValueType)type_use, key, value, expiry_use); sequence_++; } virtual void Delete(const Slice& key) { - mem_->Add(sequence_, kTypeDeletion, key, Slice()); + mem_->Add(sequence_, kTypeDeletion, key, Slice(), 0); sequence_++; } }; } // namespace Status WriteBatchInternal::InsertInto(const WriteBatch* b, - MemTable* memtable) { + MemTable* memtable, + const Options * options) { MemTableInserter inserter; inserter.sequence_ = WriteBatchInternal::Sequence(b); inserter.mem_ = memtable; + inserter.options_ = options; return b->Iterate(&inserter); } diff --git a/src/leveldb/db/write_batch_internal.h b/src/leveldb/db/write_batch_internal.h index 9448ef7b2..d313d02da 100644 --- a/src/leveldb/db/write_batch_internal.h +++ b/src/leveldb/db/write_batch_internal.h @@ -5,7 +5,6 @@ #ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ #define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ -#include "db/dbformat.h" #include "leveldb/write_batch.h" namespace leveldb { @@ -22,10 +21,10 @@ class WriteBatchInternal { // Set the count for the number of entries in the batch. static void SetCount(WriteBatch* batch, int n); - // Return the sequence number for the start of this batch. + // Return the seqeunce number for the start of this batch. static SequenceNumber Sequence(const WriteBatch* batch); - // Store the specified number as the sequence number for the start of + // Store the specified number as the seqeunce number for the start of // this batch. 
static void SetSequence(WriteBatch* batch, SequenceNumber seq); @@ -39,7 +38,7 @@ class WriteBatchInternal { static void SetContents(WriteBatch* batch, const Slice& contents); - static Status InsertInto(const WriteBatch* batch, MemTable* memtable); + static Status InsertInto(const WriteBatch* batch, MemTable* memtable, const Options * options); static void Append(WriteBatch* dst, const WriteBatch* src); }; diff --git a/src/leveldb/db/write_batch_test.cc b/src/leveldb/db/write_batch_test.cc index 9064e3d85..4854af429 100644 --- a/src/leveldb/db/write_batch_test.cc +++ b/src/leveldb/db/write_batch_test.cc @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include #include "leveldb/db.h" #include "db/memtable.h" @@ -17,11 +18,12 @@ static std::string PrintContents(WriteBatch* b) { MemTable* mem = new MemTable(cmp); mem->Ref(); std::string state; - Status s = WriteBatchInternal::InsertInto(b, mem); + Status s = WriteBatchInternal::InsertInto(b, mem, NULL); int count = 0; Iterator* iter = mem->NewIterator(); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; + std::stringstream sstr; ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); switch (ikey.type) { case kTypeValue: @@ -32,6 +34,28 @@ static std::string PrintContents(WriteBatch* b) { state.append(")"); count++; break; + case kTypeValueWriteTime: + state.append("PutWT("); + state.append(ikey.user_key.ToString()); + state.append(", "); + sstr << ikey.expiry; + state.append(sstr.str()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case kTypeValueExplicitExpiry: + state.append("PutEE("); + state.append(ikey.user_key.ToString()); + state.append(", "); + sstr << ikey.expiry; + state.append(sstr.str()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; case 
kTypeDeletion: state.append("Delete("); state.append(ikey.user_key.ToString()); @@ -74,6 +98,32 @@ TEST(WriteBatchTest, Multiple) { PrintContents(&batch)); } +TEST(WriteBatchTest, MultipleExpiry) { + WriteBatch batch; + KeyMetaData meta; + batch.Put(Slice("Mary"), Slice("Lamb")); + meta.m_Type=kTypeValueExplicitExpiry; + meta.m_Expiry=2347; + batch.Put(Slice("Adam"), Slice("Ant"), &meta); + //batch.PutExplicitExpiry(Slice("Adam"), Slice("Ant"), 2347); + batch.Put(Slice("Frosty"), Slice("Snowman")); + batch.Put(Slice("Tip"), Slice("ONeal")); + batch.Delete(Slice("Frosty")); + meta.m_Type=kTypeValueExplicitExpiry; + meta.m_Expiry=987654321; + batch.Put(Slice("The"), Slice("Fonz"), &meta); + WriteBatchInternal::SetSequence(&batch, 200); + ASSERT_EQ(200, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(6, WriteBatchInternal::Count(&batch)); + ASSERT_EQ("PutEE(Adam, 2347, Ant)@201" + "Delete(Frosty)@204" + "Put(Frosty, Snowman)@202" + "Put(Mary, Lamb)@200" + "PutEE(The, 987654321, Fonz)@205" + "Put(Tip, ONeal)@203", + PrintContents(&batch)); +} + TEST(WriteBatchTest, Corruption) { WriteBatch batch; batch.Put(Slice("foo"), Slice("bar")); diff --git a/src/leveldb/doc/bench/db_bench_sqlite3.cc b/src/leveldb/doc/bench/db_bench_sqlite3.cc index e63aaa8dc..256793a9d 100644 --- a/src/leveldb/doc/bench/db_bench_sqlite3.cc +++ b/src/leveldb/doc/bench/db_bench_sqlite3.cc @@ -618,7 +618,7 @@ class Benchmark { ErrorCheck(status); // Execute read statement - while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW) {} + while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW); StepErrorCheck(status); // Reset SQLite statement for another use diff --git a/src/leveldb/doc/bench/db_bench_tree_db.cc b/src/leveldb/doc/bench/db_bench_tree_db.cc index 4ca381f11..ed86f031c 100644 --- a/src/leveldb/doc/bench/db_bench_tree_db.cc +++ b/src/leveldb/doc/bench/db_bench_tree_db.cc @@ -338,7 +338,7 @@ class Benchmark { bool write_sync = false; if (name == Slice("fillseq")) { Write(write_sync, 
SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1); - DBSynchronize(db_); + } else if (name == Slice("fillrandom")) { Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1); DBSynchronize(db_); diff --git a/src/leveldb/doc/doc.css b/src/leveldb/doc/doc.css new file mode 100644 index 000000000..700c564e4 --- /dev/null +++ b/src/leveldb/doc/doc.css @@ -0,0 +1,89 @@ +body { + margin-left: 0.5in; + margin-right: 0.5in; + background: white; + color: black; +} + +h1 { + margin-left: -0.2in; + font-size: 14pt; +} +h2 { + margin-left: -0in; + font-size: 12pt; +} +h3 { + margin-left: -0in; +} +h4 { + margin-left: -0in; +} +hr { + margin-left: -0in; +} + +/* Definition lists: definition term bold */ +dt { + font-weight: bold; +} + +address { + text-align: center; +} +code,samp,var { + color: blue; +} +kbd { + color: #600000; +} +div.note p { + float: right; + width: 3in; + margin-right: 0%; + padding: 1px; + border: 2px solid #6060a0; + background-color: #fffff0; +} + +ul { + margin-top: -0em; + margin-bottom: -0em; +} + +ol { + margin-top: -0em; + margin-bottom: -0em; +} + +UL.nobullets { + list-style-type: none; + list-style-image: none; + margin-left: -1em; +} + +p { + margin: 1em 0 1em 0; + padding: 0 0 0 0; +} + +pre { + line-height: 1.3em; + padding: 0.4em 0 0.8em 0; + margin: 0 0 0 0; + border: 0 0 0 0; + color: blue; +} + +.datatable { + margin-left: auto; + margin-right: auto; + margin-top: 2em; + margin-bottom: 2em; + border: 1px solid; +} + +.datatable td,th { + padding: 0 0.5em 0 0.5em; + text-align: right; +} diff --git a/src/leveldb/doc/impl.html b/src/leveldb/doc/impl.html new file mode 100644 index 000000000..e870795d2 --- /dev/null +++ b/src/leveldb/doc/impl.html @@ -0,0 +1,213 @@ + + + + +Leveldb file layout and compactions + + + + +

Files

+ +The implementation of leveldb is similar in spirit to the +representation of a single + +Bigtable tablet (section 5.3). +However the organization of the files that make up the representation +is somewhat different and is explained below. + +

+Each database is represented by a set of files stored in a directory. +There are several different types of files as documented below: +

+

Log files

+

+A log file (*.log) stores a sequence of recent updates. Each update +is appended to the current log file. When the log file reaches a +pre-determined size (approximately 4MB by default), it is converted +to a sorted table (see below) and a new log file is created for future +updates. +

+A copy of the current log file is kept in an in-memory structure (the +memtable). This copy is consulted on every read so that read +operations reflect all logged updates. +

+

Sorted tables

+

+A sorted table (*.sst) stores a sequence of entries sorted by key. +Each entry is either a value for the key, or a deletion marker for the +key. (Deletion markers are kept around to hide obsolete values +present in older sorted tables). +

+The set of sorted tables is organized into a sequence of levels. The +sorted table generated from a log file is placed in a special young +level (also called level-0). When the number of young files exceeds a +certain threshold (currently four), all of the young files are merged +together with all of the overlapping level-1 files to produce a +sequence of new level-1 files (we create a new level-1 file for every +2MB of data.) +

+Files in the young level may contain overlapping keys. However files +in other levels have distinct non-overlapping key ranges. Consider +level number L where L >= 1. When the combined size of files in +level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2, +...), one file in level-L, and all of the overlapping files in +level-(L+1) are merged to form a set of new files for level-(L+1). +These merges have the effect of gradually migrating new updates from +the young level to the largest level using only bulk reads and writes +(i.e., minimizing expensive seeks). + +

Manifest

+

+A MANIFEST file lists the set of sorted tables that make up each +level, the corresponding key ranges, and other important metadata. +A new MANIFEST file (with a new number embedded in the file name) +is created whenever the database is reopened. The MANIFEST file is +formatted as a log, and changes made to the serving state (as files +are added or removed) are appended to this log. +

+

Current

+

+CURRENT is a simple text file that contains the name of the latest +MANIFEST file. +

+

Info logs

+

+Informational messages are printed to files named LOG and LOG.old. +

+

Others

+

+Other files used for miscellaneous purposes may also be present +(LOCK, *.dbtmp). + +

Level 0

+When the log file grows above a certain size (1MB by default): +
    +
  • Create a brand new memtable and log file and direct future updates here +
  • In the background: +
      +
    • Write the contents of the previous memtable to an sstable +
    • Discard the memtable +
    • Delete the old log file and the old memtable +
    • Add the new sstable to the young (level-0) level. +
    +
+ +

Compactions

+ +

+When the size of level L exceeds its limit, we compact it in a +background thread. The compaction picks a file from level L and all +overlapping files from the next level L+1. Note that if a level-L +file overlaps only part of a level-(L+1) file, the entire file at +level-(L+1) is used as an input to the compaction and will be +discarded after the compaction. Aside: because level-0 is special +(files in it may overlap each other), we treat compactions from +level-0 to level-1 specially: a level-0 compaction may pick more than +one level-0 file in case some of these files overlap each other. + +

+A compaction merges the contents of the picked files to produce a +sequence of level-(L+1) files. We switch to producing a new +level-(L+1) file after the current output file has reached the target +file size (2MB). We also switch to a new output file when the key +range of the current output file has grown enough to overlap more than +ten level-(L+2) files. This last rule ensures that a later compaction +of a level-(L+1) file will not pick up too much data from level-(L+2). + +

+The old files are discarded and the new files are added to the serving +state. + +

+Compactions for a particular level rotate through the key space. In +more detail, for each level L, we remember the ending key of the last +compaction at level L. The next compaction for level L will pick the +first file that starts after this key (wrapping around to the +beginning of the key space if there is no such file). + +

+Compactions drop overwritten values. They also drop deletion markers +if there are no higher numbered levels that contain a file whose range +overlaps the current key. + +

Timing

+ +Level-0 compactions will read up to four 1MB files from level-0, and +at worst all the level-1 files (10MB). I.e., we will read 14MB and +write 14MB. + +

+Other than the special level-0 compactions, we will pick one 2MB file +from level L. In the worst case, this will overlap ~ 12 files from +level L+1 (10 because level-(L+1) is ten times the size of level-L, +and another two at the boundaries since the file ranges at level-L +will usually not be aligned with the file ranges at level-L+1). The +compaction will therefore read 26MB and write 26MB. Assuming a disk +IO rate of 100MB/s (ballpark range for modern drives), the worst +compaction cost will be approximately 0.5 second. + +

+If we throttle the background writing to something small, say 10% of +the full 100MB/s speed, a compaction may take up to 5 seconds. If the +user is writing at 10MB/s, we might build up lots of level-0 files +(~50 to hold the 5*10MB). This may significantly increase the cost of +reads due to the overhead of merging more files together on every +read. + +

+Solution 1: To reduce this problem, we might want to increase the log +switching threshold when the number of level-0 files is large. Though +the downside is that the larger this threshold, the more memory we will +need to hold the corresponding memtable. + +

+Solution 2: We might want to decrease write rate artificially when the +number of level-0 files goes up. + +

+Solution 3: We work on reducing the cost of very wide merges. +Perhaps most of the level-0 files will have their blocks sitting +uncompressed in the cache and we will only need to worry about the +O(N) complexity in the merging iterator. + +

Number of files

+ +Instead of always making 2MB files, we could make larger files for +larger levels to reduce the total file count, though at the expense of +more bursty compactions. Alternatively, we could shard the set of +files into multiple directories. + +

+An experiment on an ext3 filesystem on Feb 04, 2011 shows +the following timings to do 100K file opens in directories with +varying number of files: + + + + + +
Files in directoryMicroseconds to open a file
10009
1000010
10000016
+So maybe even the sharding is not necessary on modern filesystems? + +

Recovery

+ +
    +
  • Read CURRENT to find name of the latest committed MANIFEST +
  • Read the named MANIFEST file +
  • Clean up stale files +
  • We could open all sstables here, but it is probably better to be lazy... +
  • Convert log chunk to a new level-0 sstable +
  • Start directing new writes to a new log file with recovered sequence# +
+ +

Garbage collection of files

+ +DeleteObsoleteFiles() is called at the end of every +compaction and at the end of recovery. It finds the names of all +files in the database. It deletes all log files that are not the +current log file. It deletes all table files that are not referenced +from some level and are not the output of an active compaction. + + + diff --git a/src/leveldb/doc/impl.md b/src/leveldb/doc/impl.md deleted file mode 100644 index 4b13f2a6b..000000000 --- a/src/leveldb/doc/impl.md +++ /dev/null @@ -1,170 +0,0 @@ -## Files - -The implementation of leveldb is similar in spirit to the representation of a -single [Bigtable tablet (section 5.3)](http://research.google.com/archive/bigtable.html). -However the organization of the files that make up the representation is -somewhat different and is explained below. - -Each database is represented by a set of files stored in a directory. There are -several different types of files as documented below: - -### Log files - -A log file (*.log) stores a sequence of recent updates. Each update is appended -to the current log file. When the log file reaches a pre-determined size -(approximately 4MB by default), it is converted to a sorted table (see below) -and a new log file is created for future updates. - -A copy of the current log file is kept in an in-memory structure (the -`memtable`). This copy is consulted on every read so that read operations -reflect all logged updates. - -## Sorted tables - -A sorted table (*.ldb) stores a sequence of entries sorted by key. Each entry is -either a value for the key, or a deletion marker for the key. (Deletion markers -are kept around to hide obsolete values present in older sorted tables). - -The set of sorted tables are organized into a sequence of levels. The sorted -table generated from a log file is placed in a special **young** level (also -called level-0). 
When the number of young files exceeds a certain threshold -(currently four), all of the young files are merged together with all of the -overlapping level-1 files to produce a sequence of new level-1 files (we create -a new level-1 file for every 2MB of data.) - -Files in the young level may contain overlapping keys. However files in other -levels have distinct non-overlapping key ranges. Consider level number L where -L >= 1. When the combined size of files in level-L exceeds (10^L) MB (i.e., 10MB -for level-1, 100MB for level-2, ...), one file in level-L, and all of the -overlapping files in level-(L+1) are merged to form a set of new files for -level-(L+1). These merges have the effect of gradually migrating new updates -from the young level to the largest level using only bulk reads and writes -(i.e., minimizing expensive seeks). - -### Manifest - -A MANIFEST file lists the set of sorted tables that make up each level, the -corresponding key ranges, and other important metadata. A new MANIFEST file -(with a new number embedded in the file name) is created whenever the database -is reopened. The MANIFEST file is formatted as a log, and changes made to the -serving state (as files are added or removed) are appended to this log. - -### Current - -CURRENT is a simple text file that contains the name of the latest MANIFEST -file. - -### Info logs - -Informational messages are printed to files named LOG and LOG.old. - -### Others - -Other files used for miscellaneous purposes may also be present (LOCK, *.dbtmp). - -## Level 0 - -When the log file grows above a certain size (1MB by default): -Create a brand new memtable and log file and direct future updates here -In the background: -Write the contents of the previous memtable to an sstable -Discard the memtable -Delete the old log file and the old memtable -Add the new sstable to the young (level-0) level. - -## Compactions - -When the size of level L exceeds its limit, we compact it in a background -thread. 
The compaction picks a file from level L and all overlapping files from -the next level L+1. Note that if a level-L file overlaps only part of a -level-(L+1) file, the entire file at level-(L+1) is used as an input to the -compaction and will be discarded after the compaction. Aside: because level-0 -is special (files in it may overlap each other), we treat compactions from -level-0 to level-1 specially: a level-0 compaction may pick more than one -level-0 file in case some of these files overlap each other. - -A compaction merges the contents of the picked files to produce a sequence of -level-(L+1) files. We switch to producing a new level-(L+1) file after the -current output file has reached the target file size (2MB). We also switch to a -new output file when the key range of the current output file has grown enough -to overlap more than ten level-(L+2) files. This last rule ensures that a later -compaction of a level-(L+1) file will not pick up too much data from -level-(L+2). - -The old files are discarded and the new files are added to the serving state. - -Compactions for a particular level rotate through the key space. In more detail, -for each level L, we remember the ending key of the last compaction at level L. -The next compaction for level L will pick the first file that starts after this -key (wrapping around to the beginning of the key space if there is no such -file). - -Compactions drop overwritten values. They also drop deletion markers if there -are no higher numbered levels that contain a file whose range overlaps the -current key. - -### Timing - -Level-0 compactions will read up to four 1MB files from level-0, and at worst -all the level-1 files (10MB). I.e., we will read 14MB and write 14MB. - -Other than the special level-0 compactions, we will pick one 2MB file from level -L. 
In the worst case, this will overlap ~ 12 files from level L+1 (10 because -level-(L+1) is ten times the size of level-L, and another two at the boundaries -since the file ranges at level-L will usually not be aligned with the file -ranges at level-L+1). The compaction will therefore read 26MB and write 26MB. -Assuming a disk IO rate of 100MB/s (ballpark range for modern drives), the worst -compaction cost will be approximately 0.5 second. - -If we throttle the background writing to something small, say 10% of the full -100MB/s speed, a compaction may take up to 5 seconds. If the user is writing at -10MB/s, we might build up lots of level-0 files (~50 to hold the 5*10MB). This -may significantly increase the cost of reads due to the overhead of merging more -files together on every read. - -Solution 1: To reduce this problem, we might want to increase the log switching -threshold when the number of level-0 files is large. Though the downside is that -the larger this threshold, the more memory we will need to hold the -corresponding memtable. - -Solution 2: We might want to decrease write rate artificially when the number of -level-0 files goes up. - -Solution 3: We work on reducing the cost of very wide merges. Perhaps most of -the level-0 files will have their blocks sitting uncompressed in the cache and -we will only need to worry about the O(N) complexity in the merging iterator. - -### Number of files - -Instead of always making 2MB files, we could make larger files for larger levels -to reduce the total file count, though at the expense of more bursty -compactions. Alternatively, we could shard the set of files into multiple -directories. 
- -An experiment on an ext3 filesystem on Feb 04, 2011 shows the following timings -to do 100K file opens in directories with varying number of files: - - -| Files in directory | Microseconds to open a file | -|-------------------:|----------------------------:| -| 1000 | 9 | -| 10000 | 10 | -| 100000 | 16 | - -So maybe even the sharding is not necessary on modern filesystems? - -## Recovery - -* Read CURRENT to find name of the latest committed MANIFEST -* Read the named MANIFEST file -* Clean up stale files -* We could open all sstables here, but it is probably better to be lazy... -* Convert log chunk to a new level-0 sstable -* Start directing new writes to a new log file with recovered sequence# - -## Garbage collection of files - -`DeleteObsoleteFiles()` is called at the end of every compaction and at the end -of recovery. It finds the names of all files in the database. It deletes all log -files that are not the current log file. It deletes all table files that are not -referenced from some level and are not the output of an active compaction. diff --git a/src/leveldb/doc/index.html b/src/leveldb/doc/index.html new file mode 100644 index 000000000..521d2baf4 --- /dev/null +++ b/src/leveldb/doc/index.html @@ -0,0 +1,549 @@ + + + + +Leveldb + + + +

Leveldb

+
Jeff Dean, Sanjay Ghemawat
+

+The leveldb library provides a persistent key value store. Keys and +values are arbitrary byte arrays. The keys are ordered within the key +value store according to a user-specified comparator function. + +

+

Opening A Database

+

+A leveldb database has a name which corresponds to a file system +directory. All of the contents of the database are stored in this +directory. The following example shows how to open a database, +creating it if necessary: +

+

+  #include <cassert>
+  #include "leveldb/db.h"
+
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+  assert(status.ok());
+  ...
+
+If you want to raise an error if the database already exists, add +the following line before the leveldb::DB::Open call: +
+  options.error_if_exists = true;
+
+

Status

+

+You may have noticed the leveldb::Status type above. Values of this +type are returned by most functions in leveldb that may encounter an +error. You can check if such a result is ok, and also print an +associated error message: +

+

+   leveldb::Status s = ...;
+   if (!s.ok()) cerr << s.ToString() << endl;
+
+

Closing A Database

+

+When you are done with a database, just delete the database object. +Example: +

+

+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+
+

Reads And Writes

+

+The database provides Put, Delete, and Get methods to +modify/query the database. For example, the following code +moves the value stored under key1 to key2. +

+  std::string value;
+  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+  if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
+  if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
+
+ +

Atomic Updates

+

+Note that if the process dies after the Put of key2 but before the +delete of key1, the same value may be left stored under multiple keys. +Such problems can be avoided by using the WriteBatch class to +atomically apply a set of updates: +

+

+  #include "leveldb/write_batch.h"
+  ...
+  std::string value;
+  leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+  if (s.ok()) {
+    leveldb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db->Write(leveldb::WriteOptions(), &batch);
+  }
+
+The WriteBatch holds a sequence of edits to be made to the database, +and these edits within the batch are applied in order. Note that we +called Delete before Put so that if key1 is identical to key2, +we do not end up erroneously dropping the value entirely. +

+Apart from its atomicity benefits, WriteBatch may also be used to +speed up bulk updates by placing lots of individual mutations into the +same batch. + +

Synchronous Writes

+By default, each write to leveldb is asynchronous: it +returns after pushing the write from the process into the operating +system. The transfer from operating system memory to the underlying +persistent storage happens asynchronously. The sync flag +can be turned on for a particular write to make the write operation +not return until the data being written has been pushed all the way to +persistent storage. (On Posix systems, this is implemented by calling +either fsync(...) or fdatasync(...) or +msync(..., MS_SYNC) before the write operation returns.) +
+  leveldb::WriteOptions write_options;
+  write_options.sync = true;
+  db->Put(write_options, ...);
+
+Asynchronous writes are often more than a thousand times as fast as +synchronous writes. The downside of asynchronous writes is that a +crash of the machine may cause the last few updates to be lost. Note +that a crash of just the writing process (i.e., not a reboot) will not +cause any loss since even when sync is false, an update +is pushed from the process memory into the operating system before it +is considered done. + +

+Asynchronous writes can often be used safely. For example, when +loading a large amount of data into the database you can handle lost +updates by restarting the bulk load after a crash. A hybrid scheme is +also possible where every Nth write is synchronous, and in the event +of a crash, the bulk load is restarted just after the last synchronous +write finished by the previous run. (The synchronous write can update +a marker that describes where to restart on a crash.) + +

+WriteBatch provides an alternative to asynchronous writes. +Multiple updates may be placed in the same WriteBatch and +applied together using a synchronous write (i.e., +write_options.sync is set to true). The extra cost of +the synchronous write will be amortized across all of the writes in +the batch. + +

+

Concurrency

+

+A database may only be opened by one process at a time. +The leveldb implementation acquires a lock from the +operating system to prevent misuse. Within a single process, the +same leveldb::DB object may be safely shared by multiple +concurrent threads. I.e., different threads may write into or fetch +iterators or call Get on the same database without any +external synchronization (the leveldb implementation will +automatically do the required synchronization). However other objects +(like Iterator and WriteBatch) may require external synchronization. +If two threads share such an object, they must protect access to it +using their own locking protocol. More details are available in +the public header files. +

+

Iteration

+

+The following example demonstrates how to print all key,value pairs +in a database. +

+

+  leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+  delete it;
+
+The following variation shows how to process just the keys in the +range [start,limit): +

+

+  for (it->Seek(start);
+       it->Valid() && it->key().ToString() < limit;
+       it->Next()) {
+    ...
+  }
+
+You can also process entries in reverse order. (Caveat: reverse +iteration may be somewhat slower than forward iteration.) +

+

+  for (it->SeekToLast(); it->Valid(); it->Prev()) {
+    ...
+  }
+
+

Snapshots

+

+Snapshots provide consistent read-only views over the entire state of +the key-value store. ReadOptions::snapshot may be non-NULL to indicate +that a read should operate on a particular version of the DB state. +If ReadOptions::snapshot is NULL, the read will operate on an +implicit snapshot of the current state. +

+Snapshots are created by the DB::GetSnapshot() method: +

+

+  leveldb::ReadOptions options;
+  options.snapshot = db->GetSnapshot();
+  ... apply some updates to db ...
+  leveldb::Iterator* iter = db->NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db->ReleaseSnapshot(options.snapshot);
+
+Note that when a snapshot is no longer needed, it should be released +using the DB::ReleaseSnapshot interface. This allows the +implementation to get rid of state that was being maintained just to +support reading as of that snapshot. +

Slice

+

+The return value of the it->key() and it->value() calls above +are instances of the leveldb::Slice type. Slice is a simple +structure that contains a length and a pointer to an external byte +array. Returning a Slice is a cheaper alternative to returning a +std::string since we do not need to copy potentially large keys and +values. In addition, leveldb methods do not return null-terminated +C-style strings since leveldb keys and values are allowed to +contain '\0' bytes. +

+C++ strings and null-terminated C-style strings can be easily converted +to a Slice: +

+

+   leveldb::Slice s1 = "hello";
+
+   std::string str("world");
+   leveldb::Slice s2 = str;
+
+A Slice can be easily converted back to a C++ string: +
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+
+Be careful when using Slices since it is up to the caller to ensure that +the external byte array into which the Slice points remains live while +the Slice is in use. For example, the following is buggy: +

+

+   leveldb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+
+When the if statement goes out of scope, str will be destroyed and the +backing storage for slice will disappear. +

+

Comparators

+

+The preceding examples used the default ordering function for key, +which orders bytes lexicographically. You can however supply a custom +comparator when opening a database. For example, suppose each +database key consists of two numbers and we should sort by the first +number, breaking ties by the second number. First, define a proper +subclass of leveldb::Comparator that expresses these rules: +

+

+  class TwoPartComparator : public leveldb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a < b: negative result
+    //   if a > b: positive result
+    //   else: zero result
+    int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &a1, &a2);
+      ParseKey(b, &b1, &b2);
+      if (a1 < b1) return -1;
+      if (a1 > b1) return +1;
+      if (a2 < b2) return -1;
+      if (a2 > b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() const { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+
+Now create a database using this custom comparator: +

+

+  TwoPartComparator cmp;
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &cmp;
+  leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+  ...
+
+

Backwards compatibility

+

+The result of the comparator's Name method is attached to the +database when it is created, and is checked on every subsequent +database open. If the name changes, the leveldb::DB::Open call will +fail. Therefore, change the name if and only if the new key format +and comparison function are incompatible with existing databases, and +it is ok to discard the contents of all existing databases. +

+You can however still gradually evolve your key format over time with +a little bit of pre-planning. For example, you could store a version +number at the end of each key (one byte should suffice for most uses). +When you wish to switch to a new key format (e.g., adding an optional +third part to the keys processed by TwoPartComparator), +(a) keep the same comparator name (b) increment the version number +for new keys (c) change the comparator function so it uses the +version numbers found in the keys to decide how to interpret them. +

+

Performance

+

+Performance can be tuned by changing the default values of the +types defined in include/leveldb/options.h. + +

+

Block size

+

+leveldb groups adjacent keys together into the same block and such a +block is the unit of transfer to and from persistent storage. The +default block size is approximately 4096 uncompressed bytes. +Applications that mostly do bulk scans over the contents of the +database may wish to increase this size. Applications that do a lot +of point reads of small values may wish to switch to a smaller block +size if performance measurements indicate an improvement. There isn't +much benefit in using blocks smaller than one kilobyte, or larger than +a few megabytes. Also note that compression will be more effective +with larger block sizes. +

+

Compression

+

+Each block is individually compressed before being written to +persistent storage. Compression is on by default since the default +compression method is very fast, and is automatically disabled for +uncompressible data. In rare cases, applications may want to disable +compression entirely, but should only do so if benchmarks show a +performance improvement: +

+

+  leveldb::Options options;
+  options.compression = leveldb::kNoCompression;
+  ... leveldb::DB::Open(options, name, ...) ....
+
+

Cache

+

+The contents of the database are stored in a set of files in the +filesystem and each file stores a sequence of compressed blocks. If +options.cache is non-NULL, it is used to cache frequently used +uncompressed block contents. +

+

+  #include "leveldb/cache.h"
+
+  leveldb::Options options;
+  options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
+  leveldb::DB* db;
+  leveldb::DB::Open(options, name, &db);
+  ... use the db ...
+  delete db;
+  delete options.cache;
+
+Note that the cache holds uncompressed data, and therefore it should +be sized according to application level data sizes, without any +reduction from compression. (Caching of compressed blocks is left to +the operating system buffer cache, or any custom Env +implementation provided by the client.) +

+When performing a bulk read, the application may wish to disable +caching so that the data processed by the bulk read does not end up +displacing most of the cached contents. A per-iterator option can be +used to achieve this: +

+

+  leveldb::ReadOptions options;
+  options.fill_cache = false;
+  leveldb::Iterator* it = db->NewIterator(options);
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ...
+  }
+
+

Key Layout

+

+Note that the unit of disk transfer and caching is a block. Adjacent +keys (according to the database sort order) will usually be placed in +the same block. Therefore the application can improve its performance +by placing keys that are accessed together near each other and placing +infrequently used keys in a separate region of the key space. +

+For example, suppose we are implementing a simple file system on top +of leveldb. The types of entries we might wish to store are: +

+

+   filename -> permission-bits, length, list of file_block_ids
+   file_block_id -> data
+
+We might want to prefix filename keys with one letter (say '/') and the +file_block_id keys with a different letter (say '0') so that scans +over just the metadata do not force us to fetch and cache bulky file +contents. +

+

Filters

+

+Because of the way leveldb data is organized on disk, +a single Get() call may involve multiple reads from disk. +The optional FilterPolicy mechanism can be used to reduce +the number of disk reads substantially. +

+   leveldb::Options options;
+   options.filter_policy = NewBloomFilter(10);
+   leveldb::DB* db;
+   leveldb::DB::Open(options, "/tmp/testdb", &db);
+   ... use the database ...
+   delete db;
+   delete options.filter_policy;
+
+The preceding code associates a +Bloom filter +based filtering policy with the database. Bloom filter based +filtering relies on keeping some number of bits of data in memory per +key (in this case 10 bits per key since that is the argument we passed +to NewBloomFilter). This filter will reduce the number of unnecessary +disk reads needed for Get() calls by a factor of +approximately 100. Increasing the bits per key will lead to a +larger reduction at the cost of more memory usage. We recommend that +applications whose working set does not fit in memory and that do a +lot of random reads set a filter policy. +

+If you are using a custom comparator, you should ensure that the filter +policy you are using is compatible with your comparator. For example, +consider a comparator that ignores trailing spaces when comparing keys. +NewBloomFilter must not be used with such a comparator. +Instead, the application should provide a custom filter policy that +also ignores trailing spaces. For example: +

+  class CustomFilterPolicy : public leveldb::FilterPolicy {
+   private:
+    FilterPolicy* builtin_policy_;
+   public:
+    CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
+    ~CustomFilterPolicy() { delete builtin_policy_; }
+
+    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
+
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      std::vector<Slice> trimmed(n);
+      for (int i = 0; i < n; i++) {
+        trimmed[i] = RemoveTrailingSpaces(keys[i]);
+      }
+      return builtin_policy_->CreateFilter(&trimmed[0], n, dst);
+    }
+
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      return builtin_policy_->KeyMayMatch(RemoveTrailingSpaces(key), filter);
+    }
+  };
+
+

+Advanced applications may provide a filter policy that does not use +a bloom filter but uses some other mechanism for summarizing a set +of keys. See leveldb/filter_policy.h for detail. +

+

Checksums

+

+leveldb associates checksums with all data it stores in the file system. +There are two separate controls provided over how aggressively these +checksums are verified: +

+

    +
  • ReadOptions::verify_checksums may be set to true to force + checksum verification of all data that is read from the file system on + behalf of a particular read. By default, no such verification is + done. +

    +

  • Options::paranoid_checks may be set to true before opening a + database to make the database implementation raise an error as soon as + it detects an internal corruption. Depending on which portion of the + database has been corrupted, the error may be raised when the database + is opened, or later by another database operation. By default, + paranoid checking is off so that the database can be used even if + parts of its persistent storage have been corrupted. +

    + If a database is corrupted (perhaps it cannot be opened when + paranoid checking is turned on), the leveldb::RepairDB function + may be used to recover as much of the data as possible +

    +

+

Approximate Sizes

+

+The GetApproximateSizes method can be used to get the approximate +number of bytes of file system space used by one or more key ranges. +

+

+   leveldb::Range ranges[2];
+   ranges[0] = leveldb::Range("a", "c");
+   ranges[1] = leveldb::Range("x", "z");
+   uint64_t sizes[2];
+   leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
+
+The preceding call will set sizes[0] to the approximate number of +bytes of file system space used by the key range [a..c) and +sizes[1] to the approximate number of bytes used by the key range +[x..z). +

+

Environment

+

+All file operations (and other operating system calls) issued by the +leveldb implementation are routed through a leveldb::Env object. +Sophisticated clients may wish to provide their own Env +implementation to get better control. For example, an application may +introduce artificial delays in the file IO paths to limit the impact +of leveldb on other activities in the system. +

+

+  class SlowEnv : public leveldb::Env {
+    .. implementation of the Env interface ...
+  };
+
+  SlowEnv env;
+  leveldb::Options options;
+  options.env = &env;
+  leveldb::Status s = leveldb::DB::Open(options, ...);
+
+

Porting

+

+leveldb may be ported to a new platform by providing platform +specific implementations of the types/methods/functions exported by +leveldb/port/port.h. See leveldb/port/port_example.h for more +details. +

+In addition, the new platform may need a new default leveldb::Env +implementation. See leveldb/util/env_posix.cc for an example. + +

Other Information

+ +

+Details about the leveldb implementation may be found in +the following documents: +

+ + + diff --git a/src/leveldb/doc/index.md b/src/leveldb/doc/index.md deleted file mode 100644 index be8569692..000000000 --- a/src/leveldb/doc/index.md +++ /dev/null @@ -1,523 +0,0 @@ -leveldb -======= - -_Jeff Dean, Sanjay Ghemawat_ - -The leveldb library provides a persistent key value store. Keys and values are -arbitrary byte arrays. The keys are ordered within the key value store -according to a user-specified comparator function. - -## Opening A Database - -A leveldb database has a name which corresponds to a file system directory. All -of the contents of database are stored in this directory. The following example -shows how to open a database, creating it if necessary: - -```c++ -#include -#include "leveldb/db.h" - -leveldb::DB* db; -leveldb::Options options; -options.create_if_missing = true; -leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db); -assert(status.ok()); -... -``` - -If you want to raise an error if the database already exists, add the following -line before the `leveldb::DB::Open` call: - -```c++ -options.error_if_exists = true; -``` - -## Status - -You may have noticed the `leveldb::Status` type above. Values of this type are -returned by most functions in leveldb that may encounter an error. You can check -if such a result is ok, and also print an associated error message: - -```c++ -leveldb::Status s = ...; -if (!s.ok()) cerr << s.ToString() << endl; -``` - -## Closing A Database - -When you are done with a database, just delete the database object. Example: - -```c++ -... open the db as described above ... -... do something with db ... -delete db; -``` - -## Reads And Writes - -The database provides Put, Delete, and Get methods to modify/query the database. -For example, the following code moves the value stored under key1 to key2. 
- -```c++ -std::string value; -leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value); -if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value); -if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1); -``` - -## Atomic Updates - -Note that if the process dies after the Put of key2 but before the delete of -key1, the same value may be left stored under multiple keys. Such problems can -be avoided by using the `WriteBatch` class to atomically apply a set of updates: - -```c++ -#include "leveldb/write_batch.h" -... -std::string value; -leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value); -if (s.ok()) { - leveldb::WriteBatch batch; - batch.Delete(key1); - batch.Put(key2, value); - s = db->Write(leveldb::WriteOptions(), &batch); -} -``` - -The `WriteBatch` holds a sequence of edits to be made to the database, and these -edits within the batch are applied in order. Note that we called Delete before -Put so that if key1 is identical to key2, we do not end up erroneously dropping -the value entirely. - -Apart from its atomicity benefits, `WriteBatch` may also be used to speed up -bulk updates by placing lots of individual mutations into the same batch. - -## Synchronous Writes - -By default, each write to leveldb is asynchronous: it returns after pushing the -write from the process into the operating system. The transfer from operating -system memory to the underlying persistent storage happens asynchronously. The -sync flag can be turned on for a particular write to make the write operation -not return until the data being written has been pushed all the way to -persistent storage. (On Posix systems, this is implemented by calling either -`fsync(...)` or `fdatasync(...)` or `msync(..., MS_SYNC)` before the write -operation returns.) - -```c++ -leveldb::WriteOptions write_options; -write_options.sync = true; -db->Put(write_options, ...); -``` - -Asynchronous writes are often more than a thousand times as fast as synchronous -writes. 
The downside of asynchronous writes is that a crash of the machine may -cause the last few updates to be lost. Note that a crash of just the writing -process (i.e., not a reboot) will not cause any loss since even when sync is -false, an update is pushed from the process memory into the operating system -before it is considered done. - -Asynchronous writes can often be used safely. For example, when loading a large -amount of data into the database you can handle lost updates by restarting the -bulk load after a crash. A hybrid scheme is also possible where every Nth write -is synchronous, and in the event of a crash, the bulk load is restarted just -after the last synchronous write finished by the previous run. (The synchronous -write can update a marker that describes where to restart on a crash.) - -`WriteBatch` provides an alternative to asynchronous writes. Multiple updates -may be placed in the same WriteBatch and applied together using a synchronous -write (i.e., `write_options.sync` is set to true). The extra cost of the -synchronous write will be amortized across all of the writes in the batch. - -## Concurrency - -A database may only be opened by one process at a time. The leveldb -implementation acquires a lock from the operating system to prevent misuse. -Within a single process, the same `leveldb::DB` object may be safely shared by -multiple concurrent threads. I.e., different threads may write into or fetch -iterators or call Get on the same database without any external synchronization -(the leveldb implementation will automatically do the required synchronization). -However other objects (like Iterator and `WriteBatch`) may require external -synchronization. If two threads share such an object, they must protect access -to it using their own locking protocol. More details are available in the public -header files. - -## Iteration - -The following example demonstrates how to print all key,value pairs in a -database. 
- -```c++ -leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions()); -for (it->SeekToFirst(); it->Valid(); it->Next()) { - cout << it->key().ToString() << ": " << it->value().ToString() << endl; -} -assert(it->status().ok()); // Check for any errors found during the scan -delete it; -``` - -The following variation shows how to process just the keys in the range -[start,limit): - -```c++ -for (it->Seek(start); - it->Valid() && it->key().ToString() < limit; - it->Next()) { - ... -} -``` - -You can also process entries in reverse order. (Caveat: reverse iteration may be -somewhat slower than forward iteration.) - -```c++ -for (it->SeekToLast(); it->Valid(); it->Prev()) { - ... -} -``` - -## Snapshots - -Snapshots provide consistent read-only views over the entire state of the -key-value store. `ReadOptions::snapshot` may be non-NULL to indicate that a -read should operate on a particular version of the DB state. If -`ReadOptions::snapshot` is NULL, the read will operate on an implicit snapshot -of the current state. - -Snapshots are created by the `DB::GetSnapshot()` method: - -```c++ -leveldb::ReadOptions options; -options.snapshot = db->GetSnapshot(); -... apply some updates to db ... -leveldb::Iterator* iter = db->NewIterator(options); -... read using iter to view the state when the snapshot was created ... -delete iter; -db->ReleaseSnapshot(options.snapshot); -``` - -Note that when a snapshot is no longer needed, it should be released using the -`DB::ReleaseSnapshot` interface. This allows the implementation to get rid of -state that was being maintained just to support reading as of that snapshot. - -## Slice - -The return value of the `it->key()` and `it->value()` calls above are instances -of the `leveldb::Slice` type. Slice is a simple structure that contains a length -and a pointer to an external byte array. Returning a Slice is a cheaper -alternative to returning a `std::string` since we do not need to copy -potentially large keys and values. 
In addition, leveldb methods do not return -null-terminated C-style strings since leveldb keys and values are allowed to -contain `'\0'` bytes. - -C++ strings and null-terminated C-style strings can be easily converted to a -Slice: - -```c++ -leveldb::Slice s1 = "hello"; - -std::string str("world"); -leveldb::Slice s2 = str; -``` - -A Slice can be easily converted back to a C++ string: - -```c++ -std::string str = s1.ToString(); -assert(str == std::string("hello")); -``` - -Be careful when using Slices since it is up to the caller to ensure that the -external byte array into which the Slice points remains live while the Slice is -in use. For example, the following is buggy: - -```c++ -leveldb::Slice slice; -if (...) { - std::string str = ...; - slice = str; -} -Use(slice); -``` - -When the if statement goes out of scope, str will be destroyed and the backing -storage for slice will disappear. - -## Comparators - -The preceding examples used the default ordering function for key, which orders -bytes lexicographically. You can however supply a custom comparator when opening -a database. For example, suppose each database key consists of two numbers and -we should sort by the first number, breaking ties by the second number. 
First, -define a proper subclass of `leveldb::Comparator` that expresses these rules: - -```c++ -class TwoPartComparator : public leveldb::Comparator { - public: - // Three-way comparison function: - // if a < b: negative result - // if a > b: positive result - // else: zero result - int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const { - int a1, a2, b1, b2; - ParseKey(a, &a1, &a2); - ParseKey(b, &b1, &b2); - if (a1 < b1) return -1; - if (a1 > b1) return +1; - if (a2 < b2) return -1; - if (a2 > b2) return +1; - return 0; - } - - // Ignore the following methods for now: - const char* Name() const { return "TwoPartComparator"; } - void FindShortestSeparator(std::string*, const leveldb::Slice&) const {} - void FindShortSuccessor(std::string*) const {} -}; -``` - -Now create a database using this custom comparator: - -```c++ -TwoPartComparator cmp; -leveldb::DB* db; -leveldb::Options options; -options.create_if_missing = true; -options.comparator = &cmp; -leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db); -... -``` - -### Backwards compatibility - -The result of the comparator's Name method is attached to the database when it -is created, and is checked on every subsequent database open. If the name -changes, the `leveldb::DB::Open` call will fail. Therefore, change the name if -and only if the new key format and comparison function are incompatible with -existing databases, and it is ok to discard the contents of all existing -databases. - -You can however still gradually evolve your key format over time with a little -bit of pre-planning. For example, you could store a version number at the end of -each key (one byte should suffice for most uses). 
When you wish to switch to a -new key format (e.g., adding an optional third part to the keys processed by -`TwoPartComparator`), (a) keep the same comparator name (b) increment the -version number for new keys (c) change the comparator function so it uses the -version numbers found in the keys to decide how to interpret them. - -## Performance - -Performance can be tuned by changing the default values of the types defined in -`include/leveldb/options.h`. - -### Block size - -leveldb groups adjacent keys together into the same block and such a block is -the unit of transfer to and from persistent storage. The default block size is -approximately 4096 uncompressed bytes. Applications that mostly do bulk scans -over the contents of the database may wish to increase this size. Applications -that do a lot of point reads of small values may wish to switch to a smaller -block size if performance measurements indicate an improvement. There isn't much -benefit in using blocks smaller than one kilobyte, or larger than a few -megabytes. Also note that compression will be more effective with larger block -sizes. - -### Compression - -Each block is individually compressed before being written to persistent -storage. Compression is on by default since the default compression method is -very fast, and is automatically disabled for uncompressible data. In rare cases, -applications may want to disable compression entirely, but should only do so if -benchmarks show a performance improvement: - -```c++ -leveldb::Options options; -options.compression = leveldb::kNoCompression; -... leveldb::DB::Open(options, name, ...) .... -``` - -### Cache - -The contents of the database are stored in a set of files in the filesystem and -each file stores a sequence of compressed blocks. If options.cache is non-NULL, -it is used to cache frequently used uncompressed block contents. 
- -```c++ -#include "leveldb/cache.h" - -leveldb::Options options; -options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache -leveldb::DB* db; -leveldb::DB::Open(options, name, &db); -... use the db ... -delete db -delete options.cache; -``` - -Note that the cache holds uncompressed data, and therefore it should be sized -according to application level data sizes, without any reduction from -compression. (Caching of compressed blocks is left to the operating system -buffer cache, or any custom Env implementation provided by the client.) - -When performing a bulk read, the application may wish to disable caching so that -the data processed by the bulk read does not end up displacing most of the -cached contents. A per-iterator option can be used to achieve this: - -```c++ -leveldb::ReadOptions options; -options.fill_cache = false; -leveldb::Iterator* it = db->NewIterator(options); -for (it->SeekToFirst(); it->Valid(); it->Next()) { - ... -} -``` - -### Key Layout - -Note that the unit of disk transfer and caching is a block. Adjacent keys -(according to the database sort order) will usually be placed in the same block. -Therefore the application can improve its performance by placing keys that are -accessed together near each other and placing infrequently used keys in a -separate region of the key space. - -For example, suppose we are implementing a simple file system on top of leveldb. -The types of entries we might wish to store are: - - filename -> permission-bits, length, list of file_block_ids - file_block_id -> data - -We might want to prefix filename keys with one letter (say '/') and the -`file_block_id` keys with a different letter (say '0') so that scans over just -the metadata do not force us to fetch and cache bulky file contents. - -### Filters - -Because of the way leveldb data is organized on disk, a single `Get()` call may -involve multiple reads from disk. 
The optional FilterPolicy mechanism can be -used to reduce the number of disk reads substantially. - -```c++ -leveldb::Options options; -options.filter_policy = NewBloomFilterPolicy(10); -leveldb::DB* db; -leveldb::DB::Open(options, "/tmp/testdb", &db); -... use the database ... -delete db; -delete options.filter_policy; -``` - -The preceding code associates a Bloom filter based filtering policy with the -database. Bloom filter based filtering relies on keeping some number of bits of -data in memory per key (in this case 10 bits per key since that is the argument -we passed to `NewBloomFilterPolicy`). This filter will reduce the number of -unnecessary disk reads needed for Get() calls by a factor of approximately -a 100. Increasing the bits per key will lead to a larger reduction at the cost -of more memory usage. We recommend that applications whose working set does not -fit in memory and that do a lot of random reads set a filter policy. - -If you are using a custom comparator, you should ensure that the filter policy -you are using is compatible with your comparator. For example, consider a -comparator that ignores trailing spaces when comparing keys. -`NewBloomFilterPolicy` must not be used with such a comparator. Instead, the -application should provide a custom filter policy that also ignores trailing -spaces. 
For example: - -```c++ -class CustomFilterPolicy : public leveldb::FilterPolicy { - private: - FilterPolicy* builtin_policy_; - - public: - CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) {} - ~CustomFilterPolicy() { delete builtin_policy_; } - - const char* Name() const { return "IgnoreTrailingSpacesFilter"; } - - void CreateFilter(const Slice* keys, int n, std::string* dst) const { - // Use builtin bloom filter code after removing trailing spaces - std::vector trimmed(n); - for (int i = 0; i < n; i++) { - trimmed[i] = RemoveTrailingSpaces(keys[i]); - } - return builtin_policy_->CreateFilter(&trimmed[i], n, dst); - } -}; -``` - -Advanced applications may provide a filter policy that does not use a bloom -filter but uses some other mechanism for summarizing a set of keys. See -`leveldb/filter_policy.h` for detail. - -## Checksums - -leveldb associates checksums with all data it stores in the file system. There -are two separate controls provided over how aggressively these checksums are -verified: - -`ReadOptions::verify_checksums` may be set to true to force checksum -verification of all data that is read from the file system on behalf of a -particular read. By default, no such verification is done. - -`Options::paranoid_checks` may be set to true before opening a database to make -the database implementation raise an error as soon as it detects an internal -corruption. Depending on which portion of the database has been corrupted, the -error may be raised when the database is opened, or later by another database -operation. By default, paranoid checking is off so that the database can be used -even if parts of its persistent storage have been corrupted. 
- -If a database is corrupted (perhaps it cannot be opened when paranoid checking -is turned on), the `leveldb::RepairDB` function may be used to recover as much -of the data as possible - -## Approximate Sizes - -The `GetApproximateSizes` method can used to get the approximate number of bytes -of file system space used by one or more key ranges. - -```c++ -leveldb::Range ranges[2]; -ranges[0] = leveldb::Range("a", "c"); -ranges[1] = leveldb::Range("x", "z"); -uint64_t sizes[2]; -leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes); -``` - -The preceding call will set `sizes[0]` to the approximate number of bytes of -file system space used by the key range `[a..c)` and `sizes[1]` to the -approximate number of bytes used by the key range `[x..z)`. - -## Environment - -All file operations (and other operating system calls) issued by the leveldb -implementation are routed through a `leveldb::Env` object. Sophisticated clients -may wish to provide their own Env implementation to get better control. -For example, an application may introduce artificial delays in the file IO -paths to limit the impact of leveldb on other activities in the system. - -```c++ -class SlowEnv : public leveldb::Env { - ... implementation of the Env interface ... -}; - -SlowEnv env; -leveldb::Options options; -options.env = &env; -Status s = leveldb::DB::Open(options, ...); -``` - -## Porting - -leveldb may be ported to a new platform by providing platform specific -implementations of the types/methods/functions exported by -`leveldb/port/port.h`. See `leveldb/port/port_example.h` for more details. - -In addition, the new platform may need a new default `leveldb::Env` -implementation. See `leveldb/util/env_posix.h` for an example. - -## Other Information - -Details about the leveldb implementation may be found in the following -documents: - -1. [Implementation notes](impl.md) -2. [Format of an immutable Table file](table_format.md) -3. 
[Format of a log file](log_format.md) diff --git a/src/leveldb/doc/log_format.md b/src/leveldb/doc/log_format.md deleted file mode 100644 index f32cb5d7d..000000000 --- a/src/leveldb/doc/log_format.md +++ /dev/null @@ -1,75 +0,0 @@ -leveldb Log format -================== -The log file contents are a sequence of 32KB blocks. The only exception is that -the tail of the file may contain a partial block. - -Each block consists of a sequence of records: - - block := record* trailer? - record := - checksum: uint32 // crc32c of type and data[] ; little-endian - length: uint16 // little-endian - type: uint8 // One of FULL, FIRST, MIDDLE, LAST - data: uint8[length] - -A record never starts within the last six bytes of a block (since it won't fit). -Any leftover bytes here form the trailer, which must consist entirely of zero -bytes and must be skipped by readers. - -Aside: if exactly seven bytes are left in the current block, and a new non-zero -length record is added, the writer must emit a FIRST record (which contains zero -bytes of user data) to fill up the trailing seven bytes of the block and then -emit all of the user data in subsequent blocks. - -More types may be added in the future. Some Readers may skip record types they -do not understand, others may report that some data was skipped. - - FULL == 1 - FIRST == 2 - MIDDLE == 3 - LAST == 4 - -The FULL record contains the contents of an entire user record. - -FIRST, MIDDLE, LAST are types used for user records that have been split into -multiple fragments (typically because of block boundaries). FIRST is the type -of the first fragment of a user record, LAST is the type of the last fragment of -a user record, and MIDDLE is the type of all interior fragments of a user -record. - -Example: consider a sequence of user records: - - A: length 1000 - B: length 97270 - C: length 8000 - -**A** will be stored as a FULL record in the first block. 
- -**B** will be split into three fragments: first fragment occupies the rest of -the first block, second fragment occupies the entirety of the second block, and -the third fragment occupies a prefix of the third block. This will leave six -bytes free in the third block, which will be left empty as the trailer. - -**C** will be stored as a FULL record in the fourth block. - ----- - -## Some benefits over the recordio format: - -1. We do not need any heuristics for resyncing - just go to next block boundary - and scan. If there is a corruption, skip to the next block. As a - side-benefit, we do not get confused when part of the contents of one log - file are embedded as a record inside another log file. - -2. Splitting at approximate boundaries (e.g., for mapreduce) is simple: find the - next block boundary and skip records until we hit a FULL or FIRST record. - -3. We do not need extra buffering for large records. - -## Some downsides compared to recordio format: - -1. No packing of tiny records. This could be fixed by adding a new record type, - so it is a shortcoming of the current implementation, not necessarily the - format. - -2. No compression. Again, this could be fixed by adding new record types. diff --git a/src/leveldb/doc/log_format.txt b/src/leveldb/doc/log_format.txt new file mode 100644 index 000000000..3a0414b65 --- /dev/null +++ b/src/leveldb/doc/log_format.txt @@ -0,0 +1,75 @@ +The log file contents are a sequence of 32KB blocks. The only +exception is that the tail of the file may contain a partial block. + +Each block consists of a sequence of records: + block := record* trailer? + record := + checksum: uint32 // crc32c of type and data[] + length: uint16 + type: uint8 // One of FULL, FIRST, MIDDLE, LAST + data: uint8[length] + +A record never starts within the last six bytes of a block (since it +won't fit). Any leftover bytes here form the trailer, which must +consist entirely of zero bytes and must be skipped by readers. 
+ +Aside: if exactly seven bytes are left in the current block, and a new +non-zero length record is added, the writer must emit a FIRST record +(which contains zero bytes of user data) to fill up the trailing seven +bytes of the block and then emit all of the user data in subsequent +blocks. + +More types may be added in the future. Some Readers may skip record +types they do not understand, others may report that some data was +skipped. + +FULL == 1 +FIRST == 2 +MIDDLE == 3 +LAST == 4 + +The FULL record contains the contents of an entire user record. + +FIRST, MIDDLE, LAST are types used for user records that have been +split into multiple fragments (typically because of block boundaries). +FIRST is the type of the first fragment of a user record, LAST is the +type of the last fragment of a user record, and MIDDLE is the type of all +interior fragments of a user record. + +Example: consider a sequence of user records: + A: length 1000 + B: length 97270 + C: length 8000 +A will be stored as a FULL record in the first block. + +B will be split into three fragments: first fragment occupies the rest +of the first block, second fragment occupies the entirety of the +second block, and the third fragment occupies a prefix of the third +block. This will leave six bytes free in the third block, which will +be left empty as the trailer. + +C will be stored as a FULL record in the fourth block. + +=================== + +Some benefits over the recordio format: + +(1) We do not need any heuristics for resyncing - just go to next +block boundary and scan. If there is a corruption, skip to the next +block. As a side-benefit, we do not get confused when part of the +contents of one log file are embedded as a record inside another log +file. + +(2) Splitting at approximate boundaries (e.g., for mapreduce) is +simple: find the next block boundary and skip records until we +hit a FULL or FIRST record. + +(3) We do not need extra buffering for large records.
+ +Some downsides compared to recordio format: + +(1) No packing of tiny records. This could be fixed by adding a new +record type, so it is a shortcoming of the current implementation, +not necessarily the format. + +(2) No compression. Again, this could be fixed by adding new record types. diff --git a/src/leveldb/doc/table_format.md b/src/leveldb/doc/table_format.md deleted file mode 100644 index 5fe7e7241..000000000 --- a/src/leveldb/doc/table_format.md +++ /dev/null @@ -1,107 +0,0 @@ -leveldb File format -=================== - - <beginning_of_file> - [data block 1] - [data block 2] - ... - [data block N] - [meta block 1] - ... - [meta block K] - [metaindex block] - [index block] - [Footer] (fixed size; starts at file_size - sizeof(Footer)) - <end_of_file> - -The file contains internal pointers. Each such pointer is called -a BlockHandle and contains the following information: - - offset: varint64 - size: varint64 - -See [varints](https://developers.google.com/protocol-buffers/docs/encoding#varints) -for an explanation of varint64 format. - -1. The sequence of key/value pairs in the file are stored in sorted -order and partitioned into a sequence of data blocks. These blocks -come one after another at the beginning of the file. Each data block -is formatted according to the code in `block_builder.cc`, and then -optionally compressed. - -2. After the data blocks we store a bunch of meta blocks. The -supported meta block types are described below. More meta block types -may be added in the future. Each meta block is again formatted using -`block_builder.cc` and then optionally compressed. - -3. A "metaindex" block. It contains one entry for every other meta -block where the key is the name of the meta block and the value is a -BlockHandle pointing to that meta block. - -4. An "index" block. This block contains one entry per data block, -where the key is a string >= last key in that data block and before -the first key in the successive data block. The value is the -BlockHandle for the data block.
- -5. At the very end of the file is a fixed length footer that contains -the BlockHandle of the metaindex and index blocks as well as a magic number. - - metaindex_handle: char[p]; // Block handle for metaindex - index_handle: char[q]; // Block handle for index - padding: char[40-p-q];// zeroed bytes to make fixed length - // (40==2*BlockHandle::kMaxEncodedLength) - magic: fixed64; // == 0xdb4775248b80fb57 (little-endian) - -## "filter" Meta Block - -If a `FilterPolicy` was specified when the database was opened, a -filter block is stored in each table. The "metaindex" block contains -an entry that maps from `filter.<N>` to the BlockHandle for the filter -block where `<N>` is the string returned by the filter policy's -`Name()` method. - -The filter block stores a sequence of filters, where filter i contains -the output of `FilterPolicy::CreateFilter()` on all keys that are stored -in a block whose file offset falls within the range - - [ i*base ... (i+1)*base-1 ] - -Currently, "base" is 2KB. So for example, if blocks X and Y start in -the range `[ 0KB .. 2KB-1 ]`, all of the keys in X and Y will be -converted to a filter by calling `FilterPolicy::CreateFilter()`, and the -resulting filter will be stored as the first filter in the filter -block. - -The filter block is formatted as follows: - - [filter 0] - [filter 1] - [filter 2] - ... - [filter N-1] - - [offset of filter 0] : 4 bytes - [offset of filter 1] : 4 bytes - [offset of filter 2] : 4 bytes - ... - [offset of filter N-1] : 4 bytes - - [offset of beginning of offset array] : 4 bytes - lg(base) : 1 byte - -The offset array at the end of the filter block allows efficient -mapping from a data block offset to the corresponding filter. - -## "stats" Meta Block - -This meta block contains a bunch of stats. The key is the name -of the statistic. The value contains the statistic. - -TODO(postrelease): record following stats.
- - data size - index size - key size (uncompressed) - value size (uncompressed) - number of entries - number of data blocks diff --git a/src/leveldb/doc/table_format.txt b/src/leveldb/doc/table_format.txt new file mode 100644 index 000000000..d0f3065ed --- /dev/null +++ b/src/leveldb/doc/table_format.txt @@ -0,0 +1,102 @@ +File format +=========== + + <beginning_of_file> + [data block 1] + [data block 2] + ... + [data block N] + [meta block 1] + ... + [meta block K] + [metaindex block] + [index block] + [Footer] (fixed size; starts at file_size - sizeof(Footer)) + <end_of_file> + +The file contains internal pointers. Each such pointer is called +a BlockHandle and contains the following information: + offset: varint64 + size: varint64 + +(1) The sequence of key/value pairs in the file are stored in sorted +order and partitioned into a sequence of data blocks. These blocks +come one after another at the beginning of the file. Each data block +is formatted according to the code in block_builder.cc, and then +optionally compressed. + +(2) After the data blocks we store a bunch of meta blocks. The +supported meta block types are described below. More meta block types +may be added in the future. Each meta block is again formatted using +block_builder.cc and then optionally compressed. + +(3) A "metaindex" block. It contains one entry for every other meta +block where the key is the name of the meta block and the value is a +BlockHandle pointing to that meta block. + +(4) An "index" block. This block contains one entry per data block, +where the key is a string >= last key in that data block and before +the first key in the successive data block. The value is the +BlockHandle for the data block. + +(5) At the very end of the file is a fixed length footer that contains +the BlockHandle of the metaindex and index blocks as well as a magic number.
+ metaindex_handle: char[p]; // Block handle for metaindex + index_handle: char[q]; // Block handle for index + padding: char[40-p-q]; // 0 bytes to make fixed length + // (40==2*BlockHandle::kMaxEncodedLength) + magic: fixed64; // == 0xdb4775248b80fb57 + +"filter" Meta Block +------------------- + +If a "FilterPolicy" was specified when the database was opened, a +filter block is stored in each table. The "metaindex" block contains +an entry that maps from "filter.<N>" to the BlockHandle for the filter +block where "<N>" is the string returned by the filter policy's +"Name()" method. + +The filter block stores a sequence of filters, where filter i contains +the output of FilterPolicy::CreateFilter() on all keys that are stored +in a block whose file offset falls within the range + + [ i*base ... (i+1)*base-1 ] + +Currently, "base" is 2KB. So for example, if blocks X and Y start in +the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be +converted to a filter by calling FilterPolicy::CreateFilter(), and the +resulting filter will be stored as the first filter in the filter +block. + +The filter block is formatted as follows: + + [filter 0] + [filter 1] + [filter 2] + ... + [filter N-1] + + [offset of filter 0] : 4 bytes + [offset of filter 1] : 4 bytes + [offset of filter 2] : 4 bytes + ... + [offset of filter N-1] : 4 bytes + + [offset of beginning of offset array] : 4 bytes + lg(base) : 1 byte + +The offset array at the end of the filter block allows efficient +mapping from a data block offset to the corresponding filter. + +"stats" Meta Block +------------------ + +This meta block contains a bunch of stats. The key is the name +of the statistic. The value contains the statistic. +TODO(postrelease): record following stats.
+ data size + index size + key size (uncompressed) + value size (uncompressed) + number of entries + number of data blocks diff --git a/src/leveldb/helpers/memenv/memenv.cc b/src/leveldb/helpers/memenv/memenv.cc index 68c0614a5..efad9524a 100644 --- a/src/leveldb/helpers/memenv/memenv.cc +++ b/src/leveldb/helpers/memenv/memenv.cc @@ -55,15 +55,14 @@ class FileState { } const uint64_t available = size_ - offset; if (n > available) { - n = static_cast(available); + n = available; } if (n == 0) { *result = Slice(); return Status::OK(); } - assert(offset / kBlockSize <= SIZE_MAX); - size_t block = static_cast(offset / kBlockSize); + size_t block = offset / kBlockSize; size_t block_offset = offset % kBlockSize; if (n <= kBlockSize - block_offset) { @@ -168,7 +167,7 @@ class SequentialFileImpl : public SequentialFile { if (pos_ > file_->Size()) { return Status::IOError("pos_ > file_->Size()"); } - const uint64_t available = file_->Size() - pos_; + const size_t available = file_->Size() - pos_; if (n > available) { n = available; } @@ -176,10 +175,9 @@ class SequentialFileImpl : public SequentialFile { return Status::OK(); } - virtual std::string GetName() const { return "[memenv]"; } private: FileState* file_; - uint64_t pos_; + size_t pos_; }; class RandomAccessFileImpl : public RandomAccessFile { @@ -197,7 +195,6 @@ class RandomAccessFileImpl : public RandomAccessFile { return file_->Read(offset, n, result, scratch); } - virtual std::string GetName() const { return "[memenv]"; } private: FileState* file_; }; @@ -220,16 +217,10 @@ class WritableFileImpl : public WritableFile { virtual Status Flush() { return Status::OK(); } virtual Status Sync() { return Status::OK(); } - virtual std::string GetName() const { return "[memenv]"; } private: FileState* file_; }; -class NoOpLogger : public Logger { - public: - virtual void Logv(const char* format, va_list ap) { } -}; - class InMemoryEnv : public EnvWrapper { public: explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) 
{ } @@ -266,7 +257,7 @@ class InMemoryEnv : public EnvWrapper { } virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { + WritableFile** result, size_t) { MutexLock lock(&mutex_); if (file_map_.find(fname) != file_map_.end()) { DeleteFileInternal(fname); @@ -280,19 +271,6 @@ class InMemoryEnv : public EnvWrapper { return Status::OK(); } - virtual Status NewAppendableFile(const std::string& fname, - WritableFile** result) { - MutexLock lock(&mutex_); - FileState** sptr = &file_map_[fname]; - FileState* file = *sptr; - if (file == NULL) { - file = new FileState(); - file->Ref(); - } - *result = new WritableFileImpl(file); - return Status::OK(); - } - virtual bool FileExists(const std::string& fname) { MutexLock lock(&mutex_); return file_map_.find(fname) != file_map_.end(); @@ -380,11 +358,6 @@ class InMemoryEnv : public EnvWrapper { return Status::OK(); } - virtual Status NewLogger(const std::string& fname, Logger** result) { - *result = new NoOpLogger; - return Status::OK(); - } - private: // Map from filenames to FileState objects, representing a simple file system. typedef std::map FileSystem; diff --git a/src/leveldb/helpers/memenv/memenv_test.cc b/src/leveldb/helpers/memenv/memenv_test.cc index 5cff77613..38ee6ac3e 100644 --- a/src/leveldb/helpers/memenv/memenv_test.cc +++ b/src/leveldb/helpers/memenv/memenv_test.cc @@ -29,68 +29,61 @@ TEST(MemEnvTest, Basics) { uint64_t file_size; WritableFile* writable_file; std::vector children; + std::string dbname; - ASSERT_OK(env_->CreateDir("/dir")); + dbname=test::TmpDir(); + ASSERT_OK(env_->CreateDir(dbname.c_str())); // Check that the directory is empty. 
- ASSERT_TRUE(!env_->FileExists("/dir/non_existent")); - ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok()); - ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_TRUE(!env_->FileExists(dbname + "/non_existent")); + ASSERT_TRUE(!env_->GetFileSize(dbname + "/non_existent", &file_size).ok()); + ASSERT_OK(env_->GetChildren(dbname + "", &children)); ASSERT_EQ(0, children.size()); // Create a file. - ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); - ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); - ASSERT_EQ(0, file_size); + ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20)); delete writable_file; // Check that the file exists. - ASSERT_TRUE(env_->FileExists("/dir/f")); - ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_TRUE(env_->FileExists(dbname + "/f")); + ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size)); ASSERT_EQ(0, file_size); - ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_OK(env_->GetChildren(dbname + "", &children)); ASSERT_EQ(1, children.size()); ASSERT_EQ("f", children[0]); // Write to the file. - ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); + ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20)); ASSERT_OK(writable_file->Append("abc")); delete writable_file; - // Check that append works. - ASSERT_OK(env_->NewAppendableFile("/dir/f", &writable_file)); - ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); - ASSERT_EQ(3, file_size); - ASSERT_OK(writable_file->Append("hello")); - delete writable_file; - // Check for expected size. - ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); - ASSERT_EQ(8, file_size); + ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size)); + ASSERT_EQ(3, file_size); // Check that renaming works. 
- ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok()); - ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g")); - ASSERT_TRUE(!env_->FileExists("/dir/f")); - ASSERT_TRUE(env_->FileExists("/dir/g")); - ASSERT_OK(env_->GetFileSize("/dir/g", &file_size)); - ASSERT_EQ(8, file_size); + ASSERT_TRUE(!env_->RenameFile(dbname + "/non_existent", dbname + "/g").ok()); + ASSERT_OK(env_->RenameFile(dbname + "/f", dbname + "/g")); + ASSERT_TRUE(!env_->FileExists(dbname + "/f")); + ASSERT_TRUE(env_->FileExists(dbname + "/g")); + ASSERT_OK(env_->GetFileSize(dbname + "/g", &file_size)); + ASSERT_EQ(3, file_size); // Check that opening non-existent file fails. SequentialFile* seq_file; RandomAccessFile* rand_file; - ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file).ok()); + ASSERT_TRUE(!env_->NewSequentialFile(dbname + "/non_existent", &seq_file).ok()); ASSERT_TRUE(!seq_file); - ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file).ok()); + ASSERT_TRUE(!env_->NewRandomAccessFile(dbname + "/non_existent", &rand_file).ok()); ASSERT_TRUE(!rand_file); // Check that deleting works. 
- ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok()); - ASSERT_OK(env_->DeleteFile("/dir/g")); - ASSERT_TRUE(!env_->FileExists("/dir/g")); - ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_TRUE(!env_->DeleteFile(dbname + "/non_existent").ok()); + ASSERT_OK(env_->DeleteFile(dbname + "/g")); + ASSERT_TRUE(!env_->FileExists(dbname + "/g")); + ASSERT_OK(env_->GetChildren(dbname + "", &children)); ASSERT_EQ(0, children.size()); - ASSERT_OK(env_->DeleteDir("/dir")); + ASSERT_OK(env_->DeleteDir(dbname + "")); } TEST(MemEnvTest, ReadWrite) { @@ -99,16 +92,19 @@ TEST(MemEnvTest, ReadWrite) { RandomAccessFile* rand_file; Slice result; char scratch[100]; + std::string dbname; - ASSERT_OK(env_->CreateDir("/dir")); + dbname=test::TmpDir(); - ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); + ASSERT_OK(env_->CreateDir(dbname + "")); + + ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20)); ASSERT_OK(writable_file->Append("hello ")); ASSERT_OK(writable_file->Append("world")); delete writable_file; // Read sequentially. - ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file)); + ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file)); ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". ASSERT_EQ(0, result.compare("hello")); ASSERT_OK(seq_file->Skip(1)); @@ -122,7 +118,7 @@ TEST(MemEnvTest, ReadWrite) { delete seq_file; // Random reads. - ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file)); + ASSERT_OK(env_->NewRandomAccessFile(dbname + "/f", &rand_file)); ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". ASSERT_EQ(0, result.compare("world")); ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". @@ -149,7 +145,7 @@ TEST(MemEnvTest, Misc) { ASSERT_TRUE(!test_dir.empty()); WritableFile* writable_file; - ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file)); + ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, 2<<20)); // These are no-ops, but we test they return success. 
ASSERT_OK(writable_file->Sync()); @@ -161,6 +157,9 @@ TEST(MemEnvTest, Misc) { TEST(MemEnvTest, LargeWrite) { const size_t kWriteSize = 300 * 1024; char* scratch = new char[kWriteSize * 2]; + std::string dbname; + + dbname=test::TmpDir(); std::string write_data; for (size_t i = 0; i < kWriteSize; ++i) { @@ -168,14 +167,14 @@ TEST(MemEnvTest, LargeWrite) { } WritableFile* writable_file; - ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); + ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20)); ASSERT_OK(writable_file->Append("foo")); ASSERT_OK(writable_file->Append(write_data)); delete writable_file; SequentialFile* seq_file; Slice result; - ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file)); + ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file)); ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". ASSERT_EQ(0, result.compare("foo")); @@ -190,17 +189,21 @@ TEST(MemEnvTest, LargeWrite) { delete seq_file; delete [] scratch; } - +#if 0 TEST(MemEnvTest, DBTest) { Options options; options.create_if_missing = true; options.env = env_; DB* db; + std::string dbname; + + dbname=test::TmpDir(); + ASSERT_OK(env_->CreateDir(dbname+ "/db")); const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; - ASSERT_OK(DB::Open(options, "/dir/db", &db)); + ASSERT_OK(DB::Open(options, dbname + "/db", &db)); for (size_t i = 0; i < 3; ++i) { ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); } @@ -233,7 +236,7 @@ TEST(MemEnvTest, DBTest) { delete db; } - +#endif } // namespace leveldb int main(int argc, char** argv) { diff --git a/src/leveldb/include/leveldb/atomics.h b/src/leveldb/include/leveldb/atomics.h new file mode 100644 index 000000000..6b2a4887b --- /dev/null +++ b/src/leveldb/include/leveldb/atomics.h @@ -0,0 +1,227 @@ +// ------------------------------------------------------------------- +// +// atomics.h: portable atomic operations for leveldb/eleveldb 
(http://code.google.com/p/leveldb/) +// +// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +/// Copied from basho/eleveldb/c_src/detail.hpp September 8, 2013 + +#ifndef LEVELDB_ATOMIC_H + #define LEVELDB_ATOMIC_H 1 + +#include +#include + +/* These can be hopefully-replaced with constexpr or compile-time assert later: */ +#if defined(OS_SOLARIS) || defined(SOLARIS) || defined(sun) + #define LEVELDB_IS_SOLARIS 1 +#else + #undef LEVELDB_IS_SOLARIS +#endif + +#ifdef LEVELDB_IS_SOLARIS + #include +#endif + +namespace leveldb { + +/** + * Compare and swap + */ + +// primary template +template +inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val); + + +// uint32 size (needed for solaris) +template <> +inline bool compare_and_swap(volatile uint32_t *ptr, const int& comp_val, const int& exchange_val) +{ +#if LEVELDB_IS_SOLARIS + return ((uint32_t) comp_val==atomic_cas_32(ptr, comp_val, exchange_val)); +#else + return __sync_bool_compare_and_swap(ptr, comp_val, exchange_val); +#endif +} + + +// generic specification ... 
for pointers +template +inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val) +{ +#if LEVELDB_IS_SOLARIS + return (comp_val==atomic_cas_ptr(ptr, comp_val, exchange_val)); +#else + return __sync_bool_compare_and_swap(ptr, comp_val, exchange_val); +#endif +} + + +/** + * Atomic increment + */ + +template +inline ValueT inc_and_fetch(volatile ValueT *ptr); + +template <> +inline uint64_t inc_and_fetch(volatile uint64_t *ptr) +{ +#if LEVELDB_IS_SOLARIS + return atomic_inc_64_nv(ptr); +#else + return __sync_add_and_fetch(ptr, 1); +#endif +} + +template <> +inline uint32_t inc_and_fetch(volatile uint32_t *ptr) +{ +#if LEVELDB_IS_SOLARIS + return atomic_inc_32_nv(ptr); +#else + return __sync_add_and_fetch(ptr, 1); +#endif +} + +#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__)) +template <> +inline size_t inc_and_fetch(volatile size_t *ptr) +{ + return __sync_add_and_fetch(ptr, 1); +} +#endif + + +/** + * atomic decrement + */ + +template +inline ValueT dec_and_fetch(volatile ValueT *ptr); + +template <> +inline uint64_t dec_and_fetch(volatile uint64_t *ptr) +{ +#if LEVELDB_IS_SOLARIS + return atomic_dec_64_nv(ptr); +#else + return __sync_sub_and_fetch(ptr, 1); +#endif +} + +template <> +inline uint32_t dec_and_fetch(volatile uint32_t *ptr) +{ +#if LEVELDB_IS_SOLARIS + return atomic_dec_32_nv(ptr); +#else + return __sync_sub_and_fetch(ptr, 1); +#endif +} + +#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__)) +template <> +inline size_t dec_and_fetch(volatile size_t *ptr) +{ + return __sync_sub_and_fetch(ptr, 1); +} +#endif + + +/** + * Atomic add + */ + + +template +inline ValueT add_and_fetch(volatile ValueT *ptr, ValueT val); + +template <> +inline uint64_t add_and_fetch(volatile uint64_t *ptr, uint64_t val) +{ +#if LEVELDB_IS_SOLARIS + return atomic_add_64_nv(ptr, val); +#else + return __sync_add_and_fetch(ptr, val); +#endif +} + +template 
<> +inline uint32_t add_and_fetch(volatile uint32_t *ptr, uint32_t val) +{ +#if LEVELDB_IS_SOLARIS + return atomic_add_32_nv(ptr, val); +#else + return __sync_add_and_fetch(ptr, val); +#endif +} + +#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__)) +template <> +inline size_t add_and_fetch(volatile size_t *ptr, size_t val) +{ + return __sync_add_and_fetch(ptr, val); +} +#endif + + +/** + * Atomic subtract + */ + +template +inline ValueT sub_and_fetch(volatile ValueT *ptr, ValueT val); + +template <> +inline uint64_t sub_and_fetch(volatile uint64_t *ptr, uint64_t val) +{ +#if LEVELDB_IS_SOLARIS + uint64_t temp=(~val)+1; // 2's complement, bypass sign warnings + return atomic_add_64_nv(ptr, temp); +#else + return __sync_sub_and_fetch(ptr, val); +#endif +} + +template <> +inline uint32_t sub_and_fetch(volatile uint32_t *ptr, uint32_t val) +{ +#if LEVELDB_IS_SOLARIS + uint32_t temp=(~val)+1; // 2's complement, bypass sign warnings + return atomic_add_32_nv(ptr, temp); +#else + return __sync_sub_and_fetch(ptr, val); +#endif +} + +#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__)) +template <> +inline size_t sub_and_fetch(volatile size_t *ptr, size_t val) +{ + return __sync_sub_and_fetch(ptr, val); +} +#endif + + + +} // namespace leveldb + +#endif diff --git a/src/leveldb/include/leveldb/c.h b/src/leveldb/include/leveldb/c.h index 1048fe3b8..d3eda280e 100644 --- a/src/leveldb/include/leveldb/c.h +++ b/src/leveldb/include/leveldb/c.h @@ -9,6 +9,7 @@ Does not support: . getters for the option types . custom comparators that implement key shortening + . capturing post-write-snapshot . custom iter, db, env, cache implementations using just the C bindings Some conventions: @@ -27,7 +28,6 @@ be true on entry: *errptr == NULL *errptr points to a malloc()ed null-terminated error message - (On Windows, *errptr must have been malloc()-ed by this library.) 
On success, a leveldb routine leaves *errptr unchanged. On failure, leveldb frees the old value of *errptr and set *errptr to a malloc()ed error message. @@ -66,7 +66,7 @@ typedef struct leveldb_snapshot_t leveldb_snapshot_t; typedef struct leveldb_writablefile_t leveldb_writablefile_t; typedef struct leveldb_writebatch_t leveldb_writebatch_t; typedef struct leveldb_writeoptions_t leveldb_writeoptions_t; - +typedef struct leveldb_keymetadata_t leveldb_keymetadata_t; /* DB operations */ extern leveldb_t* leveldb_open( @@ -83,6 +83,14 @@ extern void leveldb_put( const char* val, size_t vallen, char** errptr); +extern void leveldb_put2( + leveldb_t* db, + const leveldb_writeoptions_t* options, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr, + const leveldb_keymetadata_t * metadata); + extern void leveldb_delete( leveldb_t* db, const leveldb_writeoptions_t* options, @@ -104,6 +112,14 @@ extern char* leveldb_get( size_t* vallen, char** errptr); +extern char* leveldb_get2( + leveldb_t* db, + const leveldb_readoptions_t* options, + const char* key, size_t keylen, + size_t* vallen, + char** errptr, + leveldb_keymetadata_t * metadata); + extern leveldb_iterator_t* leveldb_create_iterator( leveldb_t* db, const leveldb_readoptions_t* options); @@ -156,6 +172,7 @@ extern void leveldb_iter_next(leveldb_iterator_t*); extern void leveldb_iter_prev(leveldb_iterator_t*); extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen); extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen); +extern const void leveldb_iter_keymetadata(const leveldb_iterator_t *, leveldb_keymetadata_t *); extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr); /* Write batch */ @@ -167,13 +184,19 @@ extern void leveldb_writebatch_put( leveldb_writebatch_t*, const char* key, size_t klen, const char* val, size_t vlen); +extern void leveldb_writebatch_put2( + leveldb_writebatch_t*, + const char* key, size_t 
klen, + const char* val, size_t vlen, + const leveldb_keymetadata_t * meta); extern void leveldb_writebatch_delete( leveldb_writebatch_t*, const char* key, size_t klen); extern void leveldb_writebatch_iterate( leveldb_writebatch_t*, void* state, - void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen, + const int & type, const uint64_t & expiry), void (*deleted)(void*, const char* k, size_t klen)); /* Options */ @@ -192,6 +215,8 @@ extern void leveldb_options_set_error_if_exists( leveldb_options_t*, unsigned char); extern void leveldb_options_set_paranoid_checks( leveldb_options_t*, unsigned char); +extern void leveldb_options_set_verify_compactions( + leveldb_options_t*, unsigned char); extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*); extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*); extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t); @@ -199,6 +224,7 @@ extern void leveldb_options_set_max_open_files(leveldb_options_t*, int); extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*); extern void leveldb_options_set_block_size(leveldb_options_t*, size_t); extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int); +extern void leveldb_options_set_total_leveldb_mem(leveldb_options_t*, size_t); enum { leveldb_no_compression = 0, @@ -267,20 +293,20 @@ extern void leveldb_cache_destroy(leveldb_cache_t* cache); extern leveldb_env_t* leveldb_create_default_env(); extern void leveldb_env_destroy(leveldb_env_t*); +extern void leveldb_env_shutdown(); -/* Utility */ +/* Util */ -/* Calls free(ptr). - REQUIRES: ptr was malloc()-ed and returned by one of the routines - in this file. Note that in certain cases (typically on Windows), you - may need to call this routine instead of free(ptr) to dispose of - malloc()-ed memory returned by this library. 
*/ +/** + * CAUTION: this call is only for char * objects returned by + * functions like leveldb_get and leveldb_property_value. + * Also used to release errptr strings. + */ extern void leveldb_free(void* ptr); -/* Return the major version number for this release. */ -extern int leveldb_major_version(); +/* Version */ -/* Return the minor version number for this release. */ +extern int leveldb_major_version(); extern int leveldb_minor_version(); #ifdef __cplusplus diff --git a/src/leveldb/include/leveldb/cache.h b/src/leveldb/include/leveldb/cache.h index 6819d5bc4..224e18d2a 100644 --- a/src/leveldb/include/leveldb/cache.h +++ b/src/leveldb/include/leveldb/cache.h @@ -29,6 +29,11 @@ class Cache; // of Cache uses a least-recently-used eviction policy. extern Cache* NewLRUCache(size_t capacity); +// Riak customization - just like NewLRUCache except the underlying +// structure is NOT sharded. Better for file cache. +extern Cache* NewLRUCache2(size_t capacity); + + class Cache { public: Cache() { } @@ -81,16 +86,17 @@ class Cache { // its cache keys. virtual uint64_t NewId() = 0; - // Remove all cache entries that are not actively in use. Memory-constrained - // applications may wish to call this method to reduce memory usage. - // Default implementation of Prune() does nothing. Subclasses are strongly - // encouraged to override the default implementation. A future release of - // leveldb may change Prune() to a pure abstract method. - virtual void Prune() {} + // Return size, if any, of per entry overhead for item placed in cache. + // Allows more accurate tracking of "charge" against each cache item. + virtual size_t EntryOverheadSize() {return(0);}; - // Return an estimate of the combined charges of all elements stored in the - // cache. 
- virtual size_t TotalCharge() const = 0; + // Riak specific: Add a reference to cache object to help hold it + // in memory + virtual void Addref(Handle* e) = 0; + + // Riak specific: walk contents of entire cache, calling functor Acc + // with the "value" for each cache entry. Locks cache throughout call. + virtual bool WalkCache(class CacheAccumulator & Acc) {return(true);}; private: void LRU_Remove(Handle* e); @@ -107,4 +113,4 @@ class Cache { } // namespace leveldb -#endif // STORAGE_LEVELDB_INCLUDE_CACHE_H_ +#endif // STORAGE_LEVELDB_UTIL_CACHE_H_ diff --git a/src/leveldb/include/leveldb/comparator.h b/src/leveldb/include/leveldb/comparator.h index 556b984c7..38b59539e 100644 --- a/src/leveldb/include/leveldb/comparator.h +++ b/src/leveldb/include/leveldb/comparator.h @@ -58,6 +58,10 @@ class Comparator { // must not be deleted. extern const Comparator* BytewiseComparator(); +// Riak specific: cleans up the default comparitor to make +// valgrind results clean +extern void ComparatorShutdown(); + } // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ diff --git a/src/leveldb/include/leveldb/db.h b/src/leveldb/include/leveldb/db.h index bfab10a0b..d2bd6dce3 100644 --- a/src/leveldb/include/leveldb/db.h +++ b/src/leveldb/include/leveldb/db.h @@ -14,7 +14,7 @@ namespace leveldb { // Update Makefile if you change these static const int kMajorVersion = 1; -static const int kMinorVersion = 20; +static const int kMinorVersion = 9; struct Options; struct ReadOptions; @@ -38,6 +38,17 @@ struct Range { Range(const Slice& s, const Slice& l) : start(s), limit(l) { } }; +// Abstract holder for a DB value. +// This allows callers to manage their own value buffers and have +// DB values copied directly into those buffers. +class Value { + public: + virtual Value& assign(const char* data, size_t size) = 0; + + protected: + virtual ~Value(); +}; + // A DB is a persistent ordered map from keys to values. 
// A DB is safe for concurrent access from multiple threads without // any external synchronization. @@ -60,7 +71,8 @@ class DB { // Note: consider setting options.sync = true. virtual Status Put(const WriteOptions& options, const Slice& key, - const Slice& value) = 0; + const Slice& value, + const KeyMetaData * meta=NULL) = 0; // Remove the database entry (if any) for "key". Returns OK on // success, and a non-OK status on error. It is not an error if "key" @@ -81,7 +93,11 @@ class DB { // // May return some other Status on an error. virtual Status Get(const ReadOptions& options, - const Slice& key, std::string* value) = 0; + const Slice& key, std::string* value, + KeyMetaData * meta=NULL) = 0; + virtual Status Get(const ReadOptions& options, + const Slice& key, Value* value, + KeyMetaData * meta=NULL) = 0; // Return a heap-allocated iterator over the contents of the database. // The result of NewIterator() is initially invalid (caller must @@ -115,8 +131,6 @@ class DB { // about the internal operation of the DB. // "leveldb.sstables" - returns a multi-line string that describes all // of the sstables that make up the db contents. - // "leveldb.approximate-memory-usage" - returns the approximate number of - // bytes of memory in use by the DB. virtual bool GetProperty(const Slice& property, std::string* value) = 0; // For each i in [0,n-1], store in "sizes[i]", the approximate @@ -142,6 +156,21 @@ class DB { // db->CompactRange(NULL, NULL); virtual void CompactRange(const Slice* begin, const Slice* end) = 0; + // Riak specific function: Verify that no .sst files overlap + // within the levels that expect non-overlapping files. Run + // compactions as necessary to correct. Assumes DB opened + // with Options.is_repair=true + virtual Status VerifyLevels(); + + // Riak specific function: Request database check for + // available compactions. 
This is to stimulate retry of + // grooming that might have been offered and rejected previously + virtual void CheckAvailableCompactions(); + + // Riak specific function: Give external code, namely + // eleveldb, access to leveldb's logging routines. + virtual Logger* GetLogger() const { return NULL; } + private: // No copying allowed DB(const DB&); diff --git a/src/leveldb/include/leveldb/dumpfile.h b/src/leveldb/include/leveldb/dumpfile.h deleted file mode 100644 index 3f97fda16..000000000 --- a/src/leveldb/include/leveldb/dumpfile.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2014 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_ -#define STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_ - -#include -#include "leveldb/env.h" -#include "leveldb/status.h" - -namespace leveldb { - -// Dump the contents of the file named by fname in text format to -// *dst. Makes a sequence of dst->Append() calls; each call is passed -// the newline-terminated text corresponding to a single item found -// in the file. -// -// Returns a non-OK result if fname does not name a leveldb storage -// file, or if the file cannot be read. 
-Status DumpFile(Env* env, const std::string& fname, WritableFile* dst); - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_ diff --git a/src/leveldb/include/leveldb/env.h b/src/leveldb/include/leveldb/env.h index 275d441ea..e1df0c78d 100644 --- a/src/leveldb/include/leveldb/env.h +++ b/src/leveldb/include/leveldb/env.h @@ -13,15 +13,19 @@ #ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ #define STORAGE_LEVELDB_INCLUDE_ENV_H_ +#include +#include #include #include -#include #include +#include "leveldb/perf_count.h" #include "leveldb/status.h" namespace leveldb { +class AppendableFile; class FileLock; +struct Options; class Logger; class RandomAccessFile; class SequentialFile; @@ -40,6 +44,11 @@ class Env { // The result of Default() belongs to leveldb and must never be deleted. static Env* Default(); + // Riak specific: Shutdown background work threads and other objects + // to get clean environment for valgrind memory test. No restart supported + // after this call. Not thread safe. + static void Shutdown(); + // Create a brand new sequentially-readable file with the specified name. // On success, stores a pointer to the new file in *result and returns OK. // On failure stores NULL in *result and returns non-OK. If the file does @@ -67,22 +76,31 @@ class Env { // // The returned file will only be accessed by one thread at a time. virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) = 0; + WritableFile** result, + size_t map_size) = 0; - // Create an object that either appends to an existing file, or - // writes to a new file (if the file does not exist to begin with). - // On success, stores a pointer to the new file in *result and - // returns OK. On failure stores NULL in *result and returns - // non-OK. + // Riak specific: + // Derived from NewWritableFile. One change: if the file exists, + // move to the end of the file and continue writing. + // new file. 
On success, stores a pointer to the open file in + // *result and returns OK. On failure stores NULL in *result and + // returns non-OK. // // The returned file will only be accessed by one thread at a time. - // - // May return an IsNotSupportedError error if this Env does - // not allow appending to an existing file. Users of Env (including - // the leveldb implementation) must be prepared to deal with - // an Env that does not support appending. virtual Status NewAppendableFile(const std::string& fname, - WritableFile** result); + WritableFile** result, + size_t map_size) = 0; + + // Riak specific: + // Allows for virtualized version of NewWritableFile that enables write + // and close operations to execute on background threads + // (where platform supported). + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewWriteOnlyFile(const std::string& fname, + WritableFile** result, + size_t map_size) + {return(NewWritableFile(fname, result, map_size));}; // Returns true iff the named file exists. virtual bool FileExists(const std::string& fname) = 0; @@ -142,7 +160,7 @@ class Env { // Start a new thread, invoking "function(arg)" within the new thread. // When "function(arg)" returns, the thread will be destroyed. - virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + virtual pthread_t StartThread(void (*function)(void* arg), void* arg) = 0; // *path is set to a temporary directory that can be used for testing. It may // or many not have just been created. The directory may or may not differ @@ -157,9 +175,16 @@ class Env { // useful for computing deltas of time. virtual uint64_t NowMicros() = 0; - // Sleep/delay the thread for the prescribed number of micro-seconds. + // Sleep/delay the thread for the perscribed number of micro-seconds. 
virtual void SleepForMicroseconds(int micros) = 0; + // Riak specific: Get object that is tracking various software counters + virtual PerformanceCounters * GetPerformanceCounters() {return(gPerfCounters);}; + + // Riak specific: Request size of recovery memory map, potentially using + // Options data for the decision. Default 2Mbyte is Google's original size. + virtual size_t RecoveryMmapSize(const struct Options *) const {return(2*1024*1024L);}; + private: // No copying allowed Env(const Env&); @@ -190,14 +215,6 @@ class SequentialFile { // // REQUIRES: External synchronization virtual Status Skip(uint64_t n) = 0; - - // Get a name for the file, only for error reporting - virtual std::string GetName() const = 0; - - private: - // No copying allowed - SequentialFile(const SequentialFile&); - void operator=(const SequentialFile&); }; // A file abstraction for randomly reading the contents of a file. @@ -218,13 +235,11 @@ class RandomAccessFile { virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const = 0; - // Get a name for the file, only for error reporting - virtual std::string GetName() const = 0; + // Riak optimization: allows advising Linux page cache + virtual void SetForCompaction(uint64_t file_size) {}; - private: - // No copying allowed - RandomAccessFile(const RandomAccessFile&); - void operator=(const RandomAccessFile&); + // Riak addition: size of this structure in bytes + virtual size_t ObjectSize() {return(sizeof(RandomAccessFile));}; }; // A file abstraction for sequential writing. The implementation @@ -240,8 +255,10 @@ class WritableFile { virtual Status Flush() = 0; virtual Status Sync() = 0; - // Get a name for the file, only for error reporting - virtual std::string GetName() const = 0; + // Riak specific: + // Provide hint where key/value data ends and metadata starts + // in an .sst table file. 
+ virtual void SetMetadataOffset(uint64_t) {}; private: // No copying allowed @@ -249,12 +266,30 @@ class WritableFile { void operator=(const WritableFile&); }; +// A file abstraction for sequential writing at end of existing file. +class AppendableFile: public WritableFile { + public: + AppendableFile() { } + virtual ~AppendableFile(); + + private: + // No copying allowed + AppendableFile(const AppendableFile&); + void operator=(const AppendableFile&); +}; + // An interface for writing log messages. class Logger { public: Logger() { } virtual ~Logger(); + // Riak specific function for hot backup. + // hot_backup.cc assumes that it can rotate the LOG file + // via standard Env routines if this function returns a + // non-zero value. + virtual long LogSize() {return(0);}; + // Write an entry to the log file with the specified format. virtual void Logv(const char* format, va_list ap) = 0; @@ -310,11 +345,14 @@ class EnvWrapper : public Env { Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) { return target_->NewRandomAccessFile(f, r); } - Status NewWritableFile(const std::string& f, WritableFile** r) { - return target_->NewWritableFile(f, r); + Status NewWritableFile(const std::string& f, WritableFile** r, size_t s=0) { + return target_->NewWritableFile(f, r, s); } - Status NewAppendableFile(const std::string& f, WritableFile** r) { - return target_->NewAppendableFile(f, r); + Status NewAppendableFile(const std::string& f, WritableFile** r, size_t s=0) { + return target_->NewAppendableFile(f, r, s); + } + Status NewWriteOnlyFile(const std::string& f, WritableFile** r, size_t s=0) { + return target_->NewWriteOnlyFile(f, r, s); } bool FileExists(const std::string& f) { return target_->FileExists(f); } Status GetChildren(const std::string& dir, std::vector* r) { @@ -334,9 +372,9 @@ class EnvWrapper : public Env { } Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); } void Schedule(void (*f)(void*), void* a) { - return 
target_->Schedule(f, a); + return target_->Schedule(f, a); } - void StartThread(void (*f)(void*), void* a) { + pthread_t StartThread(void (*f)(void*), void* a) { return target_->StartThread(f, a); } virtual Status GetTestDirectory(std::string* path) { @@ -355,6 +393,12 @@ class EnvWrapper : public Env { Env* target_; }; +// Riak specific hack to allow runtime change +// of mapping size +extern volatile size_t gMapSize; + +extern bool gFadviseWillNeed; + } // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_ENV_H_ diff --git a/src/leveldb/include/leveldb/expiry.h b/src/leveldb/include/leveldb/expiry.h new file mode 100644 index 000000000..c5be6603a --- /dev/null +++ b/src/leveldb/include/leveldb/expiry.h @@ -0,0 +1,135 @@ +// ------------------------------------------------------------------- +// +// expiry.h: background expiry management for Basho's modified leveldb +// +// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// ------------------------------------------------------------------- + +#ifndef EXPIRY_H +#define EXPIRY_H + +#include +#include +#include "leveldb/env.h" +#include "leveldb/options.h" +#include "util/refobject_base.h" + +namespace leveldb { + +class Compaction; +class Logger; +struct ParsedInternalKey; +class Slice; +class SstCounters; +class Version; +class VersionEdit; +struct FileMetaData; + + +enum EleveldbRouterActions_t +{ + eGetBucketProperties=1 +}; // enum EleveldbRouterActions_t + + +typedef bool (* EleveldbRouter_t)(EleveldbRouterActions_t Action, int ParamCount, const void ** Params); + + +class ExpiryModule : public RefObjectBase +{ +public: + virtual ~ExpiryModule() {}; + + // Print expiry options to LOG file + virtual void Dump(Logger * log) const + {Log(log," Expiry: (none)");}; + + // Quick test to allow manifest logic and such know if + // extra expiry logic should be checked + virtual bool ExpiryActivated() const {return(false);}; + + // db/write_batch.cc MemTableInserter::Put() calls this. + // returns false on internal error + virtual bool MemTableInserterCallback( + const Slice & Key, // input: user's key about to be written + const Slice & Value, // input: user's value object + ValueType & ValType, // input/output: key type. call might change + ExpiryTimeMicros & Expiry) const // input/output: 0 or specific expiry. call might change + {return(true);}; + + // db/dbformat.cc KeyRetirement::operator() calls this. + // db/version_set.cc SaveValue() calls this too. + // returns true if key is expired, returns false if key not expired + virtual bool KeyRetirementCallback( + const ParsedInternalKey & Ikey) const + {return(false);}; + + // table/table_builder.cc TableBuilder::Add() calls this. 
+ // returns false on internal error + virtual bool TableBuilderCallback( + const Slice & Key, // input: internal key + SstCounters & Counters) const // input/output: counters for new sst table + {return(true);}; + + // db/memtable.cc MemTable::Get() calls this. + // returns true if type/expiry is expired, returns false if not expired + virtual bool MemTableCallback( + const Slice & Key) const // input: leveldb internal key + {return(false);}; + + // db/version_set.cc VersionSet::Finalize() calls this if no + // other compaction selected for a level + // returns true if there is an expiry compaction eligible + virtual bool CompactionFinalizeCallback( + bool WantAll, // input: true - examine all expired files + const Version & Ver, // input: database state for examination + int Level, // input: level to review for expiry + VersionEdit * Edit) const // output: NULL or destination of delete list + {return(false);}; + + // yep, sometimes we want to expiry this expiry module object. + // mostly for bucket level properties in Riak EE + virtual uint64_t ExpiryModuleExpiryMicros() {return(0);}; + + // Creates derived ExpiryModule object that matches compile time + // switch for open source or Basho enterprise edition features. + static ExpiryModule * CreateExpiryModule(EleveldbRouter_t Router); + + // Cleans up global objects related to expiry + // switch for open source or Basho enterprise edition features. 
+ static void ShutdownExpiryModule(); + + // Riak EE: stash a user created module with settings + virtual void NoteUserExpirySettings() {}; + +protected: + ExpiryModule() {}; + +private: + ExpiryModule(const ExpiryModule &); + ExpiryModule & operator=(const ExpiryModule &); + +}; // ExpiryModule + + +typedef RefPtr ExpiryPtr_t; + +} // namespace leveldb + +#endif // ifndef + diff --git a/src/leveldb/include/leveldb/filter_policy.h b/src/leveldb/include/leveldb/filter_policy.h index 1fba08001..9369f7224 100644 --- a/src/leveldb/include/leveldb/filter_policy.h +++ b/src/leveldb/include/leveldb/filter_policy.h @@ -23,9 +23,21 @@ namespace leveldb { class Slice; class FilterPolicy { - public: +protected: + mutable const FilterPolicy * m_Next; // used by FilterInventory + +public: + FilterPolicy() + : m_Next(NULL) + {}; + virtual ~FilterPolicy(); + // list pointer accessors + const FilterPolicy * GetNext() const {return(m_Next);}; + void SetNext(const FilterPolicy * Next) const {m_Next=Next;}; + + // Return the name of this policy. Note that if the filter encoding // changes in an incompatible way, the name returned by this method // must be changed. Otherwise, old incompatible filters may be @@ -47,6 +59,7 @@ class FilterPolicy { // This method may return true or false if the key was not on the // list, but it should aim to return false with a high probability. virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; + }; // Return a new filter policy that uses a bloom filter with approximately @@ -64,7 +77,29 @@ class FilterPolicy { // FilterPolicy (like NewBloomFilterPolicy) that does not ignore // trailing spaces in keys. 
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); +extern const FilterPolicy* NewBloomFilterPolicy2(int bits_per_key); -} + +class FilterInventory +{ +public: + // MUST be static variable so that it initializes before any static objects + // have their initializers called + static const FilterPolicy * ListHead; + + // This might be called prior to singleton FilterInventory object + // being initialized. NOT THREAD SAFE. + static void AddFilterToInventory(const FilterPolicy * Filter) + { + if (NULL!=Filter) + { + Filter->SetNext(ListHead); + ListHead=Filter; + } // if + return; + } +}; // class FilterInventory + +} // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_ diff --git a/src/leveldb/include/leveldb/iterator.h b/src/leveldb/include/leveldb/iterator.h index da631ed9d..71d201f62 100644 --- a/src/leveldb/include/leveldb/iterator.h +++ b/src/leveldb/include/leveldb/iterator.h @@ -17,6 +17,7 @@ #include "leveldb/slice.h" #include "leveldb/status.h" +#include "leveldb/options.h" namespace leveldb { @@ -37,7 +38,7 @@ class Iterator { // Valid() after this call iff the source is not empty. virtual void SeekToLast() = 0; - // Position at the first key in the source that is at or past target. + // Position at the first key in the source that at or past target // The iterator is Valid() after this call iff the source contains // an entry that comes at or past target. virtual void Seek(const Slice& target) = 0; @@ -61,9 +62,13 @@ class Iterator { // Return the value for the current entry. The underlying storage for // the returned slice is valid only until the next modification of // the iterator. - // REQUIRES: Valid() + // REQUIRES: !AtEnd() && !AtStart() virtual Slice value() const = 0; + // Riak specific: if a database iterator, returns key meta data + // REQUIRES: Valid() + virtual KeyMetaData & keymetadata() const {return(keymetadata_); }; + // If an error has occurred, return it. Else return an ok status. 
virtual Status status() const = 0; @@ -75,6 +80,10 @@ class Iterator { typedef void (*CleanupFunction)(void* arg1, void* arg2); void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + protected: + // mutable so reusable by derived classes + mutable KeyMetaData keymetadata_; + private: struct Cleanup { CleanupFunction function; diff --git a/src/leveldb/include/leveldb/options.h b/src/leveldb/include/leveldb/options.h index 976e38122..00efa3333 100644 --- a/src/leveldb/include/leveldb/options.h +++ b/src/leveldb/include/leveldb/options.h @@ -6,15 +6,23 @@ #define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ #include +#include +#include +#include namespace leveldb { class Cache; class Comparator; class Env; +class ExpiryModule; class FilterPolicy; class Logger; class Snapshot; +namespace log +{ + class Writer; +} // namespace log // DB contents are stored in a set of blocks, each of which holds a // sequence of key,value pairs. Each block may be compressed before @@ -24,9 +32,34 @@ enum CompressionType { // NOTE: do not change the values of existing entries, as these are // part of the persistent format on disk. kNoCompression = 0x0, - kSnappyCompression = 0x1 + kSnappyCompression = 0x1, + kLZ4Compression = 0x2, + kNoCompressionAutomated = 0x3 }; +// Originally located in db/dbformat.h. Now available publically. +// Value types encoded as the last component of internal keys. +// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk +// data structures. 
+enum ValueType { + kTypeDeletion = 0x0, + kTypeValue = 0x1, + kTypeValueWriteTime = 0x2, + kTypeValueExplicitExpiry = 0x3 +}; + +// Originally located in db/dbformat.h +typedef uint64_t SequenceNumber; +typedef uint64_t ExpiryTimeMicros; + +}; // namespace leveldb + +// +// must follow ValueType declaration +#include "leveldb/expiry.h" + +namespace leveldb { + // Options to control the behavior of a database (passed to DB::Open) struct Options { // ------------------- @@ -56,6 +89,14 @@ struct Options { // Default: false bool paranoid_checks; + // Riak specific: this variable replaces paranoid_checks at one + // one place in the code. This variable alone controls whether or not + // compaction read operations check CRC values. Riak needs + // the compaction CRC check, but not other paranoid_checks ... so + // this independent control. + // Default: true + bool verify_compactions; + // Use the specified object to interact with the environment, // e.g. to read/write files, schedule background work, etc. // Default: Env::Default() @@ -85,7 +126,7 @@ struct Options { // Number of open files that can be used by the DB. You may need to // increase this if your database has a large working set (budget // one open file per 2MB of working set). - // + // RIAK: NO LONGER USED // Default: 1000 int max_open_files; @@ -105,6 +146,15 @@ struct Options { // Default: 4K size_t block_size; + // Riak specific: non-zero value activates code to automatically + // increase block_size as needed to ensure maximum number of files + // are available in the file cache. The value indicates how many + // incremental increases to use between the original block_size + // and largest, reasonable block_size. + // + // Default: 16 + int block_size_steps; + // Number of keys between restart points for delta encoding of keys. // This parameter can be changed dynamically. Most clients should // leave this parameter alone. 
@@ -112,18 +162,6 @@ struct Options { // Default: 16 int block_restart_interval; - // Leveldb will write up to this amount of bytes to a file before - // switching to a new one. - // Most clients should leave this parameter alone. However if your - // filesystem is more efficient with larger files, you could - // consider increasing the value. The downside will be longer - // compactions and hence longer latency/performance hiccups. - // Another reason to increase this parameter might be when you are - // initially populating a large database. - // - // Default: 2MB - size_t max_file_size; - // Compress blocks using the specified compression algorithm. This // parameter can be changed dynamically. // @@ -140,12 +178,6 @@ struct Options { // efficiently detect that and will switch to uncompressed mode. CompressionType compression; - // EXPERIMENTAL: If true, append to existing MANIFEST and log files - // when a database is opened. This can significantly speed up open. - // - // Default: currently false, but may become true later. - bool reuse_logs; - // If non-NULL, use the specified filter policy to reduce disk reads. // Many applications will benefit from passing the result of // NewBloomFilterPolicy() here. @@ -153,8 +185,84 @@ struct Options { // Default: NULL const FilterPolicy* filter_policy; + // Riak specific flag used to indicate when database is open + // as part of a Repair operation. Default is false + bool is_repair; + + // Riak specific flag to mark Riak internal database versus + // user database. (User database gets larger cache resources.) + bool is_internal_db; + + // Riak replacement for max_open_files and block_cache. This is + // TOTAL memory to be used by leveldb across ALL DATABASES. + // Most recent value seen upon database open, wins. Zero for default. + uint64_t total_leveldb_mem; + + // Riak specific option specifying block cache space that cannot + // be released for page cache use. The space may still be + // released for file cache. 
+ uint64_t block_cache_threshold; + + // Riak option to override most memory modeling and create + // smaller memory footprint for developers. Helps when + // running large number of databases and multiple VMs. Do + // NOT use this option if making performance measurements. + // Default: false + bool limited_developer_mem; + + // The size of each MMAped file, choose 0 for the default (20M) + uint64_t mmap_size; + + // Riak option to adjust aggressive delete behavior. + // - zero disables aggressive delete + // - positive value indicates how many deletes must exist + // in a file for it to be compacted due to deletes + uint64_t delete_threshold; + + // Riak specific flag used to indicate when fadvise() management + // should default to WILLNEED instead of DONTNEED. Default is false + bool fadvise_willneed; + + // ***** + // Riak specific options for establishing two tiers of disk arrays. + // All three tier options must be valid for the option to activate. + // When active, leveldb directories are constructed using either + // the fast or slow prefix followed by the database name given + // in the DB::Open call. (a synonym for "prefix" is "mount") + // ***** + + // Riak specific option setting the level number at which the + // "tiered_slow_prefix" should be used. Default is zero which + // disables the option. Valid values are 1 to 6. 3 or 4 recommended. + unsigned tiered_slow_level; + + // Riak specific option with the path prefix used for "fast" disk + // array. levels 0 to tiered_slow_level-1 use this path prefix + std::string tiered_fast_prefix; + + // Riak specific option with the path prefix used for "slow" disk + // array. levels tiered_slow_level through 6 use this path prefix + std::string tiered_slow_prefix; + + // Riak specific option that writes a list of open table files + // to disk on close then automatically opens same files again + // upon restart. 
+ bool cache_object_warming; + + // Riak specific object that defines expiry policy for data + // written to leveldb. + ExpiryPtr_t expiry_module; + // Create an Options object with default values for all fields. Options(); + + void Dump(Logger * log) const; + + bool ExpiryActivated() const + {return(NULL!=expiry_module.get() && expiry_module->ExpiryActivated());}; + +private: + }; // Options that control read operations @@ -171,16 +279,57 @@ struct ReadOptions { // If "snapshot" is non-NULL, read as of the supplied snapshot // (which must belong to the DB that is being read and which must - // not have been released). If "snapshot" is NULL, use an implicit + // not have been released). If "snapshot" is NULL, use an impliicit // snapshot of the state at the beginning of this read operation. // Default: NULL const Snapshot* snapshot; + // Riak specific flag, currently used within Erlang adaptor + // to enable automatic delete and new of fresh snapshot + // and database iterator objects for long running iterations + // (only supports iterator NEXT operations). + // Default: false + bool iterator_refresh; + ReadOptions() - : verify_checksums(false), - fill_cache(true), - snapshot(NULL) { + : verify_checksums(true), + fill_cache(true), + snapshot(NULL), + iterator_refresh(false), + is_compaction(false), + env(NULL), + info_log(NULL) + { } + + + // accessors to the private data + bool IsCompaction() const {return(is_compaction);}; + + Logger * GetInfoLog() const {return(info_log);}; + + const std::string & GetDBName() const {return(dbname);}; + + Env * GetEnv() const {return(env);}; + + // The items below are internal options, not for external manipulation. 
+ // They are populated by VersionSet::MakeInputIterator only during compaction operations +private: + friend class VersionSet; + + // true when used on background compaction + bool is_compaction; + + // Database name for potential creation of bad blocks file + std::string dbname; + + // Needed for file operations if creating bad blocks file + Env * env; + + // Open log file for error notifications + // Only valid when is_compation==true + Logger* info_log; + }; // Options that control write operations @@ -208,6 +357,22 @@ struct WriteOptions { } }; + +// Riak specific object that can return key metadata +// during get or iterate operation +struct KeyMetaData +{ + ValueType m_Type; // see above + SequenceNumber m_Sequence; // output only, leveldb internal + ExpiryTimeMicros m_Expiry; // microseconds since Epoch, UTC + + KeyMetaData() + : m_Type(kTypeValue), m_Sequence(0), m_Expiry(0) + {}; +}; // struct KeyMetaData + +const char * CompileOptionsString(); + } // namespace leveldb #endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ diff --git a/src/leveldb/include/leveldb/perf_count.h b/src/leveldb/include/leveldb/perf_count.h new file mode 100644 index 000000000..2f957f4fe --- /dev/null +++ b/src/leveldb/include/leveldb/perf_count.h @@ -0,0 +1,329 @@ +// ------------------------------------------------------------------- +// +// perf_count.h: performance counters LevelDB +// +// Copyright (c) 2012-2016 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_ +#define STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_ + +#include +#include +#include "leveldb/status.h" + +namespace leveldb { + +enum SstCountEnum +{ + // + // array index values/names + // + eSstCountKeys=0, //!< how many keys in this sst + eSstCountBlocks=1, //!< how many blocks in this sst + eSstCountCompressAborted=2,//!< how many blocks attempted compression and aborted use + eSstCountKeySize=3, //!< byte count of all keys + eSstCountValueSize=4, //!< byte count of all values + eSstCountBlockSize=5, //!< byte count of all blocks (pre-compression) + eSstCountBlockWriteSize=6, //!< post-compression size, or BlockSize if no compression + eSstCountIndexKeys=7, //!< how many keys in the index block + eSstCountKeyLargest=8, //!< largest key in sst + eSstCountKeySmallest=9, //!< smallest key in sst + eSstCountValueLargest=10, //!< largest value in sst + eSstCountValueSmallest=11, //!< smallest value in sst + eSstCountDeleteKey=12, //!< tombstone count + eSstCountBlockSizeUsed=13, //!< Options::block_size used with this file + eSstCountUserDataSize=14, //!< post-compression size of non-metadata (user keys/values/block overhead) + eSstCountExpiry1=15, //!< undocumented expiry counter 1 + eSstCountExpiry2=16, //!< undocumented expiry counter 2 + eSstCountExpiry3=17, //!< undocumented expiry counter 3 + eSstCountSequence=18, //!< highest sequence number in file + + // must follow last index name to represent size of array + eSstCountEnumSize, //!< size of the array described by the enum values + + eSstCountVersion=1 + +}; // enum SstCountEnum + + +class SstCounters +{ +protected: + bool m_IsReadOnly; //!< set when data decoded from a file + uint32_t m_Version; //!< object revision identification + uint32_t m_CounterSize; //!< number of objects in 
m_Counter + + uint64_t m_Counter[eSstCountEnumSize]; + +public: + // constructors / destructor + SstCounters(); + + // Put data into disk form + void EncodeTo(std::string & Dst) const; + + // Populate member data from prior EncodeTo block + Status DecodeFrom(const Slice& src); + + // increment the counter + uint64_t Inc(unsigned Index); + + // add value to the counter + uint64_t Add(unsigned Index, uint64_t Amount); + + // return value of a counter + uint64_t Value(unsigned Index) const; + + // set a value + void Set(unsigned Index, uint64_t); + + // return number of counters + uint32_t Size() const {return(m_CounterSize);}; + + // printf all values + void Dump() const; + +}; // class SstCounters + + +extern struct PerformanceCounters * gPerfCounters; + + +enum PerformanceCountersEnum +{ + // + // array index values/names + // (enum explicitly numbered to allow future edits / moves / inserts) + // + ePerfROFileOpen=0, //!< PosixMmapReadableFile open + ePerfROFileClose=1, //!< closed + ePerfROFileUnmap=2, //!< unmap without close + + ePerfRWFileOpen=3, //!< PosixMmapFile open + ePerfRWFileClose=4, //!< closed + ePerfRWFileUnmap=5, //!< unmap without close + + ePerfApiOpen=6, //!< Count of DB::Open completions + ePerfApiGet=7, //!< Count of DBImpl::Get completions + ePerfApiWrite=8, //!< Count of DBImpl::Get completions + + ePerfWriteSleep=9, //!< DBImpl::MakeRoomForWrite called sleep + ePerfWriteWaitImm=10, //!< DBImpl::MakeRoomForWrite called Wait on Imm compact + ePerfWriteWaitLevel0=11,//!< DBImpl::MakeRoomForWrite called Wait on Level0 compact + ePerfWriteNewMem=12, //!< DBImpl::MakeRoomForWrite created new memory log + ePerfWriteError=13, //!< DBImpl::MakeRoomForWrite saw bg_error_ + ePerfWriteNoWait=14, //!< DBImpl::MakeRoomForWrite took no action + + ePerfGetMem=15, //!< DBImpl::Get read from memory log + ePerfGetImm=16, //!< DBImpl::Get read from previous memory log + ePerfGetVersion=17, //!< DBImpl::Get read from Version object + + // code ASSUMES the 
levels are in numerical order, + // i.e. based off of ePerfSearchLevel0 + ePerfSearchLevel0=18, //!< Version::Get read searched one or more files here + ePerfSearchLevel1=19, //!< Version::Get read searched one or more files here + ePerfSearchLevel2=20, //!< Version::Get read searched one or more files here + ePerfSearchLevel3=21, //!< Version::Get read searched one or more files here + ePerfSearchLevel4=22, //!< Version::Get read searched one or more files here + ePerfSearchLevel5=23, //!< Version::Get read searched one or more files here + ePerfSearchLevel6=24, //!< Version::Get read searched one or more files here + + ePerfTableCached=25, //!< TableCache::FindTable found table in cache + ePerfTableOpened=26, //!< TableCache::FindTable had to open table file + ePerfTableGet=27, //!< TableCache::Get used to retrieve a key + + ePerfBGCloseUnmap=28, //!< PosixEnv::BGThreaed started Unmap/Close job + ePerfBGCompactImm=29, //!< PosixEnv::BGThreaed started compaction of Imm + ePerfBGNormal=30, //!< PosixEnv::BGThreaed started normal compaction job + ePerfBGCompactLevel0=31,//!< PosixEnv::BGThreaed started compaction of Level0 + + ePerfBlockFiltered=32, //!< Table::BlockReader search stopped due to filter + ePerfBlockFilterFalse=33,//!< Table::BlockReader gave a false positive for match + ePerfBlockCached=34, //!< Table::BlockReader found block in cache + ePerfBlockRead=35, //!< Table::BlockReader read block from disk + ePerfBlockFilterRead=36,//!< Table::ReadMeta filter loaded from file + ePerfBlockValidGet=37, //!< Table::InternalGet has valid iterator + + ePerfDebug0=38, //!< Developer debug counters, moveable + ePerfDebug1=39, //!< Developer debug counters, moveable + ePerfDebug2=40, //!< Developer debug counters, moveable + ePerfDebug3=41, //!< Developer debug counters, moveable + ePerfDebug4=42, //!< Developer debug counters, moveable + + ePerfReadBlockError=43, //!< crc or compression error in ReadBlock (format.cc) + + ePerfIterNew=44, //!< Count of 
DBImpl::NewDBIterator calls + ePerfIterNext=45, //!< Count of DBIter::Next calls + ePerfIterPrev=46, //!< Count of DBIter::Prev calls + ePerfIterSeek=47, //!< Count of DBIter::Seek calls + ePerfIterSeekFirst=48, //!< Count of DBIter::SeekFirst calls + ePerfIterSeekLast=49, //!< Count of DBIter::SeekLast calls + ePerfIterDelete=50, //!< Count of DBIter::~DBIter + + ePerfElevelDirect=51, //!< eleveldb's FindWaitingThread went direct to thread + ePerfElevelQueued=52, //!< eleveldb's FindWaitingThread queued work item + ePerfElevelDequeued=53, //!< eleveldb's worker took item from backlog queue + + ePerfElevelRefCreate=54,//!< eleveldb RefObject constructed + ePerfElevelRefDelete=55,//!< eleveldb RefObject destructed + + ePerfThrottleGauge=56, //!< current throttle value + ePerfThrottleCounter=57,//!< running throttle by seconds + + ePerfThrottleMicros0=58,//!< level 0 micros spent compacting + ePerfThrottleKeys0=59, //!< level 0 keys processed + ePerfThrottleBacklog0=60,//!< backlog at time of posting (level0) + ePerfThrottleCompacts0=61,//!< number of level 0 compactions + + ePerfThrottleMicros1=62,//!< level 1+ micros spent compacting + ePerfThrottleKeys1=63, //!< level 1+ keys processed + ePerfThrottleBacklog1=64,//!< backlog at time of posting (level1+) + ePerfThrottleCompacts1=65,//!< number of level 1+ compactions + + ePerfBGWriteError=66, //!< error in write/close, see syslog + + ePerfThrottleWait=67, //!< milliseconds of throttle wait + ePerfThreadError=68, //!< system error on thread related call, no LOG access + + ePerfBGImmDirect=69, //!< count Imm compactions happened directly + ePerfBGImmQueued=70, //!< count Imm compactions placed on queue + ePerfBGImmDequeued=71, //!< count Imm compactions removed from queue + ePerfBGImmWeighted=72, //!< total microseconds item spent on queue + + ePerfBGUnmapDirect=73, //!< count Unmap operations happened directly + ePerfBGUnmapQueued=74, //!< count Unmap operations placed on queue + ePerfBGUnmapDequeued=75,//!< count 
Unmap operations removed from queue + ePerfBGUnmapWeighted=76,//!< total microseconds item spent on queue + + ePerfBGLevel0Direct=77, //!< count Level0 compactions happened directly + ePerfBGLevel0Queued=78, //!< count Level0 compactions placed on queue + ePerfBGLevel0Dequeued=79,//!< count Level0 compactions removed from queue + ePerfBGLevel0Weighted=80,//!< total microseconds item spent on queue + + ePerfBGCompactDirect=81, //!< count generic compactions happened directly + ePerfBGCompactQueued=82, //!< count generic compactions placed on queue + ePerfBGCompactDequeued=83,//!< count generic compactions removed from queue + ePerfBGCompactWeighted=84,//!< total microseconds item spent on queue + + ePerfFileCacheInsert=85, //!< total bytes inserted into file cache + ePerfFileCacheRemove=86, //!< total bytes removed from file cache + + ePerfBlockCacheInsert=87, //!< total bytes inserted into block cache + ePerfBlockCacheRemove=88, //!< total bytes removed from block cache + + ePerfApiDelete=89, //!< Count of DB::Delete + + ePerfBGMove=90, //!< compaction was a successful move + ePerfBGMoveFail=91, //!< compaction move failed, regular compaction attempted + + ePerfThrottleUnadjusted=92,//!< current unadjusted throttle gauge + + // this one was added to the other ePerfElevelXxx counters above when we backported HotThreadPool to eleveldb + ePerfElevelWeighted=93, //!< total microseconds item spent on queue + + ePerfExpiredKeys=94, //!< key physically removed because it expired + ePerfExpiredFiles=95, //!< entire file removed because all keys expired + + ePerfSyslogWrite=96, //!< logged message to syslog + ePerfBackupStarted=97, //!< hot backup initiated + ePerfBackupError=98, //!< hot backup had an error + + ePerfPropCacheHit=99, //!< property cache had data + ePerfPropCacheMiss=100, //!< property cache had to look up data + ePerfPropCacheError=101, //!< no property cache entry built/located + + // must follow last index name to represent size of array + // (ASSUMES 
previous enum is highest value) + ePerfCountEnumSize, //!< size of the array described by the enum values + + ePerfVersion=1, //!< structure versioning + ePerfKey=41207 //!< random number as shared memory identifier +}; + + +struct PerfCounterAttributes +{ + const char * m_PerfCounterName; //!< text description + const bool m_PerfDiscretionary; //!< true if ok to disable +}; // PerfCounterAttributes + + +// +// Do NOT use virtual functions. This structure will be aligned at different +// locations in multiple processes. Things can get messy with virtuals. + +struct PerformanceCounters +{ +public: + static int m_LastError; + +protected: + uint32_t m_Version; //!< object revision identification + uint32_t m_CounterSize; //!< number of objects in m_Counter + + volatile uint64_t m_Counter[ePerfCountEnumSize]; + + static const PerfCounterAttributes m_PerfCounterAttr[]; + static int m_PerfSharedId; + static volatile uint64_t m_BogusCounter; //!< for out of range GetPtr calls + +public: + // only called for local object, not for shared memory + PerformanceCounters(); + + //!< does executable's idea of version match shared object? 
+ bool VersionTest() + {return(ePerfCountEnumSize<=m_CounterSize && ePerfVersion==m_Version);}; + + //!< mostly for perf_count_test.cc + void SetVersion(uint32_t Version, uint32_t CounterSize) + {m_Version=Version; m_CounterSize=CounterSize;}; + + static PerformanceCounters * Init(bool IsReadOnly); + static int Close(PerformanceCounters * Counts); + + uint64_t Inc(unsigned Index); + uint64_t Dec(unsigned Index); + + // add value to the counter + uint64_t Add(unsigned Index, uint64_t Amount); + + // return value of a counter + uint64_t Value(unsigned Index) const; + + // set a value + void Set(unsigned Index, uint64_t); + + volatile const uint64_t * GetPtr(unsigned Index) const; + + static const char * GetNamePtr(unsigned Index); + + int LookupCounter(const char * Name); + + void Dump(); + +}; // struct PerformanceCounters + +extern PerformanceCounters * gPerfCounters; + +extern volatile bool gPerfCountersDisabled; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_ diff --git a/src/leveldb/include/leveldb/slice.h b/src/leveldb/include/leveldb/slice.h index bc367986f..74ea8fa49 100644 --- a/src/leveldb/include/leveldb/slice.h +++ b/src/leveldb/include/leveldb/slice.h @@ -94,7 +94,7 @@ inline bool operator!=(const Slice& x, const Slice& y) { } inline int Slice::compare(const Slice& b) const { - const size_t min_len = (size_ < b.size_) ? size_ : b.size_; + const int min_len = (size_ < b.size_) ? size_ : b.size_; int r = memcmp(data_, b.data_, min_len); if (r == 0) { if (size_ < b.size_) r = -1; diff --git a/src/leveldb/include/leveldb/status.h b/src/leveldb/include/leveldb/status.h index d9575f975..11dbd4b47 100644 --- a/src/leveldb/include/leveldb/status.h +++ b/src/leveldb/include/leveldb/status.h @@ -60,12 +60,6 @@ class Status { // Returns true iff the status indicates an IOError. bool IsIOError() const { return code() == kIOError; } - // Returns true iff the status indicates a NotSupportedError. 
- bool IsNotSupportedError() const { return code() == kNotSupported; } - - // Returns true iff the status indicates an InvalidArgument. - bool IsInvalidArgument() const { return code() == kInvalidArgument; } - // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. std::string ToString() const; diff --git a/src/leveldb/include/leveldb/table.h b/src/leveldb/include/leveldb/table.h index a9746c3f5..96e8e81d9 100644 --- a/src/leveldb/include/leveldb/table.h +++ b/src/leveldb/include/leveldb/table.h @@ -7,6 +7,7 @@ #include #include "leveldb/iterator.h" +#include "leveldb/perf_count.h" namespace leveldb { @@ -40,7 +41,7 @@ class Table { uint64_t file_size, Table** table); - ~Table(); + virtual ~Table(); // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must @@ -55,7 +56,29 @@ class Table { // be close to the file length. uint64_t ApproximateOffsetOf(const Slice& key) const; - private: + // return a static copy of the table's counters. 
+ SstCounters GetSstCounters() const; + + // riak routine to retrieve total memory footprint of an open table + // object in memory + size_t TableObjectSize(); + + // riak routine to retrieve disk size of table file + // ("virtual" is for unit test activites) + virtual uint64_t GetFileSize(); + + // Riak routine to request bloom filter load on + // second read operation (not iterator read) + bool ReadFilter(); + + // access routines for testing tools, not for public use + Block * TEST_GetIndexBlock(); + size_t TEST_TableObjectSize() {return(TableObjectSize());}; + size_t TEST_FilterDataSize(); + static Iterator* TEST_BlockReader(void* Ptr, const ReadOptions& ROptions, const Slice& SliceReturn) + {return(BlockReader(Ptr, ROptions, SliceReturn));}; + + protected: // was private, made protected for unit tests struct Rep; Rep* rep_; @@ -69,11 +92,12 @@ class Table { Status InternalGet( const ReadOptions&, const Slice& key, void* arg, - void (*handle_result)(void* arg, const Slice& k, const Slice& v)); + bool (*handle_result)(void* arg, const Slice& k, const Slice& v)); void ReadMeta(const Footer& footer); - void ReadFilter(const Slice& filter_handle_value); + void ReadFilter(class BlockHandle & filter_handle_value, const class FilterPolicy * policy); + void ReadSstCounters(const Slice& sst_counters_handle_value); // No copying allowed Table(const Table&); diff --git a/src/leveldb/include/leveldb/table_builder.h b/src/leveldb/include/leveldb/table_builder.h index 5fd1dc71f..cbe741f59 100644 --- a/src/leveldb/include/leveldb/table_builder.h +++ b/src/leveldb/include/leveldb/table_builder.h @@ -74,6 +74,14 @@ class TableBuilder { // Finish() call, returns the size of the final generated file. uint64_t FileSize() const; + // Number of delete tombstones so far. 
+ uint64_t NumDeletes() const; + + // Retrieve expiry control values + uint64_t GetExpiryWriteLow() const; + uint64_t GetExpiryWriteHigh() const; + uint64_t GetExpiryExplicitHigh() const; + private: bool ok() const { return status().ok(); } void WriteBlock(BlockBuilder* block, BlockHandle* handle); diff --git a/src/leveldb/include/leveldb/write_batch.h b/src/leveldb/include/leveldb/write_batch.h index ee9aab68e..bd887fd62 100644 --- a/src/leveldb/include/leveldb/write_batch.h +++ b/src/leveldb/include/leveldb/write_batch.h @@ -23,6 +23,7 @@ #include #include "leveldb/status.h" +#include "leveldb/options.h" namespace leveldb { @@ -34,7 +35,7 @@ class WriteBatch { ~WriteBatch(); // Store the mapping "key->value" in the database. - void Put(const Slice& key, const Slice& value); + void Put(const Slice& key, const Slice& value, const KeyMetaData * meta=NULL); // If the database contains a mapping for "key", erase it. Else do nothing. void Delete(const Slice& key); @@ -46,7 +47,8 @@ class WriteBatch { class Handler { public: virtual ~Handler(); - virtual void Put(const Slice& key, const Slice& value) = 0; + virtual void Put(const Slice& key, const Slice& value, + const ValueType & type, const ExpiryTimeMicros & expiry) = 0; virtual void Delete(const Slice& key) = 0; }; Status Iterate(Handler* handler) const; diff --git a/src/leveldb/issues/issue178_test.cc b/src/leveldb/issues/issue178_test.cc deleted file mode 100644 index 1b1cf8bb2..000000000 --- a/src/leveldb/issues/issue178_test.cc +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2013 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// Test for issue 178: a manual compaction causes deleted data to reappear. 
-#include -#include -#include - -#include "leveldb/db.h" -#include "leveldb/write_batch.h" -#include "util/testharness.h" - -namespace { - -const int kNumKeys = 1100000; - -std::string Key1(int i) { - char buf[100]; - snprintf(buf, sizeof(buf), "my_key_%d", i); - return buf; -} - -std::string Key2(int i) { - return Key1(i) + "_xxx"; -} - -class Issue178 { }; - -TEST(Issue178, Test) { - // Get rid of any state from an old run. - std::string dbpath = leveldb::test::TmpDir() + "/leveldb_cbug_test"; - DestroyDB(dbpath, leveldb::Options()); - - // Open database. Disable compression since it affects the creation - // of layers and the code below is trying to test against a very - // specific scenario. - leveldb::DB* db; - leveldb::Options db_options; - db_options.create_if_missing = true; - db_options.compression = leveldb::kNoCompression; - ASSERT_OK(leveldb::DB::Open(db_options, dbpath, &db)); - - // create first key range - leveldb::WriteBatch batch; - for (size_t i = 0; i < kNumKeys; i++) { - batch.Put(Key1(i), "value for range 1 key"); - } - ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch)); - - // create second key range - batch.Clear(); - for (size_t i = 0; i < kNumKeys; i++) { - batch.Put(Key2(i), "value for range 2 key"); - } - ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch)); - - // delete second key range - batch.Clear(); - for (size_t i = 0; i < kNumKeys; i++) { - batch.Delete(Key2(i)); - } - ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch)); - - // compact database - std::string start_key = Key1(0); - std::string end_key = Key1(kNumKeys - 1); - leveldb::Slice least(start_key.data(), start_key.size()); - leveldb::Slice greatest(end_key.data(), end_key.size()); - - // commenting out the line below causes the example to work correctly - db->CompactRange(&least, &greatest); - - // count the keys - leveldb::Iterator* iter = db->NewIterator(leveldb::ReadOptions()); - size_t num_keys = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - 
num_keys++; - } - delete iter; - ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys"; - - // close database - delete db; - DestroyDB(dbpath, leveldb::Options()); -} - -} // anonymous namespace - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/issues/issue200_test.cc b/src/leveldb/issues/issue200_test.cc deleted file mode 100644 index 1cec79f44..000000000 --- a/src/leveldb/issues/issue200_test.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2013 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// Test for issue 200: when iterator switches direction from backward -// to forward, the current key can be yielded unexpectedly if a new -// mutation has been added just before the current key. - -#include "leveldb/db.h" -#include "util/testharness.h" - -namespace leveldb { - -class Issue200 { }; - -TEST(Issue200, Test) { - // Get rid of any state from an old run. - std::string dbpath = test::TmpDir() + "/leveldb_issue200_test"; - DestroyDB(dbpath, Options()); - - DB *db; - Options options; - options.create_if_missing = true; - ASSERT_OK(DB::Open(options, dbpath, &db)); - - WriteOptions write_options; - ASSERT_OK(db->Put(write_options, "1", "b")); - ASSERT_OK(db->Put(write_options, "2", "c")); - ASSERT_OK(db->Put(write_options, "3", "d")); - ASSERT_OK(db->Put(write_options, "4", "e")); - ASSERT_OK(db->Put(write_options, "5", "f")); - - ReadOptions read_options; - Iterator *iter = db->NewIterator(read_options); - - // Add an element that should not be reflected in the iterator. 
- ASSERT_OK(db->Put(write_options, "25", "cd")); - - iter->Seek("5"); - ASSERT_EQ(iter->key().ToString(), "5"); - iter->Prev(); - ASSERT_EQ(iter->key().ToString(), "4"); - iter->Prev(); - ASSERT_EQ(iter->key().ToString(), "3"); - iter->Next(); - ASSERT_EQ(iter->key().ToString(), "4"); - iter->Next(); - ASSERT_EQ(iter->key().ToString(), "5"); - - delete iter; - delete db; - DestroyDB(dbpath, options); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/leveldb_os/compile_opt.cc b/src/leveldb/leveldb_os/compile_opt.cc new file mode 100644 index 000000000..b311bcd43 --- /dev/null +++ b/src/leveldb/leveldb_os/compile_opt.cc @@ -0,0 +1,32 @@ +// ------------------------------------------------------------------- +// +// compile_opt.h +// +// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// ------------------------------------------------------------------- + +#include "leveldb/options.h" + +namespace leveldb +{ + const char * CompileOptionsString() + { + return("(open source)"); + } +} // namespace leveldb + diff --git a/src/leveldb/leveldb_os/expiry_os_stub.cc b/src/leveldb/leveldb_os/expiry_os_stub.cc new file mode 100644 index 000000000..a8463e233 --- /dev/null +++ b/src/leveldb/leveldb_os/expiry_os_stub.cc @@ -0,0 +1,62 @@ +// ------------------------------------------------------------------- +// +// expiry_os_stub.cc +// +// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// ------------------------------------------------------------------- + +#include "db/dbformat.h" +#include "leveldb/expiry.h" +#include "util/expiry_os.h" + +namespace leveldb { + +/** + * This is the factory function to create + * an open source version of object expiry + */ +ExpiryModule * +ExpiryModule::CreateExpiryModule( + EleveldbRouter_t Router) +{ + + return(new leveldb::ExpiryModuleOS); + +} // ExpiryModule::CreateExpiryModule() + + +void +ExpiryModule::ShutdownExpiryModule() +{ + + return; + +} // ExpiryModule::ShutdownExpiryModule + + +uint64_t +CuttlefishDurationMinutes( + const char * Buffer) +{ + + // zero is safe return since it implies "disable write time expiry" + return(0); + +} // CuttlefishDurationMinutes + +} // namespace leveldb diff --git a/src/leveldb/leveldb_os/hot_backup_stub.cc b/src/leveldb/leveldb_os/hot_backup_stub.cc new file mode 100644 index 000000000..73190975f --- /dev/null +++ b/src/leveldb/leveldb_os/hot_backup_stub.cc @@ -0,0 +1,37 @@ +// ------------------------------------------------------------------- +// +// hot_backup_stub.cc +// +// Copyright (c) 2011-2016 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include "util/thread_tasks.h" + +namespace leveldb { + +/** + * Called by throttle.cc's thread once a minute. 
Used to + * test for trigger condition + */ +void +CheckHotBackupTrigger() +{ + return; +} // CheckHotBackupTrigger + +} // namespace leveldb diff --git a/src/leveldb/leveldb_os/prop_cache_stub.cc b/src/leveldb/leveldb_os/prop_cache_stub.cc new file mode 100644 index 000000000..47778977d --- /dev/null +++ b/src/leveldb/leveldb_os/prop_cache_stub.cc @@ -0,0 +1,41 @@ +// ------------------------------------------------------------------- +// +// hot_backup_stub.cc +// +// Copyright (c) 2011-2016 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include "util/prop_cache.h" + +namespace leveldb { + +/** + * Internal Lookup function that first requests property + * data from Eleveldb Router, then waits for the data + * to post to the cache. + */ +Cache::Handle * +PropertyCache::LookupWait( + const Slice & CompositeBucket) +{ + + return(NULL); + +} // PropertyCache::LookupWait + +} // namespace leveldb diff --git a/src/leveldb/leveldb_os/warming_stub.cc b/src/leveldb/leveldb_os/warming_stub.cc new file mode 100644 index 000000000..6db93dfc4 --- /dev/null +++ b/src/leveldb/leveldb_os/warming_stub.cc @@ -0,0 +1,48 @@ +// ------------------------------------------------------------------- +// +// cache_warm.cc +// +// Copyright (c) 2011-2016 Basho Technologies, Inc. All Rights Reserved. 
+// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include "db/table_cache.h" + +namespace leveldb { + + +/** + * Riak specific routine to push list of open files to disk + */ +Status +TableCache::SaveOpenFileList() +{ + return(Status::OK()); +} // TableCache::SaveOpenFiles + + +/** + * Riak specific routine to read list of previously open files + * and preload them into the table cache + */ +Status +TableCache::PreloadTableCache() +{ + return(Status::OK()); +} // TableCache::PreloadTableCache + +} // namespace leveldb diff --git a/src/leveldb/port/atomic_pointer.h b/src/leveldb/port/atomic_pointer.h index d79a02230..2b485c7f7 100644 --- a/src/leveldb/port/atomic_pointer.h +++ b/src/leveldb/port/atomic_pointer.h @@ -5,13 +5,14 @@ // AtomicPointer provides storage for a lock-free pointer. // Platform-dependent implementation of AtomicPointer: // - If the platform provides a cheap barrier, we use it with raw pointers -// - If is present (on newer versions of gcc, it is), we use -// a -based AtomicPointer. However we prefer the memory +// - If cstdatomic is present (on newer versions of gcc, it is), we use +// a cstdatomic-based AtomicPointer. However we prefer the memory // barrier based version, because at least on a gcc 4.4 32-bit build -// on linux, we have encountered a buggy implementation. 
-// Also, some implementations are much slower than a memory-barrier -// based implementation (~16ns for based acquire-load vs. ~1ns for -// a barrier based acquire-load). +// on linux, we have encountered a buggy +// implementation. Also, some implementations are much +// slower than a memory-barrier based implementation (~16ns for +// based acquire-load vs. ~1ns for a barrier based +// acquire-load). // This code is based on atomicops-internals-* in Google's perftools: // http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase @@ -19,9 +20,9 @@ #define PORT_ATOMIC_POINTER_H_ #include -#ifdef LEVELDB_ATOMIC_PRESENT -#include -#endif +//#ifdef LEVELDB_CSTDATOMIC_PRESENT +//#include ... moved below +//#endif #ifdef OS_WIN #include #endif @@ -35,41 +36,11 @@ #define ARCH_CPU_X86_FAMILY 1 #elif defined(__ARMEL__) #define ARCH_CPU_ARM_FAMILY 1 -#elif defined(__aarch64__) -#define ARCH_CPU_ARM64_FAMILY 1 -#elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__) -#define ARCH_CPU_PPC_FAMILY 1 -#elif defined(__mips__) -#define ARCH_CPU_MIPS_FAMILY 1 #endif namespace leveldb { namespace port { -// AtomicPointer based on if available -#if defined(LEVELDB_ATOMIC_PRESENT) -class AtomicPointer { - private: - std::atomic rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - return rep_.load(std::memory_order_acquire); - } - inline void Release_Store(void* v) { - rep_.store(v, std::memory_order_release); - } - inline void* NoBarrier_Load() const { - return rep_.load(std::memory_order_relaxed); - } - inline void NoBarrier_Store(void* v) { - rep_.store(v, std::memory_order_relaxed); - } -}; - -#else - // Define MemoryBarrier() if available // Windows on x86 #if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY) @@ -77,13 +48,6 @@ class AtomicPointer { // http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx #define 
LEVELDB_HAVE_MEMORY_BARRIER -// Mac OS -#elif defined(OS_MACOSX) -inline void MemoryBarrier() { - OSMemoryBarrier(); -} -#define LEVELDB_HAVE_MEMORY_BARRIER - // Gcc on x86 #elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__) inline void MemoryBarrier() { @@ -102,6 +66,13 @@ inline void MemoryBarrier() { } #define LEVELDB_HAVE_MEMORY_BARRIER +// Mac OS +#elif defined(OS_MACOSX) +inline void MemoryBarrier() { + OSMemoryBarrier(); +} +#define LEVELDB_HAVE_MEMORY_BARRIER + // ARM Linux #elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__) typedef void (*LinuxKernelMemoryBarrierFunc)(void); @@ -120,29 +91,6 @@ inline void MemoryBarrier() { } #define LEVELDB_HAVE_MEMORY_BARRIER -// ARM64 -#elif defined(ARCH_CPU_ARM64_FAMILY) -inline void MemoryBarrier() { - asm volatile("dmb sy" : : : "memory"); -} -#define LEVELDB_HAVE_MEMORY_BARRIER - -// PPC -#elif defined(ARCH_CPU_PPC_FAMILY) && defined(__GNUC__) -inline void MemoryBarrier() { - // TODO for some powerpc expert: is there a cheaper suitable variant? - // Perhaps by having separate barriers for acquire and release ops. 
- asm volatile("sync" : : : "memory"); -} -#define LEVELDB_HAVE_MEMORY_BARRIER - -// MIPS -#elif defined(ARCH_CPU_MIPS_FAMILY) && defined(__GNUC__) -inline void MemoryBarrier() { - __asm__ __volatile__("sync" : : : "memory"); -} -#define LEVELDB_HAVE_MEMORY_BARRIER - #endif // AtomicPointer built using platform-specific MemoryBarrier() @@ -166,78 +114,39 @@ class AtomicPointer { } }; -// Atomic pointer based on sparc memory barriers -#elif defined(__sparcv9) && defined(__GNUC__) +// AtomicPointer based on +#elif defined(LEVELDB_CSTDATOMIC_PRESENT) +#include + class AtomicPointer { private: - void* rep_; + std::atomic rep_; public: AtomicPointer() { } explicit AtomicPointer(void* v) : rep_(v) { } inline void* Acquire_Load() const { - void* val; - __asm__ __volatile__ ( - "ldx [%[rep_]], %[val] \n\t" - "membar #LoadLoad|#LoadStore \n\t" - : [val] "=r" (val) - : [rep_] "r" (&rep_) - : "memory"); - return val; + return rep_.load(std::memory_order_acquire); } inline void Release_Store(void* v) { - __asm__ __volatile__ ( - "membar #LoadStore|#StoreStore \n\t" - "stx %[v], [%[rep_]] \n\t" - : - : [rep_] "r" (&rep_), [v] "r" (v) - : "memory"); + rep_.store(v, std::memory_order_release); + } + inline void* NoBarrier_Load() const { + return rep_.load(std::memory_order_relaxed); + } + inline void NoBarrier_Store(void* v) { + rep_.store(v, std::memory_order_relaxed); } - inline void* NoBarrier_Load() const { return rep_; } - inline void NoBarrier_Store(void* v) { rep_ = v; } }; -// Atomic pointer based on ia64 acq/rel -#elif defined(__ia64) && defined(__GNUC__) -class AtomicPointer { - private: - void* rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - void* val ; - __asm__ __volatile__ ( - "ld8.acq %[val] = [%[rep_]] \n\t" - : [val] "=r" (val) - : [rep_] "r" (&rep_) - : "memory" - ); - return val; - } - inline void Release_Store(void* v) { - __asm__ __volatile__ ( - "st8.rel [%[rep_]] = %[v] \n\t" - : 
- : [rep_] "r" (&rep_), [v] "r" (v) - : "memory" - ); - } - inline void* NoBarrier_Load() const { return rep_; } - inline void NoBarrier_Store(void* v) { rep_ = v; } -}; - -// We have neither MemoryBarrier(), nor +// We have neither MemoryBarrier(), nor #else #error Please implement AtomicPointer for this platform. -#endif #endif #undef LEVELDB_HAVE_MEMORY_BARRIER #undef ARCH_CPU_X86_FAMILY #undef ARCH_CPU_ARM_FAMILY -#undef ARCH_CPU_ARM64_FAMILY -#undef ARCH_CPU_PPC_FAMILY } // namespace port } // namespace leveldb diff --git a/src/leveldb/port/port.h b/src/leveldb/port/port.h index 4baafa8e2..d3c5d6aad 100644 --- a/src/leveldb/port/port.h +++ b/src/leveldb/port/port.h @@ -6,6 +6,7 @@ #define STORAGE_LEVELDB_PORT_PORT_H_ #include +#include "leveldb/ldb_config.h" // Include the appropriate platform specific file below. If you are // porting to a new platform, see "port_example.h" for documentation @@ -14,8 +15,6 @@ # include "port/port_posix.h" #elif defined(LEVELDB_PLATFORM_CHROMIUM) # include "port/port_chromium.h" -#elif defined(LEVELDB_PLATFORM_WINDOWS) -# include "port/port_win.h" #endif #endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/src/leveldb/port/port_android.cc b/src/leveldb/port/port_android.cc new file mode 100644 index 000000000..815abf299 --- /dev/null +++ b/src/leveldb/port/port_android.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "port/port_android.h" + +#include + +extern "C" { +size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) { + return fread(a, b, c, d); +} + +size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) { + return fwrite(a, b, c, d); +} + +int fflush_unlocked(FILE *f) { + return fflush(f); +} + +int fdatasync(int fd) { + return fsync(fd); +} +} + +namespace leveldb { +namespace port { + +static void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } +void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } +void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } + +CondVar::CondVar(Mutex* mu) + : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); +} + +CondVar::~CondVar() { + PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); +} + +void CondVar::Wait() { + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +} + +void CondVar::Signal(){ + PthreadCall("signal", pthread_cond_signal(&cv_)); +} + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +} // namespace port +} // namespace leveldb diff --git a/src/leveldb/port/port_android.h b/src/leveldb/port/port_android.h new file mode 100644 index 000000000..b733388d8 --- /dev/null +++ b/src/leveldb/port/port_android.h @@ -0,0 +1,159 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. 
+ +#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ +#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ + +#include +#include +#include +#include +#include +#include + +// Collapse the plethora of ARM flavors available to an easier to manage set +// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto +#if defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7A__) +#define ARMV6_OR_7 1 +#endif + +extern "C" { + size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d); + size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d); + int fflush_unlocked(FILE *f); + int fdatasync (int fd); +} + +namespace leveldb { +namespace port { + +static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN; + +class CondVar; + +class Mutex { + public: + Mutex(); + ~Mutex(); + + void Lock(); + void Unlock(); + void AssertHeld() { + //TODO(gabor): How can I implement this? 
+ } + + private: + friend class CondVar; + pthread_mutex_t mu_; + + // No copying + Mutex(const Mutex&); + void operator=(const Mutex&); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + void Wait(); + void Signal(); + void SignalAll(); + private: + Mutex* mu_; + pthread_cond_t cv_; +}; + +#ifndef ARMV6_OR_7 +// On ARM chipsets = V6 +#ifdef ARMV6_OR_7 + __asm__ __volatile__("dmb" : : : "memory"); +#else + pLinuxKernelMemoryBarrier(); +#endif + } + + public: + AtomicPointer() { } + explicit AtomicPointer(void* v) : rep_(v) { } + inline void* Acquire_Load() const { + void* r = rep_; + MemoryBarrier(); + return r; + } + inline void Release_Store(void* v) { + MemoryBarrier(); + rep_ = v; + } + inline void* NoBarrier_Load() const { + void* r = rep_; + return r; + } + inline void NoBarrier_Store(void* v) { + rep_ = v; + } +}; + +// TODO(gabor): Implement compress +inline bool Snappy_Compress( + const char* input, + size_t input_length, + std::string* output) { + return false; +} + +// TODO(gabor): Implement uncompress +inline bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result) { + return false; +} + +// TODO(gabor): Implement uncompress +inline bool Snappy_Uncompress( + const char* input_data, + size_t input_length, + char* output) { + return false; +} + +inline uint64_t ThreadIdentifier() { + pthread_t tid = pthread_self(); + uint64_t r = 0; + memcpy(&r, &tid, sizeof(r) < sizeof(tid) ? 
sizeof(r) : sizeof(tid)); + return r; +} + +inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { + return false; +} + +} // namespace port +} // namespace leveldb + +#endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_ diff --git a/src/leveldb/port/port_example.h b/src/leveldb/port/port_example.h index 5b1d027de..ab9e489b3 100644 --- a/src/leveldb/port/port_example.h +++ b/src/leveldb/port/port_example.h @@ -129,16 +129,6 @@ extern bool Snappy_Uncompress(const char* input_data, size_t input_length, // The concatenation of all "data[0,n-1]" fragments is the heap profile. extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); -// Determine whether a working accelerated crc32 implementation exists -// Returns true if AcceleratedCRC32C is safe to call -bool HasAcceleratedCRC32C(); - -// Extend the CRC to include the first n bytes of buf. -// -// Returns zero if the CRC cannot be extended using acceleration, else returns -// the newly extended CRC value (which may also be zero). 
-uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size); - } // namespace port } // namespace leveldb diff --git a/src/leveldb/port/port_posix.cc b/src/leveldb/port/port_posix.cc index 4b80203bd..280c29f6e 100644 --- a/src/leveldb/port/port_posix.cc +++ b/src/leveldb/port/port_posix.cc @@ -7,10 +7,9 @@ #include #include #include - -#if (defined(__x86_64__) || defined(__i386__)) && defined(__GNUC__) -#include -#endif +#include +#include "leveldb/env.h" +#include "util/logging.h" namespace leveldb { namespace port { @@ -18,11 +17,24 @@ namespace port { static void PthreadCall(const char* label, int result) { if (result != 0) { fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + Log(NULL, "pthread %s: %s\n", label, strerror(result)); abort(); } } -Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); } +Mutex::Mutex(bool recursive) { + if (recursive) { + pthread_mutexattr_t attr; + + PthreadCall("init mutex attr", pthread_mutexattr_init(&attr)); + PthreadCall("set mutex recursive", pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)); + PthreadCall("init recursive mutex", pthread_mutex_init(&mu_, &attr)); + PthreadCall("destroy mutex attr", pthread_mutexattr_destroy(&attr)); + } + else { + PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); + } +} Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } @@ -30,6 +42,16 @@ void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); } void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } +#if defined(_POSIX_SPIN_LOCKS) && 0<_POSIX_SPIN_LOCKS +Spin::Spin() { PthreadCall("init spinlock", pthread_spin_init(&sp_, PTHREAD_PROCESS_PRIVATE)); } + +Spin::~Spin() { PthreadCall("destroy spinlock", pthread_spin_destroy(&sp_)); } + +void Spin::Lock() { PthreadCall("lock spin", pthread_spin_lock(&sp_)); } + +void Spin::Unlock() { PthreadCall("unlock spin", pthread_spin_unlock(&sp_)); } +#endif + CondVar::CondVar(Mutex* 
mu) : mu_(mu) { PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); @@ -41,6 +63,20 @@ void CondVar::Wait() { PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); } +bool CondVar::Wait(struct timespec* pTimespec) { + bool signaled = true; + int result = pthread_cond_timedwait(&cv_, &mu_->mu_, pTimespec); + if (0 != result) { + signaled = false; + + // the only expected errno is ETIMEDOUT; anything else is a real error + if (ETIMEDOUT != result) { + PthreadCall("timed wait", result); + } + } + return signaled; +} + void CondVar::Signal() { PthreadCall("signal", pthread_cond_signal(&cv_)); } @@ -53,15 +89,15 @@ void InitOnce(OnceType* once, void (*initializer)()) { PthreadCall("once", pthread_once(once, initializer)); } -bool HasAcceleratedCRC32C() { -#if (defined(__x86_64__) || defined(__i386__)) && defined(__GNUC__) - unsigned int eax, ebx, ecx = 0, edx; - __get_cpuid(1, &eax, &ebx, &ecx, &edx); - return (ecx & (1 << 20)) != 0; -#else - return false; -#endif -} +RWMutex::RWMutex() { PthreadCall("init mutex", pthread_rwlock_init(&mu_, NULL)); } + +RWMutex::~RWMutex() { PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); } + +void RWMutex::ReadLock() { PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); } + +void RWMutex::WriteLock() { PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); } + +void RWMutex::Unlock() { PthreadCall("unlock", pthread_rwlock_unlock(&mu_)); } } // namespace port } // namespace leveldb diff --git a/src/leveldb/port/port_posix.h b/src/leveldb/port/port_posix.h index d85fa5d63..4d9146289 100644 --- a/src/leveldb/port/port_posix.h +++ b/src/leveldb/port/port_posix.h @@ -7,6 +7,16 @@ #ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ #define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ +// to properly pull in bits/posix_opt.h on Linux +#include +#include + +#if _POSIX_TIMERS >= 200801L + #include // declares clock_gettime() +#else + #include // declares gettimeofday() +#endif + #undef PLATFORM_IS_LITTLE_ENDIAN #if defined(OS_MACOSX) #include 
@@ -21,23 +31,17 @@ #else #define PLATFORM_IS_LITTLE_ENDIAN false #endif -#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) ||\ - defined(OS_NETBSD) || defined(OS_DRAGONFLYBSD) +#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) ||\ + defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID) #include #include - #define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN) -#elif defined(OS_HPUX) - #define PLATFORM_IS_LITTLE_ENDIAN false -#elif defined(OS_ANDROID) - // Due to a bug in the NDK x86 definition, - // _BYTE_ORDER must be used instead of __BYTE_ORDER on Android. - // See http://code.google.com/p/android/issues/detail?id=39824 - #include - #define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN) + + #if !defined(PLATFORM_IS_LITTLE_ENDIAN) && defined(_BYTE_ORDER) + #define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN) + #endif #else #include #endif - #include #ifdef SNAPPY #include @@ -52,21 +56,28 @@ #if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\ defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\ - defined(OS_ANDROID) || defined(OS_HPUX) || defined(CYGWIN) + defined(OS_ANDROID) // Use fread/fwrite/fflush on platforms without _unlocked variants #define fread_unlocked fread #define fwrite_unlocked fwrite #define fflush_unlocked fflush #endif -#if defined(OS_FREEBSD) ||\ +#if defined(OS_MACOSX) || defined(OS_FREEBSD) ||\ defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) // Use fsync() on platforms without fdatasync() #define fdatasync fsync #endif -#if defined(OS_MACOSX) -#define fdatasync(fd) fcntl(fd, F_FULLFSYNC, 0) +// Some compilers do not provide access to nested classes of a declared friend class +// Defining PUBLIC_NESTED_FRIEND_ACCESS will cause those declarations to be made +// public as a workaround. Added by David Smith, Basho. 
+#if defined(OS_MACOSX) || defined(OS_SOLARIS) +#define USED_BY_NESTED_FRIEND(a) public: a; private: +#define USED_BY_NESTED_FRIEND2(a,b) public: a,b; private: +#else +#define USED_BY_NESTED_FRIEND(a) a; +#define USED_BY_NESTED_FRIEND2(a,b) a,b; #endif #if defined(OS_ANDROID) && __ANDROID_API__ < 9 @@ -85,12 +96,12 @@ class CondVar; class Mutex { public: - Mutex(); + Mutex(bool recursive=false); // true => creates a mutex that can be locked recursively ~Mutex(); void Lock(); void Unlock(); - void AssertHeld() { } + void AssertHeld() {assert(0!=pthread_mutex_trylock(&mu_));} private: friend class CondVar; @@ -101,11 +112,40 @@ class Mutex { void operator=(const Mutex&); }; + +#if defined(_POSIX_SPIN_LOCKS) && 0<_POSIX_SPIN_LOCKS +class Spin { + public: + Spin(); + ~Spin(); + + void Lock(); + void Unlock(); + void AssertHeld() {assert(0!=pthread_spin_trylock(&sp_));} + + private: + friend class CondVar; + pthread_spinlock_t sp_; + + // No copying + Spin(const Spin&); + void operator=(const Spin&); +}; +#else +typedef Mutex Spin; +#endif + + class CondVar { public: explicit CondVar(Mutex* mu); ~CondVar(); void Wait(); + + // waits on the condition variable until the specified time is reached + bool // true => the condition variable was signaled, else timed out + Wait(struct timespec* pTimespec); + void Signal(); void SignalAll(); private: @@ -117,6 +157,27 @@ typedef pthread_once_t OnceType; #define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT extern void InitOnce(OnceType* once, void (*initializer)()); + +class RWMutex { + public: + RWMutex(); + ~RWMutex(); + + void ReadLock(); + void WriteLock(); + void Unlock(); + void AssertHeld() { } + + private: + pthread_rwlock_t mu_; + + // No copying + RWMutex(const RWMutex&); + void operator=(const RWMutex&); + +}; + + inline bool Snappy_Compress(const char* input, size_t length, ::std::string* output) { #ifdef SNAPPY @@ -152,8 +213,45 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { return false; } 
-bool HasAcceleratedCRC32C(); -uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size); +// sets the name of the current thread +inline void SetCurrentThreadName(const char* threadName) { + if (NULL == threadName) { + threadName = ""; + } +#if defined(OS_MACOSX) + pthread_setname_np(threadName); +//#elif defined(OS_LINUX) +#elif defined(__GLIBC__) +#if __GLIBC_PREREQ(2,12) + pthread_setname_np(pthread_self(), threadName); +#endif +#elif defined(OS_NETBSD) + pthread_setname_np(pthread_self(), threadName, NULL); +#else + // we have some other platform(s) to support + // defined(OS_FREEBSD) ... freebsd-9.2, Feb 19, 2016 not working + // + // NOTE: do not fail here since this functionality is optional +#endif +} + +// similar to Env::NowMicros except guaranteed to return "time" instead +// of potentially only ticks since reboot +const uint64_t UINT64_ONE_SECOND_MICROS=1000000; + +inline uint64_t TimeMicros() { +#if _POSIX_TIMERS >= 200801L + struct timespec ts; + + // this is rumored to be faster than gettimeofday(), + clock_gettime(CLOCK_REALTIME, &ts); + return static_cast(ts.tv_sec) * 1000000 + ts.tv_nsec/1000; +#else + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#endif +} // TimeMicros } // namespace port } // namespace leveldb diff --git a/src/leveldb/port/port_posix_sse.cc b/src/leveldb/port/port_posix_sse.cc deleted file mode 100644 index 2d49c21dd..000000000 --- a/src/leveldb/port/port_posix_sse.cc +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright 2016 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A portable implementation of crc32c, optimized to handle -// four bytes at a time. 
-// -// In a separate source file to allow this accelerated CRC32C function to be -// compiled with the appropriate compiler flags to enable x86 SSE 4.2 -// instructions. - -#include -#include -#include "port/port.h" - -#if defined(LEVELDB_PLATFORM_POSIX_SSE) - -#if defined(_MSC_VER) -#include -#elif defined(__GNUC__) && defined(__SSE4_2__) -#include -#endif - -#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) - -namespace leveldb { -namespace port { - -#if defined(LEVELDB_PLATFORM_POSIX_SSE) - -// Used to fetch a naturally-aligned 32-bit word in little endian byte-order -static inline uint32_t LE_LOAD32(const uint8_t *p) { - // SSE is x86 only, so ensured that |p| is always little-endian. - uint32_t word; - memcpy(&word, p, sizeof(word)); - return word; -} - -#if defined(_M_X64) || defined(__x86_64__) // LE_LOAD64 is only used on x64. - -// Used to fetch a naturally-aligned 64-bit word in little endian byte-order -static inline uint64_t LE_LOAD64(const uint8_t *p) { - uint64_t dword; - memcpy(&dword, p, sizeof(dword)); - return dword; -} - -#endif // defined(_M_X64) || defined(__x86_64__) - -#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) - -// For further improvements see Intel publication at: -// http://download.intel.com/design/intarch/papers/323405.pdf -uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) { -#if !defined(LEVELDB_PLATFORM_POSIX_SSE) - return 0; -#else - - const uint8_t *p = reinterpret_cast(buf); - const uint8_t *e = p + size; - uint32_t l = crc ^ 0xffffffffu; - -#define STEP1 do { \ - l = _mm_crc32_u8(l, *p++); \ -} while (0) -#define STEP4 do { \ - l = _mm_crc32_u32(l, LE_LOAD32(p)); \ - p += 4; \ -} while (0) -#define STEP8 do { \ - l = _mm_crc32_u64(l, LE_LOAD64(p)); \ - p += 8; \ -} while (0) - - if (size > 16) { - // Process unaligned bytes - for (unsigned int i = reinterpret_cast(p) % 8; i; --i) { - STEP1; - } - - // _mm_crc32_u64 is only available on x64. 
-#if defined(_M_X64) || defined(__x86_64__) - // Process 8 bytes at a time - while ((e-p) >= 8) { - STEP8; - } - // Process 4 bytes at a time - if ((e-p) >= 4) { - STEP4; - } -#else // !(defined(_M_X64) || defined(__x86_64__)) - // Process 4 bytes at a time - while ((e-p) >= 4) { - STEP4; - } -#endif // defined(_M_X64) || defined(__x86_64__) - } - // Process the last few bytes - while (p != e) { - STEP1; - } -#undef STEP8 -#undef STEP4 -#undef STEP1 - return l ^ 0xffffffffu; -#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) -} - -} // namespace port -} // namespace leveldb diff --git a/src/leveldb/port/port_win.cc b/src/leveldb/port/port_win.cc deleted file mode 100644 index 1be9e8d5b..000000000 --- a/src/leveldb/port/port_win.cc +++ /dev/null @@ -1,158 +0,0 @@ -// LevelDB Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of California, Berkeley nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY -// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// - -#include "port/port_win.h" - -#include -#include -#include - -namespace leveldb { -namespace port { - -Mutex::Mutex() : - cs_(NULL) { - assert(!cs_); - cs_ = static_cast(new CRITICAL_SECTION()); - ::InitializeCriticalSection(static_cast(cs_)); - assert(cs_); -} - -Mutex::~Mutex() { - assert(cs_); - ::DeleteCriticalSection(static_cast(cs_)); - delete static_cast(cs_); - cs_ = NULL; - assert(!cs_); -} - -void Mutex::Lock() { - assert(cs_); - ::EnterCriticalSection(static_cast(cs_)); -} - -void Mutex::Unlock() { - assert(cs_); - ::LeaveCriticalSection(static_cast(cs_)); -} - -void Mutex::AssertHeld() { - assert(cs_); - assert(1); -} - -CondVar::CondVar(Mutex* mu) : - waiting_(0), - mu_(mu), - sem1_(::CreateSemaphore(NULL, 0, 10000, NULL)), - sem2_(::CreateSemaphore(NULL, 0, 10000, NULL)) { - assert(mu_); -} - -CondVar::~CondVar() { - ::CloseHandle(sem1_); - ::CloseHandle(sem2_); -} - -void CondVar::Wait() { - mu_->AssertHeld(); - - wait_mtx_.Lock(); - ++waiting_; - wait_mtx_.Unlock(); - - mu_->Unlock(); - - // initiate handshake - ::WaitForSingleObject(sem1_, INFINITE); - ::ReleaseSemaphore(sem2_, 1, NULL); - mu_->Lock(); -} - -void CondVar::Signal() { - wait_mtx_.Lock(); - if (waiting_ > 0) 
{ - --waiting_; - - // finalize handshake - ::ReleaseSemaphore(sem1_, 1, NULL); - ::WaitForSingleObject(sem2_, INFINITE); - } - wait_mtx_.Unlock(); -} - -void CondVar::SignalAll() { - wait_mtx_.Lock(); - ::ReleaseSemaphore(sem1_, waiting_, NULL); - while(waiting_ > 0) { - --waiting_; - ::WaitForSingleObject(sem2_, INFINITE); - } - wait_mtx_.Unlock(); -} - -AtomicPointer::AtomicPointer(void* v) { - Release_Store(v); -} - -void InitOnce(OnceType* once, void (*initializer)()) { - once->InitOnce(initializer); -} - -void* AtomicPointer::Acquire_Load() const { - void * p = NULL; - InterlockedExchangePointer(&p, rep_); - return p; -} - -void AtomicPointer::Release_Store(void* v) { - InterlockedExchangePointer(&rep_, v); -} - -void* AtomicPointer::NoBarrier_Load() const { - return rep_; -} - -void AtomicPointer::NoBarrier_Store(void* v) { - rep_ = v; -} - -bool HasAcceleratedCRC32C() { -#if defined(__x86_64__) || defined(__i386__) - int cpu_info[4]; - __cpuid(cpu_info, 1); - return (cpu_info[2] & (1 << 20)) != 0; -#else - return false; -#endif -} - -} -} diff --git a/src/leveldb/port/port_win.h b/src/leveldb/port/port_win.h deleted file mode 100644 index e8bf46ef2..000000000 --- a/src/leveldb/port/port_win.h +++ /dev/null @@ -1,177 +0,0 @@ -// LevelDB Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// See port_example.h for documentation for the following types/functions. - -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of the University of California, Berkeley nor the -// names of its contributors may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY -// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// - -#ifndef STORAGE_LEVELDB_PORT_PORT_WIN_H_ -#define STORAGE_LEVELDB_PORT_PORT_WIN_H_ - -#ifdef _MSC_VER -#define snprintf _snprintf -#define close _close -#define fread_unlocked _fread_nolock -#endif - -#include -#include -#ifdef SNAPPY -#include -#endif - -namespace leveldb { -namespace port { - -// Windows is little endian (for now :p) -static const bool kLittleEndian = true; - -class CondVar; - -class Mutex { - public: - Mutex(); - ~Mutex(); - - void Lock(); - void Unlock(); - void AssertHeld(); - - private: - friend class CondVar; - // critical sections are more efficient than mutexes - // but they are not recursive and can only be used to synchronize threads within the same process - // we use opaque void * to avoid including windows.h in port_win.h - void * cs_; - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); -}; - -// the Win32 API offers a dependable condition variable mechanism, but only starting with -// Windows 2008 and Vista -// no matter what we will implement our own condition variable with a semaphore -// implementation as described in a paper written by Andrew D. 
Birrell in 2003 -class CondVar { - public: - explicit CondVar(Mutex* mu); - ~CondVar(); - void Wait(); - void Signal(); - void SignalAll(); - private: - Mutex* mu_; - - Mutex wait_mtx_; - long waiting_; - - void * sem1_; - void * sem2_; - - -}; - -class OnceType { -public: -// OnceType() : init_(false) {} - OnceType(const OnceType &once) : init_(once.init_) {} - OnceType(bool f) : init_(f) {} - void InitOnce(void (*initializer)()) { - mutex_.Lock(); - if (!init_) { - init_ = true; - initializer(); - } - mutex_.Unlock(); - } - -private: - bool init_; - Mutex mutex_; -}; - -#define LEVELDB_ONCE_INIT false -extern void InitOnce(port::OnceType*, void (*initializer)()); - -// Storage for a lock-free pointer -class AtomicPointer { - private: - void * rep_; - public: - AtomicPointer() : rep_(NULL) { } - explicit AtomicPointer(void* v); - void* Acquire_Load() const; - - void Release_Store(void* v); - - void* NoBarrier_Load() const; - - void NoBarrier_Store(void* v); -}; - -inline bool Snappy_Compress(const char* input, size_t length, - ::std::string* output) { -#ifdef SNAPPY - output->resize(snappy::MaxCompressedLength(length)); - size_t outlen; - snappy::RawCompress(input, length, &(*output)[0], &outlen); - output->resize(outlen); - return true; -#endif - - return false; -} - -inline bool Snappy_GetUncompressedLength(const char* input, size_t length, - size_t* result) { -#ifdef SNAPPY - return snappy::GetUncompressedLength(input, length, result); -#else - return false; -#endif -} - -inline bool Snappy_Uncompress(const char* input, size_t length, - char* output) { -#ifdef SNAPPY - return snappy::RawUncompress(input, length, output); -#else - return false; -#endif -} - -inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { - return false; -} - -bool HasAcceleratedCRC32C(); -uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size); - -} -} - -#endif // STORAGE_LEVELDB_PORT_PORT_WIN_H_ diff --git a/src/leveldb/port/thread_annotations.h 
b/src/leveldb/port/thread_annotations.h index 9470ef587..6f9b6a792 100644 --- a/src/leveldb/port/thread_annotations.h +++ b/src/leveldb/port/thread_annotations.h @@ -2,8 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H_ -#define STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H_ +#ifndef STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H // Some environments provide custom macros to aid in static thread-safety // analysis. Provide empty definitions of such macros unless they are already @@ -57,4 +56,4 @@ #define NO_THREAD_SAFETY_ANALYSIS #endif -#endif // STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H_ +#endif // STORAGE_LEVELDB_PORT_THREAD_ANNOTATIONS_H diff --git a/src/leveldb/table/block.cc b/src/leveldb/table/block.cc index 43e402c9c..c27c912e7 100644 --- a/src/leveldb/table/block.cc +++ b/src/leveldb/table/block.cc @@ -15,8 +15,8 @@ namespace leveldb { -inline uint32_t Block::NumRestarts() const { - assert(size_ >= sizeof(uint32_t)); +uint32_t Block::NumRestarts() const { + assert(size_ >= 2*sizeof(uint32_t)); return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } @@ -27,12 +27,11 @@ Block::Block(const BlockContents& contents) if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { - size_t max_restarts_allowed = (size_-sizeof(uint32_t)) / sizeof(uint32_t); - if (NumRestarts() > max_restarts_allowed) { - // The size is too small for NumRestarts() + restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. 
size_ = 0; - } else { - restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); } } } @@ -46,7 +45,7 @@ Block::~Block() { // Helper routine: decode the next block entry starting at "p", // storing the number of shared key bytes, non_shared key bytes, // and the length of the value in "*shared", "*non_shared", and -// "*value_length", respectively. Will not dereference past "limit". +// "*value_length", respectively. Will not derefence past "limit". // // If any errors are detected, returns NULL. Otherwise, returns a // pointer to the key delta (just past the three decoded values). @@ -163,8 +162,8 @@ class Block::Iter : public Iterator { } virtual void Seek(const Slice& target) { - // Binary search in restart array to find the last restart point - // with a key < target + // Binary search in restart array to find the first restart point + // with a key >= target uint32_t left = 0; uint32_t right = num_restarts_ - 1; while (left < right) { @@ -254,7 +253,7 @@ class Block::Iter : public Iterator { }; Iterator* Block::NewIterator(const Comparator* cmp) { - if (size_ < sizeof(uint32_t)) { + if (size_ < 2*sizeof(uint32_t)) { return NewErrorIterator(Status::Corruption("bad block contents")); } const uint32_t num_restarts = NumRestarts(); diff --git a/src/leveldb/table/block.h b/src/leveldb/table/block.h index 2493eb9f9..f29f08186 100644 --- a/src/leveldb/table/block.h +++ b/src/leveldb/table/block.h @@ -24,9 +24,10 @@ class Block { size_t size() const { return size_; } Iterator* NewIterator(const Comparator* comparator); - private: uint32_t NumRestarts() const; + private: + const char* data_; size_t size_; uint32_t restart_offset_; // Offset in data_ of restart array diff --git a/src/leveldb/table/block_builder.h b/src/leveldb/table/block_builder.h index 4fbcb3397..5b545bd1a 100644 --- a/src/leveldb/table/block_builder.h +++ b/src/leveldb/table/block_builder.h @@ -21,7 +21,7 @@ class BlockBuilder { // Reset the contents as if the BlockBuilder was just 
constructed. void Reset(); - // REQUIRES: Finish() has not been called since the last call to Reset(). + // REQUIRES: Finish() has not been callled since the last call to Reset(). // REQUIRES: key is larger than any previously added key void Add(const Slice& key, const Slice& value); diff --git a/src/leveldb/table/filter_block.cc b/src/leveldb/table/filter_block.cc index 1ed513417..fb171e698 100644 --- a/src/leveldb/table/filter_block.cc +++ b/src/leveldb/table/filter_block.cc @@ -9,22 +9,31 @@ namespace leveldb { -// See doc/table_format.md for an explanation of the filter block format. +// See doc/table_format.txt for an explanation of the filter block format. -// Generate new filter every 2KB of data -static const size_t kFilterBaseLg = 11; -static const size_t kFilterBase = 1 << kFilterBaseLg; +// list of available filters within code base +const FilterPolicy * FilterInventory::ListHead(NULL); FilterBlockBuilder::FilterBlockBuilder(const FilterPolicy* policy) - : policy_(policy) { + : policy_(policy), filter_base_lg_(0), filter_base_(0), last_offset_(0) +{ } void FilterBlockBuilder::StartBlock(uint64_t block_offset) { - uint64_t filter_index = (block_offset / kFilterBase); - assert(filter_index >= filter_offsets_.size()); - while (filter_index > filter_offsets_.size()) { - GenerateFilter(); - } + if (0==filter_base_lg_ && (1500= filter_offsets_.size()); + while (filter_index > filter_offsets_.size()) + { + GenerateFilter(); + } // if + } // if + + last_offset_=block_offset; } void FilterBlockBuilder::AddKey(const Slice& key) { @@ -34,6 +43,9 @@ void FilterBlockBuilder::AddKey(const Slice& key) { } Slice FilterBlockBuilder::Finish() { + if (0==filter_base_lg_) + PickFilterBase(last_offset_); + if (!start_.empty()) { GenerateFilter(); } @@ -45,7 +57,7 @@ Slice FilterBlockBuilder::Finish() { } PutFixed32(&result_, array_offset); - result_.push_back(kFilterBaseLg); // Save encoding parameter in result + result_.push_back(filter_base_lg_); // Save encoding parameter 
in result return Slice(result_); } @@ -68,7 +80,7 @@ void FilterBlockBuilder::GenerateFilter() { // Generate filter for current set of keys and append to result_. filter_offsets_.push_back(result_.size()); - policy_->CreateFilter(&tmp_keys_[0], static_cast(num_keys), &result_); + policy_->CreateFilter(&tmp_keys_[0], num_keys, &result_); tmp_keys_.clear(); keys_.clear(); @@ -97,7 +109,7 @@ bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) { if (index < num_) { uint32_t start = DecodeFixed32(offset_ + index*4); uint32_t limit = DecodeFixed32(offset_ + index*4 + 4); - if (start <= limit && limit <= static_cast(offset_ - data_)) { + if (start <= limit && limit <= (offset_ - data_)) { Slice filter = Slice(data_ + start, limit - start); return policy_->KeyMayMatch(key, filter); } else if (start == limit) { @@ -108,4 +120,48 @@ bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) { return true; // Errors are treated as potential matches } + +// wikipedia.com quotes following as source +// Warren Jr., Henry S. (2002). Hacker's Delight. Addison Wesley. pp. 48. ISBN 978-0-201-91465-8 +// Numerical Recipes, Third Edition credits +// Anderson, S.E. 2001, "BitTwiddling Hacks", http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 +// latter states public domain. 
+static uint32_t +PowerOfTwoGreater(uint32_t num) +{ + uint32_t n; + + n=num; + --n; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + ++n; + + return n; +} // CalcFilterBaseLg + + +void +FilterBlockBuilder::PickFilterBase( + size_t BlockOffset) +{ + // create limits just for safety sake + if (0==BlockOffset || 268435456>1; 0!=temp; ++filter_base_lg_, temp=temp >> 1); + } // else + +} // FilterBlockBuilder::PickFilterBase + + } diff --git a/src/leveldb/table/filter_block.h b/src/leveldb/table/filter_block.h index c67d010bd..5acf337a5 100644 --- a/src/leveldb/table/filter_block.h +++ b/src/leveldb/table/filter_block.h @@ -36,8 +36,13 @@ class FilterBlockBuilder { private: void GenerateFilter(); + void PickFilterBase(size_t BlockOffset); const FilterPolicy* policy_; + size_t filter_base_lg_; + size_t filter_base_; + size_t last_offset_; + std::string keys_; // Flattened key contents std::vector start_; // Starting index in keys_ of each key std::string result_; // Filter data computed so far diff --git a/src/leveldb/table/filter_block_test.cc b/src/leveldb/table/filter_block_test.cc index 8c4a4741f..8d0752819 100644 --- a/src/leveldb/table/filter_block_test.cc +++ b/src/leveldb/table/filter_block_test.cc @@ -29,7 +29,7 @@ class TestHashFilter : public FilterPolicy { virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { uint32_t h = Hash(key.data(), key.size(), 1); - for (size_t i = 0; i + 4 <= filter.size(); i += 4) { + for (int i = 0; i + 4 <= filter.size(); i += 4) { if (h == DecodeFixed32(filter.data() + i)) { return true; } @@ -46,7 +46,7 @@ class FilterBlockTest { TEST(FilterBlockTest, EmptyBuilder) { FilterBlockBuilder builder(&policy_); Slice block = builder.Finish(); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x1c", EscapeString(block)); FilterBlockReader reader(&policy_, block); ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); ASSERT_TRUE(reader.KeyMayMatch(100000, 
"foo")); @@ -95,7 +95,7 @@ TEST(FilterBlockTest, MultiChunk) { Slice block = builder.Finish(); FilterBlockReader reader(&policy_, block); - +#if 0 // all in first/only filter with riak // Check first filter ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); ASSERT_TRUE(reader.KeyMayMatch(2000, "bar")); @@ -119,6 +119,30 @@ TEST(FilterBlockTest, MultiChunk) { ASSERT_TRUE(reader.KeyMayMatch(9000, "hello")); ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo")); ASSERT_TRUE(! reader.KeyMayMatch(9000, "bar")); +#else + ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(2000, "bar")); + ASSERT_TRUE(reader.KeyMayMatch(0, "box")); + ASSERT_TRUE(reader.KeyMayMatch(0, "hello")); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch(3100, "box")); + ASSERT_TRUE(reader.KeyMayMatch(3100, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(3100, "bar")); + ASSERT_TRUE(reader.KeyMayMatch(3100, "hello")); + + // Check third filter (empty) + ASSERT_TRUE(reader.KeyMayMatch(4100, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(4100, "bar")); + ASSERT_TRUE(reader.KeyMayMatch(4100, "box")); + ASSERT_TRUE(reader.KeyMayMatch(4100, "hello")); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch(9000, "box")); + ASSERT_TRUE(reader.KeyMayMatch(9000, "hello")); + ASSERT_TRUE(reader.KeyMayMatch(9000, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(9000, "bar")); +#endif } } // namespace leveldb diff --git a/src/leveldb/table/format.cc b/src/leveldb/table/format.cc index 285e1c0de..c98ef930f 100644 --- a/src/leveldb/table/format.cc +++ b/src/leveldb/table/format.cc @@ -5,13 +5,23 @@ #include "table/format.h" #include "leveldb/env.h" +#include "leveldb/perf_count.h" #include "port/port.h" #include "table/block.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/lz4.h" +#include "db/log_writer.h" namespace leveldb { +static struct +{ + uint32_t filler_; //!< don't know and don't care + uint32_t zero_restarts_; //!< path to an EmptyIterator +} gEmptyBlock={0,0}; + + void 
BlockHandle::EncodeTo(std::string* dst) const { // Sanity check that all fields have been set assert(offset_ != ~static_cast(0)); @@ -30,14 +40,15 @@ Status BlockHandle::DecodeFrom(Slice* input) { } void Footer::EncodeTo(std::string* dst) const { +#ifndef NDEBUG const size_t original_size = dst->size(); +#endif metaindex_handle_.EncodeTo(dst); index_handle_.EncodeTo(dst); dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding PutFixed32(dst, static_cast(kTableMagicNumber & 0xffffffffu)); PutFixed32(dst, static_cast(kTableMagicNumber >> 32)); assert(dst->size() == original_size + kEncodedLength); - (void)original_size; // Disable unused variable warning. } Status Footer::DecodeFrom(Slice* input) { @@ -47,7 +58,7 @@ Status Footer::DecodeFrom(Slice* input) { const uint64_t magic = ((static_cast(magic_hi) << 32) | (static_cast(magic_lo))); if (magic != kTableMagicNumber) { - return Status::Corruption("not an sstable (bad magic number)"); + return Status::InvalidArgument("not an sstable (bad magic number)"); } Status result = metaindex_handle_.DecodeFrom(input); @@ -65,7 +76,14 @@ Status Footer::DecodeFrom(Slice* input) { Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, - BlockContents* result) { + BlockContents* result) +{ + char * buf, * ubuf; + const char * data; + + buf=NULL; + ubuf=NULL; + data=NULL; result->data = Slice(); result->cachable = false; result->heap_allocated = false; @@ -73,72 +91,161 @@ Status ReadBlock(RandomAccessFile* file, // Read the block contents as well as the type/crc footer. // See table_builder.cc for the code that built this structure. 
size_t n = static_cast(handle.size()); - char* buf = new char[n + kBlockTrailerSize]; + buf = new char[n + kBlockTrailerSize]; Slice contents; Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); - if (!s.ok()) { - delete[] buf; - return s; - } - if (contents.size() != n + kBlockTrailerSize) { - delete[] buf; - return Status::Corruption("truncated block read", file->GetName()); - } + if (s.ok()) + { + if (contents.size() != n + kBlockTrailerSize) { + s=Status::Corruption("truncated block read"); + } + } // if // Check the crc of the type and the block contents - const char* data = contents.data(); // Pointer to where Read put the data - if (options.verify_checksums) { - const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1)); - const uint32_t actual = crc32c::Value(data, n + 1); - if (actual != crc) { - delete[] buf; - s = Status::Corruption("block checksum mismatch", file->GetName()); - return s; - } - } + if (s.ok()) + { + data = contents.data(); // Pointer to where Read put the data + if (options.verify_checksums) { + const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1)); + const uint32_t actual = crc32c::Value(data, n + 1); + if (actual != crc) { + s = Status::Corruption("block checksum mismatch"); + } // if + } // if + } // if - switch (data[n]) { - case kNoCompression: - if (data != buf) { - // File implementation gave us pointer to some other data. - // Use it directly under the assumption that it will be live - // while the file is open. - delete[] buf; - result->data = Slice(data, n); - result->heap_allocated = false; - result->cachable = false; // Do not double-cache - } else { - result->data = Slice(buf, n); - result->heap_allocated = true; - result->cachable = true; - } + if (s.ok()) + { + switch (data[n]) { + case kNoCompression: + if (data != buf) { + // File implementation gave us pointer to some other data. + // Use it directly under the assumption that it will be live + // while the file is open. 
+ delete[] buf; + buf=NULL; + result->data = Slice(data, n); + result->heap_allocated = false; + result->cachable = false; // Do not double-cache + } else { + result->data = Slice(buf, n); + result->heap_allocated = true; + result->cachable = true; + } // else + // Ok + break; - // Ok - break; - case kSnappyCompression: { - size_t ulength = 0; - if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { - delete[] buf; - return Status::Corruption("corrupted compressed block contents", file->GetName()); - } - char* ubuf = new char[ulength]; - if (!port::Snappy_Uncompress(data, n, ubuf)) { - delete[] buf; - delete[] ubuf; - return Status::Corruption("corrupted compressed block contents", file->GetName()); - } - delete[] buf; - result->data = Slice(ubuf, ulength); - result->heap_allocated = true; - result->cachable = true; - break; - } - default: - delete[] buf; - return Status::Corruption("bad block type", file->GetName()); - } + case kSnappyCompression: { + size_t ulength = 0; + if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { + s = Status::Corruption("corrupted compressed block contents"); + } - return Status::OK(); + if (s.ok()) + { + ubuf = new char[ulength]; + if (!port::Snappy_Uncompress(data, n, ubuf)) { + s=Status::Corruption("corrupted compressed block contents"); + } + } // if + + if (s.ok()) + { + delete[] buf; + buf=NULL; + result->data = Slice(ubuf, ulength); + result->heap_allocated = true; + result->cachable = true; + } // if + break; + } + + case kLZ4Compression: { + size_t ulength = DecodeFixed32(data); + size_t ret_val; + ubuf = new char[ulength]; + + ret_val=LZ4_decompress_safe(data+4, ubuf, n-4, ulength); + if (ret_val != ulength) + { + s=Status::Corruption("corrupted LZ4 compressed block"); + } // if + + if (s.ok()) + { + delete[] buf; + buf=NULL; + result->data = Slice(ubuf, ulength); + result->heap_allocated = true; + result->cachable = true; + } // if + break; + } + + default: + s=Status::Corruption("bad block type"); + break; 
+ } // switch + } // if + + // clean up error and decide what to do with it + if (!s.ok()) + { + gPerfCounters->Inc(ePerfReadBlockError); + + if (options.IsCompaction() && 0!=options.GetDBName().length()) + { + // this process is slow. assumption is that it does not happen often. + if (NULL!=data) + { + std::string new_name; + WritableFile *bad_file; + log::Writer *bad_logger; + Status s2; + + bad_file=NULL; + bad_logger=NULL; + + // potentially create the "lost" directory. It might already exist. + new_name=options.GetDBName(); + new_name+="/lost"; + options.GetEnv()->CreateDir(new_name); + + // create / append file to hold removed blocks + new_name+="/BLOCKS.bad"; + s2=options.GetEnv()->NewAppendableFile(new_name, &bad_file, 4*1024); + if (s2.ok()) + { + // need a try/catch + bad_logger=new log::Writer(bad_file); + bad_logger->AddRecord(Slice(data, n)); + Log(options.GetInfoLog(),"Moving corrupted block to lost/BLOCKS.bad (size %zd)", n); + + // Close also deletes bad_file + bad_logger->Close(); + delete bad_logger; + bad_logger=NULL; + bad_file=NULL; + } // if + else + { + Log(options.GetInfoLog(), "Unable to create file for bad/corrupted blocks: %s", new_name.c_str()); + } // else + } // if + + // lie to the upper layers to keep compaction from going into an infinite loop + s = Status::OK(); + } // if + + delete [] buf; + delete [] ubuf; + + result->data = Slice((char *)&gEmptyBlock, sizeof(gEmptyBlock)); + result->cachable = false; + result->heap_allocated = false; + } // if + + return s; } } // namespace leveldb diff --git a/src/leveldb/table/iterator_wrapper.h b/src/leveldb/table/iterator_wrapper.h index f410c3fab..9e16b3dbe 100644 --- a/src/leveldb/table/iterator_wrapper.h +++ b/src/leveldb/table/iterator_wrapper.h @@ -5,9 +5,6 @@ #ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ #define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_ -#include "leveldb/iterator.h" -#include "leveldb/slice.h" - namespace leveldb { // A internal wrapper class with an interface 
similar to Iterator that diff --git a/src/leveldb/table/table.cc b/src/leveldb/table/table.cc index decf8082c..877ee4fc6 100644 --- a/src/leveldb/table/table.cc +++ b/src/leveldb/table/table.cc @@ -9,6 +9,7 @@ #include "leveldb/env.h" #include "leveldb/filter_policy.h" #include "leveldb/options.h" +#include "leveldb/perf_count.h" #include "table/block.h" #include "table/filter_block.h" #include "table/format.h" @@ -27,12 +28,18 @@ struct Table::Rep { Options options; Status status; RandomAccessFile* file; + uint64_t file_size; uint64_t cache_id; FilterBlockReader* filter; const char* filter_data; + size_t filter_data_size; BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer Block* index_block; + SstCounters sst_counters; + BlockHandle filter_handle; + const FilterPolicy * filter_policy; + volatile uint32_t filter_flag; }; Status Table::Open(const Options& options, @@ -41,10 +48,14 @@ Status Table::Open(const Options& options, Table** table) { *table = NULL; if (size < Footer::kEncodedLength) { - return Status::Corruption("file is too short to be an sstable"); + return Status::InvalidArgument("file is too short to be an sstable"); } char footer_space[Footer::kEncodedLength]; + // stop valgrind uninitialize warning + // let footer.DecodeFrom returned status do the talking for read of bad info + memset(footer_space, 0, Footer::kEncodedLength); + Slice footer_input; Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength, &footer_input, footer_space); @@ -58,11 +69,7 @@ Status Table::Open(const Options& options, BlockContents contents; Block* index_block = NULL; if (s.ok()) { - ReadOptions opt; - if (options.paranoid_checks) { - opt.verify_checksums = true; - } - s = ReadBlock(file, opt, footer.index_handle(), &contents); + s = ReadBlock(file, ReadOptions(), footer.index_handle(), &contents); if (s.ok()) { index_block = new Block(contents); } @@ -74,32 +81,32 @@ Status Table::Open(const Options& options, Rep* rep = new 
Table::Rep; rep->options = options; rep->file = file; + rep->file_size = size; rep->metaindex_handle = footer.metaindex_handle(); rep->index_block = index_block; rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0); rep->filter_data = NULL; + rep->filter_data_size = 0; rep->filter = NULL; + rep->filter_policy = NULL; + rep->filter_flag = 0; *table = new Table(rep); (*table)->ReadMeta(footer); } else { - delete index_block; + if (index_block) delete index_block; } return s; } void Table::ReadMeta(const Footer& footer) { - if (rep_->options.filter_policy == NULL) { - return; // Do not need any metadata - } // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // it is an empty block. + std::string key; ReadOptions opt; - if (rep_->options.paranoid_checks) { - opt.verify_checksums = true; - } BlockContents contents; + if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) { // Do not propagate errors since meta info is not needed for operation return; @@ -107,39 +114,139 @@ void Table::ReadMeta(const Footer& footer) { Block* meta = new Block(contents); Iterator* iter = meta->NewIterator(BytewiseComparator()); - std::string key = "filter."; - key.append(rep_->options.filter_policy->Name()); + + // read filter only if policy set + if (NULL != rep_->options.filter_policy) { + bool found,first; + const FilterPolicy * policy, * next; + + first=true; + next=NULL; + + do + { + found=false; + + if (first) + { + policy=rep_->options.filter_policy; + next=FilterInventory::ListHead; + first=false; + } // if + else + { + policy=next; + if (NULL!=policy) + next=policy->GetNext(); + else + next=NULL; + } // else + + if (NULL!=policy) + { + key = "filter."; + key.append(policy->Name()); + iter->Seek(key); + if (iter->Valid() && iter->key() == Slice(key)) + { + // store information needed to load bloom filter + // at a later time + Slice v = iter->value(); + rep_->filter_handle.DecodeFrom(&v); + rep_->filter_policy = policy; + 
+ found=true; + } // if + } //if + } while(!found && NULL!=policy); + } // if + + // always read counters + key="stats.sst1"; iter->Seek(key); if (iter->Valid() && iter->key() == Slice(key)) { - ReadFilter(iter->value()); + ReadSstCounters(iter->value()); } + delete iter; delete meta; } -void Table::ReadFilter(const Slice& filter_handle_value) { - Slice v = filter_handle_value; - BlockHandle filter_handle; - if (!filter_handle.DecodeFrom(&v).ok()) { - return; - } +// public version that reads filter at some time +// after open ... true if filter read +bool +Table::ReadFilter() +{ + bool ret_flag; + + ret_flag=false; + + if (0!=rep_->filter_handle.size() + && NULL!=rep_->filter_policy + && 1 == inc_and_fetch(&rep_->filter_flag)) + { + gPerfCounters->Inc(ePerfBlockFilterRead); + + ReadFilter(rep_->filter_handle, rep_->filter_policy); + ret_flag=(NULL != rep_->filter); + + // only attempt the read once + rep_->filter_handle.set_size(0); + } // if + + return(ret_flag); +} // ReadFilter + +// Private version of ReadFilter that does the actual work +void +Table::ReadFilter( + BlockHandle & filter_handle, + const FilterPolicy * policy) +{ // We might want to unify with ReadBlock() if we start // requiring checksum verification in Table::Open. 
ReadOptions opt; - if (rep_->options.paranoid_checks) { - opt.verify_checksums = true; - } BlockContents block; if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) { return; } if (block.heap_allocated) { rep_->filter_data = block.data.data(); // Will need to delete later + rep_->filter_data_size = block.data.size(); } - rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data); + + rep_->filter = new FilterBlockReader(policy, block.data); } + +void Table::ReadSstCounters(const Slice& sst_counters_handle_value) { + Slice v = sst_counters_handle_value; + BlockHandle counters_handle; + if (!counters_handle.DecodeFrom(&v).ok()) { + return; + } + + // We might want to unify with ReadBlock() if we start + // requiring checksum verification in Table::Open. + ReadOptions opt; + BlockContents block; + if (!ReadBlock(rep_->file, opt, counters_handle, &block).ok()) { + return; + } + if (block.heap_allocated) { + rep_->sst_counters.DecodeFrom(block.data); + delete [] block.data.data(); + } + +} + +SstCounters Table::GetSstCounters() const +{ + return(rep_->sst_counters); +} // Table::GetSstCounters + + Table::~Table() { delete rep_; } @@ -185,18 +292,23 @@ Iterator* Table::BlockReader(void* arg, cache_handle = block_cache->Lookup(key); if (cache_handle != NULL) { block = reinterpret_cast(block_cache->Value(cache_handle)); + gPerfCounters->Inc(ePerfBlockCached); } else { s = ReadBlock(table->rep_->file, options, handle, &contents); + gPerfCounters->Inc(ePerfBlockRead); if (s.ok()) { block = new Block(contents); if (contents.cachable && options.fill_cache) { cache_handle = block_cache->Insert( - key, block, block->size(), &DeleteCachedBlock); + key, block, + (block->size() + /*block_cache->EntryOverheadSize() +*/ sizeof(cache_key_buffer)), + &DeleteCachedBlock); } } } } else { s = ReadBlock(table->rep_->file, options, handle, &contents); + gPerfCounters->Inc(ePerfBlockRead); if (s.ok()) { block = new Block(contents); } @@ -225,7 +337,7 @@ Iterator* 
Table::NewIterator(const ReadOptions& options) const { Status Table::InternalGet(const ReadOptions& options, const Slice& k, void* arg, - void (*saver)(void*, const Slice&, const Slice&)) { + bool (*saver)(void*, const Slice&, const Slice&)) { Status s; Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); iiter->Seek(k); @@ -237,12 +349,19 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k, handle.DecodeFrom(&handle_value).ok() && !filter->KeyMayMatch(handle.offset(), k)) { // Not found + gPerfCounters->Inc(ePerfBlockFiltered); } else { Iterator* block_iter = BlockReader(this, options, iiter->value()); block_iter->Seek(k); if (block_iter->Valid()) { - (*saver)(arg, block_iter->key(), block_iter->value()); + bool match; + match=(*saver)(arg, block_iter->key(), block_iter->value()); + if (!match && NULL!=filter) + gPerfCounters->Inc(ePerfBlockFilterFalse); + if (match) + gPerfCounters->Inc(ePerfBlockValidGet); } + s = block_iter->status(); delete block_iter; } @@ -282,4 +401,27 @@ uint64_t Table::ApproximateOffsetOf(const Slice& key) const { return result; } + +uint64_t +Table::GetFileSize() +{ + return(rep_->file_size); +}; + +Block * +Table::TEST_GetIndexBlock() {return(rep_->index_block);}; + +// Riak specific routine. Calculates total footprint of an open +// table in memory. 
+size_t +Table::TableObjectSize() +{ + return(sizeof(Table) + sizeof(Table::Rep) + rep_->index_block->size() + rep_->filter_data_size + rep_->file->ObjectSize() + + sizeof(FilterBlockReader) + sizeof(Block)); +}; + +size_t +Table::TEST_FilterDataSize() {return(rep_->filter_data_size);}; + + } // namespace leveldb diff --git a/src/leveldb/table/table_builder.cc b/src/leveldb/table/table_builder.cc index 62002c84f..0672cc742 100644 --- a/src/leveldb/table/table_builder.cc +++ b/src/leveldb/table/table_builder.cc @@ -5,15 +5,19 @@ #include "leveldb/table_builder.h" #include +#include "db/dbformat.h" #include "leveldb/comparator.h" #include "leveldb/env.h" +#include "leveldb/expiry.h" #include "leveldb/filter_policy.h" #include "leveldb/options.h" +#include "leveldb/perf_count.h" #include "table/block_builder.h" #include "table/filter_block.h" #include "table/format.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/lz4.h" namespace leveldb { @@ -29,6 +33,7 @@ struct TableBuilder::Rep { int64_t num_entries; bool closed; // Either Finish() or Abandon() has been called. FilterBlockBuilder* filter_block; + SstCounters sst_counters; // We do not emit the index entry for a block until we have seen the // first key for the next data block. 
This allows us to use shorter @@ -104,6 +109,7 @@ void TableBuilder::Add(const Slice& key, const Slice& value) { r->pending_handle.EncodeTo(&handle_encoding); r->index_block.Add(r->last_key, Slice(handle_encoding)); r->pending_index_entry = false; + r->sst_counters.Inc(eSstCountIndexKeys); } if (r->filter_block != NULL) { @@ -114,6 +120,38 @@ void TableBuilder::Add(const Slice& key, const Slice& value) { r->num_entries++; r->data_block.Add(key, value); + // statistics + r->sst_counters.Inc(eSstCountKeys); + r->sst_counters.Add(eSstCountKeySize, key.size()); + r->sst_counters.Add(eSstCountValueSize, value.size()); + + if (key.size() < r->sst_counters.Value(eSstCountKeySmallest)) + r->sst_counters.Set(eSstCountKeySmallest, key.size()); + if (r->sst_counters.Value(eSstCountKeyLargest) < key.size()) + r->sst_counters.Set(eSstCountKeyLargest, key.size()); + + if (value.size() < r->sst_counters.Value(eSstCountValueSmallest)) + r->sst_counters.Set(eSstCountValueSmallest, value.size()); + if (r->sst_counters.Value(eSstCountValueLargest) < value.size()) + r->sst_counters.Set(eSstCountValueLargest, value.size()); + + // unit tests use non-standard keys ... must ignore the short ones + if (8 < key.size() && kTypeDeletion==ExtractValueType(key)) + r->sst_counters.Inc(eSstCountDeleteKey); + + // again ignore short keys, save high sequence number for abbreviated repair + if (8 <= key.size() + && r->sst_counters.Value(eSstCountSequence)sst_counters.Set(eSstCountSequence,ExtractSequenceNumber(key)); + + // statistics if an expiry key + // Note: not using ExpiryActivated(). Forcing expiry statistics which + // are upgrade / downgrade safe. 
+ if (NULL!=r->options.expiry_module.get()) + { + r->options.expiry_module->TableBuilderCallback(key, r->sst_counters); + } // if + const size_t estimated_block_size = r->data_block.CurrentSizeEstimate(); if (estimated_block_size >= r->options.block_size) { Flush(); @@ -145,16 +183,28 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { Rep* r = rep_; Slice raw = block->Finish(); + r->sst_counters.Inc(eSstCountBlocks); + r->sst_counters.Add(eSstCountBlockSize, raw.size()); + Slice block_contents; CompressionType type = r->options.compression; // TODO(postrelease): Support more compression options: zlib? + std::string * compressed; + switch (type) { + case kNoCompressionAutomated: + // automation disabled compression + type=kNoCompression; + r->sst_counters.Inc(eSstCountCompressAborted); + block_contents = raw; + break; + case kNoCompression: block_contents = raw; break; - case kSnappyCompression: { - std::string* compressed = &r->compressed_output; + case kSnappyCompression: + compressed = &r->compressed_output; if (port::Snappy_Compress(raw.data(), raw.size(), compressed) && compressed->size() < raw.size() - (raw.size() / 8u)) { block_contents = *compressed; @@ -163,11 +213,36 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) { // store uncompressed form block_contents = raw; type = kNoCompression; + r->sst_counters.Inc(eSstCountCompressAborted); } break; - } + + case kLZ4Compression: + compressed = &r->compressed_output; + int limit, result_size; + limit=raw.size() - (raw.size() / 8u); + + compressed->resize(limit+4); + result_size=LZ4_compress_default(raw.data(), (char *)(compressed->data())+4, raw.size(), limit); + if (result_size) + { + EncodeFixed32((char *)compressed->data(), raw.size()); + compressed->resize(result_size+4); + block_contents = *compressed; + } + else { + // Snappy not supported, or compressed less than 12.5%, so just + // store uncompressed form + block_contents = raw; + type = kNoCompression; 
+ r->sst_counters.Inc(eSstCountCompressAborted); + } + break; + + } WriteRawBlock(block_contents, type, handle); + r->sst_counters.Add(eSstCountBlockWriteSize, block_contents.size()); r->compressed_output.clear(); block->Reset(); } @@ -202,7 +277,12 @@ Status TableBuilder::Finish() { assert(!r->closed); r->closed = true; - BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle; + BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle, + sst_stats_block_handle; + + // pass hint to Linux fadvise management + r->sst_counters.Set(eSstCountUserDataSize, r->offset); + r->file->SetMetadataOffset(r->offset); // Write filter block if (ok() && r->filter_block != NULL) { @@ -210,18 +290,42 @@ Status TableBuilder::Finish() { &filter_block_handle); } + // Write sst statistic counters + if (ok()) + { + std::string encoded_stats; + + r->sst_counters.Set(eSstCountBlockSizeUsed, r->options.block_size); + + if (r->pending_index_entry) + r->sst_counters.Inc(eSstCountIndexKeys); + + r->sst_counters.EncodeTo(encoded_stats); + WriteRawBlock(Slice(encoded_stats), kNoCompression, + &sst_stats_block_handle); + } // if + // Write metaindex block if (ok()) { BlockBuilder meta_index_block(&r->options); + std::string key, handle_encoding; + if (r->filter_block != NULL) { // Add mapping from "filter.Name" to location of filter data - std::string key = "filter."; + key = "filter."; key.append(r->options.filter_policy->Name()); - std::string handle_encoding; + handle_encoding.clear(); filter_block_handle.EncodeTo(&handle_encoding); meta_index_block.Add(key, handle_encoding); + } + // Add mapping for "stats.sst1" + key = "stats.sst1"; + handle_encoding.clear(); + sst_stats_block_handle.EncodeTo(&handle_encoding); + meta_index_block.Add(key, handle_encoding); + // TODO(postrelease): Add stats and other meta blocks WriteBlock(&meta_index_block, &metaindex_block_handle); } @@ -267,4 +371,20 @@ uint64_t TableBuilder::FileSize() const { return rep_->offset; } 
+uint64_t TableBuilder::NumDeletes() const { + return rep_->sst_counters.Value(eSstCountDeleteKey); +} + +uint64_t TableBuilder::GetExpiryWriteLow() const { + return rep_->sst_counters.Value(eSstCountExpiry1); +} + +uint64_t TableBuilder::GetExpiryWriteHigh() const { + return rep_->sst_counters.Value(eSstCountExpiry2); +} + +uint64_t TableBuilder::GetExpiryExplicitHigh() const { + return rep_->sst_counters.Value(eSstCountExpiry3); +} + } // namespace leveldb diff --git a/src/leveldb/table/table_test.cc b/src/leveldb/table/table_test.cc index abf6e246f..4382c8e01 100644 --- a/src/leveldb/table/table_test.cc +++ b/src/leveldb/table/table_test.cc @@ -279,7 +279,7 @@ class KeyConvertingIterator: public Iterator { virtual ~KeyConvertingIterator() { delete iter_; } virtual bool Valid() const { return iter_->Valid(); } virtual void Seek(const Slice& target) { - ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + ParsedInternalKey ikey(target, 0, kMaxSequenceNumber, kTypeValue); std::string encoded; AppendInternalKey(&encoded, ikey); iter_->Seek(encoded); @@ -644,36 +644,6 @@ class Harness { Constructor* constructor_; }; -// Test empty table/block. -TEST(Harness, Empty) { - for (int i = 0; i < kNumTestArgs; i++) { - Init(kTestArgList[i]); - Random rnd(test::RandomSeed() + 1); - Test(&rnd); - } -} - -// Special test for a block with no restart entries. The C++ leveldb -// code never generates such blocks, but the Java version of leveldb -// seems to. 
-TEST(Harness, ZeroRestartPointsInBlock) { - char data[sizeof(uint32_t)]; - memset(data, 0, sizeof(data)); - BlockContents contents; - contents.data = Slice(data, sizeof(data)); - contents.cachable = false; - contents.heap_allocated = false; - Block block(contents); - Iterator* iter = block.NewIterator(BytewiseComparator()); - iter->SeekToFirst(); - ASSERT_TRUE(!iter->Valid()); - iter->SeekToLast(); - ASSERT_TRUE(!iter->Valid()); - iter->Seek("foo"); - ASSERT_TRUE(!iter->Valid()); - delete iter; -} - // Test the empty key TEST(Harness, SimpleEmptyKey) { for (int i = 0; i < kNumTestArgs; i++) { @@ -769,7 +739,7 @@ TEST(MemTableTest, Simple) { batch.Put(std::string("k2"), std::string("v2")); batch.Put(std::string("k3"), std::string("v3")); batch.Put(std::string("largekey"), std::string("vlarge")); - ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable).ok()); + ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable, NULL).ok()); Iterator* iter = memtable->NewIterator(); iter->SeekToFirst(); @@ -853,20 +823,12 @@ TEST(TableTest, ApproximateOffsetOfCompressed) { options.compression = kSnappyCompression; c.Finish(options, &keys, &kvmap); - // Expected upper and lower bounds of space used by compressible strings. - static const int kSlop = 1000; // Compressor effectiveness varies. - const int expected = 2500; // 10000 * compression ratio (0.25) - const int min_z = expected - kSlop; - const int max_z = expected + kSlop; - - ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, kSlop)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, kSlop)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, kSlop)); - // Have now emitted a large compressible string, so adjust expected offset. - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), min_z, max_z)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), min_z, max_z)); - // Have now emitted two large compressible strings, so adjust expected offset. 
- ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 2 * min_z, 2 * max_z)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); } } // namespace leveldb diff --git a/src/leveldb/tools/builder.list b/src/leveldb/tools/builder.list new file mode 100644 index 000000000..0f7baedaa --- /dev/null +++ b/src/leveldb/tools/builder.list @@ -0,0 +1,15 @@ +10.0.27.221 # Centos 6.3 +10.0.27.222 # Debian Squeeze +10.0.27.231 # Fedora 17 +10.0.27.248 # Centos 7 +10.0.27.234 # SmartOS 1.8.4 +10.0.27.211 # SmartOS 1.6.3 +10.0.27.220 # Centos 5.8 +10.0.27.251 # SLES 11 +10.0.27.190 # FreeBSD 9.2 +10.0.27.214 # FreeBSD 9 64 +10.0.27.213 # Ubuntu Lucid 64 +10.0.27.219 # Ubuntu Natty 64 +10.0.27.212 # Ubuntu Precise 64 +10.0.27.250 # Ubuntu 14 +10.0.27.217 # Solaris diff --git a/src/leveldb/tools/buildtester.sh b/src/leveldb/tools/buildtester.sh new file mode 100755 index 000000000..6524a1b7b --- /dev/null +++ b/src/leveldb/tools/buildtester.sh @@ -0,0 +1,89 @@ +#! 
/bin/bash +## ./buildtester.sh builder.list leveldb_tag +## or +## ./buildertest.sh builder.list "" tarfile.tgz +## $0 $1 $2 $3 +## +## NOTE: you must manually ssh to each buildbot to get RSA fingerprint +## into your local known_hosts file before script works + +# eleveldb requires knowing which erlang is installed where +REPO=leveldb + +## +## Subroutines must appear before code using them +## + +ssh_command() +{ +# ssh_command "ip address" "command to execute" + if ssh -q -o 'BatchMode yes' buildbot@$1 "$2" + then + echo "success: $2" + else + echo "Error on $1 executing $2" + exit 1 + fi +} + +ssh_command_test() +{ +# ssh_command "ip address" "command to execute" + if ssh -q -o 'BatchMode yes' buildbot@$1 "$2" + then + return 0 + else + return 1 + fi +} + +# +# main +# + +if [ $# == 2 ]; then + echo "2 parameters" + builder_list=$(cut -d ' ' -f 1 $1) + for builder in $builder_list + do + echo "builder: " $builder + + ## remove previous eleveldb instance + #ssh_command $builder "cd ~/$USER && if [ -d eleveldb ] rm -rf eleveldb" + echo -n "Start $builder: " >>./builder.log + date >>./builder.log + + ssh_command $builder "rm -rf ~/$USER/$REPO" + ssh_command $builder "mkdir -p ~/$USER" + ssh_command $builder "cd ~/$USER && git clone git@github.com:basho/$REPO" + ssh_command $builder "cd ~/$USER/$REPO && git checkout $2" + + # freeBSD needs gmake explicitly, otherwise "Missing dependency operator" errors + # but other platforms assume "make" is gnumake + if ssh_command_test $builder "which gmake" + then + ssh_command $builder "cd ~/$USER/$REPO && gmake -j 4" + echo -n "Test $builder: " >>./builder.log + date >>./builder.log + ssh_command $builder "cd ~/$USER/$REPO && export LD_LIBRARY_PATH=. && gmake -j 4 check" + # freebsd error: util/cache2_test.cc:170: failed: -1 == 201 ... 
fixed + else + ssh_command $builder "cd ~/$USER/$REPO && make -j 4" + echo -n "Test $builder: " >>./builder.log + date >>./builder.log + ssh_command $builder "cd ~/$USER/$REPO && export LD_LIBRARY_PATH=. && make -j 4 check" + fi + + echo -n "End $builder: " >>./builder.log + date >>./builder.log + echo "" >>./builder.log + done +elif [ $# == 3 ]; then + echo "3 parameters" +else + echo " ./buildtester.sh builder.list leveldb_tag" + echo " or" + echo " ./buildertest.sh builder.list \"\" tarfile.tgz" +fi + +exit 0 diff --git a/src/leveldb/db/dumpfile.cc b/src/leveldb/tools/leveldb_main.cc similarity index 51% rename from src/leveldb/db/dumpfile.cc rename to src/leveldb/tools/leveldb_main.cc index 61c47c2ff..995d76107 100644 --- a/src/leveldb/db/dumpfile.cc +++ b/src/leveldb/tools/leveldb_main.cc @@ -35,112 +35,93 @@ bool GuessType(const std::string& fname, FileType* type) { // Notified when log reader encounters corruption. class CorruptionReporter : public log::Reader::Reporter { public: - WritableFile* dst_; virtual void Corruption(size_t bytes, const Status& status) { - std::string r = "corruption: "; - AppendNumberTo(&r, bytes); - r += " bytes; "; - r += status.ToString(); - r.push_back('\n'); - dst_->Append(r); + printf("corruption: %d bytes; %s\n", + static_cast(bytes), + status.ToString().c_str()); } }; // Print contents of a log file. (*func)() is called on every record. 
-Status PrintLogContents(Env* env, const std::string& fname, - void (*func)(uint64_t, Slice, WritableFile*), - WritableFile* dst) { +bool PrintLogContents(Env* env, const std::string& fname, + void (*func)(Slice)) { SequentialFile* file; Status s = env->NewSequentialFile(fname, &file); if (!s.ok()) { - return s; + fprintf(stderr, "%s\n", s.ToString().c_str()); + return false; } CorruptionReporter reporter; - reporter.dst_ = dst; log::Reader reader(file, &reporter, true, 0); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch)) { - (*func)(reader.LastRecordOffset(), record, dst); + printf("--- offset %llu; ", + static_cast(reader.LastRecordOffset())); + (*func)(record); } delete file; - return Status::OK(); + return true; } // Called on every item found in a WriteBatch. class WriteBatchItemPrinter : public WriteBatch::Handler { public: - WritableFile* dst_; + uint64_t offset_; + uint64_t sequence_; + virtual void Put(const Slice& key, const Slice& value) { - std::string r = " put '"; - AppendEscapedStringTo(&r, key); - r += "' '"; - AppendEscapedStringTo(&r, value); - r += "'\n"; - dst_->Append(r); + printf(" put '%s' '%s'\n", + EscapeString(key).c_str(), + EscapeString(value).c_str()); } virtual void Delete(const Slice& key) { - std::string r = " del '"; - AppendEscapedStringTo(&r, key); - r += "'\n"; - dst_->Append(r); + printf(" del '%s'\n", + EscapeString(key).c_str()); } }; // Called on every log record (each one of which is a WriteBatch) // found in a kLogFile. 
-static void WriteBatchPrinter(uint64_t pos, Slice record, WritableFile* dst) { - std::string r = "--- offset "; - AppendNumberTo(&r, pos); - r += "; "; +static void WriteBatchPrinter(Slice record) { if (record.size() < 12) { - r += "log record length "; - AppendNumberTo(&r, record.size()); - r += " is too small\n"; - dst->Append(r); + printf("log record length %d is too small\n", + static_cast(record.size())); return; } WriteBatch batch; WriteBatchInternal::SetContents(&batch, record); - r += "sequence "; - AppendNumberTo(&r, WriteBatchInternal::Sequence(&batch)); - r.push_back('\n'); - dst->Append(r); + printf("sequence %llu\n", + static_cast(WriteBatchInternal::Sequence(&batch))); WriteBatchItemPrinter batch_item_printer; - batch_item_printer.dst_ = dst; Status s = batch.Iterate(&batch_item_printer); if (!s.ok()) { - dst->Append(" error: " + s.ToString() + "\n"); + printf(" error: %s\n", s.ToString().c_str()); } } -Status DumpLog(Env* env, const std::string& fname, WritableFile* dst) { - return PrintLogContents(env, fname, WriteBatchPrinter, dst); +bool DumpLog(Env* env, const std::string& fname) { + return PrintLogContents(env, fname, WriteBatchPrinter); } // Called on every log record (each one of which is a WriteBatch) // found in a kDescriptorFile. 
-static void VersionEditPrinter(uint64_t pos, Slice record, WritableFile* dst) { - std::string r = "--- offset "; - AppendNumberTo(&r, pos); - r += "; "; +static void VersionEditPrinter(Slice record) { VersionEdit edit; Status s = edit.DecodeFrom(record); if (!s.ok()) { - r += s.ToString(); - r.push_back('\n'); - } else { - r += edit.DebugString(); + printf("%s\n", s.ToString().c_str()); + return; } - dst->Append(r); + printf("%s", edit.DebugString().c_str()); } -Status DumpDescriptor(Env* env, const std::string& fname, WritableFile* dst) { - return PrintLogContents(env, fname, VersionEditPrinter, dst); +bool DumpDescriptor(Env* env, const std::string& fname) { + return PrintLogContents(env, fname, VersionEditPrinter); } -Status DumpTable(Env* env, const std::string& fname, WritableFile* dst) { +bool DumpTable(Env* env, const std::string& fname) { uint64_t file_size; RandomAccessFile* file = NULL; Table* table = NULL; @@ -156,70 +137,102 @@ Status DumpTable(Env* env, const std::string& fname, WritableFile* dst) { s = Table::Open(Options(), file, file_size, &table); } if (!s.ok()) { + fprintf(stderr, "%s\n", s.ToString().c_str()); delete table; delete file; - return s; + return false; } ReadOptions ro; ro.fill_cache = false; Iterator* iter = table->NewIterator(ro); - std::string r; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - r.clear(); ParsedInternalKey key; if (!ParseInternalKey(iter->key(), &key)) { - r = "badkey '"; - AppendEscapedStringTo(&r, iter->key()); - r += "' => '"; - AppendEscapedStringTo(&r, iter->value()); - r += "'\n"; - dst->Append(r); + printf("badkey '%s' => '%s'\n", + EscapeString(iter->key()).c_str(), + EscapeString(iter->value()).c_str()); } else { - r = "'"; - AppendEscapedStringTo(&r, key.user_key); - r += "' @ "; - AppendNumberTo(&r, key.sequence); - r += " : "; + char kbuf[20]; + const char* type; if (key.type == kTypeDeletion) { - r += "del"; + type = "del"; } else if (key.type == kTypeValue) { - r += "val"; + type = "val"; } 
else { - AppendNumberTo(&r, key.type); + snprintf(kbuf, sizeof(kbuf), "%d", static_cast(key.type)); + type = kbuf; } - r += " => '"; - AppendEscapedStringTo(&r, iter->value()); - r += "'\n"; - dst->Append(r); + printf("'%s' @ %8llu : %s => '%s'\n", + EscapeString(key.user_key).c_str(), + static_cast(key.sequence), + type, + EscapeString(iter->value()).c_str()); } } s = iter->status(); if (!s.ok()) { - dst->Append("iterator error: " + s.ToString() + "\n"); + printf("iterator error: %s\n", s.ToString().c_str()); } delete iter; delete table; delete file; - return Status::OK(); + return true; } -} // namespace - -Status DumpFile(Env* env, const std::string& fname, WritableFile* dst) { +bool DumpFile(Env* env, const std::string& fname) { FileType ftype; if (!GuessType(fname, &ftype)) { - return Status::InvalidArgument(fname + ": unknown file type"); + fprintf(stderr, "%s: unknown file type\n", fname.c_str()); + return false; } switch (ftype) { - case kLogFile: return DumpLog(env, fname, dst); - case kDescriptorFile: return DumpDescriptor(env, fname, dst); - case kTableFile: return DumpTable(env, fname, dst); - default: + case kLogFile: return DumpLog(env, fname); + case kDescriptorFile: return DumpDescriptor(env, fname); + case kTableFile: return DumpTable(env, fname); + + default: { + fprintf(stderr, "%s: not a dump-able file type\n", fname.c_str()); break; + } } - return Status::InvalidArgument(fname + ": not a dump-able file type"); + return false; } +bool HandleDumpCommand(Env* env, char** files, int num) { + bool ok = true; + for (int i = 0; i < num; i++) { + ok &= DumpFile(env, files[i]); + } + return ok; +} + +} } // namespace leveldb + +static void Usage() { + fprintf( + stderr, + "Usage: leveldbutil command...\n" + " dump files... 
-- dump contents of specified files\n" + ); +} + +int main(int argc, char** argv) { + leveldb::Env* env = leveldb::Env::Default(); + bool ok = true; + if (argc < 2) { + Usage(); + ok = false; + } else { + std::string command = argv[1]; + if (command == "dump") { + ok = leveldb::HandleDumpCommand(env, argv+2, argc-2); + } else { + Usage(); + ok = false; + } + } + return (ok ? 0 : 1); +} diff --git a/src/leveldb/tools/leveldb_repair.cc b/src/leveldb/tools/leveldb_repair.cc new file mode 100644 index 000000000..a0cfb08fb --- /dev/null +++ b/src/leveldb/tools/leveldb_repair.cc @@ -0,0 +1,99 @@ +#include +#include + +#include "db/filename.h" +#include "leveldb/env.h" +#include "leveldb/db.h" +#include "leveldb/cache.h" +#include "leveldb/iterator.h" +#include "leveldb/filter_policy.h" +#include "leveldb/slice.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "table/format.h" +#include "table/block.h" +#include "table/filter_block.h" + +//#include "util/logging.h" +//#include "db/log_reader.h" + +void command_help(); + +int +main( + int argc, + char ** argv) +{ + bool error_seen, running; + int error_counter; + char ** cursor; + + running=true; + error_seen=false; + error_counter=0; + + + for (cursor=argv+1; NULL!=*cursor && running; ++cursor) + { + // option flag? + if ('-'==**cursor) + { + char flag; + + flag=*((*cursor)+1); + switch(flag) + { + default: + fprintf(stderr, " option \'%c\' is not valid\n", flag); + command_help(); + running=false; + error_counter+=1; + error_seen=true; + break; + } // switch + } // if + + // database path + else + { + std::string dbname; + leveldb::Options options; + leveldb::Status status; + + dbname=*cursor; + options.env=leveldb::Env::Default(); + + status=leveldb::RepairDB(dbname.c_str(), options); + printf("Repair of %s %s.\n", + dbname.c_str(), + (status.ok() ? 
"successful" : "failed")); + + if (!status.ok()) + { + ++error_counter; + error_seen=true; + } // if + } // else + } // for + + if (1==argc) + command_help(); + + return( error_seen && 0!=error_counter ? 1 : 0 ); + +} // main + + +void +command_help() +{ + fprintf(stderr, "leveldb_repair [option | data_base]*\n"); + fprintf(stderr, " options\n"); + fprintf(stderr, " (none at this time)\n"); +} // command_help + +namespace leveldb { + + +} // namespace leveldb + diff --git a/src/leveldb/tools/pbuilder.list b/src/leveldb/tools/pbuilder.list new file mode 100644 index 000000000..bb9e82172 --- /dev/null +++ b/src/leveldb/tools/pbuilder.list @@ -0,0 +1,16 @@ +10.0.27.222 debian6 +10.0.27.239 debian7 +10.0.27.249 fedora19 +10.0.27.190 freebsd9.2 +10.0.27.220 rhel5 +10.0.27.221 rhel6 +10.0.27.248 rhel7 +10.0.27.251 sles11 +10.0.27.234 smartos1.8 +10.0.27.217 solaris10 +10.0.27.213 ubuntuLucid +10.0.27.212 ubuntuPrecise +10.0.27.250 ubuntuTrusty +bsd-build.bos1 freebsd10 +mac-mini.bos1 osx10.8 +10.0.27.240 smartos13.1 diff --git a/src/leveldb/tools/pbuilder.sh b/src/leveldb/tools/pbuilder.sh new file mode 100755 index 000000000..fd28dcf40 --- /dev/null +++ b/src/leveldb/tools/pbuilder.sh @@ -0,0 +1,99 @@ +#! /bin/bash +## ./pbuilder.sh builder.list leveldb_tag +## or +## ./pbuilder.sh builder.list "" tarfile.tgz +## $0 $1 $2 $3 +## +## NOTE: you must manually ssh to each buildbot to get RSA fingerprint +## into your local known_hosts file before script works + +REPO=leveldb + +# +# main +# + +if [ $# == 2 ]; then + echo "2 parameters" + temp_path=$(mktemp) + temp_name=$(basename $temp_path) + echo "temp file " $temp_path + cat <$temp_path +rm -rf ~/$USER/$REPO +mkdir -p ~/$USER +cd ~/$USER +echo " Git start:: " \$(date) +git clone git@github.com:basho/$REPO +cd $REPO +git checkout $2 +export LD_LIBRARY_PATH=. + +echo "Make start: " \$(date) +if which gmake +then + if gmake -j 2 -s + then + echo "Build successful." + else + echo "Build failed." 
+ exit 1 + fi + echo "Test start: " \$(date) + if gmake -j 2 -s check >/dev/null + then + echo "Test successful." + else + echo "Test failed." + exit 1 + fi +else + if make -j 2 -s + then + echo "Build successful." + else + echo "Build failed." + exit 1 + fi + echo "Test start: " \$(date) + if make -j 2 -s check >/dev/null + then + echo "Test successful." + else + echo "Test failed." + exit 1 + fi +fi +echo " Test end: " \$(date) +EOF + +# +#. /usr/local/erlang-r16b02/activate +#echo "Build name: " $REPO\_$2_"\$1" +#export build_name="$REPO\_$2_\$1" +#echo "Again: $build_name" +#env + + + chmod a+x $temp_path + + mkdir -p ~/builds/$REPO + rm ~/builds/$REPO/out + rm ~/builds/$REPO/err + parallel --tag -a $1 --gnu --colsep '[ ]{1,}' scp $temp_path buildbot@{1}:~/. >>~/builds/$REPO/out 2>>~/builds/$REPO/err + parallel --tag -a $1 --gnu --colsep '[ ]{1,}' ssh -q buildbot@{1} ./$temp_name {2} >>~/builds/$REPO/out 2>>~/builds/$REPO/err + parallel --tag -a $1 --gnu --colsep '[ ]{1,}' ssh -q buildbot@{1} rm $temp_name >>~/builds/$REPO/out 2>>~/builds/$REPO/err + echo "done" + rm $temp_path + + grep 'Test successful.' ~/builds/$REPO/out + grep 'Test successful.' 
~/builds/$REPO/out | wc -l + echo "Builder count: " $(wc -l $1) +elif [ $# == 3 ]; then + echo "3 parameters" +else + echo " ./pbuilder.sh leveldb_tag" + echo " or" + echo " ./pbuilder.sh builder.list \"\" tarfile.tgz" +fi + +exit 0 diff --git a/src/leveldb/tools/perf_dump.cc b/src/leveldb/tools/perf_dump.cc new file mode 100644 index 000000000..551923268 --- /dev/null +++ b/src/leveldb/tools/perf_dump.cc @@ -0,0 +1,173 @@ +#include +#include +#include + +#include "leveldb/env.h" +#include "leveldb/perf_count.h" +#include "port/port.h" + +#define __STDC_FORMAT_MACROS +#include + +void command_help(); + +int +main( + int argc, + char ** argv) +{ + bool error_seen, csv_header, diff_mode, running; + int error_counter; + unsigned diff_seconds; + char ** cursor; + + running=true; + error_seen=false; + error_counter=0; + + csv_header=false; + diff_mode=false; + diff_seconds=1; + + + for (cursor=argv+1; NULL!=*cursor && running; ++cursor) + { + // option flag? + if ('-'==**cursor) + { + char flag; + + flag=*((*cursor)+1); + switch(flag) + { + case 'h': csv_header=true; break; + case 'd': + diff_mode=true; + ++cursor; + diff_seconds=strtoul(*cursor, NULL, 10); + break; + + default: + fprintf(stderr, " option \'%c\' is not valid\n", flag); + command_help(); + running=false; + error_counter=1; + error_seen=true; + break; + } // switch + } // if + + // non flag params + else + { + fprintf(stderr, " option \'%s\' is not valid\n", *cursor); + command_help(); + running=false; + error_counter=1; + error_seen=true; + } // else + } // for + + // attach to shared memory if params looking good + if (!error_seen) + { + const leveldb::PerformanceCounters * perf_ptr; + bool first_pass; + + first_pass=true; + perf_ptr=leveldb::PerformanceCounters::Init(true); + + if (NULL!=perf_ptr) + { + uint64_t first_time; + int loop; + + first_time=leveldb::port::TimeMicros(); + + if (csv_header) + { + csv_header=false; + printf("time, diff time, name, count\n"); + } // if + + if (diff_mode) + { + 
uint64_t prev_counters[leveldb::ePerfCountEnumSize], cur_counters[leveldb::ePerfCountEnumSize]; + uint64_t cur_time; + + do + { + // capture state before reporting + cur_time=leveldb::port::TimeMicros(); + for (loop=0; loopValue(loop); + } // for + + if (!first_pass) + { + for (loop=0; loopValue(loop)); + } // for + } // else + } // if + else + { + fprintf(stderr, "unable to attach to shared memory, error %d\n", + leveldb::PerformanceCounters::m_LastError); + ++error_counter; + error_seen=true; + } // else + } // if + + if (error_seen) + command_help(); + + return( error_seen && 0!=error_counter ? 1 : 0 ); + +} // main + + +void +command_help() +{ + fprintf(stderr, "perf_dump [option]*\n"); + fprintf(stderr, " options\n"); + fprintf(stderr, " -h print csv formatted header line (once)\n"); + fprintf(stderr, " -d n print diff ever n seconds\n"); +} // command_help + +namespace leveldb { + + +} // namespace leveldb + diff --git a/src/leveldb/tools/ppackager.sh b/src/leveldb/tools/ppackager.sh new file mode 100755 index 000000000..46f912083 --- /dev/null +++ b/src/leveldb/tools/ppackager.sh @@ -0,0 +1,136 @@ +#! /bin/bash +## ./ppackager.sh builder.list leveldb_tag +## or +## ./ppackager.sh builder.list "" tarfile.tgz +## $0 $1 $2 $3 +## +## NOTE: you must manually ssh to each buildbot to get RSA fingerprint +## into your local known_hosts file before script works + +REPO=eleveldb + +# +# main +# + +if [ $# == 2 ]; then + echo "2 parameters" + temp_path=$(mktemp) + temp_name=$(basename $temp_path) + echo "temp file " $temp_path + cat <$temp_path +rm -rf ~/$USER/$REPO +mkdir -p ~/$USER +cd ~/$USER +echo " Git start:: " \$(date) +git clone git@github.com:basho/$REPO +cd $REPO +git checkout $2 +sed -i -e 's/% #!sed//' rebar.config test/eleveldb_schema_tests.erl +export LD_LIBRARY_PATH=. +rm ~/$USER/eleveldb_$2\* + +. /usr/local/erlang-r16b02/activate + +echo "Make start: " \$(date) +if hash gmake 2>/dev/null +then + if gmake -j 2 -s + then + echo "Build successful." 
+ else + echo "Build failed." + exit 1 + fi + echo "Test start: " \$(date) + if gmake -s test >/dev/null + then + echo "Test successful." + else + echo "Test failed." + #exit 1 + fi +else + if make -j 2 -s + then + echo "Build successful." + else + echo "Build failed." + exit 1 + fi + echo "Test start: " \$(date) + if make -s test >/dev/null + then + echo "Test successful." + else + echo "Test failed." + #exit 1 + fi +fi +echo " Test end: " \$(date) + +cd priv +cp -p ../ebin/eleveldb.beam . + +# hack to deal with the fact that md5sum may be in a weird place on smartos +export PATH=$PATH:/opt/local/gnu/bin + +if hash md5sum 2>/dev/null +then + echo calling md5sum + md5sum eleveldb.beam eleveldb.so >md5sum.txt +else + if hash md5 2>/dev/null + then + echo calling md5 + md5 -r eleveldb.beam eleveldb.so >md5sum.txt + else + // solaris does not have the md5sum or md5 commands, so use digest + echo calling digest + digest -a md5 eleveldb.beam eleveldb.so >md5sum.txt + fi +fi + +if uname -a | grep solaris >/dev/null +then + echo running tar and gzip on solaris + tar cf - eleveldb.beam eleveldb.so md5sum.txt | gzip -c > ~/$USER/eleveldb_$2_\$1.tar.gz +else + echo running gnu tar with -z option + tar -czf ~/$USER/eleveldb_$2_\$1.tar.gz eleveldb.beam eleveldb.so md5sum.txt +fi + +EOF + +# +#echo "Build name: " $REPO\_$2_"\$1" +#export build_name="$REPO\_$2_\$1" +#echo "Again: $build_name" +#env + + + chmod a+x $temp_path + + mkdir -p ~/builds/$REPO + rm ~/builds/$REPO/out + rm ~/builds/$REPO/err + rm ~/builds/$REPO/eleveldb_$2* + parallel --tag -a $1 --gnu --colsep '[ ]{1,}' scp $temp_path buildbot@{1}:~/. 
>>~/builds/$REPO/out 2>>~/builds/$REPO/err + parallel --tag -a $1 --gnu --colsep '[ ]{1,}' ssh -q buildbot@{1} ./$temp_name {2} >>~/builds/$REPO/out 2>>~/builds/$REPO/err + parallel --tag -a $1 --gnu --colsep '[ ]{1,}' ssh -q buildbot@{1} rm $temp_name >>~/builds/$REPO/out 2>>~/builds/$REPO/err + parallel --tag -a $1 --gnu --colsep '[ ]{1,}' scp -q buildbot@{1}:~/$USER/eleveldb_$2\* ~/builds/$REPO/. + echo "done" + rm $temp_path + + grep 'Test successful.' ~/builds/$REPO/out + grep 'Test successful.' ~/builds/$REPO/out | wc -l + echo "Packager count: " $(wc -l $1) +elif [ $# == 3 ]; then + echo "3 parameters" +else + echo " ./ppackager.sh leveldb_tag" + echo " or" + echo " ./ppackager.sh builder.list \"\" tarfile.tgz" +fi + +exit 0 diff --git a/src/leveldb/tools/sst_rewrite.cc b/src/leveldb/tools/sst_rewrite.cc new file mode 100644 index 000000000..49f233038 --- /dev/null +++ b/src/leveldb/tools/sst_rewrite.cc @@ -0,0 +1,398 @@ +// ------------------------------------------------------------------- +// +// sst_rewrite.cc +// +// Copyright (c) 2015 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// ------------------------------------------------------------------- + +#include +#include +#include + +#define __STDC_FORMAT_MACROS +#include + +#include "leveldb/env.h" +#include "leveldb/filter_policy.h" +#include "leveldb/options.h" +#include "leveldb/table.h" +#include "leveldb/table_builder.h" + +void command_help(); + +// wrapper class for opening / closing existing leveldb tables +class LDbTable +{ +public: + LDbTable(leveldb::Options &, std::string &); + virtual ~LDbTable(); + + bool Ok() const {return(m_IsOpen);}; + leveldb::Iterator * NewIterator(); + + const leveldb::Status & GetStatus() const {return(m_LastStatus);}; + const char * GetFileName() const {return(m_FileName.c_str());}; + + uint64_t GetSstCounter(unsigned Idx) const + {return(m_IsOpen ? m_TablePtr->GetSstCounters().Value(Idx) : 0);}; + +protected: + leveldb::Options & m_Options; + const std::string m_FileName; + leveldb::RandomAccessFile * m_FilePtr; + leveldb::Table * m_TablePtr; + uint64_t m_FileSize; + leveldb::Status m_LastStatus; + + bool m_IsOpen; + + void Reset(); + +private: + // disable these + LDbTable(); + LDbTable(const LDbTable &); + const LDbTable operator=(const LDbTable&); +}; // LDbTable + + +LDbTable::LDbTable( + leveldb::Options & Options, + std::string & FileName) + : m_Options(Options), m_FileName(FileName), + m_FilePtr(NULL), m_TablePtr(NULL), m_FileSize(0), m_IsOpen(false) +{ + m_LastStatus=m_Options.env->GetFileSize(m_FileName, &m_FileSize); + + if (m_LastStatus.ok()) + {m_LastStatus=m_Options.env->NewRandomAccessFile(m_FileName, &m_FilePtr);} + + if (m_LastStatus.ok()) + { + m_LastStatus=leveldb::Table::Open(m_Options, m_FilePtr, m_FileSize, &m_TablePtr); + + // use fadvise to start file pre-read + m_FilePtr->SetForCompaction(m_FileSize); + } // if + + m_IsOpen=m_LastStatus.ok(); + + if (!m_IsOpen) + { + // some people would throw() at this point, but not me + Reset(); + } // if + + return; + +} // LDbTable::LDbTable + + +LDbTable::~LDbTable() +{ + Reset(); + 
+ return; + +} // LDbTable::~LDbTable + + +void +LDbTable::Reset() +{ + m_IsOpen=false; + delete m_TablePtr; + m_TablePtr=NULL; + delete m_FilePtr; + m_FilePtr=NULL; + m_FileSize=0; + + return; + +} // LDbTable::Reset + + +leveldb::Iterator * +LDbTable::NewIterator() +{ + leveldb::Iterator * ret_ptr(NULL); + + if (m_IsOpen) + { + leveldb::ReadOptions read_options; + + read_options.fill_cache=false; + ret_ptr=m_TablePtr->NewIterator(read_options); + } // if + + return(ret_ptr); + +} // LDbTable::NewIterator + + +int +main( + int argc, + char ** argv) +{ + bool error_seen, running, compare_files; + char ** cursor; + + compare_files=false; + error_seen=false; + running=true; + + // Options: needs filter & total_leveldb_mem initialized + leveldb::Options options; + + // using 16 bit width per key in bloom filter + options.filter_policy=leveldb::NewBloomFilterPolicy2(16); + // tell leveldb it can use 512Mbyte of memory + options.total_leveldb_mem=(512 << 20); + + for (cursor=argv+1; + NULL!=*cursor && running && !error_seen; + ++cursor) + { + // option flag? + if ('-'==**cursor) + { + char flag; + + flag=*((*cursor)+1); + switch(flag) + { + case 'b': + { + error_seen=(NULL==(cursor+1)); + if (!error_seen) + { + ++cursor; + options.block_size=atol(*cursor); + }; + break; + } // case b + + case 's': options.compression=leveldb::kSnappyCompression; break; + case 'z': options.compression=leveldb::kLZ4Compression; break; + case 'n': options.compression=leveldb::kNoCompression; break; + + case 'c': + { + // test for first pair ... 
but after that user beware + error_seen=(NULL==(cursor+1)) || (NULL==(cursor+2)); + if (!error_seen) + {compare_files=true;} + break; + } // case c + + case 'w': compare_files=false; break; + + default: + fprintf(stderr, " option \'%c\' is not valid\n", flag); + command_help(); + running=false; + error_seen=true; + break; + } // switch + } // if + + // sst file + else + { + std::string fname; + fname=*cursor; + + // do a rewrite + if (!compare_files) + { + leveldb::WritableFile * outfile; + leveldb::Status s; + std::auto_ptr it; + std::auto_ptr builder; + + LDbTable in_file(options, fname); + + if (in_file.GetStatus().ok()) + { + it.reset(in_file.NewIterator()); + + fname.append(".new"); + s = options.env->NewWritableFile(fname, &outfile, + options.env->RecoveryMmapSize(&options)); + if (s.ok()) + builder.reset(new leveldb::TableBuilder(options, outfile)); + else + { + // Table::Open failed on file "fname" + fprintf(stderr, "%s: NewWritableFile failed (%s)\n", + fname.c_str(), s.ToString().c_str()); + error_seen=true; + } // else + + for (it->SeekToFirst(); + it->Valid() && s.ok() && builder->status().ok(); + it->Next()) + { + leveldb::Slice key = it->key(); + builder->Add(key, it->value()); + } // for + + // hmmm, nothing new setting status right now. + if (s.ok() && builder->status().ok()) { + s = builder->Finish(); + } else { + builder->Abandon(); + } + + if (NULL!=outfile) + outfile->Close(); + delete outfile; + } // if + else + { + fprintf(stderr, "%s: Input table open failed (%s)\n", + fname.c_str(), in_file.GetStatus().ToString().c_str()); + error_seen=true; + } // else + } // if + + // compare two files + else + { + LDbTable file1(options, fname); + + ++cursor; + if (NULL!=*cursor) + { + fname=*cursor; + LDbTable file2(options, fname); + + if (file1.GetStatus().ok() && file2.GetStatus().ok()) + { + // quick check: same number of keys and bytes of user data? 
+ // do this before reading entire files + if (file1.GetSstCounter(leveldb::eSstCountKeys)==file2.GetSstCounter(leveldb::eSstCountKeys) + && file1.GetSstCounter(leveldb::eSstCountKeySize)==file2.GetSstCounter(leveldb::eSstCountKeySize) + && file1.GetSstCounter(leveldb::eSstCountValueSize)==file2.GetSstCounter(leveldb::eSstCountValueSize)) + { + leveldb::Iterator * it1, *it2; + uint64_t key_count; + bool match; + + it1=file1.NewIterator(); + it2=file2.NewIterator(); + match=true; + + for (it1->SeekToFirst(), it2->SeekToFirst(), key_count=1; + it1->Valid() && it2->Valid() && match; + it1->Next(), it2->Next(), ++key_count) + { + match=(0==it1->key().compare(it2->key())) && (0==it1->value().compare(it2->value())); + + if (!match) + { + fprintf(stderr, "%s, %s: Content mismatch at key position %d (%d, %d).\n", + file1.GetFileName(), file2.GetFileName(), + (int)key_count, + it1->key().compare(it2->key()), it1->value().compare(it2->value())); + error_seen=true; + } // if + + } // for + + if (it1->Valid() != it2->Valid()) + { + fprintf(stderr, "%s, %s: Walk of keys terminated early (%d, %d).\n", + file1.GetFileName(), file2.GetFileName(), + (int)it1->Valid(), (int)it2->Valid()); + error_seen=true; + } + } // if + else + { + if (file1.GetSstCounter(leveldb::eSstCountKeys)==file2.GetSstCounter(leveldb::eSstCountKeys)) + fprintf(stderr, "%s, %s: Number of keys different, %" PRIu64 " vs %" PRIu64 ".\n", + file1.GetFileName(), file2.GetFileName(), + file1.GetSstCounter(leveldb::eSstCountKeys), + file2.GetSstCounter(leveldb::eSstCountKeys)); + + if (file1.GetSstCounter(leveldb::eSstCountKeySize)==file2.GetSstCounter(leveldb::eSstCountKeySize)) + fprintf(stderr, "%s, %s: Byte size of all keys different, %" PRIu64 " vs %" PRIu64 "\n", + file1.GetFileName(), file2.GetFileName(), + file1.GetSstCounter(leveldb::eSstCountKeySize), + file2.GetSstCounter(leveldb::eSstCountKeySize)); + + if 
(file1.GetSstCounter(leveldb::eSstCountValueSize)==file2.GetSstCounter(leveldb::eSstCountValueSize)) + fprintf(stderr, "%s, %s: Byte size of all values different, %" PRIu64 " vs %" PRIu64 "\n", + file1.GetFileName(), file2.GetFileName(), + file1.GetSstCounter(leveldb::eSstCountValueSize), + file2.GetSstCounter(leveldb::eSstCountValueSize)); + error_seen=true; + } // else + } // if + else + { + if (!file1.GetStatus().ok()) + fprintf(stderr, "%s: Input table open failed (%s)\n", + file1.GetFileName(), file1.GetStatus().ToString().c_str()); + if (!file2.GetStatus().ok()) + fprintf(stderr, "%s: Input table open failed (%s)\n", + file2.GetFileName(), file2.GetStatus().ToString().c_str()); + error_seen=true; + } // else + } // if + else + { + fprintf(stderr, "%s: compare needs two file names, only have one\n", + fname.c_str()); + } // else + } // else + } // else + } // for + + // cleanup + options.env->Shutdown(); + delete options.filter_policy; + + if (1==argc) + command_help(); + + return( error_seen ? 1 : 0 ); + +} // main + + +void +command_help() +{ + fprintf(stderr, "sst_rewrite [option | file]*\n"); + fprintf(stderr, " options\n"); + fprintf(stderr, " -b value set Options.block_size to value\n"); + fprintf(stderr, " -n set Options.compression to No compression\n"); + fprintf(stderr, " -s set Options.compression to Snappy compression\n"); + fprintf(stderr, " -z set Options.compression to LZ4 compression\n"); + fprintf(stderr, " -c compare next two files (inverse of -w)\n"); + fprintf(stderr, " -w rewrite next file (default, inverse of -c)\n"); +} // command_help + +namespace leveldb { + + +} // namespace leveldb + diff --git a/src/leveldb/tools/sst_scan.cc b/src/leveldb/tools/sst_scan.cc new file mode 100644 index 000000000..3c93a9b42 --- /dev/null +++ b/src/leveldb/tools/sst_scan.cc @@ -0,0 +1,563 @@ +// ------------------------------------------------------------------- +// +// sst_scan.cc +// +// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved. 
+// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include +#include +#include + +#define __STDC_FORMAT_MACROS +#include + +#include "db/filename.h" +#include "leveldb/env.h" +#include "leveldb/db.h" +#include "leveldb/cache.h" +#include "leveldb/filter_policy.h" +#include "leveldb/slice.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "table/format.h" +#include "table/block.h" +#include "table/filter_block.h" +#include "util/cache2.h" + +//#include "leveldb_ee/riak_object.h" + +//#include "util/logging.h" +//#include "db/log_reader.h" + +void command_help(); +bool PrintSextKey(leveldb::Slice & Cursor, int Limit=1); +bool PrintSextAtom(leveldb::Slice & Cursor); +void PrintInternalKeyInfo(leveldb::ParsedInternalKey & ParsedKey); + +int +main( + int argc, + char ** argv) +{ + bool error_seen, index_keys, all_keys, block_info, csv_header, counter_info, + running, no_csv, summary_only, riak_translations, value_dump; + int error_counter; + char ** cursor; + + running=true; + error_seen=false; + + block_info=false; + counter_info=false; + index_keys=false; + csv_header=false; + all_keys=false; + no_csv=false; + summary_only=false; + riak_translations=false; + value_dump=false; + + error_counter=0; + + + for (cursor=argv+1; NULL!=*cursor && running; ++cursor) + { + // option flag? 
+ if ('-'==**cursor) + { + char flag; + + flag=*((*cursor)+1); + switch(flag) + { + case 'b': block_info=true; break; + case 'c': counter_info=true; break; + case 'h': csv_header=true; break; + case 'i': index_keys=true; break; + case 'k': all_keys=true; break; + case 'n': no_csv=true; break; + case 'r': riak_translations=true; break; + case 's': summary_only=true; break; + case 'v': all_keys=true; value_dump=true; break; + default: + fprintf(stderr, " option \'%c\' is not valid\n", flag); + command_help(); + running=false; + error_counter=1; + error_seen=true; + break; + } // switch + } // if + + // sst file + else + { + leveldb::Options options; + leveldb::DoubleCache double_cache(options); + leveldb::ReadOptions read_options; + std::string table_name, dbname, path_temp; + leveldb::Env * env; + leveldb::FileMetaData meta; + leveldb::TableCache * table_cache; + env=leveldb::Env::Default(); + + const int search_level = -2; + const bool is_overlapped = search_level < 3; // temporary: see TableCache::Evict() + + // make copy since basename() and dirname() may modify + path_temp=*cursor; + dbname=dirname((char *)path_temp.c_str()); + dbname=MakeTieredDbname(dbname, options); + path_temp=*cursor; + table_name=basename((char *)path_temp.c_str()); + meta.number=strtol(table_name.c_str(), NULL, 10); + + options.filter_policy=leveldb::NewBloomFilterPolicy(10); + table_cache=new leveldb::TableCache(dbname, &options, double_cache.GetFileCache(), double_cache); + table_name = leveldb::TableFileName(options, meta.number, search_level); + + // open table, step 1 get file size + leveldb::Status status = env->GetFileSize(table_name, &meta.file_size); + if (!status.ok()) + { + fprintf(stderr, "%s: GetFileSize failed (%s)\n", table_name.c_str(),status.ToString().c_str()); + error_seen=true; + error_counter=10; + } // if + + //open table, step 2 find table (cache or open) + if (status.ok()) + { + leveldb::Cache::Handle * fhandle; + + fhandle=NULL; + + 
status=table_cache->TEST_FindTable(meta.number, meta.file_size, search_level, &fhandle); + + // count keys and size keys/filter + if (status.ok()) + { + leveldb::Table* table; + leveldb::Iterator *it, *it2; + int count, count2, total, block_count; + size_t tot_size, smallest_block, tot_compress, tot_uncompress; + bool first; + leveldb::Status status; + leveldb::RandomAccessFile * file; + + total=0; + count=0; + count2=0; + tot_size=0; + + table = reinterpret_cast(table_cache->TEST_GetInternalCache()->Value(fhandle))->table; + table->ReadFilter(); + file = reinterpret_cast(table_cache->TEST_GetInternalCache()->Value(fhandle))->file; + it = table->TEST_GetIndexBlock()->NewIterator(options.comparator); + + + // walk keys in index block + if (index_keys) + { + for (it->SeekToFirst(), count=0; it->Valid(); it->Next()) + { + ++count; + if (it->status().ok()) + { + leveldb::ParsedInternalKey parsed; + leveldb::Slice key = it->key(); + leveldb::Slice value = it->value(); + + ParseInternalKey(key, &parsed); + printf("key %zd, value %zd: %s\n", key.size(), value.size(), parsed.DebugStringHex().c_str()); + } // if + else + { + fprintf(stderr, "%s: index iterator failed (%s)\n", table_name.c_str(),it->status().ToString().c_str()); + } // else + } // for + } // if + + // Walk all blocks (but nothing within block) + smallest_block=0; + first=true; + block_count=0; + tot_compress=0; + tot_uncompress=0; + + for (it->SeekToFirst(), count=0; it->Valid() && !summary_only; it->Next()) + { + leveldb::BlockContents contents; + leveldb::BlockHandle bhandle; + leveldb::Slice slice; + + ++block_count; + slice=it->value(); + bhandle.DecodeFrom(&slice); + + if (block_info) + { + printf("block %d, offset %" PRIu64 ", size %" PRIu64 ", next %" PRIu64 "\n", + block_count, bhandle.offset(), bhandle.size(), bhandle.offset()+bhandle.size()); + } // if + + tot_compress+=bhandle.size(); + status=leveldb::ReadBlock(file, read_options, bhandle, &contents); + if (status.ok()) + { + if (first) + { + 
first=false; + smallest_block=contents.data.size(); + } // if + else if (contents.data.size()SeekToFirst(), count=0; it->Valid() && !summary_only; it->Next()) + { + ++count; + it2=leveldb::Table::TEST_BlockReader(table, read_options, it->value()); + for (it2->SeekToFirst(), count2=0; it2->Valid(); it2->Next()) + { + ++count2; + ++total; + if (it2->status().ok()) + { + tot_size+=it2->value().size(); + + if (all_keys) + { + leveldb::ParsedInternalKey parsed; + leveldb::Slice key = it2->key(); + + ParseInternalKey(key, &parsed); + printf("%s block_key %s\n", parsed.DebugStringHex().c_str(), table_name.c_str()); + + if (riak_translations && '\x10'==*parsed.user_key.data()) + { + leveldb::Slice cursor_slice; + + cursor_slice=parsed.user_key; + printf(" "); + PrintSextKey(cursor_slice); + printf("\n"); + printf(" "); + PrintInternalKeyInfo(parsed); + printf("\n"); + + cursor_slice=parsed.user_key; + } // if + + if (value_dump) + { + printf(" %s\n", HexString(it2->value()).c_str()); + } // if + } // if + } // if + else + { + fprintf(stderr, "%s: value iterator failed, location [%d, %d] (%s)\n", + table_name.c_str(),count, count2,it2->status().ToString().c_str()); + } // else + } // for + + delete it2; + } // for + + delete it; + + if (!no_csv) + { + if (csv_header) + { + csv_header=false; + printf("Table File, File size, Index size, Index key count, "); + printf("total key count, total value size, average value size, smallest block, ratio*100, "); + printf("table object size, filter size"); + + if (counter_info) + { + unsigned loop; + leveldb::SstCounters counters; + + counters=table->GetSstCounters(); + + for (loop=0; loopTEST_GetIndexBlock()->size(), count); + + printf(" %d, %zd, %zd, %zd, %zd,", + total, tot_size, (0!=total) ? tot_size/total : 0, smallest_block, + (0!=tot_compress) ? 
(tot_uncompress*100)/tot_compress: 0); + + printf(" %zd, %zd", + table->TEST_TableObjectSize(), table->TEST_FilterDataSize()); + + if (counter_info || summary_only) + { + unsigned loop; + leveldb::SstCounters counters; + + counters=table->GetSstCounters(); + + for (loop=0; loopEvict(meta.number, is_overlapped); + } // if + else + { + fprintf(stderr, "%s: FindTable failed (%s)\n", table_name.c_str(),status.ToString().c_str()); + error_seen=true; + error_counter=1; + } // else + } // if + + // cleanup + delete table_cache; + delete options.filter_policy; + + } // else + } // for + + if (1==argc) + command_help(); + + return( error_seen && 0!=error_counter ? 1 : 0 ); + +} // main + + +void +command_help() +{ + fprintf(stderr, "sst_scan [option | file]*\n"); + fprintf(stderr, " options\n"); + fprintf(stderr, " -b print details about block\n"); + fprintf(stderr, " -c print sst counters\n"); + fprintf(stderr, " -h print csv formatted header line (once)\n"); + fprintf(stderr, " -i print index keys\n"); + fprintf(stderr, " -k print all keys\n"); + fprintf(stderr, " -n NO csv data (or header)\n"); + fprintf(stderr, " -r print riak translations\n"); + fprintf(stderr, " -v print all keys and their values\n"); + +} // command_help + + +/** + * Recursive routine to give idea of key contents + */ +bool +PrintSextKey( + leveldb::Slice & Cursor, + int Limit) +{ + int loop; + bool good(true); + + for (loop=0; loop>"); + break; + } // atom + } // switch + } // for + + return(good); + +} // PrintSextKey + + +bool +PrintSextAtom( + leveldb::Slice & Cursor) +{ + bool good(true); + uint8_t mask(0x80); + char output; + + while(good && (uint8_t)*Cursor.data() & mask) + { + // this could be done easier with variables instead of fixed constants + switch(mask) + { + case(0x80): + { + output=(*Cursor.data() & 0x7f) << 1; + Cursor.remove_prefix(1); + output+=(*Cursor.data() & 0x80) >> 7; + printf("%c",output); + mask=0x40; + break; + } + + case(0x40): + { + output=(*Cursor.data() & 0x3f) << 2; 
+ Cursor.remove_prefix(1); + output+=(*Cursor.data() & 0xc0) >> 6; + printf("%c",output); + mask=0x20; + break; + } + + case(0x20): + { + output=(*Cursor.data() & 0x1f) << 3; + Cursor.remove_prefix(1); + output+=(*Cursor.data() & 0xe0) >> 5; + printf("%c",output); + mask=0x10; + break; + } + + case(0x10): + { + output=(*Cursor.data() & 0x0f) << 4; + Cursor.remove_prefix(1); + output+=(*Cursor.data() & 0xf0) >> 4; + printf("%c",output); + mask=0x08; + break; + } + + case(0x08): + { + output=(*Cursor.data() & 0x07) << 5; + Cursor.remove_prefix(1); + output+=(*Cursor.data() & 0xf8) >> 3; + printf("%c",output); + mask=0x04; + break; + } + + case(0x04): + { + output=(*Cursor.data() & 0x03) << 6; + Cursor.remove_prefix(1); + output+=(*Cursor.data() & 0xfc) >> 2; + printf("%c",output); + mask=0x02; + break; + } + + case(0x02): + { + output=(*Cursor.data() & 0x01) << 7; + Cursor.remove_prefix(1); + output+=(*Cursor.data() & 0xfe) >> 1; + printf("%c",output); + mask=0x01; + break; + } + + case(0x01): + { + Cursor.remove_prefix(1); + output=*Cursor.data(); + Cursor.remove_prefix(1); + printf("%c",output); + mask=0x80; + break; + } + } // switch + + } // while + + Cursor.remove_prefix(2); + + return(good); + +} // PrintSextAtom + + +void +PrintInternalKeyInfo( + leveldb::ParsedInternalKey & ParsedKey) +{ + printf("%s, seq: %" PRIu64, leveldb::KeyTypeString(ParsedKey.type), ParsedKey.sequence); + + if (leveldb::IsExpiryKey(ParsedKey.type)) + printf(", expiry: %" PRIu64, ParsedKey.expiry); + +} // PrintInternalKeyInfo + +namespace leveldb { + + +} // namespace leveldb + diff --git a/src/leveldb/util/arena.cc b/src/leveldb/util/arena.cc index 74078213e..9551d6a3a 100644 --- a/src/leveldb/util/arena.cc +++ b/src/leveldb/util/arena.cc @@ -9,7 +9,8 @@ namespace leveldb { static const int kBlockSize = 4096; -Arena::Arena() : memory_usage_(0) { +Arena::Arena() { + blocks_memory_ = 0; alloc_ptr_ = NULL; // First allocation will allocate a block alloc_bytes_remaining_ = 0; } @@ -39,7 
+40,7 @@ char* Arena::AllocateFallback(size_t bytes) { } char* Arena::AllocateAligned(size_t bytes) { - const int align = (sizeof(void*) > 8) ? sizeof(void*) : 8; + const int align = sizeof(void*); // We'll align to pointer size assert((align & (align-1)) == 0); // Pointer size should be a power of 2 size_t current_mod = reinterpret_cast(alloc_ptr_) & (align-1); size_t slop = (current_mod == 0 ? 0 : align - current_mod); @@ -59,9 +60,8 @@ char* Arena::AllocateAligned(size_t bytes) { char* Arena::AllocateNewBlock(size_t block_bytes) { char* result = new char[block_bytes]; + blocks_memory_ += block_bytes; blocks_.push_back(result); - memory_usage_.NoBarrier_Store( - reinterpret_cast(MemoryUsage() + block_bytes + sizeof(char*))); return result; } diff --git a/src/leveldb/util/arena.h b/src/leveldb/util/arena.h index 48bab3374..8f7dde226 100644 --- a/src/leveldb/util/arena.h +++ b/src/leveldb/util/arena.h @@ -5,11 +5,10 @@ #ifndef STORAGE_LEVELDB_UTIL_ARENA_H_ #define STORAGE_LEVELDB_UTIL_ARENA_H_ +#include #include #include -#include #include -#include "port/port.h" namespace leveldb { @@ -25,9 +24,10 @@ class Arena { char* AllocateAligned(size_t bytes); // Returns an estimate of the total memory usage of data allocated - // by the arena. + // by the arena (including space allocated but not yet used for user + // allocations). size_t MemoryUsage() const { - return reinterpret_cast(memory_usage_.NoBarrier_Load()); + return blocks_memory_ + blocks_.capacity() * sizeof(char*); } private: @@ -41,8 +41,8 @@ class Arena { // Array of new[] allocated memory blocks std::vector blocks_; - // Total memory usage of the arena. 
- port::AtomicPointer memory_usage_; + // Bytes of memory in blocks allocated so far + size_t blocks_memory_; // No copying allowed Arena(const Arena&); diff --git a/src/leveldb/util/arena_test.cc b/src/leveldb/util/arena_test.cc index 58e870ec4..63d177803 100644 --- a/src/leveldb/util/arena_test.cc +++ b/src/leveldb/util/arena_test.cc @@ -40,7 +40,7 @@ TEST(ArenaTest, Simple) { r = arena.Allocate(s); } - for (size_t b = 0; b < s; b++) { + for (int b = 0; b < s; b++) { // Fill the "i"th allocation with a known bit pattern r[b] = i % 256; } @@ -51,10 +51,10 @@ TEST(ArenaTest, Simple) { ASSERT_LE(arena.MemoryUsage(), bytes * 1.10); } } - for (size_t i = 0; i < allocated.size(); i++) { + for (int i = 0; i < allocated.size(); i++) { size_t num_bytes = allocated[i].first; const char* p = allocated[i].second; - for (size_t b = 0; b < num_bytes; b++) { + for (int b = 0; b < num_bytes; b++) { // Check the "i"th allocation for the known bit pattern ASSERT_EQ(int(p[b]) & 0xff, i % 256); } diff --git a/src/leveldb/util/bloom.cc b/src/leveldb/util/bloom.cc index bf3e4ca6e..1cb63d1c3 100644 --- a/src/leveldb/util/bloom.cc +++ b/src/leveldb/util/bloom.cc @@ -2,8 +2,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include #include "leveldb/filter_policy.h" +#include "db/dbformat.h" #include "leveldb/slice.h" #include "util/hash.h" @@ -29,7 +31,7 @@ class BloomFilterPolicy : public FilterPolicy { } virtual const char* Name() const { - return "leveldb.BuiltinBloomFilter2"; + return "leveldb.BuiltinBloomFilter"; } virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { @@ -47,7 +49,7 @@ class BloomFilterPolicy : public FilterPolicy { dst->resize(init_size + bytes, 0); dst->push_back(static_cast(k_)); // Remember # of probes in filter char* array = &(*dst)[init_size]; - for (int i = 0; i < n; i++) { + for (size_t i = 0; i < (size_t)n; i++) { // Use double-hashing to generate a sequence of hash values. // See analysis in [Kirsch,Mitzenmacher 2006]. uint32_t h = BloomHash(keys[i]); @@ -92,4 +94,19 @@ const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) { return new BloomFilterPolicy(bits_per_key); } +// container to hold one bloom filter and auto destruct +struct BloomInventoryItem +{ + std::auto_ptr m_Item; + + BloomInventoryItem() + { + m_Item.reset(new InternalFilterPolicy2(NewBloomFilterPolicy(16))); + FilterInventory::AddFilterToInventory(m_Item.get()); + }; +}; // struct BloomInventoryItem + +// bloom filter for reading, created on start-up +static BloomInventoryItem lBloomItem; + } // namespace leveldb diff --git a/src/leveldb/util/bloom2.cc b/src/leveldb/util/bloom2.cc new file mode 100644 index 000000000..5ffb2840c --- /dev/null +++ b/src/leveldb/util/bloom2.cc @@ -0,0 +1,1447 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include <memory>  // NOTE(review): header name was blank here; <memory> assumed because std::auto_ptr is used below - confirm
+#include "leveldb/filter_policy.h"
+
+#include "db/dbformat.h"
+#include "leveldb/slice.h"
+#include "util/hash.h"
+#include "util/murmurhash.h"
+
+namespace leveldb {
+// Forward declarations; both helpers are called by BloomFilterPolicy2 below.
+static unsigned Bytes2Prime(unsigned Bytes);
+static unsigned Bits2PrimeNBytes(unsigned Bits, unsigned & BytesOut);
+
+
+namespace {
+static uint32_t BloomHash0(const Slice& key) {  // first of the two probe hashes
+  return Hash(key.data(), key.size(), 0xbc9f1d34);
+}
+// Second probe hash: MurmurHash() result cast to uint32_t.
+static uint32_t BloomHash1(const Slice& key) {
+  return((uint32_t)MurmurHash(key.data(), key.size(), 0x5bd1e995));
+}
+// Filter policy that probes with two hashes and reduces bit positions modulo the value from Bits2PrimeNBytes()/Bytes2Prime().
+class BloomFilterPolicy2 : public FilterPolicy {
+ private:
+  size_t bits_per_key_;  // filter bits budgeted per key
+  size_t k_;             // number of probes per key, kept within [1, 30]
+
+ public:
+  explicit BloomFilterPolicy2(int bits_per_key)
+      : bits_per_key_(bits_per_key) {
+    // We intentionally round down to reduce probing cost a little bit
+    k_ = static_cast<size_t>(bits_per_key * 0.69);  // 0.69 =~ ln(2)
+    if (k_ < 1) k_ = 1;    // at least one probe
+    if (k_ > 30) k_ = 30;  // KeyMayMatch() treats a stored k above 30 as "always match"
+  }
+  // Identifier string reported for this filter policy.
+  virtual const char* Name() const {
+    return "leveldb.BuiltinBloomFilter2";
+  }
+  // Appends to *dst a filter for keys[0..n-1]: `bytes` bytes of bit array followed by one byte holding k_.
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const
+  {
+    unsigned bytes;  // filled in by Bits2PrimeNBytes() below
+
+    // Compute bloom filter size (in both bits and bytes)
+    size_t bits = n * bits_per_key_;
+
+    // For small n, we can see a very high false positive rate.  Fix it
+    // by enforcing a minimum bloom filter length.
+    if (bits < 61) bits = 61;
+
+    const unsigned prime=Bits2PrimeNBytes(bits, bytes);  // modulus for bit positions; also sets `bytes`
+
+    const size_t init_size = dst->size();
+    dst->resize(init_size + bytes, 0);  // zero-filled bit array appended after any existing content
+    dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
+    char* array = &(*dst)[init_size];  // start of this filter's bit array
+    for (size_t i = 0; i < (size_t)n; i++) {
+      // Use double-hashing to generate a sequence of hash values.
+      // See analysis in [Kirsch,Mitzenmacher 2006].
+      uint32_t h = BloomHash0(keys[i]);
+      uint32_t h2= BloomHash1(keys[i]);
+      const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+      for (size_t j = 0; j < k_; j++) {
+        const uint32_t bitpos = (h + ((j+1)*h2)) % prime;  // NOTE(review): j is size_t, so the sum is computed at size_t width before the modulo
+        array[bitpos/8] |= (1 << (bitpos % 8));
+        h += delta;
+      }
+    }
+  }
+  // Returns false only when a probed bit for `key` is clear; the last byte of bloom_filter is the stored probe count.
+  virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const {
+    const size_t len = bloom_filter.size();
+    if (len < 2) return false;  // need at least one bit-array byte plus the trailing k byte
+
+    const char* array = bloom_filter.data();
+    const unsigned prime=Bytes2Prime(len-1);  // len-1 = size of the bit array without the k byte
+
+    // Use the encoded k so that we can read filters generated by
+    // bloom filters created using different parameters.
+    const size_t k = array[len-1];  // char -> size_t: bytes >= 0x80 end up above 30 whether char is signed or not
+    if (k > 30) {
+      // Reserved for potentially new encodings for short bloom filters.
+      // Consider it a match.
+      return true;
+    }
+
+    uint32_t h = BloomHash0(key);
+    uint32_t h2= BloomHash1(key);
+    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+    for (size_t j = 0; j < k; j++) {
+      const uint32_t bitpos = (h + ((j+1)*h2)) % prime;  // same probe formula as CreateFilter()
+      if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
+      h += delta;
+    }
+    return true;
+  }
+};
+}
+// Returns a new heap-allocated BloomFilterPolicy2; it is not freed in this function.
+const FilterPolicy* NewBloomFilterPolicy2(int bits_per_key) {
+  return new BloomFilterPolicy2(bits_per_key);
+}
+
+
+// Container that owns one InternalFilterPolicy2 wrapping BloomFilterPolicy2(16), registers it with FilterInventory, and auto destructs.
+struct BloomInventoryItem2
+{
+    std::auto_ptr<const FilterPolicy> m_Item;  // NOTE(review): template argument was blank; const FilterPolicy assumed (InternalFilterPolicy2 would also fit reset()) - confirm
+
+    BloomInventoryItem2()
+    {
+        m_Item.reset(new InternalFilterPolicy2(NewBloomFilterPolicy2(16)));
+        FilterInventory::AddFilterToInventory(m_Item.get());  // the inventory is handed a raw pointer; m_Item keeps ownership
+    };
+};  // struct BloomInventoryItem2
+
+// bloom filter for reading, created on start-up (file-scope object, so its constructor runs during static initialization)
+static BloomInventoryItem2 lBloom2Item;
+
+
+
+// Sparse table of primes: the array index is a byte count and the entry is
+// a prime number of bits that fits within that many bytes.
+// The prime table only helps where key count is roughly 6,250
+// or less.  It adds accuracy to smaller populations.
+static unsigned ByteSizePrimes[]= +{ + 0, 7, 13, 23, 31, 37, 47, 53, 61, 71, + 79, 83, 89, 103, 109, 113, 127, 131, 139, 151, + 157, 167, 173, 181, 191, 199, 199, 211, 223, 229, + 239, 241, 251, 263, 271, 277, 283, 293, 293, 311, + 317, 317, 331, 337, 349, 359, 367, 373, 383, 389, + 397, 401, 409, 421, 431, 439, 443, 449, 463, 467, + 479, 487, 491, 503, 509, 509, 523, 523, 541, 547, + 557, 563, 571, 577, 587, 599, 607, 613, 619, 631, + 631, 647, 653, 661, 661, 677, 683, 691, 701, 709, + 719, 727, 733, 743, 751, 757, 761, 773, 773, 787, + 797, 797, 811, 823, 829, 839, 839, 853, 863, 863, + 877, 887, 887, 887, 911, 919, 919, 929, 941, 947, + 953, 967, 971, 983, 991, 997, 997, 1013, 1021, 1031, + 1039, 1039, 1051, 1063, 1069, 1069, 1087, 1093, 1103, 1109, + 1117, 1123, 1129, 1129, 1151, 1153, 1163, 1171, 1181, 1187, + 1193, 1201, 1213, 1223, 1231, 1237, 1237, 1249, 1259, 1259, + 1279, 1283, 1291, 1303, 1307, 1319, 1327, 1327, 1327, 1327, + 1327, 1367, 1373, 1381, 1381, 1399, 1399, 1409, 1423, 1429, + 1439, 1447, 1453, 1459, 1471, 1471, 1487, 1493, 1499, 1511, + 1511, 1523, 1531, 1543, 1549, 1559, 1567, 1571, 1583, 1583, + 1597, 1607, 1613, 1621, 1627, 1637, 1637, 1637, 1663, 1669, + 1669, 1669, 1693, 1699, 1709, 1709, 1723, 1733, 1741, 1747, + 1759, 1759, 1759, 1783, 1789, 1789, 1801, 1811, 1823, 1831, + 1831, 1847, 1847, 1861, 1871, 1879, 1879, 1889, 1901, 1907, + 1913, 1913, 1933, 1933, 1951, 1951, 1951, 1973, 1979, 1987, + 1999, 2003, 2011, 2017, 2029, 2039, 2039, 2053, 2063, 2069, + 2069, 2087, 2089, 2099, 2111, 2113, 2113, 2131, 2143, 2143, + 2153, 2161, 2161, 2179, 2179, 2179, 2207, 2213, 2221, 2221, + 2239, 2243, 2251, 2251, 2269, 2273, 2287, 2293, 2297, 2311, + 2311, 2311, 2333, 2341, 2351, 2357, 2357, 2371, 2383, 2389, + 2399, 2399, 2411, 2423, 2423, 2437, 2447, 2447, 2459, 2467, + 2477, 2477, 2477, 2503, 2503, 2503, 2521, 2531, 2543, 2551, + 2557, 2557, 2557, 2579, 2591, 2593, 2593, 2609, 2621, 2621, + 2633, 2647, 2647, 2663, 2671, 2677, 2687, 2693, 2699, 
2711, + 2719, 2719, 2731, 2741, 2749, 2753, 2767, 2767, 2777, 2791, + 2797, 2803, 2803, 2819, 2819, 2837, 2843, 2851, 2861, 2861, + 2879, 2887, 2887, 2903, 2909, 2917, 2927, 2927, 2939, 2939, + 2957, 2963, 2971, 2971, 2971, 2999, 3001, 3011, 3023, 3023, + 3037, 3041, 3049, 3061, 3067, 3079, 3083, 3089, 3089, 3109, + 3119, 3121, 3121, 3137, 3137, 3137, 3167, 3169, 3181, 3191, + 3191, 3203, 3209, 3221, 3229, 3229, 3229, 3253, 3259, 3271, + 3271, 3271, 3271, 3301, 3307, 3319, 3323, 3331, 3343, 3347, + 3359, 3361, 3373, 3373, 3391, 3391, 3407, 3413, 3413, 3413, + 3433, 3433, 3449, 3463, 3469, 3469, 3469, 3491, 3499, 3511, + 3517, 3527, 3533, 3541, 3547, 3559, 3559, 3571, 3583, 3583, + 3593, 3607, 3613, 3623, 3631, 3637, 3643, 3643, 3659, 3671, + 3677, 3677, 3691, 3701, 3709, 3719, 3727, 3733, 3739, 3739, + 3739, 3767, 3769, 3779, 3779, 3797, 3803, 3803, 3823, 3823, + 3833, 3847, 3853, 3863, 3863, 3877, 3881, 3889, 3889, 3911, + 3919, 3923, 3931, 3943, 3947, 3947, 3967, 3967, 3967, 3989, + 3989, 4007, 4013, 4021, 4027, 4027, 4027, 4051, 4057, 4057, + 4079, 4079, 4093, 4099, 4111, 4111, 4127, 4133, 4139, 4139, + 4159, 4159, 4159, 4177, 4177, 4177, 4201, 4211, 4219, 4231, + 4231, 4243, 4253, 4261, 4271, 4273, 4283, 4289, 4297, 4297, + 4297, 4327, 4327, 4339, 4349, 4357, 4363, 4373, 4373, 4391, + 4397, 4397, 4409, 4423, 4423, 4423, 4447, 4451, 4463, 4463, + 4463, 4483, 4493, 4493, 4507, 4519, 4523, 4523, 4523, 4549, + 4549, 4567, 4567, 4583, 4591, 4597, 4603, 4603, 4621, 4621, + 4639, 4643, 4651, 4663, 4663, 4679, 4679, 4691, 4703, 4703, + 4703, 4723, 4733, 4733, 4751, 4759, 4759, 4759, 4783, 4789, + 4799, 4801, 4813, 4817, 4831, 4831, 4831, 4831, 4861, 4871, + 4877, 4877, 4889, 4903, 4909, 4919, 4919, 4933, 4943, 4951, + 4957, 4967, 4973, 4973, 4987, 4999, 5003, 5011, 5023, 5023, + 5039, 5039, 5051, 5059, 5059, 5077, 5087, 5087, 5101, 5107, + 5119, 5119, 5119, 5119, 5147, 5153, 5167, 5171, 5179, 5189, + 5197, 5197, 5209, 5209, 5231, 5237, 5237, 5237, 5261, 5261, + 5279, 
5281, 5281, 5303, 5309, 5309, 5323, 5333, 5333, 5351, + 5351, 5351, 5351, 5381, 5387, 5399, 5407, 5413, 5419, 5431, + 5437, 5443, 5449, 5449, 5471, 5479, 5483, 5483, 5503, 5507, + 5519, 5527, 5531, 5531, 5531, 5557, 5563, 5573, 5581, 5591, + 5591, 5591, 5591, 5623, 5623, 5639, 5647, 5653, 5659, 5669, + 5669, 5683, 5693, 5701, 5711, 5717, 5717, 5717, 5743, 5749, + 5749, 5749, 5749, 5783, 5791, 5791, 5807, 5813, 5821, 5827, + 5839, 5843, 5851, 5861, 5869, 5879, 5881, 5881, 5903, 5903, + 5903, 5927, 5927, 5939, 5939, 5953, 5953, 5953, 5981, 5987, + 5987, 6007, 6011, 6011, 6029, 6037, 6047, 6053, 6053, 6067, + 6079, 6079, 6091, 6101, 6101, 6113, 6121, 6133, 6143, 6151, + 6151, 6163, 6173, 6173, 6173, 6199, 6203, 6211, 6221, 6229, + 6229, 6247, 6247, 6263, 6271, 6277, 6287, 6287, 6301, 6311, + 6317, 6323, 6329, 6343, 6343, 6359, 6367, 6373, 6379, 6389, + 6397, 6397, 6397, 6421, 6427, 6427, 6427, 6451, 6451, 6469, + 6473, 6481, 6491, 6491, 6491, 6491, 6521, 6529, 6529, 6551, + 6553, 6563, 6571, 6581, 6581, 6599, 6607, 6607, 6619, 6619, + 6637, 6637, 6653, 6661, 6661, 6679, 6679, 6691, 6703, 6709, + 6719, 6719, 6733, 6737, 6737, 6737, 6763, 6763, 6781, 6791, + 6793, 6803, 6803, 6823, 6829, 6833, 6841, 6841, 6863, 6871, + 6871, 6883, 6883, 6899, 6911, 6917, 6917, 6917, 6917, 6949, + 6959, 6967, 6971, 6983, 6991, 6997, 7001, 7013, 7019, 7027, + 7039, 7043, 7043, 7057, 7069, 7079, 7079, 7079, 7103, 7109, + 7109, 7127, 7129, 7129, 7151, 7159, 7159, 7159, 7177, 7187, + 7193, 7207, 7213, 7219, 7229, 7237, 7247, 7253, 7253, 7253, + 7253, 7283, 7283, 7297, 7309, 7309, 7321, 7333, 7333, 7351, + 7351, 7351, 7369, 7369, 7369, 7393, 7393, 7411, 7417, 7417, + 7433, 7433, 7451, 7459, 7459, 7477, 7487, 7489, 7499, 7507, + 7517, 7523, 7529, 7541, 7549, 7559, 7561, 7573, 7583, 7591, + 7591, 7607, 7607, 7621, 7621, 7639, 7643, 7649, 7649, 7669, + 7673, 7687, 7691, 7703, 7703, 7717, 7727, 7727, 7741, 7741, + 7759, 7759, 7759, 7759, 7789, 7793, 7793, 7793, 7823, 7829, + 7829, 7841, 7853, 
7853, 7867, 7879, 7883, 7883, 7901, 7907, + 7919, 7927, 7933, 7937, 7951, 7951, 7963, 7963, 7963, 7963, + 7993, 7993, 8011, 8017, 8017, 8039, 8039, 8053, 8059, 8069, + 8069, 8087, 8093, 8101, 8111, 8117, 8123, 8123, 8123, 8147, + 8147, 8167, 8171, 8179, 8191, 8191, 8191, 8209, 8221, 8231, + 8237, 8243, 8243, 8263, 8269, 8273, 8287, 8293, 8297, 8311, + 8317, 8317, 8329, 8329, 8329, 8353, 8363, 8369, 8377, 8389, + 8389, 8389, 8389, 8423, 8431, 8431, 8447, 8447, 8461, 8467, + 8467, 8467, 8467, 8501, 8501, 8513, 8527, 8527, 8543, 8543, + 8543, 8563, 8573, 8581, 8581, 8599, 8599, 8609, 8623, 8629, + 8629, 8647, 8647, 8663, 8669, 8677, 8681, 8693, 8699, 8707, + 8719, 8719, 8731, 8741, 8747, 8753, 8761, 8761, 8783, 8783, + 8783, 8807, 8807, 8821, 8831, 8839, 8839, 8849, 8863, 8867, + 8867, 8887, 8893, 8893, 8893, 8893, 8923, 8933, 8941, 8951, + 8951, 8963, 8971, 8971, 8971, 8999, 9007, 9013, 9013, 9029, + 9029, 9043, 9049, 9059, 9067, 9067, 9067, 9091, 9103, 9109, + 9109, 9127, 9133, 9137, 9151, 9157, 9161, 9173, 9181, 9187, + 9199, 9203, 9209, 9221, 9227, 9239, 9241, 9241, 9257, 9257, + 9277, 9283, 9293, 9293, 9311, 9319, 9323, 9323, 9343, 9349, + 9349, 9349, 9371, 9377, 9391, 9397, 9403, 9413, 9421, 9431, + 9439, 9439, 9439, 9463, 9467, 9479, 9479, 9491, 9497, 9511, + 9511, 9521, 9533, 9539, 9551, 9551, 9551, 9551, 9551, 9587, + 9587, 9601, 9613, 9623, 9631, 9631, 9643, 9649, 9661, 9661, + 9679, 9679, 9689, 9697, 9697, 9719, 9721, 9733, 9743, 9749, + 9749, 9767, 9769, 9781, 9791, 9791, 9803, 9811, 9817, 9829, + 9839, 9839, 9851, 9859, 9871, 9871, 9887, 9887, 9901, 9907, + 9907, 9923, 9931, 9941, 9949, 9949, 9967, 9973, 9973, 9973, + 9973, 10007, 10009, 10009, 10009, 10039, 10039, 10039, 10061, 10069, + 10079, 10079, 10093, 10103, 10111, 10111, 10111, 10133, 10141, 10151, + 10159, 10163, 10169, 10181, 10181, 10193, 10193, 10211, 10223, 10223, + 10223, 10247, 10253, 10259, 10271, 10273, 10273, 10289, 10303, 10303, + 10313, 10321, 10333, 10343, 10343, 10357, 10357, 10369, 
10369, 10391, + 10399, 10399, 10399, 10399, 10429, 10433, 10433, 10453, 10463, 10463, + 10477, 10487, 10487, 10501, 10501, 10513, 10513, 10531, 10531, 10531, + 10559, 10567, 10567, 10567, 10589, 10597, 10607, 10613, 10613, 10631, + 10639, 10639, 10651, 10663, 10667, 10667, 10687, 10691, 10691, 10711, + 10711, 10723, 10733, 10739, 10739, 10753, 10753, 10771, 10781, 10789, + 10799, 10799, 10799, 10799, 10831, 10837, 10847, 10853, 10861, 10867, + 10867, 10883, 10891, 10903, 10909, 10909, 10909, 10909, 10939, 10949, + 10957, 10957, 10973, 10979, 10987, 10993, 11003, 11003, 11003, 11027, + 11027, 11047, 11047, 11059, 11071, 11071, 11087, 11093, 11093, 11093, + 11119, 11119, 11131, 11131, 11149, 11159, 11161, 11173, 11177, 11177, + 11197, 11197, 11213, 11213, 11213, 11239, 11243, 11251, 11261, 11261, + 11279, 11287, 11287, 11299, 11311, 11317, 11321, 11329, 11329, 11351, + 11353, 11353, 11369, 11383, 11383, 11399, 11399, 11411, 11423, 11423, + 11437, 11447, 11447, 11447, 11471, 11471, 11483, 11491, 11503, 11503, + 11519, 11527, 11527, 11527, 11551, 11551, 11551, 11551, 11579, 11587, + 11597, 11597, 11597, 11621, 11621, 11633, 11633, 11633, 11657, 11657, + 11677, 11681, 11689, 11701, 11701, 11719, 11719, 11731, 11743, 11743, + 11743, 11743, 11743, 11783, 11789, 11789, 11807, 11813, 11821, 11831, + 11839, 11839, 11839, 11863, 11867, 11867, 11887, 11887, 11903, 11909, + 11909, 11927, 11933, 11941, 11941, 11959, 11959, 11971, 11981, 11987, + 11987, 12007, 12011, 12011, 12011, 12037, 12043, 12049, 12049, 12071, + 12073, 12073, 12073, 12101, 12109, 12119, 12119, 12119, 12143, 12149, + 12157, 12163, 12163, 12163, 12163, 12197, 12203, 12211, 12211, 12227, + 12239, 12241, 12253, 12263, 12269, 12277, 12281, 12289, 12301, 12301, + 12301, 12323, 12329, 12343, 12347, 12347, 12347, 12373, 12379, 12391, + 12391, 12401, 12413, 12421, 12421, 12437, 12437, 12451, 12457, 12457, + 12479, 12487, 12491, 12503, 12511, 12517, 12527, 12527, 12541, 12547, + 12553, 12553, 12569, 12583, 12589, 
12589, 12601, 12613, 12619, 12619, + 12637, 12647, 12653, 12659, 12671, 12671, 12671, 12689, 12703, 12703, + 12713, 12721, 12721, 12743, 12743, 12757, 12763, 12763, 12781, 12791, + 12799, 12799, 12809, 12823, 12829, 12829, 12841, 12853, 12853, 12853, + 12853, 12853, 12893, 12899, 12911, 12919, 12923, 12923, 12941, 12941, + 12959, 12967, 12973, 12983, 12983, 12983, 13007, 13009, 13009, 13009, + 13037, 13043, 13049, 13063, 13063, 13063, 13063, 13093, 13103, 13109, + 13109, 13127, 13127, 13127, 13151, 13159, 13163, 13171, 13183, 13187, + 13187, 13187, 13187, 13219, 13229, 13229, 13241, 13249, 13259, 13267, + 13267, 13267, 13291, 13297, 13309, 13313, 13327, 13331, 13339, 13339, + 13339, 13367, 13367, 13381, 13381, 13399, 13399, 13411, 13421, 13421, + 13421, 13441, 13451, 13463, 13469, 13477, 13487, 13487, 13499, 13499, + 13513, 13523, 13523, 13537, 13537, 13553, 13567, 13567, 13577, 13591, + 13597, 13597, 13613, 13619, 13627, 13633, 13633, 13649, 13649, 13669, + 13679, 13687, 13693, 13697, 13711, 13711, 13723, 13729, 13729, 13751, + 13759, 13763, 13763, 13781, 13789, 13799, 13807, 13807, 13807, 13831, + 13831, 13841, 13841, 13859, 13859, 13879, 13883, 13883, 13903, 13907, + 13913, 13921, 13933, 13933, 13933, 13933, 13967, 13967, 13967, 13967, + 13999, 13999, 14011, 14011, 14029, 14033, 14033, 14051, 14057, 14071, + 14071, 14087, 14087, 14087, 14107, 14107, 14107, 14107, 14143, 14149, + 14159, 14159, 14173, 14177, 14177, 14197, 14207, 14207, 14221, 14221, + 14221, 14243, 14251, 14251, 14251, 14251, 14281, 14293, 14303, 14303, + 14303, 14327, 14327, 14341, 14347, 14347, 14347, 14369, 14369, 14389, + 14389, 14407, 14411, 14423, 14431, 14437, 14447, 14449, 14461, 14461, + 14479, 14479, 14489, 14503, 14503, 14519, 14519, 14533, 14543, 14551, + 14557, 14563, 14563, 14563, 14591, 14593, 14593, 14593, 14621, 14629, + 14639, 14639, 14653, 14657, 14669, 14669, 14683, 14683, 14699, 14699, + 14717, 14723, 14731, 14741, 14747, 14759, 14767, 14771, 14783, 14783, + 14797, 14797, 
14813, 14821, 14831, 14831, 14843, 14851, 14851, 14869, + 14879, 14887, 14891, 14897, 14897, 14897, 14923, 14929, 14939, 14951, + 14957, 14957, 14969, 14983, 14983, 14983, 14983, 15013, 15017, 15031, + 15031, 15031, 15053, 15061, 15061, 15077, 15083, 15091, 15101, 15107, + 15107, 15121, 15131, 15139, 15149, 15149, 15161, 15173, 15173, 15187, + 15199, 15199, 15199, 15217, 15227, 15233, 15241, 15241, 15263, 15271, + 15277, 15287, 15289, 15299, 15307, 15319, 15319, 15331, 15331, 15349, + 15359, 15361, 15373, 15383, 15391, 15391, 15401, 15413, 15413, 15427, + 15439, 15443, 15451, 15461, 15467, 15473, 15473, 15493, 15497, 15511, + 15511, 15527, 15527, 15541, 15551, 15559, 15559, 15569, 15583, 15583, + 15583, 15607, 15607, 15619, 15629, 15629, 15647, 15649, 15661, 15671, + 15679, 15683, 15683, 15683, 15683, 15683, 15727, 15733, 15739, 15749, + 15749, 15767, 15773, 15773, 15791, 15797, 15803, 15809, 15823, 15823, + 15823, 15823, 15823, 15859, 15859, 15877, 15887, 15889, 15901, 15907, + 15919, 15923, 15923, 15937, 15937, 15959, 15959, 15973, 15973, 15991, + 15991, 16007, 16007, 16007, 16007, 16033, 16033, 16033, 16063, 16069, + 16073, 16087, 16091, 16103, 16111, 16111, 16127, 16127, 16141, 16141, + 16141, 16141, 16141, 16183, 16189, 16193, 16193, 16193, 16223, 16231, + 16231, 16231, 16253, 16253, 16267, 16273, 16273, 16273, 16301, 16301, + 16319, 16319, 16333, 16339, 16349, 16349, 16363, 16369, 16381, 16381, + 16381, 16381, 16411, 16421, 16427, 16433, 16447, 16453, 16453, 16453, + 16477, 16487, 16493, 16493, 16493, 16519, 16519, 16529, 16529, 16547, + 16553, 16567, 16573, 16573, 16573, 16573, 16607, 16607, 16619, 16631, + 16633, 16633, 16651, 16661, 16661, 16673, 16673, 16693, 16703, 16703, + 16703, 16703, 16729, 16741, 16747, 16759, 16763, 16763, 16763, 16787, + 16787, 16787, 16811, 16823, 16831, 16831, 16843, 16843, 16843, 16871, + 16879, 16883, 16889, 16903, 16903, 16903, 16927, 16931, 16943, 16943, + 16943, 16963, 16963, 16981, 16987, 16993, 16993, 17011, 17021, 17029, 
+ 17033, 17047, 17053, 17053, 17053, 17077, 17077, 17093, 17099, 17107, + 17117, 17123, 17123, 17137, 17137, 17159, 17167, 17167, 17183, 17191, + 17191, 17207, 17209, 17209, 17231, 17239, 17239, 17239, 17257, 17257, + 17257, 17257, 17293, 17299, 17299, 17317, 17327, 17333, 17341, 17351, + 17359, 17359, 17359, 17383, 17389, 17393, 17401, 17401, 17419, 17431, + 17431, 17443, 17449, 17449, 17471, 17477, 17483, 17491, 17497, 17509, + 17519, 17519, 17519, 17539, 17551, 17551, 17551, 17573, 17581, 17581, + 17599, 17599, 17609, 17623, 17627, 17627, 17627, 17627, 17659, 17669, + 17669, 17683, 17683, 17683, 17707, 17713, 17713, 17729, 17737, 17749, + 17749, 17761, 17761, 17783, 17791, 17791, 17807, 17807, 17807, 17827, + 17839, 17839, 17851, 17863, 17863, 17863, 17881, 17891, 17903, 17911, + 17911, 17923, 17929, 17939, 17939, 17959, 17959, 17971, 17981, 17989, + 17989, 17989, 18013, 18013, 18013, 18013, 18047, 18049, 18061, 18061, + 18077, 18077, 18089, 18097, 18097, 18119, 18127, 18133, 18143, 18149, + 18149, 18149, 18169, 18181, 18191, 18199, 18199, 18211, 18223, 18229, + 18233, 18233, 18253, 18257, 18269, 18269, 18287, 18289, 18301, 18311, + 18313, 18313, 18329, 18341, 18341, 18353, 18367, 18371, 18379, 18379, + 18397, 18401, 18413, 18413, 18427, 18439, 18443, 18451, 18461, 18461, + 18461, 18481, 18493, 18503, 18503, 18517, 18523, 18523, 18541, 18541, + 18553, 18553, 18553, 18583, 18587, 18593, 18593, 18593, 18617, 18617, + 18637, 18637, 18637, 18661, 18671, 18679, 18679, 18691, 18701, 18701, + 18719, 18719, 18731, 18743, 18749, 18757, 18757, 18773, 18773, 18787, + 18797, 18803, 18803, 18803, 18803, 18839, 18839, 18839, 18859, 18869, + 18869, 18869, 18869, 18899, 18911, 18919, 18919, 18919, 18919, 18947, + 18959, 18959, 18973, 18979, 18979, 18979, 19001, 19013, 19013, 19031, + 19037, 19037, 19051, 19051, 19069, 19079, 19087, 19087, 19087, 19087, + 19087, 19121, 19121, 19141, 19141, 19157, 19163, 19163, 19183, 19183, + 19183, 19207, 19213, 19219, 19231, 19237, 19237, 
19249, 19259, 19267, + 19273, 19273, 19289, 19301, 19309, 19319, 19319, 19333, 19333, 19333, + 19333, 19333, 19373, 19381, 19391, 19391, 19403, 19403, 19423, 19429, + 19433, 19447, 19447, 19463, 19471, 19477, 19483, 19489, 19501, 19507, + 19507, 19507, 19531, 19543, 19543, 19559, 19559, 19571, 19583, 19583, + 19597, 19603, 19609, 19609, 19609, 19609, 19609, 19609, 19661, 19661, + 19661, 19687, 19687, 19699, 19709, 19717, 19727, 19727, 19739, 19751, + 19759, 19763, 19763, 19777, 19777, 19793, 19801, 19813, 19819, 19819, + 19819, 19843, 19853, 19861, 19867, 19867, 19867, 19891, 19891, 19891, + 19919, 19927, 19927, 19937, 19949, 19949, 19963, 19973, 19979, 19991, + 19997, 19997, 20011, 20023, 20029, 20029, 20047, 20051, 20063, 20071, + 20071, 20071, 20089, 20101, 20107, 20117, 20123, 20129, 20143, 20149, + 20149, 20161, 20173, 20183, 20183, 20183, 20201, 20201, 20219, 20231, + 20233, 20233, 20249, 20261, 20269, 20269, 20287, 20287, 20297, 20297, + 20297, 20327, 20333, 20341, 20347, 20359, 20359, 20369, 20369, 20389, + 20399, 20407, 20411, 20411, 20431, 20431, 20443, 20443, 20443, 20443, + 20479, 20483, 20483, 20483, 20509, 20509, 20521, 20533, 20543, 20551, + 20551, 20563, 20563, 20563, 20563, 20599, 20599, 20611, 20611, 20627, + 20639, 20641, 20641, 20663, 20663, 20663, 20681, 20693, 20693, 20707, + 20719, 20719, 20731, 20743, 20749, 20759, 20759, 20773, 20773, 20789, + 20789, 20807, 20809, 20809, 20809, 20809, 20809, 20849, 20857, 20857, + 20879, 20887, 20887, 20903, 20903, 20903, 20921, 20929, 20939, 20947, + 20959, 20963, 20963, 20983, 20983, 20983, 21001, 21013, 21023, 21031, + 21031, 21031, 21031, 21061, 21067, 21067, 21067, 21089, 21101, 21107, + 21107, 21121, 21121, 21143, 21149, 21157, 21163, 21169, 21179, 21191, + 21193, 21193, 21211, 21221, 21227, 21227, 21247, 21247, 21247, 21269, + 21277, 21283, 21283, 21283, 21283, 21319, 21323, 21323, 21341, 21347, + 21347, 21347, 21347, 21383, 21391, 21397, 21407, 21407, 21419, 21419, + 21433, 21433, 21433, 21433, 
21467, 21467, 21487, 21493, 21503, 21503, + 21517, 21523, 21529, 21529, 21529, 21559, 21563, 21569, 21577, 21589, + 21599, 21601, 21613, 21617, 21617, 21617, 21647, 21649, 21661, 21661, + 21673, 21683, 21683, 21701, 21701, 21713, 21727, 21727, 21739, 21751, + 21757, 21767, 21773, 21773, 21787, 21799, 21803, 21803, 21821, 21821, + 21839, 21841, 21851, 21863, 21871, 21871, 21881, 21893, 21893, 21911, + 21911, 21911, 21929, 21943, 21943, 21943, 21961, 21961, 21977, 21991, + 21997, 22003, 22013, 22013, 22031, 22039, 22039, 22051, 22063, 22067, + 22079, 22079, 22093, 22093, 22111, 22111, 22123, 22133, 22133, 22147, + 22159, 22159, 22171, 22171, 22189, 22193, 22193, 22193, 22193, 22229, + 22229, 22247, 22247, 22259, 22271, 22279, 22283, 22291, 22303, 22307, + 22307, 22307, 22307, 22343, 22349, 22349, 22367, 22369, 22381, 22391, + 22397, 22397, 22409, 22409, 22409, 22433, 22447, 22453, 22453, 22469, + 22469, 22483, 22483, 22501, 22511, 22511, 22511, 22531, 22543, 22549, + 22549, 22567, 22573, 22573, 22573, 22573, 22573, 22613, 22621, 22621, + 22639, 22643, 22651, 22651, 22669, 22679, 22679, 22691, 22699, 22709, + 22717, 22727, 22727, 22741, 22751, 22751, 22751, 22769, 22783, 22787, + 22787, 22807, 22811, 22817, 22817, 22817, 22817, 22853, 22861, 22871, + 22877, 22877, 22877, 22901, 22907, 22907, 22921, 22921, 22943, 22943, + 22943, 22963, 22973, 22973, 22973, 22993, 23003, 23011, 23021, 23029, + 23039, 23041, 23053, 23063, 23071, 23071, 23087, 23087, 23099, 23099, + 23117, 23117, 23131, 23143, 23143, 23159, 23167, 23173, 23173, 23189, + 23197, 23203, 23209, 23209, 23227, 23227, 23227, 23251, 23251, 23269, + 23279, 23279, 23293, 23297, 23311, 23311, 23327, 23333, 23339, 23339, + 23357, 23357, 23371, 23371, 23371, 23399, 23399, 23399, 23417, 23431, + 23431, 23447, 23447, 23459, 23459, 23473, 23473, 23473, 23497, 23509, + 23509, 23509, 23531, 23539, 23549, 23557, 23567, 23567, 23581, 23581, + 23599, 23603, 23609, 23623, 23629, 23633, 23633, 23633, 23663, 23671, + 23677, 
23687, 23689, 23689, 23689, 23719, 23719, 23719, 23743, 23747, + 23753, 23767, 23773, 23773, 23789, 23789, 23801, 23813, 23819, 23831, + 23833, 23833, 23833, 23857, 23869, 23879, 23887, 23893, 23899, 23911, + 23917, 23917, 23929, 23929, 23929, 23957, 23957, 23971, 23981, 23981, + 23993, 24007, 24007, 24023, 24029, 24029, 24043, 24049, 24061, 24071, + 24077, 24083, 24091, 24103, 24109, 24113, 24121, 24133, 24137, 24151, + 24151, 24151, 24169, 24181, 24181, 24197, 24203, 24203, 24223, 24229, + 24239, 24247, 24251, 24251, 24251, 24251, 24281, 24281, 24281, 24281, + 24317, 24317, 24329, 24337, 24337, 24359, 24359, 24373, 24379, 24391, + 24391, 24407, 24413, 24421, 24421, 24439, 24443, 24443, 24443, 24469, + 24473, 24481, 24481, 24499, 24509, 24517, 24527, 24533, 24533, 24551, + 24551, 24551, 24571, 24571, 24571, 24593, 24593, 24611, 24623, 24631, + 24631, 24631, 24631, 24659, 24671, 24677, 24683, 24691, 24697, 24709, + 24709, 24709, 24733, 24733, 24749, 24749, 24767, 24767, 24781, 24781, + 24799, 24799, 24809, 24821, 24821, 24821, 24847, 24851, 24859, 24859, + 24877, 24877, 24889, 24889, 24907, 24919, 24923, 24923, 24943, 24943, + 24953, 24967, 24971, 24979, 24989, 24989, 24989, 25013, 25013, 25031, + 25037, 25037, 25037, 25057, 25057, 25073, 25087, 25087, 25097, 25111, + 25117, 25127, 25127, 25127, 25147, 25153, 25163, 25171, 25183, 25189, + 25189, 25189, 25189, 25219, 25229, 25237, 25247, 25253, 25261, 25261, + 25261, 25261, 25261, 25303, 25309, 25309, 25321, 25321, 25343, 25349, + 25357, 25367, 25373, 25373, 25391, 25391, 25391, 25411, 25423, 25423, + 25439, 25447, 25453, 25463, 25471, 25471, 25471, 25471, 25471, 25471, + 25471, 25523, 25523, 25541, 25541, 25541, 25561, 25561, 25583, 25589, + 25589, 25603, 25609, 25621, 25621, 25639, 25643, 25643, 25657, 25667, + 25679, 25679, 25693, 25703, 25703, 25717, 25717, 25733, 25741, 25747, + 25759, 25763, 25771, 25771, 25771, 25799, 25801, 25801, 25819, 25819, + 25819, 25847, 25849, 25849, 25867, 25873, 25873, 25889, 25903, 
25903, + 25919, 25919, 25933, 25943, 25951, 25951, 25951, 25969, 25981, 25981, + 25999, 26003, 26003, 26021, 26029, 26029, 26041, 26053, 26053, 26053, + 26053, 26083, 26083, 26099, 26111, 26119, 26119, 26119, 26141, 26141, + 26153, 26161, 26171, 26183, 26189, 26189, 26203, 26209, 26209, 26227, + 26237, 26237, 26251, 26263, 26267, 26267, 26267, 26293, 26297, 26309, + 26317, 26321, 26321, 26339, 26347, 26357, 26357, 26371, 26371, 26387, + 26399, 26407, 26407, 26423, 26431, 26437, 26437, 26449, 26459, 26459, + 26479, 26479, 26489, 26501, 26501, 26513, 26513, 26513, 26539, 26539, + 26557, 26561, 26573, 26573, 26591, 26597, 26597, 26597, 26597, 26627, + 26633, 26647, 26647, 26647, 26669, 26669, 26687, 26693, 26701, 26711, + 26717, 26723, 26731, 26737, 26737, 26759, 26759, 26759, 26783, 26783, + 26783, 26801, 26813, 26821, 26821, 26839, 26839, 26849, 26863, 26863, + 26879, 26881, 26893, 26903, 26903, 26903, 26927, 26927, 26927, 26951, + 26959, 26959, 26959, 26981, 26987, 26993, 26993, 27011, 27017, 27031, + 27031, 27043, 27043, 27061, 27067, 27077, 27077, 27091, 27103, 27109, + 27109, 27127, 27127, 27143, 27143, 27143, 27143, 27143, 27179, 27191, + 27197, 27197, 27211, 27211, 27211, 27239, 27241, 27253, 27259, 27271, + 27277, 27283, 27283, 27299, 27299, 27299, 27299, 27329, 27337, 27337, + 27337, 27367, 27367, 27367, 27367, 27397, 27407, 27409, 27409, 27431, + 27437, 27437, 27449, 27457, 27457, 27479, 27487, 27487, 27487, 27509, + 27509, 27527, 27529, 27541, 27551, 27551, 27551, 27551, 27583, 27583, + 27583, 27583, 27611, 27617, 27631, 27631, 27647, 27653, 27653, 27653, + 27673, 27673, 27691, 27701, 27701, 27701, 27701, 27733, 27743, 27751, + 27751, 27767, 27773, 27779, 27791, 27799, 27803, 27809, 27823, 27827, + 27827, 27847, 27851, 27851, 27851, 27851, 27883, 27893, 27901, 27901, + 27919, 27919, 27919, 27943, 27947, 27953, 27967, 27967, 27983, 27983, + 27997, 28001, 28001, 28019, 28031, 28031, 28031, 28051, 28057, 28069, + 28069, 28087, 28087, 28099, 28111, 28111, 
28123, 28123, 28123, 28151, + 28151, 28163, 28163, 28183, 28183, 28183, 28201, 28211, 28219, 28229, + 28229, 28229, 28229, 28229, 28229, 28279, 28283, 28289, 28297, 28309, + 28319, 28319, 28319, 28319, 28351, 28351, 28351, 28351, 28351, 28387, + 28393, 28403, 28411, 28411, 28429, 28439, 28447, 28447, 28463, 28463, + 28477, 28477, 28493, 28499, 28499, 28517, 28517, 28517, 28541, 28549, + 28559, 28559, 28573, 28579, 28591, 28597, 28607, 28607, 28621, 28631, + 28631, 28643, 28649, 28663, 28669, 28669, 28687, 28687, 28703, 28711, + 28711, 28723, 28729, 28729, 28751, 28759, 28759, 28771, 28771, 28789, + 28793, 28807, 28813, 28817, 28817, 28837, 28843, 28843, 28859, 28871, + 28879, 28879, 28879, 28901, 28909, 28909, 28927, 28933, 28933, 28949, + 28949, 28961, 28961, 28979, 28979, 28979, 28979, 29009, 29023, 29027, + 29033, 29033, 29033, 29063, 29063, 29077, 29077, 29077, 29101, 29101, + 29101, 29123, 29131, 29137, 29147, 29153, 29167, 29173, 29179, 29191, + 29191, 29207, 29209, 29221, 29231, 29231, 29243, 29251, 29251, 29269, + 29269, 29287, 29287, 29303, 29311, 29311, 29327, 29333, 29339, 29347, + 29347, 29363, 29363, 29383, 29389, 29399, 29401, 29411, 29423, 29429, + 29437, 29443, 29453, 29453, 29453, 29473, 29483, 29483, 29501, 29501, + 29501, 29527, 29531, 29537, 29537, 29537, 29567, 29573, 29581, 29587, + 29599, 29599, 29611, 29611, 29629, 29633, 29641, 29641, 29663, 29671, + 29671, 29683, 29683, 29683, 29683, 29717, 29723, 29723, 29741, 29741, + 29759, 29761, 29761, 29761, 29789, 29789, 29803, 29803, 29819, 29819, + 29837, 29837, 29851, 29863, 29867, 29879, 29881, 29881, 29881, 29881, + 29917, 29927, 29927, 29927, 29947, 29959, 29959, 29959, 29983, 29989, + 29989, 29989, 30013, 30013, 30029, 30029, 30047, 30047, 30059, 30071, + 30071, 30071, 30091, 30103, 30109, 30119, 30119, 30133, 30139, 30139, + 30139, 30161, 30169, 30181, 30187, 30197, 30203, 30211, 30223, 30223, + 30223, 30241, 30253, 30259, 30271, 30271, 30271, 30293, 30293, 30307, + 30319, 30323, 30323, 
30341, 30347, 30347, 30367, 30367, 30367, 30391, + 30391, 30403, 30403, 30403, 30431, 30431, 30431, 30449, 30449, 30469, + 30469, 30469, 30493, 30497, 30509, 30517, 30517, 30529, 30539, 30539, + 30559, 30559, 30559, 30577, 30577, 30593, 30593, 30593, 30593, 30631, + 30637, 30643, 30649, 30661, 30671, 30677, 30677, 30689, 30703, 30707, + 30713, 30727, 30727, 30727, 30727, 30757, 30763, 30773, 30781, 30781, + 30781, 30803, 30809, 30817, 30829, 30839, 30841, 30853, 30859, 30871, + 30871, 30881, 30893, 30893, 30911, 30911, 30911, 30931, 30941, 30949, + 30949, 30949, 30971, 30983, 30983, 30983, 30983, 31013, 31019, 31019, + 31039, 31039, 31051, 31063, 31069, 31079, 31081, 31091, 31091, 31091, + 31091, 31123, 31123, 31139, 31151, 31159, 31159, 31159, 31183, 31189, + 31193, 31193, 31193, 31223, 31231, 31237, 31247, 31253, 31259, 31271, + 31277, 31277, 31277, 31277, 31307, 31319, 31327, 31333, 31337, 31337, + 31357, 31357, 31357, 31379, 31391, 31397, 31397, 31397, 31397, 31397, + 31397, 31397, 31397, 31397, 31469, 31477, 31481, 31489, 31489, 31511, + 31517, 31517, 31531, 31543, 31547, 31547, 31567, 31573, 31583, 31583, + 31583, 31607, 31607, 31607, 31627, 31627, 31643, 31649, 31663, 31667, + 31667, 31687, 31687, 31699, 31699, 31699, 31727, 31729, 31741, 31751, + 31751, 31751, 31771, 31771, 31771, 31799, 31799, 31799, 31817, 31817, + 31817, 31847, 31849, 31859, 31859, 31873, 31883, 31891, 31891, 31907, + 31907, 31907, 31907, 31907, 31907, 31957, 31963, 31973, 31981, 31991, + 31991, 32003, 32009, 32009, 32029, 32029, 32029, 32051, 32063, 32069, + 32077, 32083, 32089, 32099, 32099, 32119, 32119, 32119, 32143, 32143, + 32159, 32159, 32173, 32183, 32191, 32191, 32203, 32213, 32213, 32213, + 32237, 32237, 32251, 32261, 32261, 32261, 32261, 32261, 32303, 32309, + 32309, 32327, 32327, 32341, 32341, 32359, 32363, 32371, 32381, 32381, + 32381, 32401, 32413, 32423, 32429, 32429, 32443, 32443, 32443, 32467, + 32479, 32479, 32491, 32503, 32507, 32507, 32507, 32533, 32537, 32537, + 
32537, 32563, 32573, 32579, 32587, 32587, 32603, 32611, 32621, 32621, + 32633, 32647, 32653, 32653, 32653, 32653, 32687, 32693, 32693, 32707, + 32719, 32719, 32719, 32719, 32749, 32749, 32749, 32771, 32783, 32789, + 32797, 32803, 32803, 32803, 32831, 32839, 32843, 32843, 32843, 32869, + 32869, 32887, 32887, 32887, 32911, 32917, 32917, 32933, 32941, 32941, + 32957, 32957, 32971, 32983, 32987, 32999, 32999, 33013, 33023, 33029, + 33037, 33037, 33053, 33053, 33071, 33073, 33083, 33091, 33091, 33107, + 33119, 33119, 33119, 33119, 33151, 33151, 33161, 33161, 33181, 33191, + 33199, 33203, 33211, 33223, 33223, 33223, 33247, 33247, 33247, 33247, + 33247, 33287, 33289, 33301, 33311, 33317, 33317, 33331, 33343, 33349, + 33359, 33359, 33359, 33377, 33391, 33391, 33403, 33413, 33413, 33427, + 33427, 33427, 33427, 33461, 33469, 33479, 33487, 33493, 33503, 33503, + 33503, 33521, 33533, 33533, 33547, 33547, 33563, 33569, 33581, 33589, + 33599, 33601, 33613, 33623, 33629, 33637, 33647, 33647, 33647, 33647, + 33679, 33679, 33679, 33703, 33703, 33713, 33721, 33721, 33739, 33751, + 33757, 33767, 33773, 33773, 33791, 33797, 33797, 33811, 33811, 33829, + 33829, 33829, 33851, 33863, 33871, 33871, 33871, 33893, 33893, 33911, + 33911, 33923, 33931, 33941, 33941, 33941, 33967, 33967, 33967, 33967, + 33997, 33997, 33997, 34019, 34031, 34039, 34039, 34039, 34061, 34061, + 34061, 34061, 34061, 34061, 34061, 34061, 34127, 34129, 34141, 34147, + 34159, 34159, 34171, 34183, 34183, 34183, 34183, 34213, 34217, 34231, + 34231, 34231, 34253, 34261, 34267, 34273, 34283, 34283, 34303, 34303, + 34319, 34327, 34327, 34337, 34351, 34351, 34367, 34369, 34381, 34381, + 34381, 34403, 34403, 34421, 34429, 34439, 34439, 34439, 34457, 34471, + 34471, 34487, 34487, 34501, 34511, 34519, 34519, 34519, 34543, 34549, + 34549, 34549, 34549, 34583, 34591, 34591, 34607, 34613, 34613, 34631, + 34631, 34631, 34651, 34651, 34667, 34679, 34687, 34693, 34703, 34703, + 34703, 34721, 34729, 34739, 34747, 34759, 34763, 34763, 
34781, 34781, + 34781, 34807, 34807, 34819, 34819, 34819, 34847, 34849, 34849, 34871, + 34877, 34883, 34883, 34897, 34897, 34919, 34919, 34919, 34939, 34949, + 34949, 34963, 34963, 34981, 34981, 34981, 34981, 34981, 35023, 35027, + 35027, 35027, 35053, 35059, 35069, 35069, 35083, 35089, 35099, 35111, + 35117, 35117, 35129, 35141, 35149, 35159, 35159, 35171, 35171, 35171, + 35171, 35201, 35201, 35221, 35227, 35227, 35227, 35251, 35257, 35267, + 35279, 35281, 35291, 35291, 35311, 35317, 35327, 35327, 35339, 35339, + 35353, 35363, 35363, 35381, 35381, 35393, 35407, 35407, 35423, 35423, + 35437, 35447, 35449, 35461, 35461, 35461, 35461, 35491, 35491, 35509, + 35509, 35527, 35533, 35543, 35543, 35543, 35543, 35573, 35573, 35591, + 35597, 35603, 35603, 35617, 35617, 35617, 35617, 35617, 35617, 35671, + 35677, 35677, 35677, 35677, 35677, 35677, 35677, 35731, 35731, 35747, + 35759, 35759, 35771, 35771, 35771, 35797, 35803, 35809, 35809, 35831, + 35839, 35839, 35851, 35863, 35869, 35879, 35879, 35879, 35899, 35911, + 35911, 35923, 35933, 35933, 35951, 35951, 35963, 35969, 35983, 35983, + 35999, 36007, 36013, 36017, 36017, 36037, 36037, 36037, 36061, 36067, + 36073, 36083, 36083, 36097, 36109, 36109, 36109, 36131, 36137, 36151, + 36151, 36161, 36161, 36161, 36191, 36191, 36191, 36209, 36217, 36229, + 36229, 36241, 36251, 36263, 36269, 36277, 36277, 36293, 36299, 36307, + 36319, 36319, 36319, 36343, 36343, 36353, 36353, 36373, 36383, 36389, + 36389, 36389, 36389, 36389, 36389, 36433, 36433, 36451, 36457, 36469, + 36479, 36479, 36493, 36497, 36497, 36497, 36527, 36529, 36541, 36551, + 36559, 36563, 36571, 36583, 36587, 36599, 36607, 36607, 36607, 36629, + 36637, 36643, 36653, 36653, 36671, 36677, 36683, 36691, 36697, 36709, + 36713, 36721, 36721, 36739, 36749, 36749, 36767, 36767, 36781, 36791, + 36793, 36793, 36809, 36821, 36821, 36833, 36847, 36847, 36857, 36871, + 36877, 36887, 36887, 36901, 36901, 36919, 36923, 36931, 36943, 36947, + 36947, 36947, 36973, 36979, 36979, 
36997, 37003, 37013, 37021, 37021, + 37039, 37039, 37049, 37061, 37061, 37061, 37087, 37087, 37097, 37097, + 37117, 37123, 37123, 37139, 37139, 37159, 37159, 37171, 37181, 37189, + 37199, 37201, 37201, 37223, 37223, 37223, 37243, 37253, 37253, 37253, + 37277, 37277, 37277, 37277, 37309, 37313, 37321, 37321, 37339, 37339, + 37357, 37363, 37369, 37379, 37379, 37397, 37397, 37409, 37423, 37423, + 37423, 37447, 37447, 37463, 37463, 37463, 37483, 37493, 37501, 37511, + 37517, 37517, 37529, 37537, 37549, 37549, 37567, 37573, 37579, 37591, + 37591, 37607, 37607, 37619, 37619, 37633, 37643, 37649, 37663, 37663, + 37663, 37663, 37693, 37699, 37699, 37717, 37717, 37717, 37717, 37747, + 37747, 37747, 37747, 37783, 37783, 37799, 37799, 37813, 37813, 37831, + 37831, 37847, 37853, 37861, 37871, 37879, 37879, 37889, 37897, 37907, + 37907, 37907, 37907, 37907, 37951, 37957, 37967, 37967, 37967, 37991, + 37997, 37997, 38011, 38011, 38011, 38039, 38047, 38053, 38053, 38069, + 38069, 38083, 38083, 38083, 38083, 38119, 38119, 38119, 38119, 38149, + 38153, 38167, 38167, 38183, 38189, 38197, 38201, 38201, 38219, 38231, + 38239, 38239, 38239, 38261, 38261, 38273, 38287, 38287, 38303, 38303, + 38317, 38327, 38333, 38333, 38351, 38351, 38351, 38371, 38377, 38377, + 38393, 38393, 38393, 38393, 38431, 38431, 38447, 38453, 38461, 38461, + 38461, 38461, 38461, 38501, 38501, 38501, 38501, 38501, 38543, 38543, + 38557, 38567, 38569, 38569, 38569, 38593, 38603, 38611, 38611, 38629, + 38639, 38639, 38653, 38653, 38671, 38677, 38677, 38693, 38699, 38711, + 38713, 38723, 38729, 38737, 38749, 38749, 38767, 38767, 38783, 38791, + 38791, 38803, 38803, 38821, 38821, 38839, 38839, 38851, 38861, 38867, + 38873, 38873, 38891, 38903, 38903, 38917, 38923, 38933, 38933, 38933, + 38959, 38959, 38971, 38977, 38977, 38993, 38993, 38993, 39023, 39023, + 39023, 39047, 39047, 39047, 39047, 39079, 39079, 39089, 39103, 39107, + 39119, 39119, 39133, 39139, 39139, 39157, 39163, 39163, 39181, 39191, + 39199, 39199, 
39209, 39217, 39229, 39239, 39241, 39251, 39251, 39251, + 39251, 39251, 39293, 39301, 39301, 39317, 39323, 39323, 39343, 39343, + 39359, 39367, 39373, 39383, 39383, 39397, 39397, 39409, 39419, 39419, + 39439, 39443, 39451, 39461, 39461, 39461, 39461, 39461, 39503, 39511, + 39511, 39521, 39521, 39541, 39551, 39551, 39563, 39569, 39581, 39581, + 39581, 39607, 39607, 39623, 39631, 39631, 39631, 39631, 39659, 39671, + 39679, 39679, 39679, 39703, 39709, 39719, 39727, 39733, 39733, 39749, + 39749, 39761, 39769, 39779, 39791, 39799, 39799, 39799, 39821, 39829, + 39839, 39847, 39847, 39863, 39869, 39877, 39887, 39887, 39901, 39901, + 39901, 39901, 39929, 39937, 39937, 39953, 39953, 39971, 39983, 39989, + 39989, 39989, 40013, 40013, 40031, 40039, 40039, 40039, 40063, 40063, + 40063, 40087, 40093, 40099, 40111, 40111, 40127, 40129, 40129, 40151, + 40153, 40163, 40169, 40177, 40189, 40193, 40193, 40213, 40213, 40231, + 40237, 40241, 40253, 40253, 40253, 40277, 40283, 40289, 40289, 40289, + 40289, 40289, 40289, 40343, 40351, 40357, 40361, 40361, 40361, 40387, + 40387, 40387, 40387, 40423, 40429, 40433, 40433, 40433, 40459, 40471, + 40471, 40487, 40493, 40499, 40507, 40519, 40519, 40531, 40543, 40543, + 40559, 40559, 40559, 40583, 40591, 40597, 40597, 40609, 40609, 40627, + 40639, 40639, 40639, 40639, 40639, 40639, 40639, 40693, 40699, 40709, + 40709, 40709, 40709, 40739, 40751, 40759, 40763, 40771, 40771, 40787, + 40787, 40801, 40813, 40823, 40829, 40829, 40847, 40853, 40853, 40867, + 40879, 40883, 40883, 40903, 40903, 40903, 40927, 40933, 40939, 40949, + 40949, 40961, 40973, 40973, 40973, 40993, 40993, 41011, 41023, 41023, + 41039, 41047, 41051, 41057, 41057, 41077, 41081, 41081, 41081, 41081, + 41117, 41117, 41131, 41143, 41149, 41149, 41161, 41161, 41183, 41189, + 41189, 41203, 41213, 41221, 41231, 41233, 41243, 41243, 41263, 41269, + 41269, 41281, 41281, 41299, 41299, 41299, 41299, 41333, 41341, 41351, + 41357, 41357, 41357, 41381, 41389, 41399, 41399, 41413, 41413, 41413, 
+ 41413, 41443, 41453, 41453, 41467, 41479, 41479, 41491, 41491, 41507, + 41519, 41521, 41521, 41543, 41549, 41549, 41549, 41549, 41579, 41579, + 41597, 41603, 41611, 41621, 41627, 41627, 41647, 41651, 41659, 41669, + 41669, 41687, 41687, 41687, 41687, 41719, 41719, 41729, 41737, 41737, + 41759, 41761, 41771, 41777, 41777, 41777, 41801, 41813, 41813, 41813, + 41813, 41843, 41851, 41863, 41863, 41879, 41887, 41893, 41903, 41911, + 41911, 41927, 41927, 41941, 41947, 41959, 41959, 41969, 41983, 41983, + 41999, 41999, 42013, 42023, 42023, 42023, 42043, 42043, 42061, 42071, + 42073, 42083, 42089, 42101, 42101, 42101, 42101, 42131, 42139, 42139, + 42157, 42157, 42169, 42181, 42187, 42197, 42197, 42209, 42223, 42227, + 42239, 42239, 42239, 42257, 42257, 42257, 42283, 42293, 42299, 42307, + 42307, 42323, 42331, 42337, 42349, 42359, 42359, 42373, 42379, 42391, + 42397, 42407, 42409, 42409, 42409, 42437, 42443, 42451, 42463, 42467, + 42473, 42487, 42491, 42499, 42509, 42509, 42509, 42533, 42533, 42533, + 42557, 42557, 42571, 42577, 42589, 42589, 42589, 42611, 42611, 42611, + 42611, 42643, 42649, 42649, 42667, 42677, 42683, 42689, 42703, 42709, + 42719, 42727, 42727, 42743, 42751, 42751, 42767, 42773, 42773, 42787, + 42797, 42797, 42797, 42821, 42829, 42839, 42841, 42853, 42863, 42863, + 42863, 42863, 42863, 42901, 42901, 42901, 42923, 42929, 42943, 42943, + 42953, 42967, 42967, 42979, 42989, 42989, 43003, 43013, 43019, 43019, + 43037, 43037, 43051, 43063, 43067, 43067, 43067, 43093, 43103, 43103, + 43117, 43117, 43133, 43133, 43151, 43159, 43159, 43159, 43177, 43189, + 43189, 43207, 43207, 43223, 43223, 43237, 43237, 43237, 43261, 43271, + 43271, 43283, 43291, 43291, 43291, 43319, 43321, 43331, 43331, 43331, + 43331, 43331, 43331, 43331, 43391, 43399, 43403, 43411, 43411, 43427, + 43427, 43441, 43451, 43457, 43457, 43457, 43487, 43487, 43499, 43499, + 43517, 43517, 43517, 43543, 43543, 43543, 43543, 43573, 43579, 43591, + 43597, 43607, 43613, 43613, 43627, 43633, 43633, 
43651, 43661, 43669, + 43669, 43669, 43691, 43691, 43711, 43717, 43721, 43721, 43721, 43721, + 43759, 43759, 43759, 43783, 43789, 43793, 43801, 43801, 43801, 43801, + 43801, 43801, 43853, 43853, 43867, 43867, 43867, 43891, 43891, 43891, + 43913, 43913, 43933, 43943, 43951, 43951, 43963, 43973, 43973, 43991, + 43997, 43997, 43997, 44021, 44029, 44029, 44041, 44053, 44059, 44071, + 44071, 44087, 44089, 44101, 44111, 44119, 44123, 44131, 44131, 44131, + 44159, 44159, 44171, 44179, 44189, 44189, 44207, 44207, 44221, 44221, + 44221, 44221, 44249, 44263, 44269, 44279, 44281, 44293, 44293, 44293, + 44293, 44293, 44293, 44293, 44351, 44357, 44357, 44371, 44383, 44389, + 44389, 44389, 44389, 44417, 44417, 44417, 44417, 44453, 44453, 44453, + 44453, 44483, 44491, 44501, 44507, 44519, 44519, 44533, 44543, 44549, + 44549, 44563, 44563, 44579, 44587, 44587, 44587, 44587, 44623, 44623, + 44633, 44647, 44651, 44657, 44657, 44657, 44687, 44687, 44701, 44711, + 44711, 44711, 44729, 44741, 44741, 44753, 44753, 44773, 44777, 44789, + 44797, 44797, 44809, 44819, 44819, 44839, 44843, 44851, 44851, 44867, + 44879, 44887, 44893, 44893, 44909, 44917, 44927, 44927, 44939, 44939, + 44959, 44963, 44971, 44983, 44987, 44987, 45007, 45013, 45013, 45013, + 45013, 45013, 45053, 45061, 45061, 45077, 45083, 45083, 45083, 45083, + 45119, 45127, 45131, 45139, 45139, 45139, 45161, 45161, 45181, 45191, + 45197, 45197, 45197, 45197, 45197, 45233, 45247, 45247, 45263, 45263, + 45263, 45281, 45293, 45293, 45307, 45319, 45319, 45329, 45343, 45343, + 45343, 45361, 45361, 45377, 45389, 45389, 45403, 45413, 45413, 45427, + 45439, 45439, 45439, 45439, 45439, 45439, 45481, 45491, 45503, 45503, + 45503, 45523, 45533, 45541, 45541, 45557, 45557, 45569, 45569, 45589, + 45599, 45599, 45613, 45613, 45631, 45631, 45641, 45641, 45659, 45667, + 45677, 45677, 45691, 45697, 45707, 45707, 45707, 45707, 45737, 45751, + 45757, 45767, 45767, 45779, 45779, 45779, 45779, 45779, 45823, 45827, + 45833, 45841, 45853, 45863, 
45869, 45869, 45887, 45893, 45893, 45893, + 45893, 45893, 45893, 45943, 45949, 45959, 45959, 45971, 45979, 45989, + 45989, 45989, 45989, 46021, 46027, 46027, 46027, 46051, 46061, 46061, + 46073, 46073, 46093, 46103, 46103, 46103, 46103, 46133, 46141, 46147, + 46153, 46153, 46171, 46183, 46187, 46199, 46199, 46199, 46219, 46229, + 46237, 46237, 46237, 46261, 46271, 46279, 46279, 46279, 46301, 46309, + 46309, 46327, 46327, 46337, 46351, 46351, 46351, 46351, 46381, 46381, + 46399, 46399, 46411, 46411, 46411, 46439, 46447, 46451, 46457, 46471, + 46477, 46477, 46489, 46499, 46511, 46511, 46523, 46523, 46523, 46549, + 46559, 46567, 46573, 46573, 46591, 46591, 46601, 46601, 46619, 46619, + 46639, 46643, 46649, 46663, 46663, 46679, 46687, 46691, 46703, 46703, + 46703, 46727, 46727, 46727, 46751, 46757, 46757, 46771, 46771, 46771, + 46771, 46807, 46811, 46819, 46831, 46831, 46831, 46853, 46861, 46867, + 46877, 46877, 46889, 46901, 46901, 46919, 46919, 46933, 46933, 46933, + 46957, 46957, 46957, 46957, 46957, 46997, 46997, 46997, 47017, 47017, + 47017, 47041, 47051, 47059, 47059, 47059, 47087, 47093, 47093, 47111, + 47119, 47123, 47129, 47143, 47149, 47149, 47161, 47161, 47161, 47189, + 47189, 47207, 47207, 47221, 47221, 47237, 47237, 47251, 47251, 47269, + 47279, 47287, 47293, 47303, 47309, 47317, 47317, 47317, 47339, 47351, + 47353, 47363, 47363, 47381, 47389, 47389, 47407, 47407, 47419, 47431, + 47431, 47441, 47441, 47459, 47459, 47459, 47459, 47491, 47501, 47507, + 47513, 47527, 47533, 47543, 47543, 47543, 47563, 47569, 47581, 47591, + 47599, 47599, 47609, 47623, 47629, 47639, 47639, 47653, 47659, 47659, + 47659, 47681, 47681, 47701, 47711, 47717, 47717, 47717, 47743, 47743, + 47743, 47743, 47743, 47779, 47791, 47797, 47807, 47809, 47819, 47819, + 47837, 47843, 47843, 47857, 47869, 47869, 47881, 47881, 47903, 47911, + 47917, 47917, 47933, 47939, 47951, 47951, 47963, 47969, 47981, 47981, + 47981, 47981, 47981, 48023, 48029, 48029, 48029, 48049, 48049, 48049, + 48079, 
48079, 48091, 48091, 48109, 48119, 48121, 48131, 48131, 48131, + 48157, 48163, 48163, 48179, 48187, 48197, 48197, 48197, 48221, 48221, + 48239, 48247, 48247, 48259, 48271, 48271, 48281, 48281, 48299, 48311, + 48313, 48313, 48313, 48341, 48341, 48353, 48353, 48371, 48383, 48383, + 48397, 48407, 48413, 48413, 48413, 48437, 48437, 48449, 48463, 48463, + 48479, 48487, 48491, 48497, 48497, 48497, 48527, 48533, 48541, 48541, + 48541, 48563, 48571, 48571, 48589, 48593, 48593, 48611, 48623, 48623, + 48623, 48647, 48649, 48661, 48661, 48679, 48679, 48679, 48679, 48679, + 48679, 48679, 48733, 48733, 48751, 48757, 48767, 48767, 48781, 48787, + 48799, 48799, 48809, 48823, 48823, 48823, 48847, 48847, 48859, 48871, + 48871, 48883, 48889, 48889, 48907, 48907, 48907, 48907, 48907, 48947, + 48953, 48953, 48973, 48973, 48991, 48991, 49003, 49009, 49019, 49031, + 49037, 49043, 49043, 49057, 49069, 49069, 49081, 49081, 49103, 49109, + 49117, 49123, 49123, 49139, 49139, 49157, 49157, 49171, 49177, 49177, + 49199, 49207, 49211, 49223, 49223, 49223, 49223, 49253, 49261, 49261, + 49279, 49279, 49279, 49297, 49307, 49307, 49307, 49333, 49339, 49339, + 49339, 49367, 49369, 49369, 49391, 49393, 49393, 49411, 49417, 49429, + 49433, 49433, 49451, 49463, 49463, 49477, 49481, 49481, 49499, 49499, + 49499, 49523, 49531, 49537, 49549, 49559, 49559, 49559, 49559, 49559, + 49597, 49603, 49613, 49613, 49627, 49639, 49639, 49639, 49663, 49669, + 49669, 49681, 49681, 49697, 49711, 49711, 49727, 49727, 49741, 49747, + 49757, 49757, 49757, 49783, 49789, 49789, 49807, 49811, 49823, 49831, + 49831, 49843, 49853, 49853, 49871, 49877, 49877, 49891, 49891, 49891, + 49919, 49927, 49927, 49943, 49943, 49957, 49957, 49957, 49957, 49991, + 49999, 49999, 49999, 50023, 50023, 50033, 50047, 50053, 50053, 50069, + 50077, 50087, 50093, 50101, 50111, 50119, 50123, 50131, 50131, 50147, + 50159, 50159, 50159, 50177, 50177, 50177, 50207, 50207, 50221, 50231, + 50231, 50231, 50231, 50263, 50263, 50273, 50287, 50291, 50291, 
50311, + 50311, 50321, 50333, 50341, 50341, 50359, 50363, 50363, 50383, 50387, + 50387, 50387, 50411, 50423, 50423, 50423, 50441, 50441, 50461, 50461, + 50461, 50461, 50461, 50503, 50503, 50513, 50527, 50527, 50543, 50551, + 50551, 50551, 50551, 50581, 50591, 50599, 50599, 50599, 50599, 50627, + 50627, 50647, 50651, 50651, 50671, 50671, 50683, 50683, 50683, 50707, + 50707, 50723, 50723, 50741, 50741, 50753, 50767, 50773, 50777, 50789, + 50789, 50789, 50789, 50821, 50821, 50839, 50839, 50849, 50857, 50867, + 50873, 50873, 50893, 50893, 50909, 50909, 50923, 50929, 50929, 50951, + 50957, 50957, 50971, 50971, 50989, 50993, 51001, 51001, 51001, 51031, + 51031, 51047, 51047, 51061, 51071, 51071, 51071, 51071, 51071, 51109, + 51109, 51109, 51133, 51137, 51151, 51157, 51157, 51169, 51169, 51169, + 51199, 51203, 51203, 51217, 51229, 51239, 51241, 51241, 51263, 51263, + 51263, 51287, 51287, 51287, 51307, 51307, 51307, 51329, 51343, 51349, + 51349, 51361, 51361, 51383, 51383, 51383, 51407, 51413, 51421, 51431, + 51439, 51439, 51449, 51461, 51461, 51479, 51487, 51487, 51503, 51511, + 51517, 51521, 51521, 51539, 51551, 51551, 51563, 51563, 51581, 51581, + 51599, 51607, 51613, 51613, 51631, 51637, 51647, 51647, 51659, 51659, + 51679, 51683, 51691, 51691, 51691, 51719, 51721, 51721, 51721, 51749, + 51749, 51767, 51769, 51769, 51787, 51797, 51803, 51803, 51817, 51829, + 51839, 51839, 51853, 51859, 51871, 51871, 51871, 51893, 51899, 51907, + 51913, 51913, 51929, 51941, 51949, 51949, 51949, 51973, 51977, 51991, + 51991, 51991, 52009, 52021, 52027, 52027, 52027, 52051, 52057, 52069, + 52069, 52081, 52081, 52103, 52103, 52103, 52127, 52127, 52127, 52147, + 52153, 52163, 52163, 52183, 52189, 52189, 52201, 52201, 52223, 52223, + 52237, 52237, 52253, 52259, 52267, 52267, 52267, 52291, 52301, 52301, + 52313, 52321, 52321, 52321, 52321, 52321, 52363, 52369, 52379, 52391, + 52391, 52391, 52391, 52391, 52391, 52433, 52433, 52453, 52457, 52457, + 52457, 52457, 52489, 52501, 52511, 52517, 
52517, 52529, 52543, 52543, + 52553, 52567, 52571, 52583, 52583, 52583, 52583, 52609, 52609, 52631, + 52639, 52639, 52639, 52639, 52667, 52673, 52673, 52691, 52697, 52711, + 52711, 52727, 52733, 52733, 52747, 52757, 52757, 52769, 52783, 52783, + 52783, 52807, 52813, 52817, 52817, 52837, 52837, 52837, 52861, 52861, + 52879, 52883, 52889, 52903, 52903, 52919, 52919, 52919, 52937, 52951, + 52957, 52967, 52973, 52981, 52981, 52999, 53003, 53003, 53017, 53017, + 53017, 53047, 53051, 53051, 53069, 53077, 53087, 53093, 53101, 53101, + 53117, 53117, 53129, 53129, 53149, 53149, 53161, 53173, 53173, 53189, + 53197, 53201, 53201, 53201, 53231, 53239, 53239, 53239, 53239, 53269, + 53279, 53281, 53281, 53299, 53309, 53309, 53327, 53327, 53327, 53327, + 53359, 53359, 53359, 53381, 53381, 53381, 53407, 53411, 53419, 53419, + 53437, 53441, 53453, 53453, 53453, 53479, 53479, 53479, 53503, 53507, + 53507, 53527, 53527, 53527, 53551, 53551, 53551, 53569, 53569, 53591, + 53597, 53597, 53611, 53623, 53629, 53639, 53639, 53653, 53657, 53657, + 53657, 53681, 53693, 53699, 53699, 53719, 53719, 53731, 53731, 53731, + 53759, 53759, 53773, 53783, 53791, 53791, 53791, 53813, 53819, 53831, + 53831, 53831, 53849, 53861, 53861, 53861, 53887, 53891, 53899, 53899, + 53917, 53927, 53927, 53939, 53951, 53959, 53959, 53959, 53959, 53987, + 53993, 54001, 54013, 54013, 54013, 54037, 54037, 54049, 54059, 54059, + 54059, 54083, 54091, 54101, 54101, 54101, 54121, 54133, 54139, 54151, + 54151, 54167, 54167, 54181, 54181, 54193, 54193, 54193, 54217, 54217, + 54217, 54217, 54251, 54251, 54269, 54277, 54287, 54293, 54293, 54311, + 54319, 54323, 54331, 54331, 54347, 54347, 54367, 54371, 54377, 54377, + 54377, 54403, 54413, 54421, 54421, 54437, 54443, 54449, 54449, 54469, + 54469, 54469, 54493, 54503, 54503, 54517, 54521, 54521, 54541, 54547, + 54559, 54563, 54563, 54583, 54583, 54583, 54601, 54601, 54623, 54631, + 54631, 54647, 54647, 54647, 54667, 54679, 54679, 54679, 54679, 54709, + 54713, 54727, 54727, 
54727, 54751, 54751, 54767, 54773, 54779, 54787, + 54799, 54799, 54799, 54799, 54829, 54833, 54833, 54851, 54851, 54869, + 54877, 54881, 54881, 54881, 54907, 54919, 54919, 54919, 54941, 54949, + 54959, 54959, 54973, 54983, 54983, 54983, 55001, 55009, 55021, 55021, + 55021, 55021, 55051, 55061, 55061, 55079, 55079, 55079, 55103, 55109, + 55117, 55127, 55127, 55127, 55147, 55147, 55163, 55171, 55171, 55171, + 55171, 55207, 55213, 55219, 55229, 55229, 55243, 55249, 55259, 55259, + 55259, 55259, 55291, 55291, 55291, 55313, 55313, 55333, 55343, 55351, + 55351, 55351, 55373, 55381, 55381, 55399, 55399, 55411, 55411, 55411, + 55439, 55441, 55441, 55457, 55469, 55469, 55487, 55487, 55501, 55511, + 55511, 55511, 55529, 55541, 55547, 55547, 55547, 55547, 55579, 55589, + 55589, 55603, 55609, 55621, 55631, 55639, 55639, 55639, 55663, 55667, + 55673, 55681, 55691, 55697, 55711, 55717, 55721, 55733, 55733, 55733, + 55733, 55763, 55763, 55763, 55787, 55799, 55807, 55813, 55823, 55829, + 55837, 55843, 55849, 55849, 55871, 55871, 55871, 55889, 55903, 55903, + 55903, 55927, 55933, 55933, 55949, 55949, 55967, 55967, 55967, 55987, + 55997, 56003, 56009, 56009, 56009, 56039, 56041, 56053, 56053, 56053, + 56053, 56087, 56093, 56101, 56101, 56113, 56123, 56131, 56131, 56149, + 56149, 56167, 56171, 56179, 56179, 56197, 56207, 56209, 56209, 56209, + 56239, 56239, 56249, 56263, 56269, 56269, 56269, 56269, 56299, 56311, + 56311, 56311, 56333, 56333, 56333, 56359, 56359, 56369, 56383, 56383, + 56393, 56401, 56401, 56417, 56431, 56437, 56443, 56453, 56453, 56467, + 56479, 56479, 56489, 56503, 56509, 56519, 56527, 56533, 56543, 56543, + 56543, 56543, 56569, 56569, 56591, 56599, 56599, 56611, 56611, 56629, + 56633, 56633, 56633, 56663, 56671, 56671, 56687, 56687, 56701, 56711, + 56713, 56713, 56731, 56737, 56747, 56747, 56767, 56773, 56783, 56783, + 56783, 56807, 56813, 56821, 56827, 56827, 56843, 56843, 56857, 56857, + 56873, 56873, 56893, 56897, 56911, 56911, 56923, 56929, 56941, 56951, + 
56957, 56963, 56963, 56983, 56989, 56999, 56999, 56999, 56999, 56999, + 57037, 57047, 57047, 57059, 57059, 57077, 57077, 57089, 57097, 57107, + 57119, 57119, 57131, 57143, 57149, 57149, 57163, 57173, 57179, 57191, + 57193, 57203, 57203, 57223, 57223, 57223, 57241, 57251, 57259, 57271, + 57271, 57287, 57287, 57301, 57301, 57301, 57301, 57331, 57331, 57349, + 57349, 57367, 57373, 57383, 57389, 57397, 57397, 57413, 57413, 57427, + 57427, 57427, 57427, 57457, 57467, 57467, 57487, 57493, 57503, 57503, + 57503, 57527, 57529, 57529, 57529, 57559, 57559, 57571, 57571, 57587, + 57593, 57601, 57601, 57601, 57601, 57637, 57641, 57653, 57653, 57667, + 57679, 57679, 57689, 57697, 57709, 57719, 57727, 57731, 57737, 57751, + 57751, 57751, 57773, 57781, 57791, 57793, 57803, 57809, 57809, 57829, + 57839, 57847, 57853, 57859, 57859, 57859, 57881, 57881, 57901, 57901, + 57917, 57923, 57923, 57943, 57947, 57947, 57947, 57973, 57977, 57991, + 57991, 57991, 58013, 58013, 58031, 58031, 58043, 58049, 58061, 58067, + 58073, 58073, 58073, 58099, 58111, 58111, 58111, 58129, 58129, 58151, + 58153, 58153, 58171, 58171, 58189, 58199, 58207, 58211, 58217, 58231, + 58237, 58243, 58243, 58243, 58271, 58271, 58271, 58271, 58271, 58309, + 58313, 58321, 58321, 58337, 58337, 58337, 58367, 58369, 58379, 58391, + 58393, 58403, 58411, 58417, 58427, 58439, 58441, 58453, 58453, 58453, + 58477, 58481, 58481, 58481, 58511, 58511, 58511, 58511, 58543, 58549, + 58549, 58567, 58573, 58579, 58579, 58579, 58603, 58613, 58613, 58631, + 58631, 58631, 58631, 58661, 58661, 58679, 58687, 58693, 58699, 58711, + 58711, 58727, 58733, 58741, 58741, 58757, 58763, 58771, 58771, 58789, + 58789, 58789, 58789, 58789, 58831, 58831, 58831, 58831, 58831, 58831, + 58831, 58831, 58889, 58901, 58909, 58913, 58921, 58921, 58943, 58943, + 58943, 58967, 58967, 58979, 58991, 58997, 58997, 59011, 59023, 59029, + 59029, 59029, 59053, 59063, 59069, 59077, 59083, 59093, 59093, 59107, + 59119, 59123, 59123, 59141, 59149, 59159, 59167, 59167, 
59183, 59183, + 59197, 59207, 59209, 59221, 59221, 59239, 59243, 59243, 59263, 59263, + 59273, 59281, 59281, 59281, 59281, 59281, 59281, 59333, 59341, 59351, + 59359, 59359, 59369, 59377, 59387, 59399, 59407, 59407, 59419, 59419, + 59419, 59447, 59453, 59453, 59471, 59473, 59473, 59473, 59497, 59509, + 59513, 59513, 59513, 59539, 59539, 59557, 59567, 59567, 59581, 59581, + 59581, 59581, 59611, 59621, 59629, 59629, 59629, 59651, 59663, 59671, + 59671, 59671, 59693, 59699, 59707, 59707, 59723, 59729, 59743, 59747, + 59753, 59753, 59771, 59779, 59791, 59797, 59797, 59809, 59809, 59809, + 59833, 59833, 59833, 59863, 59863, 59879, 59887, 59887, 59887, 59887, + 59887, 59921, 59929, 59929, 59951, 59957, 59957, 59971, 59981, 59981, + 59999, 59999, 60013, 60017, 60029, 60037, 60041, 60041, 60041, 60041, + 60077, 60083, 60091, 60103, 60107, 60107, 60127, 60133, 60139, 60149, + 60149, 60167, 60169, 60169, 60169, 60169, 60169, 60209, 60223, 60223, + 60223, 60223, 60251, 60259, 60271, 60271, 60271, 60293, 60293, 60293, + 60317, 60317, 60331, 60343, 60343, 60353, 60353, 60373, 60383, 60383, + 60397, 60397, 60413, 60413, 60427, 60427, 60443, 60449, 60457, 60457, + 60457, 60457, 60493, 60497, 60509, 60509, 60527, 60527, 60539, 60539, + 60539, 60539, 60539, 60539, 60589, 60589, 60607, 60611, 60623, 60631, + 60637, 60647, 60649, 60661, 60661, 60679, 60679, 60689, 60703, 60703, + 60719, 60727, 60733, 60737, 60737, 60757, 60763, 60773, 60779, 60779, + 60793, 60793, 60811, 60821, 60821, 60821, 60821, 60821, 60859, 60869, + 60869, 60887, 60889, 60901, 60901, 60919, 60923, 60923, 60943, 60943, + 60953, 60961, 60961, 60961, 60961, 60961, 61007, 61007, 61007, 61031, + 61031, 61043, 61051, 61057, 61057, 61057, 61057, 61091, 61099, 61099, + 61099, 61121, 61129, 61141, 61151, 61153, 61153, 61169, 61169, 61169, + 61169, 61169, 61211, 61223, 61231, 61231, 61231, 61253, 61261, 61261, + 61261, 61283, 61291, 61297, 61297, 61297, 61297, 61333, 61343, 61343, + 61357, 61363, 61363, 61381, 61381, 
61381, 61403, 61409, 61417, 61417, + 61417, 61441, 61441, 61463, 61471, 61471, 61487, 61493, 61493, 61511, + 61519, 61519, 61519, 61543, 61547, 61559, 61561, 61561, 61583, 61583, + 61583, 61603, 61613, 61613, 61631, 61637, 61643, 61651, 61657, 61667, + 61673, 61687, 61687, 61703, 61703, 61717, 61723, 61729, 61729, 61751, + 61757, 61757, 61757, 61781, 61781, 61781, 61781, 61813, 61819, 61819, + 61837, 61843, 61843, 61861, 61871, 61879, 61879, 61879, 61879, 61909, + 61909, 61927, 61933, 61933, 61949, 61949, 61967, 61967, 61981, 61991, + 61991, 62003, 62011, 62017, 62017, 62039, 62047, 62053, 62057, 62071, + 62071, 62081, 62081, 62099, 62099, 62119, 62119, 62131, 62143, 62143, + 62143, 62143, 62171, 62171, 62191, 62191, 62207, 62213, 62219, 62219, + 62233, 62233, 62233, 62233, 62233, 62273, 62273, 62273, 62303, 62311, + 62311, 62327, 62327, 62327, 62351, 62351, 62351, 62351, 62383, 62383, + 62383, 62401, 62401, 62423, 62423, 62423, 62423, 62423, 62459, 62467, + 62477, 62483, 62483, 62501, 62507, 62507, 62507, 62533, 62539, 62549, + 62549, 62563, 62563, 62581, 62591, 62597, 62603, 62603, 62617, 62627, + 62639, 62639, 62653, 62659, 62659, 62659, 62687, 62687, 62701, 62701, + 62701, 62723, 62731, 62743, 62743, 62753, 62761, 62773, 62773, 62791, + 62791, 62801, 62801, 62819, 62827, 62827, 62827, 62851, 62861, 62869, + 62873, 62873, 62873, 62903, 62903, 62903, 62927, 62929, 62939, 62939, + 62939, 62939, 62971, 62983, 62989, 62989, 62989, 62989, 62989, 63031, + 63031, 63031, 63031, 63059, 63067, 63079, 63079, 63079, 63103, 63103, + 63113, 63127, 63131, 63131, 63149, 63149, 63149, 63149, 63179, 63179, + 63199, 63199, 63211, 63211, 63211, 63211, 63247, 63247, 63247, 63247, + 63277, 63281, 63281, 63299, 63311, 63317, 63317, 63331, 63337, 63347, + 63353, 63367, 63367, 63377, 63391, 63397, 63397, 63409, 63421, 63421, + 63439, 63443, 63443, 63463, 63467, 63473, 63487, 63493, 63499, 63499, + 63499, 63527, 63533, 63541, 63541, 63559, 63559, 63559, 63577, 63589, + 63599, 63607, 
63611, 63617, 63629, 63629, 63647, 63649, 63659, 63671, + 63671, 63671, 63691, 63703, 63709, 63719, 63727, 63727, 63743, 63743, + 63743, 63761, 63773, 63781, 63781, 63799, 63803, 63809, 63823, 63823, + 63839, 63841, 63853, 63863, 63863, 63863, 63863, 63863, 63901, 63907, + 63913, 63913, 63929, 63929, 63949, 63949, 63949, 63949, 63977, 63977, + 63997, 64007, 64013, 64019, 64019, 64037, 64037, 64037, 64063, 64067, + 64067, 64081, 64091, 64091, 64109, 64109, 64123, 64123, 64123, 64151, + 64157, 64157, 64171, 64171, 64189, 64189, 64189, 64189, 64223, 64231, + 64237, 64237, 64237, 64237, 64271, 64279, 64283, 64283, 64303, 64303, + 64319, 64327, 64333, 64333, 64333, 64333, 64333, 64373, 64381, 64381, + 64399, 64403, 64403, 64403, 64403, 64439, 64439, 64453, 64453, 64453, + 64453, 64483, 64489, 64499, 64499, 64513, 64513, 64513, 64513, 64513, + 64553, 64567, 64567, 64579, 64591, 64591, 64601, 64613, 64621, 64627, + 64633, 64633, 64633, 64663, 64667, 64679, 64679, 64693, 64693, 64709, + 64717, 64717, 64717, 64717, 64747, 64747, 64763, 64763, 64783, 64783, + 64793, 64793, 64811, 64817, 64817, 64817, 64817, 64853, 64853, 64871, + 64879, 64879, 64891, 64901, 64901, 64919, 64927, 64927, 64937, 64951, + 64951, 64951, 64969, 64969, 64969, 64997, 65003, 65011, 65011, 65029, + 65033, 65033, 65053, 65063, 65071, 65071, 65071, 65089, 65101, 65111, + 65119, 65123, 65129, 65141, 65147, 65147, 65167, 65173, 65183, 65183, + 65183, 65203, 65213, 65213, 65213, 65239, 65239, 65239, 65257, 65269, + 65269, 65287, 65293, 65293, 65309, 65309, 65327, 65327, 65327, 65327, + 65357, 65357, 65371, 65381, 65381, 65393, 65407, 65413, 65423, 65423, + 65437, 65447, 65449, 65449, 65449, 65479, 65479, 65479, 65497, 65497, + 65519, 65521, 65521, 65543, 65551, 65557, 65563, 65563, 65581, 65587, + 65599, 65599, 65609, 65617, 65629, 65633, 65647, 65651, 65657, 65657, + 65677, 65687, 65687, 65701, 65707, 65719, 65719, 65731, 65731, 65731, + 65731, 65761, 65761, 65777, 65789, 65789, 65789, 65809, 65809, 65831, 
+ 65839, 65843, 65851, 65851, 65867, 65867, 65881, 65881, 65899, 65899, + 65899, 65927, 65929, 65929, 65951, 65957, 65963, 65963, 65983, 65983, + 65993, 65993, 65993, 65993, 66029, 66037, 66047, 66047, 66047, 66071, + 66071, 66083, 66089, 66103, 66109, 66109, 66109, 66109, 66137, 66137, + 66137, 66161, 66173, 66179, 66191, 66191, 66191, 66191, 66221, 66221, + 66239, 66239, 66239, 66239, 66271, 66271, 66271, 66293, 66301, 66301, + 66301, 66301, 66301, 66343, 66347, 66359, 66361, 66373, 66383, 66383, + 66383, 66403, 66413, 66413, 66431, 66431, 66431, 66449, 66463, 66467, + 66467, 66467, 66491, 66499, 66509, 66509, 66523, 66533, 66541, 66541, + 66553, 66553, 66571, 66571, 66587, 66593, 66601, 66601, 66617, 66629, + 66629, 66643, 66653, 66653, 66653, 66653, 66683, 66683, 66701, 66701, + 66713, 66721, 66733, 66739, 66751, 66751, 66763, 66763, 66763, 66791, + 66797, 66797, 66809, 66821, 66821, 66821, 66841, 66853, 66863, 66863, + 66877, 66883, 66889, 66889, 66889, 66919, 66923, 66931, 66943, 66949, + 66959, 66959, 66973, 66977, 66977, 66977, 67003, 67003, 67021, 67021, + 67033, 67043, 67049, 67061, 67061, 67079, 67079, 67079, 67103, 67103, + 67103, 67121, 67129, 67141, 67141, 67157, 67157, 67169, 67181, 67189, + 67189, 67189, 67213, 67219, 67231, 67231, 67247, 67247, 67261, 67271, + 67273, 67273, 67289, 67289, 67307, 67307, 67307, 67307, 67343, 67349, + 67349, 67349, 67369, 67369, 67391, 67399, 67399, 67411, 67421, 67429, + 67433, 67447, 67453, 67453, 67453, 67477, 67481, 67493, 67499, 67511, + 67511, 67523, 67531, 67537, 67547, 67559, 67567, 67567, 67579, 67589, + 67589, 67607, 67607, 67619, 67631, 67631, 67631, 67651, 67651, 67651, + 67679, 67679, 67679, 67699, 67709, 67709, 67723, 67733, 67741, 67751, + 67759, 67763, 67763, 67783, 67789, 67789, 67807, 67807, 67819, 67829, + 67829, 67843, 67853, 67853, 67867, 67867, 67883, 67891, 67901, 67901, + 67901, 67927, 67933, 67943, 67943, 67957, 67967, 67967, 67979, 67987, + 67993, 67993, 67993, 68023, 68023, 68023, 68041, 
68053, 68059, 68071, + 68071, 68087, 68087, 68099, 68111, 68113, 68113, 68113, 68141, 68147, + 68147, 68161, 68171, 68171, 68171, 68171, 68207, 68213, 68219, 68227, + 68239, 68239, 68239, 68261, 68261, 68279, 68281, 68281, 68281, 68311, + 68311, 68311, 68329, 68329, 68351, 68351, 68351, 68371, 68371, 68389, + 68399, 68399, 68399, 68399, 68399, 68437, 68447, 68449, 68449, 68449, + 68477, 68483, 68491, 68501, 68507, 68507, 68521, 68531, 68543, 68543, + 68543, 68567, 68567, 68581, 68581, 68597, 68597, 68611, 68611, 68611, + 68639, 68639, 68639, 68659, 68669, 68669, 68687, 68687, 68699, 68711, + 68713, 68713, 68729, 68743, 68749, 68749, 68767, 68771, 68777, 68791, + 68791, 68791, 68813, 68821, 68821, 68821, 68821, 68821, 68863, 68863, + 68879, 68881, 68891, 68903, 68909, 68917, 68927, 68927, 68927, 68947, + 68947, 68963, 68963, 68963, 68963, 68993, 69001, 69011, 69019, 69031, + 69031, 69031, 69031, 69061, 69067, 69073, 69073, 69073, 69073, 69109, + 69119, 69127, 69127, 69143, 69151, 69151, 69163, 69163, 69163, 69191, + 69197, 69203, 69203, 69221, 69221, 69239, 69247, 69247, 69263, 69263, + 69263, 69263, 69263, 69263, 69263, 69317, 69317, 69317, 69341, 69341, + 69341, 69341, 69371, 69383, 69389, 69389, 69403, 69403, 69403, 69431, + 69439, 69439, 69439, 69463, 69467, 69473, 69481, 69493, 69499, 69499, + 69499, 69499, 69499, 69539, 69539, 69557, 69557, 69557, 69557, 69557, + 69593, 69593, 69593, 69623, 69623, 69623, 69623, 69653, 69661, 69661, + 69677, 69677, 69691, 69697, 69709, 69709, 69709, 69709, 69739, 69739, + 69739, 69767, 69767, 69779, 69779, 69779, 69779, 69809, 69821, 69829, + 69833, 69847, 69847, 69859, 69859, 69877, 69877, 69877, 69899, 69911, + 69911, 69911, 69931, 69941, 69941, 69959, 69959, 69959, 69959, 69991, + 69997, 70003, 70009, 70019, 70019, 70039, 70039, 70051, 70061, 70067, + 70079, 70079, 70079, 70099, 70111, 70117, 70123, 70123, 70141, 70141, + 70157, 70163, 70163, 70183, 70183, 70199, 70207, 70207, 70223, 70229, + 70237, 70241, 70249, 70249, 
70271, 70271, 70271, 70289, 70297, 70309, + 70313, 70327, 70327, 70327, 70351, 70351, 70351, 70373, 70381, 70381, + 70393, 70393, 70393, 70423, 70429, 70439, 70439, 70451, 70459, 70459, + 70459, 70487, 70489, 70501, 70507, 70507, 70507, 70529, 70537, 70549, + 70549, 70549, 70573, 70583, 70589, 70589, 70607, 70607, 70621, 70627, + 70639, 70639, 70639, 70663, 70667, 70667, 70687, 70687, 70687, 70709, + 70717, 70717, 70729, 70729, 70729, 70753, 70753, 70769, 70783, 70783, + 70793, 70793, 70793, 70823, 70823, 70823, 70843, 70853, 70853, 70867, + 70879, 70879, 70891, 70901, 70901, 70919, 70921, 70921, 70937, 70951, + 70957, 70957, 70969, 70981, 70991, 70999, 70999, 71011, 71023, 71023, + 71039, 71039, 71039, 71059, 71069, 71069, 71081, 71089, 71089, 71089, + 71119, 71119, 71129, 71143, 71147, 71153, 71167, 71171, 71171, 71191, + 71191, 71191, 71209, 71209, 71209, 71237, 71237, 71249, 71263, 71263, + 71263, 71287, 71293, 71293, 71293, 71317, 71327, 71333, 71341, 71347, + 71359, 71363, 71363, 71363, 71389, 71399, 71399, 71413, 71419, 71429, + 71437, 71443, 71453, 71453, 71471, 71479, 71483, 71483, 71503, 71503, + 71503, 71527, 71527, 71537, 71551, 71551, 71563, 71569, 71569, 71569, + 71597, 71597, 71597, 71597, 71597, 71633, 71647, 71647, 71663, 71671, + 71671, 71671, 71693, 71699, 71711, 71719, 71719, 71719, 71741, 71741, + 71741, 71761, 71761, 71777, 71789, 71789, 71807, 71809, 71821, 71821, + 71837, 71843, 71849, 71861, 71867, 71879, 71887, 71887, 71899, 71909, + 71917, 71917, 71933, 71941, 71947, 71947, 71963, 71971, 71983, 71987, + 71999, 71999, 71999, 72019, 72031, 72031, 72047, 72053, 72053, 72053, + 72077, 72077, 72091, 72103, 72109, 72109, 72109, 72109, 72139, 72139, + 72139, 72167, 72173, 72173, 72173, 72173, 72173, 72211, 72223, 72229, + 72229, 72229, 72253, 72253, 72271, 72277, 72287, 72287, 72287, 72307, + 72313, 72313, 72313, 72341, 72341, 72353, 72367, 72367, 72383, 72383, + 72383, 72383, 72383, 72421, 72431, 72431, 72431, 72431, 72461, 72469, + 72469, 
72481, 72493, 72503, 72503, 72503, 72503, 72533, 72533, 72551, + 72559, 72559, 72559, 72577, 72577, 72577, 72577, 72613, 72623, 72623, + 72623, 72647, 72649, 72661, 72671, 72679, 72679, 72689, 72701, 72707, + 72719, 72727, 72733, 72739, 72739, 72739, 72767, 72767, 72767, 72767, + 72797, 72797, 72797, 72823, 72823, 72823, 72823, 72823, 72859, 72871, + 72871, 72883, 72893, 72901, 72911, 72911, 72923, 72931, 72937, 72949, + 72959, 72959, 72973, 72977, 72977, 72997, 72997, 73013, 73019, 73019, + 73039, 73043, 73043, 73063, 73063, 73079, 73079, 73091, 73091, 73091, + 73091, 73127, 73133, 73141, 73141, 73141, 73141, 73141, 73181, 73189, + 73189, 73189, 73189, 73189, 73189, 73237, 73243, 73243, 73259, 73259, + 73277, 73277, 73291, 73303, 73309, 73309, 73327, 73331, 73331, 73351, + 73351, 73363, 73369, 73379, 73387, 73387, 73387, 73387, 73421, 73421, + 73433, 73433, 73453, 73459, 73471, 73477, 73483, 73483, 73483, 73483, + 73517, 73523, 73529, 73529, 73547, 73553, 73561, 73571, 73583, 73589, + 73597, 73607, 73613, 73613, 73613, 73637, 73643, 73651, 73651, 73651, + 73679, 73681, 73693, 73699, 73709, 73709, 73727, 73727, 73727, 73751, + 73757, 73757, 73771, 73783, 73783, 73783, 73783, 73783, 73823, 73823, + 73823, 73847, 73849, 73859, 73867, 73877, 73883, 73883, 73897, 73907, + 73907, 73907, 73907, 73943, 73951, 73951, 73961, 73973, 73973, 73973, + 73999, 73999, 73999, 74021, 74027, 74027, 74047, 74051, 74051, 74071, + 74077, 74077, 74093, 74101, 74101, 74101, 74101, 74131, 74143, 74149, + 74159, 74167, 74167, 74177, 74189, 74197, 74203, 74209, 74219, 74231, + 74231, 74231, 74231, 74257, 74257, 74279, 74287, 74293, 74297, 74311, + 74317, 74323, 74323, 74323, 74323, 74357, 74363, 74363, 74383, 74383, + 74383, 74383, 74413, 74419, 74419, 74419, 74441, 74453, 74453, 74471, + 74471, 74471, 74489, 74489, 74509, 74509, 74527, 74531, 74531, 74551, + 74551, 74567, 74573, 74573, 74587, 74597, 74597, 74611, 74623, 74623, + 74623, 74623, 74653, 74653, 74653, 74653, 74687, 74687, 74699, 
74707, + 74719, 74719, 74731, 74731, 74747, 74759, 74761, 74771, 74779, 74779, + 74797, 74797, 74797, 74821, 74831, 74831, 74843, 74843, 74861, 74869, + 74873, 74887, 74891, 74903, 74903, 74903, 74923, 74933, 74941, 74941, + 74959, 74959, 74959, 74959, 74959, 74959, 74959, 75013, 75017, 75029, + 75037, 75041, 75041, 75041, 75041, 75079, 75083, 75083, 75083, 75109, + 75109, 75109, 75133, 75133, 75149, 75149, 75167, 75169, 75181, 75181, + 75193, 75193, 75211, 75223, 75227, 75239, 75239, 75253, 75253, 75269, + 75277, 75277, 75289, 75289, 75307, 75307, 75323, 75329, 75337, 75347, + 75353, 75367, 75367, 75377, 75391, 75391, 75407, 75407, 75407, 75431, + 75437, 75437, 75437, 75437, 75437, 75479, 75479, 75479, 75503, 75511, + 75511, 75527, 75533, 75541, 75541, 75557, 75557, 75571, 75583, 75583, + 75583, 75583, 75611, 75619, 75629, 75629, 75641, 75653, 75659, 75659, + 75679, 75683, 75689, 75703, 75709, 75709, 75721, 75731, 75743, 75743, + 75743, 75767, 75773, 75781, 75787, 75797, 75797, 75797, 75821, 75821, + 75833, 75833, 75853, 75853, 75869, 75869, 75883, 75883, 75883, 75883, + 75913, 75913, 75931, 75941, 75941, 75941, 75967, 75967, 75983, 75991, + 75997, 76003, 76003, 76003, 76031, 76039, 76039, 76039, 76039, 76039, + 76079, 76081, 76091, 76103, 76103, 76103, 76123, 76129, 76129, 76147, + 76159, 76163, 76163, 76163, 76163, 76163, 76207, 76213, 76213, 76231, + 76231, 76243, 76253, 76261, 76261, 76261, 76283, 76289, 76303, 76303, + 76303, 76303, 76333, 76343, 76343, 76343, 76367, 76369, 76379, 76387, + 76387, 76403, 76403, 76423, 76423, 76423, 76441, 76441, 76463, 76471, + 76471, 76487, 76493, 76493, 76511, 76519, 76519, 76519, 76543, 76543, + 76543, 76561, 76561, 76579, 76579, 76597, 76607, 76607, 76607, 76631, + 76631, 76631, 76651, 76651, 76667, 76679, 76679, 76679, 76697, 76697, + 76717, 76717, 76733, 76733, 76733, 76757, 76757, 76771, 76781, 76781, + 76781, 76801, 76801, 76819, 76831, 76837, 76847, 76847, 76847, 76871, + 76873, 76883, 76883, 76883, 76907, 76919, 
76919, 76919, 76943, 76949, + 76949, 76963, 76963, 76963, 76991, 76991, 77003, 77003, 77023, 77029, + 77029, 77047, 77047, 77047, 77069, 77069, 77081, 77093, 77101, 77101, + 77101, 77101, 77101, 77141, 77141, 77153, 77167, 77171, 77171, 77191, + 77191, 77201, 77213, 77213, 77213, 77239, 77243, 77249, 77263, 77269, + 77279, 77279, 77291, 77291, 77291, 77317, 77323, 77323, 77339, 77351, + 77359, 77359, 77369, 77383, 77383, 77383, 77383, 77383, 77419, 77431, + 77431, 77447, 77447, 77447, 77471, 77479, 77479, 77491, 77491, 77509, + 77513, 77527, 77527, 77543, 77551, 77557, 77563, 77573, 77573, 77591, + 77591, 77591, 77611, 77621, 77621, 77621, 77647, 77647, 77659, 77659, + 77659, 77687, 77689, 77699, 77711, 77719, 77723, 77731, 77743, 77747, + 77747, 77761, 77773, 77783, 77783, 77797, 77801, 77813, 77813, 77813, + 77839, 77839, 77849, 77863, 77867, 77867, 77867, 77893, 77899, 77899, + 77899, 77899, 77933, 77933, 77951, 77951, 77951, 77969, 77983, 77983, + 77999, 78007, 78007, 78017, 78031, 78031, 78041, 78049, 78059, 78059, + 78079, 78079, 78079, 78101, 78101, 78101, 78121, 78121, 78139, 78139, + 78157, 78167, 78173, 78179, 78191, 78193, 78203, 78203, 78203, 78229, + 78233, 78241, 78241, 78259, 78259, 78277, 78283, 78283, 78301, 78311, + 78317, 78317, 78317, 78341, 78347, 78347, 78367, 78367, 78367, 78367, + 78367, 78401, 78401, 78401, 78427, 78439, 78439, 78439, 78439, 78467, + 78479, 78487, 78487, 78497, 78511, 78517, 78517, 78517, 78541, 78541, + 78553, 78553, 78571, 78583, 78583, 78593, 78607, 78607, 78623, 78623, + 78623, 78643, 78653, 78653, 78653, 78653, 78653, 78691, 78697, 78707, + 78713, 78721, 78721, 78737, 78737, 78737, 78737, 78737, 78781, 78791, + 78797, 78803, 78809, 78823, 78823, 78839, 78839, 78853, 78857, 78857, + 78877, 78887, 78893, 78901, 78901, 78919, 78919, 78929, 78941, 78941, + 78941, 78941, 78941, 78979, 78989, 78989, 78989, 78989, 78989, 79031, + 79039, 79043, 79043, 79063, 79063, 79063, 79087, 79087, 79103, 79111, + 79111, 79111, 79133, 
79139, 79151, 79159, 79159, 79159, 79181, 79187, + 79193, 79201, 79201, 79201, 79231, 79231, 79241, 79241, 79259, 79259, + 79279, 79283, 79283, 79301, 79309, 79319, 79319, 79333, 79337, 79349, + 79357, 79367, 79367, 79379, 79379, 79399, 79399, 79411, 79423, 79427, + 79433, 79433, 79451, 79451, 79451, 79451, 79481, 79493, 79493, 79493, + 79493, 79493, 79531, 79537, 79549, 79559, 79561, 79561, 79579, 79589, + 79589, 79601, 79613, 79621, 79631, 79633, 79633, 79633, 79657, 79669, + 79669, 79687, 79693, 79699, 79699, 79699, 79699, 79699, 79699, 79699, + 79757, 79757, 79769, 79777, 79777, 79777, 79801, 79813, 79823, 79829, + 79829, 79847, 79847, 79861, 79867, 79873, 79873, 79889, 79903, 79907, + 79907, 79907, 79907, 79943, 79943, 79943, 79967, 79973, 79979, 79987, + 79999, 79999, 79999, 80021, 80021, 80039, 80039, 80051, 80051, 80071, + 80077, 80077, 80077, 80077, 80111, 80111, 80111, 80111, 80141, 80149, + 80153, 80167, 80173, 80177, 80191, 80191, 80207, 80209, 80221, 80231, + 80239, 80239, 80251, 80263, 80263, 80279, 80287, 80287, 80287, 80309, + 80317, 80317, 80329, 80341, 80347, 80347, 80363, 80369, 80369, 80387, + 80387, 80407, 80407, 80407, 80429, 80429, 80447, 80449, 80449, 80471, + 80473, 80473, 80491, 80491, 80491, 80513, 80527, 80527, 80537, 80537, + 80557, 80567, 80567, 80567, 80567, 80599, 80603, 80611, 80621, 80629, + 80629, 80629, 80651, 80657, 80671, 80677, 80687, 80687, 80701, 80701, + 80713, 80713, 80713, 80737, 80749, 80749, 80761, 80761, 80783, 80789, + 80789, 80803, 80809, 80819, 80831, 80833, 80833, 80849, 80863, 80863, + 80863, 80863, 80863, 80897, 80911, 80917, 80923, 80933, 80933, 80933, + 80953, 80963, 80963, 80963, 80989, 80989, 81001, 81013, 81023, 81031, + 81031, 81047, 81049, 81049, 81071, 81077, 81083, 81083, 81101, 81101, + 81119, 81119, 81131, 81131, 81131, 81157, 81163, 81173, 81181, 81181, + 81199, 81203, 81203, 81223, 81223, 81239, 81239, 81239, 81239, 81239, + 81239, 81283, 81293, 81299, 81307, 81307, 81307, 81331, 81343, 81349, + 
81359, 81359, 81373, 81373, 81373, 81373, 81401, 81409, 81421, 81421, + 81439, 81439, 81439, 81463, 81463, 81463, 81463, 81463, 81463, 81509, + 81517, 81527, 81533, 81533, 81551, 81559, 81563, 81569, 81569, 81569, + 81569, 81569, 81611, 81619, 81629, 81637, 81647, 81649, 81649, 81671, + 81677, 81677, 81689, 81703, 81707, 81707, 81727, 81727, 81737, 81749, + 81749, 81761, 81773, 81773, 81773, 81799, 81799, 81799, 81817, 81817, + 81839, 81847, 81853, 81853, 81869, 81869, 81883, 81883, 81901, 81901, + 81919, 81919, 81931, 81943, 81943, 81953, 81967, 81973, 81973, 81973, + 81973, 82007, 82013, 82021, 82031, 82039, 82039, 82051, 82051, 82067, + 82073, 82073, 82073, 82073, 82073, 82073, 82073, 82129, 82141, 82141, + 82153, 82163, 82171, 82183, 82189, 82193, 82207, 82207, 82223, 82231, + 82237, 82241, 82241, 82261, 82267, 82279, 82279, 82279, 82301, 82307, + 82307, 82307, 82307, 82339, 82351, 82351, 82361, 82373, 82373, 82387, + 82393, 82393, 82393, 82421, 82421, 82421, 82421, 82421, 82463, 82471, + 82471, 82487, 82493, 82499, 82507, 82507, 82507, 82531, 82531, 82549, + 82559, 82567, 82571, 82571, 82591, 82591, 82601, 82613, 82619, 82619, + 82633, 82633, 82651, 82657, 82657, 82657, 82657, 82657, 82699, 82699, + 82699, 82727, 82729, 82729, 82729, 82759, 82763, 82763, 82781, 82787, + 82799, 82799, 82813, 82813, 82813, 82837, 82847, 82847, 82847, 82847, + 82847, 82883, 82891, 82903, 82903, 82913, 82913, 82913, 82939, 82939, + 82939, 82963, 82963, 82981, 82981, 82997, 83003, 83009, 83023, 83023, + 83023, 83047, 83047, 83063, 83071, 83077, 83077, 83093, 83101, 83101, + 83117, 83117, 83117, 83137, 83137, 83137, 83137, 83137, 83177, 83177, + 83177, 83207, 83207, 83221, 83231, 83233, 83243, 83243, 83257, 83269, + 83273, 83273, 83273, 83299, 83311, 83311, 83311, 83311, 83341, 83341, + 83357, 83357, 83357, 83383, 83389, 83399, 83407, 83407, 83423, 83431, + 83437, 83443, 83449, 83459, 83471, 83477, 83477, 83477, 83497, 83497, + 83497, 83497, 83497, 83537, 83537, 83557, 83563, 83563, 
83579, 83591, + 83597, 83597, 83609, 83621, 83621, 83639, 83641, 83653, 83663, 83663, + 83663, 83663, 83689, 83701, 83701, 83719, 83719, 83719, 83737, 83737, + 83737, 83761, 83773, 83777, 83791, 83791, 83791, 83813, 83813, 83813, + 83833, 83843, 83843, 83857, 83869, 83873, 83873, 83891, 83903, 83911, + 83911, 83921, 83933, 83939, 83939, 83939, 83939, 83969, 83983, 83987, + 83987, 83987, 84011, 84017, 84017, 84017, 84047, 84053, 84061, 84067, + 84067, 84067, 84089, 84089, 84089, 84089, 84127, 84131, 84143, 84143, + 84143, 84163, 84163, 84181, 84191, 84199, 84199, 84211, 84223, 84229, + 84239, 84247, 84247, 84263, 84263, 84263, 84263, 84263, 84299, 84307, + 84319, 84319, 84319, 84319, 84349, 84349, 84349, 84349, 84377, 84391, + 84391, 84407, 84407, 84421, 84431, 84437, 84443, 84449, 84463, 84467, + 84467, 84481, 84481, 84503, 84509, 84509, 84523, 84533, 84533, 84551, + 84559, 84559, 84559, 84559, 84589, 84589, 84589, 84589, 84589, 84631, + 84631, 84631, 84653, 84659, 84659, 84673, 84673, 84691, 84701, 84701, + 84719, 84719, 84731, 84737, 84751, 84751, 84761, 84761, 84761, 84787, + 84793, 84793, 84811, 84811, 84827, 84827, 84827, 84827, 84859, 84871, + 84871, 84871, 84871, 84871, 84871, 84919, 84919, 84919, 84919, 84947, + 84947, 84967, 84967, 84979, 84991, 84991, 84991, 85009, 85021, 85027, + 85037, 85037, 85049, 85061, 85061, 85061, 85087, 85093, 85103, 85109, + 85109, 85121, 85133, 85133, 85147, 85159, 85159, 85159, 85159, 85159, + 85199, 85201, 85213, 85223, 85229, 85237, 85247, 85247, 85259, 85259, + 85259, 85259, 85259, 85303, 85303, 85313, 85313, 85333, 85333, 85333, + 85333, 85363, 85369, 85381, 85381, 85381, 85381, 85411, 85411, 85429, + 85439, 85447, 85453, 85453, 85469, 85469, 85487, 85487, 85487, 85487, + 85517, 85523, 85531, 85531, 85549, 85549, 85549, 85571, 85577, 85577, + 85597, 85607, 85607, 85621, 85627, 85639, 85643, 85643, 85661, 85669, + 85669, 85669, 85691, 85703, 85711, 85717, 85717, 85733, 85733, 85751, + 85751, 85751, 85751, 85781, 85781, 
85793, 85793, 85793, 85819, 85831, + 85837, 85847, 85853, 85853, 85853, 85853, 85853, 85889, 85903, 85909, + 85909, 85909, 85933, 85933, 85933, 85933, 85933, 85933, 85933, 85991, + 85999, 85999, 86011, 86017, 86029, 86029, 86029, 86029, 86029, 86069, + 86077, 86083, 86083, 86083, 86111, 86117, 86117, 86131, 86143, 86143, + 86143, 86161, 86171, 86183, 86183, 86197, 86201, 86209, 86209, 86209, + 86239, 86243, 86249, 86263, 86269, 86269, 86287, 86293, 86297, 86311, + 86311, 86323, 86323, 86341, 86351, 86357, 86357, 86371, 86381, 86389, + 86399, 86399, 86413, 86423, 86423, 86423, 86441, 86453, 86461, 86467, + 86477, 86477, 86491, 86501, 86509, 86509, 86509, 86533, 86539, 86539, + 86539, 86561, 86573, 86579, 86587, 86599, 86599, 86599, 86599, 86629, + 86629, 86629, 86629, 86629, 86629, 86677, 86677, 86693, 86693, 86711, + 86719, 86719, 86729, 86743, 86743, 86753, 86767, 86771, 86783, 86783, + 86783, 86783, 86813, 86813, 86813, 86837, 86843, 86851, 86861, 86869, + 86869, 86869, 86869, 86869, 86869, 86869, 86927, 86929, 86939, 86951, + 86959, 86959, 86969, 86981, 86981, 86993, 86993, 87013, 87013, 87013, + 87037, 87041, 87049, 87049, 87071, 87071, 87083, 87083, 87103, 87107, + 87119, 87121, 87133, 87133, 87151, 87151, 87151, 87151, 87181, 87187, + 87187, 87187, 87211, 87223, 87223, 87223, 87223, 87253, 87257, 87257, + 87277, 87281, 87293, 87299, 87299, 87317, 87323, 87323, 87337, 87337, + 87359, 87359, 87359, 87383, 87383, 87383, 87407, 87407, 87421, 87427, + 87433, 87443, 87443, 87443, 87443, 87473, 87481, 87491, 87491, 87511, + 87517, 87523, 87523, 87541, 87547, 87559, 87559, 87559, 87583, 87589, + 87589, 87589, 87613, 87623, 87631, 87631, 87643, 87649, 87649, 87671, + 87679, 87683, 87691, 87701, 87701, 87719, 87721, 87721, 87743, 87751, + 87751, 87767, 87767, 87767, 87767, 87797, 87803, 87811, 87811, 87811, + 87833, 87833, 87853, 87853, 87869, 87877, 87887, 87887, 87887, 87911, + 87917, 87917, 87931, 87943, 87943, 87959, 87961, 87973, 87977, 87991, + 87991, 88007, 
88007, 88019, 88019, 88037, 88037, 88037, 88037, 88069, + 88079, 88079, 88093, 88093, 88093, 88117, 88117, 88129, 88129, 88129, + 88129, 88129, 88169, 88177, 88177, 88177, 88177, 88211, 88223, 88223, + 88237, 88241, 88241, 88261, 88261, 88261, 88261, 88289, 88301, 88301, + 88301, 88327, 88327, 88339, 88339, 88339, 88339, 88339, 88379, 88379, + 88397, 88397, 88411, 88423, 88427, 88427, 88427, 88427, 88463, 88471, + 88471, 88471, 88493, 88499, 88499, 88513, 88523, 88523, 88523, 88547, + 88547, 88547, 88547, 88547, 88591, 88591, 88607, 88609, 88609, 88609, + 88609, 88643, 88651, 88663, 88667, 88667, 88681, 88681, 88681, 88681, + 88681, 88721, 88729, 88741, 88747, 88747, 88747, 88771, 88771, 88789, + 88799, 88807, 88813, 88819, 88819, 88819, 88843, 88853, 88861, 88867, + 88873, 88883, 88883, 88903, 88903, 88919, 88919, 88919, 88937, 88951, + 88951, 88951, 88969, 88969, 88969, 88997, 89003, 89009, 89021, 89021, + 89021, 89041, 89051, 89057, 89071, 89071, 89087, 89087, 89101, 89107, + 89119, 89123, 89123, 89137, 89137, 89153, 89153, 89153, 89153, 89189, + 89189, 89203, 89213, 89213, 89231, 89237, 89237, 89237, 89261, 89269, + 89273, 89273, 89293, 89303, 89303, 89317, 89317, 89329, 89329, 89329, + 89329, 89363, 89371, 89381, 89387, 89399, 89399, 89413, 89417, 89431, + 89431, 89443, 89449, 89459, 89459, 89477, 89477, 89491, 89501, 89501, + 89519, 89527, 89533, 89533, 89533, 89533, 89567, 89567, 89567, 89591, + 89599, 89603, 89611, 89611, 89627, 89633, 89633, 89653, 89659, 89671, + 89671, 89681, 89689, 89689, 89689, 89689, 89689, 89689, 89689, 89689, + 89759, 89767, 89767, 89783, 89783, 89797, 89797, 89809, 89821, 89821, + 89839, 89839, 89849, 89849, 89867, 89867, 89867, 89891, 89899, 89909, + 89917, 89923, 89923, 89939, 89939, 89959, 89963, 89963, 89983, 89989, + 89989, 90007, 90011, 90023, 90031, 90031, 90031, 90053, 90059, 90071, + 90073, 90073, 90089, 90089, 90107, 90107, 90127, 90127, 90127, 90149, + 90149, 90163, 90173, 90173, 90191, 90199, 90203, 90203, 90217, 90227, 
+ 90239, 90247, 90247, 90263, 90271, 90271, 90281, 90289, 90289, 90289, + 90313, 90313, 90313, 90313, 90313, 90359, 90359, 90373, 90379, 90379, + 90397, 90407, 90407, 90407, 90407, 90439, 90439, 90439, 90439, 90469, + 90473, 90481, 90481, 90499, 90511, 90511, 90527, 90533, 90533, 90547, + 90547, 90547, 90547, 90583, 90583, 90599, 90599, 90599, 90619, 90631, + 90631, 90647, 90647, 90659, 90659, 90679, 90679, 90679, 90703, 90709, + 90709, 90709, 90731, 90731, 90749, 90749, 90749, 90749, 90749, 90787, + 90793, 90803, 90803, 90823, 90823, 90833, 90847, 90847, 90863, 90863, + 90863, 90887, 90887, 90901, 90911, 90917, 90917, 90931, 90931, 90947, + 90947, 90947, 90971, 90977, 90989, 90997, 90997, 91009, 91019, 91019, + 91033, 91033, 91033, 91033, 91033, 91079, 91081, 91081, 91099, 91099, + 91099, 91127, 91129, 91141, 91151, 91159, 91163, 91163, 91183, 91183, + 91199, 91199, 91199, 91199, 91229, 91237, 91243, 91253, 91253, 91253, + 91253, 91283, 91291, 91303, 91309, 91309, 91309, 91331, 91331, 91331, + 91331, 91367, 91373, 91381, 91387, 91397, 91397, 91411, 91423, 91423, + 91433, 91433, 91453, 91463, 91463, 91463, 91463, 91493, 91499, 91499, + 91513, 91513, 91529, 91541, 91541, 91541, 91541, 91573, 91583, 91591, + 91591, 91591, 91591, 91621, 91631, 91639, 91639, 91639, 91639, 91639, + 91673, 91673, 91691, 91703, 91711, 91711, 91711, 91733, 91733, 91733, + 91757, 91757, 91771, 91781, 91781, 91781, 91807, 91813, 91823, 91823, + 91837, 91841, 91841, 91841, 91867, 91873, 91873, 91873, 91873, 91909, + 91909, 91921, 91921, 91943, 91951, 91957, 91967, 91969, 91969, 91969, + 91997, 92003, 92009, 92009, 92009, 92033, 92041, 92051, 92051, 92051, + 92077, 92083, 92083, 92083, 92111, 92119, 92119, 92119, 92143, 92143, + 92153, 92153, 92173, 92179, 92189, 92189, 92203, 92203, 92221, 92227, + 92237, 92243, 92251, 92251, 92269, 92269, 92269, 92269, 92297, 92311, + 92317, 92317, 92333, 92333, 92347, 92357, 92363, 92369, 92383, 92387, + 92399, 92401, 92413, 92419, 92431, 92431, 92431, 
92431, 92461, 92467, + 92479, 92479, 92489, 92503, 92507, 92507, 92507, 92507, 92507, 92551, + 92557, 92567, 92569, 92581, 92581, 92593, 92593, 92593, 92623, 92627, + 92639, 92647, 92647, 92657, 92671, 92671, 92683, 92693, 92699, 92707, + 92717, 92723, 92723, 92737, 92737, 92753, 92767, 92767, 92779, 92791, + 92791, 92801, 92809, 92821, 92831, 92831, 92831, 92849, 92863, 92867, + 92867, 92867, 92893, 92899, 92899, 92899, 92927, 92927, 92941, 92951, + 92959, 92959, 92959, 92959, 92987, 92993, 93001, 93001, 93001, 93001, + 93001, 93047, 93053, 93059, 93059, 93077, 93083, 93089, 93103, 93103, + 93113, 93113, 93133, 93139, 93151, 93151, 93151, 93169, 93179, 93187, + 93199, 93199, 93199, 93199, 93229, 93239, 93241, 93253, 93263, 93263, + 93263, 93287, 93287, 93287, 93307, 93319, 93323, 93329, 93337, 93337, + 93337, 93337, 93371, 93383, 93383, 93383, 93407, 93407, 93419, 93427, + 93427, 93427, 93427, 93463, 93463, 93479, 93487, 93493, 93503, 93503, + 93503, 93523, 93529, 93529, 93529, 93559, 93563, 93563, 93581, 93581, + 93581, 93607, 93607, 93607, 93629, 93637, 93637, 93637, 93637, 93637, + 93637, 93683, 93683, 93703, 93703, 93719, 93719, 93719, 93739, 93739, + 93739, 93763, 93763, 93763, 93787, 93787, 93787, 93811, 93811, 93827, + 93827, 93827, 93851, 93851, 93871, 93871, 93887, 93893, 93901, 93911, + 93913, 93923, 93923, 93941, 93949, 93949, 93967, 93971, 93983, 93983, + 93997, 94007, 94009, 94009, 94009, 94033, 94033, 94049, 94063, 94063, + 94079, 94079, 94079, 94099, 94111, 94117, 94121, 94121, 94121, 94151, + 94153, 94153, 94169, 94169, 94169, 94169, 94207, 94207, 94219, 94229, + 94229, 94229, 94253, 94261, 94261, 94273, 94273, 94291, 94291, 94309, + 94309, 94327, 94331, 94343, 94351, 94351, 94351, 94351, 94379, 94379, + 94399, 94399, 94399, 94421, 94427, 94439, 94447, 94447, 94463, 94463, + 94477, 94483, 94483, 94483, 94483, 94513, 94513, 94531, 94543, 94547, + 94559, 94561, 94573, 94583, 94583, 94597, 94603, 94613, 94621, 94621, + 94621, 94621, 94651, 94651, 
94651, 94651, 94687, 94693, 94693, 94709, + 94709, 94727, 94727, 94727, 94747, 94747, 94747, 94771, 94781, 94789, + 94793, 94793, 94811, 94823, 94823, 94837, 94847, 94849, 94849, 94849, + 94873, 94873, 94889, 94903, 94907, 94907, 94907, 94933, 94933, 94951, + 94951, 94961, 94961, 94961, 94961, 94999, 95003, 95009, 95021, 95027, + 95027, 95027, 95027, 95063, 95071, 95071, 95087, 95093, 95101, 95111, + 95111, 95111, 95131, 95143, 95143, 95153, 95153, 95153, 95177, 95191, + 95191, 95203, 95213, 95219, 95231, 95239, 95239, 95239, 95261, 95267, + 95279, 95287, 95287, 95287, 95311, 95317, 95327, 95327, 95339, 95339, + 95339, 95339, 95369, 95383, 95383, 95393, 95401, 95413, 95419, 95429, + 95429, 95443, 95443, 95461, 95471, 95479, 95483, 95483, 95483, 95507, + 95507, 95527, 95531, 95539, 95549, 95549, 95561, 95569, 95581, 95581, + 95597, 95603, 95603, 95621, 95629, 95633, 95633, 95651, 95651, 95651, + 95651, 95651, 95651, 95701, 95707, 95717, 95723, 95731, 95737, 95747, + 95747, 95747, 95773, 95783, 95791, 95791, 95803, 95813, 95819, 95819, + 95819, 95819, 95819, 95857, 95869, 95873, 95881, 95891, 95891, 95911, + 95917, 95923, 95929, 95929, 95947, 95959, 95959, 95971, 95971, 95989, + 95989, 96001, 96013, 96017, 96017, 96017, 96043, 96053, 96059, 96059, + 96079, 96079, 96079, 96097, 96097, 96097, 96097, 96097, 96137, 96149, + 96157, 96167, 96167, 96181, 96181, 96199, 96199, 96211, 96223, 96223, + 96233, 96233, 96233, 96263, 96269, 96269, 96281, 96293, 96293, 96293, + 96293, 96323, 96331, 96337, 96337, 96353, 96353, 96353, 96377, 96377, + 96377, 96401, 96401, 96419, 96431, 96431, 96443, 96451, 96461, 96469, + 96479, 96487, 96493, 96497, 96497, 96517, 96527, 96527, 96527, 96527, + 96557, 96557, 96557, 96581, 96589, 96589, 96601, 96601, 96601, 96601, + 96601, 96643, 96643, 96661, 96671, 96671, 96671, 96671, 96703, 96703, + 96703, 96703, 96731, 96739, 96749, 96757, 96763, 96769, 96779, 96787, + 96799, 96799, 96799, 96823, 96827, 96827, 96847, 96851, 96857, 96857, + 96857, 
96857, 96893, 96893, 96911, 96911, 96911, 96931, 96931, 96931, + 96959, 96959, 96973, 96979, 96989, 96997, 97007, 97007, 97021, 97021, + 97039, 97039, 97039, 97039, 97039, 97073, 97081, 97081, 97103, 97103, + 97117, 97127, 97127, 97127, 97151, 97159, 97159, 97171, 97177, 97187, + 97187, 97187, 97213, 97213, 97231, 97231, 97241, 97241, 97259, 97259, + 97259, 97283, 97283, 97303, 97303, 97303, 97327, 97327, 97327, 97327, + 97327, 97367, 97373, 97381, 97387, 97397, 97397, 97397, 97423, 97429, + 97429, 97441, 97453, 97463, 97463, 97463, 97463, 97463, 97501, 97511, + 97511, 97523, 97523, 97523, 97549, 97553, 97561, 97571, 97583, 97583, + 97583, 97607, 97613, 97613, 97613, 97613, 97613, 97651, 97651, 97651, + 97673, 97687, 97687, 97687, 97711, 97711, 97711, 97729, 97729, 97729, + 97729, 97729, 97771, 97777, 97789, 97789, 97789, 97813, 97813, 97829, + 97829, 97847, 97849, 97861, 97871, 97879, 97883, 97883, 97883, 97883, + 97919, 97927, 97931, 97943, 97943, 97943, 97967, 97973, 97973, 97987, + 97987, 97987, 98011, 98017, 98017, 98017, 98047, 98047, 98057, 98057, + 98057, 98081, 98081, 98101, 98101, 98101, 98123, 98129, 98143, 98143, + 98143, 98143, 98143, 98179, 98179, 98179, 98207, 98213, 98221, 98227, + 98227, 98227, 98251, 98257, 98269, 98269, 98269, 98269, 98299, 98299, + 98317, 98327, 98327, 98327, 98347, 98347, 98347, 98369, 98377, 98389, + 98389, 98407, 98411, 98419, 98429, 98429, 98443, 98453, 98459, 98467, + 98479, 98479, 98491, 98491, 98507, 98519, 98519, 98533, 98543, 98543, + 98543, 98563, 98573, 98573, 98573, 98597, 98597, 98597, 98621, 98627, + 98639, 98641, 98641, 98663, 98669, 98669, 98669, 98689, 98689, 98711, + 98717, 98717, 98731, 98737, 98737, 98737, 98737, 98773, 98779, 98779, + 98779, 98807, 98809, 98809, 98809, 98837, 98837, 98849, 98849, 98869, + 98873, 98887, 98893, 98899, 98911, 98911, 98927, 98929, 98939, 98947, + 98953, 98963, 98963, 98981, 98981, 98999, 98999, 99013, 99023, 99023, + 99023, 99041, 99053, 99053, 99053, 99079, 99083, 99089, 99103, 
99109, + 99119, 99119, 99133, 99139, 99149, 99149, 99149, 99173, 99181, 99191, + 99191, 99191, 99191, 99223, 99223, 99233, 99241, 99251, 99259, 99259, + 99277, 99277, 99289, 99289, 99289, 99317, 99317, 99317, 99317, 99349, + 99349, 99367, 99371, 99377, 99391, 99397, 99401, 99409, 99409, 99431, + 99439, 99439, 99439, 99439, 99469, 99469, 99487, 99487, 99497, 99497, + 99497, 99527, 99529, 99529, 99551, 99559, 99563, 99571, 99581, 99581, + 99581, 99607, 99611, 99623, 99623, 99623, 99643, 99643, 99661, 99667, + 99679, 99679, 99689, 99689, 99709, 99719, 99721, 99733, 99733, 99733, + 99733, 99767, 99767, 99767, 99787, 99793, 99793, 99809, 99823, 99829, + 99839, 99839, 99839, 99859, 99871, 99877, 99881, 99881, 99901, 99907, + 99907, 99923, 99929, 99929, 99929, 99929, 99961, 99971 +}; + + +static const unsigned ByteSizePrimesCount=sizeof(ByteSizePrimes)/sizeof(ByteSizePrimes[0]); + + +static unsigned +Bits2PrimeNBytes( + unsigned Bits, + unsigned & BytesOut) +{ + unsigned prime, bytes, temp; + + prime=0; + BytesOut=0; + + if (0 < Bits) + { + bytes=(Bits+7)/8; + do + { + if (bytes keys_; public: - BloomTest() : policy_(NewBloomFilterPolicy(10)) { } +// BloomTest() : policy_(NewBloomFilterPolicy(10)) { } + BloomTest() : policy_(NewBloomFilterPolicy2(16)) { } ~BloomTest() { delete policy_; @@ -46,8 +46,7 @@ class BloomTest { key_slices.push_back(Slice(keys_[i])); } filter_.clear(); - policy_->CreateFilter(&key_slices[0], static_cast(key_slices.size()), - &filter_); + policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_); keys_.clear(); if (kVerbose >= 2) DumpFilter(); } @@ -107,8 +106,10 @@ static int NextLength(int length) { length += 10; } else if (length < 1000) { length += 100; - } else { + } else if (length < 15000) { length += 1000; + } else { + length += 15000; } return length; } @@ -120,15 +121,15 @@ TEST(BloomTest, VaryingLengths) { int mediocre_filters = 0; int good_filters = 0; - for (int length = 1; length <= 10000; length = NextLength(length)) { + 
for (int length = 1; length <= 200000; length = NextLength(length)) { Reset(); for (int i = 0; i < length; i++) { Add(Key(i, buffer)); } Build(); - ASSERT_LE(FilterSize(), static_cast((length * 10 / 8) + 40)) - << length; +// ASSERT_LE(FilterSize(), (length * 10 / 8) + 40) << length; + ASSERT_LE(FilterSize(), (length * 16 / 8) + 40) << length; // All added keys must match for (int i = 0; i < length; i++) { diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc index ce4688617..efa481f53 100644 --- a/src/leveldb/util/cache.cc +++ b/src/leveldb/util/cache.cc @@ -19,23 +19,6 @@ Cache::~Cache() { namespace { // LRU cache implementation -// -// Cache entries have an "in_cache" boolean indicating whether the cache has a -// reference on the entry. The only ways that this can become false without the -// entry being passed to its "deleter" are via Erase(), via Insert() when -// an element with a duplicate key is inserted, or on destruction of the cache. -// -// The cache keeps two linked lists of items in the cache. All items in the -// cache are in one list or the other, and never both. Items still referenced -// by clients but erased from the cache are in neither list. The lists are: -// - in-use: contains the items currently referenced by clients, in no -// particular order. (This list is used for invariant checking. If we -// removed the check, elements that would otherwise be on this list could be -// left as disconnected singleton lists.) -// - LRU: contains the items not currently referenced by clients, in LRU order -// Elements are moved between these lists by the Ref() and Unref() methods, -// when they detect an element in the cache acquiring or losing its only -// external reference. // An entry is a variable length heap-allocated structure. Entries // are kept in a circular doubly linked list ordered by access time. @@ -47,8 +30,7 @@ struct LRUHandle { LRUHandle* prev; size_t charge; // TODO(opt): Only allow uint32_t? 
size_t key_length; - bool in_cache; // Whether entry is in the cache. - uint32_t refs; // References, including cache reference, if present. + uint32_t refs; uint32_t hash; // Hash of key(); used for fast sharding and comparisons char key_data[1]; // Beginning of key @@ -134,6 +116,7 @@ class HandleTable { LRUHandle* h = list_[i]; while (h != NULL) { LRUHandle* next = h->next_hash; + /*Slice key =*/ h->key(); // eliminate unused var warning, but allow for side-effects uint32_t hash = h->hash; LRUHandle** ptr = &new_list[hash & (new_length - 1)]; h->next_hash = *ptr; @@ -150,92 +133,98 @@ class HandleTable { }; // A single shard of sharded cache. -class LRUCache { +class LRUCache : public Cache { public: LRUCache(); ~LRUCache(); + static inline uint32_t HashSlice(const Slice& s) { + return Hash(s.data(), s.size(), 0); + } // Separate from constructor so caller can easily make an array of LRUCache void SetCapacity(size_t capacity) { capacity_ = capacity; } + size_t GetCapacity() const {return(capacity_);}; + size_t GetUsage() const {return(usage_);}; + + // Cache methods to allow direct use for single shard + virtual Cache::Handle* Insert(const Slice& key, + void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) + {return(Insert(key, HashSlice(key), value, charge, deleter));}; + + virtual Cache::Handle* Lookup(const Slice& key) + {return(Lookup(key, HashSlice(key)));}; + + virtual void Release(Cache::Handle* handle); + virtual void Erase(const Slice& key) + {Erase(key, HashSlice(key));}; + virtual void* Value(Handle* handle) { + return reinterpret_cast(handle)->value; + } + + virtual uint64_t NewId() { + return (++last_id_); + } + + virtual size_t EntryOverheadSize() {return(sizeof(LRUHandle));}; + // Like Cache methods, but with an extra "hash" parameter. 
Cache::Handle* Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value)); Cache::Handle* Lookup(const Slice& key, uint32_t hash); - void Release(Cache::Handle* handle); + void Erase(const Slice& key, uint32_t hash); - void Prune(); - size_t TotalCharge() const { - MutexLock l(&mutex_); - return usage_; - } + + virtual void Addref(Cache::Handle* handle); private: void LRU_Remove(LRUHandle* e); - void LRU_Append(LRUHandle*list, LRUHandle* e); - void Ref(LRUHandle* e); + void LRU_Append(LRUHandle* e); void Unref(LRUHandle* e); - bool FinishErase(LRUHandle* e); // Initialized before use. size_t capacity_; // mutex_ protects the following state. - mutable port::Mutex mutex_; + port::Spin spin_; size_t usage_; + uint64_t last_id_; // Dummy head of LRU list. // lru.prev is newest entry, lru.next is oldest entry. - // Entries have refs==1 and in_cache==true. LRUHandle lru_; - // Dummy head of in-use list. - // Entries are in use by clients, and have refs >= 2 and in_cache==true. - LRUHandle in_use_; - HandleTable table_; }; LRUCache::LRUCache() - : usage_(0) { - // Make empty circular linked lists. + : usage_(0), + last_id_(0) { + // Make empty circular linked list lru_.next = &lru_; lru_.prev = &lru_; - in_use_.next = &in_use_; - in_use_.prev = &in_use_; } LRUCache::~LRUCache() { - assert(in_use_.next == &in_use_); // Error if caller has an unreleased handle for (LRUHandle* e = lru_.next; e != &lru_; ) { LRUHandle* next = e->next; - assert(e->in_cache); - e->in_cache = false; - assert(e->refs == 1); // Invariant of lru_ list. + + assert(e->refs == 1); // Error if caller has an unreleased handle + Unref(e); e = next; } } -void LRUCache::Ref(LRUHandle* e) { - if (e->refs == 1 && e->in_cache) { // If on lru_ list, move to in_use_ list. - LRU_Remove(e); - LRU_Append(&in_use_, e); - } - e->refs++; -} - void LRUCache::Unref(LRUHandle* e) { assert(e->refs > 0); e->refs--; - if (e->refs == 0) { // Deallocate. 
- assert(!e->in_cache); + if (e->refs <= 0) { + usage_ -= e->charge; (*e->deleter)(e->key(), e->value); free(e); - } else if (e->in_cache && e->refs == 1) { // No longer in use; move to lru_ list. - LRU_Remove(e); - LRU_Append(&lru_, e); } } @@ -244,32 +233,43 @@ void LRUCache::LRU_Remove(LRUHandle* e) { e->prev->next = e->next; } -void LRUCache::LRU_Append(LRUHandle* list, LRUHandle* e) { - // Make "e" newest entry by inserting just before *list - e->next = list; - e->prev = list->prev; +void LRUCache::LRU_Append(LRUHandle* e) { + // Make "e" newest entry by inserting just before lru_ + e->next = &lru_; + e->prev = lru_.prev; e->prev->next = e; e->next->prev = e; } Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) { - MutexLock l(&mutex_); + SpinLock l(&spin_); LRUHandle* e = table_.Lookup(key, hash); if (e != NULL) { - Ref(e); + e->refs++; + LRU_Remove(e); + LRU_Append(e); } return reinterpret_cast(e); } void LRUCache::Release(Cache::Handle* handle) { - MutexLock l(&mutex_); + SpinLock l(&spin_); Unref(reinterpret_cast(handle)); } +void LRUCache::Addref(Cache::Handle* handle) { + SpinLock l(&spin_); + LRUHandle * e; + + e=reinterpret_cast(handle); + if (NULL!=e && 1 <= e->refs) + ++e->refs; +} + Cache::Handle* LRUCache::Insert( const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value)) { - MutexLock l(&mutex_); + SpinLock l(&spin_); LRUHandle* e = reinterpret_cast( malloc(sizeof(LRUHandle)-1 + key.size())); @@ -278,57 +278,48 @@ Cache::Handle* LRUCache::Insert( e->charge = charge; e->key_length = key.size(); e->hash = hash; - e->in_cache = false; - e->refs = 1; // for the returned handle. + e->refs = 2; // One from LRUCache, one for the returned handle memcpy(e->key_data, key.data(), key.size()); + LRU_Append(e); + usage_ += charge; - if (capacity_ > 0) { - e->refs++; // for the cache's reference. 
- e->in_cache = true; - LRU_Append(&in_use_, e); - usage_ += charge; - FinishErase(table_.Insert(e)); - } // else don't cache. (Tests use capacity_==0 to turn off caching.) - - while (usage_ > capacity_ && lru_.next != &lru_) { - LRUHandle* old = lru_.next; - assert(old->refs == 1); - bool erased = FinishErase(table_.Remove(old->key(), old->hash)); - if (!erased) { // to avoid unused variable when compiled NDEBUG - assert(erased); - } + LRUHandle* old = table_.Insert(e); + if (old != NULL) { + LRU_Remove(old); + Unref(old); } + + // Riak - matthewv: code added to remove old only if it was not active. + // Had scenarios where file cache would be largely or totally drained + // because an active object does NOT reduce usage_ upon delete. So + // the previous while loop would basically delete everything. + LRUHandle * next, * cursor; + + for (cursor=lru_.next; usage_ > capacity_ && cursor != &lru_; cursor=next) + { + // take next pointer before potentially destroying cursor + next=cursor->next; + + // only delete cursor if it will actually destruct and + // return value to usage_ + if (cursor->refs <= 1) + { + LRU_Remove(cursor); + table_.Remove(cursor->key(), cursor->hash); + Unref(cursor); + } // if + } // for + return reinterpret_cast(e); } -// If e != NULL, finish removing *e from the cache; it has already been removed -// from the hash table. Return whether e != NULL. Requires mutex_ held. 
-bool LRUCache::FinishErase(LRUHandle* e) { - if (e != NULL) { - assert(e->in_cache); - LRU_Remove(e); - e->in_cache = false; - usage_ -= e->charge; - Unref(e); - } - return e != NULL; -} - void LRUCache::Erase(const Slice& key, uint32_t hash) { - MutexLock l(&mutex_); - FinishErase(table_.Remove(key, hash)); -} - -void LRUCache::Prune() { - MutexLock l(&mutex_); - while (lru_.next != &lru_) { - LRUHandle* e = lru_.next; - assert(e->refs == 1); - bool erased = FinishErase(table_.Remove(e->key(), e->hash)); - if (!erased) { // to avoid unused variable when compiled NDEBUG - assert(erased); - } + SpinLock l(&spin_); + LRUHandle* e = table_.Remove(key, hash); + if (e != NULL) { + LRU_Remove(e); + Unref(e); } } @@ -338,7 +329,7 @@ static const int kNumShards = 1 << kNumShardBits; class ShardedLRUCache : public Cache { private: LRUCache shard_[kNumShards]; - port::Mutex id_mutex_; + port::Spin id_spin_; uint64_t last_id_; static inline uint32_t HashSlice(const Slice& s) { @@ -367,6 +358,10 @@ class ShardedLRUCache : public Cache { const uint32_t hash = HashSlice(key); return shard_[Shard(hash)].Lookup(key, hash); } + virtual void Addref(Handle* handle) { + LRUHandle* h = reinterpret_cast(handle); + shard_[Shard(h->hash)].Addref(handle); + } virtual void Release(Handle* handle) { LRUHandle* h = reinterpret_cast(handle); shard_[Shard(h->hash)].Release(handle); @@ -379,21 +374,10 @@ class ShardedLRUCache : public Cache { return reinterpret_cast(handle)->value; } virtual uint64_t NewId() { - MutexLock l(&id_mutex_); + SpinLock l(&id_spin_); return ++(last_id_); } - virtual void Prune() { - for (int s = 0; s < kNumShards; s++) { - shard_[s].Prune(); - } - } - virtual size_t TotalCharge() const { - size_t total = 0; - for (int s = 0; s < kNumShards; s++) { - total += shard_[s].TotalCharge(); - } - return total; - } + virtual size_t EntryOverheadSize() {return(sizeof(LRUHandle));}; }; } // end anonymous namespace @@ -402,4 +386,11 @@ Cache* NewLRUCache(size_t capacity) { 
return new ShardedLRUCache(capacity); } +Cache* NewLRUCache2(size_t capacity) { + LRUCache * cache; + cache=new LRUCache(); + cache->SetCapacity(capacity); + return cache; +} + } // namespace leveldb diff --git a/src/leveldb/util/cache2.cc b/src/leveldb/util/cache2.cc new file mode 100644 index 000000000..3e2e3cfd1 --- /dev/null +++ b/src/leveldb/util/cache2.cc @@ -0,0 +1,760 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// +// mildly modified version of Google's original cache.cc to support +// Riak's flexcache.cc +// + + +#include +#include +#include + +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "leveldb/atomics.h" +#include "leveldb/env.h" +#include "util/cache2.h" +#include "port/port.h" +#include "util/hash.h" +#include "util/mutexlock.h" + +namespace leveldb { + +//namespace { + +// LRU cache implementation + +// An entry is a variable length heap-allocated structure. Entries +// are kept in a circular doubly linked list ordered by access time. +struct LRUHandle2 { + void* value; + void (*deleter)(const Slice&, void* value); + LRUHandle2* next_hash; + LRUHandle2* next; + LRUHandle2* prev; + size_t charge; // TODO(opt): Only allow uint32_t? + size_t key_length; + uint32_t refs; + uint32_t hash; // Hash of key(); used for fast sharding and comparisons + time_t expire_seconds; // zero (no expire) or time when this object expires + char key_data[1]; // Beginning of key + + Slice key() const { + // For cheaper lookups, we allow a temporary Handle object + // to store a pointer to a key in "value". 
+ if (next == this) { + return *(reinterpret_cast(value)); + } else { + return Slice(key_data, key_length); + } + } +}; + +// We provide our own simple hash table since it removes a whole bunch +// of porting hacks and is also faster than some of the built-in hash +// table implementations in some of the compiler/runtime combinations +// we have tested. E.g., readrandom speeds up by ~5% over the g++ +// 4.4.3's builtin hashtable. +class HandleTable { + public: + HandleTable() : length_(0), elems_(0), list_(NULL) { Resize(); } + ~HandleTable() { delete[] list_; } + + LRUHandle2* Lookup(const Slice& key, uint32_t hash) { + return *FindPointer(key, hash); + } + + LRUHandle2* Insert(LRUHandle2* h) { + LRUHandle2** ptr = FindPointer(h->key(), h->hash); + LRUHandle2* old = *ptr; + h->next_hash = (old == NULL ? NULL : old->next_hash); + *ptr = h; + if (old == NULL) { + ++elems_; + if (elems_ > length_) { + // Since each cache entry is fairly large, we aim for a small + // average linked list length (<= 1). + Resize(); + } + } + return old; + } + + LRUHandle2* Remove(const Slice& key, uint32_t hash) { + LRUHandle2** ptr = FindPointer(key, hash); + LRUHandle2* result = *ptr; + if (result != NULL) { + *ptr = result->next_hash; + --elems_; + } + return result; + } + + private: + // The table consists of an array of buckets where each bucket is + // a linked list of cache entries that hash into the bucket. + uint32_t length_; + uint32_t elems_; + LRUHandle2** list_; + + // Return a pointer to slot that points to a cache entry that + // matches key/hash. If there is no such cache entry, return a + // pointer to the trailing slot in the corresponding linked list. 
+ LRUHandle2** FindPointer(const Slice& key, uint32_t hash) { + LRUHandle2** ptr = &list_[hash & (length_ - 1)]; + while (*ptr != NULL && + ((*ptr)->hash != hash || key != (*ptr)->key())) { + ptr = &(*ptr)->next_hash; + } + return ptr; + } + + void Resize() { + uint32_t new_length = 4; + while (new_length < elems_) { + new_length *= 2; + } + LRUHandle2** new_list = new LRUHandle2*[new_length]; + memset(new_list, 0, sizeof(new_list[0]) * new_length); + uint32_t count = 0; + for (uint32_t i = 0; i < length_; i++) { + LRUHandle2* h = list_[i]; + while (h != NULL) { + LRUHandle2* next = h->next_hash; + /*Slice key =*/ h->key(); // eliminate unused var warning, but allow for side-effects + uint32_t hash = h->hash; + LRUHandle2** ptr = &new_list[hash & (new_length - 1)]; + h->next_hash = *ptr; + *ptr = h; + h = next; + count++; + } + } + assert(elems_ == count); + delete[] list_; + list_ = new_list; + length_ = new_length; + } +}; + + +// A single shard of sharded cache. +class LRUCache2 : public Cache { + public: + LRUCache2(); + ~LRUCache2(); + + static inline uint32_t HashSlice(const Slice& s) { + return Hash(s.data(), s.size(), 0); + } + // Separate from constructor so caller can easily make an array of LRUCache2 + + // Cache2 methods to allow direct use for single shard + virtual Cache::Handle* Insert(const Slice& key, + void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) + {return(Insert(key, HashSlice(key), value, charge, deleter));}; + + virtual Cache::Handle* Lookup(const Slice& key) + {return(Lookup(key, HashSlice(key)));}; + + virtual void Release(Cache::Handle* handle); + virtual bool ReleaseOne(); + virtual void Erase(const Slice& key) + {Erase(key, HashSlice(key));}; + virtual void* Value(Handle* handle) { + return reinterpret_cast(handle)->value; + } + + virtual uint64_t NewId() { + return inc_and_fetch(&last_id_); + } + + virtual size_t EntryOverheadSize() {return(sizeof(LRUHandle2));}; + + // Like Cache methods, but with an 
extra "hash" parameter. + Cache::Handle* Insert(const Slice& key, uint32_t hash, + void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)); + Cache::Handle* Lookup(const Slice& key, uint32_t hash); + + void Erase(const Slice& key, uint32_t hash); + + virtual void Addref(Cache::Handle* handle); + + void SetParent(ShardedLRUCache2 * Parent, bool IsFileCache) + {parent_=Parent; is_file_cache_=IsFileCache;}; + + LRUHandle2 * LRUHead() {return(&lru_);} + + void LRUErase(LRUHandle2 * cursor) + { + LRU_Remove(cursor); + table_.Remove(cursor->key(), cursor->hash); + Unref(cursor); + } + + private: + void LRU_Remove(LRUHandle2* e); + void LRU_Append(LRUHandle2* e); + void Unref(LRUHandle2* e); + + // Initialized before use. + class ShardedLRUCache2 * parent_; + bool is_file_cache_; + + // mutex_ protects the following state. + port::Spin spin_; + uint64_t last_id_; + + // Dummy head of LRU list. + // lru.prev is newest entry, lru.next is oldest entry. + LRUHandle2 lru_; + + HandleTable table_; +}; + +LRUCache2::LRUCache2() + : parent_(NULL), is_file_cache_(true), last_id_(0) +{ + // Make empty circular linked list + lru_.next = &lru_; + lru_.prev = &lru_; + lru_.expire_seconds=0; +} + +LRUCache2::~LRUCache2() { + for (LRUHandle2* e = lru_.next; e != &lru_; ) { + LRUHandle2* next = e->next; + + assert(e->refs == 1); // Error if caller has an unreleased handle + Unref(e); + e = next; + } +} + +void LRUCache2::LRU_Remove(LRUHandle2* e) { + e->next->prev = e->prev; + e->prev->next = e->next; +} + +void LRUCache2::LRU_Append(LRUHandle2* e) { + // Make "e" newest entry by inserting just before lru_ + e->next = &lru_; + e->prev = lru_.prev; + e->prev->next = e; + e->next->prev = e; +} + +//Cache::Handle* LRUCache2::Lookup(const Slice& key, uint32_t hash); + +void LRUCache2::Release(Cache::Handle* handle) { + SpinLock l(&spin_); + Unref(reinterpret_cast(handle)); +} + +void LRUCache2::Addref(Cache::Handle* handle) { + SpinLock l(&spin_); + LRUHandle2 * e; + + 
e=reinterpret_cast(handle); + if (NULL!=e && 1 <= e->refs) + ++e->refs; +} + + +void LRUCache2::Erase(const Slice& key, uint32_t hash) { + SpinLock l(&spin_); + LRUHandle2* e = table_.Remove(key, hash); + if (e != NULL) { + LRU_Remove(e); + Unref(e); + } +} + +//} // end anonymous namespace + + +static const int kNumShardBits = 4; +static const int kNumShards = 1 << kNumShardBits; + +class ShardedLRUCache2 : public Cache { +public: + volatile uint64_t usage_; // cache2's usage is across all shards, + // simplifies FlexCache management + +private: + LRUCache2 shard_[kNumShards]; + port::Spin id_spin_; + DoubleCache & parent_; + bool is_file_cache_; + size_t next_shard_; + volatile uint64_t last_id_; + + static inline uint32_t HashSlice(const Slice& s) { + return Hash(s.data(), s.size(), 0); + } + + static uint32_t Shard(uint32_t hash) { + return hash >> (32 - kNumShardBits); + } + + public: + explicit ShardedLRUCache2(class DoubleCache & Parent, bool IsFileCache) + : usage_(0), parent_(Parent), is_file_cache_(IsFileCache), next_shard_(0), last_id_(0) { + for (int s = 0; s < kNumShards; s++) + { + shard_[s].SetParent(this, IsFileCache); + } + + } + virtual ~ShardedLRUCache2() { } + volatile uint64_t GetUsage() const {return(usage_);}; + volatile uint64_t * GetUsagePtr() {return(&usage_);}; + volatile uint64_t GetCapacity() {return(parent_.GetCapacity(is_file_cache_));} + time_t GetFileTimeout() {return(parent_.GetFileTimeout());}; + + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) { + const uint32_t hash = HashSlice(key); + return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter); + } + virtual Handle* Lookup(const Slice& key) { + const uint32_t hash = HashSlice(key); + return shard_[Shard(hash)].Lookup(key, hash); + } + virtual void Addref(Handle* handle) { + LRUHandle2* h = reinterpret_cast(handle); + shard_[Shard(h->hash)].Addref(handle); + } + virtual void Release(Handle* handle) 
{ + LRUHandle2* h = reinterpret_cast(handle); + shard_[Shard(h->hash)].Release(handle); + } + virtual void Erase(const Slice& key) { + const uint32_t hash = HashSlice(key); + shard_[Shard(hash)].Erase(key, hash); + } + virtual void* Value(Handle* handle) { + return reinterpret_cast(handle)->value; + } + virtual uint64_t NewId() { + return inc_and_fetch(&last_id_); + } + virtual size_t EntryOverheadSize() {return(sizeof(LRUHandle2));}; + + // reduce usage of all shards to fit within current capacity limit + void Resize() + { + size_t end_shard; + bool one_deleted; + + SpinLock l(&id_spin_); + end_shard=next_shard_; + one_deleted=true; + + while((parent_.GetCapacity(is_file_cache_) < usage_) && one_deleted) + { + one_deleted=false; + + // round robin delete ... later, could delete from most full or such + // but keep simple since using spin lock + do + { + one_deleted=shard_[next_shard_].ReleaseOne(); + next_shard_=(next_shard_ +1) % kNumShards; + } while(end_shard!=next_shard_ && !one_deleted); + + } // while + + return; + + } // ShardedLRUCache2::Resize + + + // let doublecache know state of cache space + void SetFreeSpaceWarning(size_t FileMetaSize) + { + bool plenty_space; + + plenty_space=(GetUsage() + 5*FileMetaSize < GetCapacity()); + + parent_.SetPlentySpace(plenty_space); + } // SetFreeSpaceWarning + + + // Only used on file cache. 
Remove entries that are too old + void PurgeExpiredFiles() + { + if (is_file_cache_) + { + int loop; + time_t now; + + now=Env::Default()->NowMicros() / 1000000L; + + SpinLock l(&id_spin_); + + for (loop=0; loopnext; + cursor->expire_seconds <= now && cursor != shard_[loop].LRUHead(); + cursor=next) + { + // take next pointer before potentially destroying cursor + next=cursor->next; + + // only delete cursor if it will actually destruct and + // return value to usage_ + if (cursor->refs <= 1 && 0!=cursor->expire_seconds) + { + shard_[loop].LRUErase(cursor); + } // if + } // for + } // for + } // if + + return; + + } // ShardedLRUCache2::PurgeExpiredFiles + + // Walk all cache entries, calling functor Acc for each + bool + WalkCache( + CacheAccumulator & Acc) + { + int loop; + bool good(true); + + SpinLock l(&id_spin_); + + for (loop=0; loopnext; + cursor != shard_[loop].LRUHead() && good; + cursor=cursor->next) + { + good=Acc(cursor->value); + } // for + } // for + + return(good); + + } // ShardedLRUCache2::WalkCache + +}; //ShardedLRUCache2 + + +/** + * Initialize cache pair based upon current conditions + */ +DoubleCache::DoubleCache( + const Options & options) + : m_FileCache(NULL), m_BlockCache(NULL), + m_IsInternalDB(options.is_internal_db), m_PlentySpace(true), + m_Overhead(0), m_TotalAllocation(0), + m_FileTimeout(10*24*60*60), // default is 10 days + m_BlockCacheThreshold(options.block_cache_threshold), + m_SizeCachedFiles(0) +{ + // fixed allocation for recovery log and info LOG: 20M each + // (with 64 or open databases, this is a serious number) + // and fixed allocation for two write buffers + + m_Overhead=options.write_buffer_size*2 + + options.env->RecoveryMmapSize(&options) + 4096; + m_TotalAllocation=gFlexCache.GetDBCacheCapacity(m_IsInternalDB); + + if (m_Overhead < m_TotalAllocation) + m_TotalAllocation -= m_Overhead; + else + m_TotalAllocation=0; + + // build two new caches + Flush(); + +} // DoubleCache::DoubleCache + + 
+DoubleCache::~DoubleCache() +{ + delete m_FileCache; + delete m_BlockCache; + +} // DoubleCache::DoubleCache + + +/** + * Resize each of the caches based upon new global conditions + */ +void +DoubleCache::ResizeCaches() +{ + m_TotalAllocation=gFlexCache.GetDBCacheCapacity(m_IsInternalDB); + if (m_Overhead < m_TotalAllocation) + m_TotalAllocation -= m_Overhead; + else + m_TotalAllocation=0; + + // worst case is size reduction, take from block cache first + m_BlockCache->Resize(); + m_FileCache->Resize(); + + return; + +} // DoubleCache::ResizeCaches() + + +/** + * Calculate limit to file or block cache based upon global conditions + */ +size_t +DoubleCache::GetCapacity( + bool IsFileCache, + bool EstimatePageCache) +{ + size_t ret_val; + + ret_val=0; + + if (2*1024*1024L < m_TotalAllocation) + { + // file capacity is "fixed", it is always the entire + // cache allocation less minimum block size + if (IsFileCache) + { + ret_val=m_TotalAllocation - (2*1024*1024L); + } // if + + // block cache capacity is whatever file cache is not + // not using, or its minimum ... whichever is larger + else + { + uint64_t temp; + + // usage could vary between two calls, + // get it once and use same twice + temp=m_FileCache->GetUsage(); + + if (tempPurgeExpiredFiles(); + + return; + +} // DoubleCache::PurgExpiredFiles + + +// +// Definitions moved so they could access ShardedLRUCache members +// (subtle hint to Google that every object should have .h file +// because future reuse is unknowable ... 
and this ain't Java) +// +Cache::Handle* LRUCache2::Lookup(const Slice& key, uint32_t hash) { + SpinLock l(&spin_); + LRUHandle2* e = table_.Lookup(key, hash); + if (e != NULL) { + e->refs++; + LRU_Remove(e); + LRU_Append(e); + + // establish time limit on files in file cache (like 10 days) + // so they do not go stale and steal from block cache + if (is_file_cache_) + { + e->expire_seconds=Env::Default()->NowMicros() / 1000000L + + parent_->GetFileTimeout(); + } // if + } + return reinterpret_cast(e); +} + + + +// +// Definitions moved so they could access ShardedLRUCache members +// (subtle hint to Google that every object should have .h file +// because future reuse is unknowable) +// +void LRUCache2::Unref(LRUHandle2* e) { + assert(e->refs > 0); + e->refs--; + if (e->refs <= 0) { + sub_and_fetch(parent_->GetUsagePtr(), (uint64_t)e->charge); + + if (is_file_cache_) + gPerfCounters->Add(ePerfFileCacheRemove, e->charge); + else + gPerfCounters->Add(ePerfBlockCacheRemove, e->charge); + + (*e->deleter)(e->key(), e->value); + free(e); + } +} + + +Cache::Handle* LRUCache2::Insert( + const Slice& key, uint32_t hash, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) { + + size_t this_size; + + this_size=sizeof(LRUHandle2)-1 + key.size(); + LRUHandle2* e = reinterpret_cast( + malloc(this_size)); + + e->value = value; + e->deleter = deleter; + e->charge = charge + this_size; // assumes charge is always byte size + e->key_length = key.size(); + e->hash = hash; + e->refs = 2; // One from LRUCache2, one for the returned handle + e->expire_seconds=0; + memcpy(e->key_data, key.data(), key.size()); + + // establish time limit on files in file cache (like 10 days) + // so they do not go stale and steal from block cache + if (is_file_cache_) + { + e->expire_seconds=Env::Default()->NowMicros() / 1000000L + + parent_->GetFileTimeout(); + } // if + + if (is_file_cache_) + gPerfCounters->Add(ePerfFileCacheInsert, e->charge); + else + 
gPerfCounters->Add(ePerfBlockCacheInsert, e->charge); + + + { + SpinLock l(&spin_); + + LRU_Append(e); + add_and_fetch(parent_->GetUsagePtr(), (uint64_t)e->charge); + + LRUHandle2* old = table_.Insert(e); + if (old != NULL) { + LRU_Remove(old); + Unref(old); + } + } // SpinLock + + // call parent to rebalance across all shards, not just this one + if (parent_->GetCapacity() GetUsage()) + parent_->Resize(); + + // let parent adjust free space warning level + if (is_file_cache_) + parent_->SetFreeSpaceWarning(e->charge); + + + + return reinterpret_cast(e); +} + + +bool +LRUCache2::ReleaseOne() +{ + bool ret_flag; + LRUHandle2 * next, * cursor; + SpinLock lock(&spin_); + + ret_flag=false; + + for (cursor=lru_.next; !ret_flag && parent_->GetUsage() > parent_->GetCapacity() && cursor != &lru_; cursor=next) + { + // take next pointer before potentially destroying cursor + next=cursor->next; + + // only delete cursor if it will actually destruct and + // return value to usage_ + if (cursor->refs <= 1) + { + LRU_Remove(cursor); + table_.Remove(cursor->key(), cursor->hash); + Unref(cursor); + ret_flag=true; + } // if + } // for + + return(ret_flag); + +} // LRUCache2::ReleaseOne + +} // namespace leveldb + diff --git a/src/leveldb/util/cache2.h b/src/leveldb/util/cache2.h new file mode 100644 index 000000000..b3e3f8c2b --- /dev/null +++ b/src/leveldb/util/cache2.h @@ -0,0 +1,106 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Cache is an interface that maps keys to values. It has internal +// synchronization and may be safely accessed concurrently from +// multiple threads. It may automatically evict entries to make room +// for new entries. Values have a specified charge against the cache +// capacity. 
For example, a cache where the values are variable +// length strings, may use the length of the string as the charge for +// the string. +// +// A builtin cache implementation with a least-recently-used eviction +// policy is provided. Clients may use their own implementations if +// they want something more sophisticated (like scan-resistance, a +// custom eviction policy, variable cache sizing, etc.) + +// +// mildly modified version of Google's original cache.cc to support +// Riak's flexcache.cc +// + +#ifndef STORAGE_LEVELDB_INCLUDE_CACHE2_H_ +#define STORAGE_LEVELDB_INCLUDE_CACHE2_H_ + +#include +#include +#include + +#include "leveldb/atomics.h" +#include "leveldb/cache.h" +#include "leveldb/options.h" +#include "leveldb/slice.h" +#include "util/flexcache.h" + +namespace leveldb { + +class ShardedLRUCache2; + + +/** + * CacheAccumulator is an object to process values + * when walking the contents of a cache, i.e. a functor + */ +class CacheAccumulator +{ +public: + CacheAccumulator() {}; + virtual ~CacheAccumulator() {}; + + virtual bool operator()(void * Value) = 0; +}; + + +/** + * DoubleCache holds the file cache and the block cache to easy + * interactive sizing + */ +class DoubleCache +{ +public: + explicit DoubleCache(const Options & options); + virtual ~DoubleCache(); + + Cache * GetFileCache() {return((Cache *)m_FileCache);}; + Cache * GetBlockCache() {return((Cache *)m_BlockCache);}; + + void ResizeCaches(); + size_t GetCapacity(bool IsFileCache, bool EstimatePageCache=true); + time_t GetFileTimeout() {return(m_FileTimeout);}; + void SetFileTimeout(time_t Timeout) {m_FileTimeout=Timeout;}; + + void Flush(); + void SetPlentySpace(bool PlentySpace) {m_PlentySpace=PlentySpace;}; + bool GetPlentySpace() const {return(m_PlentySpace);}; + void PurgeExpiredFiles(); + + bool IsInternalDB() const {return(m_IsInternalDB);}; + + void AddFileSize(uint64_t file_size) {add_and_fetch(&m_SizeCachedFiles, file_size);}; + void SubFileSize(uint64_t file_size) 
{sub_and_fetch(&m_SizeCachedFiles, file_size);}; + +protected: + ShardedLRUCache2 * m_FileCache; //!< file cache used by db/tablecache.cc + ShardedLRUCache2 * m_BlockCache; //!< used by table/table.cc + + bool m_IsInternalDB; //!< internal db gets smaller allocation from FlexCache + bool m_PlentySpace; //!< true when lots of spare space in file cache + size_t m_Overhead; //!< reduce from allocation to better estimate limits + size_t m_TotalAllocation; + time_t m_FileTimeout; //!< seconds to allow file to stay cached. default 4 days. + + uint64_t m_BlockCacheThreshold; //!< from Options, point where block cache canNOT be + //!< sacrificed for page cache + volatile uint64_t m_SizeCachedFiles; //!< disk size of .sst files in file cache + +private: + DoubleCache(); //!< no default constructor + DoubleCache(const DoubleCache &); //!< no copy constructor + void operator=(const DoubleCache &); //!< no assignment + +}; // class DoubleCache + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_CACHE2_H_ diff --git a/src/leveldb/util/cache2_test.cc b/src/leveldb/util/cache2_test.cc new file mode 100644 index 000000000..3dbd5ee8f --- /dev/null +++ b/src/leveldb/util/cache2_test.cc @@ -0,0 +1,312 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// +// Google's cache_test.cc modified to support Riak DoubleCache +// + +#include + +#include "util/cache2.h" +#include "util/coding.h" +#include "util/testharness.h" + +namespace leveldb { + +// Conversions between numeric keys/values and the types expected by Cache. 
+static std::string EncodeKey(int k) { + std::string result; + PutFixed32(&result, k); + return result; +} +static int DecodeKey(const Slice& k) { + assert(k.size() == 4); + return DecodeFixed32(k.data()); +} +static void* EncodeValue(uintptr_t v) { return reinterpret_cast(v); } +static int DecodeValue(void* v) { return reinterpret_cast(v); } + +class CacheTest { + public: + static CacheTest* current_; + + static void Deleter(const Slice& key, void* v) { + current_->deleted_keys_.push_back(DecodeKey(key)); + current_->deleted_values_.push_back(DecodeValue(v)); + } + + static const int kOneMeg = 1024*1024L; + static const int kCacheSize = 180; // 180Mbytes is default + std::vector deleted_keys_; + std::vector deleted_values_; + Options options_; + + DoubleCache double_cache_; + + Cache* cache_; + Cache* file_; + + CacheTest() + : double_cache_(options_) + { + current_ = this; + gFlexCache.SetTotalMemory((120+kCacheSize)*kOneMeg); + double_cache_.ResizeCaches(); + cache_=double_cache_.GetBlockCache(); + file_=double_cache_.GetFileCache(); + } + + ~CacheTest() { + } + + void ResetCaches() + { + double_cache_.Flush(); + cache_=double_cache_.GetBlockCache(); + file_=double_cache_.GetFileCache(); + } + + int Lookup(int key) { + Cache::Handle* handle = cache_->Lookup(EncodeKey(key)); + const int r = (handle == NULL) ? 
-1 : DecodeValue(cache_->Value(handle)); + if (handle != NULL) { + cache_->Release(handle); + } + return r; + } + + void Insert(int key, int value, int charge = 1) { + cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge, + &CacheTest::Deleter)); + } + + void InsertFile(int key, int value, int charge = 1) { + file_->Release(file_->Insert(EncodeKey(key), EncodeValue(value), charge, + &CacheTest::Deleter)); + } + + void Erase(int key) { + cache_->Erase(EncodeKey(key)); + } +}; +CacheTest* CacheTest::current_; + +TEST(CacheTest, HitAndMiss) { + ASSERT_EQ(-1, Lookup(100)); + + Insert(100, 101); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(200, 201); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(100, 102); + ASSERT_EQ(102, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); +} + +TEST(CacheTest, Erase) { + Erase(200); + ASSERT_EQ(0, deleted_keys_.size()); + + Insert(100, 101); + Insert(200, 201); + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1, deleted_keys_.size()); +} + +TEST(CacheTest, EntriesArePinned) { + Insert(100, 101); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); + + Insert(100, 102); + Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); + ASSERT_EQ(0, deleted_keys_.size()); + + cache_->Release(h1); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + 
ASSERT_EQ(1, deleted_keys_.size()); + + cache_->Release(h2); + ASSERT_EQ(2, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[1]); + ASSERT_EQ(102, deleted_values_[1]); +} + +TEST(CacheTest, EvictionPolicy) { + Insert(100, 101, kOneMeg); + Insert(200, 201, kOneMeg); + // Frequently used entry must be kept around + for (int i = 0; i < kCacheSize + 100; i++) { + Insert(1000+i, 2000+i, kOneMeg); + ASSERT_EQ(2000+i, Lookup(1000+i)); + ASSERT_EQ(101, Lookup(100)); + } + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); +} + +TEST(CacheTest, HeavyEntries) { + // Add a bunch of light and heavy entries and then count the combined + // size of items still in the cache, which must be approximately the + // same as the total capacity. + const int kLight = 1; + const int kHeavy = 10; + int added = 0; + int index = 0; + while (added < 2*kCacheSize) { + const int weight = (index & 1) ? kLight : kHeavy; + Insert(index, 1000+index, weight*kOneMeg); + added += weight; + index++; + } + + int cached_weight = 0; + for (int i = 0; i < index; i++) { + const int weight = (i & 1 ? 
kLight : kHeavy); + int r = Lookup(i); + if (r >= 0) { + cached_weight += weight*kOneMeg; + ASSERT_EQ(1000+i, r); + } + } + ASSERT_LE(cached_weight, (kCacheSize + kCacheSize/10)*kOneMeg); +} + +TEST(CacheTest, FlushedEntries) { + int added = 0; + int index = 0; + while (added < 2*kCacheSize) { + Insert(index, 1000+index, kOneMeg); + added += 1; + index++; + } + + added=0; + while (added < kCacheSize/2) { + InsertFile(index, 1000+index, kOneMeg); + added += 1; + index++; + } + + // one insert to block cache should rebalance both + Insert(index, 1000+index, kOneMeg); + + int cached_weight = 0; + for (int i = 0; i < index; i++) { + int r = Lookup(i); + if (r >= 0) { + cached_weight += 1; + ASSERT_EQ(1000+i, r); + } + } + ASSERT_LE(cached_weight, (kCacheSize/2 + kCacheSize/10)); +} + +TEST(CacheTest, FileCacheExpire) { + time_t expire_default; + size_t beginning_size; + + ResetCaches(); + expire_default=double_cache_.GetFileTimeout(); + + // quick two second timeout + double_cache_.SetFileTimeout(2); + + // what is block cache's starting size + beginning_size=double_cache_.GetCapacity(false); + + // add bunch of stuff to file cache + int added = 0; + int index = 0; + while (added < kCacheSize/2) { + InsertFile(index, 1000+index, kOneMeg); + added += 1; + index++; + } // while + + // did file cache take away? 
+ ASSERT_GT(beginning_size-(kCacheSize/2)*kOneMeg, double_cache_.GetCapacity(false)); + + // sleep two seconds + Env::Default()->SleepForMicroseconds(2000000); + + // force time purge + double_cache_.PurgeExpiredFiles(); + + ASSERT_EQ(beginning_size, double_cache_.GetCapacity(false)); + + // add bunch of stuff to file cache with 2 second timeout + added = 0; + index = 0; + while (added < kCacheSize/4) { + InsertFile(index, 1000+index, kOneMeg); + added += 1; + index++; + } // while + + // add bunch of stuff to file cache with 5 second timeout + double_cache_.SetFileTimeout(5); + while (added < kCacheSize/2) { + InsertFile(index, 1000+index, kOneMeg); + added += 1; + index++; + } // while + + // did file cache take away? + ASSERT_GT(beginning_size-(kCacheSize/2)*kOneMeg, double_cache_.GetCapacity(false)); + + // sleep two seconds + Env::Default()->SleepForMicroseconds(2000000); + + // force time purge + double_cache_.PurgeExpiredFiles(); + + // did only half get purged + ASSERT_GT(beginning_size-(kCacheSize/4)*kOneMeg, double_cache_.GetCapacity(false)); + + // reset timeout to default + double_cache_.SetFileTimeout(expire_default); + + return; + +} // CacheTest::FileCacheExpire + + +TEST(CacheTest, NewId) { + uint64_t a = cache_->NewId(); + uint64_t b = cache_->NewId(); + ASSERT_NE(a, b); +} + +} // namespace leveldb + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/src/leveldb/util/cache_test.cc b/src/leveldb/util/cache_test.cc index 468f7a642..1a1b496db 100644 --- a/src/leveldb/util/cache_test.cc +++ b/src/leveldb/util/cache_test.cc @@ -59,11 +59,6 @@ class CacheTest { &CacheTest::Deleter)); } - Cache::Handle* InsertAndReturnHandle(int key, int value, int charge = 1) { - return cache_->Insert(EncodeKey(key), EncodeValue(value), charge, - &CacheTest::Deleter); - } - void Erase(int key) { cache_->Erase(EncodeKey(key)); } @@ -140,11 +135,7 @@ TEST(CacheTest, EntriesArePinned) { TEST(CacheTest, EvictionPolicy) { Insert(100, 
101); Insert(200, 201); - Insert(300, 301); - Cache::Handle* h = cache_->Lookup(EncodeKey(300)); - - // Frequently used entry must be kept around, - // as must things that are still in use. + // Frequently used entry must be kept around for (int i = 0; i < kCacheSize + 100; i++) { Insert(1000+i, 2000+i); ASSERT_EQ(2000+i, Lookup(1000+i)); @@ -152,25 +143,6 @@ TEST(CacheTest, EvictionPolicy) { } ASSERT_EQ(101, Lookup(100)); ASSERT_EQ(-1, Lookup(200)); - ASSERT_EQ(301, Lookup(300)); - cache_->Release(h); -} - -TEST(CacheTest, UseExceedsCacheSize) { - // Overfill the cache, keeping handles on all inserted entries. - std::vector h; - for (int i = 0; i < kCacheSize + 100; i++) { - h.push_back(InsertAndReturnHandle(1000+i, 2000+i)); - } - - // Check that all the entries can be found in the cache. - for (int i = 0; i < h.size(); i++) { - ASSERT_EQ(2000+i, Lookup(1000+i)); - } - - for (int i = 0; i < h.size(); i++) { - cache_->Release(h[i]); - } } TEST(CacheTest, HeavyEntries) { @@ -206,19 +178,6 @@ TEST(CacheTest, NewId) { ASSERT_NE(a, b); } -TEST(CacheTest, Prune) { - Insert(1, 100); - Insert(2, 200); - - Cache::Handle* handle = cache_->Lookup(EncodeKey(1)); - ASSERT_TRUE(handle); - cache_->Prune(); - cache_->Release(handle); - - ASSERT_EQ(100, Lookup(1)); - ASSERT_EQ(-1, Lookup(2)); -} - } // namespace leveldb int main(int argc, char** argv) { diff --git a/src/leveldb/util/coding.cc b/src/leveldb/util/coding.cc index 21e3186d5..e133765b1 100644 --- a/src/leveldb/util/coding.cc +++ b/src/leveldb/util/coding.cc @@ -7,29 +7,29 @@ namespace leveldb { void EncodeFixed32(char* buf, uint32_t value) { - if (port::kLittleEndian) { - memcpy(buf, &value, sizeof(value)); - } else { - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; - } +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + 
buf[3] = (value >> 24) & 0xff; +#endif } void EncodeFixed64(char* buf, uint64_t value) { - if (port::kLittleEndian) { - memcpy(buf, &value, sizeof(value)); - } else { - buf[0] = value & 0xff; - buf[1] = (value >> 8) & 0xff; - buf[2] = (value >> 16) & 0xff; - buf[3] = (value >> 24) & 0xff; - buf[4] = (value >> 32) & 0xff; - buf[5] = (value >> 40) & 0xff; - buf[6] = (value >> 48) & 0xff; - buf[7] = (value >> 56) & 0xff; - } +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif } void PutFixed32(std::string* dst, uint32_t value) { @@ -79,7 +79,7 @@ void PutVarint32(std::string* dst, uint32_t v) { } char* EncodeVarint64(char* dst, uint64_t v) { - static const int B = 128; + static const uint64_t B = 128; unsigned char* ptr = reinterpret_cast(dst); while (v >= B) { *(ptr++) = (v & (B-1)) | B; diff --git a/src/leveldb/util/coding.h b/src/leveldb/util/coding.h index 3993c4a75..af6d6d52e 100644 --- a/src/leveldb/util/coding.h +++ b/src/leveldb/util/coding.h @@ -82,6 +82,19 @@ inline uint64_t DecodeFixed64(const char* ptr) { } } +// Riak: return only lowest 8 bits of 64 bit number, +// optimization for internal key's ValueType +inline unsigned char DecodeLeastFixed64(const char * ptr) { + unsigned char ret_char; + + if (port::kLittleEndian) + ret_char=(unsigned char)*ptr; + else + ret_char=(unsigned char)*(ptr+7); + return(ret_char); +} + + // Internal routine for use by fallback path of GetVarint32Ptr extern const char* GetVarint32PtrFallback(const char* p, const char* limit, diff --git a/src/leveldb/util/coding_test.cc b/src/leveldb/util/coding_test.cc index 521541ea6..2c52b17b6 100644 --- a/src/leveldb/util/coding_test.cc +++ b/src/leveldb/util/coding_test.cc @@ -109,16 +109,16 @@ 
TEST(Coding, Varint64) { values.push_back(power); values.push_back(power-1); values.push_back(power+1); - } + }; std::string s; - for (size_t i = 0; i < values.size(); i++) { + for (int i = 0; i < values.size(); i++) { PutVarint64(&s, values[i]); } const char* p = s.data(); const char* limit = p + s.size(); - for (size_t i = 0; i < values.size(); i++) { + for (int i = 0; i < values.size(); i++) { ASSERT_TRUE(p < limit); uint64_t actual; const char* start = p; @@ -143,7 +143,7 @@ TEST(Coding, Varint32Truncation) { std::string s; PutVarint32(&s, large_value); uint32_t result; - for (size_t len = 0; len < s.size() - 1; len++) { + for (int len = 0; len < s.size() - 1; len++) { ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL); } ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL); @@ -162,7 +162,7 @@ TEST(Coding, Varint64Truncation) { std::string s; PutVarint64(&s, large_value); uint64_t result; - for (size_t len = 0; len < s.size() - 1; len++) { + for (int len = 0; len < s.size() - 1; len++) { ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL); } ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL); diff --git a/src/leveldb/util/comparator.cc b/src/leveldb/util/comparator.cc index 4b7b5724e..037d0b4f8 100644 --- a/src/leveldb/util/comparator.cc +++ b/src/leveldb/util/comparator.cc @@ -67,7 +67,7 @@ class BytewiseComparatorImpl : public Comparator { } // namespace static port::OnceType once = LEVELDB_ONCE_INIT; -static const Comparator* bytewise; +static const Comparator* bytewise = NULL; static void InitModule() { bytewise = new BytewiseComparatorImpl; @@ -78,4 +78,9 @@ const Comparator* BytewiseComparator() { return bytewise; } +void ComparatorShutdown() +{ + delete bytewise; + bytewise=NULL; +} } // namespace leveldb diff --git a/src/leveldb/util/crc32c.cc b/src/leveldb/util/crc32c.cc index b3f40eeee..d52492ca5 100644 --- a/src/leveldb/util/crc32c.cc +++ 
b/src/leveldb/util/crc32c.cc @@ -8,13 +8,16 @@ #include "util/crc32c.h" #include - -#include "port/port.h" #include "util/coding.h" namespace leveldb { namespace crc32c { +static uint32_t SoftCRC(uint32_t StartCrc, const char * BlockStart, size_t BlockSize); +static uint32_t HardCRC(uint32_t StartCrc, const char * BlockStart, size_t BlockSize); + +static uint32_t (*CrcFunction)(uint32_t, const char *, size_t)=&SoftCRC; + static const uint32_t table0_[256] = { 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, @@ -285,27 +288,22 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) { return DecodeFixed32(reinterpret_cast(p)); } -// Determine if the CPU running this program can accelerate the CRC32C -// calculation. -static bool CanAccelerateCRC32C() { - if (!port::HasAcceleratedCRC32C()) - return false; - // Double-check that the accelerated implementation functions correctly. - // port::AcceleretedCRC32C returns zero when unable to accelerate. 
- static const char kTestCRCBuffer[] = "TestCRCBuffer"; - static const char kBufSize = sizeof(kTestCRCBuffer) - 1; - static const uint32_t kTestCRCValue = 0xdcbc59fa; +uint32_t Extend(uint32_t crc, const char* buf, size_t size) +{ + return((*CrcFunction)(crc, buf, size)); +} // Extend - return port::AcceleratedCRC32C(0, kTestCRCBuffer, kBufSize) == kTestCRCValue; -} -uint32_t Extend(uint32_t crc, const char* buf, size_t size) { - static bool accelerate = CanAccelerateCRC32C(); - if (accelerate) { - return port::AcceleratedCRC32C(crc, buf, size); - } +void SwitchToHardwareCRC() {CrcFunction=&HardCRC;}; + +bool IsHardwareCRC() {return(&HardCRC==CrcFunction);}; + + +static uint32_t +SoftCRC(uint32_t crc, const char* buf, size_t size) +{ const uint8_t *p = reinterpret_cast(buf); const uint8_t *e = p + size; uint32_t l = crc ^ 0xffffffffu; @@ -347,8 +345,53 @@ uint32_t Extend(uint32_t crc, const char* buf, size_t size) { } #undef STEP4 #undef STEP1 + return l ^ 0xffffffffu; -} +} // SoftCRC + + +static uint32_t +HardCRC( + uint32_t StartCrc, + const char * BlockStart, + size_t BlockSize) +{ +#if defined(__x86_64__) + size_t fullqwords, remainder; + uint32_t ret_crc; + char * src_c; + uint64_t * src_q; + + fullqwords=BlockSize / 8; + remainder=BlockSize % 8; + + ret_crc=StartCrc ^ 0xffffffffu; + src_q=(uint64_t *)BlockStart; + + for ( ; 0!=fullqwords; --fullqwords, ++src_q) + { + __asm__ __volatile__ ( + ".byte 0xf2, 0x48, 0x0f, 0x38, 0xf1, 0xf1;" + : "=S"(ret_crc) + : "S"(ret_crc), "c"(*src_q)); + } // for + + src_c=(char *)src_q; + for ( ; 0!=remainder; --remainder, ++src_c) + { + __asm__ __volatile__ ( + ".byte 0xf2, 0x48, 0x0f, 0x38, 0xf0, 0xf1;" + : "=S"(ret_crc) + : "S"(ret_crc), "c"(*src_c)); + } // for + + return(ret_crc ^ 0xffffffffu); +#else + return(0); +#endif + +} // HardCRC + } // namespace crc32c } // namespace leveldb diff --git a/src/leveldb/util/crc32c.h b/src/leveldb/util/crc32c.h index 1d7e5c075..61253d235 100644 --- a/src/leveldb/util/crc32c.h +++ 
b/src/leveldb/util/crc32c.h @@ -21,6 +21,10 @@ inline uint32_t Value(const char* data, size_t n) { return Extend(0, data, n); } +// switch function pointer from software crc to hardware +extern void SwitchToHardwareCRC(); +extern bool IsHardwareCRC(); + static const uint32_t kMaskDelta = 0xa282ead8ul; // Return a masked representation of crc. diff --git a/src/leveldb/util/crc32c_test.cc b/src/leveldb/util/crc32c_test.cc index 4b957ee12..e87dd9f97 100644 --- a/src/leveldb/util/crc32c_test.cc +++ b/src/leveldb/util/crc32c_test.cc @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "leveldb/env.h" #include "util/crc32c.h" #include "util/testharness.h" @@ -68,5 +69,9 @@ TEST(CRC, Mask) { } // namespace leveldb int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); + + // identify and potentially switch to hardware CRC + leveldb::Env::Default(); + + return leveldb::test::RunAllTests(); } diff --git a/src/leveldb/util/db_list.cc b/src/leveldb/util/db_list.cc new file mode 100644 index 000000000..2c3de802c --- /dev/null +++ b/src/leveldb/util/db_list.cc @@ -0,0 +1,192 @@ +// ------------------------------------------------------------------- +// +// db_list.cc +// +// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// ------------------------------------------------------------------- + +#include +#include + +#include "util/db_list.h" +#include "util/mutexlock.h" + +#define __STDC_FORMAT_MACROS +#include + +namespace leveldb { + +// using singleton model from comparator.cc +static port::OnceType once = LEVELDB_ONCE_INIT; +static DBListImpl * dblist=NULL; + +static void InitModule() +{ + dblist=new DBListImpl; +} // InitModule + + +DBListImpl * DBList() +{ + port::InitOnce(&once, InitModule); + return(dblist); + +} // DBList + + +void +DBListShutdown() +{ + // retrieve pointer to handle any initialization/shutdown races + DBList(); + delete dblist; + + return; + +} // DBListShutdown + + + +DBListImpl::DBListImpl() + : m_UserDBCount(0), m_InternalDBCount(0) +{ +} // DBListImpl::DBListImpl + + +bool +DBListImpl::AddDB( + DBImpl * Dbase, + bool IsInternal) +{ + bool ret_flag; + + SpinLock lock(&m_Lock); + + if (IsInternal) + { + ret_flag=m_InternalDBs.insert(Dbase).second; + m_InternalDBCount=m_InternalDBs.size(); + } // if + else + { + ret_flag=m_UserDBs.insert(Dbase).second; + m_UserDBCount=m_UserDBs.size(); + } // else + + return(ret_flag); + +} // DBListImpl::AddDB + + +void +DBListImpl::ReleaseDB( + DBImpl * Dbase, + bool IsInternal) +{ + db_set_t::iterator it; + SpinLock lock(&m_Lock); + + if (IsInternal) + { + it=m_InternalDBs.find(Dbase); + if (m_InternalDBs.end()!=it) + { + m_InternalDBs.erase(it); + } // if + m_InternalDBCount=m_InternalDBs.size(); + } // if + else + { + it=m_UserDBs.find(Dbase); + if (m_UserDBs.end()!=it) + { + m_UserDBs.erase(it); + } // if + m_UserDBCount=m_UserDBs.size(); + } // else + + return; + +} // DBListImpl::ReleaseDB + + +size_t +DBListImpl::GetDBCount( + bool IsInternal) +{ + size_t ret_val; + + if (IsInternal) + ret_val=m_InternalDBCount; + else + ret_val=m_UserDBCount; + + return(ret_val); + +} // DBListImpl::GetDBCount + + +void +DBListImpl::ScanDBs( + bool IsInternal, + void (DBImpl::* Function)()) +{ + db_set_t::iterator it, first, 
last; + SpinLock lock(&m_Lock); + + size_t count; + + // for_each() would have been fun, but setup deadlock + // scenarios + // Now we have a race condition of us using the db object + // while someone is shutting it down ... hmm + if (IsInternal) + { + first=m_InternalDBs.begin(); + last=m_InternalDBs.end(); + count=m_InternalDBs.size(); + } // if + else + { + first=m_UserDBs.begin(); + last=m_UserDBs.end(); + count=m_UserDBs.size(); + } // else + +#if 0 // for debugging ... sometimes + m_Lock.Unlock(); /// might not be needed now + syslog(LOG_ERR, "count %zd, total memory %" PRIu64 ", db cache size %" PRIu64 ", internal %d", + count, gFlexCache.GetTotalMemory(), gFlexCache.GetDBCacheCapacity(IsInternal), + (int)IsInternal); + m_Lock.Lock(); +#else + count=count*2; // kill off compiler warning +#endif + + // call member function of each database + for (it=first; last!=it; ++it) + { + // must protect list from db add/delete during scan, leave locks + ((*it)->*Function)(); + } // for + + return; + +} // DBListImpl::ScanDBs + +} // namespace leveldb diff --git a/src/leveldb/util/db_list.h b/src/leveldb/util/db_list.h new file mode 100644 index 000000000..709ab1ab5 --- /dev/null +++ b/src/leveldb/util/db_list.h @@ -0,0 +1,67 @@ +// ------------------------------------------------------------------- +// +// db_list.h +// +// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include "db/db_impl.h" +#include "port/port.h" + + +namespace leveldb +{ + +/** + * DBList: class to provide management access to all + * open databases (Riak vnodes) + */ +class DBListImpl +{ +protected: + typedef std::set db_set_t; + + port::Spin m_Lock; //!< thread protection for set + db_set_t m_UserDBs; //!< set of pointers for user db + db_set_t m_InternalDBs; //!< Riak internal dbs + + volatile size_t m_UserDBCount; //!< m_UserDBs size() for non-blocking retrieval + volatile size_t m_InternalDBCount; //!< m_InternalDBs size() for non-blocking retrieval + +public: + DBListImpl(); + virtual ~DBListImpl() {}; + + bool AddDB(DBImpl *, bool is_internal); + void ReleaseDB(DBImpl *, bool is_internal); + + size_t GetDBCount(bool is_internal); + + void ScanDBs(bool is_internal, void (DBImpl::*)()); + +}; // class DBListImpl + + +// Universal access to dblist ... initialization order independent +DBListImpl * DBList(); + +// cleanup memory, mostly for valgrind +void DBListShutdown(); + + +} // namespace leveldb diff --git a/src/leveldb/util/env.cc b/src/leveldb/util/env.cc index c58a0821e..5311c3883 100644 --- a/src/leveldb/util/env.cc +++ b/src/leveldb/util/env.cc @@ -2,17 +2,17 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include + #include "leveldb/env.h" +#include "leveldb/perf_count.h" namespace leveldb { Env::~Env() { } -Status Env::NewAppendableFile(const std::string& fname, WritableFile** result) { - return Status::NotSupported("NewAppendableFile", fname); -} - SequentialFile::~SequentialFile() { } @@ -29,19 +29,39 @@ FileLock::~FileLock() { } void Log(Logger* info_log, const char* format, ...) 
{ - if (info_log != NULL) { va_list ap; + va_start(ap, format); - info_log->Logv(format, ap); + + if (info_log != NULL) + { + info_log->Logv(format, ap); + } // if + else + { + // perf counter is clue to check syslog + vsyslog(LOG_ERR, format, ap); + gPerfCounters->Inc(ePerfSyslogWrite); + } // else + va_end(ap); - } } static Status DoWriteStringToFile(Env* env, const Slice& data, const std::string& fname, bool should_sync) { WritableFile* file; - Status s = env->NewWritableFile(fname, &file); + size_t map_size; + + // adjust file map size to speed up corruption test's + // writing of 40M files, but keep small for normal + // case of writing CURRENT file (code will round up to page_size) + if (gMapSizeNewWritableFile(fname, &file, map_size); if (!s.ok()) { return s; } diff --git a/src/leveldb/util/env_posix.cc b/src/leveldb/util/env_posix.cc index f77918313..b446c5a30 100644 --- a/src/leveldb/util/env_posix.cc +++ b/src/leveldb/util/env_posix.cc @@ -1,8 +1,9 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#if !defined(LEVELDB_PLATFORM_WINDOWS) +#include +#include #include #include #include @@ -10,84 +11,88 @@ #include #include #include +#include #include -#include #include #include #include +#include #include #include -#include -#include -#include +#if defined(LEVELDB_PLATFORM_ANDROID) +#include +#endif +#include "leveldb/atomics.h" #include "leveldb/env.h" +#include "leveldb/filter_policy.h" #include "leveldb/slice.h" #include "port/port.h" +#include "util/crc32c.h" +#include "util/db_list.h" +#include "util/hot_threads.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/posix_logger.h" -#include "util/env_posix_test_helper.h" +#include "util/thread_tasks.h" +#include "util/throttle.h" +#include "db/dbformat.h" +#include "leveldb/perf_count.h" + + +#if _XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L +#define HAVE_FADVISE +#endif namespace leveldb { -namespace { +volatile size_t gMapSize=20*1024*1024L; -static int open_read_only_file_limit = -1; -static int mmap_limit = -1; +// ugly global used to change fadvise behaviour +bool gFadviseWillNeed=false; + +namespace { static Status IOError(const std::string& context, int err_number) { return Status::IOError(context, strerror(err_number)); } -// Helper class to limit resource usage to avoid exhaustion. -// Currently used to limit read-only file descriptors and mmap file usage -// so that we do not end up running out of file descriptors, virtual memory, -// or running into kernel performance problems for very large databases. -class Limiter { - public: - // Limit maximum number of resources to |n|. - Limiter(intptr_t n) { - SetAllowed(n); - } +// background routines to close and/or unmap files +static void BGFileUnmapper2(void* file_info); - // If another resource is available, acquire it and return true. - // Else return false. 
- bool Acquire() { - if (GetAllowed() <= 0) { - return false; - } - MutexLock l(&mu_); - intptr_t x = GetAllowed(); - if (x <= 0) { - return false; - } else { - SetAllowed(x - 1); - return true; - } - } +// data needed by background routines for close/unmap +class BGCloseInfo : public ThreadTask +{ +public: + int fd_; + void * base_; + size_t offset_; + size_t length_; + volatile uint64_t * ref_count_; + uint64_t metadata_; - // Release a resource acquired by a previous call to Acquire() that returned - // true. - void Release() { - MutexLock l(&mu_); - SetAllowed(GetAllowed() + 1); - } + BGCloseInfo(int fd, void * base, size_t offset, size_t length, + volatile uint64_t * ref_count, uint64_t metadata) + : fd_(fd), base_(base), offset_(offset), length_(length), + ref_count_(ref_count), metadata_(metadata) + { + // reference count of independent file object count + if (NULL!=ref_count_) + inc_and_fetch(ref_count_); - private: - port::Mutex mu_; - port::AtomicPointer allowed_; + // reference count of threads/paths using this object + // (because there is a direct path and a threaded path usage) + RefInc(); + }; - intptr_t GetAllowed() const { - return reinterpret_cast(allowed_.Acquire_Load()); - } + virtual ~BGCloseInfo() {}; - // REQUIRES: mu_ must be held - void SetAllowed(intptr_t v) { - allowed_.Release_Store(reinterpret_cast(v)); - } + virtual void operator()() {BGFileUnmapper2(this);}; + +private: + BGCloseInfo(); + BGCloseInfo(const BGCloseInfo &); + BGCloseInfo & operator=(const BGCloseInfo &); - Limiter(const Limiter&); - void operator=(const Limiter&); }; class PosixSequentialFile: public SequentialFile { @@ -121,183 +126,384 @@ class PosixSequentialFile: public SequentialFile { } return Status::OK(); } - - virtual std::string GetName() const { return filename_; } }; // pread() based random-access class PosixRandomAccessFile: public RandomAccessFile { private: std::string filename_; - bool temporary_fd_; // If true, fd_ is -1 and we open on every read. 
int fd_; - Limiter* limiter_; + bool is_compaction_; + uint64_t file_size_; public: - PosixRandomAccessFile(const std::string& fname, int fd, Limiter* limiter) - : filename_(fname), fd_(fd), limiter_(limiter) { - temporary_fd_ = !limiter->Acquire(); - if (temporary_fd_) { - // Open file on every access. - close(fd_); - fd_ = -1; - } + PosixRandomAccessFile(const std::string& fname, int fd) + : filename_(fname), fd_(fd), is_compaction_(false), file_size_(0) + { +#if defined(HAVE_FADVISE) + posix_fadvise(fd_, 0, file_size_, POSIX_FADV_RANDOM); +#endif + gPerfCounters->Inc(ePerfROFileOpen); } + virtual ~PosixRandomAccessFile() + { + if (is_compaction_) + { +#if defined(HAVE_FADVISE) + posix_fadvise(fd_, 0, file_size_, POSIX_FADV_DONTNEED); +#endif + } // if - virtual ~PosixRandomAccessFile() { - if (!temporary_fd_) { - close(fd_); - limiter_->Release(); - } + gPerfCounters->Inc(ePerfROFileClose); + close(fd_); } virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { - int fd = fd_; - if (temporary_fd_) { - fd = open(filename_.c_str(), O_RDONLY); - if (fd < 0) { - return IOError(filename_, errno); - } - } - Status s; - ssize_t r = pread(fd, scratch, n, static_cast(offset)); + ssize_t r = pread(fd_, scratch, n, static_cast(offset)); *result = Slice(scratch, (r < 0) ? 0 : r); if (r < 0) { // An error: return a non-ok status s = IOError(filename_, errno); } - if (temporary_fd_) { - // Close the temporary file descriptor opened earlier. 
- close(fd); - } return s; } - virtual std::string GetName() const { return filename_; } + virtual void SetForCompaction(uint64_t file_size) + { + is_compaction_=true; + file_size_=file_size; +#if defined(HAVE_FADVISE) + posix_fadvise(fd_, 0, file_size_, POSIX_FADV_SEQUENTIAL); +#endif + + }; + + // Riak addition: size of this structure in bytes + virtual size_t ObjectSize() {return(sizeof(PosixRandomAccessFile)+filename_.length());}; + }; -// mmap() based random-access -class PosixMmapReadableFile: public RandomAccessFile { + +// We preallocate up to an extra megabyte and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class PosixMmapFile : public WritableFile { private: std::string filename_; - void* mmapped_region_; - size_t length_; - Limiter* limiter_; + int fd_; + size_t page_size_; + size_t map_size_; // How much extra memory to map at a time + char* base_; // The mapped region + char* limit_; // Limit of the mapped region + char* dst_; // Where to write next (in range [base_,limit_]) + char* last_sync_; // Where have we synced up to + uint64_t file_offset_; // Offset of base_ in file + uint64_t metadata_offset_; // Offset where sst metadata starts, or zero + bool pending_sync_; // Have we done an munmap of unsynced data? + bool is_async_; // can this file process in background + volatile uint64_t * ref_count_; // alternative to std:shared_ptr that is thread safe everywhere - public: - // base[0,length-1] contains the mmapped contents of the file. 
- PosixMmapReadableFile(const std::string& fname, void* base, size_t length, - Limiter* limiter) - : filename_(fname), mmapped_region_(base), length_(length), - limiter_(limiter) { + // Roundup x to a multiple of y + static size_t Roundup(size_t x, size_t y) { + return ((x + y - 1) / y) * y; } - virtual ~PosixMmapReadableFile() { - munmap(mmapped_region_, length_); - limiter_->Release(); - } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; - if (offset + n > length_) { - *result = Slice(); - s = IOError(filename_, EINVAL); - } else { - *result = Slice(reinterpret_cast(mmapped_region_) + offset, n); - } + size_t TruncateToPageBoundary(size_t s) { + s -= (s & (page_size_ - 1)); + assert((s % page_size_) == 0); return s; } - virtual std::string GetName() const { return filename_; } -}; + bool UnmapCurrentRegion() { + bool result = true; + if (base_ != NULL) { + if (last_sync_ < limit_) { + // Defer syncing this data until next Sync() call, if any + pending_sync_ = true; + } -class PosixWritableFile : public WritableFile { - private: - std::string filename_; - FILE* file_; + + // write only files can perform operations async, but not + // files that might re-open and read again soon + if (!is_async_) + { + BGCloseInfo * ptr=new BGCloseInfo(fd_, base_, file_offset_, limit_-base_, + NULL, metadata_offset_); + BGFileUnmapper2(ptr); + } // if + + // called from user thread, move these operations to background + // queue + else + { + BGCloseInfo * ptr=new BGCloseInfo(fd_, base_, file_offset_, limit_-base_, + ref_count_, metadata_offset_); + gWriteThreads->Submit(ptr); + } // else + + file_offset_ += limit_ - base_; + base_ = NULL; + limit_ = NULL; + last_sync_ = NULL; + dst_ = NULL; + + } + + return result; + } + + bool MapNewRegion() { + size_t offset_adjust; + + // append mode file might not have file_offset_ on a page boundry + offset_adjust=file_offset_ % page_size_; + if (0!=offset_adjust) + 
file_offset_-=offset_adjust; + + assert(base_ == NULL); + if (ftruncate(fd_, file_offset_ + map_size_) < 0) { + return false; + } + void* ptr = mmap(NULL, map_size_, PROT_WRITE, MAP_SHARED, + fd_, file_offset_); + if (ptr == MAP_FAILED) { + return false; + } + base_ = reinterpret_cast(ptr); + limit_ = base_ + map_size_; + dst_ = base_ + offset_adjust; + last_sync_ = base_; + return true; + } public: - PosixWritableFile(const std::string& fname, FILE* f) - : filename_(fname), file_(f) { } + PosixMmapFile(const std::string& fname, int fd, + size_t page_size, size_t file_offset=0L, + bool is_async=false, + size_t map_size=gMapSize) + : filename_(fname), + fd_(fd), + page_size_(page_size), + map_size_(Roundup(map_size, page_size)), + base_(NULL), + limit_(NULL), + dst_(NULL), + last_sync_(NULL), + file_offset_(file_offset), + metadata_offset_(0), + pending_sync_(false), + is_async_(is_async), + ref_count_(NULL) + { + assert((page_size & (page_size - 1)) == 0); - ~PosixWritableFile() { - if (file_ != NULL) { - // Ignoring any potential errors - fclose(file_); + if (is_async_) + { + ref_count_=new volatile uint64_t[2]; + *ref_count_=1; // one ref count for PosixMmapFile object + *(ref_count_+1)=0; // filesize + } // if + + // when global set, make entire file use FADV_WILLNEED + if (gFadviseWillNeed) + metadata_offset_=1; + + gPerfCounters->Inc(ePerfRWFileOpen); + } + + ~PosixMmapFile() { + if (fd_ >= 0) { + PosixMmapFile::Close(); } } virtual Status Append(const Slice& data) { - size_t r = fwrite_unlocked(data.data(), 1, data.size(), file_); - if (r != data.size()) { - return IOError(filename_, errno); + const char* src = data.data(); + size_t left = data.size(); + while (left > 0) { + assert(base_ <= dst_); + assert(dst_ <= limit_); + size_t avail = limit_ - dst_; + if (avail == 0) { + if (!UnmapCurrentRegion() || + !MapNewRegion()) { + return IOError(filename_, errno); + } + } + + size_t n = (left <= avail) ? 
left : avail; + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; } return Status::OK(); } virtual Status Close() { - Status result; - if (fclose(file_) != 0) { - result = IOError(filename_, errno); + Status s; + size_t file_length; + int ret_val; + + + // compute actual file length before final unmap + file_length=file_offset_ + (dst_ - base_); + + if (!UnmapCurrentRegion()) { + s = IOError(filename_, errno); } - file_ = NULL; - return result; + + // hard code + if (!is_async_) + { + ret_val=ftruncate(fd_, file_length); + if (0!=ret_val) + { + syslog(LOG_ERR,"Close ftruncate failed [%d, %m]", errno); + s = IOError(filename_, errno); + } // if + + ret_val=close(fd_); + } // if + + // async close + else + { + *(ref_count_ +1)=file_length; + ret_val=ReleaseRef(ref_count_, fd_); + + // retry once if failed + if (0!=ret_val) + { + Env::Default()->SleepForMicroseconds(500000); + ret_val=ReleaseRef(ref_count_, fd_); + if (0!=ret_val) + { + syslog(LOG_ERR,"ReleaseRef failed in Close"); + s = IOError(filename_, errno); + delete [] ref_count_; + + // force close + ret_val=close(fd_); + } // if + } // if + } // else + + fd_ = -1; + ref_count_=NULL; + base_ = NULL; + limit_ = NULL; + return s; } virtual Status Flush() { - if (fflush_unlocked(file_) != 0) { - return IOError(filename_, errno); - } return Status::OK(); } - Status SyncDirIfManifest() { - const char* f = filename_.c_str(); - const char* sep = strrchr(f, '/'); - Slice basename; - std::string dir; - if (sep == NULL) { - dir = "."; - basename = f; - } else { - dir = std::string(f, sep - f); - basename = sep + 1; - } + virtual Status Sync() { Status s; - if (basename.starts_with("MANIFEST")) { - int fd = open(dir.c_str(), O_RDONLY); - if (fd < 0) { - s = IOError(dir, errno); - } else { - if (fsync(fd) < 0 && errno != EINVAL) { - s = IOError(dir, errno); - } - close(fd); + + if (pending_sync_) { + // Some unmapped data was not synced + pending_sync_ = false; + if (fdatasync(fd_) < 0) { + s = 
IOError(filename_, errno); } } + + if (dst_ > last_sync_) { + // Find the beginnings of the pages that contain the first and last + // bytes to be synced. + size_t p1 = TruncateToPageBoundary(last_sync_ - base_); + size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); + last_sync_ = dst_; + if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { + s = IOError(filename_, errno); + } + } + return s; } - virtual Status Sync() { - // Ensure new files referred to by the manifest are in the filesystem. - Status s = SyncDirIfManifest(); - if (!s.ok()) { - return s; - } - if (fflush_unlocked(file_) != 0 || - fdatasync(fileno(file_)) != 0) { - s = Status::IOError(filename_, strerror(errno)); - } - return s; - } + virtual void SetMetadataOffset(uint64_t Metadata) + { + // when global set, make entire file use FADV_WILLNEED, + // so ignore this setting + if (!gFadviseWillNeed && 1!=metadata_offset_) + metadata_offset_=Metadata; + } // SetMetadataOffset + + + // if std::shared_ptr was guaranteed thread safe everywhere + // the following function would be best written differently + static int ReleaseRef(volatile uint64_t * Count, int File) + { + bool good; + + good=true; + if (NULL!=Count) + { + int ret_val; + + ret_val=dec_and_fetch(Count); + if (0==ret_val) + { + ret_val=ftruncate(File, *(Count +1)); + if (0!=ret_val) + { + syslog(LOG_ERR,"ReleaseRef ftruncate failed [%d, %m]", errno); + gPerfCounters->Inc(ePerfBGWriteError); + good=false; + } // if + + if (good) + { + ret_val=close(File); + if (0==ret_val) + { + gPerfCounters->Inc(ePerfRWFileClose); + } // if + else + { + syslog(LOG_ERR,"ReleaseRef close failed [%d, %m]", errno); + gPerfCounters->Inc(ePerfBGWriteError); + good=false; + } // else + + } // if + + if (good) + delete [] Count; + else + inc_and_fetch(Count); // try again. + + } // if + } // if + + return(good ? 0 : -1); + + } // static ReleaseRef - virtual std::string GetName() const { return filename_; } }; + +// matthewv July 17, 2012 ... 
riak was overlapping activity on the +// same database directory due to the incorrect assumption that the +// code below worked within the riak process space. The fix leads to a choice: +// fcntl() only locks against external processes, not multiple locks from +// same process. But it has worked great with NFS forever +// flock() locks against both external processes and multiple locks from +// same process. It does not with NFS until Linux 2.6.12 ... other OS may vary. +// SmartOS/Solaris do not appear to support flock() though there is a man page. +// Pick the fcntl() or flock() below as appropriate for your environment / needs. + static int LockOrUnlock(int fd, bool lock) { +#ifndef LOCK_UN + // works with NFS, but fails if same process attempts second access to + // db, i.e. makes second DB object to same directory errno = 0; struct flock f; memset(&f, 0, sizeof(f)); @@ -306,6 +512,10 @@ static int LockOrUnlock(int fd, bool lock) { f.l_start = 0; f.l_len = 0; // Lock/unlock entire file return fcntl(fd, F_SETLK, &f); +#else + // does NOT work with NFS, but DOES work within same process + return flock(fd, (lock ? LOCK_EX : LOCK_UN) | LOCK_NB); +#endif } class PosixFileLock : public FileLock { @@ -332,14 +542,12 @@ class PosixLockTable { } }; +static PosixLockTable gFileLocks; + class PosixEnv : public Env { public: PosixEnv(); - virtual ~PosixEnv() { - char msg[] = "Destroying Env::Default()\n"; - fwrite(msg, 1, sizeof(msg), stderr); - abort(); - } + virtual ~PosixEnv(); virtual Status NewSequentialFile(const std::string& fname, SequentialFile** result) { @@ -360,53 +568,84 @@ class PosixEnv : public Env { int fd = open(fname.c_str(), O_RDONLY); if (fd < 0) { s = IOError(fname, errno); - } else if (mmap_limit_.Acquire()) { +#if 0 + // going to let page cache tune the file + // system reads instead of hoping to better + // manage through memory mapped files. + } else if (sizeof(void*) >= 8) { + // Use mmap when virtual address-space is plentiful. 
uint64_t size; s = GetFileSize(fname, &size); if (s.ok()) { void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); if (base != MAP_FAILED) { - *result = new PosixMmapReadableFile(fname, base, size, &mmap_limit_); + *result = new PosixMmapReadableFile(fname, base, size, fd); } else { s = IOError(fname, errno); + close(fd); } } - close(fd); - if (!s.ok()) { - mmap_limit_.Release(); - } +#endif } else { - *result = new PosixRandomAccessFile(fname, fd, &fd_limit_); + *result = new PosixRandomAccessFile(fname, fd); } return s; } virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { + WritableFile** result, + size_t map_size) { Status s; - FILE* f = fopen(fname.c_str(), "w"); - if (f == NULL) { + const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + if (fd < 0) { *result = NULL; s = IOError(fname, errno); } else { - *result = new PosixWritableFile(fname, f); + *result = new PosixMmapFile(fname, fd, page_size_, 0, false, map_size); } return s; } virtual Status NewAppendableFile(const std::string& fname, - WritableFile** result) { + WritableFile** result, + size_t map_size) { Status s; - FILE* f = fopen(fname.c_str(), "a"); - if (f == NULL) { + const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644); + if (fd < 0) { + *result = NULL; + s = IOError(fname, errno); + } else + { + uint64_t size; + s = GetFileSize(fname, &size); + if (s.ok()) + { + *result = new PosixMmapFile(fname, fd, page_size_, size, false, map_size); + } // if + else + { + s = IOError(fname, errno); + close(fd); + } // else + } // else + return s; + } + + virtual Status NewWriteOnlyFile(const std::string& fname, + WritableFile** result, + size_t map_size) { + Status s; + const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + if (fd < 0) { *result = NULL; s = IOError(fname, errno); } else { - *result = new PosixWritableFile(fname, f); + *result = new PosixMmapFile(fname, fd, page_size_, 0, true, map_size); } return s; } + virtual bool 
FileExists(const std::string& fname) { return access(fname.c_str(), F_OK) == 0; } @@ -432,7 +671,7 @@ class PosixEnv : public Env { result = IOError(fname, errno); } return result; - } + }; virtual Status CreateDir(const std::string& name) { Status result; @@ -440,7 +679,7 @@ class PosixEnv : public Env { result = IOError(name, errno); } return result; - } + }; virtual Status DeleteDir(const std::string& name) { Status result; @@ -448,7 +687,7 @@ class PosixEnv : public Env { result = IOError(name, errno); } return result; - } + }; virtual Status GetFileSize(const std::string& fname, uint64_t* size) { Status s; @@ -476,17 +715,18 @@ class PosixEnv : public Env { int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); if (fd < 0) { result = IOError(fname, errno); - } else if (!locks_.Insert(fname)) { + } else if (!gFileLocks.Insert(fname)) { close(fd); result = Status::IOError("lock " + fname, "already held by process"); } else if (LockOrUnlock(fd, true) == -1) { result = IOError("lock " + fname, errno); close(fd); - locks_.Remove(fname); + gFileLocks.Remove(fname); } else { PosixFileLock* my_lock = new PosixFileLock; my_lock->fd_ = fd; my_lock->name_ = fname; + *lock = my_lock; } return result; @@ -498,15 +738,18 @@ class PosixEnv : public Env { if (LockOrUnlock(my_lock->fd_, false) == -1) { result = IOError("unlock", errno); } - locks_.Remove(my_lock->name_); + gFileLocks.Remove(my_lock->name_); close(my_lock->fd_); + + my_lock->fd_=-1; + delete my_lock; return result; } virtual void Schedule(void (*function)(void*), void* arg); - virtual void StartThread(void (*function)(void* arg), void* arg); + virtual pthread_t StartThread(void (*function)(void* arg), void* arg); virtual Status GetTestDirectory(std::string* result) { const char* env = getenv("TEST_TMPDIR"); @@ -541,122 +784,110 @@ class PosixEnv : public Env { } virtual uint64_t NowMicros() { +#if _POSIX_TIMERS >= 200801L + struct timespec ts; + + // this is rumored to be faster that gettimeofday(), + // and 
sometimes shift less ... someday use CLOCK_MONOTONIC_RAW + clock_gettime(CLOCK_MONOTONIC, &ts); + return static_cast(ts.tv_sec) * 1000000 + ts.tv_nsec/1000; +#else struct timeval tv; gettimeofday(&tv, NULL); return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#endif } virtual void SleepForMicroseconds(int micros) { - usleep(micros); - } + struct timespec ts; + int ret_val; + + if (0!=micros) + { + micros=(micros/clock_res_ +1)*clock_res_; + ts.tv_sec=micros/1000000; + ts.tv_nsec=(micros - ts.tv_sec*1000000) *1000; + + do + { +#if _POSIX_TIMERS >= 200801L + // later ... add test for CLOCK_MONOTONIC_RAW where supported (better) + ret_val=clock_nanosleep(CLOCK_MONOTONIC,0, &ts, &ts); +#else + ret_val=nanosleep(&ts, &ts); +#endif + } while(EINTR==ret_val && 0!=(ts.tv_sec+ts.tv_nsec)); + } // if + } // SleepForMicroSeconds + + + virtual size_t RecoveryMmapSize(const struct Options * options) const + { + size_t map_size; + + if (NULL!=options) + { + // large buffers, try for a little bit bigger than half hoping + // for two writes ... 
not three + if (10*1024*1024 < options->write_buffer_size) + map_size=(options->write_buffer_size/6)*4; + else + map_size=(options->write_buffer_size*12)/10; // integer multiply 1.2 + } // if + else + map_size=2*1024*1024L; + + return(map_size); + }; private: + void PthreadCall(const char* label, int result) { if (result != 0) { fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - abort(); + exit(1); } } - // BGThread() is the body of the background thread - void BGThread(); - static void* BGThreadWrapper(void* arg) { - reinterpret_cast(arg)->BGThread(); - return NULL; - } - + size_t page_size_; pthread_mutex_t mu_; pthread_cond_t bgsignal_; - pthread_t bgthread_; - bool started_bgthread_; + int64_t clock_res_; // Entry per Schedule() call - struct BGItem { void* arg; void (*function)(void*); }; - typedef std::deque BGQueue; - BGQueue queue_; + struct BGItem { void* arg; void (*function)(void*); int priority;}; - PosixLockTable locks_; - Limiter mmap_limit_; - Limiter fd_limit_; }; -// Return the maximum number of concurrent mmaps. -static int MaxMmaps() { - if (mmap_limit >= 0) { - return mmap_limit; - } - // Up to 4096 mmaps for 64-bit binaries; none for smaller pointer sizes. - mmap_limit = sizeof(void*) >= 8 ? 4096 : 0; - return mmap_limit; -} -// Return the maximum number of read-only files to keep open. -static intptr_t MaxOpenFiles() { - if (open_read_only_file_limit >= 0) { - return open_read_only_file_limit; - } - struct rlimit rlim; - if (getrlimit(RLIMIT_NOFILE, &rlim)) { - // getrlimit failed, fallback to hard-coded default. - open_read_only_file_limit = 50; - } else if (rlim.rlim_cur == RLIM_INFINITY) { - open_read_only_file_limit = std::numeric_limits::max(); - } else { - // Allow use of 20% of available file descriptors for read-only files. 
- open_read_only_file_limit = rlim.rlim_cur / 5; - } - return open_read_only_file_limit; -} +PosixEnv::PosixEnv() : page_size_(getpagesize()), + clock_res_(1) +{ + +#if _POSIX_TIMERS >= 200801L + struct timespec ts; + clock_getres(CLOCK_MONOTONIC, &ts); + clock_res_=ts.tv_sec*1000000+ts.tv_nsec/1000; + if (0==clock_res_) + ++clock_res_; +#endif -PosixEnv::PosixEnv() - : started_bgthread_(false), - mmap_limit_(MaxMmaps()), - fd_limit_(MaxOpenFiles()) { PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL)); PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL)); } + +PosixEnv::~PosixEnv() +{ +} // PosixEnf::~PosixEnv + void PosixEnv::Schedule(void (*function)(void*), void* arg) { - PthreadCall("lock", pthread_mutex_lock(&mu_)); + ThreadTask * task; - // Start background thread if necessary - if (!started_bgthread_) { - started_bgthread_ = true; - PthreadCall( - "create thread", - pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this)); - } - - // If the queue is currently empty, the background thread may currently be - // waiting. 
- if (queue_.empty()) { - PthreadCall("signal", pthread_cond_signal(&bgsignal_)); - } - - // Add to priority queue - queue_.push_back(BGItem()); - queue_.back().function = function; - queue_.back().arg = arg; - - PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + task=new LegacyTask(function,arg); + gCompactionThreads->Submit(task, true); } -void PosixEnv::BGThread() { - while (true) { - // Wait until there is an item that is ready to run - PthreadCall("lock", pthread_mutex_lock(&mu_)); - while (queue_.empty()) { - PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_)); - } - - void (*function)(void*) = queue_.front().function; - void* arg = queue_.front().arg; - queue_.pop_front(); - - PthreadCall("unlock", pthread_mutex_unlock(&mu_)); - (*function)(arg); - } -} namespace { struct StartThreadState { @@ -671,29 +902,185 @@ static void* StartThreadWrapper(void* arg) { return NULL; } -void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { +pthread_t PosixEnv::StartThread(void (*function)(void* arg), void* arg) { pthread_t t; StartThreadState* state = new StartThreadState; state->user_function = function; state->arg = arg; PthreadCall("start thread", pthread_create(&t, NULL, &StartThreadWrapper, state)); + + return(t); } + +// Called by BGFileUnmapper which manages retries +// this was a new file: unmap, hold in page cache +int +BGFileUnmapper(void * arg) +{ + BGCloseInfo * file_ptr; + bool err_flag; + int ret_val; + + // + // Reminder: this could get called multiple times for + // same "arg" due to error retry + // + + err_flag=false; + file_ptr=(BGCloseInfo *)arg; + + // non-null implies this is a background job, + // i.e. not on direct thread of compaction. 
+ if (NULL!=file_ptr->ref_count_) + gPerfCounters->Inc(ePerfBGCloseUnmap); + + if (NULL!=file_ptr->base_) + { + ret_val=munmap(file_ptr->base_, file_ptr->length_); + if (0==ret_val) + { + file_ptr->base_=NULL; + } // if + else + { + syslog(LOG_ERR,"BGFileUnmapper2 munmap failed [%d, %m]", errno); + err_flag=true; + } // else + } // if + +#if defined(HAVE_FADVISE) + if (0==file_ptr->metadata_ + || (file_ptr->offset_ + file_ptr->length_ < file_ptr->metadata_)) + { + // must fdatasync for DONTNEED to work + ret_val=fdatasync(file_ptr->fd_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileUnmapper2 fdatasync failed on %d [%d, %m]", file_ptr->fd_, errno); + err_flag=true; + } // if + + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileUnmapper2 posix_fadvise DONTNEED failed on %d [%d]", file_ptr->fd_, ret_val); + err_flag=true; + } // if + } // if + else + { + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileUnmapper2 posix_fadvise WILLNEED failed on %d [%d]", file_ptr->fd_, ret_val); + err_flag=true; + } // if + } // else +#endif + + // release access to file, maybe close it + if (!err_flag) + { + ret_val=PosixMmapFile::ReleaseRef(file_ptr->ref_count_, file_ptr->fd_); + err_flag=(0!=ret_val); + } // if + + if (err_flag) + gPerfCounters->Inc(ePerfBGWriteError); + + // routine called directly or via async thread, this + // controls when to delete file_ptr object + if (!err_flag) + { + gPerfCounters->Inc(ePerfRWFileUnmap); + file_ptr->RefDec(); + } // if + + return(err_flag ? 
-1 : 0); + +} // BGFileUnmapper + + +// Thread entry point, and retry loop +void BGFileUnmapper2(void * arg) +{ + int retries, ret_val; + + retries=0; + ret_val=0; + + do + { + if (1SleepForMicroseconds(100000); + + ret_val=BGFileUnmapper(arg); + ++retries; + } while(retries<3 && 0!=ret_val); + + // release object's memory here + if (0!=ret_val) + { + BGCloseInfo * file_ptr; + + file_ptr=(BGCloseInfo *)arg; + file_ptr->RefDec(); + } // if + + return; + +} // BGFileUnmapper2 + + + } // namespace +// how many blocks of 4 priority background threads/queues +/// for riak, make sure this is an odd number (and especially not 4) +#define THREAD_BLOCKS 1 + +static bool HasSSE4_2(); + static pthread_once_t once = PTHREAD_ONCE_INIT; static Env* default_env; -static void InitDefaultEnv() { default_env = new PosixEnv; } +static volatile bool started=false; +static void InitDefaultEnv() +{ + default_env=new PosixEnv; -void EnvPosixTestHelper::SetReadOnlyFDLimit(int limit) { - assert(default_env == NULL); - open_read_only_file_limit = limit; -} + ThrottleInit(); -void EnvPosixTestHelper::SetReadOnlyMMapLimit(int limit) { - assert(default_env == NULL); - mmap_limit = limit; + // force the loading of code for both filters in case they + // are hidden in a shared library + const FilterPolicy * ptr; + ptr=NewBloomFilterPolicy(16); + delete ptr; + ptr=NewBloomFilterPolicy2(16); + delete ptr; + + if (HasSSE4_2()) + crc32c::SwitchToHardwareCRC(); + + PerformanceCounters::Init(false); + + gImmThreads=new HotThreadPool(5, "ImmWrite", + ePerfBGImmDirect, ePerfBGImmQueued, + ePerfBGImmDequeued, ePerfBGImmWeighted); + gWriteThreads=new HotThreadPool(3, "RecoveryWrite", + ePerfBGUnmapDirect, ePerfBGUnmapQueued, + ePerfBGUnmapDequeued, ePerfBGUnmapWeighted); + gLevel0Threads=new HotThreadPool(3, "Level0Compact", + ePerfBGLevel0Direct, ePerfBGLevel0Queued, + ePerfBGLevel0Dequeued, ePerfBGLevel0Weighted); + // "2" is for Linux OS "nice", assumption is "1" nice might be + // used by AAE hash 
trees in the future + gCompactionThreads=new HotThreadPool(3, "GeneralCompact", + ePerfBGCompactDirect, ePerfBGCompactQueued, + ePerfBGCompactDequeued, ePerfBGCompactWeighted, 2); + + started=true; } Env* Env::Default() { @@ -701,6 +1088,73 @@ Env* Env::Default() { return default_env; } -} // namespace leveldb +void Env::Shutdown() +{ + if (started) + { + // prevent throttle from initiating new compactions + ThrottleStopThreads(); + } // if + + DBListShutdown(); + + delete gImmThreads; + gImmThreads=NULL; + + delete gWriteThreads; + gWriteThreads=NULL; + + delete gLevel0Threads; + gLevel0Threads=NULL; + + delete gCompactionThreads; + gCompactionThreads=NULL; + + if (started) + { + // release throttle globals now that + // background compaction threads done + ThrottleClose(); + + delete default_env; + default_env=NULL; + } // if + + ExpiryModule::ShutdownExpiryModule(); + + // wait until compaction threads complete before + // releasing comparator object (else segfault possible) + ComparatorShutdown(); + + PerformanceCounters::Close(gPerfCounters); + +} // Env::Shutdown + + +static bool +HasSSE4_2() +{ +#if defined(__x86_64__) + uint64_t ecx; + ecx=0; + + __asm__ __volatile__ + ("mov %%rbx, %%rdi\n\t" /* 32bit PIC: don't clobber ebx */ + "mov $1,%%rax\n\t" + "cpuid\n\t" + "mov %%rdi, %%rbx\n\t" + : "=c" (ecx) + : + : "%rax", "%rbx", "%rdx", "%rdi" ); + + return( 0 != (ecx & 1<<20)); +#else + return(false); #endif + +} // HasSSE4_2 + + + +} // namespace leveldb diff --git a/src/leveldb/util/env_posix_test.cc b/src/leveldb/util/env_posix_test.cc deleted file mode 100644 index 295f8ae44..000000000 --- a/src/leveldb/util/env_posix_test.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "leveldb/env.h" - -#include "port/port.h" -#include "util/testharness.h" -#include "util/env_posix_test_helper.h" - -namespace leveldb { - -static const int kDelayMicros = 100000; -static const int kReadOnlyFileLimit = 4; -static const int kMMapLimit = 4; - -class EnvPosixTest { - public: - Env* env_; - EnvPosixTest() : env_(Env::Default()) { } - - static void SetFileLimits(int read_only_file_limit, int mmap_limit) { - EnvPosixTestHelper::SetReadOnlyFDLimit(read_only_file_limit); - EnvPosixTestHelper::SetReadOnlyMMapLimit(mmap_limit); - } -}; - -TEST(EnvPosixTest, TestOpenOnRead) { - // Write some test data to a single file that will be opened |n| times. - std::string test_dir; - ASSERT_OK(env_->GetTestDirectory(&test_dir)); - std::string test_file = test_dir + "/open_on_read.txt"; - - FILE* f = fopen(test_file.c_str(), "w"); - ASSERT_TRUE(f != NULL); - const char kFileData[] = "abcdefghijklmnopqrstuvwxyz"; - fputs(kFileData, f); - fclose(f); - - // Open test file some number above the sum of the two limits to force - // open-on-read behavior of POSIX Env leveldb::RandomAccessFile. - const int kNumFiles = kReadOnlyFileLimit + kMMapLimit + 5; - leveldb::RandomAccessFile* files[kNumFiles] = {0}; - for (int i = 0; i < kNumFiles; i++) { - ASSERT_OK(env_->NewRandomAccessFile(test_file, &files[i])); - } - char scratch; - Slice read_result; - for (int i = 0; i < kNumFiles; i++) { - ASSERT_OK(files[i]->Read(i, 1, &read_result, &scratch)); - ASSERT_EQ(kFileData[i], read_result[0]); - } - for (int i = 0; i < kNumFiles; i++) { - delete files[i]; - } - ASSERT_OK(env_->DeleteFile(test_file)); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - // All tests currently run with the same read-only file limits. 
- leveldb::EnvPosixTest::SetFileLimits(leveldb::kReadOnlyFileLimit, - leveldb::kMMapLimit); - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/util/env_posix_test_helper.h b/src/leveldb/util/env_posix_test_helper.h deleted file mode 100644 index 038696059..000000000 --- a/src/leveldb/util/env_posix_test_helper.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2017 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_ -#define STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_ - -namespace leveldb { - -class EnvPosixTest; - -// A helper for the POSIX Env to facilitate testing. -class EnvPosixTestHelper { - private: - friend class EnvPosixTest; - - // Set the maximum number of read-only files that will be opened. - // Must be called before creating an Env. - static void SetReadOnlyFDLimit(int limit); - - // Set the maximum number of read-only files that will be mapped via mmap. - // Must be called before creating an Env. 
- static void SetReadOnlyMMapLimit(int limit); -}; - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_ diff --git a/src/leveldb/util/env_test.cc b/src/leveldb/util/env_test.cc index 839ae56a1..8091103cd 100644 --- a/src/leveldb/util/env_test.cc +++ b/src/leveldb/util/env_test.cc @@ -10,31 +10,30 @@ namespace leveldb { static const int kDelayMicros = 100000; -static const int kReadOnlyFileLimit = 4; -static const int kMMapLimit = 4; -class EnvTest { +class EnvPosixTest { private: port::Mutex mu_; std::string events_; public: Env* env_; - EnvTest() : env_(Env::Default()) { } + EnvPosixTest() : env_(Env::Default()) { } }; static void SetBool(void* ptr) { reinterpret_cast(ptr)->NoBarrier_Store(ptr); } -TEST(EnvTest, RunImmediately) { +TEST(EnvPosixTest, RunImmediately) { port::AtomicPointer called (NULL); env_->Schedule(&SetBool, &called); - env_->SleepForMicroseconds(kDelayMicros); + Env::Default()->SleepForMicroseconds(kDelayMicros); ASSERT_TRUE(called.NoBarrier_Load() != NULL); } -TEST(EnvTest, RunMany) { +#if 0 // test assumes single thread and queue. 
No long valid assumption +TEST(EnvPosixTest, RunMany) { port::AtomicPointer last_id (NULL); struct CB { @@ -61,10 +60,11 @@ TEST(EnvTest, RunMany) { env_->Schedule(&CB::Run, &cb3); env_->Schedule(&CB::Run, &cb4); - env_->SleepForMicroseconds(kDelayMicros); + Env::Default()->SleepForMicroseconds(kDelayMicros); void* cur = last_id.Acquire_Load(); ASSERT_EQ(4, reinterpret_cast(cur)); } +#endif struct State { port::Mutex mu; @@ -80,12 +80,14 @@ static void ThreadBody(void* arg) { s->mu.Unlock(); } -TEST(EnvTest, StartThread) { +TEST(EnvPosixTest, StartThread) { State state; + pthread_t pid; state.val = 0; state.num_running = 3; for (int i = 0; i < 3; i++) { - env_->StartThread(&ThreadBody, &state); + pid=env_->StartThread(&ThreadBody, &state); + pthread_detach(pid); } while (true) { state.mu.Lock(); @@ -94,7 +96,7 @@ TEST(EnvTest, StartThread) { if (num == 0) { break; } - env_->SleepForMicroseconds(kDelayMicros); + Env::Default()->SleepForMicroseconds(kDelayMicros); } ASSERT_EQ(state.val, 3); } diff --git a/src/leveldb/util/env_win.cc b/src/leveldb/util/env_win.cc deleted file mode 100644 index 81380216b..000000000 --- a/src/leveldb/util/env_win.cc +++ /dev/null @@ -1,901 +0,0 @@ -// This file contains source that originates from: -// http://code.google.com/p/leveldbwin/source/browse/trunk/win32_impl_src/env_win32.h -// http://code.google.com/p/leveldbwin/source/browse/trunk/win32_impl_src/port_win32.cc -// Those files don't have any explicit license headers but the -// project (http://code.google.com/p/leveldbwin/) lists the 'New BSD License' -// as the license. 
-#if defined(LEVELDB_PLATFORM_WINDOWS) -#include - - -#include "leveldb/env.h" - -#include "port/port.h" -#include "leveldb/slice.h" -#include "util/logging.h" - -#include -#include -#include -#include -#include -#include -#include - -#ifdef max -#undef max -#endif - -#ifndef va_copy -#define va_copy(d,s) ((d) = (s)) -#endif - -#if defined DeleteFile -#undef DeleteFile -#endif - -//Declarations -namespace leveldb -{ - -namespace Win32 -{ - -#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&); \ - void operator=(const TypeName&) - -std::string GetCurrentDir(); -std::wstring GetCurrentDirW(); - -static const std::string CurrentDir = GetCurrentDir(); -static const std::wstring CurrentDirW = GetCurrentDirW(); - -std::string& ModifyPath(std::string& path); -std::wstring& ModifyPath(std::wstring& path); - -std::string GetLastErrSz(); -std::wstring GetLastErrSzW(); - -size_t GetPageSize(); - -typedef void (*ScheduleProc)(void*) ; - -struct WorkItemWrapper -{ - WorkItemWrapper(ScheduleProc proc_,void* content_); - ScheduleProc proc; - void* pContent; -}; - -DWORD WINAPI WorkItemWrapperProc(LPVOID pContent); - -class Win32SequentialFile : public SequentialFile -{ -public: - friend class Win32Env; - virtual ~Win32SequentialFile(); - virtual Status Read(size_t n, Slice* result, char* scratch); - virtual Status Skip(uint64_t n); - BOOL isEnable(); - virtual std::string GetName() const { return _filename; } -private: - BOOL _Init(); - void _CleanUp(); - Win32SequentialFile(const std::string& fname); - std::string _filename; - ::HANDLE _hFile; - DISALLOW_COPY_AND_ASSIGN(Win32SequentialFile); -}; - -class Win32RandomAccessFile : public RandomAccessFile -{ -public: - friend class Win32Env; - virtual ~Win32RandomAccessFile(); - virtual Status Read(uint64_t offset, size_t n, Slice* result,char* scratch) const; - BOOL isEnable(); - virtual std::string GetName() const { return _filename; } -private: - BOOL _Init(LPCWSTR path); - void _CleanUp(); - 
Win32RandomAccessFile(const std::string& fname); - HANDLE _hFile; - const std::string _filename; - DISALLOW_COPY_AND_ASSIGN(Win32RandomAccessFile); -}; - -class Win32WritableFile : public WritableFile -{ -public: - Win32WritableFile(const std::string& fname, bool append); - ~Win32WritableFile(); - - virtual Status Append(const Slice& data); - virtual Status Close(); - virtual Status Flush(); - virtual Status Sync(); - BOOL isEnable(); - virtual std::string GetName() const { return filename_; } -private: - std::string filename_; - ::HANDLE _hFile; -}; - -class Win32FileLock : public FileLock -{ -public: - friend class Win32Env; - virtual ~Win32FileLock(); - BOOL isEnable(); -private: - BOOL _Init(LPCWSTR path); - void _CleanUp(); - Win32FileLock(const std::string& fname); - HANDLE _hFile; - std::string _filename; - DISALLOW_COPY_AND_ASSIGN(Win32FileLock); -}; - -class Win32Logger : public Logger -{ -public: - friend class Win32Env; - virtual ~Win32Logger(); - virtual void Logv(const char* format, va_list ap); -private: - explicit Win32Logger(WritableFile* pFile); - WritableFile* _pFileProxy; - DISALLOW_COPY_AND_ASSIGN(Win32Logger); -}; - -class Win32Env : public Env -{ -public: - Win32Env(); - virtual ~Win32Env(); - virtual Status NewSequentialFile(const std::string& fname, - SequentialFile** result); - - virtual Status NewRandomAccessFile(const std::string& fname, - RandomAccessFile** result); - virtual Status NewWritableFile(const std::string& fname, - WritableFile** result); - virtual Status NewAppendableFile(const std::string& fname, - WritableFile** result); - - virtual bool FileExists(const std::string& fname); - - virtual Status GetChildren(const std::string& dir, - std::vector* result); - - virtual Status DeleteFile(const std::string& fname); - - virtual Status CreateDir(const std::string& dirname); - - virtual Status DeleteDir(const std::string& dirname); - - virtual Status GetFileSize(const std::string& fname, uint64_t* file_size); - - virtual Status 
RenameFile(const std::string& src, - const std::string& target); - - virtual Status LockFile(const std::string& fname, FileLock** lock); - - virtual Status UnlockFile(FileLock* lock); - - virtual void Schedule( - void (*function)(void* arg), - void* arg); - - virtual void StartThread(void (*function)(void* arg), void* arg); - - virtual Status GetTestDirectory(std::string* path); - - //virtual void Logv(WritableFile* log, const char* format, va_list ap); - - virtual Status NewLogger(const std::string& fname, Logger** result); - - virtual uint64_t NowMicros(); - - virtual void SleepForMicroseconds(int micros); -}; - -void ToWidePath(const std::string& value, std::wstring& target) { - wchar_t buffer[MAX_PATH]; - MultiByteToWideChar(CP_ACP, 0, value.c_str(), -1, buffer, MAX_PATH); - target = buffer; -} - -void ToNarrowPath(const std::wstring& value, std::string& target) { - char buffer[MAX_PATH]; - WideCharToMultiByte(CP_ACP, 0, value.c_str(), -1, buffer, MAX_PATH, NULL, NULL); - target = buffer; -} - -std::string GetCurrentDir() -{ - CHAR path[MAX_PATH]; - ::GetModuleFileNameA(::GetModuleHandleA(NULL),path,MAX_PATH); - *strrchr(path,'\\') = 0; - return std::string(path); -} - -std::wstring GetCurrentDirW() -{ - WCHAR path[MAX_PATH]; - ::GetModuleFileNameW(::GetModuleHandleW(NULL),path,MAX_PATH); - *wcsrchr(path,L'\\') = 0; - return std::wstring(path); -} - -std::string& ModifyPath(std::string& path) -{ - if(path[0] == '/' || path[0] == '\\'){ - path = CurrentDir + path; - } - std::replace(path.begin(),path.end(),'/','\\'); - - return path; -} - -std::wstring& ModifyPath(std::wstring& path) -{ - if(path[0] == L'/' || path[0] == L'\\'){ - path = CurrentDirW + path; - } - std::replace(path.begin(),path.end(),L'/',L'\\'); - return path; -} - -std::string GetLastErrSz() -{ - LPWSTR lpMsgBuf; - FormatMessageW( - FORMAT_MESSAGE_ALLOCATE_BUFFER | - FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, - GetLastError(), - 0, // Default language - (LPWSTR) 
&lpMsgBuf, - 0, - NULL - ); - std::string Err; - ToNarrowPath(lpMsgBuf, Err); - LocalFree( lpMsgBuf ); - return Err; -} - -std::wstring GetLastErrSzW() -{ - LPVOID lpMsgBuf; - FormatMessageW( - FORMAT_MESSAGE_ALLOCATE_BUFFER | - FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, - GetLastError(), - 0, // Default language - (LPWSTR) &lpMsgBuf, - 0, - NULL - ); - std::wstring Err = (LPCWSTR)lpMsgBuf; - LocalFree(lpMsgBuf); - return Err; -} - -WorkItemWrapper::WorkItemWrapper( ScheduleProc proc_,void* content_ ) : - proc(proc_),pContent(content_) -{ - -} - -DWORD WINAPI WorkItemWrapperProc(LPVOID pContent) -{ - WorkItemWrapper* item = static_cast(pContent); - ScheduleProc TempProc = item->proc; - void* arg = item->pContent; - delete item; - TempProc(arg); - return 0; -} - -size_t GetPageSize() -{ - SYSTEM_INFO si; - GetSystemInfo(&si); - return std::max(si.dwPageSize,si.dwAllocationGranularity); -} - -const size_t g_PageSize = GetPageSize(); - - -Win32SequentialFile::Win32SequentialFile( const std::string& fname ) : - _filename(fname),_hFile(NULL) -{ - _Init(); -} - -Win32SequentialFile::~Win32SequentialFile() -{ - _CleanUp(); -} - -Status Win32SequentialFile::Read( size_t n, Slice* result, char* scratch ) -{ - Status sRet; - DWORD hasRead = 0; - if(_hFile && ReadFile(_hFile,scratch,n,&hasRead,NULL) ){ - *result = Slice(scratch,hasRead); - } else { - sRet = Status::IOError(_filename, Win32::GetLastErrSz() ); - } - return sRet; -} - -Status Win32SequentialFile::Skip( uint64_t n ) -{ - Status sRet; - LARGE_INTEGER Move,NowPointer; - Move.QuadPart = n; - if(!SetFilePointerEx(_hFile,Move,&NowPointer,FILE_CURRENT)){ - sRet = Status::IOError(_filename,Win32::GetLastErrSz()); - } - return sRet; -} - -BOOL Win32SequentialFile::isEnable() -{ - return _hFile ? 
TRUE : FALSE; -} - -BOOL Win32SequentialFile::_Init() -{ - std::wstring path; - ToWidePath(_filename, path); - _hFile = CreateFileW(path.c_str(), - GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, - OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN, - NULL); - if (_hFile == INVALID_HANDLE_VALUE) - _hFile = NULL; - return _hFile ? TRUE : FALSE; -} - -void Win32SequentialFile::_CleanUp() -{ - if(_hFile){ - CloseHandle(_hFile); - _hFile = NULL; - } -} - -Win32RandomAccessFile::Win32RandomAccessFile( const std::string& fname ) : - _filename(fname),_hFile(NULL) -{ - std::wstring path; - ToWidePath(fname, path); - _Init( path.c_str() ); -} - -Win32RandomAccessFile::~Win32RandomAccessFile() -{ - _CleanUp(); -} - -Status Win32RandomAccessFile::Read(uint64_t offset,size_t n,Slice* result,char* scratch) const -{ - Status sRet; - OVERLAPPED ol = {0}; - ZeroMemory(&ol,sizeof(ol)); - ol.Offset = (DWORD)offset; - ol.OffsetHigh = (DWORD)(offset >> 32); - DWORD hasRead = 0; - if(!ReadFile(_hFile,scratch,n,&hasRead,&ol)) - sRet = Status::IOError(_filename,Win32::GetLastErrSz()); - else - *result = Slice(scratch,hasRead); - return sRet; -} - -BOOL Win32RandomAccessFile::_Init( LPCWSTR path ) -{ - BOOL bRet = FALSE; - if(!_hFile) - _hFile = ::CreateFileW(path,GENERIC_READ,FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL | FILE_FLAG_RANDOM_ACCESS,NULL); - if(!_hFile || _hFile == INVALID_HANDLE_VALUE ) - _hFile = NULL; - else - bRet = TRUE; - return bRet; -} - -BOOL Win32RandomAccessFile::isEnable() -{ - return _hFile ? 
TRUE : FALSE; -} - -void Win32RandomAccessFile::_CleanUp() -{ - if(_hFile){ - ::CloseHandle(_hFile); - _hFile = NULL; - } -} - -Win32WritableFile::Win32WritableFile(const std::string& fname, bool append) - : filename_(fname) -{ - std::wstring path; - ToWidePath(fname, path); - // NewAppendableFile: append to an existing file, or create a new one - // if none exists - this is OPEN_ALWAYS behavior, with - // FILE_APPEND_DATA to avoid having to manually position the file - // pointer at the end of the file. - // NewWritableFile: create a new file, delete if it exists - this is - // CREATE_ALWAYS behavior. This file is used for writing only so - // use GENERIC_WRITE. - _hFile = CreateFileW(path.c_str(), - append ? FILE_APPEND_DATA : GENERIC_WRITE, - FILE_SHARE_READ|FILE_SHARE_DELETE|FILE_SHARE_WRITE, - NULL, - append ? OPEN_ALWAYS : CREATE_ALWAYS, - FILE_ATTRIBUTE_NORMAL, - NULL); - // CreateFileW returns INVALID_HANDLE_VALUE in case of error, always check isEnable() before use -} - -Win32WritableFile::~Win32WritableFile() -{ - if (_hFile != INVALID_HANDLE_VALUE) - Close(); -} - -Status Win32WritableFile::Append(const Slice& data) -{ - DWORD r = 0; - if (!WriteFile(_hFile, data.data(), data.size(), &r, NULL) || r != data.size()) { - return Status::IOError("Win32WritableFile.Append::WriteFile: "+filename_, Win32::GetLastErrSz()); - } - return Status::OK(); -} - -Status Win32WritableFile::Close() -{ - if (!CloseHandle(_hFile)) { - return Status::IOError("Win32WritableFile.Close::CloseHandle: "+filename_, Win32::GetLastErrSz()); - } - _hFile = INVALID_HANDLE_VALUE; - return Status::OK(); -} - -Status Win32WritableFile::Flush() -{ - // Nothing to do here, there are no application-side buffers - return Status::OK(); -} - -Status Win32WritableFile::Sync() -{ - if (!FlushFileBuffers(_hFile)) { - return Status::IOError("Win32WritableFile.Sync::FlushFileBuffers "+filename_, Win32::GetLastErrSz()); - } - return Status::OK(); -} - -BOOL Win32WritableFile::isEnable() -{ - return 
_hFile != INVALID_HANDLE_VALUE; -} - -Win32FileLock::Win32FileLock( const std::string& fname ) : - _hFile(NULL),_filename(fname) -{ - std::wstring path; - ToWidePath(fname, path); - _Init(path.c_str()); -} - -Win32FileLock::~Win32FileLock() -{ - _CleanUp(); -} - -BOOL Win32FileLock::_Init( LPCWSTR path ) -{ - BOOL bRet = FALSE; - if(!_hFile) - _hFile = ::CreateFileW(path,0,0,NULL,CREATE_ALWAYS,FILE_ATTRIBUTE_NORMAL,NULL); - if(!_hFile || _hFile == INVALID_HANDLE_VALUE ){ - _hFile = NULL; - } - else - bRet = TRUE; - return bRet; -} - -void Win32FileLock::_CleanUp() -{ - ::CloseHandle(_hFile); - _hFile = NULL; -} - -BOOL Win32FileLock::isEnable() -{ - return _hFile ? TRUE : FALSE; -} - -Win32Logger::Win32Logger(WritableFile* pFile) : _pFileProxy(pFile) -{ - assert(_pFileProxy); -} - -Win32Logger::~Win32Logger() -{ - if(_pFileProxy) - delete _pFileProxy; -} - -void Win32Logger::Logv( const char* format, va_list ap ) -{ - uint64_t thread_id = ::GetCurrentThreadId(); - - // We try twice: the first time with a fixed-size stack allocated buffer, - // and the second time with a much larger dynamically allocated buffer. 
- char buffer[500]; - for (int iter = 0; iter < 2; iter++) { - char* base; - int bufsize; - if (iter == 0) { - bufsize = sizeof(buffer); - base = buffer; - } else { - bufsize = 30000; - base = new char[bufsize]; - } - char* p = base; - char* limit = base + bufsize; - - SYSTEMTIME st; - GetLocalTime(&st); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", - int(st.wYear), - int(st.wMonth), - int(st.wDay), - int(st.wHour), - int(st.wMinute), - int(st.wMinute), - int(st.wMilliseconds), - static_cast(thread_id)); - - // Print the message - if (p < limit) { - va_list backup_ap; - va_copy(backup_ap, ap); - p += vsnprintf(p, limit - p, format, backup_ap); - va_end(backup_ap); - } - - // Truncate to available space if necessary - if (p >= limit) { - if (iter == 0) { - continue; // Try again with larger buffer - } else { - p = limit - 1; - } - } - - // Add newline if necessary - if (p == base || p[-1] != '\n') { - *p++ = '\n'; - } - - assert(p <= limit); - DWORD hasWritten = 0; - if(_pFileProxy){ - _pFileProxy->Append(Slice(base, p - base)); - _pFileProxy->Flush(); - } - if (base != buffer) { - delete[] base; - } - break; - } -} - -bool Win32Env::FileExists(const std::string& fname) -{ - std::string path = fname; - std::wstring wpath; - ToWidePath(ModifyPath(path), wpath); - return ::PathFileExistsW(wpath.c_str()) ? true : false; -} - -Status Win32Env::GetChildren(const std::string& dir, std::vector* result) -{ - Status sRet; - ::WIN32_FIND_DATAW wfd; - std::string path = dir; - ModifyPath(path); - path += "\\*.*"; - std::wstring wpath; - ToWidePath(path, wpath); - - ::HANDLE hFind = ::FindFirstFileW(wpath.c_str() ,&wfd); - if(hFind && hFind != INVALID_HANDLE_VALUE){ - BOOL hasNext = TRUE; - std::string child; - while(hasNext){ - ToNarrowPath(wfd.cFileName, child); - if(child != ".." 
&& child != ".") { - result->push_back(child); - } - hasNext = ::FindNextFileW(hFind,&wfd); - } - ::FindClose(hFind); - } - else - sRet = Status::IOError(dir,"Could not get children."); - return sRet; -} - -void Win32Env::SleepForMicroseconds( int micros ) -{ - ::Sleep((micros + 999) /1000); -} - - -Status Win32Env::DeleteFile( const std::string& fname ) -{ - Status sRet; - std::string path = fname; - std::wstring wpath; - ToWidePath(ModifyPath(path), wpath); - - if(!::DeleteFileW(wpath.c_str())) { - sRet = Status::IOError(path, "Could not delete file."); - } - return sRet; -} - -Status Win32Env::GetFileSize( const std::string& fname, uint64_t* file_size ) -{ - Status sRet; - std::string path = fname; - std::wstring wpath; - ToWidePath(ModifyPath(path), wpath); - - HANDLE file = ::CreateFileW(wpath.c_str(), - GENERIC_READ,FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,NULL); - LARGE_INTEGER li; - if(::GetFileSizeEx(file,&li)){ - *file_size = (uint64_t)li.QuadPart; - }else - sRet = Status::IOError(path,"Could not get the file size."); - CloseHandle(file); - return sRet; -} - -Status Win32Env::RenameFile( const std::string& src, const std::string& target ) -{ - Status sRet; - std::string src_path = src; - std::wstring wsrc_path; - ToWidePath(ModifyPath(src_path), wsrc_path); - std::string target_path = target; - std::wstring wtarget_path; - ToWidePath(ModifyPath(target_path), wtarget_path); - - if(!MoveFileW(wsrc_path.c_str(), wtarget_path.c_str() ) ){ - DWORD err = GetLastError(); - if(err == 0x000000b7){ - if(!::DeleteFileW(wtarget_path.c_str() ) ) - sRet = Status::IOError(src, "Could not rename file."); - else if(!::MoveFileW(wsrc_path.c_str(), - wtarget_path.c_str() ) ) - sRet = Status::IOError(src, "Could not rename file."); - } - } - return sRet; -} - -Status Win32Env::LockFile( const std::string& fname, FileLock** lock ) -{ - Status sRet; - std::string path = fname; - ModifyPath(path); - Win32FileLock* _lock = new 
Win32FileLock(path); - if(!_lock->isEnable()){ - delete _lock; - *lock = NULL; - sRet = Status::IOError(path, "Could not lock file."); - } - else - *lock = _lock; - return sRet; -} - -Status Win32Env::UnlockFile( FileLock* lock ) -{ - Status sRet; - delete lock; - return sRet; -} - -void Win32Env::Schedule( void (*function)(void* arg), void* arg ) -{ - QueueUserWorkItem(Win32::WorkItemWrapperProc, - new Win32::WorkItemWrapper(function,arg), - WT_EXECUTEDEFAULT); -} - -void Win32Env::StartThread( void (*function)(void* arg), void* arg ) -{ - ::_beginthread(function,0,arg); -} - -Status Win32Env::GetTestDirectory( std::string* path ) -{ - Status sRet; - WCHAR TempPath[MAX_PATH]; - ::GetTempPathW(MAX_PATH,TempPath); - ToNarrowPath(TempPath, *path); - path->append("leveldb\\test\\"); - ModifyPath(*path); - return sRet; -} - -uint64_t Win32Env::NowMicros() -{ -#ifndef USE_VISTA_API -#define GetTickCount64 GetTickCount -#endif - return (uint64_t)(GetTickCount64()*1000); -} - -static Status CreateDirInner( const std::string& dirname ) -{ - Status sRet; - DWORD attr = ::GetFileAttributes(dirname.c_str()); - if (attr == INVALID_FILE_ATTRIBUTES) { // doesn't exist: - std::size_t slash = dirname.find_last_of("\\"); - if (slash != std::string::npos){ - sRet = CreateDirInner(dirname.substr(0, slash)); - if (!sRet.ok()) return sRet; - } - BOOL result = ::CreateDirectory(dirname.c_str(), NULL); - if (result == FALSE) { - sRet = Status::IOError(dirname, "Could not create directory."); - return sRet; - } - } - return sRet; -} - -Status Win32Env::CreateDir( const std::string& dirname ) -{ - std::string path = dirname; - if(path[path.length() - 1] != '\\'){ - path += '\\'; - } - ModifyPath(path); - - return CreateDirInner(path); -} - -Status Win32Env::DeleteDir( const std::string& dirname ) -{ - Status sRet; - std::wstring path; - ToWidePath(dirname, path); - ModifyPath(path); - if(!::RemoveDirectoryW( path.c_str() ) ){ - sRet = Status::IOError(dirname, "Could not delete 
directory."); - } - return sRet; -} - -Status Win32Env::NewSequentialFile( const std::string& fname, SequentialFile** result ) -{ - Status sRet; - std::string path = fname; - ModifyPath(path); - Win32SequentialFile* pFile = new Win32SequentialFile(path); - if(pFile->isEnable()){ - *result = pFile; - }else { - delete pFile; - sRet = Status::IOError(path, Win32::GetLastErrSz()); - } - return sRet; -} - -Status Win32Env::NewRandomAccessFile( const std::string& fname, RandomAccessFile** result ) -{ - Status sRet; - std::string path = fname; - Win32RandomAccessFile* pFile = new Win32RandomAccessFile(ModifyPath(path)); - if(!pFile->isEnable()){ - delete pFile; - *result = NULL; - sRet = Status::IOError(path, Win32::GetLastErrSz()); - }else - *result = pFile; - return sRet; -} - -Status Win32Env::NewLogger( const std::string& fname, Logger** result ) -{ - Status sRet; - std::string path = fname; - // Logs are opened with write semantics, not with append semantics - // (see PosixEnv::NewLogger) - Win32WritableFile* pMapFile = new Win32WritableFile(ModifyPath(path), false); - if(!pMapFile->isEnable()){ - delete pMapFile; - *result = NULL; - sRet = Status::IOError(path,"could not create a logger."); - }else - *result = new Win32Logger(pMapFile); - return sRet; -} - -Status Win32Env::NewWritableFile( const std::string& fname, WritableFile** result ) -{ - Status sRet; - std::string path = fname; - Win32WritableFile* pFile = new Win32WritableFile(ModifyPath(path), false); - if(!pFile->isEnable()){ - *result = NULL; - sRet = Status::IOError(fname,Win32::GetLastErrSz()); - }else - *result = pFile; - return sRet; -} - -Status Win32Env::NewAppendableFile( const std::string& fname, WritableFile** result ) -{ - Status sRet; - std::string path = fname; - Win32WritableFile* pFile = new Win32WritableFile(ModifyPath(path), true); - if(!pFile->isEnable()){ - *result = NULL; - sRet = Status::IOError(fname,Win32::GetLastErrSz()); - }else - *result = pFile; - return sRet; -} - 
-Win32Env::Win32Env() -{ - -} - -Win32Env::~Win32Env() -{ - -} - - -} // Win32 namespace - -static port::OnceType once = LEVELDB_ONCE_INIT; -static Env* default_env; -static void InitDefaultEnv() { default_env = new Win32::Win32Env(); } - -Env* Env::Default() { - port::InitOnce(&once, InitDefaultEnv); - return default_env; -} - -} // namespace leveldb - -#endif // defined(LEVELDB_PLATFORM_WINDOWS) diff --git a/src/leveldb/util/expiry_os.cc b/src/leveldb/util/expiry_os.cc new file mode 100644 index 000000000..57aadcac3 --- /dev/null +++ b/src/leveldb/util/expiry_os.cc @@ -0,0 +1,408 @@ +// ------------------------------------------------------------------- +// +// expiry_os.cc +// +// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// ------------------------------------------------------------------- + +#define __STDC_FORMAT_MACROS +#include +#include + +#include "leveldb/perf_count.h" +#include "leveldb/env.h" +#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/version_set.h" +#include "util/expiry_os.h" +#include "util/logging.h" +#include "util/throttle.h" + +namespace leveldb { + +// sext key for Riak's meta data +static const char * lRiakMetaDataKey= + {"\x10\x00\x00\x00\x02\x0c\xb6\xd9\x00\x08"}; +static const size_t lRiakMetaDataKeyLen=10; + +/** + * settings information that gets dumped to LOG upon + * leveldb start + */ +void +ExpiryModuleOS::Dump( + Logger * log) const +{ + Log(log," ExpiryModuleOS.expiry_enabled: %s", IsExpiryEnabled() ? "true" : "false"); + Log(log," ExpiryModuleOS.expiry_minutes: %" PRIu64, GetExpiryMinutes()); + Log(log,"ExpiryModuleOS.expiry_unlimited: %s", IsExpiryUnlimited() ? "true" : "false"); + Log(log," ExpiryModuleOS.whole_files: %s", IsWholeFileExpiryEnabled() ? "true" : "false"); + + return; + +} // ExpiryModuleOS::Dump + + +/** + * db/write_batch.cc MemTableInserter() uses this to initialize + * expiry info. + */ +bool +ExpiryModuleOS::MemTableInserterCallback( + const Slice & Key, // input: user's key about to be written + const Slice & Value, // input: user's value object + ValueType & ValType, // input/output: key type. call might change + ExpiryTimeMicros & Expiry) // input/output: 0 or specific expiry. 
call might change + const +{ + bool good(true); + + // only update the expiry time if explicit type + // without expiry, OR ExpiryMinutes set and not internal key + if ((kTypeValueWriteTime==ValType && 0==Expiry) + || (kTypeValue==ValType + && (0!=GetExpiryMinutes() || IsExpiryUnlimited()) + && IsExpiryEnabled() + && (Key.size() & files(Ver.GetFileList(Level)); + std::vector::const_iterator it; + + now_micros=GetCachedTimeMicros(); + for (it=files.begin(); (!expired_file || WantAll) && files.end()!=it; ++it) + { + // First, is file eligible? + expired_file=IsFileExpired(*(*it), now_micros); + + // identified an expired file, do any higher levels overlap + // its key range? + if (expired_file) + { + int test; + Slice small, large; + + for (test=Level+1; + testsmallest.user_key(); + large=(*it)->largest.user_key(); + expired_file=!Ver.OverlapInLevel(test, &small, + &large); + } // for + ret_flag=ret_flag || expired_file; + } // if + + // expired_file and no overlap? mark it for delete + if (expired_file && NULL!=Edit) + { + Edit->DeleteFile((*it)->level, (*it)->number); + } // if + } // for + } // if + + return(ret_flag); + +} // ExpiryModuleOS::CompactionFinalizeCallback + + +/** + * Review the metadata of one file to see if it is + * eligible for file expiry + */ +bool +ExpiryModuleOS::IsFileExpired( + const FileMetaData & SstFile, + ExpiryTimeMicros NowMicros) const +{ + bool expired_file; + ExpiryTimeMicros aged_micros; + + aged_micros=NowMicros - GetExpiryMinutes()*60*port::UINT64_ONE_SECOND_MICROS; + + // must test whole_file_expiry here since this could be + // a bucket's ExpiryModuleOS object, not the default in Options + expired_file = (IsExpiryEnabled() && IsWholeFileExpiryEnabled()); + + // - if exp_write_low is zero, game over - contains non-expiry records + // - if exp_write_high is below current aged time and aging enabled, + // or no exp_write_high keys (is zero) + // - highest explicit expiry (exp_explicit_high) is non-zero and below now + // Note: 
say file only contained deleted records: ... still delete file + // exp_write_low would be ULLONG_MAX, exp_write_high would be 0, exp_explicit_high would be zero + expired_file = expired_file && (0!=SstFile.exp_write_low) + && (0!=SstFile.exp_write_high || 0!=SstFile.exp_explicit_high); + expired_file = expired_file && ((SstFile.exp_write_high<=aged_micros + && 0!=GetExpiryMinutes() && !IsExpiryUnlimited()) + || 0==SstFile.exp_write_high); + + expired_file = expired_file && (0==SstFile.exp_explicit_high + || (0!=SstFile.exp_explicit_high + && SstFile.exp_explicit_high<=NowMicros)); + + return(expired_file); + +} // ExpiryModuleOS::IsFileExpired + + +/** + * Riak specific routine to process whole file expiry. + * Code here derived from DBImpl::CompactMemTable() in db/db_impl.cc + */ +Status +DBImpl::BackgroundExpiry( + Compaction * Compact) +{ + Status s; + size_t count; + + mutex_.AssertHeld(); + assert(NULL != Compact && NULL!=options_.expiry_module.get()); + assert(NULL != Compact->version()); + + if (NULL!=Compact && options_.ExpiryActivated()) + { + VersionEdit edit; + int level(Compact->level()); + + // Compact holds a reference count to version()/input_version_ + const Version* base = Compact->version(); + options_.expiry_module->CompactionFinalizeCallback(true, *base, level, + &edit); + count=edit.DeletedFileCount(); + + if (s.ok() && shutting_down_.Acquire_Load()) { + s = Status::IOError("Deleting DB during expiry compaction"); + } + + // push expired list to manifest + if (s.ok() && 0!=count) + { + s = versions_->LogAndApply(&edit, &mutex_); + if (s.ok()) + gPerfCounters->Add(ePerfExpiredFiles, count); + else + s = Status::IOError("LogAndApply error during expiry compaction"); + } // if + + // Commit to the new state + if (s.ok() && 0!=count) + { + // get rid of Compact now to potential free + // input version's files + delete Compact; + Compact=NULL; + + DeleteObsoleteFiles(); + + // release mutex when writing to log file + mutex_.Unlock(); + + 
Log(options_.info_log, + "Expired: %zd files from level %d", + count, level); + mutex_.Lock(); + } // if + } // if + + // convention in BackgroundCompaction() is to delete Compact here + delete Compact; + + return s; + +} // DBImpl:BackgroundExpiry + + +} // namespace leveldb diff --git a/src/leveldb/util/expiry_os.h b/src/leveldb/util/expiry_os.h new file mode 100644 index 000000000..f4044b85e --- /dev/null +++ b/src/leveldb/util/expiry_os.h @@ -0,0 +1,137 @@ +// ------------------------------------------------------------------- +// +// expiry_os.h +// +// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// ------------------------------------------------------------------- + +#ifndef EXPIRY_OS_H +#define EXPIRY_OS_H + +#include + +#include "leveldb/options.h" +#include "leveldb/expiry.h" +#include "leveldb/perf_count.h" +#include "db/dbformat.h" +#include "db/version_edit.h" + +namespace leveldb +{ + +class ExpiryModuleOS : public ExpiryModule +{ +public: + ExpiryModuleOS() + : expiry_enabled(false), expiry_minutes(0), + expiry_unlimited(false), whole_file_expiry(false) + {}; + + virtual ~ExpiryModuleOS() {}; + + // Print expiry options to LOG file + virtual void Dump(Logger * log) const; + + // Quick test to allow manifest logic and such know if + // extra expiry logic should be checked + virtual bool ExpiryActivated() const {return(expiry_enabled);}; + + // db/write_batch.cc MemTableInserter::Put() calls this. + // returns false on internal error + virtual bool MemTableInserterCallback( + const Slice & Key, // input: user's key about to be written + const Slice & Value, // input: user's value object + ValueType & ValType, // input/output: key type. call might change + ExpiryTimeMicros & Expiry) const; // input/output: 0 or specific expiry. call might change + + // db/dbformat.cc KeyRetirement::operator() calls this. + // db/version_set.cc SaveValue() calls this too. + // returns true if key is expired, returns false if key not expired + virtual bool KeyRetirementCallback( + const ParsedInternalKey & Ikey) const; // input: key to examine for retirement + + // table/table_builder.cc TableBuilder::Add() calls this. + // returns false on internal error + virtual bool TableBuilderCallback( + const Slice & key, // input: internal key + SstCounters & counters) const; // input/output: counters for new sst table + + // db/memtable.cc MemTable::Get() calls this. 
+ // returns true if type/expiry is expired, returns false if not expired + virtual bool MemTableCallback( + const Slice & Key) const; // input: leveldb internal key + + // db/version_set.cc VersionSet::Finalize() calls this if no + // other compaction selected for a level + // returns true if there is an expiry compaction eligible + virtual bool CompactionFinalizeCallback( + bool WantAll, // input: true - examine all expired files + const Version & Ver, // input: database state for examination + int Level, // input: level to review for expiry + VersionEdit * Edit) const; // output: NULL or destination of delete list + + // utility to CompactionFinalizeCallback to review + // characteristics of one SstFile to see if entirely expired + virtual bool IsFileExpired(const FileMetaData & SstFile, ExpiryTimeMicros Now) const; + + // Accessors to option parameters + bool IsExpiryEnabled() const {return(expiry_enabled);}; + void SetExpiryEnabled(bool Flag=true) {expiry_enabled=Flag;}; + + bool IsExpiryUnlimited() const {return(expiry_unlimited);}; + void SetExpiryUnlimited(bool Flag=true) {expiry_unlimited=Flag;}; + + uint64_t GetExpiryMinutes() const {return(expiry_minutes);}; + void SetExpiryMinutes(uint64_t Minutes) {expiry_minutes=Minutes; expiry_unlimited=false;}; + + bool IsWholeFileExpiryEnabled() const {return(whole_file_expiry);}; + void SetWholeFileExpiryEnabled(bool Flag=true) {whole_file_expiry=Flag;}; + +public: + // NOTE: option names below are intentionally public and lowercase with underscores. + // This is to match style of options within include/leveldb/options.h. + + // Riak specific option to enable/disable expiry features globally + // true: expiry enabled + // false: disabled (some expired keys may reappear) + bool expiry_enabled; + + // Riak specific option giving number of minutes a stored key/value + // may stay within the database before automatic deletion. Zero + // disables expiry by age feature. 
+ uint64_t expiry_minutes; + bool expiry_unlimited; + + // Riak specific option authorizing leveldb to eliminate entire + // files that contain expired data (delete files instead of + // removing expired data during compactions). + bool whole_file_expiry; + +protected: + // When "creating" write time, chose its source based upon + // open source versus enterprise edition + virtual uint64_t GenerateWriteTimeMicros(const Slice & Key, const Slice & Value) const; + + +}; // ExpiryModuleOS + +uint64_t CuttlefishDurationMinutes(const char * Buf); + +} // namespace leveldb + +#endif // ifndef diff --git a/src/leveldb/util/expiry_os_test.cc b/src/leveldb/util/expiry_os_test.cc new file mode 100644 index 000000000..5505ef339 --- /dev/null +++ b/src/leveldb/util/expiry_os_test.cc @@ -0,0 +1,1659 @@ +// ------------------------------------------------------------------- +// +// expiry_os_tests.cc +// +// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// ------------------------------------------------------------------- + +#include +#include +#include +#include + +#include "util/testharness.h" +#include "util/testutil.h" + +#include "leveldb/comparator.h" +#include "leveldb/env.h" +#include "leveldb/options.h" +#include "leveldb/slice.h" +#include "leveldb/write_batch.h" + +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "port/port.h" +#include "util/expiry_os.h" +#include "util/mutexlock.h" +#include "util/throttle.h" + +/** + * Execution routine + */ +int main(int argc, char** argv) +{ + return leveldb::test::RunAllTests(); +} + + +namespace leveldb { + +// helper function to clean up heap objects +static void ClearMetaArray(Version::FileMetaDataVector_t & ClearMe); + + +/** + * Wrapper class for tests. Holds working variables + * and helper functions. + */ +class ExpiryTester +{ +public: + ExpiryTester() + { + }; + + ~ExpiryTester() + { + }; +}; // class ExpiryTester + + +/** + * Validate option defaults + */ +TEST(ExpiryTester, Defaults) +{ + ExpiryModuleOS expiry; + + ASSERT_EQ(expiry.IsExpiryEnabled(), false); + ASSERT_EQ(expiry.GetExpiryMinutes(), 0); + ASSERT_EQ(expiry.IsExpiryUnlimited(), false); + ASSERT_EQ(expiry.IsWholeFileExpiryEnabled(), false); + ASSERT_EQ(expiry.ExpiryActivated(), false); + +} // test Defaults + + +/** + * Validate MemTableInserterCallback + */ +TEST(ExpiryTester, MemTableInserterCallback) +{ + bool flag; + uint64_t before, after; + ExpiryModuleOS module; + ValueType type; + ExpiryTimeMicros expiry; + Slice key, value; + + module.SetExpiryEnabled(true); + module.SetWholeFileExpiryEnabled(true); + ASSERT_EQ(module.ExpiryActivated(), true); + + // deletion, do nothing + type=kTypeDeletion; + expiry=0; + flag=module.MemTableInserterCallback(key, value, type, expiry); + ASSERT_EQ(flag, true); + ASSERT_EQ(type, kTypeDeletion); + ASSERT_EQ(expiry, 0); + + // plain value, needs expiry + type=kTypeValue; + expiry=0; 
+ module.SetExpiryMinutes(30); + before=port::TimeMicros(); + SetCachedTimeMicros(before); + flag=module.MemTableInserterCallback(key, value, type, expiry); + after=port::TimeMicros(); + ASSERT_EQ(flag, true); + ASSERT_EQ(type, kTypeValueWriteTime); + ASSERT_TRUE(before <= expiry && expiry <=after && 0!=expiry); + + // plain value, needs expiry + type=kTypeValue; + expiry=0; + module.SetExpiryUnlimited(true); + before=port::TimeMicros(); + SetCachedTimeMicros(before); + flag=module.MemTableInserterCallback(key, value, type, expiry); + after=port::TimeMicros(); + ASSERT_EQ(flag, true); + ASSERT_EQ(type, kTypeValueWriteTime); + ASSERT_TRUE(before <= expiry && expiry <=after && 0!=expiry); + + // plain value, expiry disabled + type=kTypeValue; + expiry=0; + module.SetExpiryMinutes(0); + before=port::TimeMicros(); + SetCachedTimeMicros(before); + flag=module.MemTableInserterCallback(key, value, type, expiry); + after=port::TimeMicros(); + ASSERT_EQ(flag, true); + ASSERT_EQ(type, kTypeValue); + ASSERT_EQ(expiry, 0); + + // write time value, needs expiry + type=kTypeValueWriteTime; + expiry=0; + module.SetExpiryMinutes(30); + before=port::TimeMicros(); + SetCachedTimeMicros(before); + flag=module.MemTableInserterCallback(key, value, type, expiry); + after=port::TimeMicros(); + ASSERT_EQ(flag, true); + ASSERT_EQ(type, kTypeValueWriteTime); + ASSERT_TRUE(before <= expiry && expiry <=after && 0!=expiry); + + // write time value, expiry supplied (as if copied from another db) + type=kTypeValueWriteTime; + module.SetExpiryMinutes(30); + before=port::TimeMicros(); + expiry=before - 1000; + SetCachedTimeMicros(before); + flag=module.MemTableInserterCallback(key, value, type, expiry); + after=port::TimeMicros(); + ASSERT_EQ(flag, true); + ASSERT_EQ(type, kTypeValueWriteTime); + ASSERT_TRUE((before - 1000) == expiry && expiry <=after && 0!=expiry); + + // explicit expiry, not changed + type=kTypeValueExplicitExpiry; + expiry=97531; + module.SetExpiryMinutes(30); + 
flag=module.MemTableInserterCallback(key, value, type, expiry); + ASSERT_EQ(flag, true); + ASSERT_EQ(type, kTypeValueExplicitExpiry); + ASSERT_EQ(expiry, 97531); + +} // test MemTableInserterCallback + + +/** + * Validate MemTableCallback + * (supports KeyRetirementCallback in generic case) + */ +TEST(ExpiryTester, MemTableCallback) +{ + bool flag; + uint64_t before, after; + ExpiryModuleOS module; + ValueType type; + ExpiryTimeMicros expiry; + Slice key, value; + + ASSERT_EQ(module.ExpiryActivated(), false); + module.SetExpiryEnabled(true); + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(5); + ASSERT_EQ(module.ExpiryActivated(), true); + + before=port::TimeMicros(); + SetCachedTimeMicros(before); + + // deletion, do nothing + InternalKey key1("DeleteMeKey", 0, 0, kTypeDeletion); + flag=module.MemTableCallback(key1.internal_key()); + ASSERT_EQ(flag, false); + + // plain value, no expiry + InternalKey key2("PlainKey", 0, 0, kTypeValue); + flag=module.MemTableCallback(key2.internal_key()); + ASSERT_EQ(flag, false); + + // explicit, but time in the future + after=GetCachedTimeMicros() + 60*port::UINT64_ONE_SECOND_MICROS; + InternalKey key3("ExplicitKey", after, 0, kTypeValueExplicitExpiry); + flag=module.MemTableCallback(key3.internal_key()); + ASSERT_EQ(flag, false); + // advance the clock + SetCachedTimeMicros(after + 60*port::UINT64_ONE_SECOND_MICROS); + flag=module.MemTableCallback(key3.internal_key()); + ASSERT_EQ(flag, true); + // disable expiry + module.SetExpiryEnabled(false); + ASSERT_EQ(module.ExpiryActivated(), false); + + flag=module.MemTableCallback(key3.internal_key()); + ASSERT_EQ(flag, false); + + // age expiry + module.SetExpiryEnabled(true); + ASSERT_EQ(module.ExpiryActivated(), true); + module.SetExpiryMinutes(2); + after=GetCachedTimeMicros(); + InternalKey key4("AgeKey", after, 0, kTypeValueWriteTime); + flag=module.MemTableCallback(key4.internal_key()); + ASSERT_EQ(flag, false); + // advance the clock + 
SetCachedTimeMicros(after + 60*port::UINT64_ONE_SECOND_MICROS); + flag=module.MemTableCallback(key4.internal_key()); + ASSERT_EQ(flag, false); + SetCachedTimeMicros(after + 120*port::UINT64_ONE_SECOND_MICROS); + flag=module.MemTableCallback(key4.internal_key()); + ASSERT_EQ(flag, true); + // disable expiry + module.SetExpiryEnabled(false); + flag=module.MemTableCallback(key4.internal_key()); + ASSERT_EQ(flag, false); + // switch to unlimited + module.SetExpiryEnabled(true); + module.SetExpiryUnlimited(true); + flag=module.MemTableCallback(key4.internal_key()); + ASSERT_EQ(flag, false); + +} // test MemTableCallback + + +/** + * Wrapper class to Version that allows manipulation + * of internal objects for testing purposes + */ +class VersionTester : public Version +{ +public: + VersionTester() : Version(&m_Vset), m_Icmp(m_Options.comparator), + m_Vset("", &m_Options, NULL, &m_Icmp) {}; + + void SetFileList(int Level, FileMetaDataVector_t & Files) + {files_[Level]=Files;}; + + Options m_Options; + InternalKeyComparator m_Icmp; + VersionSet m_Vset; +}; // class VersionTester + + +/** + * Validate CompactionFinalizeCallback's + * identification of expired files + */ + +TEST(ExpiryTester, CompactionFinalizeCallback1) +{ + bool flag; + uint64_t now, aged, temp_time; + std::vector files; + FileMetaData * file_ptr; + ExpiryModuleOS module; + VersionTester ver; + int level; + + ASSERT_EQ(ver.m_Options.ExpiryActivated(), false); + + module.SetExpiryEnabled(true); + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(5); + level=config::kNumOverlapLevels; + + now=port::TimeMicros(); + SetCachedTimeMicros(now); + + // put two files into the level, no expiry + file_ptr=new FileMetaData; + file_ptr->smallest.SetFrom(ParsedInternalKey("AA1", 0, 1, kTypeValue)); + file_ptr->largest.SetFrom(ParsedInternalKey("CC1", 0, 2, kTypeValue)); + files.push_back(file_ptr); + + file_ptr=new FileMetaData; + file_ptr->smallest.SetFrom(ParsedInternalKey("DD1", 0, 3, kTypeValue)); + 
file_ptr->largest.SetFrom(ParsedInternalKey("FF1", 0, 4, kTypeValue)); + files.push_back(file_ptr); + + // disable + module.SetExpiryEnabled(false); + module.SetWholeFileExpiryEnabled(false); + module.SetExpiryMinutes(0); + ver.SetFileList(level, files); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // enable and move clock + module.SetExpiryEnabled(true); + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(1); + SetCachedTimeMicros(now + 120*port::UINT64_ONE_SECOND_MICROS); + ver.SetFileList(level, files); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // add file only containing explicit + // (explicit only shown in counts, not keys) + file_ptr=new FileMetaData; + file_ptr->smallest.SetFrom(ParsedInternalKey("GG1", 0, 5, kTypeValue)); + file_ptr->largest.SetFrom(ParsedInternalKey("HH1", 0, 6, kTypeValue)); + file_ptr->exp_write_low=ULLONG_MAX; // sign of no aged expiry, or plain keys + file_ptr->exp_explicit_high=now + 60*port::UINT64_ONE_SECOND_MICROS; + files.push_back(file_ptr); + + // disable + module.SetExpiryEnabled(false); + module.SetWholeFileExpiryEnabled(false); + module.SetExpiryMinutes(0); + ver.SetFileList(level, files); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // enable compaction expiry only + module.SetExpiryEnabled(true); + module.SetWholeFileExpiryEnabled(false); + module.SetExpiryMinutes(1); + ver.SetFileList(level, files); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, 
false); + + // enable file expiry too + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(1); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, true); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, true); + + // enable file, but not expiry minutes (disable) + // ... but file without aged expiries or plain keys + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(0); + ver.SetFileList(level, files); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, true); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, true); + + // enable file, minutes as unlimited + // ... but file without aged expiries or plain keys + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryUnlimited(true); + ver.SetFileList(level, files); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, true); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, true); + + // remove explicit + files.pop_back(); + delete file_ptr; + + // add file only containing aged + // (aging only shown in counts, not keys) + file_ptr=new FileMetaData; + file_ptr->smallest.SetFrom(ParsedInternalKey("II1", 0, 7, kTypeValue)); + file_ptr->largest.SetFrom(ParsedInternalKey("JJ1", 0, 8, kTypeValue)); + file_ptr->exp_write_low=now - 60*port::UINT64_ONE_SECOND_MICROS; + file_ptr->exp_write_high=now + 60*port::UINT64_ONE_SECOND_MICROS; + files.push_back(file_ptr); + + // disable + module.SetWholeFileExpiryEnabled(false); + module.SetExpiryMinutes(0); + ver.SetFileList(level, files); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // enable compaction only + module.SetWholeFileExpiryEnabled(false); + module.SetExpiryMinutes(1); + 
flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // enable file too + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(1); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, true); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, true); + + // enable file, but not expiry minutes (disable) + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(0); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // enable file, but unlimited minutes + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryUnlimited(true); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // file_ptr at 1min, setting at 5 min + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(5); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // file_ptr at 1min, setting at 1m, clock at 30 seconds + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(1); + SetCachedTimeMicros(now + 30*port::UINT64_ONE_SECOND_MICROS); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // file_ptr at 1min, setting at 1m, clock at 1.5minutes + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(1); + SetCachedTimeMicros(now + 90*port::UINT64_ONE_SECOND_MICROS); + flag=module.CompactionFinalizeCallback(true, 
ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // file_ptr at 1min, setting at 1m, clock at 2minutes + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(1); + SetCachedTimeMicros(now + 120*port::UINT64_ONE_SECOND_MICROS); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, true); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, true); + + // same settings, but show an explicit expiry too that has not + // expired + file_ptr->exp_explicit_high=now +240*port::UINT64_ONE_SECOND_MICROS; + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // same settings, but show an explicit expiry has expired + // expired + file_ptr->exp_explicit_high=now +90*port::UINT64_ONE_SECOND_MICROS; + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, true); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, true); + + // bug 1 - thank you Paul Place + // try having the expired file first in the list, followed by non-expired files + std::vector files1(files.size()); + std::reverse_copy(files.begin(), files.end(), files1.begin()); + ver.SetFileList(level, files1); + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, true); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, true); + ver.SetFileList(level, files); + + // same settings, explicit has expired, but not the aged + // expired + file_ptr->exp_write_high=now +240*port::UINT64_ONE_SECOND_MICROS; + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, false); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, false); + + // variations on 
Bug 1 test. Put singleton expired file in + // first, second, then third position. Other two no expiry + files[0]->exp_write_low=ULLONG_MAX; // sign of no aged expiry, or plain keys + files[0]->exp_write_high=0; + files[0]->exp_explicit_high=now +90*port::UINT64_ONE_SECOND_MICROS; + files[1]->exp_write_low=ULLONG_MAX; // sign of no aged expiry, or plain keys + files[1]->exp_write_high=0; + files[1]->exp_explicit_high=0; + files[2]->exp_write_low=ULLONG_MAX; // sign of no aged expiry, or plain keys + files[2]->exp_write_high=0; + files[2]->exp_explicit_high=0; + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, true); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, true); + files[0]->exp_explicit_high=0; + files[1]->exp_explicit_high=now +90*port::UINT64_ONE_SECOND_MICROS; + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, true); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, true); + files[1]->exp_explicit_high=0; + files[2]->exp_explicit_high=now +90*port::UINT64_ONE_SECOND_MICROS; + flag=module.CompactionFinalizeCallback(true, ver, level, NULL); + ASSERT_EQ(flag, true); + flag=module.CompactionFinalizeCallback(false, ver, level, NULL); + ASSERT_EQ(flag, true); + + // clean up phony files or Version destructor will crash + ClearMetaArray(files); + ver.SetFileList(level,files); + +} // test CompactionFinalizeCallback + + +/** + * Building static sets of file levels to increase visibility + */ + +struct TestFileMetaData +{ + uint64_t m_Number; // file number + const char * m_Smallest; + const char * m_Largest; + ExpiryTimeMicros m_Expiry1; // minutes + ExpiryTimeMicros m_Expiry2; + ExpiryTimeMicros m_Expiry3; +}; + + +static void +ClearMetaArray( + Version::FileMetaDataVector_t & ClearMe) +{ + // clean up phony files or Version destructor will crash + std::vector::iterator it; + for (it=ClearMe.begin(); ClearMe.end()!=it; ++it) 
+ delete (*it); + ClearMe.clear(); + +} // ClearMetaArray + + +static void +CreateMetaArray( + Version::FileMetaDataVector_t & Output, + TestFileMetaData * Data, + size_t Count) +{ + size_t loop; + TestFileMetaData * cursor; + FileMetaData * file_ptr; + ExpiryTimeMicros now; + + ClearMetaArray(Output); + now=GetCachedTimeMicros(); + + for (loop=0, cursor=Data; loopnumber=cursor->m_Number; + file_ptr->smallest.SetFrom(ParsedInternalKey(cursor->m_Smallest, 0, cursor->m_Number, kTypeValue)); + file_ptr->largest.SetFrom(ParsedInternalKey(cursor->m_Largest, 0, cursor->m_Number, kTypeValue)); + if (0!=cursor->m_Expiry1) + { + if (ULLONG_MAX!=cursor->m_Expiry1) + file_ptr->exp_write_low=now + cursor->m_Expiry1*60000000; + else + file_ptr->exp_write_low=cursor->m_Expiry1; + } // if + + if (0!=cursor->m_Expiry2) + file_ptr->exp_write_high=now + cursor->m_Expiry2*60000000; + + if (0!=cursor->m_Expiry3) + file_ptr->exp_explicit_high=now + cursor->m_Expiry3*60000000; + + Output.push_back(file_ptr); + } // for + +} // CreateMetaArray + + +/** case: two levels, no overlap, no expiry **/ +TestFileMetaData levelA[]= +{ + {100, "AA", "BA", 0, 0, 0}, + {101, "LA", "NA", 0, 0, 0} +}; // levelA + +TestFileMetaData levelB[]= +{ + {200, "CA", "DA", 0, 0, 0}, + {201, "SA", "TA", 0, 0, 0} +}; // levelB + + +/** case: two levels, 100% overlap, both levels expired **/ +TestFileMetaData levelC[]= +{ + {200, "CA", "DA", 1, 3, 0}, + {201, "SA", "TA", ULLONG_MAX, 0, 4} +}; // levelC + +TestFileMetaData levelD[]= +{ + {200, "CA", "DA", 1, 2, 0}, + {201, "SA", "TA", ULLONG_MAX, 0, 2} +}; // levelD + + +TEST(ExpiryTester, OverlapTests) +{ + bool flag; + Version::FileMetaDataVector_t level1, level2, level_clear, expired_files; + uint64_t now; + ExpiryModuleOS module; + VersionTester ver; + const int overlap0(0), overlap1(1), sorted0(3), sorted1(4); + VersionEdit edit; + + module.SetExpiryEnabled(true); + module.SetWholeFileExpiryEnabled(true); + module.SetExpiryMinutes(2); + + 
now=port::TimeMicros(); + SetCachedTimeMicros(now); + + + /** case: two levels, no overlap, no expiry **/ + CreateMetaArray(level1, levelA, 2); + CreateMetaArray(level2, levelB, 2); + ver.SetFileList(sorted0, level1); + ver.SetFileList(sorted1, level2); + flag=module.CompactionFinalizeCallback(true, ver, sorted0, &edit); + ASSERT_EQ(flag, false); + ASSERT_EQ(edit.DeletedFileCount(), 0); + ver.SetFileList(sorted0, level_clear); + ver.SetFileList(sorted1, level_clear); + + ver.SetFileList(overlap0, level1); + ver.SetFileList(overlap1, level2); + flag=module.CompactionFinalizeCallback(true, ver, overlap0, &edit); + ASSERT_EQ(flag, false); + ASSERT_EQ(edit.DeletedFileCount(), 0); + ver.SetFileList(overlap0, level_clear); + ver.SetFileList(overlap1, level_clear); + + ver.SetFileList(overlap0, level1); + ver.SetFileList(sorted1, level2); + flag=module.CompactionFinalizeCallback(true, ver, overlap0, &edit); + ASSERT_EQ(flag, false); + ASSERT_EQ(edit.DeletedFileCount(), 0); + ver.SetFileList(overlap0, level_clear); + ver.SetFileList(sorted1, level_clear); + + /** case: two levels, 100% overlap, both levels expired **/ + SetCachedTimeMicros(now); + CreateMetaArray(level1, levelC, 2); + CreateMetaArray(level2, levelD, 2); + SetCachedTimeMicros(now + 5*60000000); + ver.SetFileList(sorted0, level1); + ver.SetFileList(sorted1, level2); + flag=module.CompactionFinalizeCallback(true, ver, sorted0, &edit); + ASSERT_EQ(flag, false); + ASSERT_EQ(edit.DeletedFileCount(), 0); + flag=module.CompactionFinalizeCallback(true, ver, sorted1, &edit); + ASSERT_EQ(flag, true); + ASSERT_EQ(edit.DeletedFileCount(), 2); + + // retest sorted1 with unlimited + module.SetExpiryUnlimited(true); + flag=module.CompactionFinalizeCallback(true, ver, sorted1, &edit); + ASSERT_EQ(flag, true); + ASSERT_EQ(edit.DeletedFileCount(), 2); + + // cleanup + ver.SetFileList(sorted0, level_clear); + ver.SetFileList(sorted1, level_clear); + + ClearMetaArray(level1); + ClearMetaArray(level2); + +} // OverlapTests + + 
+enum eExpiryType +{ + eEXPIRY_NONE=1, + eEXPIRY_AGED=2, + eEXPIRY_EXPLICIT=3 +}; // enum eExpiryType + + +struct sExpiryTestKey +{ + const char * m_Key; // string key + eExpiryType m_Type; // type of expiry + int m_NowMinus; // expiry time to set +}; + + +struct sExpiryTestFile +{ + // File size is generated + int m_Number; + int m_Level; // level for file in manifest + int m_LastValidState; // in a "state" test, how long should this file be around + sExpiryTestKey m_Keys[3]; // low, middle, high key +}; + + +/** + * Note: constructor and destructor NOT called, this is + * an interface class only + */ + +class ExpDB : public DBImpl +{ +public: + ExpDB(const Options& options, const std::string& dbname) + : DBImpl(options, dbname) {} + + + + virtual ~ExpDB() {}; + + VersionSet * GetVersionSet() {return(versions_);}; + const Options * GetOptions() {return(&options_);}; + + void OneCompaction() + { + MutexLock l(&mutex_); + MaybeScheduleCompaction(); + while (IsCompactionScheduled()) + bg_cv_.Wait(); + }; // OneCompaction + + void SetClock(uint64_t Time) + {SetCachedTimeMicros(Time);}; + + void ShiftClockMinutes(int Min) + { + uint64_t shift; + + shift=Min * 60 * port::UINT64_ONE_SECOND_MICROS; + SetCachedTimeMicros(GetCachedTimeMicros() + shift); + }; +}; // class ExpDB + + +class ExpTestModule : public ExpiryModuleOS +{ +public: + ExpTestModule() : m_ExpiryAllow(0), m_AllowLevel(-1) {}; + + mutable int m_ExpiryAllow; + mutable int m_AllowLevel; + + virtual bool CompactionFinalizeCallback( + bool WantAll, const Version & Ver, int Level, + VersionEdit * Edit) const + { + bool flag(false); + + if (0!=m_ExpiryAllow && NULL==Edit) + { + flag=ExpiryModuleOS::CompactionFinalizeCallback(WantAll, Ver, Level, Edit); + + if (flag) + { + m_AllowLevel=Level; + -- m_ExpiryAllow; + } // if + } // if + else if (-1!=m_AllowLevel && NULL!=Edit) + { + flag=ExpiryModuleOS::CompactionFinalizeCallback(WantAll, Ver, Level, Edit); + + if (flag) + { + m_AllowLevel=-1; + } + } // else if + + 
return(flag); + + } // CoompactionFinalizeCallback +}; + + +class ExpiryManifestTester +{ +public: + ExpiryManifestTester() + : m_Good(false), m_DB(NULL), m_Env(Env::Default()), + m_BaseTime(port::TimeMicros()), m_Sequence(1) + { + m_DBName = test::TmpDir() + "/expiry"; + + // clean up previous execution + leveldb::DestroyDB(m_DBName, m_Options); + + m_Options.create_if_missing=true; + m_Options.error_if_exists=false; + + // Note: m_Options.expiry_module is a smart pointer. It + // owns the m_Expiry object and will automatically delete the + // allocation. + m_Expiry=new ExpTestModule; + m_Options.expiry_module=m_Expiry; + m_Expiry->SetExpiryEnabled(true); + + OpenTestDB(); + }; + + ~ExpiryManifestTester() + { + // clean up + delete m_DB; + leveldb::DestroyDB(m_DBName, m_Options); + }; + + bool m_Good; + std::string m_DBName; + Options m_Options; + ExpTestModule * m_Expiry; + Env * m_Env; + ExpDB * m_DB; + uint64_t m_BaseTime; + SequenceNumber m_Sequence; + + void OpenTestDB() + { + leveldb::Status status; + + status=leveldb::DB::Open(m_Options, m_DBName, (DB**)&m_DB); + + m_Good=status.ok(); + ASSERT_OK(status); + m_DB->SetClock(m_BaseTime); + } // OpenTestDB + + + void CreateKey(const sExpiryTestKey & Key, InternalKey & Output) + { + ExpiryTimeMicros expiry; + ValueType type; + + switch(Key.m_Type) + { + case(eEXPIRY_NONE): + expiry=0; + type=kTypeValue; + break; + + case(eEXPIRY_AGED): + expiry=m_BaseTime - Key.m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS; + type=kTypeValueWriteTime; + break; + + case(eEXPIRY_EXPLICIT): + expiry=m_BaseTime + Key.m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS; + type=kTypeValueExplicitExpiry; + break; + } // switch + + ParsedInternalKey ikey(Key.m_Key, expiry, m_Sequence, type); + + Output.SetFrom(ikey); + ++m_Sequence; + } // CreateKey + + + void CreateFile(const sExpiryTestFile & File, VersionEdit & Edit) + { + std::string fname; + Status s; + WritableFile * outfile; + TableBuilder * builder; + InternalKey low_key, 
mid_key, high_key; + uint64_t count1, count2, count3, file_size; + + fname = TableFileName(*m_DB->GetOptions(), File.m_Number, File.m_Level); + s = m_Env->NewWritableFile(fname, &outfile, gMapSize); + ASSERT_OK(s); + builder = new TableBuilder(*m_DB->GetOptions(), outfile); + + CreateKey(File.m_Keys[0], low_key); + CreateKey(File.m_Keys[1], mid_key); + CreateKey(File.m_Keys[2], high_key); + + builder->Add(low_key.internal_key(), "Value"); + builder->Add(mid_key.internal_key(), "Value"); + builder->Add(high_key.internal_key(), "Value"); + + s = builder->Finish(); + ASSERT_OK(s); + + count1=builder->GetExpiryWriteLow(); + count2=builder->GetExpiryWriteHigh(); + count3=builder->GetExpiryExplicitHigh(); + + s = outfile->Sync(); + ASSERT_OK(s); + s = outfile->Close(); + ASSERT_OK(s); + + delete builder; + delete outfile; + + m_Env->GetFileSize(fname, &file_size); + + Edit.AddFile2(File.m_Level, File.m_Number, file_size, + low_key, high_key, + count1, count2, count3); + } // CreateFile + + + void CreateManifest(const sExpiryTestFile * Files, size_t Count) + { + int loop; + const sExpiryTestFile * cursor; + VersionEdit edit; + port::Mutex mutex; + Status s; + + m_Sequence=1; + for (cursor=Files, loop=0; loopGetVersionSet()->LogAndApply(&edit, &mutex); + mutex.Unlock(); + ASSERT_OK(s); + + } // CreateManifest + + + void VerifyManifest(const sExpiryTestFile * Files, size_t Count) + { + const Version::FileMetaDataVector_t * file_list; + Version::FileMetaDataVector_t::const_iterator it; + int current_level, loop, loop1; + const sExpiryTestFile * cursor; + InternalKey low_key, mid_key, high_key; + uint64_t exp_write_low, exp_write_high, exp_explicit_high, expires; + + // setup + current_level=config::kNumLevels; + file_list=NULL; + m_Sequence=1; + + for (cursor=Files, loop=0; loopm_Level!=current_level) + { + current_level=cursor->m_Level; + file_list=&m_DB->GetVersionSet()->current()->GetFileList(current_level); + it=file_list->begin(); + } // if + + // not set by builder 
ASSERT_EQ((*it)->num_entries, 3); + ASSERT_EQ((*it)->level, cursor->m_Level); + + // same code as above, just basic verification + CreateKey(cursor->m_Keys[0], low_key); + CreateKey(cursor->m_Keys[1], mid_key); // need to keep sequence # correct + CreateKey(cursor->m_Keys[2], high_key); + + ASSERT_TRUE(0==m_Options.comparator->Compare(low_key.internal_key(), + (*it)->smallest.internal_key())); + ASSERT_TRUE(0==m_Options.comparator->Compare(high_key.internal_key(), + (*it)->largest.internal_key())); + + // create our idea of the expiry settings + exp_write_low=ULLONG_MAX; + exp_write_high=0; + exp_explicit_high=0; + + for (loop1=0; loop1<3; ++loop1) + { + switch(cursor->m_Keys[loop1].m_Type) + { + case eEXPIRY_NONE: + exp_write_low=0; + break; + + case eEXPIRY_AGED: + expires=m_BaseTime - cursor->m_Keys[loop1].m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS; + if (expiresm_Keys[loop1].m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS; + if (exp_explicit_highexp_write_low); + ASSERT_EQ(exp_write_high, (*it)->exp_write_high); + ASSERT_EQ(exp_explicit_high, (*it)->exp_explicit_high); + + // inc here since not initialized upon for loop entry + ++it; + } // for + + return; + + } // VerifyManifest + + void VerifyFiles(const sExpiryTestFile * Files, size_t Count, int State) + { + int current_level, loop, loop1; + std::vector file_names; + std::vector::iterator f_it; + + std::string dir_name, target; + const sExpiryTestFile * cursor; + + current_level=-1; + + for (cursor=Files, loop=0; loopm_Level!=current_level) + { + // should be no files left in list upon level change + // (except "." and "..") + ASSERT_LE(file_names.size(), 2); + file_names.clear(); + + current_level=cursor->m_Level; + dir_name=MakeDirName2(*m_DB->GetOptions(), current_level, "sst"); + m_Env->GetChildren(dir_name, &file_names); + } // if + + // is file still found on disk? 
+ if (State <= cursor->m_LastValidState) + { + // -2 omits directory + target=TableFileName(*m_DB->GetOptions(), cursor->m_Number, -2); + target.erase(0,target.find_last_of('/')+1); + f_it=std::find(file_names.begin(), file_names.end(), target); + ASSERT_TRUE(file_names.end()!=f_it); + file_names.erase(f_it); + } // if + } // for + + // verify last populated level was good + ASSERT_LE(file_names.size(), 2); + + return; + + } // VerifyManifest + + + void VerifyKeys(const sExpiryTestKey * Key, size_t Count, int Minutes) + { + Iterator * it; + const sExpiryTestKey * cursor; + int loop; + + it=m_DB->NewIterator(ReadOptions()); + it->SeekToFirst(); + + for (cursor=Key, loop=0; loopm_Type && Minutes <= cursor->m_NowMinus) + || (eEXPIRY_AGED == cursor->m_Type && MinutesGetExpiryMinutes())) + { + ASSERT_TRUE(it->Valid()); + ASSERT_TRUE(0==strcmp(cursor->m_Key, it->key().ToString().c_str())); + it->Next(); + } // if + } // for + + delete it; + + return; + + } // VerifyKeys + + +}; // ExpiryManifestTester + + +sExpiryTestFile Manifest1[]= +{ + {101, 6, 0, {{"02", eEXPIRY_NONE, 0}, {"05", eEXPIRY_NONE, 0}, {"07", eEXPIRY_NONE, 0}}}, + {102, 6, 0, {{"12", eEXPIRY_NONE, 0}, {"15", eEXPIRY_AGED, 25}, {"17", eEXPIRY_AGED, 25}}}, + {103, 6, 0, {{"22", eEXPIRY_AGED, 25}, {"25", eEXPIRY_EXPLICIT, 20}, {"27", eEXPIRY_EXPLICIT, 20}}}, + {104, 6, 0, {{"32", eEXPIRY_AGED, 25}, {"35", eEXPIRY_AGED, 25}, {"37", eEXPIRY_NONE, 0}}}, + {105, 6, 0, {{"42", eEXPIRY_AGED, 25}, {"45", eEXPIRY_NONE, 0}, {"47", eEXPIRY_AGED, 25}}}, + + {201, 5, 0, {{"03", eEXPIRY_AGED, 10}, {"05", eEXPIRY_AGED, 10}, {"06", eEXPIRY_AGED, 10}}}, + {202, 5, 0, {{"11", eEXPIRY_NONE, 0}, {"15", eEXPIRY_EXPLICIT, 15}, {"18", eEXPIRY_EXPLICIT, 15}}}, + {203, 5, 0, {{"21", eEXPIRY_EXPLICIT, 15}, {"25", eEXPIRY_EXPLICIT, 15}, {"29", eEXPIRY_AGED, 10}}}, + {204, 5, 0, {{"34", eEXPIRY_EXPLICIT, 15}, {"35", eEXPIRY_EXPLICIT, 15}, {"39", eEXPIRY_NONE, 0}}}, + {205, 5, 0, {{"44", eEXPIRY_EXPLICIT, 15}, {"45", eEXPIRY_NONE, 0}, 
{"46", eEXPIRY_EXPLICIT, 15}}}, + + {301, 4, 0, {{"03", eEXPIRY_EXPLICIT, 5}, {"05", eEXPIRY_EXPLICIT, 5}, {"06", eEXPIRY_EXPLICIT, 5}}}, + {302, 4, 0, {{"11", eEXPIRY_NONE, 0}, {"15", eEXPIRY_AGED, 5}, {"18", eEXPIRY_EXPLICIT, 5}}}, + {303, 4, 0, {{"21", eEXPIRY_EXPLICIT, 5}, {"25", eEXPIRY_AGED, 5}, {"29", eEXPIRY_EXPLICIT, 5}}}, + {304, 4, 0, {{"34", eEXPIRY_EXPLICIT, 5}, {"35", eEXPIRY_AGED, 5}, {"39", eEXPIRY_NONE, 0}}}, + {305, 4, 0, {{"44", eEXPIRY_AGED, 5}, {"45", eEXPIRY_NONE, 0}, {"46", eEXPIRY_EXPLICIT, 5}}} + +}; // Manifest1 + +/** + * Does manifest create correctly? + */ +TEST(ExpiryManifestTester, Manifest1) +{ + size_t manifest_count; + Status s; + + manifest_count=sizeof(Manifest1) / sizeof(Manifest1[0]); + CreateManifest(Manifest1, manifest_count); + + // quick verify + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(6), 5); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(5), 5); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(4), 5); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(3), 0); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(2), 0); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(1), 0); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(0), 0); + + // full verify + VerifyManifest(Manifest1, manifest_count); + + // close, open, verify again + delete m_DB; + OpenTestDB(); + VerifyManifest(Manifest1, manifest_count); + + // close, repair, open, verify + delete m_DB; + s=RepairDB(m_DBName, m_Options); + ASSERT_OK(s); + OpenTestDB(); + VerifyManifest(Manifest1, manifest_count); + + return; +}; + + +sExpiryTestFile Overlap1[]= +{ + // sorted levels + {101, 6, 5, {{"02", eEXPIRY_NONE, 0}, {"05", eEXPIRY_NONE, 0}, {"07", eEXPIRY_NONE, 0}}}, + {102, 6, 2, {{"15", eEXPIRY_AGED, 25}, {"17", eEXPIRY_AGED, 25}, {"20", eEXPIRY_AGED, 25}}}, + + {201, 5, 5, {{"22", eEXPIRY_NONE, 0}, {"24", eEXPIRY_NONE, 0}, {"25", eEXPIRY_NONE, 0}}}, + + {301, 4, 5, {{"06", eEXPIRY_EXPLICIT, 5}, {"07", eEXPIRY_EXPLICIT, 5}, {"10", eEXPIRY_EXPLICIT, 5}}}, + {302, 4, 
0, {{"35", eEXPIRY_EXPLICIT, 5}, {"37", eEXPIRY_EXPLICIT, 5}, {"40", eEXPIRY_EXPLICIT, 5}}}, + + {401, 3, 5, {{"45", eEXPIRY_NONE, 0}, {"46", eEXPIRY_NONE, 0}, {"47", eEXPIRY_NONE, 0}}}, + + {450, 2, 3, {{"11", eEXPIRY_AGED, 25}, {"17", eEXPIRY_AGED, 25}, {"21", eEXPIRY_AGED, 25}}}, + + // Overlap levels + {501, 1, 5, {{"10", eEXPIRY_AGED, 25}, {"17", eEXPIRY_AGED, 25}, {"23", eEXPIRY_AGED, 25}}}, + {502, 1, 5, {{"11", eEXPIRY_NONE, 0}, {"12", eEXPIRY_NONE, 0}, {"15", eEXPIRY_NONE, 0}}}, + {503, 1, 1, {{"33", eEXPIRY_AGED, 25}, {"34", eEXPIRY_AGED, 25}, {"42", eEXPIRY_AGED, 25}}} + + +}; + + +/* + * Test sequence that expired files get selected + */ +TEST(ExpiryManifestTester, Overlap1) +{ + size_t manifest_count; + Status s; + + manifest_count=sizeof(Overlap1) / sizeof(Overlap1[0]); + CreateManifest(Overlap1, manifest_count); + + // quick verify + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(6), 2); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(5), 1); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(4), 2); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(3), 1); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(2), 1); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(1), 3); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(0), 0); + + // full verify + VerifyManifest(Overlap1, manifest_count); + VerifyFiles(Overlap1, manifest_count, 0); + + + // fully enable compaction expiry + m_Expiry->SetExpiryEnabled(false); + ASSERT_EQ(m_Options.ExpiryActivated(), false); + m_Expiry->SetExpiryEnabled(true); + m_Expiry->SetExpiryMinutes(60); + m_Expiry->SetWholeFileExpiryEnabled(true); + ASSERT_EQ(m_Options.ExpiryActivated(), true); + + m_DB->ShiftClockMinutes(10); + m_Expiry->m_ExpiryAllow=1; + m_DB->OneCompaction(); + VerifyFiles(Overlap1, manifest_count, 1); + + // total shift now 30 min + m_DB->ShiftClockMinutes(30); + m_Expiry->m_ExpiryAllow=1; + m_DB->OneCompaction(); + VerifyFiles(Overlap1, manifest_count, 2); + + m_Expiry->m_ExpiryAllow=1; + m_DB->OneCompaction(); 
+ VerifyFiles(Overlap1, manifest_count, 3); + + m_Expiry->m_ExpiryAllow=1; + m_DB->OneCompaction(); + VerifyFiles(Overlap1, manifest_count, 4); + + m_Expiry->m_ExpiryAllow=1; + m_DB->OneCompaction(); + VerifyFiles(Overlap1, manifest_count, 5); + + return; +}; + + +/* + * Test compaction will find all without prompting + */ +TEST(ExpiryManifestTester, Overlap2) +{ + size_t manifest_count; + Status s; + + manifest_count=sizeof(Overlap1) / sizeof(Overlap1[0]); + CreateManifest(Overlap1, manifest_count); + + // quick verify + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(6), 2); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(5), 1); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(4), 2); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(3), 1); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(2), 1); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(1), 3); + ASSERT_EQ(m_DB->GetVersionSet()->NumLevelFiles(0), 0); + + // full verify + VerifyManifest(Overlap1, manifest_count); + VerifyFiles(Overlap1, manifest_count, 0); + + // enable compaction expiry + m_Expiry->SetExpiryEnabled(true); + m_Expiry->SetExpiryMinutes(60); + m_Expiry->SetWholeFileExpiryEnabled(true); + m_DB->ShiftClockMinutes(61); + + m_Expiry->m_ExpiryAllow=10; + m_DB->OneCompaction(); + + // let multiple threads complete + /// sleep(1) required for Smart OS 1.8 buildbot + /// then rased to sleep(2) for freebsd buildbot + sleep(2); + VerifyFiles(Overlap1, manifest_count, 5); + + return; +}; + + +sExpiryTestKey Compact1[]= +{ + {"01", eEXPIRY_AGED, 0}, + {"02", eEXPIRY_EXPLICIT, 35}, + {"03", eEXPIRY_AGED, 0}, + {"04", eEXPIRY_EXPLICIT, 55}, + {"05", eEXPIRY_AGED, 0}, + {"06", eEXPIRY_EXPLICIT, 15}, + {"07", eEXPIRY_AGED, 0}, + {"08", eEXPIRY_EXPLICIT, 5}, + {"09", eEXPIRY_AGED, 0}, + {"10", eEXPIRY_EXPLICIT, 55}, + {"11", eEXPIRY_AGED, 0}, + {"12", eEXPIRY_EXPLICIT, 65}, + {"13", eEXPIRY_AGED, 0} + +}; + + +/* + * Test expiry records get filtered during regular compaction + * (and expiring all leads to 
file deletion) + */ +TEST(ExpiryManifestTester, Compact1) +{ + size_t key_count; + const sExpiryTestKey * Key; + Status s; + WriteBatch batch; + KeyMetaData meta; + int loop; + ExpiryTimeMicros expiry; + ValueType type; + + // enable compaction expiry + m_Expiry->SetExpiryEnabled(true); + m_Expiry->SetExpiryMinutes(30); + m_Expiry->SetWholeFileExpiryEnabled(false); + + key_count=sizeof(Compact1) / sizeof(Compact1[0]); + + for (loop=0, Key=Compact1; loopm_Type) + { + case(eEXPIRY_NONE): + expiry=0; + type=kTypeValue; + break; + + case(eEXPIRY_AGED): + expiry=m_BaseTime - Key->m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS; + type=kTypeValueWriteTime; + break; + + case(eEXPIRY_EXPLICIT): + expiry=m_BaseTime + Key->m_NowMinus * 60 * port::UINT64_ONE_SECOND_MICROS; + type=kTypeValueExplicitExpiry; + break; + } // switch + + meta.m_Type=type; + meta.m_Expiry=expiry; + s=m_DB->Put(WriteOptions(), Key->m_Key, "gig\'em", &meta); + ASSERT_OK(s); + } // for + + // load seem ok? + VerifyKeys(Compact1, key_count, 0); + + // move write buffer to .sst file + // (no expiry in buffer to .sst conversion) + m_DB->TEST_CompactMemTable(); + VerifyKeys(Compact1, key_count, 0); + + m_DB->ShiftClockMinutes(20); + m_DB->TEST_CompactRange(3, NULL, NULL); + VerifyKeys(Compact1, key_count, 20); + + m_DB->ShiftClockMinutes(16); + m_DB->TEST_CompactRange(4, NULL, NULL); + VerifyKeys(Compact1, key_count, 36); + + m_DB->ShiftClockMinutes(35); + m_DB->TEST_CompactRange(5, NULL, NULL); + VerifyKeys(Compact1, key_count, 71); + +} // Compact1 + + +struct sExpiryDBObject +{ + const char * m_Key; // string key + const char * m_Value; // string value + int m_NowMinus; // expiry time to set +}; + + +class ExpiryDBTester +{ +public: + ExpiryDBTester() + : m_Good(false), m_DB(NULL), + m_BaseTime(port::TimeMicros()) + { + m_DBName = test::TmpDir() + "/expiry"; + + // clean up previous execution + leveldb::DestroyDB(m_DBName, m_Options); + + m_Options.create_if_missing=true; + 
m_Options.error_if_exists=false; + + // Note: m_Options.expiry_module is a smart pointer. It + // owns the m_Expiry object and will automatically delete the + // allocation. + m_Expiry=new leveldb::ExpiryModuleOS; + m_Options.expiry_module=m_Expiry; + + OpenTestDB(); + }; + + ~ExpiryDBTester() + { + // clean up + delete m_DB; + leveldb::DestroyDB(m_DBName, m_Options); + }; + + void OpenTestDB() + { + leveldb::Status status; + + status=leveldb::DB::Open(m_Options, m_DBName, (DB**)&m_DB); + + m_Good=status.ok(); + ASSERT_OK(status); + m_DB->SetClock(m_BaseTime); + } // OpenTestDB + +protected: + bool m_Good; + std::string m_DBName; + Options m_Options; + leveldb::ExpiryModuleOS * m_Expiry; + ExpDB * m_DB; + uint64_t m_BaseTime; + +}; // ExpiryDBTester + + +sExpiryDBObject SimpleData[]= +{ + {"aa", "one", 0}, + {"bb", "two", 0}, + {"cc", "three", 0}, + {"dd", "four", 0}, + {"ee", "five", 0} +}; + + +/* + * Do simple writes, see if data disappears + * + */ +TEST(ExpiryDBTester, Simple) +{ + size_t obj_count, loop; + Status s; + sExpiryDBObject * cursor; + std::string buffer; + std::auto_ptr iterator; + + // enable compaction expiry + m_Expiry->SetExpiryEnabled(true); + m_Expiry->SetExpiryMinutes(2); + m_Expiry->SetWholeFileExpiryEnabled(false); + + obj_count=sizeof(SimpleData) / sizeof(SimpleData[0]); + + // load data (now in memory buffer) + for (loop=0, cursor=SimpleData; loopPut(WriteOptions(), cursor->m_Key, cursor->m_Value); + ASSERT_OK(s); + } // for + + // verify we can find it + for (loop=0, cursor=SimpleData; loopGet(ReadOptions(), cursor->m_Key, &buffer); + ASSERT_OK(s); + } // for + + // verify we can walk it + iterator.reset(m_DB->NewIterator(ReadOptions())); + for (loop=0, iterator->SeekToFirst(); loopNext()) + { + ASSERT_EQ(iterator->Valid(), true); + } // for + ASSERT_EQ(iterator->Valid(), false); + + // expiry set to 2 min, so shift 10 + m_DB->ShiftClockMinutes(10); + + // all data gone? 
+ for (loop=0, cursor=SimpleData; loopGet(ReadOptions(), cursor->m_Key, &buffer); + ASSERT_TRUE(s.IsNotFound()); + } // for + + // make it reappear + m_Expiry->SetExpiryUnlimited(true); + for (loop=0, cursor=SimpleData; loopGet(ReadOptions(), cursor->m_Key, &buffer); + ASSERT_OK(s); + } // for + + m_Expiry->SetExpiryMinutes(2); + iterator.reset(m_DB->NewIterator(ReadOptions())); + iterator->SeekToFirst(); + ASSERT_EQ(iterator->Valid(), false); + + // force data from memory buffer to .sst file + // (after shifting clock!!) + m_DB->SetClock(m_BaseTime); + m_DB->CompactRange(NULL, NULL); + + // verify we can find it + for (loop=0, cursor=SimpleData; loopGet(ReadOptions(), cursor->m_Key, &buffer); + ASSERT_OK(s); + } // for + + // verify we can walk it + iterator.reset(m_DB->NewIterator(ReadOptions())); + for (loop=0, iterator->SeekToFirst(); loopNext()) + { + ASSERT_EQ(iterator->Valid(), true); + } // for + ASSERT_EQ(iterator->Valid(), false); + + // expiry set to 2 min, so shift 10 + m_DB->ShiftClockMinutes(10); + + // all data gone? + for (loop=0, cursor=SimpleData; loopGet(ReadOptions(), cursor->m_Key, &buffer); + ASSERT_TRUE(s.IsNotFound()); + } // for + + iterator.reset(m_DB->NewIterator(ReadOptions())); + iterator->SeekToFirst(); + ASSERT_EQ(iterator->Valid(), false); + + + // run compaction again with clock advanced + // to physically remove records. Then move + // clock to starting time and prove records gone gone. + /// (note that we "know" .sst file is on level 3) + m_DB->TEST_CompactRange(3, NULL, NULL); + m_DB->SetClock(m_BaseTime); + + // all data gone? + for (loop=0, cursor=SimpleData; loopGet(ReadOptions(), cursor->m_Key, &buffer); + ASSERT_TRUE(s.IsNotFound()); + } // for + + iterator.reset(m_DB->NewIterator(ReadOptions())); + iterator->SeekToFirst(); + ASSERT_EQ(iterator->Valid(), false); + +} // ExpiryDBTester::Simple + + +/** + * Riak uses a special key to mark a "feature upgrade". That + * key must never expire. 
+ */ +// from riak_kv_eleveldb_backend.erl: sext:encode({md,fixed_indexes}). +static const char * MDKey= +{"\x10\x00\x00\x00\x02\x0c\xb6\xd9\x00\x08\x0c\xb3\x5a\x6f\x16\x5b\x25\x7e\xd3\x6e\xb2\x59\x64\x16\x5b\x98\x08"}; +static const int MDKeyLen=27; + +// example Riak key: sext:encode({o,{<>,<>,<>}). +static const char * RiakKey= +{"\x10\x00\x00\x00\x03\x0c\xb7\x80\x08\x10\x00\x00\x00\x02\x12\xb1\x5b\xec\x53\x10\x08\x12\xb1\x5d\x6c\x76\xb9\x88\x08\x12\xb5\xd9\x6f\x33\x10\x08"}; +static const int RiakKeyLen=36; + +TEST(ExpiryDBTester, MetaDataKey) +{ + Slice key_md(MDKey, MDKeyLen); + Slice key_riak(RiakKey, RiakKeyLen); + Slice no_value; + std::string return_value; + KeyMetaData meta; + Status s; + + // enable expiry + m_Expiry->SetExpiryEnabled(true); + m_Expiry->SetExpiryMinutes(2); + m_Expiry->SetWholeFileExpiryEnabled(false); + + // write special key that should not receive expiry + s=m_DB->Put(WriteOptions(), key_md, no_value); + ASSERT_OK(s); + + // verify + s=m_DB->Get(ReadOptions(), key_md, &return_value, &meta); + ASSERT_OK(s); + ASSERT_EQ(meta.m_Type, kTypeValue); + + // write a normal key that SHOULD get expiry + s=m_DB->Put(WriteOptions(), key_riak, no_value); + ASSERT_OK(s); + + // verify + s=m_DB->Get(ReadOptions(), key_riak, &return_value, &meta); + ASSERT_OK(s); + ASSERT_EQ(meta.m_Type, kTypeValueWriteTime); + +} // ExpiryDBTester, MetaDataKey + + + +} // namespace leveldb + diff --git a/src/leveldb/util/flexcache.cc b/src/leveldb/util/flexcache.cc new file mode 100644 index 000000000..d2aebd8d6 --- /dev/null +++ b/src/leveldb/util/flexcache.cc @@ -0,0 +1,129 @@ +// ------------------------------------------------------------------- +// +// flexcache.cc +// +// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include +#include + +#include "util/db_list.h" +#include "util/flexcache.h" + +namespace leveldb { + + +// global cache control +FlexCache gFlexCache; + + +/** + * Initialize object + */ +FlexCache::FlexCache() + : m_TotalMemory(0) +{ + struct rlimit limit; + int ret_val; + + // initialize total memory available based upon system data + ret_val=getrlimit(RLIMIT_DATA, &limit); + + // unsigned long caste to fix warning in smartos1.8, smartos 13.1, solaris10 + if (0==ret_val && (unsigned long)RLIM_INFINITY!=limit.rlim_max) + { + // 2Gig is "small ram", Riak going to be tight + if (limit.rlim_max < flex::kRlimSizeIsSmall) + m_TotalMemory=flex::kRlimSmall; + else + m_TotalMemory=(limit.rlim_max - flex::kRlimLargeReserve) / 2; + } // if + + // create a default similar to Google's original, + // but enough for 2 vnodes including Riak default buffer sizes + else + { + m_TotalMemory=flex::kDefaultMemory; + } // else + + return; + +} // FlexCache::FlexCache + + +/** + * Return current capacity limit for cache flavor indicated, + * default is zero if unknown flavor. 
+ */ +uint64_t +FlexCache::GetDBCacheCapacity( + bool IsInternal) //!< value describing cache attributes of caller +{ + uint64_t ret_val, shared_total; + size_t count, internal_count; + + // get count of database by type + count=DBList()->GetDBCount(IsInternal); + if (IsInternal) + internal_count=count; + else + internal_count=DBList()->GetDBCount(true); + + // what is total memory assigned to a type + if (IsInternal) + shared_total=(m_TotalMemory*2)/10; // integer *.2 + else if (0!=internal_count) + shared_total=(m_TotalMemory*8)/10; + else // no internal database + shared_total=m_TotalMemory; + + // split up type specific aggregate to "per database" value + if (0!=count) + ret_val=shared_total / count; + else + ret_val=shared_total; + + return(ret_val); + +} // FlexCache::GetDBCacheCapacity + + +/** + * Change the memory allocated to all caches, and actively resize + * existing caches + */ +void +FlexCache::SetTotalMemory( + uint64_t Total) //!< new memory allocated to all caches +{ + // only review current allocation if new value is different + // and not zero default + if (0!=Total && Total!=m_TotalMemory) + { + m_TotalMemory=Total; + } // if + + DBList()->ScanDBs(true, &DBImpl::ResizeCaches); + DBList()->ScanDBs(false, &DBImpl::ResizeCaches); + + return; + +} // FlexCache::SetTotalMemory + +} // namespace leveldb diff --git a/src/leveldb/util/flexcache.h b/src/leveldb/util/flexcache.h new file mode 100644 index 000000000..768693a35 --- /dev/null +++ b/src/leveldb/util/flexcache.h @@ -0,0 +1,72 @@ +// ------------------------------------------------------------------- +// +// flexcache.h +// +// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include "util/cache2.h" + +#ifndef STORAGE_LEVELDB_INCLUDE_FLEXCACHE_H_ +#define STORAGE_LEVELDB_INCLUDE_FLEXCACHE_H_ + +namespace leveldb +{ + +// Constants declared in style of db/dbformat.h +namespace flex +{ + + static const uint64_t kRlimSizeIsSmall = 2*1024*1024*1024ULL; // above 2G is lots of ram + static const uint64_t kRlimSmall = 256*1024*1024ULL; + static const uint64_t kRlimLargeReserve = 1024*1024*1024ULL; + static const uint64_t kDefaultMemory = 340*1024*1024ULL; + static const uint64_t kMinimumDBMemory = 10*1024*1024ULL; + +} // namespace flex + +/** + * FlexCache tunes file cache versus block cache versus number + * of open databases + */ + +class FlexCache +{ +public: + FlexCache(); + + uint64_t GetDBCacheCapacity(bool IsInternalDB); + + void SetTotalMemory(uint64_t Total); + + void RecalculateAllocations() {SetTotalMemory(0);}; + + uint64_t GetTotalMemory() const {return(m_TotalMemory);}; + +protected: + + uint64_t m_TotalMemory; //!< complete memory assigned to all FlexCache clients + +}; // class FlexCache + + +extern FlexCache gFlexCache; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_FLEXCACHE_H_ diff --git a/src/leveldb/util/flexcache_test.cc b/src/leveldb/util/flexcache_test.cc new file mode 100644 index 000000000..d4b49bd0e --- /dev/null +++ b/src/leveldb/util/flexcache_test.cc @@ -0,0 +1,246 @@ +// ------------------------------------------------------------------- +// +// flexcache_test.cc +// +// Copyright (c) 2013 Basho 
Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include + +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/filter_policy.h" +#include "leveldb/options.h" +#include "leveldb/status.h" +#include "util/db_list.h" +#include "util/testharness.h" + +namespace leveldb { + +class FlexCacheTest { }; + +TEST(FlexCacheTest, UserSizing) { + Options options; + DB * db[10]; + Status st; + std::string dbname, value; + int loop; + char buffer[12]; + + options.create_if_missing=true; + options.filter_policy=NewBloomFilterPolicy2(16); + options.total_leveldb_mem=1000*1024*1024L; + options.write_buffer_size=45*1024*1024L; + + // verify accounting with one database + dbname = test::TmpDir() + "/flexcache0"; + st=DB::Open(options, dbname, &db[0]); + ASSERT_OK(st); + ASSERT_EQ(1, DBList()->GetDBCount(false)); + + db[0]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(922742784L, atoi(value.c_str())); + + db[0]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(920645632L, atoi(value.c_str())); + + // verify accounting with three databases + dbname = test::TmpDir() + "/flexcache1"; + st=DB::Open(options, dbname, &db[1]); + ASSERT_OK(st); + dbname = test::TmpDir() + "/flexcache2"; + st=DB::Open(options, dbname, &db[2]); + ASSERT_OK(st); + ASSERT_EQ(3, DBList()->GetDBCount(false)); + + 
db[0]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(223692117L, atoi(value.c_str())); + + db[0]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(221594965L, atoi(value.c_str())); + + db[1]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(223692117L, atoi(value.c_str())); + + db[1]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(221594965L, atoi(value.c_str())); + + db[2]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(223692117L, atoi(value.c_str())); + + db[2]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(221594965L, atoi(value.c_str())); + + // verify accounting after two databases go away + delete db[0]; + delete db[2]; + + db[1]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(922742784L, atoi(value.c_str())); + + db[1]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(920645632L, atoi(value.c_str())); + + // rebuild from zero to ten databases, verify accounting + delete db[1]; + + options.total_leveldb_mem=3000*1024*1024L; + for(loop=0; loop<10; ++loop) + { + snprintf(buffer, sizeof(buffer), "/flexcache%u", loop); + dbname=test::TmpDir() + buffer; + st=DB::Open(options, dbname, &db[loop]); + ASSERT_OK(st); + ASSERT_EQ(loop+1, DBList()->GetDBCount(false)); + } // for + + for(loop=0; loop<10; ++loop) + { + db[loop]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(188739584l, atoi(value.c_str())); + + db[loop]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(186642432L, atoi(value.c_str())); + } // for + + for (loop=0; loop<10; ++loop) + { + delete db[loop]; + snprintf(buffer, sizeof(buffer), "/flexcache%u", loop); + dbname=test::TmpDir() + buffer; + st=DestroyDB(dbname, options); + ASSERT_OK(st); + } // for + + delete options.filter_policy; + options.filter_policy=NULL; +} + +TEST(FlexCacheTest, MixedSizing) { + Options options; + DB * db[10]; + Status st; + std::string dbname, value; + int loop; + char buffer[12]; + + options.create_if_missing=true; + 
options.filter_policy=NewBloomFilterPolicy2(16); + options.total_leveldb_mem=1000*1024*1024L; + options.write_buffer_size=45*1024*1024L; + + // verify accounting with one user & one internal + dbname = test::TmpDir() + "/flexcache0"; + st=DB::Open(options, dbname, &db[0]); + ASSERT_OK(st); + ASSERT_EQ(1, DBList()->GetDBCount(false)); + ASSERT_EQ(0, DBList()->GetDBCount(true)); + + db[0]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(922742784l, atoi(value.c_str())); + + db[0]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(920645632L, atoi(value.c_str())); + + // add internal + dbname = test::TmpDir() + "/flexcache1"; + options.is_internal_db=true; + options.total_leveldb_mem=1600*1024*1024L; + st=DB::Open(options, dbname, &db[1]); + ASSERT_OK(st); + ASSERT_EQ(1, DBList()->GetDBCount(false)); + ASSERT_EQ(1, DBList()->GetDBCount(true)); + + db[0]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(1216344064l, atoi(value.c_str())); + + db[0]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(1214246912L, atoi(value.c_str())); + + db[1]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(209711104l, atoi(value.c_str())); + + db[1]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(207613952L, atoi(value.c_str())); + + delete db[0]; + ASSERT_EQ(0, DBList()->GetDBCount(false)); + ASSERT_EQ(1, DBList()->GetDBCount(true)); + db[1]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(209711104L, atoi(value.c_str())); + + db[1]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(207613952L, atoi(value.c_str())); + + delete db[1]; + + + // rebuild from zero to ten databases, verify accounting + options.total_leveldb_mem=4000*1024*1024L; + + for(loop=0; loop<10; ++loop) + { + options.is_internal_db=(1==(loop %2)); + snprintf(buffer, sizeof(buffer), "/flexcache%u", loop); + dbname=test::TmpDir() + buffer; + st=DB::Open(options, dbname, &db[loop]); + ASSERT_OK(st); + } // for + + ASSERT_EQ(5, DBList()->GetDBCount(false)); + 
ASSERT_EQ(5, DBList()->GetDBCount(true)); + + for(loop=0; loop<10; ++loop) + { + if (0==(loop %2)) + { + db[loop]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(545255424l, atoi(value.c_str())); + + db[loop]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(543158272L, atoi(value.c_str())); + } // if + else + { + db[loop]->GetProperty("leveldb.block-cache", &value); + ASSERT_EQ(41938944l, atoi(value.c_str())); + + db[loop]->GetProperty("leveldb.file-cache", &value); + ASSERT_EQ(39841792L, atoi(value.c_str())); + } // else + } // for + + for (loop=0; loop<10; ++loop) + { + delete db[loop]; + snprintf(buffer, sizeof(buffer), "/flexcache%u", loop); + dbname=test::TmpDir() + buffer; + st=DestroyDB(dbname, options); + ASSERT_OK(st); + } // for + + delete options.filter_policy; + options.filter_policy=NULL; +} + + +} // namespace leveldb + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/src/leveldb/util/hash.cc b/src/leveldb/util/hash.cc index ed439ce7a..ba1818082 100644 --- a/src/leveldb/util/hash.cc +++ b/src/leveldb/util/hash.cc @@ -6,13 +6,6 @@ #include "util/coding.h" #include "util/hash.h" -// The FALLTHROUGH_INTENDED macro can be used to annotate implicit fall-through -// between switch labels. The real definition should be provided externally. -// This one is a fallback version for unsupported compilers. 
-#ifndef FALLTHROUGH_INTENDED -#define FALLTHROUGH_INTENDED do { } while (0) -#endif - namespace leveldb { uint32_t Hash(const char* data, size_t n, uint32_t seed) { @@ -34,13 +27,13 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) { // Pick up remaining bytes switch (limit - data) { case 3: - h += static_cast(data[2]) << 16; - FALLTHROUGH_INTENDED; + h += data[2] << 16; + // fall through case 2: - h += static_cast(data[1]) << 8; - FALLTHROUGH_INTENDED; + h += data[1] << 8; + // fall through case 1: - h += static_cast(data[0]); + h += data[0]; h *= m; h ^= (h >> r); break; diff --git a/src/leveldb/util/hash_test.cc b/src/leveldb/util/hash_test.cc deleted file mode 100644 index eaa1c92c2..000000000 --- a/src/leveldb/util/hash_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "util/hash.h" -#include "util/testharness.h" - -namespace leveldb { - -class HASH { }; - -TEST(HASH, SignedUnsignedIssue) { - const unsigned char data1[1] = {0x62}; - const unsigned char data2[2] = {0xc3, 0x97}; - const unsigned char data3[3] = {0xe2, 0x99, 0xa5}; - const unsigned char data4[4] = {0xe1, 0x80, 0xb9, 0x32}; - const unsigned char data5[48] = { - 0x01, 0xc0, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x04, 0x00, - 0x00, 0x00, 0x00, 0x14, - 0x00, 0x00, 0x00, 0x18, - 0x28, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - }; - - ASSERT_EQ(Hash(0, 0, 0xbc9f1d34), 0xbc9f1d34); - ASSERT_EQ( - Hash(reinterpret_cast(data1), sizeof(data1), 0xbc9f1d34), - 0xef1345c4); - ASSERT_EQ( - Hash(reinterpret_cast(data2), sizeof(data2), 0xbc9f1d34), - 0x5b663814); - ASSERT_EQ( - Hash(reinterpret_cast(data3), sizeof(data3), 0xbc9f1d34), - 0x323c078f); - ASSERT_EQ( - Hash(reinterpret_cast(data4), sizeof(data4), 0xbc9f1d34), - 0xed21633a); - ASSERT_EQ( - Hash(reinterpret_cast(data5), sizeof(data5), 0x12345678), - 0xf333dabb); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/util/hot_threads.cc b/src/leveldb/util/hot_threads.cc new file mode 100644 index 000000000..95e13229d --- /dev/null +++ b/src/leveldb/util/hot_threads.cc @@ -0,0 +1,351 @@ +// ------------------------------------------------------------------- +// +// hot_threads.cc +// +// Copyright (c) 2011-2015 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- +// HotThread is a subtle variation on the eleveldb_thread_pool. Both +// represent a design pattern that is tested to perform better under +// the Erlang VM than other traditional designs. +// ------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "leveldb/atomics.h" +#include "util/hot_threads.h" +#include "util/thread_tasks.h" + +namespace leveldb { + +HotThreadPool * gImmThreads=NULL; +HotThreadPool * gWriteThreads=NULL; +HotThreadPool * gLevel0Threads=NULL; +HotThreadPool * gCompactionThreads=NULL; + + + +void *ThreadStaticEntry(void *args) +{ + HotThread &tdata = *(HotThread *)args; + + return(tdata.ThreadRoutine()); + +} // ThreadStaticEntry + + +/** + * Worker threads: worker threads have 3 states: + * A. doing nothing, available to be claimed: m_Available=1 + * B. processing work passed by Erlang thread: m_Available=0, m_DirectWork= + * C. 
processing backlog queue of work: m_Available=0, m_DirectWork=NULL + */ +void * +HotThread::ThreadRoutine() +{ + ThreadTask * submission; + + submission=NULL; + + port::SetCurrentThreadName(m_Pool.m_PoolName.c_str()); +#ifdef OS_LINUX + if (0!=m_Nice) + { + pid_t tid; + int ret_val; + + tid = syscall(SYS_gettid); + if (-1!=(int)tid) + { + errno=0; + ret_val=getpriority(PRIO_PROCESS, tid); + // ret_val could be -1 legally, so double test + if (-1!=ret_val || 0==errno) + setpriority(PRIO_PROCESS, tid, ret_val+m_Nice); + + assert((ret_val+m_Nice)==getpriority(PRIO_PROCESS, tid)); + } // if + } // if +#endif + while(!m_Pool.m_Shutdown) + { + // is work assigned yet? + // check backlog work queue if not + if (NULL==submission) + { + // test non-blocking size for hint (much faster) + if (0!=m_Pool.m_WorkQueueAtomic) + { + // retest with locking + SpinLock lock(&m_Pool.m_QueueLock); + + if (!m_Pool.m_WorkQueue.empty()) + { + submission=m_Pool.m_WorkQueue.front(); + m_Pool.m_WorkQueue.pop_front(); + dec_and_fetch(&m_Pool.m_WorkQueueAtomic); + m_Pool.IncWorkDequeued(); + m_Pool.IncWorkWeighted(Env::Default()->NowMicros() + - submission->m_QueueStart); + } // if + } // if + } // if + + + // a work item identified (direct or queue), work it! + // then loop to test queue again + if (NULL!=submission) + { + // execute the job + (*submission)(); + if (submission->resubmit()) + { + submission->recycle(); + m_Pool.Submit(submission); + } + + submission->RefDec(); + + submission=NULL; + } // if + + // no work found, attempt to go into wait state + // (but retest queue before sleep due to race condition) + else + { + MutexLock lock(&m_Mutex); + + m_DirectWork=NULL; // safety + + // only wait if we are really sure no work pending + if (0==m_Pool.m_WorkQueueAtomic) + { + // yes, thread going to wait. set available now. 
+ m_Available=1; + m_Condition.Wait(); + } // if + + m_Available=0; // safety + submission=(ThreadTask *)m_DirectWork; // NULL is valid + m_DirectWork=NULL;// safety + } // else + } // while + + return 0; + +} // HotThread::ThreadRoutine + + + + +HotThreadPool::HotThreadPool( + const size_t PoolSize, + const char * Name, + enum PerformanceCountersEnum Direct, + enum PerformanceCountersEnum Queued, + enum PerformanceCountersEnum Dequeued, + enum PerformanceCountersEnum Weighted, + int Nice) + : m_PoolName((Name?Name:"")), // this crashes if Name is NULL ...but need it set now + m_Shutdown(false), + m_WorkQueueAtomic(0), + m_DirectCounter(Direct), m_QueuedCounter(Queued), + m_DequeuedCounter(Dequeued), m_WeightedCounter(Weighted) +{ + int ret_val; + size_t loop; + HotThread * hot_ptr; + + ret_val=0; + for (loop=0; loopm_ThreadId, NULL, &ThreadStaticEntry, hot_ptr); + if (0==ret_val) + m_Threads.push_back(hot_ptr); + else + delete hot_ptr; + } // for + + m_Shutdown=(0!=ret_val); + + return; + +} // HotThreadPool::HotThreadPool + + +HotThreadPool::~HotThreadPool() +{ + ThreadPool_t::iterator thread_it; + WorkQueue_t::iterator work_it; + // set flag + m_Shutdown=true; + + // get all threads stopped + for (thread_it=m_Threads.begin(); m_Threads.end()!=thread_it; ++thread_it) + { + { + MutexLock lock(&(*thread_it)->m_Mutex); + (*thread_it)->m_Condition.SignalAll(); + } // lock + + pthread_join((*thread_it)->m_ThreadId, NULL); + delete *thread_it; + } // for + + // release any objects hanging in work queue + for (work_it=m_WorkQueue.begin(); m_WorkQueue.end()!=work_it; ++work_it) + { + (*work_it)->RefDec(); + } // for + + return; + +} // HotThreadPool::~HotThreadPool + + +bool // returns true if available worker thread found and claimed +HotThreadPool::FindWaitingThread( + ThreadTask * work, // non-NULL to pass current work directly to a thread, + // NULL to potentially nudge an available worker toward backlog queue + bool OkToQueue) +{ + bool ret_flag; + size_t start, 
index, pool_size; + + ret_flag=false; + + // pick "random" place in thread list. hopefully + // list size is prime number. + pool_size=m_Threads.size(); + if (OkToQueue) + start=(size_t)pthread_self() % pool_size; + else + start=0; + index=start; + + do + { + // perform quick test to see thread available + if (0!=m_Threads[index]->m_Available && !shutdown_pending()) + { + // perform expensive compare and swap to potentially + // claim worker thread (this is an exclusive claim to the worker) + ret_flag = compare_and_swap(&m_Threads[index]->m_Available, 1, 0); + + // the compare/swap only succeeds if worker thread is sitting on + // pthread_cond_wait ... or is about to be there but is holding + // the mutex already + if (ret_flag) + { + + // man page says mutex lock optional, experience in + // this code says it is not. using broadcast instead + // of signal to cover one other race condition + // that should never happen with single thread waiting. + MutexLock lock(&m_Threads[index]->m_Mutex); + m_Threads[index]->m_DirectWork=work; + m_Threads[index]->m_Condition.SignalAll(); + } // if + } // if + + index=(index+1)%pool_size; + + } while(index!=start && !ret_flag && OkToQueue); + + return(ret_flag); + +} // FindWaitingThread + + +bool +HotThreadPool::Submit( + ThreadTask* item, + bool OkToQueue) +{ + bool ret_flag(false); + + if (NULL!=item) + { + item->RefInc(); + + // do nothing if shutting down + if(shutdown_pending()) + { + item->RefDec(); + ret_flag=false; + } // if + + // try to give work to a waiting thread first + else if (FindWaitingThread(item, OkToQueue)) + { + IncWorkDirect(); + ret_flag=true; + } // else if + + else if (OkToQueue) + { + // hold mutex of only thread 0, this synchronizes this + // thread and the first worker thread to ensure at least + // one thread will eventually see the work item on the queue + // before m_Condition.Wait() + { + item->m_QueueStart=Env::Default()->NowMicros(); + + MutexLock lock(&m_Threads[0]->m_Mutex); + + // no waiting 
threads, put on backlog queue + { + SpinLock lock(&m_QueueLock); + inc_and_fetch(&m_WorkQueueAtomic); + m_WorkQueue.push_back(item); + } + } // mutex released + + // to address race condition, thread might be waiting now + FindWaitingThread(NULL, true); + + IncWorkQueued(); + ret_flag=true; + } // else if + + // did not post to thread or queue + else + { + item->RefDec(); + ret_flag=false; // redundant, but safe + } // else + } // if + + return(ret_flag); + +} // HotThreadPool::Submit + +}; // namespace leveldb diff --git a/src/leveldb/util/hot_threads.h b/src/leveldb/util/hot_threads.h new file mode 100644 index 000000000..039e2506d --- /dev/null +++ b/src/leveldb/util/hot_threads.h @@ -0,0 +1,141 @@ +// ------------------------------------------------------------------- +// +// hot_threads.h +// +// Copyright (c) 2011-2015 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- +// HotThread is a subtle variation on the eleveldb_thread_pool. Both +// represent a design pattern that is tested to perform better under +// the Erlang VM than other traditional designs. 
+// ------------------------------------------------------------------- + +#ifndef STORAGE_LEVELDB_INCLUDE_HOT_THREADS_H_ +#define STORAGE_LEVELDB_INCLUDE_HOT_THREADS_H_ + +#include +#include +#include +#include + +#include "leveldb/perf_count.h" +#include "port/port.h" +#include "util/mutexlock.h" + +namespace leveldb +{ + +// forward declare +class ThreadTask; + +/** + * Meta / managment data related to a worker thread. + */ +struct HotThread +{ +public: + pthread_t m_ThreadId; //!< handle for this thread + + volatile uint32_t m_Available; //!< 1 if thread waiting, using standard type for atomic operation + class HotThreadPool & m_Pool; //!< parent pool object + volatile ThreadTask * m_DirectWork; //!< work passed direct to thread + int m_Nice; //!< amount to adjust sched priority + + port::Mutex m_Mutex; //!< mutex for condition variable + port::CondVar m_Condition; //!< condition for thread waiting + +public: + HotThread(class HotThreadPool & Pool, int Nice) + : m_Available(0), m_Pool(Pool), m_DirectWork(NULL), m_Nice(Nice), + m_Condition(&m_Mutex) + {} // HotThread + + virtual ~HotThread() {}; + + // actual work loop + void * ThreadRoutine(); + +private: + HotThread(); // no default + HotThread(const HotThread &); // no copy + HotThread & operator=(const HotThread&); // no assign + +}; // class HotThread + + +class HotThreadPool +{ +public: + std::string m_PoolName; //!< used to name threads for gdb / core + typedef std::deque WorkQueue_t; + typedef std::vector ThreadPool_t; + + volatile bool m_Shutdown; //!< should we stop threads and shut down? + + ThreadPool_t m_Threads; //!< pool of fast response workers + + WorkQueue_t m_WorkQueue; + port::Spin m_QueueLock; //!< protects access to work_queue + volatile size_t m_WorkQueueAtomic; //!< atomic size to parallel work_queue.size(). 
+ + enum PerformanceCountersEnum m_DirectCounter; + enum PerformanceCountersEnum m_QueuedCounter; + enum PerformanceCountersEnum m_DequeuedCounter; + enum PerformanceCountersEnum m_WeightedCounter; + +public: + HotThreadPool(const size_t thread_pool_size, const char * Name, + enum PerformanceCountersEnum Direct, + enum PerformanceCountersEnum Queued, + enum PerformanceCountersEnum Dequeued, + enum PerformanceCountersEnum Weighted, + int Nice=0); + + virtual ~HotThreadPool(); + + static void *ThreadStart(void *args); + + bool FindWaitingThread(ThreadTask * work, bool OkToQueue=true); + + bool Submit(ThreadTask * item, bool OkToQueue=true); + + size_t work_queue_size() const { return m_WorkQueue.size();} + bool shutdown_pending() const { return m_Shutdown; } + leveldb::PerformanceCounters * perf() const {return(leveldb::gPerfCounters);}; + + void IncWorkDirect() {leveldb::gPerfCounters->Inc(m_DirectCounter);}; + void IncWorkQueued() {leveldb::gPerfCounters->Inc(m_QueuedCounter);}; + void IncWorkDequeued() {leveldb::gPerfCounters->Inc(m_DequeuedCounter);}; + void IncWorkWeighted(uint64_t Count) {leveldb::gPerfCounters->Add(m_WeightedCounter, Count);}; + +private: + HotThreadPool(const HotThreadPool &); // nocopy + HotThreadPool& operator=(const HotThreadPool&); // nocopyassign + +}; // class HotThreadPool + +extern HotThreadPool * gImmThreads; +extern HotThreadPool * gWriteThreads; +extern HotThreadPool * gLevel0Threads; +extern HotThreadPool * gCompactionThreads; + +} // namespace leveldb + + +#endif // STORAGE_LEVELDB_INCLUDE_HOT_THREADS_H_ diff --git a/src/leveldb/util/hot_threads_test.cc b/src/leveldb/util/hot_threads_test.cc new file mode 100644 index 000000000..b85c69e66 --- /dev/null +++ b/src/leveldb/util/hot_threads_test.cc @@ -0,0 +1,133 @@ +// ------------------------------------------------------------------- +// +// hot_threads_test.cc +// +// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved. 
+// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + + +#include "util/testharness.h" +#include "util/testutil.h" + +#include "port/port.h" +#include "util/hot_threads.h" +#include "util/mutexlock.h" +#include "util/thread_tasks.h" + +/** + * Execution routine + */ +int main(int argc, char** argv) +{ + return leveldb::test::RunAllTests(); +} + + +namespace leveldb { + +// helper function to clean up heap objects +static void ClearMetaArray(Version::FileMetaDataVector_t & ClearMe); + + +/** + * Wrapper class for tests. Holds working variables + * and helper functions. + */ +class HotThreadsTester +{ +public: + HotThreadsTester() + { + }; + + ~HotThreadsTester() + { + }; +}; // class HotThreadsTester + + +class RaceTask : public ThreadTask +{ +public: + port::Mutex * m_Mutex; + port::CondVar * m_Condition; + volatile bool * m_ReadyFlag; + + RaceTask() {}; + virtual ~RaceTask() {}; + + virtual void operator()() + { + volatile bool flag; + + // is other thread waiting yet + do + { + MutexLock lock(m_Mutex); + flag=*m_ReadyFlag; + } while(!flag); + + { + MutexLock lock(m_Mutex); + *m_ReadyFlag=false; + m_Condition->SignalAll(); + } + }; // operator() + +}; // class RaceTask + +/** + * Reproduce race condition where all threads go to + * into m_Condition.Wait() without seeing new work item + * on queue (valgrind helps make failed code fail). 
+ */ +TEST(HotThreadsTester, RaceCondition) +{ + HotThreadPool pool(1, "RacePool", ePerfDebug0,ePerfDebug1,ePerfDebug2,ePerfDebug3); + port::Mutex race_mutex; + port::CondVar race_condition(&race_mutex); + int loop_count(0); + volatile bool ready_flag; + int loop; + RaceTask * task; + + for (loop=0; loop<10000000; ++loop) + { + task=new RaceTask; + task->m_Mutex=&race_mutex; + task->m_Condition=&race_condition; + task->m_ReadyFlag=&ready_flag; + + ready_flag=false; + pool.Submit(task,true); + + { + MutexLock lock(&race_mutex); + ready_flag=true; + race_condition.Wait(); + } + } // for + + printf("loop: %d\n",loop); +} // test + + + + +} // namespace leveldb + diff --git a/src/leveldb/util/logging.cc b/src/leveldb/util/logging.cc index 6995d9021..a24501bca 100644 --- a/src/leveldb/util/logging.cc +++ b/src/leveldb/util/logging.cc @@ -45,14 +45,37 @@ std::string EscapeString(const Slice& value) { return r; } +std::string +HexString(const Slice& value) +{ + std::string str; + for (size_t i = 0; i < value.size(); i++) { + char c = value[i]; + char buf[10]; + snprintf(buf, sizeof(buf), "%02x", + static_cast(c) & 0xff); + str.append(buf); + } // for + return(str); +} // HexString + +bool ConsumeChar(Slice* in, char c) { + if (!in->empty() && (*in)[0] == c) { + in->remove_prefix(1); + return true; + } else { + return false; + } +} + bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { uint64_t v = 0; int digits = 0; while (!in->empty()) { - unsigned char c = (*in)[0]; + char c = (*in)[0]; if (c >= '0' && c <= '9') { ++digits; - const unsigned int delta = (c - '0'); + const uint64_t delta = (c - '0'); static const uint64_t kMaxUint64 = ~static_cast(0); if (v > kMaxUint64/10 || (v == kMaxUint64/10 && delta > kMaxUint64%10)) { diff --git a/src/leveldb/util/logging.h b/src/leveldb/util/logging.h index 1b450d248..9a3c5b41e 100644 --- a/src/leveldb/util/logging.h +++ b/src/leveldb/util/logging.h @@ -32,6 +32,13 @@ extern std::string NumberToString(uint64_t num); // Escapes any 
non-printable characters found in "value". extern std::string EscapeString(const Slice& value); +// Return human-readable hex string version of "value" +extern std::string HexString(const Slice & value); + +// If *in starts with "c", advances *in past the first character and +// returns true. Otherwise, returns false. +extern bool ConsumeChar(Slice* in, char c); + // Parse a human-readable number from "*in" into *value. On success, // advances "*in" past the consumed number and sets "*val" to the // numeric value. Otherwise, returns false and leaves *in in an diff --git a/src/leveldb/util/lz4.c b/src/leveldb/util/lz4.c new file mode 100644 index 000000000..08cf6b5cd --- /dev/null +++ b/src/leveldb/util/lz4.c @@ -0,0 +1,1516 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2015, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + + +/************************************** +* Tuning parameters +**************************************/ +/* + * HEAPMODE : + * Select how default compression functions will allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#define HEAPMODE 0 + +/* + * ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define ACCELERATION_DEFAULT 1 + + +/************************************** +* CPU Feature Detection +**************************************/ +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + +/************************************** +* Includes +**************************************/ +#include "lz4.h" + + +/************************************** +* Compiler Options +**************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# define FORCE_INLINE static __forceinline +# include +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 
4293) /* disable: C4293: too large shift (32-bits) */ +#else +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# if defined(__GNUC__) || defined(__clang__) +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +#endif /* _MSC_VER */ + +/* LZ4_GCC_VERSION is defined into lz4.h */ +#if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +/************************************** +* Memory routines +**************************************/ +#include /* malloc, calloc, free */ +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include /* memset, memcpy */ +#define MEM_INIT memset + + +/************************************** +* Basic Types +**************************************/ +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + + +/************************************** +* Reading and writing into memory +**************************************/ +#define STEPSIZE sizeof(size_t) + +static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } + +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 i; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val16; + memcpy(&val16, memPtr, 2); + return val16; +} + +static U16 LZ4_readLE16(const void* 
memPtr) +{ + if (LZ4_isLittleEndian()) + { + return LZ4_read16(memPtr); + } + else + { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) + { + memcpy(memPtr, &value, 2); + } + else + { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val32; + memcpy(&val32, memPtr, 4); + return val32; +} + +static U64 LZ4_read64(const void* memPtr) +{ + U64 val64; + memcpy(&val64, memPtr, 8); + return val64; +} + +static size_t LZ4_read_ARCH(const void* p) +{ + if (LZ4_64bits()) + return (size_t)LZ4_read64(p); + else + return (size_t)LZ4_read32(p); +} + + +static void LZ4_copy4(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 4); } + +static void LZ4_copy8(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 8); } + +/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */ +static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* e = (BYTE*)dstEnd; + do { LZ4_copy8(d,s); d+=8; s+=8; } while (d>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, (U32)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return 
(__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } + else /* Big Endian CPU */ + { + if (LZ4_64bits()) + { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll((U64)val) >> 3); +# else + unsigned r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } + } +} + +static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + while (likely(pIn compression run slower on incompressible data */ + + +/************************************** +* Local Structures and types +**************************************/ +typedef struct { + U32 hashTable[HASH_SIZE_U32]; + U32 currentOffset; + U32 initCheck; + const BYTE* dictionary; + BYTE* bufferStart; /* obsolete, used for slideInputBuffer */ + U32 dictSize; +} LZ4_stream_t_internal; + +typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive; +typedef enum { byPtr, byU32, byU16 } tableType_t; + +typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; +typedef enum { noDictIssue = 
0, dictSmall } dictIssue_directive; + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { full = 0, partial = 1 } earlyEnd_directive; + + +/************************************** +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState() { return LZ4_STREAMSIZE; } + + + +/******************************** +* Compression functions +********************************/ + +static U32 LZ4_hashSequence(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +static const U64 prime5bytes = 889523592379ULL; +static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; + const U32 hashMask = (1<> (40 - hashLog)) & hashMask; +} + +static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType) +{ + if (LZ4_64bits()) + return LZ4_hashSequence64(sequence, tableType); + return LZ4_hashSequence((U32)sequence, tableType); +} + +static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); } + +static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase) +{ + switch (tableType) + { + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } + } +} + +static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = 
LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } + if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ +} + +static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + +FORCE_INLINE int LZ4_compress_generic( + void* const ctx, + const char* const source, + char* const dest, + const int inputSize, + const int maxOutputSize, + const limitedOutput_directive outputLimited, + const tableType_t tableType, + const dict_directive dict, + const dictIssue_directive dictIssue, + const U32 acceleration) +{ + LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx; + + const BYTE* ip = (const BYTE*) source; + const BYTE* base; + const BYTE* lowLimit; + const BYTE* const lowRefLimit = ip - dictPtr->dictSize; + const BYTE* const dictionary = dictPtr->dictionary; + const BYTE* const dictEnd = dictionary + dictPtr->dictSize; + const size_t dictDelta = dictEnd - (const BYTE*)source; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 forwardH; + size_t refDelta=0; + + /* Init conditions */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + switch(dict) + { + case noDict: + default: + base = (const BYTE*)source; + lowLimit = 
(const BYTE*)source; + break; + case withPrefix64k: + base = (const BYTE*)source - dictPtr->currentOffset; + lowLimit = (const BYTE*)source - dictPtr->dictSize; + break; + case usingExtDict: + base = (const BYTE*)source - dictPtr->currentOffset; + lowLimit = (const BYTE*)source; + break; + } + if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (inputSize> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx, tableType, base); + if (dict==usingExtDict) + { + if (match<(const BYTE*)source) + { + refDelta = dictDelta; + lowLimit = dictionary; + } + else + { + refDelta = 0; + lowLimit = (const BYTE*)source; + } + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0) + || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; } + + { + /* Encode Literal length */ + unsigned litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) + return 0; /* Check output limit */ + if (litLength>=RUN_MASK) + { + int len = (int)litLength-RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< matchlimit) limit = matchlimit; + matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += MINMATCH + matchLength; + if (ip==limit) + { + unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit); + matchLength += more; + ip += more; + } + } + else + { + matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += MINMATCH + matchLength; + } + + if ((outputLimited) && 
(unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit))) + return 0; /* Check output limit */ + if (matchLength>=ML_MASK) + { + *token += ML_MASK; + matchLength -= ML_MASK; + for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; } + if (matchLength >= 255) { matchLength-=255; *op++ = 255; } + *op++ = (BYTE)matchLength; + } + else *token += (BYTE)(matchLength); + } + + anchor = ip; + + /* Test end of chunk */ + if (ip > mflimit) break; + + /* Fill table */ + LZ4_putPosition(ip-2, ctx, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, ctx, tableType, base); + if (dict==usingExtDict) + { + if (match<(const BYTE*)source) + { + refDelta = dictDelta; + lowLimit = dictionary; + } + else + { + refDelta = 0; + lowLimit = (const BYTE*)source; + } + } + LZ4_putPosition(ip, ctx, tableType, base); + if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1) + && (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { + const size_t lastRun = (size_t)(iend - anchor); + if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) + return 0; /* Check output limit */ + if (lastRun >= RUN_MASK) + { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } + else + { + *op++ = (BYTE)(lastRun<= LZ4_compressBound(inputSize)) + { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); + } + else + { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); + } +} + + +int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ +#if (HEAPMODE) + void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctx; + void* ctxPtr = &ctx; +#endif + + int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); + +#if (HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1); +} + + +/* hidden debug function */ +/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */ +int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t ctx; + + LZ4_resetStream(&ctx); + + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); +} + + +/******************************** +* destSize variant +********************************/ + +static int LZ4_compress_destSize_generic( + void* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + const int targetDstSize, + const tableType_t tableType) +{ + const BYTE* ip = (const BYTE*) src; + const BYTE* base = (const BYTE*) src; + const BYTE* lowLimit = (const BYTE*) src; + const BYTE* anchor = ip; + const BYTE* const iend = ip + *srcSizePtr; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + targetDstSize; + BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */; + BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */); + BYTE* const oMaxSeq = oMaxLit - 1 /* token */; + + U32 forwardH; + + + /* Init conditions */ + if (targetDstSize < 1) return 0; /* Impossible to store anything */ + if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (*srcSizePtr> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) + goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx, tableType, base); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ( ((tableType==byU16) ? 
0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } + + { + /* Encode Literal length */ + unsigned litLength = (unsigned)(ip - anchor); + token = op++; + if (op + ((litLength+240)/255) + litLength > oMaxLit) + { + /* Not enough space for a last match */ + op--; + goto _last_literals; + } + if (litLength>=RUN_MASK) + { + unsigned len = litLength - RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< oMaxMatch) + { + /* Match description too long : reduce it */ + matchLength = (15-1) + (oMaxMatch-op) * 255; + } + //printf("offset %5i, matchLength%5i \n", (int)(ip-match), matchLength + MINMATCH); + ip += MINMATCH + matchLength; + + if (matchLength>=ML_MASK) + { + *token += ML_MASK; + matchLength -= ML_MASK; + while (matchLength >= 255) { matchLength-=255; *op++ = 255; } + *op++ = (BYTE)matchLength; + } + else *token += (BYTE)(matchLength); + } + + anchor = ip; + + /* Test end of block */ + if (ip > mflimit) break; + if (op > oMaxSeq) break; + + /* Fill table */ + LZ4_putPosition(ip-2, ctx, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, ctx, tableType, base); + LZ4_putPosition(ip, ctx, tableType, base); + if ( (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { + size_t lastRunSize = (size_t)(iend - anchor); + if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend) + { + /* adapt lastRunSize to fill 'dst' */ + lastRunSize = (oend-op) - 1; + lastRunSize -= (lastRunSize+240)/255; + } + ip = anchor + lastRunSize; + + if (lastRunSize >= RUN_MASK) + { + size_t accumulator = lastRunSize - RUN_MASK; + 
*op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } + else + { + *op++ = (BYTE)(lastRunSize<= LZ4_compressBound(*srcSizePtr)) /* compression success is guaranteed */ + { + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); + } + else + { + if (*srcSizePtr < LZ4_64Klimit) + return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16); + else + return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? byU32 : byPtr); + } +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (HEAPMODE) + void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctxBody; + void* ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); + +#if (HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/******************************** +* Streaming functions +********************************/ + +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64); + LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ + LZ4_resetStream(lz4s); + return lz4s; +} + +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t)); +} + +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + FREEMEM(LZ4_stream); + return (0); +} + + +#define HASH_UNIT sizeof(size_t) +int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + const BYTE* base; + + if ((dict->initCheck) || (dict->currentOffset > 1 GB)) /* Uninitialized structure, 
or reuse overflow */ + LZ4_resetStream(LZ4_dict); + + if (dictSize < (int)HASH_UNIT) + { + dict->dictionary = NULL; + dict->dictSize = 0; + return 0; + } + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + dict->currentOffset += 64 KB; + base = p - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->currentOffset += dict->dictSize; + + while (p <= dictEnd-HASH_UNIT) + { + LZ4_putPosition(p, dict->hashTable, byU32, base); + p+=3; + } + + return dict->dictSize; +} + + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) +{ + if ((LZ4_dict->currentOffset > 0x80000000) || + ((size_t)LZ4_dict->currentOffset > (size_t)src)) /* address space overflow */ + { + /* rescale hash table */ + U32 delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = (const BYTE*) source; + if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */ + if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd; + LZ4_renormDictT(streamPtr, smallest); + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + /* Check overlapping input/dictionary space */ + { + const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) + { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if 
(streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE*)source) + { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration); + else + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration); + streamPtr->dictSize += (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } + + /* external dictionary mode */ + { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration); + else + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration); + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) +{ + LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict; + int result; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = dictEnd; + if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; + LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest); + + result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, 
usingExtDict, noDictIssue, 1); + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + + return result; +} + + +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; + const BYTE* previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; + + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/******************************* +* Decompression functions +*******************************/ +/* + * This generic decompression function cover all use cases. + * It shall be instantiated several times, using different sets of directives + * Note that it is essential this generic function is really inlined, + * in order to remove useless branches during compilation optimization. + */ +FORCE_INLINE int LZ4_decompress_generic( + const char* const source, + char* const dest, + int inputSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
*/ + + int endOnInput, /* endOnOutputSize, endOnInputSize */ + int partialDecoding, /* full, partial */ + int targetOutputSize, /* only used if partialDecoding==partial */ + int dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* == dest if dict == noDict */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + /* Local Variables */ + const BYTE* ip = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + outputSize; + BYTE* cpy; + BYTE* oexit = op + targetOutputSize; + const BYTE* const lowLimit = lowPrefix - dictSize; + + const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; + const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; + const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Special cases */ + if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ + if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 
0 : -1; /* Empty output buffer */ + if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); + + + /* Main Loop */ + while (1) + { + unsigned token; + size_t length; + const BYTE* match; + + /* get literal length */ + token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) + { + unsigned s; + do + { + s = *ip++; + length += s; + } + while (likely((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-COPYLENGTH))) + { + if (partialDecoding) + { + if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ + if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ + } + else + { + if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ + } + memcpy(op, ip, length); + ip += length; + op += length; + break; /* Necessarily EOF, due to parsing restrictions */ + } + LZ4_wildCopy(op, ip, cpy); + ip += length; op = cpy; + + /* get offset */ + match = cpy - LZ4_readLE16(ip); ip+=2; + if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */ + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) + { + unsigned s; + do + { + if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; + s = *ip++; + length += s; + } while (s==255); + if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + + /* check external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) + { + if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error; /* doesn't respect parsing restriction */ + + if (length <= (size_t)(lowPrefix-match)) + { + /* match can be copied as a single 
segment from external dictionary */ + match = dictEnd - (lowPrefix-match); + memmove(op, match, length); op += length; + } + else + { + /* match encompass external dictionary and current segment */ + size_t copySize = (size_t)(lowPrefix-match); + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + copySize = length - copySize; + if (copySize > (size_t)(op-lowPrefix)) /* overlap within current segment */ + { + BYTE* const endOfMatch = op + copySize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } + else + { + memcpy(op, lowPrefix, copySize); + op += copySize; + } + } + continue; + } + + /* copy repeated sequence */ + cpy = op + length; + if (unlikely((op-match)<8)) + { + const size_t dec64 = dec64table[op-match]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[op-match]; + LZ4_copy4(op+4, match); + op += 8; match -= dec64; + } else { LZ4_copy8(op, match); op+=8; match+=8; } + + if (unlikely(cpy>oend-12)) + { + if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals */ + if (op < oend-8) + { + LZ4_wildCopy(op, match, oend-8); + match += (oend-8) - op; + op = oend-8; + } + while (opprefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. 
+ If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) + { + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += result; + lz4sd->prefixEnd += result; + } + else + { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) + { + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += originalSize; + lz4sd->prefixEnd += originalSize; + } + else + { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, 
lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0); + if (dictStart+dictSize == dest) + { + if (dictSize >= (int)(64 KB - 1)) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0); + } + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize); +} + +/* debug function */ +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); 
+} + + +/*************************************************** +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); } +int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); } +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); } +int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); } +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); } +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); } + +/* +These function names are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. 
+- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } + + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } + +static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base) +{ + MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE); + lz4ds->bufferStart = base; +} + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + if ((((size_t)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ + LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer); + return 0; +} + +void* LZ4_create (char* inputBuffer) +{ + void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64); + LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer); + return lz4ds; +} + +char* LZ4_slideInputBuffer (void* LZ4_Data) +{ + LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data; + int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB); + return (char*)(ctx->bufferStart + dictSize); +} + +/* Obsolete streaming decompression functions */ + +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +#endif /* LZ4_COMMONDEFS_ONLY */ + diff --git a/src/leveldb/util/lz4.h 
b/src/leveldb/util/lz4.h new file mode 100644 index 000000000..99c6ebb03 --- /dev/null +++ b/src/leveldb/util/lz4.h @@ -0,0 +1,360 @@ +/* + LZ4 - Fast LZ compression algorithm + Header File + Copyright (C) 2011-2015, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +#pragma once + +#if defined (__cplusplus) +extern "C" { +#endif + +/* + * lz4.h provides block compression functions, and gives full buffer control to programmer. 
+ * If you need to generate inter-operable compressed data (respecting LZ4 frame specification), + * and can let the library handle its own memory, please use lz4frame.h instead. +*/ + +/************************************** +* Version +**************************************/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 7 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) +int LZ4_versionNumber (void); + +/************************************** +* Tuning parameter +**************************************/ +/* + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) + * Increasing memory usage improves compression ratio + * Reduced memory usage can improve speed, due to cache effect + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache + */ +#define LZ4_MEMORY_USAGE 14 + + +/************************************** +* Simple Functions +**************************************/ + +int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize); +int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize); + +/* +LZ4_compress_default() : + Compresses 'sourceSize' bytes from buffer 'source' + into already allocated 'dest' buffer of size 'maxDestSize'. + Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize). + It also runs faster, so it's a recommended setting. + If the function cannot compress 'source' into a more limited 'dest' budget, + compression stops *immediately*, and the function result is zero. + As a consequence, 'dest' content is not valid. + This function never writes outside 'dest' buffer, nor read outside 'source' buffer. 
+ sourceSize : Max supported value is LZ4_MAX_INPUT_SIZE + maxDestSize : full or partial size of buffer 'dest' (which must be already allocated) + return : the number of bytes written into buffer 'dest' (necessarily <= maxDestSize) + or 0 if compression fails + +LZ4_decompress_safe() : + compressedSize : is the precise full size of the compressed block. + maxDecompressedSize : is the size of destination buffer, which must be already allocated. + return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize) + If destination buffer is not large enough, decoding will stop and output an error code (<0). + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function is protected against buffer overflow exploits, including malicious data packets. + It never writes outside output buffer, nor reads outside input buffer. +*/ + + +/************************************** +* Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + +/* +LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) + This function is primarily useful for memory allocation purposes (destination buffer size). + Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). 
+ Note that LZ4_compress_default() compresses faster when dest buffer size is >= LZ4_compressBound(srcSize) + inputSize : max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) +*/ +int LZ4_compressBound(int inputSize); + +/* +LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows selecting an "acceleration" factor. + The larger the acceleration value, the faster the algorithm, but also the lesser the compression. + It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. + An acceleration value of "1" is the same as regular LZ4_compress_default() + Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1. +*/ +int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration); + + +/* +LZ4_compress_fast_extState() : + Same compression function, just using an externally allocated memory space to store compression state. + Use LZ4_sizeofState() to know how much memory must be allocated, + and allocate it on 8-bytes boundaries (using malloc() typically). + Then, provide it as 'void* state' to compression function. +*/ +int LZ4_sizeofState(void); +int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration); + + +/* +LZ4_compress_destSize() : + Reverse the logic, by compressing as much data as possible from 'source' buffer + into already allocated buffer 'dest' of size 'targetDestSize'. + This function either compresses the entire 'source' content into 'dest' if it's large enough, + or fills 'dest' buffer completely with as much data as possible from 'source'. + *sourceSizePtr : will be modified to indicate how many bytes were read from 'source' to fill 'dest'. + New value is necessarily <= old value. 
+ return : Nb bytes written into 'dest' (necessarily <= targetDestSize) + or 0 if compression fails +*/ +int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize); + + +/* +LZ4_decompress_fast() : + originalSize : is the original and therefore uncompressed size + return : the number of bytes read from the source buffer (in other words, the compressed size) + If the source stream is detected malformed, the function will stop decoding and return a negative result. + Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes. + note : This function fully respects memory boundaries for properly formed compressed data. + It is a bit faster than LZ4_decompress_safe(). + However, it does not provide any protection against intentionally modified data stream (malicious input). + Use this function in trusted environment only (data to decode comes from a trusted source). +*/ +int LZ4_decompress_fast (const char* source, char* dest, int originalSize); + +/* +LZ4_decompress_safe_partial() : + This function decompresses a compressed block of size 'compressedSize' at position 'source' + into destination buffer 'dest' of size 'maxDecompressedSize'. + The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, + reducing decompression time. + return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize) + Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. + Always check how many bytes were decoded. + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function never writes outside of output buffer, and never reads outside of input buffer. 
It is therefore protected against malicious data packets +*/ +int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); + + +/*********************************************** +* Streaming Compression Functions +***********************************************/ +#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4) +#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(long long)) +/* + * LZ4_stream_t + * information structure to track an LZ4 stream. + * important : init this structure content before first use ! + * note : only allocated directly the structure if you are statically linking LZ4 + * If you are using liblz4 as a DLL, please use below construction methods instead. + */ +typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t; + +/* + * LZ4_resetStream + * Use this function to init an allocated LZ4_stream_t structure + */ +void LZ4_resetStream (LZ4_stream_t* streamPtr); + +/* + * LZ4_createStream will allocate and initialize an LZ4_stream_t structure + * LZ4_freeStream releases its memory. + * In the context of a DLL (liblz4), please use these methods rather than the static struct. + * They are more future proof, in case of a change of LZ4_stream_t size. + */ +LZ4_stream_t* LZ4_createStream(void); +int LZ4_freeStream (LZ4_stream_t* streamPtr); + +/* + * LZ4_loadDict + * Use this function to load a static dictionary into LZ4_stream. + * Any previous data will be forgotten, only 'dictionary' will remain in memory. + * Loading a size of 0 is allowed. + * Return : dictionary size, in bytes (necessarily <= 64 KB) + */ +int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/* + * LZ4_compress_fast_continue + * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio. + * Important : Previous data blocks are assumed to still be present and unmodified ! 
+ * 'dst' buffer must be already allocated. + * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero. + */ +int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration); + +/* + * LZ4_saveDict + * If previously compressed data block is not guaranteed to remain available at its memory location + * save it into a safer place (char* safeBuffer) + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue() + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error + */ +int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize); + + +/************************************************ +* Streaming Decompression Functions +************************************************/ + +#define LZ4_STREAMDECODESIZE_U64 4 +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) +typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t; +/* + * LZ4_streamDecode_t + * information structure to track an LZ4 stream. + * init this structure content using LZ4_setStreamDecode or memset() before first use ! + * + * In the context of a DLL (liblz4) please prefer usage of construction methods below. + * They are more future proof, in case of a change of LZ4_streamDecode_t size in the future. + * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure + * LZ4_freeStreamDecode releases its memory. + */ +LZ4_streamDecode_t* LZ4_createStreamDecode(void); +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); + +/* + * LZ4_setStreamDecode + * Use this function to instruct where to find the dictionary. 
+ * Setting a size of 0 is allowed (same effect as reset). + * Return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB) + In the case of a ring buffers, decoding buffer must be either : + - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions) + In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB). + - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. + maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block. + In which case, encoding and decoding buffers do not need to be synchronized, + and encoding ring buffer can have any size, including small ones ( < 64 KB). + - _At least_ 64 KB + 8 bytes + maxBlockSize. + In which case, encoding and decoding buffers do not need to be synchronized, + and encoding ring buffer can have any size, including larger than decoding buffer. + Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer, + and indicate where it is saved using LZ4_setStreamDecode() +*/ +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize); +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize); + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as + a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue() + They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure. 
+*/ +int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize); +int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize); + + + +/************************************** +* Obsolete Functions +**************************************/ +/* Deprecate Warnings */ +/* Should these warnings messages be a problem, + it is generally possible to disable them, + with -Wno-deprecated-declarations for gcc + or _CRT_SECURE_NO_WARNINGS in Visual for example. + You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */ +#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK +# define LZ4_DEPRECATE_WARNING_DEFBLOCK +# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +# if (LZ4_GCC_VERSION >= 405) || defined(__clang__) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif (LZ4_GCC_VERSION >= 301) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") +# define LZ4_DEPRECATED(message) +# endif +#endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */ + +/* Obsolete compression functions */ +/* These functions are planned to start generate warnings by r131 approximately */ +int LZ4_compress (const char* source, char* dest, int sourceSize); +int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize); +int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, 
char* dest, int inputSize, int maxOutputSize); + +/* Obsolete decompression functions */ +/* These function names are completely deprecated and must no longer be used. + They are only provided here for compatibility with older programs. + - LZ4_uncompress is the same as LZ4_decompress_fast + - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe + These function prototypes are now disabled; uncomment them only if you really need them. + It is highly recommended to stop using these prototypes and migrate to maintained ones */ +/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */ +/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */ + +/* Obsolete streaming functions; use new streaming interface whenever possible */ +LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("use LZ4_createStream() instead") int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("use LZ4_resetStream() instead") int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("use LZ4_saveDict() instead") char* LZ4_slideInputBuffer (void* state); + +/* Obsolete streaming decoding functions */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + + +#if defined (__cplusplus) +} +#endif diff --git a/src/leveldb/util/murmurhash.cc b/src/leveldb/util/murmurhash.cc new file mode 100644 index 000000000..2c650d8bd --- /dev/null +++ b/src/leveldb/util/murmurhash.cc @@ -0,0 +1,178 @@ +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash is + under the MIT license. 
+*/ +#include "murmurhash.h" + +#if defined(__x86_64__) + +// ------------------------------------------------------------------- +// +// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment +// and endian-ness issues if used across multiple platforms. +// +// 64-bit hash for 64-bit platforms + +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) +{ + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= ((uint64_t)data2[6]) << 48; + case 6: h ^= ((uint64_t)data2[5]) << 40; + case 5: h ^= ((uint64_t)data2[4]) << 32; + case 4: h ^= ((uint64_t)data2[3]) << 24; + case 3: h ^= ((uint64_t)data2[2]) << 16; + case 2: h ^= ((uint64_t)data2[1]) << 8; + case 1: h ^= ((uint64_t)data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +#elif defined(__i386__) + +// ------------------------------------------------------------------- +// +// Note - This code makes a few assumptions about how your machine behaves - +// +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +// +// And it has a few limitations - +// +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. + +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. 
+ + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +#else + +// ------------------------------------------------------------------- +// +// Same as MurmurHash2, but endian- and alignment-neutral. +// Half the speed though, alas. + +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ) +{ + const unsigned int m = 0x5bd1e995; + const int r = 24; + + unsigned int h = seed ^ len; + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k; + + k = data[0]; + k |= data[1] << 8; + k |= data[2] << 16; + k |= data[3] << 24; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +#endif diff --git a/src/leveldb/util/murmurhash.h b/src/leveldb/util/murmurhash.h new file mode 100644 index 000000000..1f476b664 --- /dev/null +++ b/src/leveldb/util/murmurhash.h @@ -0,0 +1,32 @@ +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash is + under the MIT license. 
+*/ +#ifndef MURMURHASH_H +#define MURMURHASH_H + +#include + +#if defined(__x86_64__) +#define MURMUR_HASH MurmurHash64A +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash64A +typedef uint64_t murmur_t; + +#elif defined(__i386__) +#define MURMUR_HASH MurmurHash2 +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash2 +typedef unsigned int murmur_t; + +#else +#define MURMUR_HASH MurmurHashNeutral2 +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHashNeutral2 +typedef unsigned int murmur_t; + +#endif + +#endif /* MURMURHASH_H */ diff --git a/src/leveldb/util/mutexlock.h b/src/leveldb/util/mutexlock.h index 1ff5a9efa..20dcf6f00 100644 --- a/src/leveldb/util/mutexlock.h +++ b/src/leveldb/util/mutexlock.h @@ -6,7 +6,6 @@ #define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_ #include "port/port.h" -#include "port/thread_annotations.h" namespace leveldb { @@ -20,13 +19,12 @@ namespace leveldb { // ... some complex code, possibly with multiple return paths ... 
// } -class SCOPED_LOCKABLE MutexLock { +class MutexLock { public: - explicit MutexLock(port::Mutex *mu) EXCLUSIVE_LOCK_FUNCTION(mu) - : mu_(mu) { + explicit MutexLock(port::Mutex *mu) : mu_(mu) { this->mu_->Lock(); } - ~MutexLock() UNLOCK_FUNCTION() { this->mu_->Unlock(); } + ~MutexLock() { this->mu_->Unlock(); } private: port::Mutex *const mu_; @@ -35,6 +33,51 @@ class SCOPED_LOCKABLE MutexLock { void operator=(const MutexLock&); }; + +class SpinLock { + public: + explicit SpinLock(port::Spin *sp) : sp_(sp) { + this->sp_->Lock(); + } + ~SpinLock() { this->sp_->Unlock(); } + + private: + port::Spin *const sp_; + // No copying allowed + SpinLock(const SpinLock&); + void operator=(const SpinLock&); +}; + + +class ReadLock { + public: + explicit ReadLock(port::RWMutex *mu) : mu_(mu) { + this->mu_->ReadLock(); + } + ~ReadLock() { this->mu_->Unlock(); } + + private: + port::RWMutex *const mu_; + // No copying allowed + ReadLock(const ReadLock&); + void operator=(const ReadLock&); +}; + + +class WriteLock { + public: + explicit WriteLock(port::RWMutex *mu) : mu_(mu) { + this->mu_->WriteLock(); + } + ~WriteLock() { this->mu_->Unlock(); } + + private: + port::RWMutex *const mu_; + // No copying allowed + WriteLock(const WriteLock&); + void operator=(const WriteLock&); +}; + } // namespace leveldb diff --git a/src/leveldb/util/options.cc b/src/leveldb/util/options.cc index b5e622761..c02635f2d 100644 --- a/src/leveldb/util/options.cc +++ b/src/leveldb/util/options.cc @@ -2,10 +2,26 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#define __STDC_FORMAT_MACROS +#include + #include "leveldb/options.h" #include "leveldb/comparator.h" #include "leveldb/env.h" +#include "leveldb/expiry.h" +#include "leveldb/filter_policy.h" +#include "util/cache2.h" +#include "util/crc32c.h" + +#include "leveldb/expiry.h" + +#if !defined(LEVELDB_VSN) +#define LEVELDB_VSN develop +#endif + +#define XSTR(x) #x +#define STR(x) XSTR(x) namespace leveldb { @@ -14,17 +30,72 @@ Options::Options() create_if_missing(false), error_if_exists(false), paranoid_checks(false), + verify_compactions(true), env(Env::Default()), info_log(NULL), - write_buffer_size(4<<20), + write_buffer_size(60<<20), max_open_files(1000), block_cache(NULL), block_size(4096), + block_size_steps(16), block_restart_interval(16), - max_file_size(2<<20), - compression(kSnappyCompression), - reuse_logs(false), - filter_policy(NULL) { + compression(kLZ4Compression), + filter_policy(NULL), + is_repair(false), + is_internal_db(false), + total_leveldb_mem(2684354560ll), + block_cache_threshold(32<<20), + limited_developer_mem(false), + mmap_size(0), + delete_threshold(1000), + fadvise_willneed(false), + tiered_slow_level(0), + cache_object_warming(true) +{ + } + +void +Options::Dump( + Logger * log) const +{ + Log(log," Version: %s %s", STR(LEVELDB_VSN), CompileOptionsString()); + Log(log," Options.comparator: %s", comparator->Name()); + Log(log," Options.create_if_missing: %d", create_if_missing); + Log(log," Options.error_if_exists: %d", error_if_exists); + Log(log," Options.paranoid_checks: %d", paranoid_checks); + Log(log," Options.verify_compactions: %d", verify_compactions); + Log(log," Options.env: %p", env); + Log(log," Options.info_log: %p", info_log); + Log(log," Options.write_buffer_size: %zd", write_buffer_size); + Log(log," Options.max_open_files: %d", max_open_files); + Log(log," Options.block_cache: %p", block_cache); + Log(log," Options.block_size: %zd", block_size); + Log(log," Options.block_size_steps: %d", block_size_steps); + 
Log(log,"Options.block_restart_interval: %d", block_restart_interval); + Log(log," Options.compression: %d", compression); + Log(log," Options.filter_policy: %s", filter_policy == NULL ? "NULL" : filter_policy->Name()); + Log(log," Options.is_repair: %s", is_repair ? "true" : "false"); + Log(log," Options.is_internal_db: %s", is_internal_db ? "true" : "false"); + Log(log," Options.total_leveldb_mem: %" PRIu64, total_leveldb_mem); + Log(log," Options.block_cache_threshold: %" PRIu64, block_cache_threshold); + Log(log," Options.limited_developer_mem: %s", limited_developer_mem ? "true" : "false"); + Log(log," Options.mmap_size: %" PRIu64, mmap_size); + Log(log," Options.delete_threshold: %" PRIu64, delete_threshold); + Log(log," Options.fadvise_willneed: %s", fadvise_willneed ? "true" : "false"); + Log(log," Options.tiered_slow_level: %d", tiered_slow_level); + Log(log," Options.tiered_fast_prefix: %s", tiered_fast_prefix.c_str()); + Log(log," Options.tiered_slow_prefix: %s", tiered_slow_prefix.c_str()); + Log(log," crc32c: %s", crc32c::IsHardwareCRC() ? "hardware" : "software"); + Log(log," Options.cache_object_warming: %s", cache_object_warming ? "true" : "false"); + Log(log," Options.ExpiryActivated: %s", ExpiryActivated() ? "true" : "false"); + + if (NULL!=expiry_module.get()) + expiry_module->Dump(log); + else + Log(log," Options.expiry_module: NULL"); + +} // Options::Dump + } // namespace leveldb diff --git a/src/leveldb/util/perf_count.cc b/src/leveldb/util/perf_count.cc new file mode 100644 index 000000000..a97efe6d7 --- /dev/null +++ b/src/leveldb/util/perf_count.cc @@ -0,0 +1,664 @@ +// ------------------------------------------------------------------- +// +// perf_count.cc: performance counters LevelDB +// +// Copyright (c) 2012-2016 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_ +#include "leveldb/perf_count.h" +#endif + +#include "leveldb/atomics.h" +#include "util/coding.h" + +#define __STDC_FORMAT_MACROS +#include + +#ifdef OS_SOLARIS +# include +#endif + + +namespace leveldb +{ + +// always have something active in gPerfCounters, eliminates +// need to test for "is shared object attached yet" +static PerformanceCounters LocalStartupCounters; +PerformanceCounters * gPerfCounters(&LocalStartupCounters); + + SstCounters::SstCounters() + : m_IsReadOnly(false), + m_Version(eSstCountVersion), + m_CounterSize(eSstCountEnumSize) + { + memset(m_Counter, 0, sizeof(m_Counter)); + + m_Counter[eSstCountKeySmallest]=ULLONG_MAX; + m_Counter[eSstCountValueSmallest]=ULLONG_MAX; + + return; + + }; // SstCounters::SstCounters + + + void + SstCounters::EncodeTo( + std::string & Dst) const + { + unsigned loop; + + PutVarint32(&Dst, m_Version); + PutVarint32(&Dst, m_CounterSize); + + for(loop=0; loopm_Version) + { + if (!IsReadOnly) + { + memset(ret_ptr, 0, sizeof(PerformanceCounters)); + ret_ptr->m_Version=ePerfVersion; + ret_ptr->m_CounterSize=ePerfCountEnumSize; + } // if + + // bad version match to existing segment + else + { + good=false; + errno=EINVAL; + } // else + } // if + } // if + else + { + good=false; + syslog(LOG_ERR, "shmat failed [%d, %m]", errno); + } // else + + if (good) + { + // make this available 
process wide + gPerfCounters=ret_ptr; + } // if + else + { + ret_ptr=NULL; + m_LastError=errno; + } // else + } // if + else + { + m_LastError=errno; + ret_ptr=NULL; + } // else + + return(ret_ptr); + + }; // PerformanceCounters::Init + + + int + PerformanceCounters::Close( + PerformanceCounters * Counts) + { + int ret_val; + + if (NULL!=Counts && &LocalStartupCounters != Counts) + { + // keep gPerf valid + if (gPerfCounters==Counts) + gPerfCounters=&LocalStartupCounters; + + ret_val=shmdt(Counts); + if (0!=ret_val) + ret_val=errno; + } // if + else + { + ret_val=EINVAL; + } // else + + return(ret_val); + } // PerformanceCounters::Close + + + uint64_t + PerformanceCounters::Inc( + unsigned Index) + { + uint64_t ret_val; + + ret_val=0; + if (Index +#include +#include +#include +#include + +#include "leveldb/perf_count.h" +#include "util/testharness.h" + +namespace leveldb { + +class PerfTest +{ +public: + static PerfTest* current_; + + PerfTest() + { + current_ = this; + } + + ~PerfTest() {}; + + bool + DeleteShm(key_t Key) + { + int ret_val, id; + + id=shmget(Key, 0, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (-1!=id) + ret_val=shmctl(id, IPC_RMID, NULL); + else + ret_val=-1; + + return(0==ret_val); + } + + + bool + CreateShm(key_t Key, size_t Size) + { + int ret_val; + + ret_val=shmget(Key, Size, IPC_CREAT | S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + return(-1!=ret_val); + } + + + void * + MapShm(key_t Key) + { + int id; + void * ret_ptr; + + id=shmget(Key, 0, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (-1!=id) + ret_ptr=shmat(id, NULL, 0); + else + ret_ptr=NULL; + + return(ret_ptr); + } + + + + size_t + GetShmSize(key_t Key) + { + int ret_val, id; + struct shmid_ds shm_info; + + memset(&shm_info, 0, sizeof(shm_info)); + id=shmget(Key, 0, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (-1!=id) + { + ret_val=shmctl(id, IPC_STAT, &shm_info); + + if (0!=ret_val) + shm_info.shm_segsz=0; + } // if + return(shm_info.shm_segsz); + } + +}; // class PerfTest + + 
+PerfTest* PerfTest::current_; + + +TEST(PerfTest, CreateNew) +{ + PerformanceCounters * perf_ptr; + + // clear any existing shm + DeleteShm(ePerfKey); + + // open for write, will create + perf_ptr=PerformanceCounters::Init(false); + ASSERT_NE(perf_ptr, (void*)NULL); + ASSERT_EQ(sizeof(PerformanceCounters), GetShmSize(ePerfKey)); + + // close and reopen for read + ASSERT_EQ(0, PerformanceCounters::Close(perf_ptr)); + + perf_ptr=PerformanceCounters::Init(true); + ASSERT_NE(perf_ptr, (void*)NULL); + ASSERT_EQ(sizeof(PerformanceCounters), GetShmSize(ePerfKey)); + ASSERT_EQ(0, PerformanceCounters::Close(perf_ptr)); + + // cleanup + ASSERT_EQ(true, DeleteShm(ePerfKey)); + + return; + +} // CreateNew + + +TEST(PerfTest, SizeUpgrade) +{ + PerformanceCounters * perf_ptr; + + // clear any existing shm + DeleteShm(ePerfKey); + + // Riak 1.2 was 536 bytes + ASSERT_NE(536, sizeof(PerformanceCounters)); + ASSERT_EQ(true, CreateShm(ePerfKey, 536)); + ASSERT_EQ(536, GetShmSize(ePerfKey)); + + // open for write, will recreate to current size + perf_ptr=PerformanceCounters::Init(false); + ASSERT_NE(perf_ptr, (void*)NULL); + ASSERT_EQ(sizeof(PerformanceCounters), GetShmSize(ePerfKey)); + + // cleanup + ASSERT_EQ(true, DeleteShm(ePerfKey)); + + return; +} // SizeUpgrade + +TEST(PerfTest, ReadLarger) +{ + PerformanceCounters * perf_ptr; + + // clear any existing shm + DeleteShm(ePerfKey); + + // create a segment larger than today's layout (64 extra bytes, 8 extra counters) + ASSERT_EQ(true, CreateShm(ePerfKey, sizeof(PerformanceCounters)+64)); + perf_ptr=(PerformanceCounters *)MapShm(ePerfKey); + ASSERT_NE(perf_ptr, (void*)NULL); + memset(perf_ptr, 0, sizeof(PerformanceCounters)+64); + perf_ptr->SetVersion(ePerfVersion, ePerfCountEnumSize+8); + shmdt(perf_ptr); + + // open for read -- NOTE(review): Init(false) is the write mode in CreateNew above; confirm intent + perf_ptr=PerformanceCounters::Init(false); + ASSERT_NE(perf_ptr, (void*)NULL); + + // cleanup + ASSERT_EQ(true, DeleteShm(ePerfKey)); + + return; +} // ReadLarger + +} // namespace leveldb + +int main(int argc, char** argv) { + return
leveldb::test::RunAllTests(); +} diff --git a/src/leveldb/util/posix_logger.h b/src/leveldb/util/posix_logger.h index c063c2b7c..9dea1d325 100644 --- a/src/leveldb/util/posix_logger.h +++ b/src/leveldb/util/posix_logger.h @@ -3,16 +3,16 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. // // Logger implementation that can be shared by all environments -// where enough Posix functionality is available. +// where enough posix functionality is available. #ifndef STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ #define STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_ -#include #include #include #include #include "leveldb/env.h" +#include "util/mutexlock.h" namespace leveldb { @@ -20,11 +20,23 @@ class PosixLogger : public Logger { private: FILE* file_; uint64_t (*gettid_)(); // Return the thread id for the current thread + public: PosixLogger(FILE* f, uint64_t (*gettid)()) : file_(f), gettid_(gettid) { } virtual ~PosixLogger() { fclose(file_); } + virtual long LogSize() + { + long ret_val; + + // if ftell() gives error, return zero + // to match default class' "does not exist" response + ret_val=ftell(file_); + if (-1==ret_val) + ret_val=0; + return(ret_val); + }; virtual void Logv(const char* format, va_list ap) { const uint64_t thread_id = (*gettid_)(); diff --git a/src/leveldb/util/prop_cache.cc b/src/leveldb/util/prop_cache.cc new file mode 100644 index 000000000..a36fea04c --- /dev/null +++ b/src/leveldb/util/prop_cache.cc @@ -0,0 +1,341 @@ +// ------------------------------------------------------------------- +// +// prop_cache.cc +// +// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include +#include + +#include "port/port.h" +#include "util/prop_cache.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/throttle.h" + +namespace leveldb { + +/** + * lPropCacheLock and lPropCache exist to address a race condition + * where Erlang responds to an information request after telling + * leveldb to shut down. + */ +static port::Spin lPropCacheLock; +static PropertyCachePtr_t lPropCache; + +/** + * Create the cache. Called only once upon + * leveldb initialization + */ +void +PropertyCache::InitPropertyCache( + EleveldbRouter_t Router) +{ + ShutdownPropertyCache(); + lPropCache = new PropertyCache(Router); + + return; + +} // PropertyCache::InitPropertyCache + + +void +PropertyCache::ShutdownPropertyCache() +{ + + SpinLock l(&lPropCacheLock); + lPropCache.reset(); + +} // PropertyCache::ShutdownPropertyCache + + +/** + * Unit test support. Allows use of derived versions + * of PropertyCache that ease testing + */ +void +PropertyCache::SetGlobalPropertyCache( + PropertyCache * NewGlobal) +{ + // (creates infinite loop) ShutdownPropertyCache(); + lPropCache = NewGlobal; + + return; + +} // PropertyCache::SetGlobalPropertyCache + + +/** + * Return the Cache owned by the global PropertyCache. Dereferences + * lPropCache without taking lPropCacheLock or testing for NULL + */ +Cache & +PropertyCache::GetCache() +{ + + return(*lPropCache->GetCachePtr()); + +} // PropertyCache::GetCache + + +/** + * Unit test support.
Destroy current cache, start new one + */ +void +PropertyCache::Flush() +{ + PropertyCachePtr_t ptr; + + // stabilize the object by locking it and + // getting a reference count. Flush while + // holding lock to keep others away + // ... anyone already using the object may segfault + // this is so dangerous ... only for testing + { + SpinLock l(&lPropCacheLock); + ptr=lPropCache; + + if (NULL!=ptr.get()) + ptr->FlushInternal(); + } + +} // PropertyCache::Flush + + +/** + * Construct property cache object (likely singleton) + */ +PropertyCache::PropertyCache( + EleveldbRouter_t Router) + : m_Cache(NULL), m_Router(Router), + m_Cond(&m_Mutex) +{ + m_Cache = NewLRUCache2(GetCacheLimit()); + +} // PropertyCache::PropertyCache + + +PropertyCache::~PropertyCache() +{ + delete m_Cache; + m_Cache=NULL; +} // PropertyCache::~PropertyCache + + +/** + * used by unit & integration tests, must protect against + * background AAE operation requests + */ +void +PropertyCache::FlushInternal() +{ + delete m_Cache; + m_Cache = NewLRUCache2(GetCacheLimit()); + + return; + +} // PropertyCache::FlushInternal + + +/** + * Static lookup: copy the global cache pointer under lPropCacheLock, + * then delegate to LookupInternal; returns NULL if no cache is set + */ +Cache::Handle * +PropertyCache::Lookup( + const Slice & CompositeBucket) +{ + Cache::Handle * ret_handle(NULL); + PropertyCachePtr_t ptr; + + // race condition ... lPropCache going away as ptr assigned + // (unlikely here, but seen in Insert) + { + SpinLock l(&lPropCacheLock); + ptr=lPropCache; + } // lock + + if (NULL!=ptr.get()) + { + ret_handle=ptr->LookupInternal(CompositeBucket); + } // if + + return(ret_handle); + +} // PropertyCache::Lookup + + +/** + * Test if global cache is running, + * does NOT imply it will stay valid + */ +bool +PropertyCache::Valid() +{ + PropertyCachePtr_t ptr; + bool ret_flag(false); + + // race condition ...
lPropCache going away as ptr assigned + // (unlikely here, but seen in Insert) + { + SpinLock l(&lPropCacheLock); + ptr=lPropCache; + } // lock + + if (NULL!=ptr.get()) + { + ret_flag=(NULL!=ptr->m_Cache); + } // if + + return(ret_flag); + +} // PropertyCache::Valid + + +/** + * Retrieve property from cache if available, + * else call out to Riak to get properties + */ +Cache::Handle * +PropertyCache::LookupInternal( + const Slice & CompositeBucket) +{ + Cache::Handle * ret_handle(NULL); + + if (NULL!=m_Cache) + { + ret_handle=m_Cache->Lookup(CompositeBucket); + + // force a reread of properties every 5 minutes + if (NULL!=ret_handle) + { + uint64_t now; + ExpiryModule * mod_ptr; + + now=GetCachedTimeMicros(); + mod_ptr=(ExpiryModule *)m_Cache->Value(ret_handle); + + // some unit tests of mod_ptr of NULL + if (NULL!=mod_ptr && 0!=mod_ptr->ExpiryModuleExpiryMicros() + && mod_ptr->ExpiryModuleExpiryMicros()Release(ret_handle); + m_Cache->Erase(CompositeBucket); + ret_handle=NULL; + } // if + } // if + + // not waiting in the cache already. Request info + if (NULL==ret_handle && NULL!=m_Router) + { + // call to Riak required + ret_handle=LookupWait(CompositeBucket); + gPerfCounters->Inc(ePerfPropCacheMiss); + } // if + else if (NULL!=ret_handle) + { + // cached or no router + gPerfCounters->Inc(ePerfPropCacheHit); + } // else if + } // if + + // never supposed to be missing if property cache in play + if (NULL==ret_handle) + gPerfCounters->Inc(ePerfPropCacheError); + + return(ret_handle); + +} // PropertyCache::LookupInternal + + +/** + * Callback function used when Cache drops an object + * to make room for another due to cache size being exceeded + */ +static void +DeleteProperty( + const Slice& key, + void* value) +{ + ExpiryModuleOS * expiry; + + expiry=(ExpiryModuleOS *)value; + + delete expiry; +} // static DeleteProperty + + +/** + * (static) Add / Overwrite key in property cache. 
Manage handle
+ * on caller's behalf
+ */
+bool
+PropertyCache::Insert(
+    const Slice & CompositeBucket,
+    void * Props,
+    Cache::Handle ** OutputPtr)
+{
+    PropertyCachePtr_t ptr;
+    bool ret_flag(false);
+    Cache::Handle * ret_handle(NULL);
+
+    // race condition ... lPropCache going away as ptr assigned
+    {
+        SpinLock l(&lPropCacheLock);
+        ptr=lPropCache;
+    }   // lock
+
+    if (NULL!=ptr.get() && NULL!=ptr->GetCachePtr())
+    {
+        ret_handle=ptr->InsertInternal(CompositeBucket, Props);
+
+        if (NULL!=OutputPtr)
+            *OutputPtr=ret_handle;
+        else if (NULL!=ret_handle)
+            ptr->GetCachePtr()->Release(ret_handle);  // release on the cache that issued the handle, not whatever the global is now
+
+        ret_flag=(NULL!=ret_handle);
+    }   // if
+
+    return(ret_flag);
+
+}   // PropertyCache::Insert
+
+
+Cache::Handle *
+PropertyCache::InsertInternal(
+    const Slice & CompositeBucket,
+    void * Props)
+{
+    assert(NULL!=m_Cache);
+
+    Cache::Handle * ret_handle(NULL);
+
+    {
+        MutexLock lock(&m_Mutex);   // insert + SignalAll happen under m_Mutex, the mutex m_Cond was built on
+
+        ret_handle=m_Cache->Insert(CompositeBucket, Props, 1, DeleteProperty);
+        m_Cond.SignalAll();
+    }
+
+    return(ret_handle);
+
+}   // PropertyCache::InsertInternal
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/prop_cache.h b/src/leveldb/util/prop_cache.h
new file mode 100644
index 000000000..9ea31f5f1
--- /dev/null
+++ b/src/leveldb/util/prop_cache.h
@@ -0,0 +1,219 @@
+// -------------------------------------------------------------------
+//
+// prop_cache.h
+//
+// Copyright (c) 2016-2017 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#ifndef PROP_CACHE_H
+#define PROP_CACHE_H
+
+#include "leveldb/cache.h"
+#include "util/expiry_os.h"
+#include "util/refobject_base.h"
+#include "port/port.h"
+
+
+namespace leveldb
+{
+
+class PropertyCache : public RefObjectBase
+{
+public:
+    /**
+     * static functions are API for production usage
+     */
+
+    // create global cache object
+    static void InitPropertyCache(EleveldbRouter_t Router);
+
+    // release global cache object
+    static void ShutdownPropertyCache();
+
+    // unit test support
+    static void SetGlobalPropertyCache(PropertyCache * NewCache);
+
+    // static lookup, usually from CachePtr
+    static Cache::Handle * Lookup(const Slice & CompositeBucket);
+
+    // static insert, usually from eleveldb::property_cache()
+    static bool Insert(const Slice & CompositeBucket, void * Props, Cache::Handle ** OutputPtr);
+
+    // static retrieval of active cache
+    static Cache & GetCache();
+
+    // for unit tests, "flush" cache
+    static void Flush();
+
+    // test if cache is running (so OS builds know to ignore)
+    static bool Valid();
+
+    // virtual destructor to facilitate unit tests
+    virtual ~PropertyCache();
+
+protected:
+    /**
+     * protected functions are API for unit tests.  The static functions
+     *  route program flow to these.
+     */
+
+    // only allow creation from InitPropertyCache() or unit tests
+    PropertyCache(EleveldbRouter_t);
+
+    // accessor to m_Cache pointer (really bad if NULL m_Cache)
+    Cache * GetCachePtr() {return(m_Cache);};
+
+    // unit & integration test support to get rid of current cache entries
+    void FlushInternal();
+
+    // internal equivalent to static Lookup() function
+    Cache::Handle * LookupInternal(const Slice & CompositeBucket);
+
+    // internal routine to launch lookup request via eleveldb router, then wait
+    Cache::Handle * LookupWait(const Slice & CompositeBucket);
+
+    // internal routine to insert object and signal condition variable
+    Cache::Handle * InsertInternal(const Slice & CompositeBucket, void * Props);
+
+    // 1000 is number of cache entries.  Just pulled
+    //  that number out of the air.
+    // virtual for unit test to override (NB: a call made from a constructor binds to this base version)
+    virtual int GetCacheLimit() const {return(1000);}
+
+    Cache * m_Cache;
+    EleveldbRouter_t m_Router;
+    port::Mutex m_Mutex;
+    port::CondVar m_Cond;
+
+// The following explicitly disable use of default constructor, copy constructor,
+// and assignment operator (declared private, never defined).
+private:
+    PropertyCache();
+    PropertyCache(const PropertyCache &);
+    PropertyCache & operator=(const PropertyCache &);
+
+};  // class PropertyCache
+
+
+/**
+ * This template instance wraps the entire property cache
+ */
+typedef RefPtr<PropertyCache> PropertyCachePtr_t;
+
+
+/**
+ * This template wraps an object in property cache
+ *  to ensure it is properly released.
+ *  Makes calls to static functions of PropertyCache. 
+ */
+template <typename Object> class CachePtr
+{
+    /****************************************************************
+     *  Member objects
+     ****************************************************************/
+public:
+
+protected:
+    Cache::Handle * m_Ptr;            // NULL or object in cache
+
+private:
+
+    /****************************************************************
+     *  Member functions
+     ****************************************************************/
+public:
+    CachePtr() : m_Ptr(NULL) {};
+
+    ~CachePtr() {Release();};
+
+    // unprotected if GetCache is NULL
+    void Release()
+    {
+        if (NULL!=m_Ptr)
+            PropertyCache::GetCache().Release(m_Ptr);
+        m_Ptr=NULL;
+    };
+
+    CachePtr & operator=(Cache::Handle * Hand) {reset(Hand); return(*this);};
+
+    void reset(Cache::Handle * Hand=NULL)
+    {
+        if (m_Ptr!=Hand)
+        {
+            Release();
+            m_Ptr=Hand;
+        }   // if
+    }
+
+
+    Object * get()
+    {return((NULL!=m_Ptr && PropertyCache::Valid())
+            ? (Object *)PropertyCache::GetCache().Value(m_Ptr)
+            : NULL);};
+
+    // NULL when no handle is held or no cache is running; never hands a NULL handle to Cache::Value()
+    const Object * get() const
+    {return((NULL!=m_Ptr && PropertyCache::Valid())
+            ? (const Object *)PropertyCache::GetCache().Value(m_Ptr)
+            : NULL);};
+
+    Object * operator->() {return(get());};
+    const Object * operator->() const {return(get());};
+
+    Object & operator*() {return(*get());};
+    const Object & operator*() const {return(*get());};
+
+    bool Lookup(const Slice & Key)
+    {
+        Release();
+        m_Ptr=PropertyCache::Lookup(Key);
+        return(NULL!=m_Ptr);
+    };
+
+    bool Insert(const Slice & Key, Object * Value)
+    {
+        bool ret_flag(false);
+        Release();
+        ret_flag=PropertyCache::Insert(Key, (void *)Value, &m_Ptr);
+        return(ret_flag);
+    };
+
+    // unprotected if GetCache is NULL
+    void Erase(const Slice & Key)
+    {
+        Release();
+        if (PropertyCache::Valid())
+            PropertyCache::GetCache().Erase(Key);
+        return;
+    };
+
+protected:
+
+private:
+    CachePtr(const CachePtr &);
+    CachePtr & operator=(const CachePtr &);
+
+};  // template CachePtr
+
+
+typedef CachePtr<ExpiryModuleOS> ExpiryPropPtr_t;
+
+
+}  // namespace leveldb
+
+#endif  // ifndef
diff --git a/src/leveldb/util/random.h b/src/leveldb/util/random.h
index ddd51b1c7..07538242e 100644
--- a/src/leveldb/util/random.h
+++ b/src/leveldb/util/random.h
@@ -16,12 +16,7 @@ class Random {
  private:
   uint32_t seed_;
  public:
-  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) {
-    // Avoid bad seeds.
-    if (seed_ == 0 || seed_ == 2147483647L) {
-      seed_ = 1;
-    }
-  }
+  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { }
   uint32_t Next() {
     static const uint32_t M = 2147483647L;   // 2^31-1
     static const uint64_t A = 16807;  // bits 14, 8, 7, 5, 2, 1, 0
diff --git a/src/leveldb/util/refobject_base.h b/src/leveldb/util/refobject_base.h
new file mode 100644
index 000000000..782e860b4
--- /dev/null
+++ b/src/leveldb/util/refobject_base.h
@@ -0,0 +1,192 @@
+// -------------------------------------------------------------------
+//
+// refobject_base.h
+//
+// Copyright (c) 2015 Basho Technologies, Inc. All Rights Reserved. 
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+// Base class for reference-counted types; refactored from
+// eleveldb/c_src/refobjects.h and leveldb/util/thread_tasks.h
+// -------------------------------------------------------------------
+
+#ifndef LEVELDB_INCLUDE_REFOBJECT_BASE_H_
+#define LEVELDB_INCLUDE_REFOBJECT_BASE_H_
+
+#include <stdint.h>
+
+#include "port/port.h"
+#include "leveldb/atomics.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+/**
+ * Base class for reference-counted types
+ *
+ * A user of a reference-counted object makes the reference explicit by
+ * calling the RefInc() method, which increments the internal reference
+ * counter in a thread safe manner. When the user of the object is done
+ * with the object, it releases the reference by calling the RefDec()
+ * method, which decrements the internal counter in a thread safe manner.
+ * When the reference counter reaches 0, the RefDec() method deletes
+ * the current object by executing a "delete this" statement.
+ *
+ * Note that because RefDec() executes "delete this" when the reference
+ * count reaches 0, the reference-counted object must be allocated on the
+ * heap. 
+ */ +class RefObjectBase +{ + // force this private so everyone is using memory fenced GetRefCount + private: + volatile uint32_t m_RefCount; + + public: + RefObjectBase() : m_RefCount(0) {} + virtual ~RefObjectBase() {} + + virtual uint32_t RefInc() {return(inc_and_fetch(&m_RefCount));} + + virtual uint32_t RefDec() + { + uint32_t current_refs; + + current_refs=dec_and_fetch(&m_RefCount); + if (0==current_refs) { + delete this; + } + + return(current_refs); + } // RefDec + + // some derived objects might need other cleanup before delete (see ErlRefObject) + virtual uint32_t RefDecNoDelete() {return(dec_and_fetch(&m_RefCount));}; + + // establish memory fence via atomic operation call + virtual uint32_t GetRefCount() {return(add_and_fetch(&m_RefCount, (uint32_t)0));}; + + private: + // hide the copy ctor and assignment operator (not implemented) + RefObjectBase(const RefObjectBase&); + RefObjectBase& operator=(const RefObjectBase&); +}; + + +template class RefPtr +{ + /**************************************************************** + * Member objects + ****************************************************************/ +public: + +protected: + port::Spin m_Spin; + Object * m_Ptr; // NULL or object being reference counted + +private: + + /**************************************************************** + * Member functions + ****************************************************************/ +public: + RefPtr() : m_Ptr(NULL) {}; + + virtual ~RefPtr() {RefDecrement();}; + + RefPtr(const RefPtr & rhs) : m_Ptr(NULL) {reset(rhs.m_Ptr);}; + RefPtr(Object * Ptr) : m_Ptr(NULL) {reset(Ptr);}; + RefPtr(Object & Obj) : m_Ptr(NULL) {reset(&Obj);}; + +// RefPtr & operator=(const Object & rhs) {reset(rhs.m_Ptr); return(*this);}; + RefPtr & operator=(Object & rhs) {reset(&rhs); return(*this);}; + RefPtr & operator=(Object * Ptr) {reset(Ptr); return(*this);}; + RefPtr & operator=(RefPtr & RPtr) {reset(RPtr.m_Ptr); return(*this);}; + RefPtr & operator=(const RefPtr & RPtr) 
{reset(RPtr.m_Ptr); return(*this);}; + + bool operator==(const Object & Obj) const {return(m_Ptr==&Obj);}; + bool operator!=(const Object & Obj) const {return(m_Ptr!=&Obj);}; + operator void*() {return(m_Ptr);}; + + // stl like functions + void assign(Object * Ptr) {reset(Ptr);}; + + void reset(Object * ObjectPtr=NULL) + { + SpinLock l(&m_Spin); + Object * old_ptr; + + // increment new before decrement old in case + // there are any side effects / contained / circular objects + old_ptr=m_Ptr; + m_Ptr=ObjectPtr; + + if (NULL!=m_Ptr) + { + RefIncrement(); + } // if + // swap back for the moment + if (NULL!=old_ptr) + { + m_Ptr=old_ptr; + RefDecrement(); + } // if + + // final pointer + m_Ptr=ObjectPtr; + } + + Object * get() {return(m_Ptr);}; + + const Object * get() const {return(m_Ptr);}; + + Object * operator->() {return(m_Ptr);}; + const Object * operator->() const {return(m_Ptr);}; + + Object & operator*() {return(*get());}; + const Object & operator*() const {return(*get());}; + + bool operator<(const RefPtr & rhs) const + {return(*get()<*rhs.get());}; + +protected: + // reduce reference count, delete if 0 + void RefDecrement() + { + if (NULL!=m_Ptr) + { + m_Ptr->RefDec(); + m_Ptr=NULL; + } // if + }; + + void RefIncrement() + { + if (NULL!=m_Ptr) + m_Ptr->RefInc(); + }; + +private: + + +}; // template RefPtr + + +} // namespace leveldb + +#endif // LEVELDB_INCLUDE_REFOBJECT_BASE_H_ diff --git a/src/leveldb/util/testharness.cc b/src/leveldb/util/testharness.cc index 402fab34d..be8ebfd7d 100644 --- a/src/leveldb/util/testharness.cc +++ b/src/leveldb/util/testharness.cc @@ -38,7 +38,7 @@ int RunAllTests() { int num = 0; if (tests != NULL) { - for (size_t i = 0; i < tests->size(); i++) { + for (int i = 0; i < tests->size(); i++) { const Test& t = (*tests)[i]; if (matcher != NULL) { std::string name = t.base; @@ -54,6 +54,11 @@ int RunAllTests() { } } fprintf(stderr, "==== PASSED %d tests\n", num); + + // cleanup memory for valgrind + leveldb::Env::Shutdown(); + 
delete tests; + return 0; } diff --git a/src/leveldb/util/testharness.h b/src/leveldb/util/testharness.h index da4fe68bb..70ae51158 100644 --- a/src/leveldb/util/testharness.h +++ b/src/leveldb/util/testharness.h @@ -74,6 +74,14 @@ class Tester { return *this; } + Tester& IsNotOk(const Status& s) { + if (s.ok()) { + ss_ << "Test needed to fail."; + ok_ = false; + } + return *this; + } + #define BINARY_OP(name,op) \ template \ Tester& name(const X& x, const Y& y) { \ @@ -103,7 +111,9 @@ class Tester { }; #define ASSERT_TRUE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is((c), #c) +#define ASSERT_FALSE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is(!(c), #c) #define ASSERT_OK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsOk((s)) +#define ASSERT_NOTOK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsNotOk((s)) #define ASSERT_EQ(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsEq((a),(b)) #define ASSERT_NE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsNe((a),(b)) #define ASSERT_GE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGe((a),(b)) diff --git a/src/leveldb/util/testutil.cc b/src/leveldb/util/testutil.cc index bee56bf75..538d09516 100644 --- a/src/leveldb/util/testutil.cc +++ b/src/leveldb/util/testutil.cc @@ -32,7 +32,7 @@ std::string RandomKey(Random* rnd, int len) { extern Slice CompressibleString(Random* rnd, double compressed_fraction, - size_t len, std::string* dst) { + int len, std::string* dst) { int raw = static_cast(len * compressed_fraction); if (raw < 1) raw = 1; std::string raw_data; diff --git a/src/leveldb/util/testutil.h b/src/leveldb/util/testutil.h index d7e458370..e84323b71 100644 --- a/src/leveldb/util/testutil.h +++ b/src/leveldb/util/testutil.h @@ -24,7 +24,7 @@ extern std::string RandomKey(Random* rnd, int len); // "N*compressed_fraction" bytes and return a Slice that references // the generated data. 
extern Slice CompressibleString(Random* rnd, double compressed_fraction, - size_t len, std::string* dst); + int len, std::string* dst); // A wrapper that allows injection of errors. class ErrorEnv : public EnvWrapper { @@ -37,23 +37,13 @@ class ErrorEnv : public EnvWrapper { num_writable_file_errors_(0) { } virtual Status NewWritableFile(const std::string& fname, - WritableFile** result) { + WritableFile** result, size_t map_size) { if (writable_file_error_) { ++num_writable_file_errors_; *result = NULL; return Status::IOError(fname, "fake error"); } - return target()->NewWritableFile(fname, result); - } - - virtual Status NewAppendableFile(const std::string& fname, - WritableFile** result) { - if (writable_file_error_) { - ++num_writable_file_errors_; - *result = NULL; - return Status::IOError(fname, "fake error"); - } - return target()->NewAppendableFile(fname, result); + return target()->NewWritableFile(fname, result, map_size); } }; diff --git a/src/leveldb/util/thread_tasks.cc b/src/leveldb/util/thread_tasks.cc new file mode 100644 index 000000000..d17813246 --- /dev/null +++ b/src/leveldb/util/thread_tasks.cc @@ -0,0 +1,63 @@ +// ------------------------------------------------------------------- +// +// thread_tasks.cc +// +// Copyright (c) 2015 Basho Technologies, Inc. All Rights Reserved. +// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+//
+// -------------------------------------------------------------------
+
+#include "util/db_list.h"
+#include "util/hot_threads.h"
+#include "util/thread_tasks.h"
+
+namespace leveldb {
+
+void
+CompactionTask::operator()()
+{
+    m_DBImpl->BackgroundCall2(m_Compaction);
+    m_Compaction=NULL;   // makes the delete in ~CompactionTask a no-op; presumably BackgroundCall2 took ownership - TODO confirm
+
+    // look for grooming compactions in other databases.
+    //  MUST submit to different pool, or will seldom work.
+    if (0==gCompactionThreads->m_WorkQueueAtomic)
+    {
+        ThreadTask * task=new GroomingPollTask;
+
+        // this sequence could be a race condition, and that is ok.
+        //  Race is when this thread is the grooming thread and
+        //  it deschedules for the entire time of the GroomingPollTask's
+        //  scan.  oh well.  not critical.
+        gWriteThreads->Submit(task, true);
+    }   // if
+}   // CompactionTask::operator()()
+
+
+void
+GroomingPollTask::operator()()
+{
+    // if the compaction pool has no current backlog ... see if
+    //  databases have grooming opportunity waiting:
+    //  ScanDBs(false, ...) first, then ScanDBs(true, ...) if the queue still reads empty
+    if (0==gCompactionThreads->m_WorkQueueAtomic)
+        DBList()->ScanDBs(false, &DBImpl::CheckAvailableCompactions);
+    if (0==gCompactionThreads->m_WorkQueueAtomic)
+        DBList()->ScanDBs(true, &DBImpl::CheckAvailableCompactions);
+
+}   // GroomingPollTask::operator()
+
+}  // namespace leveldb
diff --git a/src/leveldb/util/thread_tasks.h b/src/leveldb/util/thread_tasks.h
new file mode 100644
index 000000000..1971a4b36
--- /dev/null
+++ b/src/leveldb/util/thread_tasks.h
@@ -0,0 +1,185 @@
+// -------------------------------------------------------------------
+//
+// thread_tasks.h
+//
+// Copyright (c) 2011-2015 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. 
You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+// Modeled after eleveldb's workitems.h/.cc
+// -------------------------------------------------------------------
+
+
+#ifndef STORAGE_LEVELDB_INCLUDE_THREAD_TASKS_H_
+#define STORAGE_LEVELDB_INCLUDE_THREAD_TASKS_H_
+
+#include <stdint.h>
+
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "leveldb/atomics.h"
+#include "refobject_base.h"
+
+namespace leveldb {
+
+
+/**
+ * Virtual base class for leveldb background tasks
+ */
+class ThreadTask : public RefObjectBase
+{
+ protected:
+    bool m_ResubmitWork;          //!< true if this work item is loaded for prefetch
+
+ public:
+    uint64_t m_QueueStart;        //!< NowMicros() time placed on work queue
+
+ public:
+    ThreadTask() : m_ResubmitWork(false), m_QueueStart(0) {}
+
+    virtual ~ThreadTask() {}
+
+    // this is the derived object's task routine
+    virtual void operator()() = 0;
+
+    // methods used by the thread pool to potentially reuse this task object
+    bool resubmit() const {return(m_ResubmitWork);}
+    virtual void recycle() {}
+
+ private:
+    ThreadTask(const ThreadTask &);
+    ThreadTask & operator=(const ThreadTask &);
+
+};  // class ThreadTask
+
+
+/**
+ * Background write of imm buffer to Level-0 file
+ */
+
+class ImmWriteTask : public ThreadTask
+{
+protected:
+    DBImpl * m_DBImpl;
+
+public:
+    explicit ImmWriteTask(DBImpl * Db)
+        : m_DBImpl(Db) {};
+
+    virtual ~ImmWriteTask() {};
+
+    virtual void operator()() {m_DBImpl->BackgroundImmCompactCall();};
+
+private:
+    ImmWriteTask();
+    ImmWriteTask(const ImmWriteTask &);
+    ImmWriteTask & operator=(const ImmWriteTask &);
+
+};  // class ImmWriteTask
+
+
+/**
+ * Background compaction
+ */
+
+class CompactionTask : public ThreadTask
+{
+protected:
+    DBImpl * m_DBImpl;
+    Compaction * m_Compaction;
+
+public:
+    CompactionTask(DBImpl * Db, Compaction * Compact)
+        : m_DBImpl(Db), m_Compaction(Compact) {};
+
+    virtual ~CompactionTask() {delete m_Compaction;};   // deletes the Compaction if still held (a no-op once the pointer is NULL)
+
+    virtual void operator()();
+
+private:
+    CompactionTask();
+    CompactionTask(const CompactionTask &);
+    CompactionTask & operator=(const CompactionTask &);
+
+};  // class CompactionTask
+
+
+/**
+ * Poll all databases for grooming opportunities
+ */
+
+class GroomingPollTask : public ThreadTask
+{
+protected:
+
+public:
+    GroomingPollTask() {};
+
+    virtual ~GroomingPollTask() {};
+
+    virtual void operator()();
+
+private:
+    GroomingPollTask(const GroomingPollTask &);
+    GroomingPollTask & operator=(const GroomingPollTask &);
+
+};  // class GroomingPollTask
+
+
+/**
+ * Original env_posix.cc task
+ */
+
+class LegacyTask : public ThreadTask
+{
+protected:
+    void (*m_Function)(void*);
+    void * m_Arg;
+
+public:
+    LegacyTask(void (*Function)(void*), void * Arg)
+        : m_Function(Function), m_Arg(Arg) {};
+
+    virtual ~LegacyTask() {};
+
+    virtual void operator()()
+    {
+        (*m_Function)(m_Arg);
+    };
+
+private:
+    LegacyTask();
+    LegacyTask(const LegacyTask &);
+    LegacyTask & operator=(const LegacyTask &);
+
+};  // class LegacyTask
+
+
+/**
+ * Riak Enterprise Edition's hot backup entry point
+ *
+ * Called every 60 seconds to test for external hot backup trigger
+ *  (initiates backup if trigger seen)
+ */
+
+void CheckHotBackupTrigger();
+
+} // namespace leveldb
+
+
+#endif  // STORAGE_LEVELDB_INCLUDE_THREAD_TASKS_H_
diff --git a/src/leveldb/util/throttle.cc b/src/leveldb/util/throttle.cc
new file mode 100644
index 000000000..25fd53199
--- /dev/null
+++ b/src/leveldb/util/throttle.cc
@@ -0,0 +1,392 @@
+// 
-------------------------------------------------------------------
+//
+// throttle.cc
+//
+// Copyright (c) 2011-2017 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain
+// a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "leveldb/perf_count.h"
+#include "leveldb/env.h"
+
+#include "db/db_impl.h"
+#include "util/cache2.h"
+#include "util/db_list.h"
+#include "util/flexcache.h"
+#include "util/hot_threads.h"
+#include "util/thread_tasks.h"
+#include "util/throttle.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
+namespace leveldb {
+
+// mutex and condition variable objects for use in the code below
+port::Mutex* gThrottleMutex=NULL;
+port::CondVar* gThrottleCond=NULL;
+
+// current time, on roughly a 60 second scale
+// (used to reduce number of OS calls for expiry)
+uint64_t gCurrentTime=0;
+
+#define THROTTLE_SECONDS 60
+#define THROTTLE_TIME (THROTTLE_SECONDS*1000000)
+#define THROTTLE_INTERVALS 63
+// following is a heuristic value, determined by trial and error.
+// its job is to slow down the rate of change in the current throttle.
+// do not want sudden changes in one or two intervals to swing
+// the throttle value wildly. Goal is a nice, even throttle value. 
+#define THROTTLE_SCALING 17 + +struct ThrottleData_t +{ + uint64_t m_Micros; + uint64_t m_Keys; + uint64_t m_Backlog; + uint64_t m_Compactions; +}; + +// this array stores compaction statistics used in throttle calculation. +// Index 0 of this array accumulates the current minute's compaction data for level 0. +// Index 1 accumulates accumulates current minute's compaction +// statistics for all other levels. Remaining intervals contain +// most recent interval statistics for last hour. +ThrottleData_t gThrottleData[THROTTLE_INTERVALS]; + +uint64_t gThrottleRate, gUnadjustedThrottleRate; + +static volatile bool gThrottleRunning=false; +static pthread_t gThrottleThreadId; + +static void * ThrottleThread(void * arg); + + +void +ThrottleInit() +{ + gThrottleMutex = new port::Mutex; + gThrottleCond = new port::CondVar(gThrottleMutex); + + memset(&gThrottleData, 0, sizeof(gThrottleData)); + gThrottleRate=0; + gUnadjustedThrottleRate=0; + + // addresses race condition during fast start/stop + { + MutexLock lock(gThrottleMutex); + + pthread_create(&gThrottleThreadId, NULL, &ThrottleThread, NULL); + + while(!gThrottleRunning) + gThrottleCond->Wait(); + } // mutex + + return; + +} // ThrottleInit + + +static void * +ThrottleThread( + void * /*arg*/) +{ + uint64_t tot_micros, tot_keys, tot_backlog, tot_compact; + int replace_idx, loop, ret_val; + uint64_t new_throttle, new_unadjusted; + time_t now_seconds, cache_expire; + struct timespec wait_time; + + replace_idx=2; + now_seconds=0; + cache_expire=0; + new_unadjusted=1; + + // addresses race condition during fast start/stop + { + MutexLock lock(gThrottleMutex); + gThrottleRunning=true; + gThrottleCond->Signal(); + } // mutex + + while(gThrottleRunning) + { + // update our global clock, not intended to be a precise + // 60 second interval. + gCurrentTime=port::TimeMicros(); + + // + // This is code polls for existance of /etc/riak/perf_counters and sets + // the global gPerfCountersDisabled accordingly. 
+ // Sure, there should be a better place for this code. But fits here nicely today. + // + ret_val=access("/etc/riak/perf_counters", F_OK); + gPerfCountersDisabled=(-1==ret_val); + + // + // start actual throttle work + // + { + // lock gThrottleMutex while we update gThrottleData and + // wait on gThrottleCond + MutexLock lock(gThrottleMutex); + + // sleep 1 minute +#if _POSIX_TIMERS >= 200801L + clock_gettime(CLOCK_REALTIME, &wait_time); +#else + struct timeval tv; + gettimeofday(&tv, NULL); + wait_time.tv_sec=tv.tv_sec; + wait_time.tv_nsec=tv.tv_usec*1000; +#endif + + now_seconds=wait_time.tv_sec; + wait_time.tv_sec+=THROTTLE_SECONDS; + if (gThrottleRunning) { // test in case of race at shutdown + gThrottleCond->Wait(&wait_time); + } + gThrottleData[replace_idx]=gThrottleData[1]; + gThrottleData[replace_idx].m_Backlog=0; + memset(&gThrottleData[1], 0, sizeof(gThrottleData[1])); + } // unlock gThrottleMutex + + tot_micros=0; + tot_keys=0; + tot_backlog=0; + tot_compact=0; + + // this could be faster by keeping running totals and + // subtracting [replace_idx] before copying [0] into it, + // then adding new [replace_idx]. But that needs more + // time for testing. + for (loop=2; loopm_WorkQueueAtomic; + gPerfCounters->Add(ePerfThrottleBacklog1, gThrottleData[replace_idx].m_Backlog); + + gThrottleData[0].m_Backlog=gLevel0Threads->m_WorkQueueAtomic; + gPerfCounters->Add(ePerfThrottleBacklog0, gThrottleData[0].m_Backlog); + + // non-level0 data available? 
+ if (0!=tot_keys) + { + if (0==tot_compact) + tot_compact=1; + + // average write time for level 1+ compactions per key + // times the average number of tasks waiting + // ( the *100 stuff is to exploit fractional data in integers ) + new_throttle=((tot_micros*100) / tot_keys) + * ((tot_backlog*100) / tot_compact); + + new_throttle /= 10000; // remove *100 stuff + //new_throttle /= gCompactionThreads->m_Threads.size(); // number of general compaction threads + + if (0==new_throttle) + new_throttle=1; // throttle must have an effect + + new_unadjusted=(tot_micros*100) / tot_keys; + new_unadjusted /= 100; + if (0==new_unadjusted) + new_unadjusted=1; + } // if + + // attempt to most recent level0 + // (only use most recent level0 until level1+ data becomes available, + // useful on restart of heavily loaded server) + else if (0!=gThrottleData[0].m_Keys && 0!=gThrottleData[0].m_Compactions) + { + new_throttle=(gThrottleData[0].m_Micros / gThrottleData[0].m_Keys) + * (gThrottleData[0].m_Backlog / gThrottleData[0].m_Compactions); + + new_unadjusted=(gThrottleData[0].m_Micros / gThrottleData[0].m_Keys); + if (0==new_unadjusted) + new_unadjusted=1; + } // else if + else + { + new_throttle=1; + } // else + + // change the throttle slowly + // (+1 & +2 keep throttle moving toward goal when difference new and + // old is less than THROTTLE_SCALING) + int temp_rate; + + temp_rate=gThrottleRate; + if (temp_rate < new_throttle) + temp_rate+=(new_throttle - temp_rate)/THROTTLE_SCALING +1; + else + temp_rate-=(temp_rate - new_throttle)/THROTTLE_SCALING +2; + + // +2 can make this go negative + if (temp_rate<1) + temp_rate=1; // throttle must always have an effect + + gThrottleRate=temp_rate; + gUnadjustedThrottleRate=new_unadjusted; + + // Log(NULL, "ThrottleRate %" PRIu64 ", UnadjustedThrottleRate %" PRIu64, gThrottleRate, gUnadjustedThrottleRate); + + gPerfCounters->Set(ePerfThrottleGauge, gThrottleRate); + gPerfCounters->Add(ePerfThrottleCounter, 
+                               gThrottleRate*THROTTLE_SECONDS);
+            gPerfCounters->Set(ePerfThrottleUnadjusted, gUnadjustedThrottleRate);
+
+            // prepare for next interval
+            memset(&gThrottleData[0], 0, sizeof(gThrottleData[0]));
+        } // unlock gThrottleMutex
+
+        ++replace_idx;
+        if (THROTTLE_INTERVALS==replace_idx)
+            replace_idx=2;
+
+        //
+        // This is code to manage / flush the flexcache's old file cache entries.
+        // Sure, there should be a better place for this code. But fits here nicely today.
+        //
+        if (cache_expire < now_seconds)
+        {
+            cache_expire = now_seconds + 60*60; // hard coded to one hour for now
+            DBList()->ScanDBs(true, &DBImpl::PurgeExpiredFileCache);
+            DBList()->ScanDBs(false, &DBImpl::PurgeExpiredFileCache);
+        } // if
+
+        //
+        // This is a second non-throttle task added to this one minute loop. Pattern forming.
+        // See if hot backup wants to initiate.
+        //
+        CheckHotBackupTrigger();
+
+        // nudge compaction logic of potential grooming
+        if (0==gCompactionThreads->m_WorkQueueAtomic) // user databases
+            DBList()->ScanDBs(false, &DBImpl::CheckAvailableCompactions);
+        if (0==gCompactionThreads->m_WorkQueueAtomic) // internal databases
+            DBList()->ScanDBs(true, &DBImpl::CheckAvailableCompactions);
+
+    } // while
+
+    return(NULL);
+
+} // ThrottleThread
+
+// Record one finished compaction: add Micros/Keys to gThrottleData[0] when IsLevel0, else to gThrottleData[1].
+void SetThrottleWriteRate(uint64_t Micros, uint64_t Keys, bool IsLevel0)
+{
+    if (IsLevel0)
+    {
+        // lock gThrottleMutex while we update gThrottleData
+        {
+            MutexLock lock(gThrottleMutex);
+
+            gThrottleData[0].m_Micros+=Micros;
+            gThrottleData[0].m_Keys+=Keys;
+            gThrottleData[0].m_Backlog=0;       // assigned, not accumulated
+            gThrottleData[0].m_Compactions+=1;  // each call counts as one compaction
+        } // unlock gThrottleMutex
+
+        // perf counters are updated after the mutex scope has closed
+        gPerfCounters->Add(ePerfThrottleMicros0, Micros);
+        gPerfCounters->Add(ePerfThrottleKeys0, Keys);
+        gPerfCounters->Inc(ePerfThrottleCompacts0);
+    } // if
+
+    else
+    {
+        // lock gThrottleMutex while we update gThrottleData
+        {
+            MutexLock lock(gThrottleMutex);
+
+            gThrottleData[1].m_Micros+=Micros;
+            gThrottleData[1].m_Keys+=Keys;
+            gThrottleData[1].m_Backlog=0;       // assigned, not accumulated
+            gThrottleData[1].m_Compactions+=1;  // each call counts as one compaction
+        } // unlock gThrottleMutex
+
+        // perf counters are updated after the mutex scope has closed
+        gPerfCounters->Add(ePerfThrottleMicros1, Micros);
+        gPerfCounters->Add(ePerfThrottleKeys1, Keys);
+        gPerfCounters->Inc(ePerfThrottleCompacts1);
+    } // else
+
+    return;
+};
+
+uint64_t GetThrottleWriteRate() {return(gThrottleRate);};                      // plain read of gThrottleRate, no lock taken
+uint64_t GetUnadjustedThrottleWriteRate() {return(gUnadjustedThrottleRate);};  // plain read of gUnadjustedThrottleRate, no lock taken
+
+// clock_gettime but only updated once every 60 seconds (roughly)
+uint64_t GetCachedTimeMicros() {return(gCurrentTime);};
+void SetCachedTimeMicros(uint64_t Time) {gCurrentTime=Time;};                  // overwrites the cached time with Time
+/**
+ * ThrottleStopThreads() is step one of the two step shutdown: it clears
+ * gThrottleRunning, signals gThrottleCond and joins the throttle thread,
+ * ending the one minute loop that can also initiate compaction actions.
+ * Background compaction threads should stop before ThrottleClose() runs.
+ */
+void ThrottleStopThreads()
+{
+    if (gThrottleRunning)
+    {
+        gThrottleRunning=false;  // NOTE(review): cleared outside gThrottleMutex -- confirm ThrottleThread re-checks it under the lock, else the Signal below can be missed
+
+        // lock gThrottleMutex so that we can signal gThrottleCond
+        {
+            MutexLock lock(gThrottleMutex);
+            gThrottleCond->Signal();
+        } // unlock gThrottleMutex
+
+        pthread_join(gThrottleThreadId, NULL);  // blocks until the throttle thread has exited
+    } // if
+
+    return;
+
+} // ThrottleStopThreads
+
+/**
+ * ThrottleClose() is step two of the two step shutdown. It stops the
+ * throttle thread if it is still running, then deletes gThrottleCond and
+ * gThrottleMutex. Background compaction threads should already be stopped.
+ */
+void ThrottleClose()
+{
+    // safety check
+    if (gThrottleRunning)
+        ThrottleStopThreads();
+
+    delete gThrottleCond;
+    gThrottleCond = NULL;   // pointer reset after delete
+
+    delete gThrottleMutex;
+    gThrottleMutex = NULL;  // pointer reset after delete
+
+    return;
+} // ThrottleClose
+
+} // namespace leveldb
diff --git a/src/leveldb/util/throttle.h b/src/leveldb/util/throttle.h
new file mode 100644
index 000000000..2a06fd6a9
--- /dev/null
+++ b/src/leveldb/util/throttle.h
@@ -0,0 +1,47 @@
+// -------------------------------------------------------------------
+//
+// throttle.h
+//
+// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved.
+// +// This file is provided to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain +// a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// ------------------------------------------------------------------- + +#include + + +namespace leveldb +{ + +void ThrottleInit(); + +void SetThrottleWriteRate(uint64_t Micros, uint64_t Keys, bool IsLevel0); + +uint64_t GetThrottleWriteRate(); +uint64_t GetUnadjustedThrottleWriteRate(); + +// clock_gettime but only updated once every 60 seconds (roughly) +// (SetCachedTimeMicros() intended for unit tests) +uint64_t GetCachedTimeMicros(); +void SetCachedTimeMicros(uint64_t); + +// step 1 in two step shutdown +void ThrottleStopThreads(); + +// step 2 in two step shutdown +void ThrottleClose(); + +} // namespace leveldb