Compare commits

...

4 commits

Author SHA1 Message Date
Brannon King
1069eb65b5 switched to use basho LevelDB
This is much more efficient in disk writes.
more careful incorporation of basho


made it build out of the box (but not well)
2019-08-20 09:50:49 -06:00
Brannon King
83319b7f31 optimized a little 2019-08-02 17:27:28 -06:00
Brannon King
6b8935718e first pass at not loading full claimtrie into RAM
tweaks
2019-08-01 18:35:58 -06:00
Brannon King
05d89e91cf fix unit test crash on OSX
pulled in some fixes from v18
2019-08-01 15:44:05 -06:00
209 changed files with 23549 additions and 8105 deletions

View file

@ -475,7 +475,6 @@ lbrycrdd_LDADD = \
$(LIBBITCOIN_CONSENSUS) \ $(LIBBITCOIN_CONSENSUS) \
$(LIBBITCOIN_CRYPTO) \ $(LIBBITCOIN_CRYPTO) \
$(LIBLEVELDB) \ $(LIBLEVELDB) \
$(LIBLEVELDB_SSE42) \
$(LIBMEMENV) \ $(LIBMEMENV) \
$(LIBSECP256K1) $(LIBSECP256K1)
@ -573,7 +572,7 @@ $(top_srcdir)/$(subdir)/config/bitcoin-config.h.in: $(am__configure_deps)
clean-local: clean-local:
-$(MAKE) -C secp256k1 clean -$(MAKE) -C secp256k1 clean
-$(MAKE) -C univalue clean -$(MAKE) -C univalue clean
-rm -f leveldb/*/*.gcda leveldb/*/*.gcno leveldb/helpers/memenv/*.gcda leveldb/helpers/memenv/*.gcno -$(MAKE) -C leveldb clean
-rm -f config.h -rm -f config.h
-rm -rf test/__pycache__ -rm -rf test/__pycache__

View file

@ -42,7 +42,6 @@ bench_bench_bitcoin_LDADD = \
$(LIBBITCOIN_CONSENSUS) \ $(LIBBITCOIN_CONSENSUS) \
$(LIBBITCOIN_CRYPTO) \ $(LIBBITCOIN_CRYPTO) \
$(LIBLEVELDB) \ $(LIBLEVELDB) \
$(LIBLEVELDB_SSE42) \
$(LIBMEMENV) \ $(LIBMEMENV) \
$(LIBSECP256K1) \ $(LIBSECP256K1) \
$(LIBUNIVALUE) $(LIBUNIVALUE)

View file

@ -2,148 +2,23 @@
# Distributed under the MIT software license, see the accompanying # Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php. # file COPYING or http://www.opensource.org/licenses/mit-license.php.
SUBDIRS = leveldb
LIBLEVELDB_INT = leveldb/libleveldb.a LIBLEVELDB_INT = leveldb/libleveldb.a
LIBMEMENV_INT = leveldb/libmemenv.a LIBMEMENV_INT = leveldb/libmemenv.a
LIBLEVELDB_SSE42_INT = leveldb/libleveldb_sse42.a
EXTRA_LIBRARIES += $(LIBLEVELDB_INT) EXTRA_LIBRARIES += $(LIBLEVELDB_INT)
EXTRA_LIBRARIES += $(LIBMEMENV_INT) EXTRA_LIBRARIES += $(LIBMEMENV_INT)
EXTRA_LIBRARIES += $(LIBLEVELDB_SSE42_INT)
LIBLEVELDB += $(LIBLEVELDB_INT) LIBLEVELDB += $(LIBLEVELDB_INT)
LIBMEMENV += $(LIBMEMENV_INT) LIBMEMENV += $(LIBMEMENV_INT)
LIBLEVELDB_SSE42 = $(LIBLEVELDB_SSE42_INT)
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/include LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/include
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/helpers/memenv LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/helpers/memenv
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb
LEVELDB_CPPFLAGS_INT = leveldb/libleveldb.a:
LEVELDB_CPPFLAGS_INT += -I$(srcdir)/leveldb $(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb
LEVELDB_CPPFLAGS_INT += $(LEVELDB_TARGET_FLAGS)
LEVELDB_CPPFLAGS_INT += -DLEVELDB_ATOMIC_PRESENT
LEVELDB_CPPFLAGS_INT += -D__STDC_LIMIT_MACROS
if TARGET_WINDOWS leveldb/libmemenv.a: leveldb/libleveldb.a
LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1 $(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb memenv_test
else
LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_POSIX
endif
leveldb_libleveldb_a_CPPFLAGS = $(AM_CPPFLAGS) $(LEVELDB_CPPFLAGS_INT) $(LEVELDB_CPPFLAGS)
leveldb_libleveldb_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
leveldb_libleveldb_a_SOURCES=
leveldb_libleveldb_a_SOURCES += leveldb/port/atomic_pointer.h
leveldb_libleveldb_a_SOURCES += leveldb/port/port_example.h
leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.h
leveldb_libleveldb_a_SOURCES += leveldb/port/win/stdint.h
leveldb_libleveldb_a_SOURCES += leveldb/port/port.h
leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.h
leveldb_libleveldb_a_SOURCES += leveldb/port/thread_annotations.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/db.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/options.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/comparator.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/filter_policy.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/slice.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table_builder.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/env.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/c.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/iterator.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/cache.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/dumpfile.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/write_batch.h
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/status.h
leveldb_libleveldb_a_SOURCES += leveldb/db/log_format.h
leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.h
leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.h
leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch_internal.h
leveldb_libleveldb_a_SOURCES += leveldb/db/filename.h
leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.h
leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.h
leveldb_libleveldb_a_SOURCES += leveldb/db/builder.h
leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.h
leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.h
leveldb_libleveldb_a_SOURCES += leveldb/db/skiplist.h
leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.h
leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.h
leveldb_libleveldb_a_SOURCES += leveldb/db/snapshot.h
leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.h
leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.h
leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.h
leveldb_libleveldb_a_SOURCES += leveldb/table/block.h
leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.h
leveldb_libleveldb_a_SOURCES += leveldb/table/merger.h
leveldb_libleveldb_a_SOURCES += leveldb/table/format.h
leveldb_libleveldb_a_SOURCES += leveldb/table/iterator_wrapper.h
leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.h
leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix_test_helper.h
leveldb_libleveldb_a_SOURCES += leveldb/util/arena.h
leveldb_libleveldb_a_SOURCES += leveldb/util/random.h
leveldb_libleveldb_a_SOURCES += leveldb/util/posix_logger.h
leveldb_libleveldb_a_SOURCES += leveldb/util/hash.h
leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.h
leveldb_libleveldb_a_SOURCES += leveldb/util/coding.h
leveldb_libleveldb_a_SOURCES += leveldb/util/testutil.h
leveldb_libleveldb_a_SOURCES += leveldb/util/mutexlock.h
leveldb_libleveldb_a_SOURCES += leveldb/util/logging.h
leveldb_libleveldb_a_SOURCES += leveldb/util/testharness.h
leveldb_libleveldb_a_SOURCES += leveldb/db/builder.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/c.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/dumpfile.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/filename.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/repair.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.cc
leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/block.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/format.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/iterator.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/merger.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/table_builder.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/table.cc
leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/arena.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/bloom.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/cache.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/coding.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/comparator.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/env.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/filter_policy.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/hash.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/logging.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/options.cc
leveldb_libleveldb_a_SOURCES += leveldb/util/status.cc
if TARGET_WINDOWS
leveldb_libleveldb_a_SOURCES += leveldb/util/env_win.cc
leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.cc
else
leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.cc
endif
leveldb_libmemenv_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS)
leveldb_libmemenv_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS)
leveldb_libmemenv_a_SOURCES = leveldb/helpers/memenv/memenv.cc
leveldb_libmemenv_a_SOURCES += leveldb/helpers/memenv/memenv.h
leveldb_libleveldb_sse42_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS)
leveldb_libleveldb_sse42_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS)
if ENABLE_HWCRC32
leveldb_libleveldb_sse42_a_CPPFLAGS += -DLEVELDB_PLATFORM_POSIX_SSE
leveldb_libleveldb_sse42_a_CXXFLAGS += $(SSE42_CXXFLAGS)
endif
leveldb_libleveldb_sse42_a_SOURCES = leveldb/port/port_posix_sse.cc

View file

@ -408,7 +408,7 @@ endif
if ENABLE_ZMQ if ENABLE_ZMQ
qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS) qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
endif endif
qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \ qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBMEMENV) \
$(BOOST_LIBS) $(QT_LIBS) $(QT_DBUS_LIBS) $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \ $(BOOST_LIBS) $(QT_LIBS) $(QT_DBUS_LIBS) $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
qt_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS) qt_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)

View file

@ -63,7 +63,7 @@ if ENABLE_ZMQ
qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS) qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
endif endif
qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) \ qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) \
$(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \ $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
$(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \ $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
qt_test_test_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS) qt_test_test_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)

View file

@ -122,7 +122,7 @@ test_test_lbrycrd_LDADD += $(LIBBITCOIN_WALLET)
endif endif
test_test_lbrycrd_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \ test_test_lbrycrd_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \
$(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS) $(LIBLEVELDB) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
test_test_lbrycrd_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS) test_test_lbrycrd_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
test_test_lbrycrd_LDADD += $(LIBBITCOIN_CONSENSUS) $(BDB_LIBS) $(CRYPTO_LIBS) $(ICU_LIBS) $(MINIUPNPC_LIBS) test_test_lbrycrd_LDADD += $(LIBBITCOIN_CONSENSUS) $(BDB_LIBS) $(CRYPTO_LIBS) $(ICU_LIBS) $(MINIUPNPC_LIBS)

View file

@ -8,8 +8,6 @@
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <boost/scoped_ptr.hpp>
static const uint256 one = uint256S("0000000000000000000000000000000000000000000000000000000000000001"); static const uint256 one = uint256S("0000000000000000000000000000000000000000000000000000000000000001");
std::vector<unsigned char> heightToVch(int n) std::vector<unsigned char> heightToVch(int n)
@ -123,13 +121,13 @@ void CClaimTrieData::reorderClaims(const supportEntryType& supports)
claim.nEffectiveAmount += support.nAmount; claim.nEffectiveAmount += support.nAmount;
} }
std::make_heap(claims.begin(), claims.end()); std::sort(claims.rbegin(), claims.rend());
} }
CClaimTrie::CClaimTrie(bool fMemory, bool fWipe, int proportionalDelayFactor) CClaimTrie::CClaimTrie(bool fMemory, bool fWipe, int proportionalDelayFactor)
{ {
nProportionalDelayFactor = proportionalDelayFactor; nProportionalDelayFactor = proportionalDelayFactor;
db.reset(new CDBWrapper(GetDataDir() / "claimtrie", 100 * 1024 * 1024, fMemory, fWipe, false)); db.reset(new CDBWrapper(GetDataDir() / "claimtrie", 200 * 1024 * 1024, fMemory, fWipe, false));
} }
bool CClaimTrie::SyncToDisk() bool CClaimTrie::SyncToDisk()
@ -200,7 +198,7 @@ typename queueNameType::value_type* CClaimTrieCacheBase::getQueueCacheNameRow(co
template <> template <>
typename expirationQueueType::value_type* CClaimTrieCacheBase::getExpirationQueueCacheRow<CClaimValue>(int nHeight, bool createIfNotExists) typename expirationQueueType::value_type* CClaimTrieCacheBase::getExpirationQueueCacheRow<CClaimValue>(int nHeight, bool createIfNotExists)
{ {
return getQueue(*(base->db), EXP_QUEUE_ROW, nHeight, expirationQueueCache, createIfNotExists); return getQueue(*(base->db), CLAIM_EXP_QUEUE_ROW, nHeight, expirationQueueCache, createIfNotExists);
} }
template <> template <>
@ -218,8 +216,14 @@ typename expirationQueueType::value_type* CClaimTrieCacheBase::getExpirationQueu
bool CClaimTrieCacheBase::haveClaim(const std::string& name, const COutPoint& outPoint) const bool CClaimTrieCacheBase::haveClaim(const std::string& name, const COutPoint& outPoint) const
{ {
auto it = find(name); auto it = nodesToAddOrUpdate.find(name);
return it && it->haveClaim(outPoint); if (it && it->haveClaim(outPoint))
return true;
if (it || nodesToDelete.count(name))
return false;
CClaimTrieDataNode node;
node.childrenSerialization = false;
return base->find(name, node) && node.data.haveClaim(outPoint);
} }
bool CClaimTrieCacheBase::haveSupport(const std::string& name, const COutPoint& outPoint) const bool CClaimTrieCacheBase::haveSupport(const std::string& name, const COutPoint& outPoint) const
@ -272,39 +276,63 @@ bool CClaimTrieCacheBase::haveSupportInQueue(const std::string& name, const COut
return haveInQueue<CSupportValue>(name, outPoint, nValidAtHeight); return haveInQueue<CSupportValue>(name, outPoint, nValidAtHeight);
} }
std::size_t CClaimTrieCacheBase::getTotalNamesInTrie() const void CClaimTrie::recurseAllHashedNodes(const std::string& name, const CClaimTrieDataNode& current, std::function<void(const std::string&, const CClaimTrieDataNode&)> function) const {
function(name, current);
for (auto& child: current.children) {
CClaimTrieDataNode node;
if (find(child.second, node))
recurseAllHashedNodes(name + child.first, node, function);
}
}
std::size_t CClaimTrie::getTotalNamesInTrie() const
{ {
std::size_t count = 0; std::size_t count = 0;
for (auto it = base->cbegin(); it != base->cend(); ++it) CClaimTrieDataNode node;
if (!it->empty()) ++count; if (find("", node))
recurseAllHashedNodes("", node, [&count](const std::string&, const CClaimTrieDataNode& node) {
count += !node.data.empty();
});
return count; return count;
} }
std::size_t CClaimTrieCacheBase::getTotalClaimsInTrie() const std::size_t CClaimTrie::getTotalClaimsInTrie() const
{ {
std::size_t count = 0; std::size_t count = 0;
for (auto it = base->cbegin(); it != base->cend(); ++it) CClaimTrieDataNode node;
count += it->claims.size(); if (find("", node))
recurseAllHashedNodes("", node, [&count](const std::string&, const CClaimTrieDataNode& node) {
count += node.data.claims.size();
});
return count; return count;
} }
CAmount CClaimTrieCacheBase::getTotalValueOfClaimsInTrie(bool fControllingOnly) const CAmount CClaimTrie::getTotalValueOfClaimsInTrie(bool fControllingOnly) const
{ {
CAmount value_in_subtrie = 0; CAmount value_in_subtrie = 0;
for (auto it = base->cbegin(); it != base->cend(); ++it) { std::size_t count = 0;
for (const auto& claim : it->claims) { CClaimTrieDataNode node;
if (find("", node))
recurseAllHashedNodes("", node, [&value_in_subtrie, fControllingOnly](const std::string&, const CClaimTrieDataNode& node) {
for (const auto& claim : node.data.claims) {
value_in_subtrie += claim.nAmount; value_in_subtrie += claim.nAmount;
if (fControllingOnly) if (fControllingOnly)
break; break;
} }
} });
return value_in_subtrie; return value_in_subtrie;
} }
bool CClaimTrieCacheBase::getInfoForName(const std::string& name, CClaimValue& claim) const bool CClaimTrieCacheBase::getInfoForName(const std::string& name, CClaimValue& claim) const
{ {
auto it = find(name); auto it = nodesToAddOrUpdate.find(name);
return it && it->getBestClaim(claim); if (it && it->getBestClaim(claim))
return true;
if (it || nodesToDelete.count(name))
return false;
CClaimTrieDataNode node;
node.childrenSerialization = false;
return base->find(name, node) && node.data.getBestClaim(claim);
} }
CClaimsForNameType CClaimTrieCacheBase::getClaimsForName(const std::string& name) const CClaimsForNameType CClaimTrieCacheBase::getClaimsForName(const std::string& name) const
@ -313,10 +341,16 @@ CClaimsForNameType CClaimTrieCacheBase::getClaimsForName(const std::string& name
int nLastTakeoverHeight = 0; int nLastTakeoverHeight = 0;
auto supports = getSupportsForName(name); auto supports = getSupportsForName(name);
if (auto it = find(name)) { CClaimTrieDataNode node;
node.childrenSerialization = false;
if (auto it = nodesToAddOrUpdate.find(name)) {
claims = it->claims; claims = it->claims;
nLastTakeoverHeight = it->nHeightOfLastTakeover; nLastTakeoverHeight = it->nHeightOfLastTakeover;
} }
else if (!nodesToDelete.count(name) && base->find(name, node)) {
claims = node.data.claims;
nLastTakeoverHeight = node.data.nHeightOfLastTakeover;
}
return {std::move(claims), std::move(supports), nLastTakeoverHeight, name}; return {std::move(claims), std::move(supports), nLastTakeoverHeight, name};
} }
@ -381,60 +415,97 @@ uint256 recursiveMerkleHash(TIterator& it, const iCbType<TIterator>& process, co
return Hash(vchToHash.begin(), vchToHash.end()); return Hash(vchToHash.begin(), vchToHash.end());
} }
bool recursiveCheckConsistency(CClaimTrie::const_iterator& it, std::string& failed) bool CClaimTrie::checkConsistency(const uint256& rootHash) const
{ {
struct CRecursiveBreak : public std::exception {}; CClaimTrieDataNode node;
if (!find("", node) || node.data.hash != rootHash) {
using iterator = CClaimTrie::const_iterator; if (rootHash == one)
iCbType<iterator> verify = [&failed](iterator& it) {
if (!it.hasChildren()) {
// we don't allow a situation of no children and no claims; no empty leaf nodes allowed
failed = it.key();
throw CRecursiveBreak();
}
};
iCbType<iterator> process = [&failed, &process, &verify](iterator& it) {
if (it->hash != recursiveMerkleHash(it, process, verify)) {
failed = it.key();
throw CRecursiveBreak();
}
};
try {
process(it);
} catch (const CRecursiveBreak&) {
return false;
}
return true; return true;
return error("Mismatched root claim trie hashes. This may happen when there is not a clean process shutdown. Please run with -reindex.");
}
bool success = true;
recurseAllHashedNodes("", node, [&success, this](const std::string& name, const CClaimTrieDataNode& node) {
if (!success) return;
success &= contains(name);
std::vector<uint8_t> vchToHash;
const auto pos = name.size();
for (auto& child : node.children) {
auto key = name + child.first;
auto hash = child.second;
completeHash(hash, key, pos);
vchToHash.push_back(key[pos]);
vchToHash.insert(vchToHash.end(), hash.begin(), hash.end());
}
CClaimValue claim;
if (node.data.getBestClaim(claim)) {
uint256 valueHash = getValueHash(claim.outPoint, node.data.nHeightOfLastTakeover);
vchToHash.insert(vchToHash.end(), valueHash.begin(), valueHash.end());
} else {
success &= !node.children.empty(); // we disallow leaf nodes without claims
}
success &= node.data.hash == Hash(vchToHash.begin(), vchToHash.end());
});
return success;
} }
bool CClaimTrieCacheBase::checkConsistency() const std::vector<std::pair<std::string, CClaimTrieDataNode>> CClaimTrie::nodes(const std::string &key) const {
{ std::vector<std::pair<std::string, CClaimTrieDataNode>> ret;
if (base->empty()) CClaimTrieDataNode node;
return true;
auto it = base->cbegin(); if (!find("", node))
std::string failed; return ret;
auto consistent = recursiveCheckConsistency(it, failed); ret.emplace_back("", node);
if (!consistent) {
LogPrintf("\nPrinting base tree from its parent:\n"); std::string partialKey = key;
auto basePath = base->nodes(failed);
if (basePath.size() > 1) basePath.pop_back(); while (!node.children.empty()) {
dumpToLog(basePath.back(), false); // auto it = node.children.lower_bound(partialKey); // for using a std::map
auto cachePath = nodesToAddOrUpdate.nodes(failed); auto it = std::lower_bound(node.children.begin(), node.children.end(), std::make_pair(partialKey, uint256()));
if (!cachePath.empty()) { if (it != node.children.end() && it->first == partialKey) {
LogPrintf("\nPrinting %s's parent from cache:\n", failed); // we're completely done
if (cachePath.size() > 1) cachePath.pop_back(); if (find(it->second, node))
dumpToLog(cachePath.back(), false); ret.emplace_back(key, node);
break;
} }
if (!nodesToDelete.empty()) { if (it != node.children.begin()) --it;
std::string joined; const auto count = match(partialKey, it->first);
for (const auto &piece : nodesToDelete) joined += ", " + piece;
LogPrintf("Nodes to be deleted: %s\n", joined.substr(2)); if (count != it->first.size()) break;
if (count == partialKey.size()) break;
partialKey = partialKey.substr(count);
if (find(it->second, node))
ret.emplace_back(key.substr(0, key.size() - partialKey.size()), node);
else break;
} }
}
return consistent; return ret;
}
bool CClaimTrie::contains(const std::string &key) const {
return db->Exists(std::make_pair(TRIE_NODE_BY_NAME, key));
}
bool CClaimTrie::empty() const {
return !contains("");
}
bool CClaimTrie::find(const std::string &key, CClaimTrieDataNode &node) const {
uint256 hash;
if (!db->Read(std::make_pair(TRIE_NODE_BY_NAME, key), hash))
return false;
auto found = find(hash, node);
return found;
}
bool CClaimTrie::find(const uint256 &key, CClaimTrieDataNode &node) const {
return db->Read(std::make_pair(TRIE_NODE_BY_HASH, key), node);
} }
bool CClaimTrieCacheBase::getClaimById(const uint160& claimId, std::string& name, CClaimValue& claim) const bool CClaimTrieCacheBase::getClaimById(const uint160& claimId, std::string& name, CClaimValue& claim) const
@ -486,99 +557,79 @@ bool CClaimTrieCacheBase::flush()
getMerkleHash(); getMerkleHash();
std::set<std::string> forDeletion;
for (const auto& nodeName : nodesToDelete) { for (const auto& nodeName : nodesToDelete) {
if (nodesToAddOrUpdate.contains(nodeName)) // TODO: we don't need to deserialize all the nodes right here
continue; // we could be smarter about this and fill in the whole list in removeClaimFromTrie
auto nodes = base->nodes(nodeName); auto nodes = base->nodes(nodeName);
base->erase(nodeName);
for (auto& node : nodes) for (auto& node : nodes)
if (!node) forDeletion.insert(node.first);
batch.Erase(std::make_pair(TRIE_NODE, node.key()));
} }
for (auto it = nodesToAddOrUpdate.begin(); it != nodesToAddOrUpdate.end(); ++it) { for (auto it = nodesToAddOrUpdate.begin(); it != nodesToAddOrUpdate.end(); ++it) {
auto old = base->find(it.key()); forDeletion.erase(it.key());
if (!old || old.data() != it.data()) { if (!dirtyNodes.count(it.key()))
base->copy(it); continue;
batch.Write(std::make_pair(TRIE_NODE, it.key()), it.data());
CClaimTrieDataNode node;
node.data = it.data();
for (auto &child: it.children()) // ordering here is important
node.children.emplace_back(child.key().substr(it.key().size()), child->hash);
batch.Write(std::make_pair(TRIE_NODE_BY_HASH, it->hash), node);
batch.Write(std::make_pair(TRIE_NODE_BY_NAME, it.key()), it->hash);
} }
for (auto& name: forDeletion) {
batch.Erase(std::make_pair(TRIE_NODE_BY_NAME, name));
} }
BatchWriteQueue(batch, SUPPORT, supportCache); BatchWriteQueue(batch, SUPPORT, supportCache);
BatchWriteQueue(batch, CLAIM_QUEUE_ROW, claimQueueCache); BatchWriteQueue(batch, CLAIM_QUEUE_ROW, claimQueueCache);
BatchWriteQueue(batch, CLAIM_QUEUE_NAME_ROW, claimQueueNameCache); BatchWriteQueue(batch, CLAIM_QUEUE_NAME_ROW, claimQueueNameCache);
BatchWriteQueue(batch, EXP_QUEUE_ROW, expirationQueueCache); BatchWriteQueue(batch, CLAIM_EXP_QUEUE_ROW, expirationQueueCache);
BatchWriteQueue(batch, SUPPORT_QUEUE_ROW, supportQueueCache); BatchWriteQueue(batch, SUPPORT_QUEUE_ROW, supportQueueCache);
BatchWriteQueue(batch, SUPPORT_QUEUE_NAME_ROW, supportQueueNameCache); BatchWriteQueue(batch, SUPPORT_QUEUE_NAME_ROW, supportQueueNameCache);
BatchWriteQueue(batch, SUPPORT_EXP_QUEUE_ROW, supportExpirationQueueCache); BatchWriteQueue(batch, SUPPORT_EXP_QUEUE_ROW, supportExpirationQueueCache);
base->nNextHeight = nNextHeight; base->nNextHeight = nNextHeight;
if (!nodesToAddOrUpdate.empty()) if (!nodesToAddOrUpdate.empty() && (LogAcceptCategory(BCLog::CLAIMS) || LogAcceptCategory(BCLog::BENCH))) {
LogPrint(BCLog::CLAIMS, "Cache size: %zu from base size: %zu on block %d\n", nodesToAddOrUpdate.height(), base->height(), nNextHeight); LogPrintf("TrieCache size: %zu nodes on block %d, batch writes %zu bytes.\n",
nodesToAddOrUpdate.height(), nNextHeight, batch.SizeEstimate(), base->db->DynamicMemoryUsage());
}
auto ret = base->db->WriteBatch(batch); auto ret = base->db->WriteBatch(batch);
clear(); clear();
return ret; return ret;
} }
bool CClaimTrieCacheBase::ReadFromDisk(const CBlockIndex* tip) bool CClaimTrieCacheBase::validateTrieConsistency(const CBlockIndex* tip)
{ {
LogPrintf("Loading the claim trie from disk...\n"); if (!tip || tip->nHeight < 1)
return true;
base->nNextHeight = nNextHeight = tip ? tip->nHeight + 1 : 0;
clear();
base->clear();
boost::scoped_ptr<CDBIterator> pcursor(base->db->NewIterator());
std::vector<std::pair<std::string, uint256>> hashesOnEmptyNodes;
for (pcursor->SeekToFirst(); pcursor->Valid(); pcursor->Next()) {
std::pair<uint8_t, std::string> key;
if (!pcursor->GetKey(key) || key.first != TRIE_NODE)
continue;
CClaimTrieData data;
if (pcursor->GetValue(data)) {
if (data.empty()) {
// we have a situation where our old trie had many empty nodes
// we don't want to automatically throw those all into our prefix trie
hashesOnEmptyNodes.emplace_back(key.second, data.hash);
continue;
}
// nEffectiveAmount isn't serialized but it needs to be initialized (as done in reorderClaims):
auto supports = getSupportsForName(key.second);
data.reorderClaims(supports);
base->insert(key.second, std::move(data));
} else {
return error("%s(): error reading claim trie from disk", __func__);
}
}
CDBBatch batch(*(base->db));
for (auto& kvp: hashesOnEmptyNodes) {
auto hit = base->find(kvp.first);
if (hit != base->end())
hit->hash = kvp.second;
else {
// the first time the prefix trie is ran there will be many unused nodes
// we need to clean those out so that we can go faster next time
batch.Erase(std::make_pair(TRIE_NODE, kvp.first));
}
}
LogPrintf("Checking claim trie consistency... "); LogPrintf("Checking claim trie consistency... ");
if (checkConsistency()) { if (base->checkConsistency(tip->hashClaimTrie)) {
LogPrintf("consistent\n"); LogPrintf("consistent\n");
if (tip && tip->hashClaimTrie != getMerkleHash())
return error("%s(): hashes don't match when reading claimtrie from disk", __func__);
base->db->WriteBatch(batch);
return true; return true;
} }
LogPrintf("inconsistent!\n"); LogPrintf("inconsistent!\n");
return false; return false;
} }
bool CClaimTrieCacheBase::ReadFromDisk(const CBlockIndex* tip)
{
base->nNextHeight = nNextHeight = tip ? tip->nHeight + 1 : 0;
clear();
if (tip && (base->db->Exists(std::make_pair(TRIE_NODE, std::string())) || !base->db->Exists(std::make_pair(TRIE_NODE_BY_HASH, tip->hashClaimTrie)))) {
LogPrintf("The claim trie database contains deprecated data and will need to be rebuilt");
return false;
}
return validateTrieConsistency(tip);
}
CClaimTrieCacheBase::CClaimTrieCacheBase(CClaimTrie* base) : base(base) CClaimTrieCacheBase::CClaimTrieCacheBase(CClaimTrie* base) : base(base)
{ {
assert(base); assert(base);
@ -590,9 +641,9 @@ int CClaimTrieCacheBase::expirationTime() const
return Params().GetConsensus().nOriginalClaimExpirationTime; return Params().GetConsensus().nOriginalClaimExpirationTime;
} }
uint256 CClaimTrieCacheBase::recursiveComputeMerkleHash(CClaimTrie::iterator& it) uint256 CClaimTrieCacheBase::recursiveComputeMerkleHash(CClaimPrefixTrie::iterator& it)
{ {
using iterator = CClaimTrie::iterator; using iterator = CClaimPrefixTrie::iterator;
iCbType<iterator> process = [&process](iterator& it) { iCbType<iterator> process = [&process](iterator& it) {
if (it->hash.IsNull()) if (it->hash.IsNull())
it->hash = recursiveMerkleHash(it, process); it->hash = recursiveMerkleHash(it, process);
@ -604,54 +655,52 @@ uint256 CClaimTrieCacheBase::recursiveComputeMerkleHash(CClaimTrie::iterator& it
uint256 CClaimTrieCacheBase::getMerkleHash() uint256 CClaimTrieCacheBase::getMerkleHash()
{ {
auto it = nodesToAddOrUpdate.begin(); auto it = nodesToAddOrUpdate.begin();
if (nodesToAddOrUpdate.empty() && nodesToDelete.empty()) if (it)
it = base->begin(); return recursiveComputeMerkleHash(it);
return !it ? one : recursiveComputeMerkleHash(it); if (nodesToDelete.empty() && nodesAlreadyCached.empty()) {
CClaimTrieDataNode node;
node.childrenSerialization = false;
if (base->find("", node))
return node.data.hash; // it may be valuable to have base cache its current root hash
}
return one; // we have no data or we deleted everything
} }
CClaimTrie::const_iterator CClaimTrieCacheBase::begin() const CClaimPrefixTrie::const_iterator CClaimTrieCacheBase::begin() const
{ {
return nodesToAddOrUpdate.empty() && nodesToDelete.empty() ? base->cbegin() : nodesToAddOrUpdate.begin(); return nodesToAddOrUpdate.begin();
} }
CClaimTrie::const_iterator CClaimTrieCacheBase::end() const CClaimPrefixTrie::const_iterator CClaimTrieCacheBase::end() const
{ {
return nodesToAddOrUpdate.empty() && nodesToDelete.empty() ? base->cend() : nodesToAddOrUpdate.end(); return nodesToAddOrUpdate.end();
}
CClaimTrie::const_iterator CClaimTrieCacheBase::find(const std::string& name) const
{
if (auto it = nodesToAddOrUpdate.find(name))
return it;
return base->find(name);
} }
bool CClaimTrieCacheBase::empty() const bool CClaimTrieCacheBase::empty() const
{ {
return base->empty() && nodesToAddOrUpdate.empty(); return nodesToAddOrUpdate.empty();
} }
CClaimTrie::iterator CClaimTrieCacheBase::cacheData(const std::string& name, bool create) CClaimPrefixTrie::iterator CClaimTrieCacheBase::cacheData(const std::string& name, bool create)
{ {
// get data from the cache. if no data, create empty one
const auto insert = [this](CClaimTrie::iterator& it) {
auto& key = it.key();
// we only ever cache nodes once per cache instance
if (!nodesAlreadyCached.count(key)) {
// do not insert nodes that are already present
nodesAlreadyCached.insert(key);
nodesToAddOrUpdate.insert(key, it.data());
}
};
// we need all parent nodes and their one level deep children // we need all parent nodes and their one level deep children
// to calculate merkle hash // to calculate merkle hash
auto nodes = base->nodes(name); auto nodes = base->nodes(name);
for (auto& node: nodes) { for (auto& node: nodes) {
for (auto& child : node.children()) if (nodesAlreadyCached.insert(node.first).second) {
if (!nodesAlreadyCached.count(child.key())) // do not insert nodes that are already present
nodesToAddOrUpdate.copy(child); nodesToAddOrUpdate.insert(node.first, node.second.data);
insert(node); }
for (auto& child : node.second.children) {
auto childKey = node.first + child.first;
if (nodesAlreadyCached.insert(childKey).second) {
CClaimTrieDataNode childNode;
childNode.childrenSerialization = false;
if (base->find(child.second, childNode)) {
nodesToAddOrUpdate.insert(childKey, childNode.data);
}
}
}
} }
auto it = nodesToAddOrUpdate.find(name); auto it = nodesToAddOrUpdate.find(name);
@ -677,10 +726,12 @@ bool CClaimTrieCacheBase::getLastTakeoverForName(const std::string& name, uint16
std::tie(claimId, takeoverHeight) = cit->second; std::tie(claimId, takeoverHeight) = cit->second;
return true; return true;
} }
if (auto it = base->find(name)) { CClaimTrieDataNode data;
takeoverHeight = it->nHeightOfLastTakeover; data.childrenSerialization = false;
if (base->find(name, data)) {
takeoverHeight = data.data.nHeightOfLastTakeover;
CClaimValue claim; CClaimValue claim;
if (it->getBestClaim(claim)) { if (data.data.getBestClaim(claim)) {
claimId = claim.claimId; claimId = claim.claimId;
return true; return true;
} }
@ -690,8 +741,10 @@ bool CClaimTrieCacheBase::getLastTakeoverForName(const std::string& name, uint16
void CClaimTrieCacheBase::markAsDirty(const std::string& name, bool fCheckTakeover) void CClaimTrieCacheBase::markAsDirty(const std::string& name, bool fCheckTakeover)
{ {
for (auto& node : nodesToAddOrUpdate.nodes(name)) for (auto& node : nodesToAddOrUpdate.nodes(name)) {
dirtyNodes.insert(node.key());
node->hash.SetNull(); node->hash.SetNull();
}
if (fCheckTakeover) if (fCheckTakeover)
namesToCheckForTakeover.insert(name); namesToCheckForTakeover.insert(name);
@ -712,7 +765,7 @@ bool CClaimTrieCacheBase::removeClaimFromTrie(const std::string& name, const COu
auto it = cacheData(name, false); auto it = cacheData(name, false);
if (!it || !it->removeClaim(outPoint, claim)) { if (!it || !it->removeClaim(outPoint, claim)) {
LogPrint(BCLog::CLAIMS, "%s: Removing a claim was unsuccessful. name = %s, txhash = %s, nOut = %d", __func__, name, outPoint.hash.GetHex(), outPoint.n); LogPrint(BCLog::CLAIMS, "%s: Removing a claim was unsuccessful. name = %s, txhash = %s, nOut = %d\n", __func__, name, outPoint.hash.GetHex(), outPoint.n);
return false; return false;
} }
@ -963,11 +1016,14 @@ bool CClaimTrieCacheBase::removeSupportFromMap(const std::string& name, const CO
return false; return false;
} }
void CClaimTrieCacheBase::dumpToLog(CClaimTrie::const_iterator it, bool diffFromBase) const void CClaimTrieCacheBase::dumpToLog(CClaimPrefixTrie::const_iterator it, bool diffFromBase) const
{ {
if (!it) return;
if (diffFromBase) { if (diffFromBase) {
auto hit = base->find(it.key()); CClaimTrieDataNode node;
if (hit && hit->hash == it->hash) node.childrenSerialization = false;
if (base->find(it.key(), node) && node.data.hash == it->hash)
return; return;
} }
@ -1281,8 +1337,16 @@ int CClaimTrieCacheBase::getNumBlocksOfContinuousOwnership(const std::string& na
that->removalWorkaround.erase(hit); that->removalWorkaround.erase(hit);
return 0; return 0;
} }
auto it = find(name); auto it = nodesToAddOrUpdate.find(name);
return it && !it->empty() ? nNextHeight - it->nHeightOfLastTakeover : 0; if (it && !it->empty())
return nNextHeight - it->nHeightOfLastTakeover;
if (it) // we specifically ignore deleted nodes here to allow this to fall into the base lookup in that scenario
return 0;
CClaimTrieDataNode node;
node.childrenSerialization = false;
if (base->find(name, node) && !node.data.empty())
return nNextHeight - node.data.nHeightOfLastTakeover;
return 0;
} }
int CClaimTrieCacheBase::getDelayForName(const std::string& name) const int CClaimTrieCacheBase::getDelayForName(const std::string& name) const
@ -1311,6 +1375,7 @@ bool CClaimTrieCacheBase::clear()
{ {
nodesToAddOrUpdate.clear(); nodesToAddOrUpdate.clear();
claimsToAddToByIdIndex.clear(); claimsToAddToByIdIndex.clear();
dirtyNodes.clear();
supportCache.clear(); supportCache.clear();
nodesToDelete.clear(); nodesToDelete.clear();
claimsToDeleteFromByIdIndex.clear(); claimsToDeleteFromByIdIndex.clear();

View file

@ -18,11 +18,13 @@
#include <unordered_set> #include <unordered_set>
// leveldb keys // leveldb keys
#define TRIE_NODE 'n' #define TRIE_NODE 'n' // deprecated
#define TRIE_NODE_BY_HASH 'h'
#define TRIE_NODE_BY_NAME 'g'
#define CLAIM_BY_ID 'i' #define CLAIM_BY_ID 'i'
#define CLAIM_QUEUE_ROW 'r' #define CLAIM_QUEUE_ROW 'r'
#define CLAIM_QUEUE_NAME_ROW 'm' #define CLAIM_QUEUE_NAME_ROW 'm'
#define EXP_QUEUE_ROW 'e' #define CLAIM_EXP_QUEUE_ROW 'e'
#define SUPPORT 's' #define SUPPORT 's'
#define SUPPORT_QUEUE_ROW 'u' #define SUPPORT_QUEUE_ROW 'u'
#define SUPPORT_QUEUE_NAME_ROW 'p' #define SUPPORT_QUEUE_NAME_ROW 'p'
@ -61,6 +63,7 @@ struct CClaimValue
READWRITE(nAmount); READWRITE(nAmount);
READWRITE(nHeight); READWRITE(nHeight);
READWRITE(nValidAtHeight); READWRITE(nValidAtHeight);
READWRITE(nEffectiveAmount);
} }
bool operator<(const CClaimValue& other) const bool operator<(const CClaimValue& other) const
@ -157,17 +160,6 @@ struct CClaimTrieData
inline void SerializationOp(Stream& s, Operation ser_action) inline void SerializationOp(Stream& s, Operation ser_action)
{ {
READWRITE(hash); READWRITE(hash);
if (ser_action.ForRead()) {
if (s.eof()) {
claims.clear();
nHeightOfLastTakeover = 0;
return;
}
}
else if (claims.empty())
return;
READWRITE(claims); READWRITE(claims);
READWRITE(nHeightOfLastTakeover); READWRITE(nHeightOfLastTakeover);
} }
@ -188,6 +180,30 @@ struct CClaimTrieData
} }
}; };
// A serializable trie node: the node's own CClaimTrieData plus a list of
// links to its children.
struct CClaimTrieDataNode {
// Payload of this node.
CClaimTrieData data;
// we're using a vector to avoid RAM thrashing and for faster serialization ops.
// We're assuming its data is inserted in order and never modified.
// Each entry pairs a string with a uint256.
// NOTE(review): presumably the string is the child's key fragment relative to
// this node and the uint256 is the hash used to look the child up -- confirm
// against the code that fills this vector.
std::vector<std::pair<std::string, uint256>> children;
// Controls SerializationOp below: when false, only `data` is read/written and
// `children` is left untouched. The flag itself is never serialized.
bool childrenSerialization = true;
// All special members are explicitly defaulted; copies and moves are member-wise.
CClaimTrieDataNode() = default;
CClaimTrieDataNode(CClaimTrieDataNode&&) = default;
CClaimTrieDataNode(const CClaimTrieDataNode&) = default;
CClaimTrieDataNode& operator=(CClaimTrieDataNode&&) = default;
CClaimTrieDataNode& operator=(const CClaimTrieDataNode& d) = default;
ADD_SERIALIZE_METHODS;
// Shared read/write routine: always processes `data` first, then `children`
// only if childrenSerialization is set on this instance.
template <typename Stream, typename Operation>
inline void SerializationOp(Stream& s, Operation ser_action)
{
READWRITE(data);
if (childrenSerialization) // wanting constexpr but hoping the compiler is smart enough anyway
READWRITE(children);
}
};
struct COutPointHeightType struct COutPointHeightType
{ {
COutPoint outPoint; COutPoint outPoint;
@ -301,7 +317,7 @@ struct CClaimsForNameType
CClaimsForNameType& operator=(const CClaimsForNameType&) = default; CClaimsForNameType& operator=(const CClaimsForNameType&) = default;
}; };
class CClaimTrie : public CPrefixTrie<std::string, CClaimTrieData> class CClaimTrie
{ {
int nNextHeight = 0; int nNextHeight = 0;
int nProportionalDelayFactor = 0; int nProportionalDelayFactor = 0;
@ -322,6 +338,19 @@ public:
friend struct ClaimTrieChainFixture; friend struct ClaimTrieChainFixture;
friend class CClaimTrieCacheExpirationFork; friend class CClaimTrieCacheExpirationFork;
friend class CClaimTrieCacheNormalizationFork; friend class CClaimTrieCacheNormalizationFork;
std::size_t getTotalNamesInTrie() const;
std::size_t getTotalClaimsInTrie() const;
CAmount getTotalValueOfClaimsInTrie(bool fControllingOnly) const;
bool checkConsistency(const uint256& rootHash) const;
bool contains(const std::string& key) const;
bool empty() const;
bool find(const uint256& key, CClaimTrieDataNode& node) const;
bool find(const std::string& key, CClaimTrieDataNode& node) const;
std::vector<std::pair<std::string, CClaimTrieDataNode>> nodes(const std::string& key) const;
void recurseAllHashedNodes(const std::string& name, const CClaimTrieDataNode& current, std::function<void(const std::string&, const CClaimTrieDataNode&)> function) const;
}; };
struct CClaimTrieProofNode struct CClaimTrieProofNode
@ -381,6 +410,8 @@ typedef std::map<int, expirationQueueRowType> expirationQueueType;
typedef std::set<CClaimValue> claimIndexClaimListType; typedef std::set<CClaimValue> claimIndexClaimListType;
typedef std::vector<CClaimIndexElement> claimIndexElementListType; typedef std::vector<CClaimIndexElement> claimIndexElementListType;
typedef CPrefixTrie<std::string, CClaimTrieData> CClaimPrefixTrie;
class CClaimTrieCacheBase class CClaimTrieCacheBase
{ {
public: public:
@ -388,7 +419,6 @@ public:
virtual ~CClaimTrieCacheBase() = default; virtual ~CClaimTrieCacheBase() = default;
uint256 getMerkleHash(); uint256 getMerkleHash();
bool checkConsistency() const;
bool getClaimById(const uint160& claimId, std::string& name, CClaimValue& claim) const; bool getClaimById(const uint160& claimId, std::string& name, CClaimValue& claim) const;
@ -402,10 +432,6 @@ public:
bool haveSupport(const std::string& name, const COutPoint& outPoint) const; bool haveSupport(const std::string& name, const COutPoint& outPoint) const;
bool haveSupportInQueue(const std::string& name, const COutPoint& outPoint, int& nValidAtHeight); bool haveSupportInQueue(const std::string& name, const COutPoint& outPoint, int& nValidAtHeight);
std::size_t getTotalNamesInTrie() const;
std::size_t getTotalClaimsInTrie() const;
CAmount getTotalValueOfClaimsInTrie(bool fControllingOnly) const;
bool addClaim(const std::string& name, const COutPoint& outPoint, const uint160& claimId, CAmount nAmount, int nHeight); bool addClaim(const std::string& name, const COutPoint& outPoint, const uint160& claimId, CAmount nAmount, int nHeight);
bool undoAddClaim(const std::string& name, const COutPoint& outPoint, int nHeight); bool undoAddClaim(const std::string& name, const COutPoint& outPoint, int nHeight);
@ -441,18 +467,18 @@ public:
CAmount getEffectiveAmountForClaim(const std::string& name, const uint160& claimId, std::vector<CSupportValue>* supports = nullptr) const; CAmount getEffectiveAmountForClaim(const std::string& name, const uint160& claimId, std::vector<CSupportValue>* supports = nullptr) const;
CAmount getEffectiveAmountForClaim(const CClaimsForNameType& claims, const uint160& claimId, std::vector<CSupportValue>* supports = nullptr) const; CAmount getEffectiveAmountForClaim(const CClaimsForNameType& claims, const uint160& claimId, std::vector<CSupportValue>* supports = nullptr) const;
CClaimTrie::const_iterator begin() const; CClaimPrefixTrie::const_iterator begin() const;
CClaimTrie::const_iterator end() const; CClaimPrefixTrie::const_iterator end() const;
CClaimTrie::const_iterator find(const std::string& name) const;
void dumpToLog(CClaimTrie::const_iterator it, bool diffFromBase = true) const; void dumpToLog(CClaimPrefixTrie::const_iterator it, bool diffFromBase = true) const;
virtual std::string adjustNameForValidHeight(const std::string& name, int validHeight) const;
protected: protected:
CClaimTrie* base; CClaimTrie* base;
CClaimTrie nodesToAddOrUpdate; // nodes pulled in from base (and possibly modified thereafter), written to base on flush CClaimPrefixTrie nodesToAddOrUpdate; // nodes pulled in from base (and possibly modified thereafter), written to base on flush
std::unordered_set<std::string> namesToCheckForTakeover; // takeover numbers are updated on increment std::unordered_set<std::string> namesToCheckForTakeover; // takeover numbers are updated on increment
uint256 recursiveComputeMerkleHash(CClaimTrie::iterator& it); uint256 recursiveComputeMerkleHash(CClaimPrefixTrie::iterator& it);
virtual bool insertClaimIntoTrie(const std::string& name, const CClaimValue& claim, bool fCheckTakeover); virtual bool insertClaimIntoTrie(const std::string& name, const CClaimValue& claim, bool fCheckTakeover);
virtual bool removeClaimFromTrie(const std::string& name, const COutPoint& outPoint, CClaimValue& claim, bool fCheckTakeover); virtual bool removeClaimFromTrie(const std::string& name, const COutPoint& outPoint, CClaimValue& claim, bool fCheckTakeover);
@ -460,14 +486,12 @@ protected:
virtual bool insertSupportIntoMap(const std::string& name, const CSupportValue& support, bool fCheckTakeover); virtual bool insertSupportIntoMap(const std::string& name, const CSupportValue& support, bool fCheckTakeover);
virtual bool removeSupportFromMap(const std::string& name, const COutPoint& outPoint, CSupportValue& support, bool fCheckTakeover); virtual bool removeSupportFromMap(const std::string& name, const COutPoint& outPoint, CSupportValue& support, bool fCheckTakeover);
virtual std::string adjustNameForValidHeight(const std::string& name, int validHeight) const;
supportEntryType getSupportsForName(const std::string& name) const; supportEntryType getSupportsForName(const std::string& name) const;
int getDelayForName(const std::string& name) const; int getDelayForName(const std::string& name) const;
virtual int getDelayForName(const std::string& name, const uint160& claimId) const; virtual int getDelayForName(const std::string& name, const uint160& claimId) const;
CClaimTrie::iterator cacheData(const std::string& name, bool create = true); CClaimPrefixTrie::iterator cacheData(const std::string& name, bool create = true);
bool getLastTakeoverForName(const std::string& name, uint160& claimId, int& takeoverHeight) const; bool getLastTakeoverForName(const std::string& name, uint160& claimId, int& takeoverHeight) const;
@ -499,6 +523,7 @@ private:
std::unordered_set<std::string> nodesAlreadyCached; // set of nodes already pulled into cache from base std::unordered_set<std::string> nodesAlreadyCached; // set of nodes already pulled into cache from base
std::unordered_map<std::string, bool> takeoverWorkaround; std::unordered_map<std::string, bool> takeoverWorkaround;
std::unordered_set<std::string> removalWorkaround; std::unordered_set<std::string> removalWorkaround;
std::unordered_set<std::string> dirtyNodes;
bool shouldUseTakeoverWorkaround(const std::string& key) const; bool shouldUseTakeoverWorkaround(const std::string& key) const;
void addTakeoverWorkaroundPotential(const std::string& key); void addTakeoverWorkaroundPotential(const std::string& key);
@ -510,6 +535,8 @@ private:
bool removeSupport(const std::string& name, const COutPoint& outPoint, int nHeight, int& nValidAtHeight, bool fCheckTakeover); bool removeSupport(const std::string& name, const COutPoint& outPoint, int nHeight, int& nValidAtHeight, bool fCheckTakeover);
bool removeClaim(const std::string& name, const COutPoint& outPoint, int nHeight, int& nValidAtHeight, bool fCheckTakeover); bool removeClaim(const std::string& name, const COutPoint& outPoint, int nHeight, int& nValidAtHeight, bool fCheckTakeover);
bool validateTrieConsistency(const CBlockIndex* tip);
template <typename T> template <typename T>
std::pair<const int, std::vector<queueEntryType<T>>>* getQueueCacheRow(int nHeight, bool createIfNotExists = false); std::pair<const int, std::vector<queueEntryType<T>>>* getQueueCacheRow(int nHeight, bool createIfNotExists = false);
@ -614,6 +641,7 @@ public:
bool getProofForName(const std::string& name, CClaimTrieProof& proof) override; bool getProofForName(const std::string& name, CClaimTrieProof& proof) override;
bool getInfoForName(const std::string& name, CClaimValue& claim) const override; bool getInfoForName(const std::string& name, CClaimValue& claim) const override;
CClaimsForNameType getClaimsForName(const std::string& name) const override; CClaimsForNameType getClaimsForName(const std::string& name) const override;
std::string adjustNameForValidHeight(const std::string& name, int validHeight) const override;
protected: protected:
bool insertClaimIntoTrie(const std::string& name, const CClaimValue& claim, bool fCheckTakeover) override; bool insertClaimIntoTrie(const std::string& name, const CClaimValue& claim, bool fCheckTakeover) override;
@ -624,8 +652,6 @@ protected:
int getDelayForName(const std::string& name, const uint160& claimId) const override; int getDelayForName(const std::string& name, const uint160& claimId) const override;
std::string adjustNameForValidHeight(const std::string& name, int validHeight) const override;
private: private:
bool overrideInsertNormalization; bool overrideInsertNormalization;
bool overrideRemoveNormalization; bool overrideRemoveNormalization;

View file

@ -8,6 +8,7 @@
#include <boost/locale/conversion.hpp> #include <boost/locale/conversion.hpp>
#include <boost/locale/localization_backend.hpp> #include <boost/locale/localization_backend.hpp>
#include <boost/scope_exit.hpp> #include <boost/scope_exit.hpp>
#include <boost/scoped_ptr.hpp>
CClaimTrieCacheExpirationFork::CClaimTrieCacheExpirationFork(CClaimTrie* base) CClaimTrieCacheExpirationFork::CClaimTrieCacheExpirationFork(CClaimTrie* base)
: CClaimTrieCacheBase(base) : CClaimTrieCacheBase(base)
@ -66,7 +67,7 @@ bool CClaimTrieCacheExpirationFork::forkForExpirationChange(bool increment)
if (!pcursor->GetKey(key)) if (!pcursor->GetKey(key))
continue; continue;
int height = key.second; int height = key.second;
if (key.first == EXP_QUEUE_ROW) { if (key.first == CLAIM_EXP_QUEUE_ROW) {
expirationQueueRowType row; expirationQueueRowType row;
if (pcursor->GetValue(row)) { if (pcursor->GetValue(row)) {
reactivateClaim(row, height, increment); reactivateClaim(row, height, increment);
@ -160,40 +161,48 @@ bool CClaimTrieCacheNormalizationFork::normalizeAllNamesInTrieIfNecessary(insert
// run the one-time upgrade of all names that need to change // run the one-time upgrade of all names that need to change
// it modifies the (cache) trie as it goes, so we need to grab everything to be modified first // it modifies the (cache) trie as it goes, so we need to grab everything to be modified first
for (auto it = base->begin(); it != base->end(); ++it) { boost::scoped_ptr<CDBIterator> pcursor(base->db->NewIterator());
const std::string normalized = normalizeClaimName(it.key(), true); for (pcursor->SeekToFirst(); pcursor->Valid(); pcursor->Next()) {
if (normalized == it.key()) std::pair<uint8_t, std::string> key;
if (!pcursor->GetKey(key) || key.first != TRIE_NODE_BY_NAME)
continue; continue;
auto supports = getSupportsForName(it.key()); const auto& name = key.second;
const std::string normalized = normalizeClaimName(name, true);
if (normalized == key.second)
continue;
auto supports = getSupportsForName(name);
for (auto support : supports) { for (auto support : supports) {
// if it's already going to expire just skip it // if it's already going to expire just skip it
if (support.nHeight + expirationTime() <= nNextHeight) if (support.nHeight + expirationTime() <= nNextHeight)
continue; continue;
assert(removeSupportFromMap(it.key(), support.outPoint, support, false)); assert(removeSupportFromMap(name, support.outPoint, support, false));
expireSupportUndo.emplace_back(it.key(), support); expireSupportUndo.emplace_back(name, support);
assert(insertSupportIntoMap(normalized, support, false)); assert(insertSupportIntoMap(normalized, support, false));
insertSupportUndo.emplace_back(it.key(), support.outPoint, -1); insertSupportUndo.emplace_back(name, support.outPoint, -1);
} }
namesToCheckForTakeover.insert(normalized); namesToCheckForTakeover.insert(normalized);
auto cached = cacheData(it.key(), false); auto cached = cacheData(name, false);
if (!cached || cached->empty()) if (!cached || cached->empty())
continue; continue;
for (auto claim : it->claims) { auto claimsCopy = cached->claims;
auto takeoverHeightCopy = cached->nHeightOfLastTakeover;
for (auto claim : claimsCopy) {
if (claim.nHeight + expirationTime() <= nNextHeight) if (claim.nHeight + expirationTime() <= nNextHeight)
continue; continue;
assert(removeClaimFromTrie(it.key(), claim.outPoint, claim, false)); assert(removeClaimFromTrie(name, claim.outPoint, claim, false));
removeUndo.emplace_back(it.key(), claim); removeUndo.emplace_back(name, claim);
assert(insertClaimIntoTrie(normalized, claim, true)); assert(insertClaimIntoTrie(normalized, claim, true));
insertUndo.emplace_back(it.key(), claim.outPoint, -1); insertUndo.emplace_back(name, claim.outPoint, -1);
} }
takeoverHeightUndo.emplace_back(it.key(), it->nHeightOfLastTakeover); takeoverHeightUndo.emplace_back(name, takeoverHeightCopy);
} }
return true; return true;
} }

View file

@ -97,11 +97,45 @@ static void SetMaxOpenFiles(leveldb::Options *options) {
options->max_open_files, default_open_files); options->max_open_files, default_open_files);
} }
// Cache decorator around an LRU cache that refuses to store entries whose key
// is longer than maxKeyLen bytes; every other operation is forwarded as-is.
class CappedLenCache: public leveldb::Cache {
    leveldb::Cache* inner;   // wrapped LRU cache, created in the constructor and deleted in the destructor
    std::size_t maxKeyLen;   // longest key size (in bytes) that is still admitted
public:
    CappedLenCache(std::size_t capacity, std::size_t maxKeyLen)
        : inner(leveldb::NewLRUCache(capacity)), maxKeyLen(maxKeyLen) {}
    ~CappedLenCache() override { delete inner; }

    // Stores the entry in the wrapped cache when the key is short enough.
    // Otherwise the value is disposed of immediately through `deleter` and
    // nullptr is returned.
    // NOTE(review): callers that unconditionally use the returned handle will
    // not expect nullptr -- confirm before enabling this cache.
    Handle* Insert(const leveldb::Slice& key, void* value, size_t charge,
                   void (*deleter)(const leveldb::Slice& key, void* value)) override {
        const bool keyTooLong = key.size() > maxKeyLen;
        if (keyTooLong) {
            deleter(key, value);
            return nullptr;
        }
        return inner->Insert(key, value, charge, deleter);
    }

    // Plain forwarding to the wrapped cache.
    Handle* Lookup(const leveldb::Slice& key) override { return inner->Lookup(key); }
    void Release(Handle* handle) override { inner->Release(handle); }
    void* Value(Handle* handle) override { return inner->Value(handle); }
    void Erase(const leveldb::Slice& key) override { inner->Erase(key); }
    uint64_t NewId() override { return inner->NewId(); }
};
static leveldb::Options GetOptions(size_t nCacheSize) static leveldb::Options GetOptions(size_t nCacheSize)
{ {
leveldb::Options options; leveldb::Options options;
auto write_cache = std::min(nCacheSize / 4, size_t(16) << 20U); // cap write_cache at 16MB (4x default)
options.filter_policy=leveldb::NewBloomFilterPolicy2(16);
options.write_buffer_size=60 * 1024 * 1024;
options.total_leveldb_mem=2500ULL * 1024ULL * 1024ULL;
options.env=leveldb::Env::Default();
options.compression = leveldb::kNoCompression;
options.info_log = new CBitcoinLevelDBLogger();
return options;
auto write_cache = std::min(nCacheSize / 4, size_t(4 * 1024 * 1024)); // cap write_cache at 4MB (default)
options.block_cache = leveldb::NewLRUCache(nCacheSize - write_cache * 2); options.block_cache = leveldb::NewLRUCache(nCacheSize - write_cache * 2);
// options.block_cache = new CappedLenCache(nCacheSize - write_cache * 2, 6);
options.write_buffer_size = write_cache; // up to two write buffers may be held in memory simultaneously options.write_buffer_size = write_cache; // up to two write buffers may be held in memory simultaneously
options.filter_policy = leveldb::NewBloomFilterPolicy(10); options.filter_policy = leveldb::NewBloomFilterPolicy(10);
options.compression = leveldb::kNoCompression; options.compression = leveldb::kNoCompression;
@ -112,6 +146,7 @@ static leveldb::Options GetOptions(size_t nCacheSize)
options.paranoid_checks = true; options.paranoid_checks = true;
} }
SetMaxOpenFiles(&options); SetMaxOpenFiles(&options);
options.max_open_files = 30000;
return options; return options;
} }

View file

@ -81,7 +81,7 @@ public:
ssValue.Xor(dbwrapper_private::GetObfuscateKey(parent)); ssValue.Xor(dbwrapper_private::GetObfuscateKey(parent));
leveldb::Slice slValue(ssValue.data(), ssValue.size()); leveldb::Slice slValue(ssValue.data(), ssValue.size());
batch.Put(slKey, slValue); batch.Put(slKey, slValue, nullptr);
// LevelDB serializes writes as: // LevelDB serializes writes as:
// - byte: header // - byte: header
// - varint: key length (1 byte up to 127B, 2 bytes up to 16383B, ...) // - varint: key length (1 byte up to 127B, 2 bytes up to 16383B, ...)

View file

@ -1461,7 +1461,7 @@ bool AppInitMain()
pblocktree.reset(); pblocktree.reset();
pblocktree.reset(new CBlockTreeDB(nBlockTreeDBCache, false, fReset)); pblocktree.reset(new CBlockTreeDB(nBlockTreeDBCache, false, fReset));
delete pclaimTrie; delete pclaimTrie;
pclaimTrie = new CClaimTrie(false, fReindex); pclaimTrie = new CClaimTrie(false, fReindex || fReindexChainState);
if (fReset) { if (fReset) {
pblocktree->WriteReindexing(true); pblocktree->WriteReindexing(true);

View file

@ -1,13 +0,0 @@
build_config.mk
*.a
*.o
*.dylib*
*.so
*.so.*
*_test
db_bench
leveldbutil
Release
Debug
Benchmark
vs2010.*

View file

@ -6,7 +6,3 @@ Google Inc.
# Initial version authors: # Initial version authors:
Jeffrey Dean <jeff@google.com> Jeffrey Dean <jeff@google.com>
Sanjay Ghemawat <sanjay@google.com> Sanjay Ghemawat <sanjay@google.com>
# Partial list of contributors:
Kevin Regan <kevin.d.regan@gmail.com>
Johan Bilien <jobi@litl.com>

View file

@ -0,0 +1,72 @@
github.com tag 2.0.34 - February 15, 2017
-----------------------------------------
mv-hot-backup2: - correct MakeTieredDbname() within db/filename.cc
for case where dbname input is blank and fast/slow
already populated in options. Corrects issue
with hot backup in non-tiered storage situations
github.com tag 2.0.33 - November 21, 2016
-----------------------------------------
mv-bucket-expiry: - partial branch to enable X-Riak-Meta-Expiry-Base-Seconds
property within enterprise edition
--- no 2.0.32 tag on leveldb ---
github.com tag 2.0.31 - November 1, 2016
----------------------------------------
- version shipped with Riak 2.2
mv-no-md-expiry: - Riak specific
- never convert a key prefix of sext:encoded "{md" to expiry
- update sst_scan for dumping Riak formatted keys
mv-tuning8: - rework penalty rules in version_set.cc UpdatePenalty()
- add unit test framework for UpdatePenalty()
github.com tag 2.0.30 - October 11, 2016
----------------------------------------
mv-delayed-bloom: - when opening an .sst table file, only load
bloom filter on second Get() operation. Saves time.
- correct VersionSet::Finalize() logic for level 1 when
level 2 is above desired size
- move hot backup to Riak ee build
github.com tag 2.0.29 - September 13, 2016
------------------------------------------
mv-expiry-manifest: only switch to expiry enabled manifest format
if expiry function enabled. Eases downgrade
during early Riak releases containing expiry
github.com tag 2.0.28 - September 6, 2016
-----------------------------------------
mv-hot-backup: add externally triggered hot backup feature
github.com tag 2.0.27 - August 22, 2016
---------------------------------------
mv-mem-fences: fix iterator double delete bug in eleveldb and
build better memory-fenced operations for reference-counted objects.
github.com tag 2.0.26 - August 21, 2016
---------------------------------------
mv-expiry-iter-bug: DBImpl::NewIterator() was not setting the new expiry parameter.
github.com tag 2.0.25 - August 10, 2016
---------------------------------------
Make LZ4 the default compression instead of Snappy.
github.com tag 2.0.24 - August 2, 2016
--------------------------------------
mv-expiry: open source expiry. Supports one expiry policy for all databases.
github.com tag 2.0.23 - July 20, 2016
-------------------------------------
mv-no-semaphore: remove semaphore controlled thread in hot_threads.cc. Instead
use mutex of thread 0 (only one thread's mutex) to address known race condition.
github.com tag 2.0.22 - June 22, 2016
-------------------------------------
no change: iterator fix in eleveldb
github.com tag 2.0.21 - June 16, 2016
-------------------------------------
branch mv-iterator-hot-threads: correct condition where eleveldb MoveTask
could hang an iterator. (https://github.com/basho/leveldb/wiki/mv-iterator-hot-threads)

View file

@ -1,36 +0,0 @@
# Contributing
We'd love to accept your code patches! However, before we can take them, we
have to jump a couple of legal hurdles.
## Contributor License Agreements
Please fill out either the individual or corporate Contributor License
Agreement as appropriate.
* If you are an individual writing original source code and you're sure you
own the intellectual property, then sign an [individual CLA](https://developers.google.com/open-source/cla/individual).
* If you work for a company that wants to allow you to contribute your work,
then sign a [corporate CLA](https://developers.google.com/open-source/cla/corporate).
Follow either of the two links above to access the appropriate CLA and
instructions for how to sign and return it.
## Submitting a Patch
1. Sign the contributors license agreement above.
2. Decide which code you want to submit. A submission should be a set of changes
that addresses one issue in the [issue tracker](https://github.com/google/leveldb/issues).
Please don't mix more than one logical change per submission, because it makes
the history hard to follow. If you want to make a change
(e.g. add a sample or feature) that doesn't have a corresponding issue in the
issue tracker, please create one.
3. **Submitting**: When you are ready to submit, send us a Pull Request. Be
sure to include the issue number you fixed and the name you used to sign
the CLA.
## Writing Code ##
If your contribution contains code, please make sure that it follows
[the style guide](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml).
Otherwise we will have to ask you to make changes, and that's no fun for anyone.

View file

@ -2,423 +2,219 @@
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. See the AUTHORS file for names of contributors. # found in the LICENSE file. See the AUTHORS file for names of contributors.
# Inherit some settings from environment variables, if available
INSTALL_PATH ?= $(CURDIR)
#----------------------------------------------- #-----------------------------------------------
# Uncomment exactly one of the lines labelled (A), (B), and (C) below # Uncomment exactly one of the lines labelled (A), (B), and (C) below
# to switch between compilation modes. # to switch between compilation modes.
# NOTE: targets "debug" and "prof" provide same functionality
# NOTE 2: -DNDEBUG disables assert() statements within C code,
# i.e. no assert()s in production code
# (A) Production use (optimized mode) OPT ?= -O2 -g -DNDEBUG # (A) Production use (optimized mode)
OPT ?= -O2 -DNDEBUG # OPT ?= -g2 # (B) Debug mode, w/ full line-level debugging symbols
# (B) Debug mode, w/ full line-level debugging symbols # OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
# OPT ?= -g2
# (C) Profiling mode: opt, but w/debugging symbols
# OPT ?= -O2 -g2 -DNDEBUG
#----------------------------------------------- #-----------------------------------------------
# detect what platform we're building on # detect what platform we're building on
$(shell CC="$(CC)" CXX="$(CXX)" TARGET_OS="$(TARGET_OS)" \ ifeq ($(wildcard build_config.mk),)
./build_detect_platform build_config.mk ./) $(shell ./build_detect_platform build_config.mk)
endif
# this file is generated by the previous line to set build flags and sources # this file is generated by the previous line to set build flags and sources
include build_config.mk include build_config.mk
TESTS = \
db/autocompact_test \
db/c_test \
db/corruption_test \
db/db_test \
db/dbformat_test \
db/fault_injection_test \
db/filename_test \
db/log_test \
db/recovery_test \
db/skiplist_test \
db/version_edit_test \
db/version_set_test \
db/write_batch_test \
helpers/memenv/memenv_test \
issues/issue178_test \
issues/issue200_test \
table/filter_block_test \
table/table_test \
util/arena_test \
util/bloom_test \
util/cache_test \
util/coding_test \
util/crc32c_test \
util/env_posix_test \
util/env_test \
util/hash_test
UTILS = \
db/db_bench \
db/leveldbutil
# Put the object files in a subdirectory, but the application at the top of the object dir.
PROGNAMES := $(notdir $(TESTS) $(UTILS))
# On Linux may need libkyotocabinet-dev for dependency.
BENCHMARKS = \
doc/bench/db_bench_sqlite3 \
doc/bench/db_bench_tree_db
CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT)
LDFLAGS += $(PLATFORM_LDFLAGS) LDFLAGS += $(PLATFORM_LDFLAGS)
LIBS += $(PLATFORM_LIBS)
SIMULATOR_OUTDIR=out-ios-x86 LIBOBJECTS := $(SOURCES:.cc=.o)
DEVICE_OUTDIR=out-ios-arm LIBOBJECTS += util/lz4.o
MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
DEPEND := $(SOURCES:.cc=.d)
ifeq ($(PLATFORM), IOS) TESTUTIL = ./util/testutil.o
# Note: iOS should probably be using libtool, not ar. TESTHARNESS = ./util/testharness.o $(TESTUTIL)
AR=xcrun ar
SIMULATORSDK=$(shell xcrun -sdk iphonesimulator --show-sdk-path) TESTS := $(sort $(notdir $(basename $(TEST_SOURCES))))
DEVICESDK=$(shell xcrun -sdk iphoneos --show-sdk-path)
DEVICE_CFLAGS = -isysroot "$(DEVICESDK)" -arch armv6 -arch armv7 -arch armv7s -arch arm64 TOOLS = \
SIMULATOR_CFLAGS = -isysroot "$(SIMULATORSDK)" -arch i686 -arch x86_64 leveldb_repair \
STATIC_OUTDIR=out-ios-universal perf_dump \
sst_rewrite \
sst_scan
PROGRAMS = db_bench $(TESTS) $(TOOLS)
BENCHMARKS = db_bench_sqlite3 db_bench_tree_db
LIBRARY = libleveldb.a
MEMENVLIBRARY = libmemenv.a
#
# static link leveldb to tools to simplify platform usage (if Linux)
#
ifeq ($(PLATFORM),OS_LINUX)
LEVEL_LDFLAGS := -L . -Wl,-non_shared -lleveldb -Wl,-call_shared
else else
STATIC_OUTDIR=out-static LEVEL_LDFLAGS := -L . -lleveldb
SHARED_OUTDIR=out-shared
STATIC_PROGRAMS := $(addprefix $(STATIC_OUTDIR)/, $(PROGNAMES))
SHARED_PROGRAMS := $(addprefix $(SHARED_OUTDIR)/, db_bench)
endif endif
STATIC_LIBOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(SOURCES:.cc=.o))
STATIC_MEMENVOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
DEVICE_LIBOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(SOURCES:.cc=.o))
DEVICE_MEMENVOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
SIMULATOR_LIBOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(SOURCES:.cc=.o))
SIMULATOR_MEMENVOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
SHARED_LIBOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(SOURCES:.cc=.o))
SHARED_MEMENVOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
TESTUTIL := $(STATIC_OUTDIR)/util/testutil.o
TESTHARNESS := $(STATIC_OUTDIR)/util/testharness.o $(TESTUTIL)
STATIC_TESTOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(TESTS)))
STATIC_UTILOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(UTILS)))
STATIC_ALLOBJS := $(STATIC_LIBOBJECTS) $(STATIC_MEMENVOBJECTS) $(STATIC_TESTOBJS) $(STATIC_UTILOBJS) $(TESTHARNESS)
DEVICE_ALLOBJS := $(DEVICE_LIBOBJECTS) $(DEVICE_MEMENVOBJECTS)
SIMULATOR_ALLOBJS := $(SIMULATOR_LIBOBJECTS) $(SIMULATOR_MEMENVOBJECTS)
default: all default: all
# Should we build shared libraries? # Should we build shared libraries?
ifneq ($(PLATFORM_SHARED_EXT),) ifneq ($(PLATFORM_SHARED_EXT),)
# Many leveldb test apps use non-exported API's. Only build a subset for testing.
SHARED_ALLOBJS := $(SHARED_LIBOBJECTS) $(SHARED_MEMENVOBJECTS) $(TESTHARNESS)
ifneq ($(PLATFORM_SHARED_VERSIONED),true) ifneq ($(PLATFORM_SHARED_VERSIONED),true)
SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED_LIB2 = $(SHARED_LIB1) SHARED2 = $(SHARED1)
SHARED_LIB3 = $(SHARED_LIB1) SHARED3 = $(SHARED1)
SHARED_LIBS = $(SHARED_LIB1) SHARED = $(SHARED1)
SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a
else else
# Update db.h if you change these. # Update db.h if you change these.
SHARED_VERSION_MAJOR = 1 SHARED_MAJOR = 1
SHARED_VERSION_MINOR = 20 SHARED_MINOR = 9
SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED_LIB2 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR) SHARED2 = $(SHARED1).$(SHARED_MAJOR)
SHARED_LIB3 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR).$(SHARED_VERSION_MINOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
SHARED_LIBS = $(SHARED_OUTDIR)/$(SHARED_LIB1) $(SHARED_OUTDIR)/$(SHARED_LIB2) $(SHARED_OUTDIR)/$(SHARED_LIB3) SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
$(SHARED_OUTDIR)/$(SHARED_LIB1): $(SHARED_OUTDIR)/$(SHARED_LIB3) $(SHARED1): $(SHARED3)
ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB1) ln -fs $(SHARED3) $(SHARED1)
$(SHARED_OUTDIR)/$(SHARED_LIB2): $(SHARED_OUTDIR)/$(SHARED_LIB3) $(SHARED2): $(SHARED3)
ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB2) ln -fs $(SHARED3) $(SHARED2)
SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a
endif endif
$(SHARED_OUTDIR)/$(SHARED_LIB3): $(SHARED_LIBOBJECTS) $(SHARED3): $(LIBOBJECTS)
$(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED_LIB2) $(SHARED_LIBOBJECTS) -o $(SHARED_OUTDIR)/$(SHARED_LIB3) $(LIBS) $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LIBOBJECTS) -o $(SHARED3) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2)
endif # PLATFORM_SHARED_EXT endif # PLATFORM_SHARED_EXT
all: $(SHARED_LIBS) $(SHARED_PROGRAMS) $(STATIC_OUTDIR)/libleveldb.a $(STATIC_OUTDIR)/libmemenv.a $(STATIC_PROGRAMS) all: $(SHARED) $(LIBRARY)
check: $(STATIC_PROGRAMS) test check: all $(PROGRAMS) $(TESTS)
for t in $(notdir $(TESTS)); do echo "***** Running $$t"; $(STATIC_OUTDIR)/$$t || exit 1; done for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
clean: tools: all $(TOOLS)
-rm -rf out-static out-shared out-ios-x86 out-ios-arm out-ios-universal
-rm -f build_config.mk
-rm -rf ios-x86 ios-arm
$(STATIC_OUTDIR): #
mkdir $@ # command line targets: debug and prof
# just like
$(STATIC_OUTDIR)/db: | $(STATIC_OUTDIR) ifneq ($(filter debug,$(MAKECMDGOALS)),)
mkdir $@ OPT := -g2 # (B) Debug mode, w/ full line-level debugging symbols
debug: all
$(STATIC_OUTDIR)/helpers/memenv: | $(STATIC_OUTDIR)
mkdir -p $@
$(STATIC_OUTDIR)/port: | $(STATIC_OUTDIR)
mkdir $@
$(STATIC_OUTDIR)/table: | $(STATIC_OUTDIR)
mkdir $@
$(STATIC_OUTDIR)/util: | $(STATIC_OUTDIR)
mkdir $@
.PHONY: STATIC_OBJDIRS
STATIC_OBJDIRS: \
$(STATIC_OUTDIR)/db \
$(STATIC_OUTDIR)/port \
$(STATIC_OUTDIR)/table \
$(STATIC_OUTDIR)/util \
$(STATIC_OUTDIR)/helpers/memenv
$(SHARED_OUTDIR):
mkdir $@
$(SHARED_OUTDIR)/db: | $(SHARED_OUTDIR)
mkdir $@
$(SHARED_OUTDIR)/helpers/memenv: | $(SHARED_OUTDIR)
mkdir -p $@
$(SHARED_OUTDIR)/port: | $(SHARED_OUTDIR)
mkdir $@
$(SHARED_OUTDIR)/table: | $(SHARED_OUTDIR)
mkdir $@
$(SHARED_OUTDIR)/util: | $(SHARED_OUTDIR)
mkdir $@
.PHONY: SHARED_OBJDIRS
SHARED_OBJDIRS: \
$(SHARED_OUTDIR)/db \
$(SHARED_OUTDIR)/port \
$(SHARED_OUTDIR)/table \
$(SHARED_OUTDIR)/util \
$(SHARED_OUTDIR)/helpers/memenv
$(DEVICE_OUTDIR):
mkdir $@
$(DEVICE_OUTDIR)/db: | $(DEVICE_OUTDIR)
mkdir $@
$(DEVICE_OUTDIR)/helpers/memenv: | $(DEVICE_OUTDIR)
mkdir -p $@
$(DEVICE_OUTDIR)/port: | $(DEVICE_OUTDIR)
mkdir $@
$(DEVICE_OUTDIR)/table: | $(DEVICE_OUTDIR)
mkdir $@
$(DEVICE_OUTDIR)/util: | $(DEVICE_OUTDIR)
mkdir $@
.PHONY: DEVICE_OBJDIRS
DEVICE_OBJDIRS: \
$(DEVICE_OUTDIR)/db \
$(DEVICE_OUTDIR)/port \
$(DEVICE_OUTDIR)/table \
$(DEVICE_OUTDIR)/util \
$(DEVICE_OUTDIR)/helpers/memenv
$(SIMULATOR_OUTDIR):
mkdir $@
$(SIMULATOR_OUTDIR)/db: | $(SIMULATOR_OUTDIR)
mkdir $@
$(SIMULATOR_OUTDIR)/helpers/memenv: | $(SIMULATOR_OUTDIR)
mkdir -p $@
$(SIMULATOR_OUTDIR)/port: | $(SIMULATOR_OUTDIR)
mkdir $@
$(SIMULATOR_OUTDIR)/table: | $(SIMULATOR_OUTDIR)
mkdir $@
$(SIMULATOR_OUTDIR)/util: | $(SIMULATOR_OUTDIR)
mkdir $@
.PHONY: SIMULATOR_OBJDIRS
SIMULATOR_OBJDIRS: \
$(SIMULATOR_OUTDIR)/db \
$(SIMULATOR_OUTDIR)/port \
$(SIMULATOR_OUTDIR)/table \
$(SIMULATOR_OUTDIR)/util \
$(SIMULATOR_OUTDIR)/helpers/memenv
$(STATIC_ALLOBJS): | STATIC_OBJDIRS
$(DEVICE_ALLOBJS): | DEVICE_OBJDIRS
$(SIMULATOR_ALLOBJS): | SIMULATOR_OBJDIRS
$(SHARED_ALLOBJS): | SHARED_OBJDIRS
ifeq ($(PLATFORM), IOS)
$(DEVICE_OUTDIR)/libleveldb.a: $(DEVICE_LIBOBJECTS)
rm -f $@
$(AR) -rs $@ $(DEVICE_LIBOBJECTS)
$(SIMULATOR_OUTDIR)/libleveldb.a: $(SIMULATOR_LIBOBJECTS)
rm -f $@
$(AR) -rs $@ $(SIMULATOR_LIBOBJECTS)
$(DEVICE_OUTDIR)/libmemenv.a: $(DEVICE_MEMENVOBJECTS)
rm -f $@
$(AR) -rs $@ $(DEVICE_MEMENVOBJECTS)
$(SIMULATOR_OUTDIR)/libmemenv.a: $(SIMULATOR_MEMENVOBJECTS)
rm -f $@
$(AR) -rs $@ $(SIMULATOR_MEMENVOBJECTS)
# For iOS, create universal object libraries to be used on both the simulator and
# a device.
$(STATIC_OUTDIR)/libleveldb.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a
lipo -create $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a -output $@
$(STATIC_OUTDIR)/libmemenv.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a
lipo -create $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a -output $@
else
$(STATIC_OUTDIR)/libleveldb.a:$(STATIC_LIBOBJECTS)
rm -f $@
$(AR) -rs $@ $(STATIC_LIBOBJECTS)
$(STATIC_OUTDIR)/libmemenv.a:$(STATIC_MEMENVOBJECTS)
rm -f $@
$(AR) -rs $@ $(STATIC_MEMENVOBJECTS)
endif endif
$(SHARED_MEMENVLIB):$(SHARED_MEMENVOBJECTS) ifneq ($(filter prof,$(MAKECMDGOALS)),)
OPT := -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
prof: all
endif
clean:
-rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) */*.o */*/*.o */*.d */*/*.d ios-x86/*/*.o ios-arm/*/*.o build_config.mk include/leveldb/ldb_config.h
-rm -rf ios-x86/* ios-arm/* *.dSYM
$(LIBRARY): $(LIBOBJECTS)
rm -f $@ rm -f $@
$(AR) -rs $@ $(SHARED_MEMENVOBJECTS) $(AR) -rs $@ $(LIBOBJECTS)
$(STATIC_OUTDIR)/db_bench:db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) #
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ $(LIBS) # all tools, programs, and tests depend upon the static library
$(TESTS) $(PROGRAMS) $(TOOLS) : $(LIBRARY)
$(STATIC_OUTDIR)/db_bench_sqlite3:doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) #
$(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lsqlite3 $(LIBS) # all tests depend upon the test harness
$(TESTS) : $(TESTHARNESS)
$(STATIC_OUTDIR)/db_bench_tree_db:doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) #
$(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lkyotocabinet $(LIBS) # tools, programs, and tests will compile to the root directory
# but their .cc source file will be in one of the following subdirectories
vpath %.cc db:table:util:leveldb_ee:leveldb_os
$(STATIC_OUTDIR)/leveldbutil:db/leveldbutil.cc $(STATIC_LIBOBJECTS) # special case for c_test
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/leveldbutil.cc $(STATIC_LIBOBJECTS) -o $@ $(LIBS) vpath %.c db
$(STATIC_OUTDIR)/arena_test:util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) db_bench: db/db_bench.o $(LIBRARY) $(TESTUTIL)
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTUTIL) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
$(STATIC_OUTDIR)/autocompact_test:db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBRARY) $(TESTUTIL)
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/bloom_test:util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBRARY) $(TESTUTIL)
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/c_test:$(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/cache_test:util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) #
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) # build line taken from lz4 makefile
#
util/lz4.o: util/lz4.c util/lz4.h
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -O3 -std=c99 -Wall -Wextra -Wundef -Wshadow -Wcast-qual -Wcast-align -Wstrict-prototypes -pedantic -DLZ4_VERSION=\"r130\" -c util/lz4.c -o util/lz4.o
$(STATIC_OUTDIR)/coding_test:util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) #
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) # memory env
#
$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
rm -f $@
$(AR) -rs $@ $(MEMENVOBJECTS)
$(STATIC_OUTDIR)/corruption_test:db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) $(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS)
$(STATIC_OUTDIR)/crc32c_test:util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) #
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) # IOS build
#
ifeq ($(PLATFORM), IOS)
# For iOS, create universal object files to be used on both the simulator and
# a device.
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
$(STATIC_OUTDIR)/db_test:db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) .cc.o:
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) mkdir -p ios-x86/$(dir $@)
$(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@)
$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@
$(STATIC_OUTDIR)/dbformat_test:db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) .c.o:
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) mkdir -p ios-x86/$(dir $@)
$(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@)
$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@
$(STATIC_OUTDIR)/env_posix_test:util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) else
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) #
# build for everything NOT IOS
$(STATIC_OUTDIR)/env_test:util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) #
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) .cc.o:
$(STATIC_OUTDIR)/fault_injection_test:db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/filename_test:db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/filter_block_test:table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/hash_test:util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/issue178_test:issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/issue200_test:issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/log_test:db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/recovery_test:db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/table_test:table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/skiplist_test:db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/version_edit_test:db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/version_set_test:db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/write_batch_test:db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
$(STATIC_OUTDIR)/memenv_test:$(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS)
$(XCRUN) $(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS) -o $@ $(LIBS)
$(SHARED_OUTDIR)/db_bench:$(SHARED_OUTDIR)/db/db_bench.o $(SHARED_LIBS) $(TESTUTIL)
$(XCRUN) $(CXX) $(LDFLAGS) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SHARED_OUTDIR)/db/db_bench.o $(TESTUTIL) $(SHARED_OUTDIR)/$(SHARED_LIB3) -o $@ $(LIBS)
.PHONY: run-shared
run-shared: $(SHARED_OUTDIR)/db_bench
LD_LIBRARY_PATH=$(SHARED_OUTDIR) $(SHARED_OUTDIR)/db_bench
$(SIMULATOR_OUTDIR)/%.o: %.cc
xcrun -sdk iphonesimulator $(CXX) $(CXXFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@
$(DEVICE_OUTDIR)/%.o: %.cc
xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) $(DEVICE_CFLAGS) -c $< -o $@
$(SIMULATOR_OUTDIR)/%.o: %.c
xcrun -sdk iphonesimulator $(CC) $(CFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@
$(DEVICE_OUTDIR)/%.o: %.c
xcrun -sdk iphoneos $(CC) $(CFLAGS) $(DEVICE_CFLAGS) -c $< -o $@
$(STATIC_OUTDIR)/%.o: %.cc
$(CXX) $(CXXFLAGS) -c $< -o $@
$(STATIC_OUTDIR)/%.o: %.c
$(CC) $(CFLAGS) -c $< -o $@
$(SHARED_OUTDIR)/%.o: %.cc
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
$(SHARED_OUTDIR)/%.o: %.c .c.o:
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc ## @echo -- Creating dependency file for $<
$(CXX) $(CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@ %.d: %.cc
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM -E -MT $(basename $@).d -MT $(basename $@).o -MF $@ $<
@echo $(basename $@).o: $(basename $@).d >>$@
$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc # generic build for command line tests
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@ %: %.cc
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
%: db/%.c
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
# for tools, omits test harness
%: tools/%.cc
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
endif
#
# load dependency files
#
ifeq ($(filter tar clean allclean distclean,$(MAKECMDGOALS)),)
-include $(DEPEND)
endif

83
src/leveldb/README Normal file
View file

@ -0,0 +1,83 @@
leveldb: A key-value store
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
The original Google README is now README.GOOGLE.
** Introduction
This repository contains the Google source code as modified to benefit
the Riak environment. The typical Riak environment has two attributes
that necessitate leveldb adjustments, both in options and code:
- production servers: Riak often runs in heavy Internet environments:
servers with many CPU cores, lots of memory, and 24x7 disk activity.
Basho's leveldb takes advantage of the environment by adding
hardware CRC calculation, increasing Bloom filter accuracy, and
defaulting to integrity checking enabled.
- multiple databases open: Riak opens 8 to 128 databases
simultaneously. Google's leveldb supports this, but its background
compaction thread can fall behind. leveldb will "stall" new user
writes whenever the compaction thread gets too far behind. Basho's
leveldb modifications include multiple thread blocks that each
contain prioritized threads for specific compaction activities.
Details for Basho's customizations exist in the leveldb wiki:
http://github.com/basho/leveldb/wiki
** Branch pattern
This repository follows the Basho standard for branch management
as of November 28, 2013. The standard is found here:
https://github.com/basho/riak/wiki/Basho-repository-management
In summary, the "develop" branch contains the most recently reviewed
engineering work. The "master" branch contains the most recently
released work, i.e. distributed as part of a Riak release.
** Basic options needed
Those wishing to truly savor the benefits of Basho's modifications
need to initialize a new leveldb::Options structure similar to the
following before each call to leveldb::DB::Open:
leveldb::Options * options;
options=new leveldb::Options;
options->filter_policy=leveldb::NewBloomFilterPolicy2(16);
options->write_buffer_size=62914560; // 60Mbytes
options->total_leveldb_mem=2684354560; // 2.5Gbytes (details below)
options->env=leveldb::Env::Default();
** Memory plan
Basho's leveldb dramatically departed from Google's original internal
memory allotment plan with Riak 2.0. Basho's leveldb uses a methodology
called flexcache. The technical details are here:
https://github.com/basho/leveldb/wiki/mv-flexcache
The key points are:
- options.total_leveldb_mem is an allocation for the entire process,
not a single database
- giving different values to options.total_leveldb_mem on subsequent Open
calls causes memory to rearrange to current value across all databases
- recommended minimum for Basho's leveldb is 340Mbytes per database.
- performance improves rapidly from 340Mbytes to 2.5Gbytes per database (3.0Gbytes
if using Riak's active anti-entropy). Even more is nice, but not as helpful.
- never assign more than 75% of available RAM to total_leveldb_mem. There is
too much unaccounted memory overhead (worse if you use tcmalloc library).
- options.max_open_files and options.block_cache should not be used.

51
src/leveldb/README.GOOGLE Normal file
View file

@ -0,0 +1,51 @@
leveldb: A key-value store
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
The code under this directory implements a system for maintaining a
persistent key/value store.
See doc/index.html for more explanation.
See doc/impl.html for a brief overview of the implementation.
The public interface is in include/*.h. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.
Guide to header files:
include/db.h
Main interface to the DB: Start here
include/options.h
Control over the behavior of an entire database, and also
control over the behavior of individual reads and writes.
include/comparator.h
Abstraction for user-specified comparison function. If you want
just bytewise comparison of keys, you can use the default comparator,
but clients can write their own comparator implementations if they
want custom ordering (e.g. to handle different character
encodings, etc.)
include/iterator.h
Interface for iterating over data. You can get an iterator
from a DB object.
include/write_batch.h
Interface for atomically applying multiple updates to a database.
include/slice.h
A simple module for maintaining a pointer and a length into some
other byte array.
include/status.h
Status is returned from many of the public interfaces and is used
to report success and various kinds of errors.
include/env.h
Abstraction of the OS environment. A posix implementation of
this interface is in util/env_posix.cc
include/table.h
include/table_builder.h
Lower-level modules that most clients probably won't use directly

View file

@ -1,174 +0,0 @@
**LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.**
[![Build Status](https://travis-ci.org/google/leveldb.svg?branch=master)](https://travis-ci.org/google/leveldb)
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
# Features
* Keys and values are arbitrary byte arrays.
* Data is stored sorted by key.
* Callers can provide a custom comparison function to override the sort order.
* The basic operations are `Put(key,value)`, `Get(key)`, `Delete(key)`.
* Multiple changes can be made in one atomic batch.
* Users can create a transient snapshot to get a consistent view of data.
* Forward and backward iteration is supported over the data.
* Data is automatically compressed using the [Snappy compression library](http://google.github.io/snappy/).
* External activity (file system operations etc.) is relayed through a virtual interface so users can customize the operating system interactions.
# Documentation
[LevelDB library documentation](https://github.com/google/leveldb/blob/master/doc/index.md) is online and bundled with the source code.
# Limitations
* This is not a SQL database. It does not have a relational data model, it does not support SQL queries, and it has no support for indexes.
* Only a single process (possibly multi-threaded) can access a particular database at a time.
* There is no client-server support builtin to the library. An application that needs such support will have to wrap their own server around the library.
# Contributing to the leveldb Project
The leveldb project welcomes contributions. leveldb's primary goal is to be
a reliable and fast key/value store. Changes that are in line with the
features/limitations outlined above, and meet the requirements below,
will be considered.
Contribution requirements:
1. **POSIX only**. We _generally_ will only accept changes that are both
compiled, and tested on a POSIX platform - usually Linux. Very small
changes will sometimes be accepted, but consider that more of an
exception than the rule.
2. **Stable API**. We strive very hard to maintain a stable API. Changes that
require changes for projects using leveldb _might_ be rejected without
sufficient benefit to the project.
3. **Tests**: All changes must be accompanied by a new (or changed) test, or
a sufficient explanation as to why a new (or changed) test is not required.
## Submitting a Pull Request
Before any pull request will be accepted the author must first sign a
Contributor License Agreement (CLA) at https://cla.developers.google.com/.
In order to keep the commit timeline linear
[squash](https://git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Squashing-Commits)
your changes down to a single commit and [rebase](https://git-scm.com/docs/git-rebase)
on google/leveldb/master. This keeps the commit timeline linear and more easily sync'ed
with the internal repository at Google. More information at GitHub's
[About Git rebase](https://help.github.com/articles/about-git-rebase/) page.
# Performance
Here is a performance report (with explanations) from the run of the
included db_bench program. The results are somewhat noisy, but should
be enough to get a ballpark performance estimate.
## Setup
We use a database with a million entries. Each entry has a 16 byte
key, and a 100 byte value. Values used by the benchmark compress to
about half their original size.
LevelDB: version 1.1
Date: Sun May 1 12:11:26 2011
CPU: 4 x Intel(R) Core(TM)2 Quad CPU Q6600 @ 2.40GHz
CPUCache: 4096 KB
Keys: 16 bytes each
Values: 100 bytes each (50 bytes after compression)
Entries: 1000000
Raw Size: 110.6 MB (estimated)
File Size: 62.9 MB (estimated)
## Write performance
The "fill" benchmarks create a brand new database, in either
sequential, or random order. The "fillsync" benchmark flushes data
from the operating system to the disk after every operation; the other
write operations leave the data sitting in the operating system buffer
cache for a while. The "overwrite" benchmark does random writes that
update existing keys in the database.
fillseq : 1.765 micros/op; 62.7 MB/s
fillsync : 268.409 micros/op; 0.4 MB/s (10000 ops)
fillrandom : 2.460 micros/op; 45.0 MB/s
overwrite : 2.380 micros/op; 46.5 MB/s
Each "op" above corresponds to a write of a single key/value pair.
I.e., a random write benchmark goes at approximately 400,000 writes per second.
Each "fillsync" operation costs much less (0.3 millisecond)
than a disk seek (typically 10 milliseconds). We suspect that this is
because the hard disk itself is buffering the update in its memory and
responding before the data has been written to the platter. This may
or may not be safe based on whether or not the hard disk has enough
power to save its memory in the event of a power failure.
## Read performance
We list the performance of reading sequentially in both the forward
and reverse direction, and also the performance of a random lookup.
Note that the database created by the benchmark is quite small.
Therefore the report characterizes the performance of leveldb when the
working set fits in memory. The cost of reading a piece of data that
is not present in the operating system buffer cache will be dominated
by the one or two disk seeks needed to fetch the data from disk.
Write performance will be mostly unaffected by whether or not the
working set fits in memory.
readrandom : 16.677 micros/op; (approximately 60,000 reads per second)
readseq : 0.476 micros/op; 232.3 MB/s
readreverse : 0.724 micros/op; 152.9 MB/s
LevelDB compacts its underlying storage data in the background to
improve read performance. The results listed above were done
immediately after a lot of random writes. The results after
compactions (which are usually triggered automatically) are better.
readrandom : 11.602 micros/op; (approximately 85,000 reads per second)
readseq : 0.423 micros/op; 261.8 MB/s
readreverse : 0.663 micros/op; 166.9 MB/s
Some of the high cost of reads comes from repeated decompression of blocks
read from disk. If we give leveldb a cache large enough to hold the
uncompressed blocks in memory, the read performance improves again:
readrandom : 9.775 micros/op; (approximately 100,000 reads per second before compaction)
readrandom : 5.215 micros/op; (approximately 190,000 reads per second after compaction)
## Repository contents
See [doc/index.md](doc/index.md) for more explanation. See
[doc/impl.md](doc/impl.md) for a brief overview of the implementation.
The public interface is in include/*.h. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.
Guide to header files:
* **include/db.h**: Main interface to the DB: Start here
* **include/options.h**: Control over the behavior of an entire database,
and also control over the behavior of individual reads and writes.
* **include/comparator.h**: Abstraction for user-specified comparison function.
If you want just bytewise comparison of keys, you can use the default
comparator, but clients can write their own comparator implementations if they
want custom ordering (e.g. to handle different character encodings, etc.)
* **include/iterator.h**: Interface for iterating over data. You can get
an iterator from a DB object.
* **include/write_batch.h**: Interface for atomically applying multiple
updates to a database.
* **include/slice.h**: A simple module for maintaining a pointer and a
length into some other byte array.
* **include/status.h**: Status is returned from many of the public interfaces
and is used to report success and various kinds of errors.
* **include/env.h**:
Abstraction of the OS environment. A posix implementation of this interface is
in util/env_posix.cc
* **include/table.h, include/table_builder.h**: Lower-level modules that most
clients probably won't use directly

View file

@ -7,7 +7,6 @@ db
within [start_key..end_key]? For Chrome, deletion of obsolete within [start_key..end_key]? For Chrome, deletion of obsolete
object stores, etc. can be done in the background anyway, so object stores, etc. can be done in the background anyway, so
probably not that important. probably not that important.
- There have been requests for MultiGet.
After a range is completely deleted, what gets rid of the After a range is completely deleted, what gets rid of the
corresponding files if we do no future changes to that range. Make corresponding files if we do no future changes to that range. Make

View file

@ -1,39 +0,0 @@
# Building LevelDB On Windows
## Prereqs
Install the [Windows Software Development Kit version 7.1](http://www.microsoft.com/downloads/dlx/en-us/listdetailsview.aspx?FamilyID=6b6c21d2-2006-4afa-9702-529fa782d63b).
Download and extract the [Snappy source distribution](http://snappy.googlecode.com/files/snappy-1.0.5.tar.gz)
1. Open the "Windows SDK 7.1 Command Prompt" :
Start Menu -> "Microsoft Windows SDK v7.1" -> "Windows SDK 7.1 Command Prompt"
2. Change the directory to the leveldb project
## Building the Static lib
* 32 bit Version
setenv /x86
msbuild.exe /p:Configuration=Release /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5
* 64 bit Version
setenv /x64
msbuild.exe /p:Configuration=Release /p:Platform=x64 /p:Snappy=..\snappy-1.0.5
## Building and Running the Benchmark app
* 32 bit Version
setenv /x86
msbuild.exe /p:Configuration=Benchmark /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5
Benchmark\leveldb.exe
* 64 bit Version
setenv /x64
msbuild.exe /p:Configuration=Benchmark /p:Platform=x64 /p:Snappy=..\snappy-1.0.5
x64\Benchmark\leveldb.exe

View file

@ -7,11 +7,8 @@
# CC C Compiler path # CC C Compiler path
# CXX C++ Compiler path # CXX C++ Compiler path
# PLATFORM_LDFLAGS Linker flags # PLATFORM_LDFLAGS Linker flags
# PLATFORM_LIBS Libraries flags
# PLATFORM_SHARED_EXT Extension for shared libraries # PLATFORM_SHARED_EXT Extension for shared libraries
# PLATFORM_SHARED_LDFLAGS Flags for building shared library # PLATFORM_SHARED_LDFLAGS Flags for building shared library
# This flag is embedded just before the name
# of the shared library without intervening spaces
# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library # PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library
# PLATFORM_CCFLAGS C compiler flags # PLATFORM_CCFLAGS C compiler flags
# PLATFORM_CXXFLAGS C++ compiler flags. Will contain: # PLATFORM_CXXFLAGS C++ compiler flags. Will contain:
@ -20,15 +17,14 @@
# #
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following: # The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
# #
# -DLEVELDB_ATOMIC_PRESENT if <atomic> is present # -DLEVELDB_CSTDATOMIC_PRESENT if <cstdatomic> is present
# -DLEVELDB_PLATFORM_POSIX for Posix-based platforms # -DLEVELDB_PLATFORM_POSIX for Posix-based platforms
# -DSNAPPY if the Snappy library is present # -DSNAPPY if the Snappy library is present
# #
OUTPUT=$1 OUTPUT=$1
PREFIX=$2 if test -z "$OUTPUT"; then
if test -z "$OUTPUT" || test -z "$PREFIX"; then echo "usage: $0 <output-filename>" >&2
echo "usage: $0 <output-filename> <directory_prefix>" >&2
exit 1 exit 1
fi fi
@ -44,10 +40,6 @@ if test -z "$CXX"; then
CXX=g++ CXX=g++
fi fi
if test -z "$TMPDIR"; then
TMPDIR=/tmp
fi
# Detect OS # Detect OS
if test -z "$TARGET_OS"; then if test -z "$TARGET_OS"; then
TARGET_OS=`uname -s` TARGET_OS=`uname -s`
@ -58,119 +50,77 @@ CROSS_COMPILE=
PLATFORM_CCFLAGS= PLATFORM_CCFLAGS=
PLATFORM_CXXFLAGS= PLATFORM_CXXFLAGS=
PLATFORM_LDFLAGS= PLATFORM_LDFLAGS=
PLATFORM_LIBS= PLATFORM_SHARED_EXT=
PLATFORM_SHARED_EXT="so"
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
PLATFORM_SHARED_CFLAGS="-fPIC" PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=true PLATFORM_SHARED_VERSIONED=true
PLATFORM_SSEFLAGS=
MEMCMP_FLAG= if test -n "$LEVELDB_VSN"; then
if [ "$CXX" = "g++" ]; then VERSION_FLAGS="$VERSION_FLAGS -DLEVELDB_VSN=\"$LEVELDB_VSN\""
# Use libc's memcmp instead of GCC's memcmp. This results in ~40%
# performance improvement on readrandom under gcc 4.4.3 on Linux/x86.
MEMCMP_FLAG="-fno-builtin-memcmp"
fi fi
# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
case "$TARGET_OS" in case "$TARGET_OS" in
CYGWIN_*)
PLATFORM=OS_LINUX
COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN"
PLATFORM_LDFLAGS="-lpthread"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;;
Darwin) Darwin)
PLATFORM=OS_MACOSX PLATFORM=OS_MACOSX
COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX" oIFS="$IFS"; IFS=.
PLATFORM_SHARED_EXT=dylib set `uname -r`
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd` IFS="$oIFS"
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/" if [ "$1" -ge 13 ]; then
# assume clang compiler
COMMON_FLAGS="-mmacosx-version-min=10.8 -DOS_MACOSX -stdlib=libc++"
PLATFORM_LDFLAGS="-mmacosx-version-min=10.8"
else
COMMON_FLAGS="-fno-builtin-memcmp -DOS_MACOSX"
fi
PLATFORM_SHARED_EXT=
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
PORT_FILE=port/port_posix.cc PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;; ;;
Linux) Linux)
PLATFORM=OS_LINUX PLATFORM=OS_LINUX
COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX" COMMON_FLAGS="-fno-builtin-memcmp -pthread -DOS_LINUX"
PLATFORM_LDFLAGS="-pthread" PLATFORM_LDFLAGS="-pthread -lrt"
PORT_FILE=port/port_posix.cc PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;; ;;
SunOS) SunOS)
PLATFORM=OS_SOLARIS PLATFORM=OS_SOLARIS
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS" COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS -m64"
PLATFORM_LIBS="-lpthread -lrt" PLATFORM_LDFLAGS="-lpthread -lrt"
PLATFORM_SHARED_EXT=
PORT_FILE=port/port_posix.cc PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;; ;;
FreeBSD) FreeBSD)
CC=cc
CXX=c++
PLATFORM=OS_FREEBSD PLATFORM=OS_FREEBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD" COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
PLATFORM_LIBS="-lpthread" PLATFORM_LDFLAGS="-lpthread"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;;
GNU/kFreeBSD)
PLATFORM=OS_KFREEBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_KFREEBSD"
PLATFORM_LIBS="-lpthread"
PORT_FILE=port/port_posix.cc PORT_FILE=port/port_posix.cc
;; ;;
NetBSD) NetBSD)
PLATFORM=OS_NETBSD PLATFORM=OS_NETBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD" COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
PLATFORM_LIBS="-lpthread -lgcc_s" PLATFORM_LDFLAGS="-lpthread -lgcc_s"
PORT_FILE=port/port_posix.cc PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;; ;;
OpenBSD) OpenBSD)
PLATFORM=OS_OPENBSD PLATFORM=OS_OPENBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD" COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
PLATFORM_LDFLAGS="-pthread" PLATFORM_LDFLAGS="-pthread"
PORT_FILE=port/port_posix.cc PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;; ;;
DragonFly) DragonFly)
PLATFORM=OS_DRAGONFLYBSD PLATFORM=OS_DRAGONFLYBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD" COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
PLATFORM_LIBS="-lpthread" PLATFORM_LDFLAGS="-lpthread"
PORT_FILE=port/port_posix.cc PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;; ;;
OS_ANDROID_CROSSCOMPILE) OS_ANDROID_CROSSCOMPILE)
PLATFORM=OS_ANDROID PLATFORM=OS_ANDROID
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX" COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
PLATFORM_LDFLAGS="" # All pthread features are in the Android C library PLATFORM_LDFLAGS="" # All pthread features are in the Android C library
PORT_FILE=port/port_posix.cc PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
CROSS_COMPILE=true
;;
HP-UX)
PLATFORM=OS_HPUX
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
PLATFORM_LDFLAGS="-pthread"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
# man ld: +h internal_name
PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
;;
IOS)
PLATFORM=IOS
COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
PLATFORM_SHARED_EXT=
PLATFORM_SHARED_LDFLAGS=
PLATFORM_SHARED_CFLAGS=
PLATFORM_SHARED_VERSIONED=
;;
OS_WINDOWS_CROSSCOMPILE | NATIVE_WINDOWS)
PLATFORM=OS_WINDOWS
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_WINDOWS -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1"
PLATFORM_SOURCES="util/env_win.cc"
PLATFORM_LIBS="-lshlwapi"
PORT_FILE=port/port_win.cc
CROSS_COMPILE=true CROSS_COMPILE=true
;; ;;
*) *)
@ -182,78 +132,106 @@ esac
# except for the test and benchmark files. By default, find will output a list # except for the test and benchmark files. By default, find will output a list
# of all files matching either rule, so we need to append -print to make the # of all files matching either rule, so we need to append -print to make the
# prune take effect. # prune take effect.
DIRS="$PREFIX/db $PREFIX/util $PREFIX/table" if [ -f leveldb_ee/README.md ]; then
DIRS="util db table leveldb_ee"
else
DIRS="util db table leveldb_os"
fi
set -f # temporarily disable globbing so that our patterns aren't expanded set -f # temporarily disable globbing so that our patterns aren't expanded
PRUNE_TEST="-name *test*.cc -prune" PRUNE_TEST="-name *test*.cc -prune"
PRUNE_BENCH="-name *_bench.cc -prune" PRUNE_BENCH="-name *_bench.cc -prune"
PRUNE_TOOL="-name leveldbutil.cc -prune" PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_TOOL -o -name '*.cc' -print | sort | sed "s,^$PREFIX/,," | tr "\n" " "` TESTS=`find $DIRS -name '*_test.c*' -print | sort | tr "\n" " "`
set +f # re-enable globbing set +f # re-enable globbing
# The sources consist of the portable files, plus the platform-specific port # The sources consist of the portable files, plus the platform-specific port
# file. # file.
echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
echo "TEST_SOURCES=$TESTS" >>$OUTPUT
if [ "$CROSS_COMPILE" = "true" ]; then if [ "$CROSS_COMPILE" = "true" ]; then
# Cross-compiling; do not try any compilation tests. # Cross-compiling; do not try any compilation tests.
true true
else else
CXXOUTPUT="${TMPDIR}/leveldb_build_detect_platform-cxx.$$" # If -std=c++0x works, use <cstdatomic>. Otherwise use port_posix.h.
$CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF
# If -std=c++0x works, use <atomic> as fallback for when memory barriers #include <cstdatomic>
# are not available.
$CXX $CXXFLAGS -std=c++0x -x c++ - -o $CXXOUTPUT 2>/dev/null <<EOF
#include <atomic>
int main() {} int main() {}
EOF EOF
if [ "$?" = 0 ]; then if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_ATOMIC_PRESENT" COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT"
PLATFORM_CXXFLAGS="-std=c++0x" PLATFORM_CXXFLAGS="-std=c++0x"
else else
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX" COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
fi fi
# Test whether Snappy library is installed
# http://code.google.com/p/snappy/
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <snappy.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
if [ "$PLATFORM" = "OS_LINUX" ]; then
# Basho: switching to static snappy library to make tools more portable
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -Wl,-non_shared -lsnappy -Wl,-call_shared"
else
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
fi
fi
# Test whether tcmalloc is available # Test whether tcmalloc is available
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -ltcmalloc 2>/dev/null <<EOF $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
int main() {} int main() {}
EOF EOF
if [ "$?" = 0 ]; then if [ "$?" = 0 ]; then
PLATFORM_LIBS="$PLATFORM_LIBS -ltcmalloc" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
fi fi
rm -f $CXXOUTPUT 2>/dev/null
# Test if gcc SSE 4.2 is supported
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null <<EOF
int main() {}
EOF
if [ "$?" = 0 ]; then
PLATFORM_SSEFLAGS="-msse4.2"
fi
rm -f $CXXOUTPUT 2>/dev/null
fi fi
# Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them. PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS $VERSION_FLAGS"
if [ -n "$PLATFORM_SSEFLAGS" ]; then PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS $VERSION_FLAGS"
PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE"
fi
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
echo "CC=$CC" >> $OUTPUT echo "CC=$CC" >> $OUTPUT
echo "CXX=$CXX" >> $OUTPUT echo "CXX=$CXX" >> $OUTPUT
echo "PLATFORM=$PLATFORM" >> $OUTPUT echo "PLATFORM=$PLATFORM" >> $OUTPUT
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
#
# Basho extension to place -D variable in include/leveldb/ldb_config.h
#
LDB_CONFIG="include/leveldb/ldb_config.h"
# Delete existing output, if it exists
rm -f $LDB_CONFIG
write_config_h()
{
for param in $@
do
prefix=$(expr -- $param : "\(..\)")
if [ X$prefix = "X-D" ]
then
echo "" >>$LDB_CONFIG
echo "#ifndef $(expr -- $param : '..\(.*\)')" >>$LDB_CONFIG
echo " #define $(expr -- $param : '..\(.*\)')" >>$LDB_CONFIG
echo "#endif" >>$LDB_CONFIG
fi
done
}
echo "/** This file is generated by build_detect_platform." >$LDB_CONFIG
echo " * It saves the state of compile flags. This benefits the reuse" >>$LDB_CONFIG
echo " * of internal include files outside of a leveldb build." >>$LDB_CONFIG
echo " */" >>$LDB_CONFIG
write_config_h $COMMON_FLAGS

View file

@ -1,118 +0,0 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/db.h"
#include "db/db_impl.h"
#include "leveldb/cache.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
class AutoCompactTest {
public:
std::string dbname_;
Cache* tiny_cache_;
Options options_;
DB* db_;
AutoCompactTest() {
dbname_ = test::TmpDir() + "/autocompact_test";
tiny_cache_ = NewLRUCache(100);
options_.block_cache = tiny_cache_;
DestroyDB(dbname_, options_);
options_.create_if_missing = true;
options_.compression = kNoCompression;
ASSERT_OK(DB::Open(options_, dbname_, &db_));
}
~AutoCompactTest() {
delete db_;
DestroyDB(dbname_, Options());
delete tiny_cache_;
}
std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key%06d", i);
return std::string(buf);
}
uint64_t Size(const Slice& start, const Slice& limit) {
Range r(start, limit);
uint64_t size;
db_->GetApproximateSizes(&r, 1, &size);
return size;
}
void DoReads(int n);
};
// Length in bytes of every value written by the test (200 KiB).
static const int kValueSize = 200 * 1024;
// Total amount of value data written across all keys (100 MiB).
static const int kTotalSize = 100 * 1024 * 1024;
// Number of keys written: how many kValueSize values make up kTotalSize.
static const int kCount = kTotalSize / kValueSize;
// Read through the first n keys repeatedly and check that they get
// compacted (verified by checking the size of the key space).
//
// n is the number of leading keys, Key(0) up to Key(n), scanned on each
// read pass. The remaining keys, Key(n) up to Key(kCount), are never read
// and serve as a control: their approximate size must stay roughly the same.
void AutoCompactTest::DoReads(int n) {
  std::string value(kValueSize, 'x');
  // Downcast to reach the TEST_ hooks. static_cast is the correct cast for a
  // base-to-derived pointer conversion; DBImpl (db/db_impl.h) is expected to
  // derive from DB.
  DBImpl* dbi = static_cast<DBImpl*>(db_);

  // Fill database
  for (int i = 0; i < kCount; i++) {
    ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
  }
  ASSERT_OK(dbi->TEST_CompactMemTable());

  // Delete everything
  for (int i = 0; i < kCount; i++) {
    ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
  }
  ASSERT_OK(dbi->TEST_CompactMemTable());

  // Get initial measurement of the space we will be reading.
  // initial_size is only ever compared against another uint64_t, so it is
  // kept unsigned to avoid a mixed signed/unsigned comparison.
  const uint64_t initial_size = Size(Key(0), Key(n));
  // Kept signed on purpose: the lower-bound check below subtracts 1 MiB and
  // must be allowed to go negative when this range is small or empty.
  const int64_t initial_other_size = Size(Key(n), Key(kCount));

  // Read until size drops significantly.
  std::string limit_key = Key(n);
  for (int read = 0; true; read++) {
    ASSERT_LT(read, 100) << "Taking too long to compact";
    Iterator* iter = db_->NewIterator(ReadOptions());
    for (iter->SeekToFirst();
         iter->Valid() && iter->key().ToString() < limit_key;
         iter->Next()) {
      // Drop data
    }
    delete iter;

    // Wait a little bit (one second) to allow any triggered compactions to
    // complete.
    Env::Default()->SleepForMicroseconds(1000000);

    uint64_t size = Size(Key(0), Key(n));
    fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n",
            read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0);
    // Done once the scanned range has shrunk to a tenth of its initial size.
    if (size <= initial_size/10) {
      break;
    }
  }

  // Verify that the size of the key space not touched by the reads
  // is pretty much unchanged (at most 1 MiB larger, and no smaller than a
  // fifth of the initial size minus 1 MiB).
  const int64_t final_other_size = Size(Key(n), Key(kCount));
  ASSERT_LE(final_other_size, initial_other_size + 1048576);
  ASSERT_GE(final_other_size, initial_other_size/5 - 1048576);
}
// Runs DoReads with every read pass scanning all kCount keys.
TEST(AutoCompactTest, ReadAll) {
DoReads(kCount);
}
// Runs DoReads with every read pass scanning only the first half of the keys;
// the second half is the untouched range whose size is checked for stability.
TEST(AutoCompactTest, ReadHalf) {
DoReads(kCount/2);
}
} // namespace leveldb
// Entry point of the test binary: delegates to leveldb::test::RunAllTests()
// and returns its result as the exit status. argc and argv are not used.
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

View file

@ -2,12 +2,16 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include "db/builder.h" #include "db/builder.h"
#include "db/filename.h" #include "db/filename.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/table_cache.h" #include "db/table_cache.h"
#include "db/version_edit.h" #include "db/version_edit.h"
#include "db/version_set.h"
#include "leveldb/db.h" #include "leveldb/db.h"
#include "leveldb/env.h" #include "leveldb/env.h"
#include "leveldb/iterator.h" #include "leveldb/iterator.h"
@ -17,27 +21,51 @@ namespace leveldb {
Status BuildTable(const std::string& dbname, Status BuildTable(const std::string& dbname,
Env* env, Env* env,
const Options& options, const Options& options,
const Comparator * user_comparator,
TableCache* table_cache, TableCache* table_cache,
Iterator* iter, Iterator* iter,
FileMetaData* meta) { FileMetaData* meta,
SequenceNumber smallest_snapshot) {
Status s; Status s;
size_t keys_seen, keys_retired;
keys_seen=0;
keys_retired=0;
meta->file_size = 0; meta->file_size = 0;
iter->SeekToFirst(); iter->SeekToFirst();
std::string fname = TableFileName(dbname, meta->number); KeyRetirement retire(user_comparator, smallest_snapshot, &options);
std::string fname = TableFileName(options, meta->number, meta->level);
if (iter->Valid()) { if (iter->Valid()) {
WritableFile* file; WritableFile* file;
s = env->NewWritableFile(fname, &file);
s = env->NewWritableFile(fname, &file,
env->RecoveryMmapSize(&options));
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
// tune fadvise to keep all of this lower level file in page cache
// (compaction of unsorted files causes severe cache misses)
file->SetMetadataOffset(1);
TableBuilder* builder = new TableBuilder(options, file); TableBuilder* builder = new TableBuilder(options, file);
meta->smallest.DecodeFrom(iter->key()); meta->smallest.DecodeFrom(iter->key());
for (; iter->Valid(); iter->Next()) { for (; iter->Valid(); iter->Next()) {
++keys_seen;
Slice key = iter->key(); Slice key = iter->key();
if (!retire(key))
{
meta->largest.DecodeFrom(key); meta->largest.DecodeFrom(key);
builder->Add(key, iter->value()); builder->Add(key, iter->value());
++meta->num_entries;
} // if
else
{
++keys_retired;
} // else
} }
// Finish and check for builder errors // Finish and check for builder errors
@ -45,6 +73,9 @@ Status BuildTable(const std::string& dbname,
s = builder->Finish(); s = builder->Finish();
if (s.ok()) { if (s.ok()) {
meta->file_size = builder->FileSize(); meta->file_size = builder->FileSize();
meta->exp_write_low = builder->GetExpiryWriteLow();
meta->exp_write_high = builder->GetExpiryWriteHigh();
meta->exp_explicit_high = builder->GetExpiryExplicitHigh();
assert(meta->file_size > 0); assert(meta->file_size > 0);
} }
} else { } else {
@ -64,10 +95,20 @@ Status BuildTable(const std::string& dbname,
if (s.ok()) { if (s.ok()) {
// Verify that the table is usable // Verify that the table is usable
Table * table_ptr;
Iterator* it = table_cache->NewIterator(ReadOptions(), Iterator* it = table_cache->NewIterator(ReadOptions(),
meta->number, meta->number,
meta->file_size); meta->file_size,
meta->level,
&table_ptr);
s = it->status(); s = it->status();
// Riak specific: bloom filter is no longer read by default,
// force read on highly used overlapped table files
if (s.ok() && VersionSet::IsLevelOverlapped(meta->level))
table_ptr->ReadFilter();
// table_ptr is owned by it and therefore invalidated by this delete
delete it; delete it;
} }
} }
@ -79,6 +120,11 @@ Status BuildTable(const std::string& dbname,
if (s.ok() && meta->file_size > 0) { if (s.ok() && meta->file_size > 0) {
// Keep it // Keep it
if (0!=keys_retired)
{
Log(options.info_log, "Level-0 table #%" PRIu64 ": %zd keys seen, %zd keys retired, %zd keys expired",
meta->number, keys_seen, retire.GetDroppedCount(), retire.GetExpiredCount());
} // if
} else { } else {
env->DeleteFile(fname); env->DeleteFile(fname);
} }

View file

@ -6,6 +6,7 @@
#define STORAGE_LEVELDB_DB_BUILDER_H_ #define STORAGE_LEVELDB_DB_BUILDER_H_
#include "leveldb/status.h" #include "leveldb/status.h"
#include "db/dbformat.h"
namespace leveldb { namespace leveldb {
@ -25,9 +26,11 @@ class VersionEdit;
extern Status BuildTable(const std::string& dbname, extern Status BuildTable(const std::string& dbname,
Env* env, Env* env,
const Options& options, const Options& options,
const Comparator * user_comparator,
TableCache* table_cache, TableCache* table_cache,
Iterator* iter, Iterator* iter,
FileMetaData* meta); FileMetaData* meta,
SequenceNumber smallest_snapshot);
} // namespace leveldb } // namespace leveldb

View file

@ -6,6 +6,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <unistd.h> #include <unistd.h>
#include <stdint.h>
#include "leveldb/cache.h" #include "leveldb/cache.h"
#include "leveldb/comparator.h" #include "leveldb/comparator.h"
#include "leveldb/db.h" #include "leveldb/db.h"
@ -40,6 +41,8 @@ using leveldb::Status;
using leveldb::WritableFile; using leveldb::WritableFile;
using leveldb::WriteBatch; using leveldb::WriteBatch;
using leveldb::WriteOptions; using leveldb::WriteOptions;
using leveldb::KeyMetaData;
using leveldb::ValueType;
extern "C" { extern "C" {
@ -49,6 +52,7 @@ struct leveldb_writebatch_t { WriteBatch rep; };
struct leveldb_snapshot_t { const Snapshot* rep; }; struct leveldb_snapshot_t { const Snapshot* rep; };
struct leveldb_readoptions_t { ReadOptions rep; }; struct leveldb_readoptions_t { ReadOptions rep; };
struct leveldb_writeoptions_t { WriteOptions rep; }; struct leveldb_writeoptions_t { WriteOptions rep; };
struct leveldb_keymetadata_t { KeyMetaData rep; };
struct leveldb_options_t { Options rep; }; struct leveldb_options_t { Options rep; };
struct leveldb_cache_t { Cache* rep; }; struct leveldb_cache_t { Cache* rep; };
struct leveldb_seqfile_t { SequentialFile* rep; }; struct leveldb_seqfile_t { SequentialFile* rep; };
@ -173,8 +177,19 @@ void leveldb_put(
const char* key, size_t keylen, const char* key, size_t keylen,
const char* val, size_t vallen, const char* val, size_t vallen,
char** errptr) { char** errptr) {
return(leveldb_put2(db, options, key, keylen, val, vallen, errptr, NULL));
}
void leveldb_put2(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr,
const leveldb_keymetadata_t * metadata) {
SaveError(errptr, SaveError(errptr,
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen))); db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen),
(NULL==metadata ? NULL : &metadata->rep)));
} }
void leveldb_delete( void leveldb_delete(
@ -200,9 +215,21 @@ char* leveldb_get(
const char* key, size_t keylen, const char* key, size_t keylen,
size_t* vallen, size_t* vallen,
char** errptr) { char** errptr) {
return(leveldb_get2(db, options, key, keylen, vallen, errptr, NULL));
}
char* leveldb_get2(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr,
leveldb_keymetadata_t * metadata) {
char* result = NULL; char* result = NULL;
std::string tmp; std::string tmp;
Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp); Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp,
(NULL==metadata ? NULL : &metadata->rep));
if (s.ok()) { if (s.ok()) {
*vallen = tmp.size(); *vallen = tmp.size();
result = CopyString(tmp); result = CopyString(tmp);
@ -330,6 +357,15 @@ const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
return s.data(); return s.data();
} }
const void leveldb_iter_keymetadata(const leveldb_iterator_t* iter,
leveldb_keymetadata_t * meta)
{
if (NULL!=iter && NULL!=meta)
{
meta->rep=iter->rep->keymetadata();
} // if
}
void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) { void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
SaveError(errptr, iter->rep->status()); SaveError(errptr, iter->rep->status());
} }
@ -350,7 +386,16 @@ void leveldb_writebatch_put(
leveldb_writebatch_t* b, leveldb_writebatch_t* b,
const char* key, size_t klen, const char* key, size_t klen,
const char* val, size_t vlen) { const char* val, size_t vlen) {
b->rep.Put(Slice(key, klen), Slice(val, vlen)); leveldb_writebatch_put2(b, key, klen, val, vlen,NULL);
}
void leveldb_writebatch_put2(
leveldb_writebatch_t* b,
const char* key, size_t klen,
const char* val, size_t vlen,
const leveldb_keymetadata_t * metadata) {
b->rep.Put(Slice(key, klen), Slice(val, vlen),
(NULL==metadata ? NULL : &metadata->rep));
} }
void leveldb_writebatch_delete( void leveldb_writebatch_delete(
@ -362,15 +407,20 @@ void leveldb_writebatch_delete(
void leveldb_writebatch_iterate( void leveldb_writebatch_iterate(
leveldb_writebatch_t* b, leveldb_writebatch_t* b,
void* state, void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen,
const int & type, const uint64_t & expiry),
void (*deleted)(void*, const char* k, size_t klen)) { void (*deleted)(void*, const char* k, size_t klen)) {
class H : public WriteBatch::Handler { class H : public WriteBatch::Handler {
public: public:
void* state_; void* state_;
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen); void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen,
const int & type, const uint64_t & expiry);
void (*deleted_)(void*, const char* k, size_t klen); void (*deleted_)(void*, const char* k, size_t klen);
virtual void Put(const Slice& key, const Slice& value) { virtual void Put(const Slice& key, const Slice& value,
(*put_)(state_, key.data(), key.size(), value.data(), value.size()); const leveldb::ValueType & type,
const leveldb::ExpiryTimeMicros & expiry)
{
(*put_)(state_, key.data(), key.size(), value.data(), value.size(), (int)type, (uint64_t)expiry);
} }
virtual void Delete(const Slice& key) { virtual void Delete(const Slice& key) {
(*deleted_)(state_, key.data(), key.size()); (*deleted_)(state_, key.data(), key.size());
@ -418,6 +468,11 @@ void leveldb_options_set_paranoid_checks(
opt->rep.paranoid_checks = v; opt->rep.paranoid_checks = v;
} }
// C API setter: stores v into the verify_compactions field of the
// Options object wrapped by opt.
//   opt - options handle whose wrapped Options is modified
//   v   - value assigned to verify_compactions
void leveldb_options_set_verify_compactions(
leveldb_options_t* opt, unsigned char v) {
opt->rep.verify_compactions = v;
}
void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) { void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
opt->rep.env = (env ? env->rep : NULL); opt->rep.env = (env ? env->rep : NULL);
} }
@ -450,6 +505,10 @@ void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
opt->rep.compression = static_cast<CompressionType>(t); opt->rep.compression = static_cast<CompressionType>(t);
} }
// C API setter: stores s into the total_leveldb_mem field of the
// Options object wrapped by opt.
// NOTE(review): s is presumably a byte count -- confirm against the
// Options documentation.
void leveldb_options_set_total_leveldb_mem(leveldb_options_t* opt, size_t s) {
opt->rep.total_leveldb_mem = s;
}
leveldb_comparator_t* leveldb_comparator_create( leveldb_comparator_t* leveldb_comparator_create(
void* state, void* state,
void (*destructor)(void*), void (*destructor)(void*),
@ -580,7 +639,17 @@ void leveldb_env_destroy(leveldb_env_t* env) {
delete env; delete env;
} }
// C API wrapper that forwards to the static Env::Shutdown().
// Takes no handle: the call is not tied to a particular leveldb_env_t.
void leveldb_env_shutdown() {
Env::Shutdown();
}
/**
* CAUTION: this call is only for char * objects returned by
* functions like leveldb_get and leveldb_property_value.
* Also used to release errptr strings.
*/
void leveldb_free(void* ptr) { void leveldb_free(void* ptr) {
if (NULL!=ptr)
free(ptr); free(ptr);
} }

View file

@ -3,6 +3,8 @@
found in the LICENSE file. See the AUTHORS file for names of contributors. */ found in the LICENSE file. See the AUTHORS file for names of contributors. */
#include "leveldb/c.h" #include "leveldb/c.h"
#include "leveldb/options.h"
#include "port/port.h"
#include <stddef.h> #include <stddef.h>
#include <stdio.h> #include <stdio.h>
@ -11,8 +13,13 @@
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
using leveldb::ValueType;
// Test-local definition of the metadata handle: wraps a leveldb::KeyMetaData
// so the checks in this file can read and write m_Type / m_Expiry via .rep.
// NOTE(review): presumably must stay layout-compatible with the definition
// used inside the C API implementation -- confirm.
struct leveldb_keymetadata_t { leveldb::KeyMetaData rep; };
const char* phase = ""; const char* phase = "";
static char dbname[200]; static char dbname[200];
static leveldb::ExpiryTimeMicros gStartTime;
static void StartPhase(const char* name) { static void StartPhase(const char* name) {
fprintf(stderr, "=== Test %s\n", name); fprintf(stderr, "=== Test %s\n", name);
@ -49,7 +56,7 @@ static void CheckEqual(const char* expected, const char* v, size_t n) {
fprintf(stderr, "%s: expected '%s', got '%s'\n", fprintf(stderr, "%s: expected '%s', got '%s'\n",
phase, phase,
(expected ? expected : "(null)"), (expected ? expected : "(null)"),
(v ? v : "(null")); (v ? v : "(null)"));
abort(); abort();
} }
} }
@ -112,6 +119,117 @@ static void CheckDel(void* ptr, const char* k, size_t klen) {
(*state)++; (*state)++;
} }
// (expiry enabled)
// Reads `key` through leveldb_get2() and verifies the returned value and
// key metadata.
//   expected - value the lookup must return
//   type     - value type the returned metadata must carry
//   expiry   - expiry the returned metadata must carry; when it is 0 and
//              `type` is kTypeValueWriteTime the stored expiry is only
//              required to lie between gStartTime and the current time
//              (presumably because the write time is stamped by the store).
static void CheckGet2(
    leveldb_t* db,
    const leveldb_readoptions_t* options,
    const char* key,
    const char* expected,
    ValueType type,
    uint64_t expiry) {
  char* err = NULL;
  size_t val_len;
  leveldb_keymetadata_t meta;
  char* val = leveldb_get2(db, options, key, strlen(key), &val_len, &err, &meta);

  CheckNoError(err);
  CheckEqual(expected, val, val_len);
  CheckCondition(type==meta.rep.m_Type);

  // Only a write-time entry with no explicit expiry gets the range check.
  const bool range_check = (leveldb::kTypeValueWriteTime==type && 0==expiry);
  if (!range_check) {
    CheckCondition(expiry==meta.rep.m_Expiry);
  } else {
    leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros();
    CheckCondition(gStartTime<=meta.rep.m_Expiry && meta.rep.m_Expiry<=now);
  }

  Free(&val);
}
// (expiry enabled)
// Verifies the entry the iterator currently points at: key, value and the
// key metadata reported by leveldb_iter_keymetadata().
//   meta - expected metadata; when m_Expiry is 0 and m_Type is
//          kTypeValueWriteTime the iterator's expiry is only required to
//          lie between gStartTime and the current time, otherwise it must
//          match exactly.
static void CheckIter2(leveldb_iterator_t* iter,
                       const char* key, const char* val,
                       const leveldb::KeyMetaData & meta) {
  size_t len;

  const char* str = leveldb_iter_key(iter, &len);
  CheckEqual(key, str, len);

  str = leveldb_iter_value(iter, &len);
  CheckEqual(val, str, len);

  leveldb_keymetadata_t it_meta;
  leveldb_iter_keymetadata(iter, &it_meta);
  CheckCondition(meta.m_Type==it_meta.rep.m_Type);

  // Only a write-time entry with no explicit expiry gets the range check.
  const bool range_check =
      (leveldb::kTypeValueWriteTime==meta.m_Type && 0==meta.m_Expiry);
  if (!range_check) {
    CheckCondition(meta.m_Expiry==it_meta.rep.m_Expiry);
  } else {
    leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros();
    CheckCondition(gStartTime<=it_meta.rep.m_Expiry && it_meta.rep.m_Expiry<=now);
  }
}
// Callback from leveldb_writebatch_iterate()
// (expiry enabled)
// One row of expected data for the expiry-aware checks in this file:
// a key/value pair plus the metadata (type and expiry) that goes with it.
struct CheckPut2Data
{
const char * m_Key;      // NUL-terminated key (length taken with strlen by users)
const char * m_Value;    // NUL-terminated value
ValueType m_Type;        // leveldb value type expected/written for the key
uint64_t m_Expiry;       // expiry expected/written for the key
};
// Rows written with leveldb_put2() and read back with CheckGet2() by the
// "put expiry" phase of main.  The kTypeValueWriteTime row with expiry 0
// is range-checked by CheckGet2; every other row must read back its
// expiry exactly.
static struct CheckPut2Data gCheckPut2Data[]=
{
{"foo","hello_put2",leveldb::kTypeValue,0},
{"box","c_put2",leveldb::kTypeValue,0},
{"disney","cartoon_put2",leveldb::kTypeValueWriteTime, 0},
{"money","lotsof_put2",leveldb::kTypeValueWriteTime, 9988776655},
{"time","ismoney_put2",leveldb::kTypeValueExplicitExpiry, 221199887766}
};
// Expected sequence of write-batch operations, indexed by the counter that
// CheckPut2() and CheckDel2() advance on every callback.  The
// kTypeDeletion row is the one consumed by CheckDel2(), which compares
// only the key.
static struct CheckPut2Data gCheckPut2ItrData[]=
{
{"bar","b",leveldb::kTypeValue,0},
{"box","c",leveldb::kTypeValue,0},
{"bar","",leveldb::kTypeDeletion,0},
{"mom","texas",leveldb::kTypeValueWriteTime,0},
{"dad","poland",leveldb::kTypeValueExplicitExpiry,22446688}
};
// "put" callback for leveldb_writebatch_iterate().
//   ptr - points at an int counter selecting the gCheckPut2ItrData row the
//         current operation must match; the counter is advanced by one.
// Key, value and type must match the row.  The expiry is compared too,
// except for kTypeValueWriteTime rows.
static void CheckPut2(void* ptr,
                      const char* k, size_t klen,
                      const char* v, size_t vlen,
                      const int & type_int,
                      const uint64_t & expiry) {
  int* const state = static_cast<int*>(ptr);
  CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0])));

  struct CheckPut2Data * const test = &gCheckPut2ItrData[*state];
  CheckEqual(test->m_Key, k, klen);
  CheckEqual(test->m_Value, v, vlen);
  CheckCondition((int)test->m_Type==type_int);

  const bool compare_expiry = (leveldb::kTypeValueWriteTime!=test->m_Type);
  if (compare_expiry) {
    CheckCondition((uint64_t)test->m_Expiry==expiry);
  }

  *state += 1;
}
// Callback from leveldb_writebatch_iterate()
// (expiry enabled)
// "deleted" callback for leveldb_writebatch_iterate().
//   ptr - points at an int counter selecting the gCheckPut2ItrData row the
//         current operation must match; the counter is advanced by one.
// Only the key of the selected row is compared.
static void CheckDel2(void* ptr, const char* k, size_t klen) {
  int* const state = static_cast<int*>(ptr);
  CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0])));

  struct CheckPut2Data * const test = &gCheckPut2ItrData[*state];
  CheckEqual(test->m_Key, k, klen);

  *state += 1;
}
static void CmpDestroy(void* arg) { } static void CmpDestroy(void* arg) { }
static int CmpCompare(void* arg, const char* a, size_t alen, static int CmpCompare(void* arg, const char* a, size_t alen,
@ -141,7 +259,7 @@ static char* FilterCreate(
int num_keys, int num_keys,
size_t* filter_length) { size_t* filter_length) {
*filter_length = 4; *filter_length = 4;
char* result = malloc(4); char* result = (char*)malloc(4);
memcpy(result, "fake", 4); memcpy(result, "fake", 4);
return result; return result;
} }
@ -167,6 +285,7 @@ int main(int argc, char** argv) {
CheckCondition(leveldb_major_version() >= 1); CheckCondition(leveldb_major_version() >= 1);
CheckCondition(leveldb_minor_version() >= 1); CheckCondition(leveldb_minor_version() >= 1);
gStartTime=leveldb::port::TimeMicros();
snprintf(dbname, sizeof(dbname), snprintf(dbname, sizeof(dbname),
"%s/leveldb_c_test-%d", "%s/leveldb_c_test-%d",
@ -207,12 +326,6 @@ int main(int argc, char** argv) {
CheckCondition(err != NULL); CheckCondition(err != NULL);
Free(&err); Free(&err);
StartPhase("leveldb_free");
db = leveldb_open(options, dbname, &err);
CheckCondition(err != NULL);
leveldb_free(err);
err = NULL;
StartPhase("open"); StartPhase("open");
leveldb_options_set_create_if_missing(options, 1); leveldb_options_set_create_if_missing(options, 1);
db = leveldb_open(options, dbname, &err); db = leveldb_open(options, dbname, &err);
@ -234,42 +347,74 @@ int main(int argc, char** argv) {
StartPhase("writebatch"); StartPhase("writebatch");
{ {
leveldb_keymetadata_t meta;
leveldb_writebatch_t* wb = leveldb_writebatch_create(); leveldb_writebatch_t* wb = leveldb_writebatch_create();
leveldb_writebatch_put(wb, "foo", 3, "a", 1); leveldb_writebatch_put(wb, "foo", 3, "a", 1);
leveldb_writebatch_clear(wb); leveldb_writebatch_clear(wb);
leveldb_writebatch_put(wb, "bar", 3, "b", 1); leveldb_writebatch_put(wb, "bar", 3, "b", 1);
leveldb_writebatch_put(wb, "box", 3, "c", 1); leveldb_writebatch_put(wb, "box", 3, "c", 1);
leveldb_writebatch_delete(wb, "bar", 3); leveldb_writebatch_delete(wb, "bar", 3);
meta.rep.m_Type=leveldb::kTypeValueWriteTime;
meta.rep.m_Expiry=0;
leveldb_writebatch_put2(wb, "mom", 3, "texas", 5, &meta);
meta.rep.m_Type=leveldb::kTypeValueExplicitExpiry;
meta.rep.m_Expiry=22446688;
leveldb_writebatch_put2(wb, "dad", 3, "poland", 6, &meta);
leveldb_write(db, woptions, wb, &err); leveldb_write(db, woptions, wb, &err);
CheckNoError(err); CheckNoError(err);
CheckGet(db, roptions, "foo", "hello"); CheckGet(db, roptions, "foo", "hello");
CheckGet(db, roptions, "bar", NULL); CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c"); CheckGet(db, roptions, "box", "c");
CheckGet2(db, roptions, "dad", "poland", leveldb::kTypeValueExplicitExpiry, 22446688);
CheckGet2(db, roptions, "mom", "texas", leveldb::kTypeValueWriteTime, 0);
int pos = 0; int pos = 0;
leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); leveldb_writebatch_iterate(wb, &pos, CheckPut2, CheckDel2);
CheckCondition(pos == 3); CheckCondition(pos == 5);
leveldb_writebatch_destroy(wb); leveldb_writebatch_destroy(wb);
} }
// reminder: keymetadata not supported on backward iteration
StartPhase("iter"); StartPhase("iter");
{ {
leveldb::KeyMetaData meta;
leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions); leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
CheckCondition(!leveldb_iter_valid(iter)); CheckCondition(!leveldb_iter_valid(iter));
leveldb_iter_seek_to_first(iter); leveldb_iter_seek_to_first(iter);
CheckCondition(leveldb_iter_valid(iter)); CheckCondition(leveldb_iter_valid(iter));
CheckIter(iter, "box", "c"); CheckIter(iter, "box", "c");
meta.m_Type=leveldb::kTypeValue;
meta.m_Expiry=0;
CheckIter2(iter, "box", "c", meta);
meta.m_Type=leveldb::kTypeValueExplicitExpiry;
meta.m_Expiry=22446688;
leveldb_iter_next(iter);
CheckIter2(iter, "dad", "poland", meta);
leveldb_iter_next(iter); leveldb_iter_next(iter);
CheckIter(iter, "foo", "hello"); CheckIter(iter, "foo", "hello");
leveldb_iter_prev(iter); leveldb_iter_prev(iter);
CheckIter(iter, "dad", "poland");
leveldb_iter_prev(iter);
CheckIter(iter, "box", "c"); CheckIter(iter, "box", "c");
leveldb_iter_prev(iter); leveldb_iter_prev(iter);
CheckCondition(!leveldb_iter_valid(iter)); CheckCondition(!leveldb_iter_valid(iter));
leveldb_iter_seek_to_last(iter); leveldb_iter_seek_to_last(iter);
CheckIter(iter, "foo", "hello"); CheckIter(iter, "mom", "texas");
leveldb_iter_seek(iter, "b", 1); leveldb_iter_seek(iter, "b", 1);
CheckIter(iter, "box", "c"); CheckIter(iter, "box", "c");
leveldb_iter_get_error(iter, &err); leveldb_iter_get_error(iter, &err);
CheckNoError(err); CheckNoError(err);
meta.m_Type=leveldb::kTypeValue;
meta.m_Expiry=0;
CheckIter2(iter, "box", "c", meta);
leveldb_iter_seek(iter, "m", 1);
meta.m_Type=leveldb::kTypeValueWriteTime;
meta.m_Expiry=0;
CheckIter2(iter, "mom", "texas", meta);
leveldb_iter_get_error(iter, &err);
CheckNoError(err);
leveldb_iter_destroy(iter); leveldb_iter_destroy(iter);
} }
@ -335,6 +480,70 @@ int main(int argc, char** argv) {
leveldb_options_set_error_if_exists(options, 1); leveldb_options_set_error_if_exists(options, 1);
} }
StartPhase("put expiry");
{
  // Number of rows in the gCheckPut2Data table.
  const int count = sizeof(gCheckPut2Data) / sizeof(gCheckPut2Data[0]);
  int loop;

  // Write every table row with its type/expiry metadata attached.
  for (loop=0; loop<count; ++loop)
  {
    struct CheckPut2Data * test = &gCheckPut2Data[loop];
    leveldb_keymetadata_t meta;

    meta.rep.m_Type=test->m_Type;
    meta.rep.m_Expiry=test->m_Expiry;
    leveldb_put2(db, woptions, test->m_Key, strlen(test->m_Key),
                 test->m_Value, strlen(test->m_Value), &err,
                 &meta);
    CheckNoError(err);
  }   // for

  // testing memtable right now
  for (loop=0; loop<count; ++loop)
  {
    struct CheckPut2Data * test = &gCheckPut2Data[loop];

    CheckGet2(db, roptions, test->m_Key, test->m_Value,
              test->m_Type, test->m_Expiry);
  }   // for

  // close and open to force memory table into .sst upon open
  leveldb_close(db);
  // error_if_exists must be off, otherwise reopening the existing db fails
  leveldb_options_set_error_if_exists(options, 0);
  db = leveldb_open(options, dbname, &err);
  CheckNoError(err);

  // now testing get from a level-0 .sst file
  for (loop=0; loop<count; ++loop)
  {
    struct CheckPut2Data * test = &gCheckPut2Data[loop];

    CheckGet2(db, roptions, test->m_Key, test->m_Value,
              test->m_Type, test->m_Expiry);
  }   // for
}
//
// This screws up "options" for real database work. execute last.
StartPhase("filter"); StartPhase("filter");
for (run = 0; run < 2; run++) { for (run = 0; run < 2; run++) {
// First run uses custom filter, second run uses bloom filter // First run uses custom filter, second run uses bloom filter
@ -376,6 +585,8 @@ int main(int argc, char** argv) {
leveldb_filterpolicy_destroy(policy); leveldb_filterpolicy_destroy(policy);
} }
StartPhase("cleanup"); StartPhase("cleanup");
leveldb_close(db); leveldb_close(db);
leveldb_options_destroy(options); leveldb_options_destroy(options);
@ -386,5 +597,7 @@ int main(int argc, char** argv) {
leveldb_env_destroy(env); leveldb_env_destroy(env);
fprintf(stderr, "PASS\n"); fprintf(stderr, "PASS\n");
leveldb_env_shutdown();
return 0; return 0;
} }

View file

@ -35,8 +35,8 @@ class CorruptionTest {
CorruptionTest() { CorruptionTest() {
tiny_cache_ = NewLRUCache(100); tiny_cache_ = NewLRUCache(100);
options_.env = &env_; options_.env = &env_;
options_.block_cache = tiny_cache_; dbname_ = test::TmpDir() + "/db_test";
dbname_ = test::TmpDir() + "/corruption_test"; dbname_ = MakeTieredDbname(dbname_, options_);
DestroyDB(dbname_, options_); DestroyDB(dbname_, options_);
db_ = NULL; db_ = NULL;
@ -51,14 +51,17 @@ class CorruptionTest {
delete tiny_cache_; delete tiny_cache_;
} }
Status TryReopen() { Status TryReopen(Options* options = NULL) {
delete db_; delete db_;
db_ = NULL; db_ = NULL;
return DB::Open(options_, dbname_, &db_); Options opt = (options ? *options : options_);
opt.env = &env_;
opt.block_cache = tiny_cache_;
return DB::Open(opt, dbname_, &db_);
} }
void Reopen() { void Reopen(Options* options = NULL) {
ASSERT_OK(TryReopen()); ASSERT_OK(TryReopen(options));
} }
void RepairDB() { void RepairDB() {
@ -75,13 +78,7 @@ class CorruptionTest {
Slice key = Key(i, &key_space); Slice key = Key(i, &key_space);
batch.Clear(); batch.Clear();
batch.Put(key, Value(i, &value_space)); batch.Put(key, Value(i, &value_space));
WriteOptions options; ASSERT_OK(db_->Write(WriteOptions(), &batch));
// Corrupt() doesn't work without this sync on windows; stat reports 0 for
// the file size.
if (i == n - 1) {
options.sync = true;
}
ASSERT_OK(db_->Write(options, &batch));
} }
} }
@ -96,10 +93,6 @@ class CorruptionTest {
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
uint64_t key; uint64_t key;
Slice in(iter->key()); Slice in(iter->key());
if (in == "" || in == "~") {
// Ignore boundary keys.
continue;
}
if (!ConsumeDecimalNumber(&in, &key) || if (!ConsumeDecimalNumber(&in, &key) ||
!in.empty() || !in.empty() ||
key < next_expected) { key < next_expected) {
@ -123,19 +116,26 @@ class CorruptionTest {
ASSERT_GE(max_expected, correct); ASSERT_GE(max_expected, correct);
} }
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { void Corrupt(FileType filetype, int offset, int bytes_to_corrupt, int level=0) {
// Pick file to corrupt // Pick file to corrupt
std::vector<std::string> filenames; std::vector<std::string> filenames;
ASSERT_OK(env_.GetChildren(dbname_, &filenames)); std::string dirname;
if (leveldb::kTableFile!=filetype)
dirname=dbname_;
else
dirname=MakeDirName2(options_, level, "sst");
ASSERT_OK(env_.GetChildren(dirname, &filenames));
uint64_t number; uint64_t number;
FileType type; FileType type;
std::string fname; std::string fname;
int picked_number = -1; int picked_number = -1;
for (size_t i = 0; i < filenames.size(); i++) { for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) && if (ParseFileName(filenames[i], &number, &type) &&
type == filetype && type == filetype &&
int(number) > picked_number) { // Pick latest file int(number) > picked_number) { // Pick latest file
fname = dbname_ + "/" + filenames[i]; fname = dirname + "/" + filenames[i];
picked_number = number; picked_number = number;
} }
} }
@ -222,12 +222,14 @@ TEST(CorruptionTest, NewFileErrorDuringWrite) {
const int num = 3 + (Options().write_buffer_size / kValueSize); const int num = 3 + (Options().write_buffer_size / kValueSize);
std::string value_storage; std::string value_storage;
Status s; Status s;
for (int i = 0; s.ok() && i < num; i++) { for (int i = 0;
s.ok() && i < num && 0==env_.num_writable_file_errors_;
i++) {
WriteBatch batch; WriteBatch batch;
batch.Put("a", Value(100, &value_storage)); batch.Put("a", Value(100, &value_storage));
s = db_->Write(WriteOptions(), &batch); s = db_->Write(WriteOptions(), &batch);
} }
ASSERT_TRUE(!s.ok()); // ASSERT_TRUE(!s.ok()); Background write thread will never report this
ASSERT_GE(env_.num_writable_file_errors_, 1); ASSERT_GE(env_.num_writable_file_errors_, 1);
env_.writable_file_error_ = false; env_.writable_file_error_ = false;
Reopen(); Reopen();
@ -240,34 +242,18 @@ TEST(CorruptionTest, TableFile) {
dbi->TEST_CompactRange(0, NULL, NULL); dbi->TEST_CompactRange(0, NULL, NULL);
dbi->TEST_CompactRange(1, NULL, NULL); dbi->TEST_CompactRange(1, NULL, NULL);
Corrupt(kTableFile, 100, 1); Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);
Check(90, 99);
}
TEST(CorruptionTest, TableFileRepair) {
options_.block_size = 2 * kValueSize; // Limit scope of corruption
options_.paranoid_checks = true;
Reopen();
Build(100);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
dbi->TEST_CompactRange(0, NULL, NULL);
dbi->TEST_CompactRange(1, NULL, NULL);
Corrupt(kTableFile, 100, 1);
RepairDB();
Reopen();
Check(95, 99); Check(95, 99);
} }
TEST(CorruptionTest, TableFileIndexData) { TEST(CorruptionTest, TableFileIndexData) {
Build(10000); // Enough to build multiple Tables Build(100000); // Enough to build multiple Tables
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable(); dbi->TEST_CompactMemTable();
Corrupt(kTableFile, -2000, 500); Corrupt(kTableFile, -2000, 500, config::kMaxMemCompactLevel);
Reopen(); Reopen();
Check(5000, 9999); Check(50000, 99999);
} }
TEST(CorruptionTest, MissingDescriptor) { TEST(CorruptionTest, MissingDescriptor) {
@ -319,10 +305,10 @@ TEST(CorruptionTest, CompactionInputError) {
Build(10); Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable(); dbi->TEST_CompactMemTable();
const int last = config::kMaxMemCompactLevel; const int last = config::kMaxMemCompactLevel; // Riak does not "move" files
ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last))); ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
Corrupt(kTableFile, 100, 1); Corrupt(kTableFile, 100, 1, last);
Check(5, 9); Check(5, 9);
// Force compactions by writing lots of values // Force compactions by writing lots of values
@ -331,23 +317,42 @@ TEST(CorruptionTest, CompactionInputError) {
} }
TEST(CorruptionTest, CompactionInputErrorParanoid) { TEST(CorruptionTest, CompactionInputErrorParanoid) {
options_.paranoid_checks = true; Options options;
options_.write_buffer_size = 512 << 10; options.paranoid_checks = true;
Reopen(); options.write_buffer_size = 1048576;
Reopen(&options);
int current_corruption=Property("leveldb.ReadBlockError");
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
// Make multiple inputs so we need to compact. // Fill levels >= 1 so memtable compaction outputs to level 1
for (int i = 0; i < 2; i++) { // matthewv 1/10/14 - what does "levels" have to do with this,
// switching to compaction trigger.
// 7/10/14 - compaction starts between 4 and 6 files ... assume 4 and 1 move
// (will make a new, descriptive constant for 4)
for (int level = Property("leveldb.num-files-at-level0")+1;
level < config::kL0_GroomingTrigger; level++) {
dbi->Put(WriteOptions(), "", "begin");
dbi->Put(WriteOptions(), "~", "end");
dbi->TEST_CompactMemTable();
}
Build(10); Build(10);
dbi->TEST_CompactMemTable(); dbi->TEST_CompactMemTable();
Corrupt(kTableFile, 100, 1); ASSERT_TRUE(1 < Property("leveldb.num-files-at-level0"));
env_.SleepForMicroseconds(100000);
}
dbi->CompactRange(NULL, NULL);
// Write must fail because of corrupted table Corrupt(kTableFile, 100, 1, 0);
Check(5, 9);
// Write must eventually fail because of corrupted table
Status s;
std::string tmp1, tmp2; std::string tmp1, tmp2;
Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2)); for (int i = 0; i < 10000 && s.ok(); i++) {
s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
}
if (s.ok())
ASSERT_NE(current_corruption, Property("leveldb.ReadBlockError")) << "no ReadBlockError seen";
else
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
} }
@ -355,7 +360,7 @@ TEST(CorruptionTest, UnrelatedKeys) {
Build(10); Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable(); dbi->TEST_CompactMemTable();
Corrupt(kTableFile, 100, 1); Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);
std::string tmp1, tmp2; std::string tmp1, tmp2;
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));

View file

@ -33,7 +33,6 @@
// readmissing -- read N missing keys in random order // readmissing -- read N missing keys in random order
// readhot -- read N times in random order from 1% section of DB // readhot -- read N times in random order from 1% section of DB
// seekrandom -- N random seeks // seekrandom -- N random seeks
// open -- cost of opening a DB
// crc32c -- repeated crc32c of 4K of data // crc32c -- repeated crc32c of 4K of data
// acquireload -- load N*1000 times // acquireload -- load N*1000 times
// Meta operations: // Meta operations:
@ -84,14 +83,6 @@ static bool FLAGS_histogram = false;
// (initialized to default value by "main") // (initialized to default value by "main")
static int FLAGS_write_buffer_size = 0; static int FLAGS_write_buffer_size = 0;
// Number of bytes written to each file.
// (initialized to default value by "main")
static int FLAGS_max_file_size = 0;
// Approximate size of user data packed per block (before compression.
// (initialized to default value by "main")
static int FLAGS_block_size = 0;
// Number of bytes to use as a cache of uncompressed data. // Number of bytes to use as a cache of uncompressed data.
// Negative means use default settings. // Negative means use default settings.
static int FLAGS_cache_size = -1; static int FLAGS_cache_size = -1;
@ -103,21 +94,26 @@ static int FLAGS_open_files = 0;
// Negative means use default settings. // Negative means use default settings.
static int FLAGS_bloom_bits = -1; static int FLAGS_bloom_bits = -1;
// Riak bloom adaptation
static int FLAGS_bloom2_bits = -1;
// Riak param for total memory allocation (flex_cache)
static uint64_t FLAGS_leveldb_memory = -1;
// Riak param for compression setting
static int FLAGS_compression = 2;
// If true, do not destroy the existing database. If you set this // If true, do not destroy the existing database. If you set this
// flag and also specify a benchmark that wants a fresh database, that // flag and also specify a benchmark that wants a fresh database, that
// benchmark will fail. // benchmark will fail.
static bool FLAGS_use_existing_db = false; static bool FLAGS_use_existing_db = false;
// If true, reuse existing log/MANIFEST files when re-opening a database.
static bool FLAGS_reuse_logs = false;
// Use the db with the following name. // Use the db with the following name.
static const char* FLAGS_db = NULL; static const char* FLAGS_db = NULL;
namespace leveldb { namespace leveldb {
namespace { namespace {
leveldb::Env* g_env = NULL;
// Helper for quickly generating random data. // Helper for quickly generating random data.
class RandomGenerator { class RandomGenerator {
@ -141,7 +137,7 @@ class RandomGenerator {
pos_ = 0; pos_ = 0;
} }
Slice Generate(size_t len) { Slice Generate(int len) {
if (pos_ + len > data_.size()) { if (pos_ + len > data_.size()) {
pos_ = 0; pos_ = 0;
assert(len < data_.size()); assert(len < data_.size());
@ -151,19 +147,17 @@ class RandomGenerator {
} }
}; };
#if defined(__linux)
static Slice TrimSpace(Slice s) { static Slice TrimSpace(Slice s) {
size_t start = 0; int start = 0;
while (start < s.size() && isspace(s[start])) { while (start < s.size() && isspace(s[start])) {
start++; start++;
} }
size_t limit = s.size(); int limit = s.size();
while (limit > start && isspace(s[limit-1])) { while (limit > start && isspace(s[limit-1])) {
limit--; limit--;
} }
return Slice(s.data() + start, limit - start); return Slice(s.data() + start, limit - start);
} }
#endif
static void AppendWithSpace(std::string* str, Slice msg) { static void AppendWithSpace(std::string* str, Slice msg) {
if (msg.empty()) return; if (msg.empty()) return;
@ -195,7 +189,7 @@ class Stats {
done_ = 0; done_ = 0;
bytes_ = 0; bytes_ = 0;
seconds_ = 0; seconds_ = 0;
start_ = g_env->NowMicros(); start_ = Env::Default()->NowMicros();
finish_ = start_; finish_ = start_;
message_.clear(); message_.clear();
} }
@ -213,7 +207,7 @@ class Stats {
} }
void Stop() { void Stop() {
finish_ = g_env->NowMicros(); finish_ = Env::Default()->NowMicros();
seconds_ = (finish_ - start_) * 1e-6; seconds_ = (finish_ - start_) * 1e-6;
} }
@ -223,7 +217,7 @@ class Stats {
void FinishedSingleOp() { void FinishedSingleOp() {
if (FLAGS_histogram) { if (FLAGS_histogram) {
double now = g_env->NowMicros(); double now = Env::Default()->NowMicros();
double micros = now - last_op_finish_; double micros = now - last_op_finish_;
hist_.Add(micros); hist_.Add(micros);
if (micros > 20000) { if (micros > 20000) {
@ -405,7 +399,7 @@ class Benchmark {
: cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
filter_policy_(FLAGS_bloom_bits >= 0 filter_policy_(FLAGS_bloom_bits >= 0
? NewBloomFilterPolicy(FLAGS_bloom_bits) ? NewBloomFilterPolicy(FLAGS_bloom_bits)
: NULL), : (FLAGS_bloom2_bits >=0 ? NewBloomFilterPolicy2(FLAGS_bloom2_bits) : NULL)),
db_(NULL), db_(NULL),
num_(FLAGS_num), num_(FLAGS_num),
value_size_(FLAGS_value_size), value_size_(FLAGS_value_size),
@ -413,10 +407,10 @@ class Benchmark {
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads), reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
heap_counter_(0) { heap_counter_(0) {
std::vector<std::string> files; std::vector<std::string> files;
g_env->GetChildren(FLAGS_db, &files); Env::Default()->GetChildren(FLAGS_db, &files);
for (size_t i = 0; i < files.size(); i++) { for (int i = 0; i < files.size(); i++) {
if (Slice(files[i]).starts_with("heap-")) { if (Slice(files[i]).starts_with("heap-")) {
g_env->DeleteFile(std::string(FLAGS_db) + "/" + files[i]); Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
} }
} }
if (!FLAGS_use_existing_db) { if (!FLAGS_use_existing_db) {
@ -446,7 +440,7 @@ class Benchmark {
benchmarks = sep + 1; benchmarks = sep + 1;
} }
// Reset parameters that may be overridden below // Reset parameters that may be overriddden bwlow
num_ = FLAGS_num; num_ = FLAGS_num;
reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads); reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
value_size_ = FLAGS_value_size; value_size_ = FLAGS_value_size;
@ -457,11 +451,7 @@ class Benchmark {
bool fresh_db = false; bool fresh_db = false;
int num_threads = FLAGS_threads; int num_threads = FLAGS_threads;
if (name == Slice("open")) { if (name == Slice("fillseq")) {
method = &Benchmark::OpenBench;
num_ /= 10000;
if (num_ < 1) num_ = 1;
} else if (name == Slice("fillseq")) {
fresh_db = true; fresh_db = true;
method = &Benchmark::WriteSeq; method = &Benchmark::WriteSeq;
} else if (name == Slice("fillbatch")) { } else if (name == Slice("fillbatch")) {
@ -553,6 +543,7 @@ class Benchmark {
SharedState* shared; SharedState* shared;
ThreadState* thread; ThreadState* thread;
void (Benchmark::*method)(ThreadState*); void (Benchmark::*method)(ThreadState*);
pthread_t thread_id;
}; };
static void ThreadBody(void* v) { static void ThreadBody(void* v) {
@ -598,7 +589,8 @@ class Benchmark {
arg[i].shared = &shared; arg[i].shared = &shared;
arg[i].thread = new ThreadState(i); arg[i].thread = new ThreadState(i);
arg[i].thread->shared = &shared; arg[i].thread->shared = &shared;
g_env->StartThread(ThreadBody, &arg[i]); arg[i].thread_id=Env::Default()->StartThread(ThreadBody, &arg[i]);
pthread_detach(arg[i].thread_id);
} }
shared.mu.Lock(); shared.mu.Lock();
@ -709,15 +701,12 @@ class Benchmark {
void Open() { void Open() {
assert(db_ == NULL); assert(db_ == NULL);
Options options; Options options;
options.env = g_env;
options.create_if_missing = !FLAGS_use_existing_db; options.create_if_missing = !FLAGS_use_existing_db;
options.block_cache = cache_; options.block_cache = cache_;
options.write_buffer_size = FLAGS_write_buffer_size; options.write_buffer_size = FLAGS_write_buffer_size;
options.max_file_size = FLAGS_max_file_size;
options.block_size = FLAGS_block_size;
options.max_open_files = FLAGS_open_files;
options.filter_policy = filter_policy_; options.filter_policy = filter_policy_;
options.reuse_logs = FLAGS_reuse_logs; options.compression = (leveldb::CompressionType)FLAGS_compression;
options.total_leveldb_mem = FLAGS_leveldb_memory;
Status s = DB::Open(options, FLAGS_db, &db_); Status s = DB::Open(options, FLAGS_db, &db_);
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "open error: %s\n", s.ToString().c_str()); fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@ -725,14 +714,6 @@ class Benchmark {
} }
} }
void OpenBench(ThreadState* thread) {
for (int i = 0; i < num_; i++) {
delete db_;
Open();
thread->stats.FinishedSingleOp();
}
}
void WriteSeq(ThreadState* thread) { void WriteSeq(ThreadState* thread) {
DoWrite(thread, true); DoWrite(thread, true);
} }
@ -842,6 +823,7 @@ class Benchmark {
void SeekRandom(ThreadState* thread) { void SeekRandom(ThreadState* thread) {
ReadOptions options; ReadOptions options;
std::string value;
int found = 0; int found = 0;
for (int i = 0; i < reads_; i++) { for (int i = 0; i < reads_; i++) {
Iterator* iter = db_->NewIterator(options); Iterator* iter = db_->NewIterator(options);
@ -937,7 +919,7 @@ class Benchmark {
char fname[100]; char fname[100];
snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_); snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_);
WritableFile* file; WritableFile* file;
Status s = g_env->NewWritableFile(fname, &file); Status s = Env::Default()->NewWritableFile(fname, &file, 2<<20);
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "%s\n", s.ToString().c_str()); fprintf(stderr, "%s\n", s.ToString().c_str());
return; return;
@ -946,7 +928,7 @@ class Benchmark {
delete file; delete file;
if (!ok) { if (!ok) {
fprintf(stderr, "heap profiling not supported\n"); fprintf(stderr, "heap profiling not supported\n");
g_env->DeleteFile(fname); Env::Default()->DeleteFile(fname);
} }
} }
}; };
@ -955,14 +937,14 @@ class Benchmark {
int main(int argc, char** argv) { int main(int argc, char** argv) {
FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
FLAGS_max_file_size = leveldb::Options().max_file_size;
FLAGS_block_size = leveldb::Options().block_size;
FLAGS_open_files = leveldb::Options().max_open_files; FLAGS_open_files = leveldb::Options().max_open_files;
FLAGS_leveldb_memory = 25000000000LL;
std::string default_db_path; std::string default_db_path;
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
double d; double d;
int n; int n;
uint64_t u;
char junk; char junk;
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
@ -974,9 +956,6 @@ int main(int argc, char** argv) {
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 && } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) { (n == 0 || n == 1)) {
FLAGS_use_existing_db = n; FLAGS_use_existing_db = n;
} else if (sscanf(argv[i], "--reuse_logs=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_reuse_logs = n;
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
FLAGS_num = n; FLAGS_num = n;
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
@ -987,16 +966,18 @@ int main(int argc, char** argv) {
FLAGS_value_size = n; FLAGS_value_size = n;
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
FLAGS_write_buffer_size = n; FLAGS_write_buffer_size = n;
} else if (sscanf(argv[i], "--max_file_size=%d%c", &n, &junk) == 1) {
FLAGS_max_file_size = n;
} else if (sscanf(argv[i], "--block_size=%d%c", &n, &junk) == 1) {
FLAGS_block_size = n;
} else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
FLAGS_cache_size = n; FLAGS_cache_size = n;
} else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
FLAGS_bloom_bits = n; FLAGS_bloom_bits = n;
} else if (sscanf(argv[i], "--bloom_bits2=%d%c", &n, &junk) == 1) {
FLAGS_bloom2_bits = n;
} else if (sscanf(argv[i], "--leveldb_memory=%d%c", &n, &junk) == 1) {
FLAGS_leveldb_memory = n * 1024 * 1024LL;
} else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
FLAGS_open_files = n; FLAGS_open_files = n;
} else if (sscanf(argv[i], "--compression=%d%c", &n, &junk) == 1) {
FLAGS_compression = n;
} else if (strncmp(argv[i], "--db=", 5) == 0) { } else if (strncmp(argv[i], "--db=", 5) == 0) {
FLAGS_db = argv[i] + 5; FLAGS_db = argv[i] + 5;
} else { } else {
@ -1005,16 +986,20 @@ int main(int argc, char** argv) {
} }
} }
leveldb::g_env = leveldb::Env::Default();
// Choose a location for the test database if none given with --db=<path> // Choose a location for the test database if none given with --db=<path>
if (FLAGS_db == NULL) { if (FLAGS_db == NULL) {
leveldb::g_env->GetTestDirectory(&default_db_path); leveldb::Env::Default()->GetTestDirectory(&default_db_path);
default_db_path += "/dbbench"; default_db_path += "/dbbench";
FLAGS_db = default_db_path.c_str(); FLAGS_db = default_db_path.c_str();
} }
// benchmark class needs to destruct before Shutdown call
{
leveldb::Benchmark benchmark; leveldb::Benchmark benchmark;
benchmark.Run(); benchmark.Run();
}
leveldb::Env::Shutdown();
return 0; return 0;
} }

File diff suppressed because it is too large Load diff

View file

@ -13,7 +13,7 @@
#include "leveldb/db.h" #include "leveldb/db.h"
#include "leveldb/env.h" #include "leveldb/env.h"
#include "port/port.h" #include "port/port.h"
#include "port/thread_annotations.h" #include "util/cache2.h"
namespace leveldb { namespace leveldb {
@ -29,26 +29,37 @@ class DBImpl : public DB {
virtual ~DBImpl(); virtual ~DBImpl();
// Implementations of the DB interface // Implementations of the DB interface
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value, const KeyMetaData * meta=NULL);
virtual Status Delete(const WriteOptions&, const Slice& key); virtual Status Delete(const WriteOptions&, const Slice& key);
virtual Status Write(const WriteOptions& options, WriteBatch* updates); virtual Status Write(const WriteOptions& options, WriteBatch* updates);
virtual Status Get(const ReadOptions& options, virtual Status Get(const ReadOptions& options,
const Slice& key, const Slice& key,
std::string* value); std::string* value,
KeyMetaData * meta=NULL);
virtual Status Get(const ReadOptions& options,
const Slice& key,
Value* value,
KeyMetaData * meta=NULL);
virtual Iterator* NewIterator(const ReadOptions&); virtual Iterator* NewIterator(const ReadOptions&);
virtual const Snapshot* GetSnapshot(); virtual const Snapshot* GetSnapshot();
virtual void ReleaseSnapshot(const Snapshot* snapshot); virtual void ReleaseSnapshot(const Snapshot* snapshot);
virtual bool GetProperty(const Slice& property, std::string* value); virtual bool GetProperty(const Slice& property, std::string* value);
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
virtual void CompactRange(const Slice* begin, const Slice* end); virtual void CompactRange(const Slice* begin, const Slice* end);
virtual Status VerifyLevels();
virtual void CheckAvailableCompactions();
virtual Logger* GetLogger() const { return options_.info_log; }
// Extra methods (for testing) that are not in the public DB interface // Extra methods (for testing) that are not in the public DB interface
const Options & GetOptions() const { return options_; };
// Compact any files in the named level that overlap [*begin,*end] // Compact any files in the named level that overlap [*begin,*end]
void TEST_CompactRange(int level, const Slice* begin, const Slice* end); void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
// Force current memtable contents to be compacted. // Force current memtable contents to be compacted, waits for completion
Status TEST_CompactMemTable(); Status CompactMemTableSynchronous();
Status TEST_CompactMemTable(); // wraps CompactMemTableSynchronous (historical)
// Return an internal iterator over the current state of the database. // Return an internal iterator over the current state of the database.
// The keys of this iterator are internal keys (see format.h). // The keys of this iterator are internal keys (see format.h).
@ -59,64 +70,82 @@ class DBImpl : public DB {
// file at a level >= 1. // file at a level >= 1.
int64_t TEST_MaxNextLevelOverlappingBytes(); int64_t TEST_MaxNextLevelOverlappingBytes();
// Record a sample of bytes read at the specified internal key. // These are routines that DBListImpl calls across all open databases
// Samples are taken approximately once every config::kReadBytesPeriod void ResizeCaches() {double_cache.ResizeCaches();};
// bytes. size_t GetCacheCapacity() {return(double_cache.GetCapacity(false));}
void RecordReadSample(Slice key); void PurgeExpiredFileCache() {double_cache.PurgeExpiredFiles();};
private: // in util/hot_backup.cc
void HotBackup();
bool PurgeWriteBuffer();
bool WriteBackupManifest();
bool CreateBackupLinks(Version * Version, Options & BackupOptions);
bool CopyLOGSegment(long FileEnd);
void HotBackupComplete();
void BackgroundCall2(Compaction * Compact);
void BackgroundImmCompactCall();
bool IsCompactionScheduled();
uint32_t RunningCompactionCount() {mutex_.AssertHeld(); return(running_compactions_);};
protected:
friend class DB; friend class DB;
struct CompactionState; struct CompactionState;
struct Writer; struct Writer;
Iterator* NewInternalIterator(const ReadOptions&, Iterator* NewInternalIterator(const ReadOptions&,
SequenceNumber* latest_snapshot, SequenceNumber* latest_snapshot);
uint32_t* seed);
Status NewDB(); Status NewDB();
// Recover the descriptor from persistent storage. May do a significant // Recover the descriptor from persistent storage. May do a significant
// amount of work to recover recently logged updates. Any changes to // amount of work to recover recently logged updates. Any changes to
// be made to the descriptor are added to *edit. // be made to the descriptor are added to *edit.
Status Recover(VersionEdit* edit, bool* save_manifest) Status Recover(VersionEdit* edit);
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Riak routine: pause DB::Open if too many compactions
// stacked up immediately. Happens in some repairs and
// some Riak upgrades
void CheckCompactionState();
void MaybeIgnoreError(Status* s) const; void MaybeIgnoreError(Status* s) const;
// Delete any unneeded files and stale in-memory entries. // Delete any unneeded files and stale in-memory entries.
void DeleteObsoleteFiles(); void DeleteObsoleteFiles();
void KeepOrDelete(const std::string & Filename, int level, const std::set<uint64_t> & Live);
// Compact the in-memory write buffer to disk. Switches to a new // Compact the in-memory write buffer to disk. Switches to a new
// log-file/memtable and writes a new descriptor iff successful. // log-file/memtable and writes a new descriptor iff successful.
// Errors are recorded in bg_error_. Status CompactMemTable();
void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status RecoverLogFile(uint64_t log_number, bool last_log, bool* save_manifest, Status RecoverLogFile(uint64_t log_number,
VersionEdit* edit, SequenceNumber* max_sequence) VersionEdit* edit,
EXCLUSIVE_LOCKS_REQUIRED(mutex_); SequenceNumber* max_sequence);
Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base) Status WriteLevel0Table(volatile MemTable* mem, VersionEdit* edit, Version* base);
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status MakeRoomForWrite(bool force /* TRUE forces memtable rotation to disk (for testing) */);
Status NewRecoveryLog(uint64_t NewLogNumber);
Status MakeRoomForWrite(bool force /* compact even if there is room? */)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
WriteBatch* BuildBatchGroup(Writer** last_writer); WriteBatch* BuildBatchGroup(Writer** last_writer);
void RecordBackgroundError(const Status& s); void MaybeScheduleCompaction();
void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_); Status BackgroundCompaction(Compaction * Compact=NULL);
static void BGWork(void* db); Status BackgroundExpiry(Compaction * Compact=NULL);
void BackgroundCall();
void BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
void CleanupCompaction(CompactionState* compact)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status DoCompactionWork(CompactionState* compact)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status OpenCompactionOutputFile(CompactionState* compact); void CleanupCompaction(CompactionState* compact);
Status DoCompactionWork(CompactionState* compact);
int64_t PrioritizeWork(bool IsLevel0);
Status OpenCompactionOutputFile(CompactionState* compact, size_t sample_value_size);
bool Send2PageCache(CompactionState * compact);
size_t MaybeRaiseBlockSize(Compaction & CompactionStuff, size_t SampleValueSize);
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
Status InstallCompactionResults(CompactionState* compact) Status InstallCompactionResults(CompactionState* compact);
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// initialized before options so its block_cache is available
class DoubleCache double_cache;
// Constant after construction // Constant after construction
Env* const env_; Env* const env_;
@ -130,20 +159,22 @@ class DBImpl : public DB {
// table_cache_ provides its own synchronization // table_cache_ provides its own synchronization
TableCache* table_cache_; TableCache* table_cache_;
// Lock over the persistent DB state. Non-NULL iff successfully acquired. // Lock over the persistent DB state. Non-NULL iff successfully acquired.
FileLock* db_lock_; FileLock* db_lock_;
// State below is protected by mutex_ // State below is protected by mutex_
port::Mutex mutex_; port::Mutex mutex_;
port::Mutex throttle_mutex_; // used by write throttle to force sequential waits on callers
port::AtomicPointer shutting_down_; port::AtomicPointer shutting_down_;
port::CondVar bg_cv_; // Signalled when background work finishes port::CondVar bg_cv_; // Signalled when background work finishes
MemTable* mem_; MemTable* mem_;
MemTable* imm_; // Memtable being compacted volatile MemTable* imm_; // Memtable being compacted
port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_ port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_
WritableFile* logfile_; WritableFile* logfile_;
uint64_t logfile_number_; uint64_t logfile_number_;
log::Writer* log_; log::Writer* log_;
uint32_t seed_; // For sampling.
// Queue of writers. // Queue of writers.
std::deque<Writer*> writers_; std::deque<Writer*> writers_;
@ -155,9 +186,6 @@ class DBImpl : public DB {
// part of ongoing compactions. // part of ongoing compactions.
std::set<uint64_t> pending_outputs_; std::set<uint64_t> pending_outputs_;
// Has a background compaction been scheduled or is running?
bool bg_compaction_scheduled_;
// Information for a manual compaction // Information for a manual compaction
struct ManualCompaction { struct ManualCompaction {
int level; int level;
@ -166,7 +194,7 @@ class DBImpl : public DB {
const InternalKey* end; // NULL means end of key range const InternalKey* end; // NULL means end of key range
InternalKey tmp_storage; // Used to keep track of compaction progress InternalKey tmp_storage; // Used to keep track of compaction progress
}; };
ManualCompaction* manual_compaction_; volatile ManualCompaction* manual_compaction_;
VersionSet* versions_; VersionSet* versions_;
@ -190,6 +218,18 @@ class DBImpl : public DB {
}; };
CompactionStats stats_[config::kNumLevels]; CompactionStats stats_[config::kNumLevels];
volatile uint64_t throttle_end;
volatile uint32_t running_compactions_;
volatile size_t current_block_size_; // last dynamic block size computed
volatile uint64_t block_size_changed_; // NowMicros() when block size computed
volatile uint64_t last_low_mem_; // NowMicros() when low memory last seen
// accessor to new, dynamic block_cache
Cache * block_cache() {return(double_cache.GetBlockCache());};
Cache * file_cache() {return(double_cache.GetFileCache());};
volatile bool hotbackup_pending_;
// No copying allowed // No copying allowed
DBImpl(const DBImpl&); DBImpl(const DBImpl&);
void operator=(const DBImpl&); void operator=(const DBImpl&);
@ -204,7 +244,8 @@ class DBImpl : public DB {
extern Options SanitizeOptions(const std::string& db, extern Options SanitizeOptions(const std::string& db,
const InternalKeyComparator* icmp, const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy, const InternalFilterPolicy* ipolicy,
const Options& src); const Options& src,
Cache * block_cache);
} // namespace leveldb } // namespace leveldb

View file

@ -5,14 +5,14 @@
#include "db/db_iter.h" #include "db/db_iter.h"
#include "db/filename.h" #include "db/filename.h"
#include "db/db_impl.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "leveldb/env.h" #include "leveldb/env.h"
#include "leveldb/expiry.h"
#include "leveldb/iterator.h" #include "leveldb/iterator.h"
#include "leveldb/perf_count.h"
#include "port/port.h" #include "port/port.h"
#include "util/logging.h" #include "util/logging.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/random.h"
namespace leveldb { namespace leveldb {
@ -48,18 +48,20 @@ class DBIter: public Iterator {
kReverse kReverse
}; };
DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s, DBIter(const std::string* dbname, Env* env,
uint32_t seed) const Comparator* cmp, Iterator* iter, SequenceNumber s,
: db_(db), const ExpiryModule * expiry)
: dbname_(dbname),
env_(env),
user_comparator_(cmp), user_comparator_(cmp),
iter_(iter), iter_(iter),
sequence_(s), sequence_(s),
direction_(kForward), direction_(kForward),
valid_(false), valid_(false),
rnd_(seed), expiry_(expiry) {
bytes_counter_(RandomPeriod()) {
} }
virtual ~DBIter() { virtual ~DBIter() {
gPerfCounters->Inc(ePerfIterDelete);
delete iter_; delete iter_;
} }
virtual bool Valid() const { return valid_; } virtual bool Valid() const { return valid_; }
@ -71,6 +73,26 @@ class DBIter: public Iterator {
assert(valid_); assert(valid_);
return (direction_ == kForward) ? iter_->value() : saved_value_; return (direction_ == kForward) ? iter_->value() : saved_value_;
} }
// Riak specific: expose the meta data (type, sequence, expiry) of the
// internal key the iterator is currently positioned on.
// REQUIRES: Valid() and forward iteration.
// (Reverse iteration is possible, it just needs code.)
// When the direction is not forward (only reachable with asserts
// compiled out) the previously stored keymetadata_ is returned untouched.
virtual KeyMetaData & keymetadata() const
{
  assert(valid_ && kForward==direction_);

  if (kForward!=direction_)
    return(keymetadata_);

  // Pre-set the fields: this clears a compiler warning, because
  // ParsedInternalKey deliberately skips initialization for performance.
  ParsedInternalKey current;
  current.type=kTypeValue;
  current.sequence=0;
  current.expiry=0;

  ParseInternalKey(iter_->key(), &current);

  keymetadata_.m_Type=current.type;
  keymetadata_.m_Sequence=current.sequence;
  keymetadata_.m_Expiry=current.expiry;

  return(keymetadata_);
}
virtual Status status() const { virtual Status status() const {
if (status_.ok()) { if (status_.ok()) {
return iter_->status(); return iter_->status();
@ -103,12 +125,8 @@ class DBIter: public Iterator {
} }
} }
// Pick next gap with average value of config::kReadBytesPeriod. const std::string* const dbname_;
ssize_t RandomPeriod() { Env* const env_;
return rnd_.Uniform(2*config::kReadBytesPeriod);
}
DBImpl* db_;
const Comparator* const user_comparator_; const Comparator* const user_comparator_;
Iterator* const iter_; Iterator* const iter_;
SequenceNumber const sequence_; SequenceNumber const sequence_;
@ -118,9 +136,7 @@ class DBIter: public Iterator {
std::string saved_value_; // == current raw value when direction_==kReverse std::string saved_value_; // == current raw value when direction_==kReverse
Direction direction_; Direction direction_;
bool valid_; bool valid_;
const ExpiryModule * expiry_;
Random rnd_;
ssize_t bytes_counter_;
// No copying allowed // No copying allowed
DBIter(const DBIter&); DBIter(const DBIter&);
@ -128,14 +144,7 @@ class DBIter: public Iterator {
}; };
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
Slice k = iter_->key(); if (!ParseInternalKey(iter_->key(), ikey)) {
ssize_t n = k.size() + iter_->value().size();
bytes_counter_ -= n;
while (bytes_counter_ < 0) {
bytes_counter_ += RandomPeriod();
db_->RecordReadSample(k);
}
if (!ParseInternalKey(k, ikey)) {
status_ = Status::Corruption("corrupted internal key in DBIter"); status_ = Status::Corruption("corrupted internal key in DBIter");
return false; return false;
} else { } else {
@ -146,6 +155,7 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
void DBIter::Next() { void DBIter::Next() {
assert(valid_); assert(valid_);
gPerfCounters->Inc(ePerfIterNext);
if (direction_ == kReverse) { // Switch directions? if (direction_ == kReverse) { // Switch directions?
direction_ = kForward; direction_ = kForward;
// iter_ is pointing just before the entries for this->key(), // iter_ is pointing just before the entries for this->key(),
@ -161,13 +171,12 @@ void DBIter::Next() {
saved_key_.clear(); saved_key_.clear();
return; return;
} }
// saved_key_ already contains the key to skip past.
} else {
// Store in saved_key_ the current key so we skip it below.
SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
} }
FindNextUserEntry(true, &saved_key_); // Temporarily use saved_key_ as storage for key to skip.
std::string* skip = &saved_key_;
SaveKey(ExtractUserKey(iter_->key()), skip);
FindNextUserEntry(true, skip);
} }
void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
@ -177,6 +186,9 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
do { do {
ParsedInternalKey ikey; ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) { if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
if (IsExpiryKey(ikey.type) && NULL!=expiry_
&& expiry_->KeyRetirementCallback(ikey))
ikey.type=kTypeDeletion;
switch (ikey.type) { switch (ikey.type) {
case kTypeDeletion: case kTypeDeletion:
// Arrange to skip all upcoming entries for this key since // Arrange to skip all upcoming entries for this key since
@ -184,6 +196,9 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
SaveKey(ikey.user_key, skip); SaveKey(ikey.user_key, skip);
skipping = true; skipping = true;
break; break;
case kTypeValueWriteTime:
case kTypeValueExplicitExpiry:
case kTypeValue: case kTypeValue:
if (skipping && if (skipping &&
user_comparator_->Compare(ikey.user_key, *skip) <= 0) { user_comparator_->Compare(ikey.user_key, *skip) <= 0) {
@ -205,6 +220,7 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
void DBIter::Prev() { void DBIter::Prev() {
assert(valid_); assert(valid_);
gPerfCounters->Inc(ePerfIterPrev);
if (direction_ == kForward) { // Switch directions? if (direction_ == kForward) { // Switch directions?
// iter_ is pointing at the current entry. Scan backwards until // iter_ is pointing at the current entry. Scan backwards until
// the key changes so we can use the normal reverse scanning code. // the key changes so we can use the normal reverse scanning code.
@ -242,6 +258,10 @@ void DBIter::FindPrevUserEntry() {
// We encountered a non-deleted value in entries for previous keys, // We encountered a non-deleted value in entries for previous keys,
break; break;
} }
if (IsExpiryKey(ikey.type) && NULL!=expiry_
&& expiry_->KeyRetirementCallback(ikey))
ikey.type=kTypeDeletion;
value_type = ikey.type; value_type = ikey.type;
if (value_type == kTypeDeletion) { if (value_type == kTypeDeletion) {
saved_key_.clear(); saved_key_.clear();
@ -272,11 +292,12 @@ void DBIter::FindPrevUserEntry() {
} }
void DBIter::Seek(const Slice& target) { void DBIter::Seek(const Slice& target) {
gPerfCounters->Inc(ePerfIterSeek);
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
saved_key_.clear(); saved_key_.clear();
AppendInternalKey( AppendInternalKey(
&saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); &saved_key_, ParsedInternalKey(target, 0, sequence_, kValueTypeForSeek));
iter_->Seek(saved_key_); iter_->Seek(saved_key_);
if (iter_->Valid()) { if (iter_->Valid()) {
FindNextUserEntry(false, &saved_key_ /* temporary storage */); FindNextUserEntry(false, &saved_key_ /* temporary storage */);
@ -286,6 +307,7 @@ void DBIter::Seek(const Slice& target) {
} }
void DBIter::SeekToFirst() { void DBIter::SeekToFirst() {
gPerfCounters->Inc(ePerfIterSeekFirst);
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
iter_->SeekToFirst(); iter_->SeekToFirst();
@ -297,6 +319,7 @@ void DBIter::SeekToFirst() {
} }
void DBIter::SeekToLast() { void DBIter::SeekToLast() {
gPerfCounters->Inc(ePerfIterSeekLast);
direction_ = kReverse; direction_ = kReverse;
ClearSavedValue(); ClearSavedValue();
iter_->SeekToLast(); iter_->SeekToLast();
@ -306,12 +329,13 @@ void DBIter::SeekToLast() {
} // anonymous namespace } // anonymous namespace
Iterator* NewDBIterator( Iterator* NewDBIterator(
DBImpl* db, const std::string* dbname,
Env* env,
const Comparator* user_key_comparator, const Comparator* user_key_comparator,
Iterator* internal_iter, Iterator* internal_iter,
SequenceNumber sequence, const SequenceNumber& sequence,
uint32_t seed) { const ExpiryModule * expiry) {
return new DBIter(db, user_key_comparator, internal_iter, sequence, seed); return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence, expiry);
} }
} // namespace leveldb } // namespace leveldb

View file

@ -7,21 +7,21 @@
#include <stdint.h> #include <stdint.h>
#include "leveldb/db.h" #include "leveldb/db.h"
#include "leveldb/expiry.h"
#include "db/dbformat.h" #include "db/dbformat.h"
namespace leveldb { namespace leveldb {
class DBImpl;
// Return a new iterator that converts internal keys (yielded by // Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number // "*internal_iter") that were live at the specified "sequence" number
// into appropriate user keys. // into appropriate user keys.
extern Iterator* NewDBIterator( extern Iterator* NewDBIterator(
DBImpl* db, const std::string* dbname,
Env* env,
const Comparator* user_key_comparator, const Comparator* user_key_comparator,
Iterator* internal_iter, Iterator* internal_iter,
SequenceNumber sequence, const SequenceNumber& sequence,
uint32_t seed); const ExpiryModule * expiry=NULL);
} // namespace leveldb } // namespace leveldb

View file

@ -33,11 +33,8 @@ class AtomicCounter {
public: public:
AtomicCounter() : count_(0) { } AtomicCounter() : count_(0) { }
void Increment() { void Increment() {
IncrementBy(1);
}
void IncrementBy(int count) {
MutexLock l(&mu_); MutexLock l(&mu_);
count_ += count; count_++;
} }
int Read() { int Read() {
MutexLock l(&mu_); MutexLock l(&mu_);
@ -48,20 +45,13 @@ class AtomicCounter {
count_ = 0; count_ = 0;
} }
}; };
void DelayMilliseconds(int millis) {
Env::Default()->SleepForMicroseconds(millis * 1000);
}
} }
// Special Env used to delay background operations // Special Env used to delay background operations
class SpecialEnv : public EnvWrapper { class SpecialEnv : public EnvWrapper {
public: public:
// sstable/log Sync() calls are blocked while this pointer is non-NULL. // sstable Sync() calls are blocked while this pointer is non-NULL.
port::AtomicPointer delay_data_sync_; port::AtomicPointer delay_sstable_sync_;
// sstable/log Sync() calls return an error.
port::AtomicPointer data_sync_error_;
// Simulate no-space errors while this pointer is non-NULL. // Simulate no-space errors while this pointer is non-NULL.
port::AtomicPointer no_space_; port::AtomicPointer no_space_;
@ -69,37 +59,30 @@ class SpecialEnv : public EnvWrapper {
// Simulate non-writable file system while this pointer is non-NULL // Simulate non-writable file system while this pointer is non-NULL
port::AtomicPointer non_writable_; port::AtomicPointer non_writable_;
// Force sync of manifest files to fail while this pointer is non-NULL
port::AtomicPointer manifest_sync_error_;
// Force write to manifest files to fail while this pointer is non-NULL
port::AtomicPointer manifest_write_error_;
bool count_random_reads_; bool count_random_reads_;
AtomicCounter random_read_counter_; AtomicCounter random_read_counter_;
AtomicCounter sleep_counter_;
explicit SpecialEnv(Env* base) : EnvWrapper(base) { explicit SpecialEnv(Env* base) : EnvWrapper(base) {
delay_data_sync_.Release_Store(NULL); delay_sstable_sync_.Release_Store(NULL);
data_sync_error_.Release_Store(NULL);
no_space_.Release_Store(NULL); no_space_.Release_Store(NULL);
non_writable_.Release_Store(NULL); non_writable_.Release_Store(NULL);
count_random_reads_ = false; count_random_reads_ = false;
manifest_sync_error_.Release_Store(NULL);
manifest_write_error_.Release_Store(NULL);
} }
Status NewWritableFile(const std::string& f, WritableFile** r) { Status NewWritableFile(const std::string& f, WritableFile** r, size_t map_size) {
class DataFile : public WritableFile { class SSTableFile : public WritableFile {
private: private:
SpecialEnv* env_; SpecialEnv* env_;
WritableFile* base_; WritableFile* base_;
public: public:
DataFile(SpecialEnv* env, WritableFile* base) SSTableFile(SpecialEnv* env, WritableFile* base)
: env_(env), : env_(env),
base_(base) { base_(base) {
} }
~DataFile() { delete base_; } ~SSTableFile() { delete base_; }
Status Append(const Slice& data) { Status Append(const Slice& data) {
if (env_->no_space_.Acquire_Load() != NULL) { if (env_->no_space_.Acquire_Load() != NULL) {
// Drop writes on the floor // Drop writes on the floor
@ -111,51 +94,21 @@ class SpecialEnv : public EnvWrapper {
Status Close() { return base_->Close(); } Status Close() { return base_->Close(); }
Status Flush() { return base_->Flush(); } Status Flush() { return base_->Flush(); }
Status Sync() { Status Sync() {
if (env_->data_sync_error_.Acquire_Load() != NULL) { while (env_->delay_sstable_sync_.Acquire_Load() != NULL) {
return Status::IOError("simulated data sync error"); env_->SleepForMicroseconds(100000);
}
while (env_->delay_data_sync_.Acquire_Load() != NULL) {
DelayMilliseconds(100);
} }
return base_->Sync(); return base_->Sync();
} }
}; };
class ManifestFile : public WritableFile {
private:
SpecialEnv* env_;
WritableFile* base_;
public:
ManifestFile(SpecialEnv* env, WritableFile* b) : env_(env), base_(b) { }
~ManifestFile() { delete base_; }
Status Append(const Slice& data) {
if (env_->manifest_write_error_.Acquire_Load() != NULL) {
return Status::IOError("simulated writer error");
} else {
return base_->Append(data);
}
}
Status Close() { return base_->Close(); }
Status Flush() { return base_->Flush(); }
Status Sync() {
if (env_->manifest_sync_error_.Acquire_Load() != NULL) {
return Status::IOError("simulated sync error");
} else {
return base_->Sync();
}
}
};
if (non_writable_.Acquire_Load() != NULL) { if (non_writable_.Acquire_Load() != NULL) {
return Status::IOError("simulated write error"); return Status::IOError("simulated write error");
} }
Status s = target()->NewWritableFile(f, r); Status s = target()->NewWritableFile(f, r, 2<<20);
if (s.ok()) { if (s.ok()) {
if (strstr(f.c_str(), ".ldb") != NULL || if (strstr(f.c_str(), ".sst") != NULL) {
strstr(f.c_str(), ".log") != NULL) { *r = new SSTableFile(this, *r);
*r = new DataFile(this, *r);
} else if (strstr(f.c_str(), "MANIFEST") != NULL) {
*r = new ManifestFile(this, *r);
} }
} }
return s; return s;
@ -184,6 +137,11 @@ class SpecialEnv : public EnvWrapper {
} }
return s; return s;
} }
// Env override that counts sleep requests before forwarding them.
// Every call bumps sleep_counter_ by one and then delegates the actual
// sleep of `micros` microseconds to the wrapped target() environment.
virtual void SleepForMicroseconds(int micros) {
// Count first, so the request is recorded before the delegated sleep runs.
sleep_counter_.Increment();
target()->SleepForMicroseconds(micros);
}
}; };
class DBTest { class DBTest {
@ -193,7 +151,6 @@ class DBTest {
// Sequence of option configurations to try // Sequence of option configurations to try
enum OptionConfig { enum OptionConfig {
kDefault, kDefault,
kReuse,
kFilter, kFilter,
kUncompressed, kUncompressed,
kEnd kEnd
@ -209,7 +166,7 @@ class DBTest {
DBTest() : option_config_(kDefault), DBTest() : option_config_(kDefault),
env_(new SpecialEnv(Env::Default())) { env_(new SpecialEnv(Env::Default())) {
filter_policy_ = NewBloomFilterPolicy(10); filter_policy_ = NewBloomFilterPolicy2(16);
dbname_ = test::TmpDir() + "/db_test"; dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, Options()); DestroyDB(dbname_, Options());
db_ = NULL; db_ = NULL;
@ -238,11 +195,7 @@ class DBTest {
// Return the current option configuration. // Return the current option configuration.
Options CurrentOptions() { Options CurrentOptions() {
Options options; Options options;
options.reuse_logs = false;
switch (option_config_) { switch (option_config_) {
case kReuse:
options.reuse_logs = true;
break;
case kFilter: case kFilter:
options.filter_policy = filter_policy_; options.filter_policy = filter_policy_;
break; break;
@ -290,6 +243,23 @@ class DBTest {
return DB::Open(opts, dbname_, &db_); return DB::Open(opts, dbname_, &db_);
} }
// Opens the test database and then tries to open the same path a second
// time, returning the Status of that SECOND DB::Open call.
//
// options: configuration for the first open. When NULL, CurrentOptions()
//          is used with create_if_missing enabled. The second open always
//          uses default-constructed Options.
//
// The first handle is stored in db_ (any previous handle is destroyed) and
// the options used are remembered in last_options_. The status of the first
// open is deliberately not returned: callers inspect the second attempt.
Status DoubleOpen(Options* options = NULL) {
  // Start as NULL so the pointer is never read uninitialized.
  DB * db_fail = NULL;
  delete db_;
  db_ = NULL;
  Options opts, opts2;
  if (options != NULL) {
    opts = *options;
  } else {
    opts = CurrentOptions();
    opts.create_if_missing = true;
  }
  last_options_ = opts;
  DB::Open(opts, dbname_, &db_);

  Status second = DB::Open(opts2, dbname_, &db_fail);
  if (second.ok()) {
    // Unexpected success: release the extra handle instead of leaking it
    // (and whatever it holds open) for the rest of the test run.
    delete db_fail;
  }
  return second;
}
Status Put(const std::string& k, const std::string& v) { Status Put(const std::string& k, const std::string& v) {
return db_->Put(WriteOptions(), k, v); return db_->Put(WriteOptions(), k, v);
} }
@ -311,6 +281,20 @@ class DBTest {
return result; return result;
} }
std::string GetNoCache(const std::string& k, const Snapshot* snapshot = NULL) {
ReadOptions options;
options.snapshot = snapshot;
options.fill_cache=false;
std::string result;
Status s = db_->Get(options, k, &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}
// Return a string that contains all key,value pairs in order, // Return a string that contains all key,value pairs in order,
// formatted like "(k1->v1)(k2->v2)". // formatted like "(k1->v1)(k2->v2)".
std::string Contents() { std::string Contents() {
@ -326,7 +310,7 @@ class DBTest {
} }
// Check reverse iteration results are the reverse of forward results // Check reverse iteration results are the reverse of forward results
size_t matched = 0; int matched = 0;
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
ASSERT_LT(matched, forward.size()); ASSERT_LT(matched, forward.size());
ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]); ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
@ -340,7 +324,7 @@ class DBTest {
std::string AllEntriesFor(const Slice& user_key) { std::string AllEntriesFor(const Slice& user_key) {
Iterator* iter = dbfull()->TEST_NewInternalIterator(); Iterator* iter = dbfull()->TEST_NewInternalIterator();
InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); InternalKey target(user_key, 0, kMaxSequenceNumber, kTypeValue);
iter->Seek(target.Encode()); iter->Seek(target.Encode());
std::string result; std::string result;
if (!iter->status().ok()) { if (!iter->status().ok()) {
@ -361,6 +345,8 @@ class DBTest {
} }
first = false; first = false;
switch (ikey.type) { switch (ikey.type) {
case kTypeValueWriteTime:
case kTypeValueExplicitExpiry:
case kTypeValue: case kTypeValue:
result += iter->value().ToString(); result += iter->value().ToString();
break; break;
@ -474,38 +460,6 @@ class DBTest {
} }
return result; return result;
} }
bool DeleteAnSSTFile() {
std::vector<std::string> filenames;
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
ASSERT_OK(env_->DeleteFile(TableFileName(dbname_, number)));
return true;
}
}
return false;
}
// Returns number of files renamed.
int RenameLDBToSST() {
std::vector<std::string> filenames;
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
uint64_t number;
FileType type;
int files_renamed = 0;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
const std::string from = TableFileName(dbname_, number);
const std::string to = SSTTableFileName(dbname_, number);
ASSERT_OK(env_->RenameFile(from, to));
files_renamed++;
}
}
return files_renamed;
}
}; };
TEST(DBTest, Empty) { TEST(DBTest, Empty) {
@ -515,6 +469,11 @@ TEST(DBTest, Empty) {
} while (ChangeOptions()); } while (ChangeOptions());
} }
TEST(DBTest, DoubleOpen)
{
ASSERT_NOTOK(DoubleOpen());
}
TEST(DBTest, ReadWrite) { TEST(DBTest, ReadWrite) {
do { do {
ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("foo", "v1"));
@ -547,11 +506,11 @@ TEST(DBTest, GetFromImmutableLayer) {
ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("foo"));
env_->delay_data_sync_.Release_Store(env_); // Block sync calls env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls
Put("k1", std::string(100000, 'x')); // Fill memtable Put("k1", std::string(100000, 'x')); // Fill memtable
Put("k2", std::string(100000, 'y')); // Trigger compaction Put("k2", std::string(100000, 'y')); // Trigger compaction
ASSERT_EQ("v1", Get("foo")); ASSERT_EQ("v1", Get("foo"));
env_->delay_data_sync_.Release_Store(NULL); // Release sync calls env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls
} while (ChangeOptions()); } while (ChangeOptions());
} }
@ -563,17 +522,6 @@ TEST(DBTest, GetFromVersions) {
} while (ChangeOptions()); } while (ChangeOptions());
} }
TEST(DBTest, GetMemUsage) {
do {
ASSERT_OK(Put("foo", "v1"));
std::string val;
ASSERT_TRUE(db_->GetProperty("leveldb.approximate-memory-usage", &val));
int mem_usage = atoi(val.c_str());
ASSERT_GT(mem_usage, 0);
ASSERT_LT(mem_usage, 5*1024*1024);
} while (ChangeOptions());
}
TEST(DBTest, GetSnapshot) { TEST(DBTest, GetSnapshot) {
do { do {
// Try with both a short key and a long key // Try with both a short key and a long key
@ -634,6 +582,9 @@ TEST(DBTest, GetPicksCorrectFile) {
} while (ChangeOptions()); } while (ChangeOptions());
} }
#if 0
// riak does not execute compaction due to reads
TEST(DBTest, GetEncountersEmptyLevel) { TEST(DBTest, GetEncountersEmptyLevel) {
do { do {
// Arrange for the following to happen: // Arrange for the following to happen:
@ -642,7 +593,7 @@ TEST(DBTest, GetEncountersEmptyLevel) {
// * sstable B in level 2 // * sstable B in level 2
// Then do enough Get() calls to arrange for an automatic compaction // Then do enough Get() calls to arrange for an automatic compaction
// of sstable A. A bug would cause the compaction to be marked as // of sstable A. A bug would cause the compaction to be marked as
// occurring at level 1 (instead of the correct level 0). // occuring at level 1 (instead of the correct level 0).
// Step 1: First place sstables in levels 0 and 2 // Step 1: First place sstables in levels 0 and 2
int compaction_count = 0; int compaction_count = 0;
@ -667,11 +618,12 @@ TEST(DBTest, GetEncountersEmptyLevel) {
} }
// Step 4: Wait for compaction to finish // Step 4: Wait for compaction to finish
DelayMilliseconds(1000); env_->SleepForMicroseconds(1000000);
ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(0), 0);
} while (ChangeOptions()); } while (ChangeOptions());
} }
#endif
TEST(DBTest, IterEmpty) { TEST(DBTest, IterEmpty) {
Iterator* iter = db_->NewIterator(ReadOptions()); Iterator* iter = db_->NewIterator(ReadOptions());
@ -996,7 +948,8 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
dbfull()->TEST_CompactRange(0, NULL, NULL); dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GT(NumTableFilesAtLevel(1), 1); // not riak ASSERT_GT(NumTableFilesAtLevel(1), 1);
ASSERT_EQ(NumTableFilesAtLevel(1), 1); // yes riak
for (int i = 0; i < 80; i++) { for (int i = 0; i < 80; i++) {
ASSERT_EQ(Get(Key(i)), values[i]); ASSERT_EQ(Get(Key(i)), values[i]);
} }
@ -1010,7 +963,8 @@ TEST(DBTest, RepeatedWritesToSameKey) {
// We must have at most one file per level except for level-0, // We must have at most one file per level except for level-0,
// which may have up to kL0_StopWritesTrigger files. // which may have up to kL0_StopWritesTrigger files.
const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger; // ... basho adds *2 since level-1 is now overlapped too
const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger*2;
Random rnd(301); Random rnd(301);
std::string value = RandomString(&rnd, 2 * options.write_buffer_size); std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
@ -1054,11 +1008,13 @@ TEST(DBTest, SparseMerge) {
// Compactions should not cause us to create a situation where // Compactions should not cause us to create a situation where
// a file overlaps too much data at the next level. // a file overlaps too much data at the next level.
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); // 07/10/14 matthewv - we overlap first two levels. sparse test not appropriate there,
// and we set overlaps into 100s of megabytes as "normal"
// ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
dbfull()->TEST_CompactRange(0, NULL, NULL); dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); // ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
dbfull()->TEST_CompactRange(1, NULL, NULL); dbfull()->TEST_CompactRange(1, NULL, NULL);
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); // ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
} }
static bool Between(uint64_t val, uint64_t low, uint64_t high) { static bool Between(uint64_t val, uint64_t low, uint64_t high) {
@ -1096,14 +1052,6 @@ TEST(DBTest, ApproximateSizes) {
// 0 because GetApproximateSizes() does not account for memtable space // 0 because GetApproximateSizes() does not account for memtable space
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
if (options.reuse_logs) {
// Recovery will reuse memtable, and GetApproximateSizes() does not
// account for memtable usage;
Reopen(&options);
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
continue;
}
// Check sizes across recovery by reopening a few times // Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) { for (int run = 0; run < 3; run++) {
Reopen(&options); Reopen(&options);
@ -1147,11 +1095,6 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
if (options.reuse_logs) {
// Need to force a memtable compaction since recovery does not do so.
ASSERT_OK(dbfull()->TEST_CompactMemTable());
}
// Check sizes across recovery by reopening a few times // Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) { for (int run = 0; run < 3; run++) {
Reopen(&options); Reopen(&options);
@ -1223,7 +1166,7 @@ TEST(DBTest, Snapshot) {
ASSERT_EQ("v4", Get("foo")); ASSERT_EQ("v4", Get("foo"));
} while (ChangeOptions()); } while (ChangeOptions());
} }
#if 0 // trouble under Riak due to assumed file sizes
TEST(DBTest, HiddenValuesAreRemoved) { TEST(DBTest, HiddenValuesAreRemoved) {
do { do {
Random rnd(301); Random rnd(301);
@ -1254,7 +1197,7 @@ TEST(DBTest, HiddenValuesAreRemoved) {
ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
} while (ChangeOptions()); } while (ChangeOptions());
} }
#endif
TEST(DBTest, DeletionMarkers1) { TEST(DBTest, DeletionMarkers1) {
Put("foo", "v1"); Put("foo", "v1");
ASSERT_OK(dbfull()->TEST_CompactMemTable()); ASSERT_OK(dbfull()->TEST_CompactMemTable());
@ -1271,13 +1214,14 @@ TEST(DBTest, DeletionMarkers1) {
Delete("foo"); Delete("foo");
Put("foo", "v2"); Put("foo", "v2");
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 ASSERT_OK(dbfull()->TEST_CompactMemTable()); // stays at level 0
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // riak 1.3, DEL merged out by BuildTable
Slice z("z"); Slice z("z");
dbfull()->TEST_CompactRange(last-2, NULL, &z); dbfull()->TEST_CompactRange(0, NULL, &z);
dbfull()->TEST_CompactRange(1, NULL, &z);
// DEL eliminated, but v1 remains because we aren't compacting that level // DEL eliminated, but v1 remains because we aren't compacting that level
// (DEL can be eliminated because v2 hides v1). // (DEL can be eliminated because v2 hides v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // Riak 1.4 has merged to level 1
dbfull()->TEST_CompactRange(last-1, NULL, NULL); dbfull()->TEST_CompactRange(last-1, NULL, NULL);
// Merging last-1 w/ last, so we are the base level for "foo", so // Merging last-1 w/ last, so we are the base level for "foo", so
// DEL is removed. (as is v1). // DEL is removed. (as is v1).
@ -1289,39 +1233,47 @@ TEST(DBTest, DeletionMarkers2) {
ASSERT_OK(dbfull()->TEST_CompactMemTable()); ASSERT_OK(dbfull()->TEST_CompactMemTable());
const int last = config::kMaxMemCompactLevel; const int last = config::kMaxMemCompactLevel;
ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
ASSERT_EQ(NumTableFilesAtLevel(last-1), 0);
// Place a table at level last-1 to prevent merging with preceding mutation // Place a table at level last-1 to prevent merging with preceding mutation
Put("a", "begin"); Put("a", "begin");
Put("z", "end"); Put("z", "end");
dbfull()->TEST_CompactMemTable(); dbfull()->TEST_CompactMemTable(); // goes to last-1
ASSERT_EQ(NumTableFilesAtLevel(last), 1);
ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
Delete("foo"); Delete("foo");
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level 0
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(last-2, NULL, NULL); dbfull()->TEST_CompactRange(0, NULL, NULL); // Riak overlaps level 1
// DEL kept: "last" file overlaps // DEL kept: "last" file overlaps
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
// Merging last-1 w/ last, so we are the base level for "foo", so // Merging last-1 w/ last, so we are the base level for "foo", so
// DEL is removed. (as is v1). // DEL is removed. (as is v1).
dbfull()->TEST_CompactRange(1, NULL, NULL);
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(2, NULL, NULL);
ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
} }
TEST(DBTest, OverlapInLevel0) { TEST(DBTest, OverlapInLevel0) {
do { do {
ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config"; ASSERT_EQ(config::kMaxMemCompactLevel, 3) << "Fix test to match config";
// Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
ASSERT_OK(Put("100", "v100")); ASSERT_OK(Put("100", "v100"));
ASSERT_OK(Put("999", "v999")); ASSERT_OK(Put("999", "v999"));
dbfull()->TEST_CompactMemTable(); dbfull()->TEST_CompactMemTable();
dbfull()->TEST_CompactRange(0, NULL, NULL);
dbfull()->TEST_CompactRange(1, NULL, NULL);
ASSERT_OK(Delete("100")); ASSERT_OK(Delete("100"));
ASSERT_OK(Delete("999")); ASSERT_OK(Delete("999"));
dbfull()->TEST_CompactMemTable(); dbfull()->TEST_CompactMemTable();
ASSERT_EQ("0,1,1", FilesPerLevel()); dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_EQ("0,0,1,1", FilesPerLevel());
// Make files spanning the following ranges in level-0: // Make files spanning the following ranges in level-0:
// files[0] 200 .. 900 // files[0] 200 .. 900
@ -1334,7 +1286,7 @@ TEST(DBTest, OverlapInLevel0) {
ASSERT_OK(Put("600", "v600")); ASSERT_OK(Put("600", "v600"));
ASSERT_OK(Put("900", "v900")); ASSERT_OK(Put("900", "v900"));
dbfull()->TEST_CompactMemTable(); dbfull()->TEST_CompactMemTable();
ASSERT_EQ("2,1,1", FilesPerLevel()); ASSERT_EQ("2,0,1,1", FilesPerLevel());
// Compact away the placeholder files we created initially // Compact away the placeholder files we created initially
dbfull()->TEST_CompactRange(1, NULL, NULL); dbfull()->TEST_CompactRange(1, NULL, NULL);
@ -1364,7 +1316,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_a) {
Reopen(); Reopen();
Reopen(); Reopen();
ASSERT_EQ("(a->v)", Contents()); ASSERT_EQ("(a->v)", Contents());
DelayMilliseconds(1000); // Wait for compaction to finish env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
ASSERT_EQ("(a->v)", Contents()); ASSERT_EQ("(a->v)", Contents());
} }
@ -1380,7 +1332,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
Put("",""); Put("","");
Reopen(); Reopen();
Put("",""); Put("","");
DelayMilliseconds(1000); // Wait for compaction to finish env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
Reopen(); Reopen();
Put("d","dv"); Put("d","dv");
Reopen(); Reopen();
@ -1390,7 +1342,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
Delete("b"); Delete("b");
Reopen(); Reopen();
ASSERT_EQ("(->)(c->cv)", Contents()); ASSERT_EQ("(->)(c->cv)", Contents());
DelayMilliseconds(1000); // Wait for compaction to finish env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
ASSERT_EQ("(->)(c->cv)", Contents()); ASSERT_EQ("(->)(c->cv)", Contents());
} }
@ -1473,37 +1425,37 @@ TEST(DBTest, CustomComparator) {
} }
TEST(DBTest, ManualCompaction) { TEST(DBTest, ManualCompaction) {
ASSERT_EQ(config::kMaxMemCompactLevel, 2) ASSERT_EQ(config::kMaxMemCompactLevel, 3)
<< "Need to update this test to match kMaxMemCompactLevel"; << "Need to update this test to match kMaxMemCompactLevel";
MakeTables(3, "p", "q"); MakeTables(3, "p", "q");
ASSERT_EQ("1,1,1", FilesPerLevel()); ASSERT_EQ("1,0,1,1", FilesPerLevel());
// Compaction range falls before files // Compaction range falls before files
Compact("", "c"); Compact("", "c");
ASSERT_EQ("1,1,1", FilesPerLevel()); ASSERT_EQ("0,1,1,1", FilesPerLevel());
// Compaction range falls after files // Compaction range falls after files
Compact("r", "z"); Compact("r", "z");
ASSERT_EQ("1,1,1", FilesPerLevel()); ASSERT_EQ("0,1,1,1", FilesPerLevel());
// Compaction range overlaps files // Compaction range overlaps files
Compact("p1", "p9"); Compact("p1", "p9");
ASSERT_EQ("0,0,1", FilesPerLevel()); ASSERT_EQ("0,0,0,1", FilesPerLevel());
// Populate a different range // Populate a different range
MakeTables(3, "c", "e"); MakeTables(3, "c", "e");
ASSERT_EQ("1,1,2", FilesPerLevel()); ASSERT_EQ("1,0,1,2", FilesPerLevel());
// Compact just the new range // Compact just the new range
Compact("b", "f"); Compact("b", "f");
ASSERT_EQ("0,0,2", FilesPerLevel()); ASSERT_EQ("0,0,0,2", FilesPerLevel());
// Compact all // Compact all
MakeTables(1, "a", "z"); MakeTables(1, "a", "z");
ASSERT_EQ("0,1,2", FilesPerLevel()); ASSERT_EQ("0,0,1,2", FilesPerLevel());
db_->CompactRange(NULL, NULL); db_->CompactRange(NULL, NULL);
ASSERT_EQ("0,0,1", FilesPerLevel()); ASSERT_EQ("0,0,0,1", FilesPerLevel());
} }
TEST(DBTest, DBOpen_Options) { TEST(DBTest, DBOpen_Options) {
@ -1545,12 +1497,6 @@ TEST(DBTest, DBOpen_Options) {
db = NULL; db = NULL;
} }
TEST(DBTest, Locking) {
DB* db2 = NULL;
Status s = DB::Open(CurrentOptions(), dbname_, &db2);
ASSERT_TRUE(!s.ok()) << "Locking did not prevent re-opening db";
}
// Check that number of files does not grow when we are out of space // Check that number of files does not grow when we are out of space
TEST(DBTest, NoSpace) { TEST(DBTest, NoSpace) {
Options options = CurrentOptions(); Options options = CurrentOptions();
@ -1562,15 +1508,19 @@ TEST(DBTest, NoSpace) {
Compact("a", "z"); Compact("a", "z");
const int num_files = CountFiles(); const int num_files = CountFiles();
env_->no_space_.Release_Store(env_); // Force out-of-space errors env_->no_space_.Release_Store(env_); // Force out-of-space errors
for (int i = 0; i < 10; i++) { env_->sleep_counter_.Reset();
for (int i = 0; i < 5; i++) {
for (int level = 0; level < config::kNumLevels-1; level++) { for (int level = 0; level < config::kNumLevels-1; level++) {
dbfull()->TEST_CompactRange(level, NULL, NULL); dbfull()->TEST_CompactRange(level, NULL, NULL);
} }
} }
env_->no_space_.Release_Store(NULL); env_->no_space_.Release_Store(NULL);
ASSERT_LT(CountFiles(), num_files + 3); ASSERT_LT(CountFiles(), num_files + 3);
}
// Check that compaction attempts slept after errors
ASSERT_GE(env_->sleep_counter_.Read(), 5);
}
#if 0
TEST(DBTest, NonWritableFileSystem) { TEST(DBTest, NonWritableFileSystem) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.write_buffer_size = 1000; options.write_buffer_size = 1000;
@ -1584,119 +1534,13 @@ TEST(DBTest, NonWritableFileSystem) {
fprintf(stderr, "iter %d; errors %d\n", i, errors); fprintf(stderr, "iter %d; errors %d\n", i, errors);
if (!Put("foo", big).ok()) { if (!Put("foo", big).ok()) {
errors++; errors++;
DelayMilliseconds(100); env_->SleepForMicroseconds(100000);
} }
} }
ASSERT_GT(errors, 0); ASSERT_GT(errors, 0);
env_->non_writable_.Release_Store(NULL); env_->non_writable_.Release_Store(NULL);
} }
#endif
TEST(DBTest, WriteSyncError) {
// Check that log sync errors cause the DB to disallow future writes.
// (a) Cause log sync calls to fail
Options options = CurrentOptions();
options.env = env_;
Reopen(&options);
env_->data_sync_error_.Release_Store(env_);
// (b) Normal write should succeed
WriteOptions w;
ASSERT_OK(db_->Put(w, "k1", "v1"));
ASSERT_EQ("v1", Get("k1"));
// (c) Do a sync write; should fail
w.sync = true;
ASSERT_TRUE(!db_->Put(w, "k2", "v2").ok());
ASSERT_EQ("v1", Get("k1"));
ASSERT_EQ("NOT_FOUND", Get("k2"));
// (d) make sync behave normally
env_->data_sync_error_.Release_Store(NULL);
// (e) Do a non-sync write; should fail
w.sync = false;
ASSERT_TRUE(!db_->Put(w, "k3", "v3").ok());
ASSERT_EQ("v1", Get("k1"));
ASSERT_EQ("NOT_FOUND", Get("k2"));
ASSERT_EQ("NOT_FOUND", Get("k3"));
}
TEST(DBTest, ManifestWriteError) {
// Test for the following problem:
// (a) Compaction produces file F
// (b) Log record containing F is written to MANIFEST file, but Sync() fails
// (c) GC deletes F
// (d) After reopening DB, reads fail since deleted F is named in log record
// We iterate twice. In the second iteration, everything is the
// same except the log record never makes it to the MANIFEST file.
for (int iter = 0; iter < 2; iter++) {
port::AtomicPointer* error_type = (iter == 0)
? &env_->manifest_sync_error_
: &env_->manifest_write_error_;
// Insert foo=>bar mapping
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
options.error_if_exists = false;
DestroyAndReopen(&options);
ASSERT_OK(Put("foo", "bar"));
ASSERT_EQ("bar", Get("foo"));
// Memtable compaction (will succeed)
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("bar", Get("foo"));
const int last = config::kMaxMemCompactLevel;
ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level
// Merging compaction (will fail)
error_type->Release_Store(env_);
dbfull()->TEST_CompactRange(last, NULL, NULL); // Should fail
ASSERT_EQ("bar", Get("foo"));
// Recovery: should not lose data
error_type->Release_Store(NULL);
Reopen(&options);
ASSERT_EQ("bar", Get("foo"));
}
}
TEST(DBTest, MissingSSTFile) {
ASSERT_OK(Put("foo", "bar"));
ASSERT_EQ("bar", Get("foo"));
// Dump the memtable to disk.
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("bar", Get("foo"));
Close();
ASSERT_TRUE(DeleteAnSSTFile());
Options options = CurrentOptions();
options.paranoid_checks = true;
Status s = TryReopen(&options);
ASSERT_TRUE(!s.ok());
ASSERT_TRUE(s.ToString().find("issing") != std::string::npos)
<< s.ToString();
}
TEST(DBTest, StillReadSST) {
ASSERT_OK(Put("foo", "bar"));
ASSERT_EQ("bar", Get("foo"));
// Dump the memtable to disk.
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("bar", Get("foo"));
Close();
ASSERT_GT(RenameLDBToSST(), 0);
Options options = CurrentOptions();
options.paranoid_checks = true;
Status s = TryReopen(&options);
ASSERT_TRUE(s.ok());
ASSERT_EQ("bar", Get("foo"));
}
TEST(DBTest, FilesDeletedAfterCompaction) { TEST(DBTest, FilesDeletedAfterCompaction) {
ASSERT_OK(Put("foo", "v2")); ASSERT_OK(Put("foo", "v2"));
Compact("a", "z"); Compact("a", "z");
@ -1713,7 +1557,7 @@ TEST(DBTest, BloomFilter) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.env = env_; options.env = env_;
options.block_cache = NewLRUCache(0); // Prevent cache hits options.block_cache = NewLRUCache(0); // Prevent cache hits
options.filter_policy = NewBloomFilterPolicy(10); options.filter_policy = NewBloomFilterPolicy2(16);
Reopen(&options); Reopen(&options);
// Populate multiple layers // Populate multiple layers
@ -1728,12 +1572,12 @@ TEST(DBTest, BloomFilter) {
dbfull()->TEST_CompactMemTable(); dbfull()->TEST_CompactMemTable();
// Prevent auto compactions triggered by seeks // Prevent auto compactions triggered by seeks
env_->delay_data_sync_.Release_Store(env_); env_->delay_sstable_sync_.Release_Store(env_);
// Lookup present keys. Should rarely read from small sstable. // Lookup present keys. Should rarely read from small sstable.
env_->random_read_counter_.Reset(); env_->random_read_counter_.Reset();
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
ASSERT_EQ(Key(i), Get(Key(i))); ASSERT_EQ(Key(i), GetNoCache(Key(i)));
} }
int reads = env_->random_read_counter_.Read(); int reads = env_->random_read_counter_.Read();
fprintf(stderr, "%d present => %d reads\n", N, reads); fprintf(stderr, "%d present => %d reads\n", N, reads);
@ -1743,13 +1587,13 @@ TEST(DBTest, BloomFilter) {
// Lookup present keys. Should rarely read from either sstable. // Lookup present keys. Should rarely read from either sstable.
env_->random_read_counter_.Reset(); env_->random_read_counter_.Reset();
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing")); ASSERT_EQ("NOT_FOUND", GetNoCache(Key(i) + ".missing"));
} }
reads = env_->random_read_counter_.Read(); reads = env_->random_read_counter_.Read();
fprintf(stderr, "%d missing => %d reads\n", N, reads); fprintf(stderr, "%d missing => %d reads\n", N, reads);
ASSERT_LE(reads, 3*N/100); ASSERT_LE(reads, 3*N/100);
env_->delay_data_sync_.Release_Store(NULL); env_->delay_sstable_sync_.Release_Store(NULL);
Close(); Close();
delete options.block_cache; delete options.block_cache;
delete options.filter_policy; delete options.filter_policy;
@ -1809,7 +1653,7 @@ static void MTThreadBody(void* arg) {
ASSERT_EQ(k, key); ASSERT_EQ(k, key);
ASSERT_GE(w, 0); ASSERT_GE(w, 0);
ASSERT_LT(w, kNumThreads); ASSERT_LT(w, kNumThreads);
ASSERT_LE(static_cast<uintptr_t>(c), reinterpret_cast<uintptr_t>( ASSERT_LE(c, reinterpret_cast<uintptr_t>(
t->state->counter[w].Acquire_Load())); t->state->counter[w].Acquire_Load()));
} }
} }
@ -1834,27 +1678,35 @@ TEST(DBTest, MultiThreaded) {
// Start threads // Start threads
MTThread thread[kNumThreads]; MTThread thread[kNumThreads];
pthread_t tid;
for (int id = 0; id < kNumThreads; id++) { for (int id = 0; id < kNumThreads; id++) {
thread[id].state = &mt; thread[id].state = &mt;
thread[id].id = id; thread[id].id = id;
env_->StartThread(MTThreadBody, &thread[id]); tid=env_->StartThread(MTThreadBody, &thread[id]);
pthread_detach(tid);
} }
// Let them run for a while // Let them run for a while
DelayMilliseconds(kTestSeconds * 1000); env_->SleepForMicroseconds(kTestSeconds * 1000000);
// Stop the threads and wait for them to finish // Stop the threads and wait for them to finish
mt.stop.Release_Store(&mt); mt.stop.Release_Store(&mt);
for (int id = 0; id < kNumThreads; id++) { for (int id = 0; id < kNumThreads; id++) {
while (mt.thread_done[id].Acquire_Load() == NULL) { while (mt.thread_done[id].Acquire_Load() == NULL) {
DelayMilliseconds(100); env_->SleepForMicroseconds(100000);
} }
} }
} while (ChangeOptions()); } while (ChangeOptions());
} }
namespace { namespace {
typedef std::map<std::string, std::string> KVMap; struct KVEntry
{
std::string m_Value;
KeyMetaData m_Meta;
};
typedef std::map<std::string, KVEntry> KVMap;
} }
class ModelDB: public DB { class ModelDB: public DB {
@ -1866,14 +1718,21 @@ class ModelDB: public DB {
explicit ModelDB(const Options& options): options_(options) { } explicit ModelDB(const Options& options): options_(options) { }
~ModelDB() { } ~ModelDB() { }
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v, const KeyMetaData * meta=NULL) {
return DB::Put(o, k, v); return DB::Put(o, k, v, meta);
} }
virtual Status Delete(const WriteOptions& o, const Slice& key) { virtual Status Delete(const WriteOptions& o, const Slice& key) {
return DB::Delete(o, key); return DB::Delete(o, key);
} }
virtual Status Get(const ReadOptions& options, virtual Status Get(const ReadOptions& options,
const Slice& key, std::string* value) { const Slice& key, std::string* value,
KeyMetaData * meta = NULL) {
assert(false); // Not implemented
return Status::NotFound(key);
}
virtual Status Get(const ReadOptions& options,
const Slice& key, Value* value,
KeyMetaData * meta = NULL) {
assert(false); // Not implemented assert(false); // Not implemented
return Status::NotFound(key); return Status::NotFound(key);
} }
@ -1901,8 +1760,13 @@ class ModelDB: public DB {
class Handler : public WriteBatch::Handler { class Handler : public WriteBatch::Handler {
public: public:
KVMap* map_; KVMap* map_;
virtual void Put(const Slice& key, const Slice& value) { virtual void Put(const Slice& key, const Slice& value,
(*map_)[key.ToString()] = value.ToString(); const ValueType & type, const ExpiryTimeMicros & expiry) {
KVEntry ent;
ent.m_Value=value.ToString();
ent.m_Meta.m_Type=type;
ent.m_Meta.m_Expiry=expiry;
(*map_)[key.ToString()] = ent;
} }
virtual void Delete(const Slice& key) { virtual void Delete(const Slice& key) {
map_->erase(key.ToString()); map_->erase(key.ToString());
@ -1948,7 +1812,7 @@ class ModelDB: public DB {
virtual void Next() { ++iter_; } virtual void Next() { ++iter_; }
virtual void Prev() { --iter_; } virtual void Prev() { --iter_; }
virtual Slice key() const { return iter_->first; } virtual Slice key() const { return iter_->first; }
virtual Slice value() const { return iter_->second; } virtual Slice value() const { return iter_->second.m_Value; }
virtual Status status() const { return Status::OK(); } virtual Status status() const { return Status::OK(); }
private: private:
const KVMap* const map_; const KVMap* const map_;
@ -2085,6 +1949,44 @@ TEST(DBTest, Randomized) {
} while (ChangeOptions()); } while (ChangeOptions());
} }
class SimpleBugs
{
// need a class for the test harness
};
TEST(SimpleBugs, TieredRecoveryLog)
{
// DB::Open created first recovery log directly
// which lead to it NOT being in tiered storage location.
// nope std::string dbname = test::TmpDir() + "/leveldb_nontiered";
std::string dbname = "leveldb";
std::string fastname = test::TmpDir() + "/leveldb_fast";
std::string slowname = test::TmpDir() + "/leveldb_slow";
std::string combined;
DB* db = NULL;
Options opts;
opts.tiered_slow_level = 4;
opts.tiered_fast_prefix = fastname;
opts.tiered_slow_prefix = slowname;
opts.create_if_missing = true;
Env::Default()->CreateDir(fastname);
Env::Default()->CreateDir(slowname);
Status s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != NULL);
delete db;
DestroyDB(dbname, opts);
} // TieredRecoveryLog
std::string MakeKey(unsigned int num) { std::string MakeKey(unsigned int num) {
char buf[30]; char buf[30];
snprintf(buf, sizeof(buf), "%016u", num); snprintf(buf, sizeof(buf), "%016u", num);
@ -2113,14 +2015,13 @@ void BM_LogAndApply(int iters, int num_base_files) {
InternalKeyComparator cmp(BytewiseComparator()); InternalKeyComparator cmp(BytewiseComparator());
Options options; Options options;
VersionSet vset(dbname, &options, NULL, &cmp); VersionSet vset(dbname, &options, NULL, &cmp);
bool save_manifest; ASSERT_OK(vset.Recover());
ASSERT_OK(vset.Recover(&save_manifest));
VersionEdit vbase; VersionEdit vbase;
uint64_t fnum = 1; uint64_t fnum = 1;
for (int i = 0; i < num_base_files; i++) { for (int i = 0; i < num_base_files; i++) {
InternalKey start(MakeKey(2*fnum), 1, kTypeValue); InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion);
vbase.AddFile(2, fnum++, 1 /* file size */, start, limit); vbase.AddFile2(2, fnum++, 1 /* file size */, start, limit, 0,0,0);
} }
ASSERT_OK(vset.LogAndApply(&vbase, &mu)); ASSERT_OK(vset.LogAndApply(&vbase, &mu));
@ -2129,9 +2030,9 @@ void BM_LogAndApply(int iters, int num_base_files) {
for (int i = 0; i < iters; i++) { for (int i = 0; i < iters; i++) {
VersionEdit vedit; VersionEdit vedit;
vedit.DeleteFile(2, fnum); vedit.DeleteFile(2, fnum);
InternalKey start(MakeKey(2*fnum), 1, kTypeValue); InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion);
vedit.AddFile(2, fnum++, 1 /* file size */, start, limit); vedit.AddFile2(2, fnum++, 1 /* file size */, start, limit, 0,0,0);
vset.LogAndApply(&vedit, &mu); vset.LogAndApply(&vedit, &mu);
} }
uint64_t stop_micros = env->NowMicros(); uint64_t stop_micros = env->NowMicros();

View file

@ -3,7 +3,9 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h> #include <stdio.h>
//#include "leveldb/expiry.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/version_set.h"
#include "port/port.h" #include "port/port.h"
#include "util/coding.h" #include "util/coding.h"
@ -11,26 +13,66 @@ namespace leveldb {
static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
assert(seq <= kMaxSequenceNumber); assert(seq <= kMaxSequenceNumber);
assert(t <= kValueTypeForSeek); // assert(t <= kValueTypeForSeek); requires revisit once expiry live
assert(t <= kTypeValueExplicitExpiry); // temp replacement for above
return (seq << 8) | t; return (seq << 8) | t;
} }
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
result->append(key.user_key.data(), key.user_key.size()); result->append(key.user_key.data(), key.user_key.size());
if (IsExpiryKey(key.type))
PutFixed64(result, key.expiry);
PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
} }
std::string ParsedInternalKey::DebugString() const { std::string ParsedInternalKey::DebugString() const {
char buf[50]; char buf[50];
if (IsExpiryKey(type))
snprintf(buf, sizeof(buf), "' @ %llu %llu : %d",
(unsigned long long) expiry,
(unsigned long long) sequence,
int(type));
else
snprintf(buf, sizeof(buf), "' @ %llu : %d", snprintf(buf, sizeof(buf), "' @ %llu : %d",
(unsigned long long) sequence, (unsigned long long) sequence,
int(type)); int(type));
std::string result = "'"; std::string result = "'";
result += EscapeString(user_key.ToString()); result += HexString(user_key.ToString());
result += buf; result += buf;
return result; return result;
} }
// Renders this key as 'HEX(user_key)' @ [expiry ]sequence : type.
// The expiry value is printed only when the key type is an expiry type.
std::string ParsedInternalKey::DebugStringHex() const {
  // Holds everything after the hex-encoded user key; snprintf truncates the
  // text if it would not fit in this fixed-size buffer.
  char suffix[50];
  const unsigned long long seq = sequence;

  if (!IsExpiryKey(type)) {
    snprintf(suffix, sizeof(suffix), "' @ %llu : %d", seq, int(type));
  } else {
    snprintf(suffix, sizeof(suffix), "' @ %llu %llu : %d",
             (unsigned long long) expiry, seq, int(type));
  }

  std::string text("'");
  text += HexString(user_key);
  text += suffix;
  return text;
}
// Returns a constant, human-readable label for val_type.  Values that are not
// one of the four handled ValueType constants yield "(unknown ValueType)".
const char * KeyTypeString(ValueType val_type) {
  switch (val_type) {
    case kTypeDeletion:            return "kTypeDelete";
    case kTypeValue:               return "kTypeValue";
    case kTypeValueWriteTime:      return "kTypeValueWriteTime";
    case kTypeValueExplicitExpiry: return "kTypeValueExplicitExpiry";
    default:                       break;
  }
  return "(unknown ValueType)";
}
std::string InternalKey::DebugString() const { std::string InternalKey::DebugString() const {
std::string result; std::string result;
ParsedInternalKey parsed; ParsedInternalKey parsed;
@ -54,8 +96,10 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
// decreasing type (though sequence# should be enough to disambiguate) // decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
if (r == 0) { if (r == 0) {
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
if (IsExpiryKey((ValueType)*(unsigned char *)&anum)) *(unsigned char*)&anum=(unsigned char)kTypeValue;
if (IsExpiryKey((ValueType)*(unsigned char *)&bnum)) *(unsigned char*)&bnum=(unsigned char)kTypeValue;
if (anum > bnum) { if (anum > bnum) {
r = -1; r = -1;
} else if (anum < bnum) { } else if (anum < bnum) {
@ -118,7 +162,8 @@ bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
return user_policy_->KeyMayMatch(ExtractUserKey(key), f); return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
} }
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { LookupKey::LookupKey(const Slice& user_key, SequenceNumber s, KeyMetaData * meta) {
meta_=meta;
size_t usize = user_key.size(); size_t usize = user_key.size();
size_t needed = usize + 13; // A conservative estimate size_t needed = usize + 13; // A conservative estimate
char* dst; char* dst;
@ -137,4 +182,109 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
end_ = dst; end_ = dst;
} }
// Stores the comparator, snapshot boundary, options and (optional) compaction
// used by operator() and resets the per-key state and the counters.
// The object is marked valid only when a user comparator was supplied.
KeyRetirement::KeyRetirement(
    const Comparator * Comparator,
    SequenceNumber SmallestSnapshot,
    const Options * Opts,
    Compaction * const Compaction)
    : has_current_user_key(false),
      last_sequence_for_key(kMaxSequenceNumber),
      user_comparator(Comparator),
      smallest_snapshot(SmallestSnapshot),
      options(Opts),
      compaction(Compaction),
      valid(NULL != Comparator),
      dropped(0),
      expired(0)
{
  // NULL is ok for compaction
}   // KeyRetirement::KeyRetirement
// Adds this object's expired-key count to the ePerfExpiredKeys performance
// counter; nothing is reported when no key was expired.
KeyRetirement::~KeyRetirement()
{
  if (expired != 0) {
    gPerfCounters->Add(ePerfExpiredKeys, expired);
  }
}   // KeyRetirement::~KeyRetirement
// Decides whether the internal key `key` should be dropped.
//
// Returns true when the key is to be dropped, false when it must be kept.
// The object is stateful: it remembers the user key and sequence number of
// the previously reviewed key, so the result depends on the order in which
// keys are passed in.
//
// A key is dropped when either
//  (A) the previously seen entry for the same user key had a sequence number
//      <= smallest_snapshot, or
//  (B) it is a deletion marker or the expiry module reports it as expired,
//      its sequence is <= smallest_snapshot, a compaction object was
//      supplied, and that compaction reports this as the base level for the
//      user key.
// Only (B) drops are counted: `expired` when the expiry callback returned
// true, `dropped` otherwise.
// NOTE(review): drops made under rule (A) increment neither counter.
bool
KeyRetirement::operator()(
Slice & key)
{
ParsedInternalKey ikey;
bool drop = false, expire_flag;
// an object built without a user comparator is not valid: never drop
if (valid)
{
if (!ParseInternalKey(key, &ikey))
{
// Do not hide error keys
// (unparsable key is kept and the per-key state is reset)
current_user_key.clear();
has_current_user_key = false;
last_sequence_for_key = kMaxSequenceNumber;
} // if
else
{
if (!has_current_user_key ||
user_comparator->Compare(ikey.user_key,
Slice(current_user_key)) != 0)
{
// First occurrence of this user key
current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
has_current_user_key = true;
// kMaxSequenceNumber keeps test (A) below from firing on a first occurrence
last_sequence_for_key = kMaxSequenceNumber;
} // if
if (last_sequence_for_key <= smallest_snapshot)
{
// Hidden by a newer entry for same user key
drop = true; // (A)
} // if
else
{
// ask the expiry module only when options were supplied and expiry is active
expire_flag=false;
if (NULL!=options && options->ExpiryActivated())
expire_flag=options->expiry_module->KeyRetirementCallback(ikey);
if ((ikey.type == kTypeDeletion || expire_flag)
&& ikey.sequence <= smallest_snapshot
&& NULL!=compaction // mem to level0 ignores this test
&& compaction->IsBaseLevelForKey(ikey.user_key))
{
// For this user key:
// (1) there is no data in higher levels
// (2) data in lower levels will have larger sequence numbers
// (3) data in layers that are being compacted here and have
// smaller sequence numbers will be dropped in the next
// few iterations of this loop (by rule (A) above).
// Therefore this deletion marker is obsolete and can be dropped.
drop = true;
if (expire_flag)
++expired;
else
++dropped;
} // if
} // else
// remembered for test (A) on the next key, whether or not this key was dropped
last_sequence_for_key = ikey.sequence;
} // else
} // if
#if 0
// needs clean up to be used again
Log(options_.info_log,
" Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
"%d smallest_snapshot: %d",
ikey.user_key.ToString().c_str(),
(int)ikey.sequence, ikey.type, kTypeValue, drop,
compact->compaction->IsBaseLevelForKey(ikey.user_key),
(int)last_sequence_for_key, (int)compact->smallest_snapshot);
#endif
return(drop);
} // KeyRetirement::operator(Slice & )
} // namespace leveldb } // namespace leveldb

View file

@ -2,13 +2,14 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_DBFORMAT_H_ #ifndef STORAGE_LEVELDB_DB_FORMAT_H_
#define STORAGE_LEVELDB_DB_DBFORMAT_H_ #define STORAGE_LEVELDB_DB_FORMAT_H_
#include <stdio.h> #include <stdio.h>
#include "leveldb/comparator.h" #include "leveldb/comparator.h"
#include "leveldb/db.h" #include "leveldb/db.h"
#include "leveldb/filter_policy.h" #include "leveldb/filter_policy.h"
#include "leveldb/options.h"
#include "leveldb/slice.h" #include "leveldb/slice.h"
#include "leveldb/table_builder.h" #include "leveldb/table_builder.h"
#include "util/coding.h" #include "util/coding.h"
@ -16,19 +17,33 @@
namespace leveldb { namespace leveldb {
class Compaction;
// Grouping of constants. We may want to make some of these // Grouping of constants. We may want to make some of these
// parameters set via options. // parameters set via options.
namespace config { namespace config {
static const int kNumLevels = 7; static const int kNumLevels = 7;
static const int kNumOverlapLevels = 2;
// Level-0 compaction is started when we hit this many files. // Level-0 compaction is started when we hit this many files.
static const int kL0_CompactionTrigger = 4; // Google: static const size_t kL0_CompactionTrigger = 4;
static const size_t kL0_CompactionTrigger = 6;
// Level-0 (any overlapped level) number of files where a grooming
// compaction could start
static const size_t kL0_GroomingTrigger = 4;
static const size_t kL0_GroomingTrigger10min = 2;
static const size_t kL0_GroomingTrigger20min = 1;
// ... time limits in microseconds
static const size_t kL0_Grooming10minMicros = 10 * 60 * 1000000;
static const size_t kL0_Grooming20minMicros = 20 * 60 * 1000000;
// Soft limit on number of level-0 files. We slow down writes at this point. // Soft limit on number of level-0 files. We slow down writes at this point.
static const int kL0_SlowdownWritesTrigger = 8; static const size_t kL0_SlowdownWritesTrigger = 8;
// Maximum number of level-0 files. We stop writes at this point. // Maximum number of level-0 files. We stop writes at this point.
static const int kL0_StopWritesTrigger = 12; static const size_t kL0_StopWritesTrigger = 12;
// Maximum level to which a new compacted memtable is pushed if it // Maximum level to which a new compacted memtable is pushed if it
// does not create overlap. We try to push to level 2 to avoid the // does not create overlap. We try to push to level 2 to avoid the
@ -36,31 +51,28 @@ static const int kL0_StopWritesTrigger = 12;
// expensive manifest file operations. We do not push all the way to // expensive manifest file operations. We do not push all the way to
// the largest level since that can generate a lot of wasted disk // the largest level since that can generate a lot of wasted disk
// space if the same key space is being repeatedly overwritten. // space if the same key space is being repeatedly overwritten.
static const int kMaxMemCompactLevel = 2; // Basho: push to kNumOverlapLevels +1 ... beyond "landing level"
static const unsigned kMaxMemCompactLevel = kNumOverlapLevels+1;
// Approximate gap in bytes between samples of data read during iteration.
static const int kReadBytesPeriod = 1048576;
} // namespace config } // namespace config
class InternalKey; class InternalKey;
// Value types encoded as the last component of internal keys.
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
// data structures.
enum ValueType {
kTypeDeletion = 0x0,
kTypeValue = 0x1
};
// kValueTypeForSeek defines the ValueType that should be passed when // kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular // constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order // sequence number (since we sort sequence numbers in decreasing order
// and the value type is embedded as the low 8 bits in the sequence // and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered // number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest). // ValueType, not the lowest).
// Riak note: kValueTypeForSeek is placed within temporary keys
// for comparisons. Using kTypeValueExplicitExpiry would
// force more code changes to increase internal key size.
// But kValueTypeForSeek is redundant with the sequence number for
// disambiguation. Therefore we take the easiest path and do NOT change it.
static const ValueType kValueTypeForSeek = kTypeValue; static const ValueType kValueTypeForSeek = kTypeValue;
typedef uint64_t SequenceNumber; typedef uint64_t SequenceNumber;
typedef uint64_t ExpiryTimeMicros;
// We leave eight bits empty at the bottom so a type and sequence# // We leave eight bits empty at the bottom so a type and sequence#
// can be packed together into 64-bits. // can be packed together into 64-bits.
@ -69,20 +81,17 @@ static const SequenceNumber kMaxSequenceNumber =
struct ParsedInternalKey { struct ParsedInternalKey {
Slice user_key; Slice user_key;
ExpiryTimeMicros expiry;
SequenceNumber sequence; SequenceNumber sequence;
ValueType type; ValueType type;
ParsedInternalKey() { } // Intentionally left uninitialized (for speed) ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) ParsedInternalKey(const Slice& u, const ExpiryTimeMicros & exp, const SequenceNumber& seq, ValueType t)
: user_key(u), sequence(seq), type(t) { } : user_key(u), expiry(exp), sequence(seq), type(t) { }
std::string DebugString() const; std::string DebugString() const;
std::string DebugStringHex() const;
}; };
// Return the length of the encoding of "key".
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
return key.user_key.size() + 8;
}
// Append the serialization of "key" to *result. // Append the serialization of "key" to *result.
extern void AppendInternalKey(std::string* result, extern void AppendInternalKey(std::string* result,
const ParsedInternalKey& key); const ParsedInternalKey& key);
@ -94,20 +103,76 @@ extern void AppendInternalKey(std::string* result,
extern bool ParseInternalKey(const Slice& internal_key, extern bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result); ParsedInternalKey* result);
// Returns the user key portion of an internal key.
inline Slice ExtractUserKey(const Slice& internal_key) {
assert(internal_key.size() >= 8);
return Slice(internal_key.data(), internal_key.size() - 8);
}
inline ValueType ExtractValueType(const Slice& internal_key) { inline ValueType ExtractValueType(const Slice& internal_key) {
assert(internal_key.size() >= 8); assert(internal_key.size() >= 8);
const size_t n = internal_key.size(); const size_t n = internal_key.size();
uint64_t num = DecodeFixed64(internal_key.data() + n - 8); unsigned char c = DecodeLeastFixed64(internal_key.data() + n - sizeof(SequenceNumber));
unsigned char c = num & 0xff;
return static_cast<ValueType>(c); return static_cast<ValueType>(c);
} }
// Number of bytes that follow the user key inside an internal key of the
// given type: the packed sequence/type word, plus an expiry field for the
// two expiry types.
inline size_t KeySuffixSize(ValueType val_type) {
  switch (val_type) {
    case kTypeValueWriteTime:
    case kTypeValueExplicitExpiry:
      return sizeof(SequenceNumber) + sizeof(ExpiryTimeMicros);

    case kTypeDeletion:
    case kTypeValue:
      return sizeof(SequenceNumber);

    default:
      // assert(0); cannot use because bloom filter block's name is passed as internal key
      return sizeof(SequenceNumber);
  }   // switch
}
const char * KeyTypeString(ValueType val_type);
inline size_t KeySuffixSize(const Slice & internal_key) {
return(KeySuffixSize(ExtractValueType(internal_key)));
}
// Returns the user key portion of an internal key, i.e. the key with its
// type-dependent suffix removed.
// NOTE(review): the assert only guarantees 8 bytes, while KeySuffixSize() can
// report a larger suffix for expiry types -- confirm callers never pass a
// shorter key carrying an expiry type byte.
inline Slice ExtractUserKey(const Slice& internal_key) {
  assert(internal_key.size() >= 8);
  const size_t user_len = internal_key.size() - KeySuffixSize(internal_key);
  return Slice(internal_key.data(), user_len);
}
// Returns the sequence number with ValueType removed: decodes the final
// 8 bytes of the internal key and shifts out the low 8 bits.
inline SequenceNumber ExtractSequenceNumber(const Slice& internal_key) {
  assert(internal_key.size() >= 8);
  const char* tag_ptr = internal_key.data() + internal_key.size() - 8;
  return DecodeFixed64(tag_ptr) >> 8;
}
// Return the length of the encoding of "key": the user key bytes followed by
// the suffix whose size depends on the key's type.
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
  const size_t suffix_len = KeySuffixSize(key.type);
  return key.user_key.size() + suffix_len;
}
// Riak: true when val_type is one of the two expiry types, i.e. the key
// therefore contains the extra ExpiryTime field
inline bool IsExpiryKey(ValueType val_type) {
  switch (val_type) {
    case kTypeValueWriteTime:
    case kTypeValueExplicitExpiry:
      return true;
    default:
      return false;
  }
}
// Riak: true when the encoded internal key is long enough to hold an expiry
// suffix and its type byte names an expiry type
inline bool IsExpiryKey(const Slice & internal_key) {
  // too short to carry the extra ExpiryTime field: do not look at the type
  if (internal_key.size() < KeySuffixSize(kTypeValueWriteTime)) {
    return false;
  }
  return IsExpiryKey(ExtractValueType(internal_key));
}
// Riak: extracts expiry value.  The key must be an expiry key; the value is
// decoded from the start of the expiry-sized suffix at the end of the key.
inline ExpiryTimeMicros ExtractExpiry(const Slice& internal_key) {
  const size_t suffix_len = KeySuffixSize(kTypeValueWriteTime);
  assert(internal_key.size() >= suffix_len);
  assert(IsExpiryKey(internal_key));
  const char* expiry_ptr = internal_key.data() + internal_key.size() - suffix_len;
  return DecodeFixed64(expiry_ptr);
}
// A comparator for internal keys that uses a specified comparator for // A comparator for internal keys that uses a specified comparator for
// the user key portion and breaks ties by decreasing sequence number. // the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator : public Comparator { class InternalKeyComparator : public Comparator {
@ -129,7 +194,7 @@ class InternalKeyComparator : public Comparator {
// Filter policy wrapper that converts from internal keys to user keys // Filter policy wrapper that converts from internal keys to user keys
class InternalFilterPolicy : public FilterPolicy { class InternalFilterPolicy : public FilterPolicy {
private: protected:
const FilterPolicy* const user_policy_; const FilterPolicy* const user_policy_;
public: public:
explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { } explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
@ -138,6 +203,12 @@ class InternalFilterPolicy : public FilterPolicy {
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const; virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
}; };
// Variant of InternalFilterPolicy that takes ownership of the wrapped user
// policy: the policy passed to the constructor is deleted by the destructor.
class InternalFilterPolicy2 : public InternalFilterPolicy {
 public:
  explicit InternalFilterPolicy2(const FilterPolicy* p)
      : InternalFilterPolicy(p) { }

  virtual ~InternalFilterPolicy2() {
    delete user_policy_;
  }
};
// Modules in this directory should keep internal keys wrapped inside // Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not // the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator. // incorrectly use string comparisons instead of an InternalKeyComparator.
@ -146,8 +217,8 @@ class InternalKey {
std::string rep_; std::string rep_;
public: public:
InternalKey() { } // Leave rep_ as empty to indicate it is invalid InternalKey() { } // Leave rep_ as empty to indicate it is invalid
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { InternalKey(const Slice& user_key, ExpiryTimeMicros exp, SequenceNumber s, ValueType t) {
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); AppendInternalKey(&rep_, ParsedInternalKey(user_key, exp, s, t));
} }
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
@ -157,6 +228,7 @@ class InternalKey {
} }
Slice user_key() const { return ExtractUserKey(rep_); } Slice user_key() const { return ExtractUserKey(rep_); }
Slice internal_key() const { return Slice(rep_); }
void SetFrom(const ParsedInternalKey& p) { void SetFrom(const ParsedInternalKey& p) {
rep_.clear(); rep_.clear();
@ -181,8 +253,12 @@ inline bool ParseInternalKey(const Slice& internal_key,
unsigned char c = num & 0xff; unsigned char c = num & 0xff;
result->sequence = num >> 8; result->sequence = num >> 8;
result->type = static_cast<ValueType>(c); result->type = static_cast<ValueType>(c);
result->user_key = Slice(internal_key.data(), n - 8); if (IsExpiryKey((ValueType)c))
return (c <= static_cast<unsigned char>(kTypeValue)); result->expiry=DecodeFixed64(internal_key.data() + n - KeySuffixSize((ValueType)c));
else
result->expiry=0;
result->user_key = Slice(internal_key.data(), n - KeySuffixSize((ValueType)c));
return (c <= static_cast<unsigned char>(kTypeValueExplicitExpiry));
} }
// A helper class useful for DBImpl::Get() // A helper class useful for DBImpl::Get()
@ -190,7 +266,7 @@ class LookupKey {
public: public:
// Initialize *this for looking up user_key at a snapshot with // Initialize *this for looking up user_key at a snapshot with
// the specified sequence number. // the specified sequence number.
LookupKey(const Slice& user_key, SequenceNumber sequence); LookupKey(const Slice& user_key, SequenceNumber sequence, KeyMetaData * meta=NULL);
~LookupKey(); ~LookupKey();
@ -201,12 +277,38 @@ class LookupKey {
Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
// Return the user key // Return the user key
Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } Slice user_key() const
{ return Slice(kstart_, end_ - kstart_ - KeySuffixSize(internal_key())); }
// did requestor supply a metadata object to fill in?
bool WantsKeyMetaData() const { return meta_ != NULL; }
// Stores the given type, sequence and expiry into the requestor's metadata
// object; does nothing when no metadata object was supplied.
void SetKeyMetaData(ValueType type, SequenceNumber seq, ExpiryTimeMicros expiry) const
{
  if (NULL == meta_)
    return;

  meta_->m_Type = type;
  meta_->m_Sequence = seq;
  meta_->m_Expiry = expiry;
}
// Copies type, sequence and expiry of a parsed internal key into the
// requestor's metadata object; does nothing when none was supplied.
void SetKeyMetaData(const ParsedInternalKey & pi_key) const
{
  if (NULL == meta_)
    return;

  meta_->m_Type = pi_key.type;
  meta_->m_Sequence = pi_key.sequence;
  meta_->m_Expiry = pi_key.expiry;
}
// Overwrites the requestor's metadata object with a copy of `meta`;
// does nothing when no metadata object was supplied.
void SetKeyMetaData(const KeyMetaData & meta) const
{
  if (NULL == meta_)
    return;

  *meta_ = meta;
}
private: private:
// We construct a char array of the form: // We construct a char array of the form:
// klength varint32 <-- start_ // klength varint32 <-- start_
// userkey char[klength] <-- kstart_ // userkey char[klength] <-- kstart_
// optional uint64
// tag uint64 // tag uint64
// <-- end_ // <-- end_
// The array is a suitable MemTable key. // The array is a suitable MemTable key.
@ -216,6 +318,9 @@ class LookupKey {
const char* end_; const char* end_;
char space_[200]; // Avoid allocation for short keys char space_[200]; // Avoid allocation for short keys
// allow code that finds the key to place metadata here, even if 'const'
mutable KeyMetaData * meta_;
// No copying allowed // No copying allowed
LookupKey(const LookupKey&); LookupKey(const LookupKey&);
void operator=(const LookupKey&); void operator=(const LookupKey&);
@ -223,8 +328,47 @@ class LookupKey {
inline LookupKey::~LookupKey() { inline LookupKey::~LookupKey() {
if (start_ != space_) delete[] start_; if (start_ != space_) delete[] start_;
} };
// This class was constructed from code within DBImpl::DoCompactionWork (db_impl.cc)
// so it could be shared within BuildTable (and thus reduce Level 0 bloating).
//
// It is a stateful functor: operator() is given internal keys one at a time
// and returns whether each key should be dropped.  Copying and default
// construction are disabled (declared private).
class KeyRetirement
{
protected:
// "state" from previous key reviewed
std::string current_user_key;
bool has_current_user_key;
SequenceNumber last_sequence_for_key;
// database values needed for processing
const Comparator * user_comparator;
SequenceNumber smallest_snapshot;
const Options * options;
// optional; the constructor declaration defaults it to NULL
Compaction * const compaction;
// status flag maintained by the constructor (defined outside this declaration)
bool valid;
size_t dropped; // tombstone or old version dropped
size_t expired; // expired dropped
public:
KeyRetirement(const Comparator * UserComparator, SequenceNumber SmallestSnapshot,
const Options * Opts, Compaction * const Compaction=NULL);
virtual ~KeyRetirement();
// returns the drop decision for `key`
bool operator()(Slice & key);
// read-only access to the two counters above
size_t GetDroppedCount() const {return(dropped);};
size_t GetExpiredCount() const {return(expired);};
private:
// declared but not usable from outside: no default construction, no copying
KeyRetirement();
KeyRetirement(const KeyRetirement &);
const KeyRetirement & operator=(const KeyRetirement &);
}; // class KeyRetirement
} // namespace leveldb } // namespace leveldb
#endif // STORAGE_LEVELDB_DB_DBFORMAT_H_ #endif // STORAGE_LEVELDB_DB_FORMAT_H_

View file

@ -9,10 +9,11 @@
namespace leveldb { namespace leveldb {
static std::string IKey(const std::string& user_key, static std::string IKey(const std::string& user_key,
ExpiryTimeMicros exp,
uint64_t seq, uint64_t seq,
ValueType vt) { ValueType vt) {
std::string encoded; std::string encoded;
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); AppendInternalKey(&encoded, ParsedInternalKey(user_key, exp, seq, vt));
return encoded; return encoded;
} }
@ -29,12 +30,13 @@ static std::string ShortSuccessor(const std::string& s) {
} }
static void TestKey(const std::string& key, static void TestKey(const std::string& key,
ExpiryTimeMicros exp,
uint64_t seq, uint64_t seq,
ValueType vt) { ValueType vt) {
std::string encoded = IKey(key, seq, vt); std::string encoded = IKey(key, exp, seq, vt);
Slice in(encoded); Slice in(encoded);
ParsedInternalKey decoded("", 0, kTypeValue); ParsedInternalKey decoded("", 0, 0, kTypeValue);
ASSERT_TRUE(ParseInternalKey(in, &decoded)); ASSERT_TRUE(ParseInternalKey(in, &decoded));
ASSERT_EQ(key, decoded.user_key.ToString()); ASSERT_EQ(key, decoded.user_key.ToString());
@ -56,53 +58,53 @@ TEST(FormatTest, InternalKey_EncodeDecode) {
}; };
for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
TestKey(keys[k], seq[s], kTypeValue); TestKey(keys[k], 0, seq[s], kTypeValue);
TestKey("hello", 1, kTypeDeletion); TestKey("hello", 0, 1, kTypeDeletion);
} }
} }
} }
TEST(FormatTest, InternalKeyShortSeparator) { TEST(FormatTest, InternalKeyShortSeparator) {
// When user keys are same // When user keys are same
ASSERT_EQ(IKey("foo", 100, kTypeValue), ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue), Shorten(IKey("foo", 0, 100, kTypeValue),
IKey("foo", 99, kTypeValue))); IKey("foo", 0, 99, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue), ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue), Shorten(IKey("foo", 0, 100, kTypeValue),
IKey("foo", 101, kTypeValue))); IKey("foo", 0, 101, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue), ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue), Shorten(IKey("foo", 0, 100, kTypeValue),
IKey("foo", 100, kTypeValue))); IKey("foo", 0, 100, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue), ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue), Shorten(IKey("foo", 0, 100, kTypeValue),
IKey("foo", 100, kTypeDeletion))); IKey("foo", 0, 100, kTypeDeletion)));
// When user keys are misordered // When user keys are misordered
ASSERT_EQ(IKey("foo", 100, kTypeValue), ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue), Shorten(IKey("foo", 0, 100, kTypeValue),
IKey("bar", 99, kTypeValue))); IKey("bar", 0, 99, kTypeValue)));
// When user keys are different, but correctly ordered // When user keys are different, but correctly ordered
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek),
Shorten(IKey("foo", 100, kTypeValue), Shorten(IKey("foo", 0, 100, kTypeValue),
IKey("hello", 200, kTypeValue))); IKey("hello", 0, 200, kTypeValue)));
// When start user key is prefix of limit user key // When start user key is prefix of limit user key
ASSERT_EQ(IKey("foo", 100, kTypeValue), ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue), Shorten(IKey("foo", 0, 100, kTypeValue),
IKey("foobar", 200, kTypeValue))); IKey("foobar", 0, 200, kTypeValue)));
// When limit user key is prefix of start user key // When limit user key is prefix of start user key
ASSERT_EQ(IKey("foobar", 100, kTypeValue), ASSERT_EQ(IKey("foobar", 0, 100, kTypeValue),
Shorten(IKey("foobar", 100, kTypeValue), Shorten(IKey("foobar", 0, 100, kTypeValue),
IKey("foo", 200, kTypeValue))); IKey("foo", 0, 200, kTypeValue)));
} }
TEST(FormatTest, InternalKeyShortestSuccessor) { TEST(FormatTest, InternalKeyShortestSuccessor) {
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek),
ShortSuccessor(IKey("foo", 100, kTypeValue))); ShortSuccessor(IKey("foo", 0, 100, kTypeValue)));
ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), ASSERT_EQ(IKey("\xff\xff", 0, 100, kTypeValue),
ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); ShortSuccessor(IKey("\xff\xff", 0, 100, kTypeValue)));
} }
} // namespace leveldb } // namespace leveldb

View file

@ -1,554 +0,0 @@
// Copyright 2014 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// This test uses a custom Env to keep track of the state of a filesystem as of
// the last "sync". It then checks for data loss errors by purposely dropping
// file data (or entire files) not protected by a "sync".
#include "leveldb/db.h"
#include <map>
#include <set>
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/log_format.h"
#include "db/version_set.h"
#include "leveldb/cache.h"
#include "leveldb/env.h"
#include "leveldb/table.h"
#include "leveldb/write_batch.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
static const int kValueSize = 1000;
static const int kMaxNumValues = 2000;
static const size_t kNumIterations = 3;
class FaultInjectionTestEnv;
namespace {
// Assume a filename, and not a directory name like "/foo/bar/"
//
// Returns the part of `filename` that precedes its last '/' or '\\'
// separator, or an empty string when `filename` contains no separator.
// The argument is taken by const reference so that calling this helper does
// not copy the path.
static std::string GetDirName(const std::string& filename) {
  const size_t found = filename.find_last_of("/\\");
  if (found == std::string::npos) {
    return "";
  }
  return filename.substr(0, found);
}
// Stand-in for syncing the directory `dir`: the argument is not used and the
// function always returns an OK status.
Status SyncDir(const std::string& dir) {
// As this is a test it isn't required to *actually* sync this directory.
return Status::OK();
}
// A basic file truncation function suitable for this test.
//
// Reads up to `length` bytes from the start of `filename`, writes them to a
// temporary file named "truncate.tmp" in the directory part of `filename`,
// and renames that temporary file over `filename`.
// Returns the status of the first step that failed, or of the rename.
// NOTE(review): when `filename` has no directory separator, GetDirName()
// yields "" and the temporary path becomes "/truncate.tmp" -- confirm callers
// always pass a path that includes a directory.
Status Truncate(const std::string& filename, uint64_t length) {
leveldb::Env* env = leveldb::Env::Default();
SequentialFile* orig_file;
Status s = env->NewSequentialFile(filename, &orig_file);
// nothing has been allocated yet, so returning early here is safe
if (!s.ok())
return s;
// scratch is the read buffer; it is released just before the final return
char* scratch = new char[length];
leveldb::Slice result;
s = orig_file->Read(length, &result, scratch);
delete orig_file;
if (s.ok()) {
std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
WritableFile* tmp_file;
s = env->NewWritableFile(tmp_name, &tmp_file);
if (s.ok()) {
s = tmp_file->Append(result);
// the temporary file object is deleted without an explicit Close() call
delete tmp_file;
if (s.ok()) {
s = env->RenameFile(tmp_name, filename);
} else {
// append failed: remove the partial temporary file (result not checked)
env->DeleteFile(tmp_name);
}
}
}
delete[] scratch;
return s;
}
// Position bookkeeping for one file: its name, the current position, and the
// positions recorded at the last sync and the last flush.  All positions
// start at -1.
struct FileState {
  std::string filename_;
  ssize_t pos_;
  ssize_t pos_at_last_sync_;
  ssize_t pos_at_last_flush_;

  FileState()
      : pos_(-1),
        pos_at_last_sync_(-1),
        pos_at_last_flush_(-1) {}

  FileState(const std::string& filename)
      : filename_(filename),
        pos_(-1),
        pos_at_last_sync_(-1),
        pos_at_last_flush_(-1) { }

  // A file with a non-positive position counts as synced; otherwise the
  // current position must equal the position at the last sync.
  bool IsFullySynced() const {
    if (pos_ <= 0) {
      return true;
    }
    return pos_ == pos_at_last_sync_;
  }

  Status DropUnsyncedData() const;
};
} // anonymous namespace
// A wrapper around WritableFile which informs another Env whenever this file
// is written to or sync'ed.
//
// The wrapper keeps its own FileState copy, the wrapped file, a flag telling
// whether the file is still open, and the FaultInjectionTestEnv it belongs to.
class TestWritableFile : public WritableFile {
public:
// `f` must not be NULL (asserted by the constructor definition)
TestWritableFile(const FileState& state,
WritableFile* f,
FaultInjectionTestEnv* env);
virtual ~TestWritableFile();
virtual Status Append(const Slice& data);
virtual Status Close();
virtual Status Flush();
virtual Status Sync();
private:
// copy of the state passed at construction, updated as data is appended
FileState state_;
// wrapped file; deleted by the destructor
WritableFile* target_;
// true from construction until Close() is called
bool writable_file_opened_;
FaultInjectionTestEnv* env_;
Status SyncParent();
};
// An Env layered over Env::Default() that remembers, per file, how much data
// was written and synced, and which files were created since the last
// directory sync, so a test can later discard exactly the unsynced state.
class FaultInjectionTestEnv : public EnvWrapper {
public:
FaultInjectionTestEnv() : EnvWrapper(Env::Default()), filesystem_active_(true) {}
virtual ~FaultInjectionTestEnv() { }
// Creates the file via the wrapped Env and returns it wrapped in a
// TestWritableFile; the file is recorded as new since the last dir sync.
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result);
// Like NewWritableFile, but resumes from the saved state of a file that is
// already tracked instead of treating it as new.
virtual Status NewAppendableFile(const std::string& fname,
WritableFile** result);
// Deletes the file and, on success, forgets everything recorded about it.
virtual Status DeleteFile(const std::string& f);
// Renames the file and, on success, moves its recorded state from "s" to "t".
virtual Status RenameFile(const std::string& s, const std::string& t);
// Called by TestWritableFile::Close to store the file's final state.
void WritableFileClosed(const FileState& state);
// Truncates every tracked file that is not fully synced back to its last
// synced position; stops at the first error.
Status DropUnsyncedFileData();
// Deletes every file created since the last directory sync; stops at the
// first error.
Status DeleteFilesCreatedAfterLastDirSync();
// Clears the set of files created since the last directory sync.
void DirWasSynced();
bool IsFileCreatedSinceLastDirSync(const std::string& filename);
// Re-activates the filesystem; recorded file state is kept.
void ResetState();
// Drops "f" from both the per-file state map and the new-files set.
void UntrackFile(const std::string& f);
// Setting the filesystem to inactive is the test equivalent to simulating a
// system reset. Setting to inactive will freeze our saved filesystem state so
// that it will stop being recorded. It can then be reset back to the state at
// the time of the reset.
bool IsFilesystemActive() const { return filesystem_active_; }
void SetFilesystemActive(bool active) { filesystem_active_ = active; }
private:
// Held by the member functions while they touch the two containers below.
port::Mutex mutex_;
// Last known state of each file, keyed by file name.
std::map<std::string, FileState> db_file_state_;
// Names of files created since DirWasSynced() was last called.
std::set<std::string> new_files_since_last_dir_sync_;
bool filesystem_active_; // Record flushes, syncs, writes
};
// Wraps "f", seeding the position bookkeeping from "state" and reporting to
// "env". The wrapper starts out marked as open.
TestWritableFile::TestWritableFile(const FileState& state,
                                   WritableFile* f,
                                   FaultInjectionTestEnv* env)
    : state_(state), target_(f), writable_file_opened_(true), env_(env) {
  // There must be a real file to forward to.
  assert(target_ != NULL);
}
// Makes sure the file has been closed (so its final state reaches the env)
// before releasing the wrapped file.
TestWritableFile::~TestWritableFile() {
  const bool still_open = writable_file_opened_;
  if (still_open) {
    Close();
  }
  delete target_;
}
// Appends "data" to the wrapped file. The recorded position only advances
// when the append succeeded and the filesystem is currently active.
Status TestWritableFile::Append(const Slice& data) {
  const Status status = target_->Append(data);
  if (!status.ok()) {
    return status;
  }
  if (env_->IsFilesystemActive()) {
    state_.pos_ += data.size();
  }
  return status;
}
// Closes the wrapped file. The wrapper counts as closed from here on, even if
// the underlying close fails; the env only learns the final state on success.
Status TestWritableFile::Close() {
  writable_file_opened_ = false;
  const Status status = target_->Close();
  if (!status.ok()) {
    return status;
  }
  env_->WritableFileClosed(state_);
  return status;
}
// Flushes the wrapped file and, if that worked while the filesystem is
// active, remembers the current position as the last flushed one.
Status TestWritableFile::Flush() {
  const Status status = target_->Flush();
  if (!status.ok()) {
    return status;
  }
  if (env_->IsFilesystemActive()) {
    state_.pos_at_last_flush_ = state_.pos_;
  }
  return status;
}
// Syncs the directory that contains this file and tells the env about it when
// the sync succeeded.
Status TestWritableFile::SyncParent() {
  const std::string parent_dir = GetDirName(state_.filename_);
  const Status status = SyncDir(parent_dir);
  if (status.ok()) {
    env_->DirWasSynced();
  }
  return status;
}
// Syncs the wrapped file and records the synced position. While the
// filesystem is inactive nothing happens and OK is returned.
Status TestWritableFile::Sync() {
  if (!env_->IsFilesystemActive()) {
    return Status::OK();
  }

  Status result = target_->Sync();
  if (result.ok()) {
    state_.pos_at_last_sync_ = state_.pos_;
  }

  // Files that predate the last directory sync need no directory sync.
  if (!env_->IsFileCreatedSinceLastDirSync(state_.filename_)) {
    return result;
  }

  // Ensure new files referred to by the manifest are in the filesystem.
  // A file-sync error takes precedence over a directory-sync error.
  const Status parent_status = SyncParent();
  if (result.ok() && !parent_status.ok()) {
    result = parent_status;
  }
  return result;
}
// Creates "fname" through the wrapped Env and hands back a TestWritableFile
// around it. On failure *result is left untouched.
Status FaultInjectionTestEnv::NewWritableFile(const std::string& fname,
                                              WritableFile** result) {
  WritableFile* underlying;
  const Status status = target()->NewWritableFile(fname, &underlying);
  if (!status.ok()) {
    return status;
  }

  FileState fresh(fname);
  fresh.pos_ = 0;
  *result = new TestWritableFile(fresh, underlying, this);

  // NewWritableFile doesn't append to files, so if the same file is
  // opened again then it will be truncated - so forget our saved
  // state.
  UntrackFile(fname);

  // UntrackFile takes mutex_ itself, so the lock is acquired only afterwards.
  {
    MutexLock l(&mutex_);
    new_files_since_last_dir_sync_.insert(fname);
  }
  return status;
}
// Opens "fname" for appending through the wrapped Env. A file that is already
// tracked continues from its saved state; an unknown one starts at position 0
// and is recorded as created since the last directory sync.
Status FaultInjectionTestEnv::NewAppendableFile(const std::string& fname,
                                                WritableFile** result) {
  WritableFile* underlying;
  const Status status = target()->NewAppendableFile(fname, &underlying);
  if (!status.ok()) {
    return status;
  }

  FileState tracked(fname);
  tracked.pos_ = 0;
  {
    MutexLock l(&mutex_);
    std::map<std::string, FileState>::const_iterator known =
        db_file_state_.find(fname);
    if (known != db_file_state_.end()) {
      tracked = known->second;
    } else {
      new_files_since_last_dir_sync_.insert(fname);
    }
  }
  *result = new TestWritableFile(tracked, underlying, this);
  return status;
}
// Walks all tracked files and truncates each one that is not fully synced
// back to its last synced position. Stops at, and returns, the first error.
Status FaultInjectionTestEnv::DropUnsyncedFileData() {
  Status status;
  MutexLock l(&mutex_);
  typedef std::map<std::string, FileState>::const_iterator StateIter;
  StateIter entry = db_file_state_.begin();
  while (status.ok() && entry != db_file_state_.end()) {
    if (!entry->second.IsFullySynced()) {
      status = entry->second.DropUnsyncedData();
    }
    ++entry;
  }
  return status;
}
// After a directory sync no file counts as "new" any more.
void FaultInjectionTestEnv::DirWasSynced() {
  MutexLock l(&mutex_);
  new_files_since_last_dir_sync_.erase(new_files_since_last_dir_sync_.begin(),
                                       new_files_since_last_dir_sync_.end());
}
// Reports whether "filename" is in the set of files created since the last
// directory sync.
bool FaultInjectionTestEnv::IsFileCreatedSinceLastDirSync(
    const std::string& filename) {
  MutexLock l(&mutex_);
  const bool is_new = new_files_since_last_dir_sync_.count(filename) != 0;
  return is_new;
}
// Forgets everything recorded about "f": its saved write/sync state and its
// membership in the new-files set. Unknown names are ignored.
void FaultInjectionTestEnv::UntrackFile(const std::string& f) {
  MutexLock l(&mutex_);
  const std::string& name = f;
  db_file_state_.erase(name);
  new_files_since_last_dir_sync_.erase(name);
}
// Deletes "f" through the wrapped Env. The result is checked with ASSERT_OK;
// after a successful delete the file is no longer tracked.
Status FaultInjectionTestEnv::DeleteFile(const std::string& f) {
  const Status status = EnvWrapper::DeleteFile(f);
  ASSERT_OK(status);
  if (!status.ok()) {
    return status;
  }
  UntrackFile(f);
  return status;
}
// Renames "s" to "t" through the wrapped Env and, when that succeeds, moves
// the bookkeeping along: saved state and new-file membership follow the file
// to its new name.
Status FaultInjectionTestEnv::RenameFile(const std::string& s,
                                         const std::string& t) {
  const Status outcome = EnvWrapper::RenameFile(s, t);
  if (!outcome.ok()) {
    return outcome;
  }

  {
    MutexLock l(&mutex_);

    std::map<std::string, FileState>::iterator source = db_file_state_.find(s);
    if (source != db_file_state_.end()) {
      // Any state previously stored under the target name is overwritten.
      db_file_state_[t] = source->second;
      db_file_state_.erase(s);
    }

    const bool was_new = new_files_since_last_dir_sync_.erase(s) != 0;
    if (was_new) {
      assert(new_files_since_last_dir_sync_.find(t) ==
             new_files_since_last_dir_sync_.end());
      new_files_since_last_dir_sync_.insert(t);
    }
  }
  return outcome;
}
void FaultInjectionTestEnv::ResetState() {
// Since we are not destroying the database, the existing files
// should keep their recorded synced/flushed state. Therefore
// we do not reset db_file_state_ and new_files_since_last_dir_sync_.
MutexLock l(&mutex_);
SetFilesystemActive(true);
}
// Deletes every file created since the last directory sync. Stops at, and
// returns, the first error.
Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() {
  // DeleteFile modifies the new-files set (and takes mutex_ itself), so work
  // from a snapshot taken under the lock to avoid a deadlock.
  std::set<std::string> snapshot;
  {
    MutexLock l(&mutex_);
    snapshot = new_files_since_last_dir_sync_;
  }

  Status status;
  std::set<std::string>::const_iterator name = snapshot.begin();
  while (status.ok() && name != snapshot.end()) {
    status = DeleteFile(*name);
    ++name;
  }
  return status;
}
void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) {
MutexLock l(&mutex_);
db_file_state_[state.filename_] = state;
}
// Truncates the file to the position of its last sync; a file that was never
// synced (position -1) is truncated to zero bytes.
Status FileState::DropUnsyncedData() const {
  ssize_t keep_bytes = pos_at_last_sync_;
  if (keep_bytes == -1) {
    keep_bytes = 0;
  }
  return Truncate(filename_, keep_bytes);
}
class FaultInjectionTest {
public:
enum ExpectedVerifResult { VAL_EXPECT_NO_ERROR, VAL_EXPECT_ERROR };
enum ResetMethod { RESET_DROP_UNSYNCED_DATA, RESET_DELETE_UNSYNCED_FILES };
FaultInjectionTestEnv* env_;
std::string dbname_;
Cache* tiny_cache_;
Options options_;
DB* db_;
FaultInjectionTest()
: env_(new FaultInjectionTestEnv),
tiny_cache_(NewLRUCache(100)),
db_(NULL) {
dbname_ = test::TmpDir() + "/fault_test";
DestroyDB(dbname_, Options()); // Destroy any db from earlier run
options_.reuse_logs = true;
options_.env = env_;
options_.paranoid_checks = true;
options_.block_cache = tiny_cache_;
options_.create_if_missing = true;
}
~FaultInjectionTest() {
CloseDB();
DestroyDB(dbname_, Options());
delete tiny_cache_;
delete env_;
}
void ReuseLogs(bool reuse) {
options_.reuse_logs = reuse;
}
void Build(int start_idx, int num_vals) {
std::string key_space, value_space;
WriteBatch batch;
for (int i = start_idx; i < start_idx + num_vals; i++) {
Slice key = Key(i, &key_space);
batch.Clear();
batch.Put(key, Value(i, &value_space));
WriteOptions options;
ASSERT_OK(db_->Write(options, &batch));
}
}
Status ReadValue(int i, std::string* val) const {
std::string key_space, value_space;
Slice key = Key(i, &key_space);
Value(i, &value_space);
ReadOptions options;
return db_->Get(options, key, val);
}
Status Verify(int start_idx, int num_vals,
ExpectedVerifResult expected) const {
std::string val;
std::string value_space;
Status s;
for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
Value(i, &value_space);
s = ReadValue(i, &val);
if (expected == VAL_EXPECT_NO_ERROR) {
if (s.ok()) {
ASSERT_EQ(value_space, val);
}
} else if (s.ok()) {
fprintf(stderr, "Expected an error at %d, but was OK\n", i);
s = Status::IOError(dbname_, "Expected value error:");
} else {
s = Status::OK(); // An expected error
}
}
return s;
}
// Return the ith key
Slice Key(int i, std::string* storage) const {
char buf[100];
snprintf(buf, sizeof(buf), "%016d", i);
storage->assign(buf, strlen(buf));
return Slice(*storage);
}
// Return the value to associate with the specified key
Slice Value(int k, std::string* storage) const {
Random r(k);
return test::RandomString(&r, kValueSize, storage);
}
Status OpenDB() {
delete db_;
db_ = NULL;
env_->ResetState();
return DB::Open(options_, dbname_, &db_);
}
void CloseDB() {
delete db_;
db_ = NULL;
}
void DeleteAllData() {
Iterator* iter = db_->NewIterator(ReadOptions());
WriteOptions options;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
}
delete iter;
}
void ResetDBState(ResetMethod reset_method) {
switch (reset_method) {
case RESET_DROP_UNSYNCED_DATA:
ASSERT_OK(env_->DropUnsyncedFileData());
break;
case RESET_DELETE_UNSYNCED_FILES:
ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
break;
default:
assert(false);
}
}
void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
DeleteAllData();
Build(0, num_pre_sync);
db_->CompactRange(NULL, NULL);
Build(num_pre_sync, num_post_sync);
}
void PartialCompactTestReopenWithFault(ResetMethod reset_method,
int num_pre_sync,
int num_post_sync) {
env_->SetFilesystemActive(false);
CloseDB();
ResetDBState(reset_method);
ASSERT_OK(OpenDB());
ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::VAL_EXPECT_NO_ERROR));
ASSERT_OK(Verify(num_pre_sync, num_post_sync, FaultInjectionTest::VAL_EXPECT_ERROR));
}
void NoWriteTestPreFault() {
}
void NoWriteTestReopenWithFault(ResetMethod reset_method) {
CloseDB();
ResetDBState(reset_method);
ASSERT_OK(OpenDB());
}
void DoTest() {
Random rnd(0);
ASSERT_OK(OpenDB());
for (size_t idx = 0; idx < kNumIterations; idx++) {
int num_pre_sync = rnd.Uniform(kMaxNumValues);
int num_post_sync = rnd.Uniform(kMaxNumValues);
PartialCompactTestPreFault(num_pre_sync, num_post_sync);
PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA,
num_pre_sync,
num_post_sync);
NoWriteTestPreFault();
NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA);
PartialCompactTestPreFault(num_pre_sync, num_post_sync);
// No new files created so we expect all values since no files will be
// dropped.
PartialCompactTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES,
num_pre_sync + num_post_sync,
0);
NoWriteTestPreFault();
NoWriteTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES);
}
}
};
// Runs the full fault-injection scenario with options_.reuse_logs disabled.
TEST(FaultInjectionTest, FaultTestNoLogReuse) {
ReuseLogs(false);
DoTest();
}
// Runs the full fault-injection scenario with options_.reuse_logs enabled.
TEST(FaultInjectionTest, FaultTestWithLogReuse) {
ReuseLogs(true);
DoTest();
}
} // namespace leveldb
// Test binary entry point: runs every registered test and uses the result of
// RunAllTests() as the process exit code. Command-line arguments are ignored.
int main(int argc, char** argv) {
  const int exit_code = leveldb::test::RunAllTests();
  return exit_code;
}

View file

@ -4,9 +4,14 @@
#include <ctype.h> #include <ctype.h>
#include <stdio.h> #include <stdio.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "db/filename.h" #include "db/filename.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/version_set.h"
#include "leveldb/env.h" #include "leveldb/env.h"
#include "leveldb/status.h"
#include "util/logging.h" #include "util/logging.h"
namespace leveldb { namespace leveldb {
@ -24,19 +29,50 @@ static std::string MakeFileName(const std::string& name, uint64_t number,
return name + buf; return name + buf;
} }
// Builds the path of file "number" under the tiered prefixes in "options".
// The prefix is tiered_fast_prefix when level is below tiered_slow_level and
// tiered_slow_prefix otherwise. The part appended to it depends on level:
//   level >= 0  : "/<suffix>_<level>/<number>.<suffix>"
//   level == -1 : "/<suffix>/<number>.<suffix>"
//   level == -2 : "/<number>.<suffix>"
// "number" is printed zero-padded to at least six digits. Any other level
// appends nothing, so only the prefix is returned.
static std::string MakeFileName2(const Options & options, uint64_t number,
                                 int level, const char* suffix) {
  // Start from an empty string: a level below -2 matches none of the layouts
  // below, and the buffer must not be appended while uninitialized.
  char buf[100] = "";
  if (0<=level)
    snprintf(buf, sizeof(buf), "/%s_%-d/%06llu.%s",
             suffix, level,
             static_cast<unsigned long long>(number),
             suffix);
  else if (-1==level)
    snprintf(buf, sizeof(buf), "/%s/%06llu.%s",
             suffix,
             static_cast<unsigned long long>(number),
             suffix);
  else if (-2==level)
    snprintf(buf, sizeof(buf), "/%06llu.%s",
             static_cast<unsigned long long>(number),
             suffix);

  return((level<(int)options.tiered_slow_level ?
          options.tiered_fast_prefix : options.tiered_slow_prefix) + buf);
}
// Builds the directory name for "level" under the tiered prefixes in
// "options": "/<suffix>" for level -1 and "/<suffix>_<level>" for every other
// level, appended to tiered_fast_prefix when level is below
// tiered_slow_level and to tiered_slow_prefix otherwise.
std::string MakeDirName2(const Options & options,
                         int level, const char* suffix) {
  char subdir[100];
  if (-1==level) {
    snprintf(subdir, sizeof(subdir), "/%s", suffix);
  } else {
    snprintf(subdir, sizeof(subdir), "/%s_%-d", suffix, level);
  }

  if (level<(int)options.tiered_slow_level) {
    return options.tiered_fast_prefix + subdir;
  }
  return options.tiered_slow_prefix + subdir;
}
std::string LogFileName(const std::string& name, uint64_t number) { std::string LogFileName(const std::string& name, uint64_t number) {
assert(number > 0); assert(number > 0);
return MakeFileName(name, number, "log"); return MakeFileName(name, number, "log");
} }
std::string TableFileName(const std::string& name, uint64_t number) { std::string TableFileName(const Options & options, uint64_t number, int level) {
assert(number > 0); assert(number > 0);
return MakeFileName(name, number, "ldb"); return MakeFileName2(options, number, level, "sst");
}
std::string SSTTableFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "sst");
} }
std::string DescriptorFileName(const std::string& dbname, uint64_t number) { std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
@ -69,6 +105,36 @@ std::string OldInfoLogFileName(const std::string& dbname) {
return dbname + "/LOG.old"; return dbname + "/LOG.old";
} }
// Return the name of the "COW" file for "dbname": the file named COW directly
// inside the database directory.
std::string CowFileName(const std::string& dbname) {
  std::string path(dbname);
  path.append("/COW");
  return path;
}
// Append appropriate "backup" string to input path: "/backup" when
// backup_num is 0, "/backup.<backup_num>" otherwise.
std::string BackupPath(const std::string& dbname, int backup_num) {
  char buf[100];
  if (0 != backup_num)
    snprintf(buf, sizeof(buf), "/backup.%-d", backup_num);
  else
    snprintf(buf, sizeof(buf), "/backup");

  return(dbname + buf);
}
// Rewrites the tiered_fast_prefix and tiered_slow_prefix members of the
// given Options object so that both point at backup number "backup_num".
// Always returns true.
bool SetBackupPaths(Options & options, int backup_num) {
  const std::string fast_backup =
      BackupPath(options.tiered_fast_prefix, backup_num);
  options.tiered_fast_prefix = fast_backup;

  const std::string slow_backup =
      BackupPath(options.tiered_slow_prefix, backup_num);
  options.tiered_slow_prefix = slow_backup;

  return true;
}
// Owned filenames have the form: // Owned filenames have the form:
// dbname/CURRENT // dbname/CURRENT
@ -76,7 +142,8 @@ std::string OldInfoLogFileName(const std::string& dbname) {
// dbname/LOG // dbname/LOG
// dbname/LOG.old // dbname/LOG.old
// dbname/MANIFEST-[0-9]+ // dbname/MANIFEST-[0-9]+
// dbname/[0-9]+.(log|sst|ldb) // dbname/[0-9]+.(log|sst)
// dbname/COW
bool ParseFileName(const std::string& fname, bool ParseFileName(const std::string& fname,
uint64_t* number, uint64_t* number,
FileType* type) { FileType* type) {
@ -84,6 +151,9 @@ bool ParseFileName(const std::string& fname,
if (rest == "CURRENT") { if (rest == "CURRENT") {
*number = 0; *number = 0;
*type = kCurrentFile; *type = kCurrentFile;
} else if (rest == "COW") {
*number = 0;
*type = kCacheWarming;
} else if (rest == "LOCK") { } else if (rest == "LOCK") {
*number = 0; *number = 0;
*type = kDBLockFile; *type = kDBLockFile;
@ -111,7 +181,7 @@ bool ParseFileName(const std::string& fname,
Slice suffix = rest; Slice suffix = rest;
if (suffix == Slice(".log")) { if (suffix == Slice(".log")) {
*type = kLogFile; *type = kLogFile;
} else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) { } else if (suffix == Slice(".sst")) {
*type = kTableFile; *type = kTableFile;
} else if (suffix == Slice(".dbtmp")) { } else if (suffix == Slice(".dbtmp")) {
*type = kTempFile; *type = kTempFile;
@ -141,4 +211,99 @@ Status SetCurrentFile(Env* env, const std::string& dbname,
return s; return s;
} }
// Asks "env" to create the "sst" directory of every level from 0 up to
// config::kNumLevels - 1. The returned Status is never assigned, so it is
// always the default-constructed value.
Status MakeLevelDirectories(Env * env, const Options & options) {
  Status ret_stat;

  for (int level = 0; level < config::kNumLevels && ret_stat.ok(); ++level) {
    const std::string dirname = MakeDirName2(options, level, "sst");

    // ignoring error since no way to tell if "bad" error, or "already exists" error
    env->CreateDir(dirname.c_str());
  }

  return ret_stat;
}
// Checks, from the highest level downwards, that the "sst" directory of a
// level exists and that every table file "version" lists for that level
// exists inside it. The walk continues to the next lower level only while
// the current level's directory exists and the level has no files; it stops
// at the first missing directory, the first missing file, or the first level
// that has files which are all present. Returns the result for the last
// level examined.
bool
TestForLevelDirectories(
Env * env,
const Options & options,
Version * version)
{
bool ret_flag, again;
int level;
std::string dirname;
ret_flag=true;
again=true;
// walk backwards, fault will be in higher levels if partial conversion
for (level=config::kNumLevels-1; 0<=level && again; --level)
{
// assume this is the last level to examine unless it turns out to be empty
again=false;
// does directory exist
dirname=MakeDirName2(options, level, "sst");
ret_flag=env->FileExists(dirname.c_str());
// do all files exist in level
if (ret_flag)
{
const std::vector<FileMetaData*> & level_files(version->GetFileList(level));
std::vector<FileMetaData*>::const_iterator it;
std::string table_name;
// NOTE(review): "s" is never used in this function
Status s;
// stops early at the first file that is missing
for (it=level_files.begin(); level_files.end()!=it && ret_flag; ++it)
{
table_name=TableFileName(options, (*it)->number, level);
ret_flag=env->FileExists(table_name.c_str());
} // for
// an empty level proves nothing about its files, so look one level lower
again=ret_flag && 0==level_files.size();
} // if
} // for
return(ret_flag);
} // TestForLevelDirectories
// Normalizes the tiered prefixes in "options" for database "dbname" and
// returns the resulting tiered_fast_prefix as the replacement dbname.
//   - empty dbname with a non-empty fast prefix: options are left as they are
//     (case for "", used with internal calls to DestroyDB)
//   - tiered_slow_level strictly between 0 and config::kNumLevels and both
//     prefixes non-empty: "/" + dbname is appended to both prefixes
//   - otherwise: tiering is switched off (tiered_slow_level = 0) and both
//     prefixes become dbname itself
std::string MakeTieredDbname(
    const std::string & dbname,   // input ... original dbname from DBImpl constructor
    Options & options)            // input/output ... writable Options, tiered values changed
{
  const bool already_initialized =
      0==dbname.size() && 0!=options.tiered_fast_prefix.size();
  if (already_initialized) {
    return options.tiered_fast_prefix;
  }

  const int slow_level = (int)options.tiered_slow_level;
  const bool tiering_requested =
      0<slow_level && slow_level<config::kNumLevels
      && 0!=options.tiered_fast_prefix.size()
      && 0!=options.tiered_slow_prefix.size();

  if (tiering_requested) {
    options.tiered_fast_prefix += "/";
    options.tiered_fast_prefix += dbname;
    options.tiered_slow_prefix += "/";
    options.tiered_slow_prefix += dbname;
  } else {
    options.tiered_slow_level=0;
    options.tiered_fast_prefix=dbname;   // duplicate as is
    options.tiered_slow_prefix=dbname;
  }

  return options.tiered_fast_prefix;
}
} // namespace leveldb } // namespace leveldb

View file

@ -9,6 +9,7 @@
#include <stdint.h> #include <stdint.h>
#include <string> #include <string>
#include "leveldb/options.h"
#include "leveldb/slice.h" #include "leveldb/slice.h"
#include "leveldb/status.h" #include "leveldb/status.h"
#include "port/port.h" #include "port/port.h"
@ -16,6 +17,7 @@
namespace leveldb { namespace leveldb {
class Env; class Env;
class Version;
enum FileType { enum FileType {
kLogFile, kLogFile,
@ -24,9 +26,24 @@ enum FileType {
kDescriptorFile, kDescriptorFile,
kCurrentFile, kCurrentFile,
kTempFile, kTempFile,
kInfoLogFile // Either the current one, or an old one kInfoLogFile, // Either the current one, or an old one
kCacheWarming
}; };
// Riak specific routine to help create sst_? subdirectory names
std::string MakeDirName2(const Options & options,
int level, const char* suffix);
// Riak specific routine to help create sst_? subdirectories
Status MakeLevelDirectories(Env * env, const Options & options);
// Riak specific routine to test if sst_? subdirectories exist
bool TestForLevelDirectories(Env * env, const Options & options, class Version *);
// Riak specific routine to standardize conversion of dbname and
// Options' tiered directories (options parameter is MODIFIED)
std::string MakeTieredDbname(const std::string &dbname, Options & options_rw);
// Return the name of the log file with the specified number // Return the name of the log file with the specified number
// in the db named by "dbname". The result will be prefixed with // in the db named by "dbname". The result will be prefixed with
// "dbname". // "dbname".
@ -35,12 +52,8 @@ extern std::string LogFileName(const std::string& dbname, uint64_t number);
// Return the name of the sstable with the specified number // Return the name of the sstable with the specified number
// in the db named by "dbname". The result will be prefixed with // in the db named by "dbname". The result will be prefixed with
// "dbname". // "dbname".
extern std::string TableFileName(const std::string& dbname, uint64_t number); extern std::string TableFileName(const Options & options, uint64_t number,
int level);
// Return the legacy file name for an sstable with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string SSTTableFileName(const std::string& dbname, uint64_t number);
// Return the name of the descriptor file for the db named by // Return the name of the descriptor file for the db named by
// "dbname" and the specified incarnation number. The result will be // "dbname" and the specified incarnation number. The result will be
@ -67,10 +80,21 @@ extern std::string InfoLogFileName(const std::string& dbname);
// Return the name of the old info log file for "dbname". // Return the name of the old info log file for "dbname".
extern std::string OldInfoLogFileName(const std::string& dbname); extern std::string OldInfoLogFileName(const std::string& dbname);
// Return the name of the cache object file for the db named by
// "dbname". The result will be prefixed with "dbname".
extern std::string CowFileName(const std::string& dbname);
// Append appropriate "backup" string to input path
extern std::string BackupPath(const std::string& dbname, int backup_num);
// update tiered_fast_prefix and tiered_slow_prefix members of
// given Options object to point to backup path
extern bool SetBackupPaths(Options & options, int backup_num);
// If filename is a leveldb file, store the type of the file in *type. // If filename is a leveldb file, store the type of the file in *type.
// The number encoded in the filename is stored in *number. If the // The number encoded in the filename is stored in *number. If the
// filename was successfully parsed, returns true. Else return false. // filename was successfully parsed, returns true. Else return false.
extern bool ParseFileName(const std::string& filename, extern bool ParseFileName(const std::string& tiered_filename,
uint64_t* number, uint64_t* number,
FileType* type); FileType* type);

View file

@ -27,7 +27,6 @@ TEST(FileNameTest, Parse) {
{ "100.log", 100, kLogFile }, { "100.log", 100, kLogFile },
{ "0.log", 0, kLogFile }, { "0.log", 0, kLogFile },
{ "0.sst", 0, kTableFile }, { "0.sst", 0, kTableFile },
{ "0.ldb", 0, kTableFile },
{ "CURRENT", 0, kCurrentFile }, { "CURRENT", 0, kCurrentFile },
{ "LOCK", 0, kDBLockFile }, { "LOCK", 0, kDBLockFile },
{ "MANIFEST-2", 2, kDescriptorFile }, { "MANIFEST-2", 2, kDescriptorFile },
@ -71,13 +70,14 @@ TEST(FileNameTest, Parse) {
for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
std::string f = errors[i]; std::string f = errors[i];
ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
} };
} }
TEST(FileNameTest, Construction) { TEST(FileNameTest, Construction) {
uint64_t number; uint64_t number;
FileType type; FileType type;
std::string fname; std::string fname;
Options options;
fname = CurrentFileName("foo"); fname = CurrentFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4)); ASSERT_EQ("foo/", std::string(fname.data(), 4));
@ -97,12 +97,40 @@ TEST(FileNameTest, Construction) {
ASSERT_EQ(192, number); ASSERT_EQ(192, number);
ASSERT_EQ(kLogFile, type); ASSERT_EQ(kLogFile, type);
fname = TableFileName("bar", 200); options.tiered_fast_prefix="bar";
options.tiered_slow_prefix="bar";
fname = TableFileName(options, 200, 1);
ASSERT_EQ("bar/", std::string(fname.data(), 4)); ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ("sst_1/", std::string(fname.substr(4,6)));
ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type));
ASSERT_EQ(200, number); ASSERT_EQ(200, number);
ASSERT_EQ(kTableFile, type); ASSERT_EQ(kTableFile, type);
fname = TableFileName(options, 400, 4);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_EQ("sst_4/", std::string(fname.substr(4,6)));
ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type));
ASSERT_EQ(400, number);
ASSERT_EQ(kTableFile, type);
options.tiered_slow_level=4;
options.tiered_fast_prefix="fast";
options.tiered_slow_prefix="slow";
fname = TableFileName(options, 500, 3);
ASSERT_EQ("fast/", std::string(fname.data(), 5));
ASSERT_EQ("sst_3/", std::string(fname.substr(5,6)));
ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type));
ASSERT_EQ(500, number);
ASSERT_EQ(kTableFile, type);
fname = TableFileName(options, 600, 4);
ASSERT_EQ("slow/", std::string(fname.data(), 5));
ASSERT_EQ("sst_4/", std::string(fname.substr(5,6)));
ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type));
ASSERT_EQ(600, number);
ASSERT_EQ(kTableFile, type);
fname = DescriptorFileName("bar", 100); fname = DescriptorFileName("bar", 100);
ASSERT_EQ("bar/", std::string(fname.data(), 4)); ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
@ -114,6 +142,48 @@ TEST(FileNameTest, Construction) {
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(999, number); ASSERT_EQ(999, number);
ASSERT_EQ(kTempFile, type); ASSERT_EQ(kTempFile, type);
fname = CowFileName("/what/goes/moo");
ASSERT_EQ("/what/goes/moo/COW", fname);
fname = BackupPath("/var/db/riak/data/leveldb/0",0);
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup", fname);
fname = BackupPath("/var/db/riak/data/leveldb/0",1);
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.1", fname);
fname = BackupPath("/var/db/riak/data/leveldb/0",5);
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.5", fname);
options.tiered_slow_level=4;
options.tiered_fast_prefix="fast";
options.tiered_slow_prefix="slow";
fname = SetBackupPaths(options,0);
ASSERT_EQ("fast/backup", options.tiered_fast_prefix);
ASSERT_EQ("slow/backup", options.tiered_slow_prefix);
options.tiered_slow_level=4;
options.tiered_fast_prefix="fast";
options.tiered_slow_prefix="slow";
fname = SetBackupPaths(options,3);
ASSERT_EQ("fast/backup.3", options.tiered_fast_prefix);
ASSERT_EQ("slow/backup.3", options.tiered_slow_prefix);
options.tiered_slow_level=4;
options.tiered_fast_prefix="//mnt/fast";
options.tiered_slow_prefix="//mnt/slow";
fname=MakeTieredDbname("riak/data/leveldb", options);
ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname);
ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix);
ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix);
// special case with no dbname given, should have no changes
fname=MakeTieredDbname("", options);
ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname);
ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix);
ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix);
} }
} // namespace leveldb } // namespace leveldb

View file

@ -1,65 +0,0 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h>
#include "leveldb/dumpfile.h"
#include "leveldb/env.h"
#include "leveldb/status.h"
namespace leveldb {
namespace {
// A WritableFile that prints everything appended to it on stdout.
class StdoutPrinter : public WritableFile {
public:
// Writes the bytes of "data" to stdout; the fwrite result is not checked and
// OK is always returned.
virtual Status Append(const Slice& data) {
fwrite(data.data(), 1, data.size(), stdout);
return Status::OK();
}
// Close, Flush and Sync do nothing and always succeed.
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
// Fixed display name for this pseudo-file.
virtual std::string GetName() const { return "[stdout]"; }
};
// Dumps each of the "num" files named in "files" to stdout. A file that
// cannot be dumped has its error printed on stderr and the remaining files
// are still processed. Returns true only if every file was dumped.
bool HandleDumpCommand(Env* env, char** files, int num) {
  StdoutPrinter printer;
  int failures = 0;
  for (int index = 0; index < num; ++index) {
    const Status status = DumpFile(env, files[index], &printer);
    if (status.ok()) {
      continue;
    }
    fprintf(stderr, "%s\n", status.ToString().c_str());
    ++failures;
  }
  return failures == 0;
}
} // namespace
} // namespace leveldb
// Prints the command-line help text on stderr.
static void Usage() {
  static const char kUsageText[] =
      "Usage: leveldbutil command...\n"
      " dump files... -- dump contents of specified files\n";
  fputs(kUsageText, stderr);
}
// Entry point of leveldbutil. The only command is "dump", which prints the
// files named after it. Exits with 0 on success and 1 on a missing or unknown
// command or when any file could not be dumped.
int main(int argc, char** argv) {
  leveldb::Env* env = leveldb::Env::Default();

  if (argc < 2) {
    Usage();
    return 1;
  }

  const std::string command = argv[1];
  if (command != "dump") {
    Usage();
    return 1;
  }

  // Everything after the command name is a file to dump.
  const bool dumped_all = leveldb::HandleDumpCommand(env, argv+2, argc-2);
  return dumped_all ? 0 : 1;
}

View file

@ -3,7 +3,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
// //
// Log format information shared by reader and writer. // Log format information shared by reader and writer.
// See ../doc/log_format.md for more detail. // See ../doc/log_format.txt for more detail.
#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_ #ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_ #define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
@ -26,8 +26,8 @@ static const int kMaxRecordType = kLastType;
static const int kBlockSize = 32768; static const int kBlockSize = 32768;
// Header is checksum (4 bytes), length (2 bytes), type (1 byte). // Header is checksum (4 bytes), type (1 byte), length (2 bytes).
static const int kHeaderSize = 4 + 2 + 1; static const int kHeaderSize = 4 + 1 + 2;
} // namespace log } // namespace log
} // namespace leveldb } // namespace leveldb

View file

@ -25,8 +25,7 @@ Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
eof_(false), eof_(false),
last_record_offset_(0), last_record_offset_(0),
end_of_buffer_offset_(0), end_of_buffer_offset_(0),
initial_offset_(initial_offset), initial_offset_(initial_offset) {
resyncing_(initial_offset > 0) {
} }
Reader::~Reader() { Reader::~Reader() {
@ -73,25 +72,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
Slice fragment; Slice fragment;
while (true) { while (true) {
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
const unsigned int record_type = ReadPhysicalRecord(&fragment); const unsigned int record_type = ReadPhysicalRecord(&fragment);
// ReadPhysicalRecord may have only had an empty trailer remaining in its
// internal buffer. Calculate the offset of the next physical record now
// that it has returned, properly accounting for its header size.
uint64_t physical_record_offset =
end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size();
if (resyncing_) {
if (record_type == kMiddleType) {
continue;
} else if (record_type == kLastType) {
resyncing_ = false;
continue;
} else {
resyncing_ = false;
}
}
switch (record_type) { switch (record_type) {
case kFullType: case kFullType:
if (in_fragmented_record) { if (in_fragmented_record) {
@ -151,9 +133,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
case kEof: case kEof:
if (in_fragmented_record) { if (in_fragmented_record) {
// This can be caused by the writer dying immediately after ReportCorruption(scratch->size(), "partial record without end(3)");
// writing a physical record but before completing the next; don't
// treat it as a corruption, just ignore the entire logical record.
scratch->clear(); scratch->clear();
} }
return false; return false;
@ -185,20 +165,20 @@ uint64_t Reader::LastRecordOffset() {
return last_record_offset_; return last_record_offset_;
} }
void Reader::ReportCorruption(uint64_t bytes, const char* reason) { void Reader::ReportCorruption(size_t bytes, const char* reason) {
ReportDrop(bytes, Status::Corruption(reason, file_->GetName())); ReportDrop(bytes, Status::Corruption(reason));
} }
void Reader::ReportDrop(uint64_t bytes, const Status& reason) { void Reader::ReportDrop(size_t bytes, const Status& reason) {
if (reporter_ != NULL && if (reporter_ != NULL &&
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) { end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
reporter_->Corruption(static_cast<size_t>(bytes), reason); reporter_->Corruption(bytes, reason);
} }
} }
unsigned int Reader::ReadPhysicalRecord(Slice* result) { unsigned int Reader::ReadPhysicalRecord(Slice* result) {
while (true) { while (true) {
if (buffer_.size() < kHeaderSize) { if (buffer_.size() < (size_t)kHeaderSize) {
if (!eof_) { if (!eof_) {
// Last read was a full read, so this is a trailer to skip // Last read was a full read, so this is a trailer to skip
buffer_.clear(); buffer_.clear();
@ -209,16 +189,17 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
ReportDrop(kBlockSize, status); ReportDrop(kBlockSize, status);
eof_ = true; eof_ = true;
return kEof; return kEof;
} else if (buffer_.size() < kBlockSize) { } else if (buffer_.size() < (size_t)kBlockSize) {
eof_ = true; eof_ = true;
} }
continue; continue;
} else if (buffer_.size() == 0) {
// End of file
return kEof;
} else { } else {
// Note that if buffer_ is non-empty, we have a truncated header at the size_t drop_size = buffer_.size();
// end of the file, which can be caused by the writer crashing in the
// middle of writing the header. Instead of considering this an error,
// just report EOF.
buffer_.clear(); buffer_.clear();
ReportCorruption(drop_size, "truncated record at end of file");
return kEof; return kEof;
} }
} }
@ -232,15 +213,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
if (kHeaderSize + length > buffer_.size()) { if (kHeaderSize + length > buffer_.size()) {
size_t drop_size = buffer_.size(); size_t drop_size = buffer_.size();
buffer_.clear(); buffer_.clear();
if (!eof_) {
ReportCorruption(drop_size, "bad record length"); ReportCorruption(drop_size, "bad record length");
return kBadRecord; return kBadRecord;
} }
// If the end of the file has been reached without reading |length| bytes
// of payload, assume the writer died in the middle of writing the record.
// Don't report a corruption.
return kEof;
}
if (type == kZeroType && length == 0) { if (type == kZeroType && length == 0) {
// Skip zero length record without reporting any drops since // Skip zero length record without reporting any drops since

View file

@ -73,11 +73,6 @@ class Reader {
// Offset at which to start looking for the first record to return // Offset at which to start looking for the first record to return
uint64_t const initial_offset_; uint64_t const initial_offset_;
// True if we are resynchronizing after a seek (initial_offset_ > 0). In
// particular, a run of kMiddleType and kLastType records can be silently
// skipped in this mode
bool resyncing_;
// Extend record types with the following special values // Extend record types with the following special values
enum { enum {
kEof = kMaxRecordType + 1, kEof = kMaxRecordType + 1,
@ -99,8 +94,8 @@ class Reader {
// Reports dropped bytes to the reporter. // Reports dropped bytes to the reporter.
// buffer_ must be updated to remove the dropped bytes prior to invocation. // buffer_ must be updated to remove the dropped bytes prior to invocation.
void ReportCorruption(uint64_t bytes, const char* reason); void ReportCorruption(size_t bytes, const char* reason);
void ReportDrop(uint64_t bytes, const Status& reason); void ReportDrop(size_t bytes, const Status& reason);
// No copying allowed // No copying allowed
Reader(const Reader&); Reader(const Reader&);

View file

@ -79,7 +79,7 @@ class LogTest {
virtual Status Skip(uint64_t n) { virtual Status Skip(uint64_t n) {
if (n > contents_.size()) { if (n > contents_.size()) {
contents_.clear(); contents_.clear();
return Status::NotFound("in-memory file skipped past end"); return Status::NotFound("in-memory file skipepd past end");
} }
contents_.remove_prefix(n); contents_.remove_prefix(n);
@ -104,34 +104,23 @@ class LogTest {
StringSource source_; StringSource source_;
ReportCollector report_; ReportCollector report_;
bool reading_; bool reading_;
Writer* writer_; Writer writer_;
Reader* reader_; Reader reader_;
// Record metadata for testing initial offset functionality // Record metadata for testing initial offset functionality
static size_t initial_offset_record_sizes_[]; static size_t initial_offset_record_sizes_[];
static uint64_t initial_offset_last_record_offsets_[]; static uint64_t initial_offset_last_record_offsets_[];
static int num_initial_offset_records_;
public: public:
LogTest() : reading_(false), LogTest() : reading_(false),
writer_(new Writer(&dest_)), writer_(&dest_),
reader_(new Reader(&source_, &report_, true/*checksum*/, reader_(&source_, &report_, true/*checksum*/,
0/*initial_offset*/)) { 0/*initial_offset*/) {
}
~LogTest() {
delete writer_;
delete reader_;
}
void ReopenForAppend() {
delete writer_;
writer_ = new Writer(&dest_, dest_.contents_.size());
} }
void Write(const std::string& msg) { void Write(const std::string& msg) {
ASSERT_TRUE(!reading_) << "Write() after starting to read"; ASSERT_TRUE(!reading_) << "Write() after starting to read";
writer_->AddRecord(Slice(msg)); writer_.AddRecord(Slice(msg));
} }
size_t WrittenBytes() const { size_t WrittenBytes() const {
@ -145,7 +134,7 @@ class LogTest {
} }
std::string scratch; std::string scratch;
Slice record; Slice record;
if (reader_->ReadRecord(&record, &scratch)) { if (reader_.ReadRecord(&record, &scratch)) {
return record.ToString(); return record.ToString();
} else { } else {
return "EOF"; return "EOF";
@ -193,18 +182,13 @@ class LogTest {
} }
void WriteInitialOffsetLog() { void WriteInitialOffsetLog() {
for (int i = 0; i < num_initial_offset_records_; i++) { for (int i = 0; i < 4; i++) {
std::string record(initial_offset_record_sizes_[i], std::string record(initial_offset_record_sizes_[i],
static_cast<char>('a' + i)); static_cast<char>('a' + i));
Write(record); Write(record);
} }
} }
void StartReadingAt(uint64_t initial_offset) {
delete reader_;
reader_ = new Reader(&source_, &report_, true/*checksum*/, initial_offset);
}
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
WriteInitialOffsetLog(); WriteInitialOffsetLog();
reading_ = true; reading_ = true;
@ -224,11 +208,6 @@ class LogTest {
source_.contents_ = Slice(dest_.contents_); source_.contents_ = Slice(dest_.contents_);
Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/, Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
initial_offset); initial_offset);
// Read all records from expected_record_offset through the last one.
ASSERT_LT(expected_record_offset, num_initial_offset_records_);
for (; expected_record_offset < num_initial_offset_records_;
++expected_record_offset) {
Slice record; Slice record;
std::string scratch; std::string scratch;
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
@ -237,35 +216,24 @@ class LogTest {
ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
offset_reader->LastRecordOffset()); offset_reader->LastRecordOffset());
ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
}
delete offset_reader; delete offset_reader;
} }
}; };
size_t LogTest::initial_offset_record_sizes_[] = size_t LogTest::initial_offset_record_sizes_[] =
{10000, // Two sizable records in first block {10000, // Two sizable records in first block
10000, 10000,
2 * log::kBlockSize - 1000, // Span three blocks 2 * log::kBlockSize - 1000, // Span three blocks
1, 1};
13716, // Consume all but two bytes of block 3.
log::kBlockSize - kHeaderSize, // Consume the entirety of block 4.
};
uint64_t LogTest::initial_offset_last_record_offsets_[] = uint64_t LogTest::initial_offset_last_record_offsets_[] =
{0, {0,
kHeaderSize + 10000, kHeaderSize + 10000,
2 * (kHeaderSize + 10000), 2 * (kHeaderSize + 10000),
2 * (kHeaderSize + 10000) + 2 * (kHeaderSize + 10000) +
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize, (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
2 * (kHeaderSize + 10000) +
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize
+ kHeaderSize + 1,
3 * log::kBlockSize,
};
// LogTest::initial_offset_last_record_offsets_ must be defined before this.
int LogTest::num_initial_offset_records_ =
sizeof(LogTest::initial_offset_last_record_offsets_)/sizeof(uint64_t);
TEST(LogTest, Empty) { TEST(LogTest, Empty) {
ASSERT_EQ("EOF", Read()); ASSERT_EQ("EOF", Read());
@ -350,15 +318,6 @@ TEST(LogTest, AlignedEof) {
ASSERT_EQ("EOF", Read()); ASSERT_EQ("EOF", Read());
} }
TEST(LogTest, OpenForAppend) {
Write("hello");
ReopenForAppend();
Write("world");
ASSERT_EQ("hello", Read());
ASSERT_EQ("world", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, RandomRead) { TEST(LogTest, RandomRead) {
const int N = 500; const int N = 500;
Random write_rnd(301); Random write_rnd(301);
@ -392,32 +351,20 @@ TEST(LogTest, BadRecordType) {
ASSERT_EQ("OK", MatchError("unknown record type")); ASSERT_EQ("OK", MatchError("unknown record type"));
} }
TEST(LogTest, TruncatedTrailingRecordIsIgnored) { TEST(LogTest, TruncatedTrailingRecord) {
Write("foo"); Write("foo");
ShrinkSize(4); // Drop all payload as well as a header byte ShrinkSize(4); // Drop all payload as well as a header byte
ASSERT_EQ("EOF", Read()); ASSERT_EQ("EOF", Read());
// Truncated last record is ignored, not treated as an error. ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
ASSERT_EQ(0, DroppedBytes()); ASSERT_EQ("OK", MatchError("truncated record at end of file"));
ASSERT_EQ("", ReportMessage());
} }
TEST(LogTest, BadLength) { TEST(LogTest, BadLength) {
const int kPayloadSize = kBlockSize - kHeaderSize;
Write(BigString("bar", kPayloadSize));
Write("foo");
// Least significant size byte is stored in header[4].
IncrementByte(4, 1);
ASSERT_EQ("foo", Read());
ASSERT_EQ(kBlockSize, DroppedBytes());
ASSERT_EQ("OK", MatchError("bad record length"));
}
TEST(LogTest, BadLengthAtEndIsIgnored) {
Write("foo"); Write("foo");
ShrinkSize(1); ShrinkSize(1);
ASSERT_EQ("EOF", Read()); ASSERT_EQ("EOF", Read());
ASSERT_EQ(0, DroppedBytes()); ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
ASSERT_EQ("", ReportMessage()); ASSERT_EQ("OK", MatchError("bad record length"));
} }
TEST(LogTest, ChecksumMismatch) { TEST(LogTest, ChecksumMismatch) {
@ -468,40 +415,6 @@ TEST(LogTest, UnexpectedFirstType) {
ASSERT_EQ("OK", MatchError("partial record without end")); ASSERT_EQ("OK", MatchError("partial record without end"));
} }
TEST(LogTest, MissingLastIsIgnored) {
Write(BigString("bar", kBlockSize));
// Remove the LAST block, including header.
ShrinkSize(14);
ASSERT_EQ("EOF", Read());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(0, DroppedBytes());
}
TEST(LogTest, PartialLastIsIgnored) {
Write(BigString("bar", kBlockSize));
// Cause a bad record length in the LAST block.
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(0, DroppedBytes());
}
TEST(LogTest, SkipIntoMultiRecord) {
// Consider a fragmented record:
// first(R1), middle(R1), last(R1), first(R2)
// If initial_offset points to a record after first(R1) but before first(R2)
// incomplete fragment errors are not actual errors, and must be suppressed
// until a new first or full record is encountered.
Write(BigString("foo", 3*kBlockSize));
Write("correct");
StartReadingAt(kBlockSize);
ASSERT_EQ("correct", Read());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(0, DroppedBytes());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, ErrorJoinsRecords) { TEST(LogTest, ErrorJoinsRecords) {
// Consider two fragmented records: // Consider two fragmented records:
// first(R1) last(R1) first(R2) last(R2) // first(R1) last(R1) first(R2) last(R2)
@ -520,7 +433,7 @@ TEST(LogTest, ErrorJoinsRecords) {
ASSERT_EQ("correct", Read()); ASSERT_EQ("correct", Read());
ASSERT_EQ("EOF", Read()); ASSERT_EQ("EOF", Read());
const size_t dropped = DroppedBytes(); const int dropped = DroppedBytes();
ASSERT_LE(dropped, 2*kBlockSize + 100); ASSERT_LE(dropped, 2*kBlockSize + 100);
ASSERT_GE(dropped, 2*kBlockSize); ASSERT_GE(dropped, 2*kBlockSize);
} }
@ -571,10 +484,6 @@ TEST(LogTest, ReadFourthStart) {
3); 3);
} }
TEST(LogTest, ReadInitialOffsetIntoBlockPadding) {
CheckInitialOffsetRecord(3 * log::kBlockSize - 3, 5);
}
TEST(LogTest, ReadEnd) { TEST(LogTest, ReadEnd) {
CheckOffsetPastEndReturnsNoRecords(0); CheckOffsetPastEndReturnsNoRecords(0);
} }

View file

@ -12,22 +12,13 @@
namespace leveldb { namespace leveldb {
namespace log { namespace log {
static void InitTypeCrc(uint32_t* type_crc) {
for (int i = 0; i <= kMaxRecordType; i++) {
char t = static_cast<char>(i);
type_crc[i] = crc32c::Value(&t, 1);
}
}
Writer::Writer(WritableFile* dest) Writer::Writer(WritableFile* dest)
: dest_(dest), : dest_(dest),
block_offset_(0) { block_offset_(0) {
InitTypeCrc(type_crc_); for (int i = 0; i <= kMaxRecordType; i++) {
} char t = static_cast<char>(i);
type_crc_[i] = crc32c::Value(&t, 1);
Writer::Writer(WritableFile* dest, uint64_t dest_length) }
: dest_(dest), block_offset_(dest_length % kBlockSize) {
InitTypeCrc(type_crc_);
} }
Writer::~Writer() { Writer::~Writer() {
@ -83,7 +74,7 @@ Status Writer::AddRecord(const Slice& slice) {
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
assert(n <= 0xffff); // Must fit in two bytes assert(n <= 0xffff); // Must fit in two bytes
assert(block_offset_ + kHeaderSize + n <= kBlockSize); assert(block_offset_ + kHeaderSize + (int)n <= kBlockSize);
// Format the header // Format the header
char buf[kHeaderSize]; char buf[kHeaderSize];

View file

@ -9,11 +9,10 @@
#include "db/log_format.h" #include "db/log_format.h"
#include "leveldb/slice.h" #include "leveldb/slice.h"
#include "leveldb/status.h" #include "leveldb/status.h"
#include "leveldb/env.h"
namespace leveldb { namespace leveldb {
class WritableFile;
namespace log { namespace log {
class Writer { class Writer {
@ -22,16 +21,12 @@ class Writer {
// "*dest" must be initially empty. // "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use. // "*dest" must remain live while this Writer is in use.
explicit Writer(WritableFile* dest); explicit Writer(WritableFile* dest);
// Create a writer that will append data to "*dest".
// "*dest" must have initial length "dest_length".
// "*dest" must remain live while this Writer is in use.
Writer(WritableFile* dest, uint64_t dest_length);
~Writer(); ~Writer();
Status AddRecord(const Slice& slice); Status AddRecord(const Slice& slice);
void Close() {delete dest_; dest_=NULL;};
private: private:
WritableFile* dest_; WritableFile* dest_;
int block_offset_; // Current offset in block int block_offset_; // Current offset in block

View file

@ -6,6 +6,7 @@
#include "db/dbformat.h" #include "db/dbformat.h"
#include "leveldb/comparator.h" #include "leveldb/comparator.h"
#include "leveldb/env.h" #include "leveldb/env.h"
#include "leveldb/expiry.h"
#include "leveldb/iterator.h" #include "leveldb/iterator.h"
#include "util/coding.h" #include "util/coding.h"
@ -63,6 +64,8 @@ class MemTableIterator: public Iterator {
Slice key_slice = GetLengthPrefixedSlice(iter_.key()); Slice key_slice = GetLengthPrefixedSlice(iter_.key());
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
} }
virtual KeyMetaData & keymetadata() const
{MemTable::DecodeKeyMetaData(iter_.key(), keymetadata_); return(keymetadata_);};
virtual Status status() const { return Status::OK(); } virtual Status status() const { return Status::OK(); }
@ -81,7 +84,8 @@ Iterator* MemTable::NewIterator() {
void MemTable::Add(SequenceNumber s, ValueType type, void MemTable::Add(SequenceNumber s, ValueType type,
const Slice& key, const Slice& key,
const Slice& value) { const Slice& value,
const ExpiryTimeMicros & expiry) {
// Format of an entry is concatenation of: // Format of an entry is concatenation of:
// key_size : varint32 of internal_key.size() // key_size : varint32 of internal_key.size()
// key bytes : char[internal_key.size()] // key bytes : char[internal_key.size()]
@ -89,7 +93,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
// value bytes : char[value.size()] // value bytes : char[value.size()]
size_t key_size = key.size(); size_t key_size = key.size();
size_t val_size = value.size(); size_t val_size = value.size();
size_t internal_key_size = key_size + 8; size_t internal_key_size = key_size + KeySuffixSize(type);
const size_t encoded_len = const size_t encoded_len =
VarintLength(internal_key_size) + internal_key_size + VarintLength(internal_key_size) + internal_key_size +
VarintLength(val_size) + val_size; VarintLength(val_size) + val_size;
@ -97,15 +101,22 @@ void MemTable::Add(SequenceNumber s, ValueType type,
char* p = EncodeVarint32(buf, internal_key_size); char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size); memcpy(p, key.data(), key_size);
p += key_size; p += key_size;
if (IsExpiryKey(type))
{
EncodeFixed64(p, expiry);
p+=8;
}
EncodeFixed64(p, (s << 8) | type); EncodeFixed64(p, (s << 8) | type);
p += 8; p += 8;
p = EncodeVarint32(p, val_size); p = EncodeVarint32(p, val_size);
memcpy(p, value.data(), val_size); memcpy(p, value.data(), val_size);
assert(p + val_size == buf + encoded_len); assert((size_t)((p + val_size) - buf) == encoded_len);
table_.Insert(buf); table_.Insert(buf);
} }
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) { bool MemTable::Get(const LookupKey& key, Value* value, Status* s,
const Options * options) {
bool ret_flag(false);
Slice memkey = key.memtable_key(); Slice memkey = key.memtable_key();
Table::Iterator iter(&table_); Table::Iterator iter(&table_);
iter.Seek(memkey.data()); iter.Seek(memkey.data());
@ -113,6 +124,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
// entry format is: // entry format is:
// klength varint32 // klength varint32
// userkey char[klength] // userkey char[klength]
// optional uint64
// tag uint64 // tag uint64
// vlength varint32 // vlength varint32
// value char[vlength] // value char[vlength]
@ -122,24 +134,66 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
const char* entry = iter.key(); const char* entry = iter.key();
uint32_t key_length; uint32_t key_length;
const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length); const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length);
Slice internal_key(key_ptr, key_length);
if (comparator_.comparator.user_comparator()->Compare( if (comparator_.comparator.user_comparator()->Compare(
Slice(key_ptr, key_length - 8), ExtractUserKey(internal_key),
key.user_key()) == 0) { key.user_key()) == 0) {
// Correct user key // Correct user key
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); KeyMetaData meta;
switch (static_cast<ValueType>(tag & 0xff)) { DecodeKeyMetaData(entry, meta);
case kTypeValue: {
switch (meta.m_Type) {
case kTypeValueWriteTime:
case kTypeValueExplicitExpiry:
{
bool expired=false;
if (NULL!=options && options->ExpiryActivated())
expired=options->expiry_module->MemTableCallback(internal_key);
if (expired)
{
// like kTypeDeletion
*s = Status::NotFound(Slice());
ret_flag=true;
break;
} // if
//otherwise fall into kTypeValue code
} // case
case kTypeValue:
{
Slice v = GetLengthPrefixedSlice(key_ptr + key_length); Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
value->assign(v.data(), v.size()); value->assign(v.data(), v.size());
return true; ret_flag=true;
break;
} }
case kTypeDeletion: case kTypeDeletion:
*s = Status::NotFound(Slice()); *s = Status::NotFound(Slice());
return true; ret_flag=true;
break;
} // switch
// only unpack metadata if requested
if (key.WantsKeyMetaData())
key.SetKeyMetaData(meta);
} }
} }
} return ret_flag;
return false;
} }
// this is a static function
void MemTable::DecodeKeyMetaData(
const char * key,
KeyMetaData & meta)
{
Slice key_slice = GetLengthPrefixedSlice(key);
meta.m_Type=ExtractValueType(key_slice);
meta.m_Sequence=ExtractSequenceNumber(key_slice);
if (IsExpiryKey(meta.m_Type))
meta.m_Expiry=ExtractExpiry(key_slice);
else
meta.m_Expiry=0;
} // DecodeKeyMetaData
} // namespace leveldb } // namespace leveldb

View file

@ -24,10 +24,10 @@ class MemTable {
explicit MemTable(const InternalKeyComparator& comparator); explicit MemTable(const InternalKeyComparator& comparator);
// Increase reference count. // Increase reference count.
void Ref() { ++refs_; } void Ref() volatile { ++refs_; }
// Drop reference count. Delete if no more references exist. // Drop reference count. Delete if no more references exist.
void Unref() { void Unref() volatile {
--refs_; --refs_;
assert(refs_ >= 0); assert(refs_ >= 0);
if (refs_ <= 0) { if (refs_ <= 0) {
@ -36,7 +36,10 @@ class MemTable {
} }
// Returns an estimate of the number of bytes of data in use by this // Returns an estimate of the number of bytes of data in use by this
// data structure. It is safe to call when MemTable is being modified. // data structure.
//
// REQUIRES: external synchronization to prevent simultaneous
// operations on the same MemTable.
size_t ApproximateMemoryUsage(); size_t ApproximateMemoryUsage();
// Return an iterator that yields the contents of the memtable. // Return an iterator that yields the contents of the memtable.
@ -52,13 +55,17 @@ class MemTable {
// Typically value will be empty if type==kTypeDeletion. // Typically value will be empty if type==kTypeDeletion.
void Add(SequenceNumber seq, ValueType type, void Add(SequenceNumber seq, ValueType type,
const Slice& key, const Slice& key,
const Slice& value); const Slice& value,
const ExpiryTimeMicros& expiry=0);
// If memtable contains a value for key, store it in *value and return true. // If memtable contains a value for key, store it in *value and return true.
// If memtable contains a deletion for key, store a NotFound() error // If memtable contains a deletion for key, store a NotFound() error
// in *status and return true. // in *status and return true.
// Else, return false. // Else, return false.
bool Get(const LookupKey& key, std::string* value, Status* s); bool Get(const LookupKey& key, Value* value, Status* s, const Options * options);
// parse keymetadata from skiplist key string
static void DecodeKeyMetaData(const char * key, KeyMetaData & meta);
private: private:
~MemTable(); // Private since only Unref() should be used to delete it ~MemTable(); // Private since only Unref() should be used to delete it
@ -69,7 +76,7 @@ class MemTable {
int operator()(const char* a, const char* b) const; int operator()(const char* a, const char* b) const;
}; };
friend class MemTableIterator; friend class MemTableIterator;
friend class MemTableBackwardIterator; friend class MemTableBackwardIterator; // does not exist
typedef SkipList<const char*, KeyComparator> Table; typedef SkipList<const char*, KeyComparator> Table;

View file

@ -0,0 +1,248 @@
// -------------------------------------------------------------------
//
// penalty_test.cc
//
// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// -------------------------------------------------------------------
#include "util/testharness.h"
#include "util/testutil.h"
#include "leveldb/comparator.h"
#include "db/version_set.h"
/**
 * Program entry point.  Control is handed straight to the test harness and
 * whatever it returns becomes the process exit status.  The command line
 * is not inspected.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    const int exit_status = leveldb::test::RunAllTests();
    return exit_status;
}
namespace leveldb {
class TestVersion : public Version
{
public:
TestVersion()
: Version(NULL)
{
int loop;
for (loop=0; loop<config::kNumLevels; ++loop)
{
m_FalseFile[loop].file_size=0;
m_LevelFileCount[loop]=0;
} // for
};
virtual size_t NumFiles(int level) const {return(m_LevelFileCount[level]);};
virtual const std::vector<FileMetaData*> & GetFileList(int level) const
{
m_FalseVector.clear();
m_FalseVector.push_back(&m_FalseFile[level]);
return(m_FalseVector);
};
mutable std::vector<FileMetaData*> m_FalseVector;
mutable FileMetaData m_FalseFile[config::kNumLevels];
size_t m_LevelFileCount[config::kNumLevels];
}; // class TestVersion
/**
* Wrapper class for tests. Holds working variables
* and helper functions.
*/
class PenaltyTester : public VersionSet
{
public:
PenaltyTester()
: m_IntCompare(m_Options.comparator), VersionSet("", &m_Options, NULL, &m_IntCompare)
{
};
~PenaltyTester()
{
};
Options m_Options;
InternalKeyComparator m_IntCompare;
}; // class PenaltyTester
/*******************
* Form note:
* using ASSERT_TRUE(0==version.WritePenalty());
* instead of ASSERT_EQ / ASSERT_NE because WritePenalty
* returns a volatile int, which older compilers believe is
* not an equivalent type to a constant. RedHat 5, Solaris,
* and SmartOS were giving grief.
*******************/
/**
* Debug 1
*
* Disabled scenario: the whole test sits inside "#if 0" and is not compiled.
* It sets a write buffer size, a level-2 placeholder file size and a single
* level-1 file, then expects UpdatePenalty() to leave the write penalty at 0.
*/
#if 0
TEST(PenaltyTester, Debug1)
{
TestVersion version;
// NOTE(review): "penalty" is never read in this body
int penalty;
m_Options.write_buffer_size=46416847;
version.m_FalseFile[2].file_size=1075676398;
version.m_LevelFileCount[1]=1;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
} // test Debug1
#endif
/**
* No penalty scenarios
*
* Only the checks ahead of the "#if 0" below are compiled.  They expect a
* write penalty of 0 for an untouched TestVersion and for level 0 holding
* kL0_CompactionTrigger and then kL0_SlowdownWritesTrigger files.
* Everything from the "#if 0" to the matching "#endif" (level 0 over the
* slowdown threshold, levels 1, 2 and 3+) is currently disabled.
*/
TEST(PenaltyTester, NoPenalty)
{
TestVersion version;
// "level" is only referenced by the disabled level 3+ loop further down
int level;
m_Options.write_buffer_size=46416847;
// nothing
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
/**
* Level 0
* (overlapped level, penalty is count based)
*/
// no penalty
version.m_LevelFileCount[0]=config::kL0_CompactionTrigger;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
// ---- disabled from here to the matching #endif ----
#if 0 // needs rewrite to be time based
// threshold reached ... some penalty
version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger+1;
UpdatePenalty(&version);
ASSERT_TRUE(0!=version.WritePenalty());
// clean up
version.m_LevelFileCount[0]=0;
/**
* Level 1
* (overlapped level, penalty is count based)
*/
// no penalty
version.m_LevelFileCount[1]=config::kL0_CompactionTrigger;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
// threshold reached ... some penalty
version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger+1;
UpdatePenalty(&version);
ASSERT_TRUE(0!=version.WritePenalty());
// clean up
version.m_LevelFileCount[1]=0;
/**
* Level 2
* (landing level, penalty size based)
*/
// no penalty
version.m_FalseFile[2].file_size=0;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
version.m_FalseFile[2].file_size=VersionSet::DesiredBytesForLevel(2);
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
// size limit reached ... some penalty
version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2);
UpdatePenalty(&version);
ASSERT_TRUE(0!=version.WritePenalty());
// interaction rule with level 1
version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1;
version.m_LevelFileCount[1]=config::kL0_CompactionTrigger/2;
UpdatePenalty(&version);
ASSERT_TRUE(0!=version.WritePenalty());
// clean up
version.m_LevelFileCount[1]=0;
version.m_FalseFile[2].file_size=0;
/**
* Level 3+
* (landing level, penalty size based)
*/
for (level=3; level<config::kNumLevels; ++level)
{
// no penalty
version.m_FalseFile[level].file_size=0;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
version.m_FalseFile[level].file_size=VersionSet::DesiredBytesForLevel(level);
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
version.m_FalseFile[level].file_size=VersionSet::MaxBytesForLevel(level)-1;
UpdatePenalty(&version);
ASSERT_TRUE(0==version.WritePenalty());
version.m_FalseFile[level].file_size=VersionSet::MaxBytesForLevel(level);
UpdatePenalty(&version);
// the last level is the one case expected to stay at zero penalty
if ((config::kNumLevels-1)!=level)
ASSERT_TRUE(0!=version.WritePenalty());
else
ASSERT_TRUE(0==version.WritePenalty());
// clean up
version.m_FalseFile[level].file_size=0;
} // for
#endif
} // test NoPenalty
} // namespace leveldb

View file

@ -1,324 +0,0 @@
// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/write_batch.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
// Fixture for the recovery tests.  Construction wipes any previous database
// at <test tmp dir>/recovery_test and opens a fresh one; destruction closes
// it and wipes it again.  The helpers reopen the database and inspect the
// files found in its directory.
class RecoveryTest {
 public:
  RecoveryTest() : env_(Env::Default()), db_(NULL) {
    dbname_ = test::TmpDir() + "/recovery_test";
    DestroyDB(dbname_, Options());
    Open();
  }

  ~RecoveryTest() {
    Close();
    DestroyDB(dbname_, Options());
  }

  // The currently open database, viewed as its implementation class.
  DBImpl* dbfull() const { return reinterpret_cast<DBImpl*>(db_); }
  Env* env() const { return env_; }

  // Returns false only when the Env answers NewAppendableFile() with a
  // NotSupported status; any other outcome counts as "can append".
  bool CanAppend() {
    // Must start out NULL: if the call fails without storing anything in
    // the out-parameter, "delete tmp" would otherwise be applied to an
    // indeterminate pointer.  Deleting NULL is a no-op.
    WritableFile* tmp = NULL;
    Status s = env_->NewAppendableFile(CurrentFileName(dbname_), &tmp);
    delete tmp;
    if (s.IsNotSupportedError()) {
      return false;
    } else {
      return true;
    }
  }

  // Destroys the DB object (if any) and forgets it.
  void Close() {
    delete db_;
    db_ = NULL;
  }

  // Closes the current database and opens it again.  With options == NULL
  // the database is opened with reuse_logs and create_if_missing enabled;
  // otherwise a copy of *options is used.  A NULL env in the chosen options
  // is replaced by the fixture's env.  Asserts that the open succeeds and
  // that exactly one log file exists afterwards.
  void Open(Options* options = NULL) {
    Close();
    Options opts;
    if (options != NULL) {
      opts = *options;
    } else {
      opts.reuse_logs = true;  // TODO(sanjay): test both ways
      opts.create_if_missing = true;
    }
    if (opts.env == NULL) {
      opts.env = env_;
    }
    ASSERT_OK(DB::Open(opts, dbname_, &db_));
    ASSERT_EQ(1, NumLogs());
  }

  Status Put(const std::string& k, const std::string& v) {
    return db_->Put(WriteOptions(), k, v);
  }

  // Returns the stored value, "NOT_FOUND" for a missing key, or the status
  // text for any other failure.  "snapshot" is accepted but not used: the
  // read is always issued with default ReadOptions.
  std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
    std::string result;
    Status s = db_->Get(ReadOptions(), k, &result);
    if (s.IsNotFound()) {
      result = "NOT_FOUND";
    } else if (!s.ok()) {
      result = s.ToString();
    }
    return result;
  }

  // Path of the manifest named by the CURRENT file (one trailing newline,
  // if present, is stripped from the CURRENT contents).
  std::string ManifestFileName() {
    std::string current;
    ASSERT_OK(ReadFileToString(env_, CurrentFileName(dbname_), &current));
    size_t len = current.size();
    if (len > 0 && current[len-1] == '\n') {
      current.resize(len - 1);
    }
    return dbname_ + "/" + current;
  }

  std::string LogName(uint64_t number) {
    return LogFileName(dbname_, number);
  }

  // Deletes every log file in the database directory and returns how many
  // log files were found.
  size_t DeleteLogFiles() {
    std::vector<uint64_t> logs = GetFiles(kLogFile);
    for (size_t i = 0; i < logs.size(); i++) {
      ASSERT_OK(env_->DeleteFile(LogName(logs[i]))) << LogName(logs[i]);
    }
    return logs.size();
  }

  // Number of the first log file returned by GetFiles().
  // REQUIRES: at least one log file exists.
  uint64_t FirstLogFile() {
    return GetFiles(kLogFile)[0];
  }

  // File numbers of all directory entries that parse as file type "t".
  std::vector<uint64_t> GetFiles(FileType t) {
    std::vector<std::string> filenames;
    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
    std::vector<uint64_t> result;
    for (size_t i = 0; i < filenames.size(); i++) {
      uint64_t number;
      FileType type;
      if (ParseFileName(filenames[i], &number, &type) && type == t) {
        result.push_back(number);
      }
    }
    return result;
  }

  int NumLogs() {
    return GetFiles(kLogFile).size();
  }

  int NumTables() {
    return GetFiles(kTableFile).size();
  }

  uint64_t FileSize(const std::string& fname) {
    uint64_t result;
    ASSERT_OK(env_->GetFileSize(fname, &result)) << fname;
    return result;
  }

  void CompactMemTable() {
    dbfull()->TEST_CompactMemTable();
  }

  // Directly construct a log file that sets key to val.
  void MakeLogFile(uint64_t lognum, SequenceNumber seq, Slice key, Slice val) {
    std::string fname = LogFileName(dbname_, lognum);
    WritableFile* file;
    ASSERT_OK(env_->NewWritableFile(fname, &file));
    log::Writer writer(file);
    WriteBatch batch;
    batch.Put(key, val);
    WriteBatchInternal::SetSequence(&batch, seq);
    ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch)));
    ASSERT_OK(file->Flush());
    delete file;
  }

 private:
  std::string dbname_;
  Env* env_;
  DB* db_;
};
// After a write and a close, two consecutive reopens must each report the
// same manifest file name as before and still return the written value.
TEST(RecoveryTest, ManifestReused) {
  const bool appendable = CanAppend();
  if (!appendable) {
    fprintf(stderr, "skipping test because env does not support appending\n");
    return;
  }

  ASSERT_OK(Put("foo", "bar"));
  Close();
  const std::string manifest_before = ManifestFileName();

  for (int reopen = 0; reopen < 2; ++reopen) {
    Open();
    ASSERT_EQ(manifest_before, ManifestFileName());
    ASSERT_EQ("bar", Get("foo"));
  }
}
// Grows the manifest to 3 MiB with zero bytes, then expects the next open to
// switch to a different manifest smaller than 10000 bytes, and the open
// after that to keep using the new one.  The stored value must survive both.
TEST(RecoveryTest, LargeManifestCompacted) {
  const bool appendable = CanAppend();
  if (!appendable) {
    fprintf(stderr, "skipping test because env does not support appending\n");
    return;
  }

  ASSERT_OK(Put("foo", "bar"));
  Close();
  const std::string bloated_manifest = ManifestFileName();

  // Pad with zeroes to make manifest file very big.
  {
    const uint64_t current_size = FileSize(bloated_manifest);
    WritableFile* appender;
    ASSERT_OK(env()->NewAppendableFile(bloated_manifest, &appender));
    const size_t pad_bytes = 3*1048576 - static_cast<size_t>(current_size);
    std::string padding(pad_bytes, 0);
    ASSERT_OK(appender->Append(padding));
    ASSERT_OK(appender->Flush());
    delete appender;
  }

  // First reopen: a fresh, small manifest takes over.
  Open();
  const std::string compacted_manifest = ManifestFileName();
  ASSERT_NE(bloated_manifest, compacted_manifest);
  ASSERT_GT(10000, FileSize(compacted_manifest));
  ASSERT_EQ("bar", Get("foo"));

  // Second reopen: the fresh manifest stays in use.
  Open();
  ASSERT_EQ(compacted_manifest, ManifestFileName());
  ASSERT_EQ("bar", Get("foo"));
}
// After the single log file is deleted, the written key must read back as
// "NOT_FOUND" on each of two consecutive reopens.
TEST(RecoveryTest, NoLogFiles) {
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_EQ(1, DeleteLogFiles());

  for (int reopen = 0; reopen < 2; reopen++) {
    Open();
    ASSERT_EQ("NOT_FOUND", Get("foo"));
  }
}
// Checks that reopening keeps the existing log file. The first pass starts
// from an empty log (after compacting the memtable), the second from a
// non-empty one.
TEST(RecoveryTest, LogFileReuse) {
  if (!CanAppend()) {
    fprintf(stderr, "skipping test because env does not support appending\n");
    return;
  }

  for (int pass = 0; pass < 2; pass++) {
    const bool first_pass = (pass == 0);

    ASSERT_OK(Put("foo", "bar"));
    if (first_pass) {
      // Compact to ensure current log is empty
      CompactMemTable();
    }
    Close();

    ASSERT_EQ(1, NumLogs());
    const uint64_t log_number = FirstLogFile();
    if (first_pass) {
      ASSERT_EQ(0, FileSize(LogName(log_number)));
    } else {
      ASSERT_LT(0, FileSize(LogName(log_number)));
    }

    // Two reopen cycles; each must keep the same single log file.
    for (int reopen = 0; reopen < 2; reopen++) {
      Open();
      ASSERT_EQ(1, NumLogs());
      ASSERT_EQ(log_number, FirstLogFile()) << "did not reuse log file";
      ASSERT_EQ("bar", Get("foo"));
    }
  }
}
// Writes enough entries into one log that reopening with a reduced write
// buffer produces at least two tables, then checks every entry is readable.
TEST(RecoveryTest, MultipleMemTables) {
  // Make a large log.
  const int kNum = 1000;
  char key[100];  // reused for every formatted key
  for (int i = 0; i < kNum; i++) {
    snprintf(key, sizeof(key), "%050d", i);
    ASSERT_OK(Put(key, key));
  }
  ASSERT_EQ(0, NumTables());
  Close();
  ASSERT_EQ(0, NumTables());
  ASSERT_EQ(1, NumLogs());
  const uint64_t log_before_reopen = FirstLogFile();

  // Force creation of multiple memtables by reducing the write buffer size.
  Options small_buffer;
  small_buffer.reuse_logs = true;
  small_buffer.write_buffer_size = (kNum*100) / 2;
  Open(&small_buffer);
  ASSERT_LE(2, NumTables());
  ASSERT_EQ(1, NumLogs());
  ASSERT_NE(log_before_reopen, FirstLogFile()) << "must not reuse log";

  // Each key was stored with itself as the value.
  for (int i = 0; i < kNum; i++) {
    snprintf(key, sizeof(key), "%050d", i);
    ASSERT_EQ(key, Get(key));
  }
}
// Places several hand-made log files next to the real one and checks that
// recovery applies all of them, that the recovered state survives another
// reopen, and that an older log added afterwards is not applied.
TEST(RecoveryTest, MultipleLogFiles) {
  ASSERT_OK(Put("foo", "bar"));
  Close();
  ASSERT_EQ(1, NumLogs());

  // Make a bunch of uncompacted log files.
  const uint64_t first_log = FirstLogFile();
  MakeLogFile(first_log+1, 1000, "hello", "world");
  MakeLogFile(first_log+2, 1001, "hi", "there");
  MakeLogFile(first_log+3, 1002, "foo", "bar2");

  // Recover and check that all log files were processed.
  Open();
  ASSERT_LE(1, NumTables());
  ASSERT_EQ(1, NumLogs());
  const uint64_t recovered_log = FirstLogFile();
  ASSERT_LE(first_log+3, recovered_log);
  ASSERT_EQ("bar2", Get("foo"));
  ASSERT_EQ("world", Get("hello"));
  ASSERT_EQ("there", Get("hi"));

  // Round 0: test that previous recovery produced recoverable state.
  // Round 1: check that introducing an older log file does not cause it to
  // be re-read.
  for (int round = 0; round < 2; round++) {
    if (round == 1) {
      Close();
      MakeLogFile(first_log+1, 2000, "hello", "stale write");
    }
    Open();
    ASSERT_LE(1, NumTables());
    ASSERT_EQ(1, NumLogs());
    if (CanAppend()) {
      ASSERT_EQ(recovered_log, FirstLogFile());
    }
    ASSERT_EQ("bar2", Get("foo"));
    ASSERT_EQ("world", Get("hello"));
    ASSERT_EQ("there", Get("hi"));
  }
}
} // namespace leveldb
// Test binary entry point: the process exit code is whatever
// leveldb::test::RunAllTests() returns. Command-line arguments are unused.
int main(int /*argc*/, char** /*argv*/) {
  const int exit_code = leveldb::test::RunAllTests();
  return exit_code;
}

View file

@ -45,30 +45,56 @@ namespace {
class Repairer { class Repairer {
public: public:
Repairer(const std::string& dbname, const Options& options) Repairer(const std::string& dbname, const Options& options)
: dbname_(dbname), : double_cache_(options),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options, double_cache_.GetBlockCache())),
org_options_(options),
dbname_(options_.tiered_fast_prefix),
org_dbname_(dbname),
env_(options.env), env_(options.env),
icmp_(options.comparator), icmp_(options.comparator),
ipolicy_(options.filter_policy), ipolicy_(options.filter_policy),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
owns_info_log_(options_.info_log != options.info_log), owns_info_log_(options_.info_log != options.info_log),
owns_cache_(options_.block_cache != options.block_cache), db_lock_(NULL),
next_file_number_(1) { next_file_number_(1)
{
// TableCache can be small since we expect each table to be opened once. // TableCache can be small since we expect each table to be opened once.
table_cache_ = new TableCache(dbname_, &options_, 10); table_cache_ = new TableCache(dbname_, &options_, double_cache_.GetFileCache(), double_cache_);
} }
~Repairer() { ~Repairer() {
delete table_cache_;
if (owns_info_log_) { if (owns_info_log_) {
delete options_.info_log; delete options_.info_log;
} }
if (owns_cache_) { // if (owns_cache_) {
delete options_.block_cache; // delete options_.block_cache;
} // }
// must remove second ref counter that keeps overlapped files locked
// table cache
bool is_overlap;
for (int level = 0; level < config::kNumLevels; level++) {
{
is_overlap=(level < leveldb::config::kNumOverlapLevels);
for (size_t i = 0; i < table_numbers_[level].size(); i++) {
table_cache_->Evict(table_numbers_[level][i], is_overlap);
} // for
} // if
} // for
delete table_cache_;
} }
Status Run() { Status Run() {
Status status = FindFiles(); Status status;
status = env_->LockFile(LockFileName(dbname_), &db_lock_);
if (status.ok())
status = MakeLevelDirectories(env_, options_);
if (status.ok()) {
status = FindFiles();
if (status.ok()) { if (status.ok()) {
ConvertLogFilesToTables(); ConvertLogFilesToTables();
ExtractMetaData(); ExtractMetaData();
@ -76,18 +102,56 @@ class Repairer {
} }
if (status.ok()) { if (status.ok()) {
unsigned long long bytes = 0; unsigned long long bytes = 0;
for (size_t i = 0; i < tables_.size(); i++) { unsigned long long files = 0;
bytes += tables_[i].meta.file_size;
// calculate size for log information
for (int level=0; level<config::kNumLevels;++level)
{
std::vector<TableInfo> * table_ptr;
std::vector<TableInfo>::const_iterator i;
table_ptr=&tables_[level];
files+=table_ptr->size();
for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
bytes += i->meta.file_size;
} }
} // for
Log(options_.info_log, Log(options_.info_log,
"**** Repaired leveldb %s; " "**** Repaired leveldb %s; "
"recovered %d files; %llu bytes. " "recovered %d files; %llu bytes. "
"Some data may have been lost. " "Some data may have been lost. "
"****", "****",
dbname_.c_str(), dbname_.c_str(),
static_cast<int>(tables_.size()), static_cast<int>(files),
bytes); bytes);
} }
if (db_lock_ != NULL) {
env_->UnlockFile(db_lock_);
}
}
// perform Riak specific scan for overlapping .sst files
// within a level
if (status.ok())
{
leveldb::DB * db_ptr;
Options options;
db_ptr=NULL;
options=org_options_;
// options.block_cache=NULL; // not reusing for fear of edge cases
options.is_repair=true;
options.error_if_exists=false;
status=leveldb::DB::Open(options, org_dbname_, &db_ptr);
if (status.ok())
status=db_ptr->VerifyLevels();
delete db_ptr;
} // if
return status; return status;
} }
@ -97,34 +161,36 @@ class Repairer {
SequenceNumber max_sequence; SequenceNumber max_sequence;
}; };
std::string const dbname_; DoubleCache double_cache_;
Options const options_, org_options_;
std::string const dbname_, org_dbname_;
Env* const env_; Env* const env_;
InternalKeyComparator const icmp_; InternalKeyComparator const icmp_;
InternalFilterPolicy const ipolicy_; InternalFilterPolicy const ipolicy_;
Options const options_;
bool owns_info_log_; bool owns_info_log_;
bool owns_cache_; FileLock* db_lock_;
TableCache* table_cache_; TableCache* table_cache_;
VersionEdit edit_; VersionEdit edit_;
std::vector<std::string> manifests_; std::vector<std::string> manifests_;
std::vector<uint64_t> table_numbers_; std::vector<uint64_t> table_numbers_[config::kNumLevels];
std::vector<uint64_t> logs_; std::vector<uint64_t> logs_;
std::vector<TableInfo> tables_; std::vector<TableInfo> tables_[config::kNumLevels];
uint64_t next_file_number_; uint64_t next_file_number_;
Status FindFiles() { Status FindFiles()
{
std::vector<std::string> filenames; std::vector<std::string> filenames;
uint64_t number;
FileType type;
int level;
// base directory
Status status = env_->GetChildren(dbname_, &filenames); Status status = env_->GetChildren(dbname_, &filenames);
if (!status.ok()) { if (!status.ok()) {
return status; return status;
} }
if (filenames.empty()) {
return Status::IOError(dbname_, "repair found no files");
}
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) { for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) { if (ParseFileName(filenames[i], &number, &type)) {
if (type == kDescriptorFile) { if (type == kDescriptorFile) {
@ -136,13 +202,38 @@ class Repairer {
if (type == kLogFile) { if (type == kLogFile) {
logs_.push_back(number); logs_.push_back(number);
} else if (type == kTableFile) { } else if (type == kTableFile) {
table_numbers_.push_back(number); table_numbers_[0].push_back(number);
} else { } else {
// Ignore other files // Ignore other files
} // else
} // else
} // if
} // for
for (level=0; level < config::kNumLevels; ++level)
{
std::string dirname;
filenames.clear();
dirname=MakeDirName2(options_, level, "sst");
Status status = env_->GetChildren(dirname, &filenames);
if (!status.ok()) {
return status;
} }
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
if (number + 1 > next_file_number_) {
next_file_number_ = number + 1;
} }
if (type == kTableFile) {
table_numbers_[level].push_back(number);
} }
} } // if
} // for
} // for
return status; return status;
} }
@ -186,7 +277,7 @@ class Repairer {
reporter.env = env_; reporter.env = env_;
reporter.info_log = options_.info_log; reporter.info_log = options_.info_log;
reporter.lognum = log; reporter.lognum = log;
// We intentionally make log::Reader do checksumming so that // We intentially make log::Reader do checksumming so that
// corruptions cause entire commits to be skipped instead of // corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence // propagating bad information (like overly large sequence
// numbers). // numbers).
@ -203,11 +294,11 @@ class Repairer {
while (reader.ReadRecord(&record, &scratch)) { while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < 12) { if (record.size() < 12) {
reporter.Corruption( reporter.Corruption(
record.size(), Status::Corruption("log record too small", logname)); record.size(), Status::Corruption("log record too small"));
continue; continue;
} }
WriteBatchInternal::SetContents(&batch, record); WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, mem); status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
if (status.ok()) { if (status.ok()) {
counter += WriteBatchInternal::Count(&batch); counter += WriteBatchInternal::Count(&batch);
} else { } else {
@ -223,14 +314,15 @@ class Repairer {
// since ExtractMetaData() will also generate edits. // since ExtractMetaData() will also generate edits.
FileMetaData meta; FileMetaData meta;
meta.number = next_file_number_++; meta.number = next_file_number_++;
meta.level = 0;
Iterator* iter = mem->NewIterator(); Iterator* iter = mem->NewIterator();
status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); status = BuildTable(dbname_, env_, options_, icmp_.user_comparator(), table_cache_, iter, &meta, 0);
delete iter; delete iter;
mem->Unref(); mem->Unref();
mem = NULL; mem = NULL;
if (status.ok()) { if (status.ok()) {
if (meta.file_size > 0) { if (meta.file_size > 0) {
table_numbers_.push_back(meta.number); table_numbers_[0].push_back(meta.number);
} }
} }
Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
@ -242,52 +334,48 @@ class Repairer {
} }
void ExtractMetaData() { void ExtractMetaData() {
for (size_t i = 0; i < table_numbers_.size(); i++) { for (int level=0; level < config::kNumLevels; ++level)
ScanTable(table_numbers_[i]); {
} std::vector<uint64_t> * number_ptr;
} std::vector<uint64_t>::const_iterator i;
Iterator* NewTableIterator(const FileMetaData& meta) { number_ptr=&table_numbers_[level];
// Same as compaction iterators: if paranoid_checks are on, turn for (i = number_ptr->begin(); number_ptr->end()!= i; ++i) {
// on checksum verification.
ReadOptions r;
r.verify_checksums = options_.paranoid_checks;
return table_cache_->NewIterator(r, meta.number, meta.file_size);
}
void ScanTable(uint64_t number) {
TableInfo t; TableInfo t;
t.meta.number = number; t.meta.number = *i;
std::string fname = TableFileName(dbname_, number); t.meta.level = level;
Status status = env_->GetFileSize(fname, &t.meta.file_size); Status status = ScanTable(&t);
if (!status.ok()) { if (!status.ok())
// Try alternate file name. {
fname = SSTTableFileName(dbname_, number); std::string fname = TableFileName(options_, t.meta.number, t.meta.level);
Status s2 = env_->GetFileSize(fname, &t.meta.file_size); Log(options_.info_log, "Table #%llu: ignoring %s",
if (s2.ok()) {
status = Status::OK();
}
}
if (!status.ok()) {
ArchiveFile(TableFileName(dbname_, number));
ArchiveFile(SSTTableFileName(dbname_, number));
Log(options_.info_log, "Table #%llu: dropped: %s",
(unsigned long long) t.meta.number, (unsigned long long) t.meta.number,
status.ToString().c_str()); status.ToString().c_str());
return; ArchiveFile(fname, true);
} else {
tables_[level].push_back(t);
}
}
}
} }
// Extract metadata by scanning through table. Status ScanTable(TableInfo* t) {
Table * table_ptr;
SstCounters counters;
std::string fname = TableFileName(options_, t->meta.number, t->meta.level);
int counter = 0; int counter = 0;
Iterator* iter = NewTableIterator(t.meta); Status status = env_->GetFileSize(fname, &t->meta.file_size);
if (status.ok()) {
Iterator* iter = table_cache_->NewIterator(
ReadOptions(), t->meta.number, t->meta.file_size, t->meta.level, &table_ptr);
bool empty = true; bool empty = true;
ParsedInternalKey parsed; ParsedInternalKey parsed;
t.max_sequence = 0; t->max_sequence = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key(); Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) { if (!ParseInternalKey(key, &parsed)) {
Log(options_.info_log, "Table #%llu: unparsable key %s", Log(options_.info_log, "Table #%llu: unparsable key %s",
(unsigned long long) t.meta.number, (unsigned long long) t->meta.number,
EscapeString(key).c_str()); EscapeString(key).c_str());
continue; continue;
} }
@ -295,115 +383,79 @@ class Repairer {
counter++; counter++;
if (empty) { if (empty) {
empty = false; empty = false;
t.meta.smallest.DecodeFrom(key); t->meta.smallest.DecodeFrom(key);
} }
t.meta.largest.DecodeFrom(key); t->meta.largest.DecodeFrom(key);
if (parsed.sequence > t.max_sequence) { if (parsed.sequence > t->max_sequence) {
t.max_sequence = parsed.sequence; t->max_sequence = parsed.sequence;
} }
} }
if (!iter->status().ok()) { if (!iter->status().ok()) {
status = iter->status(); status = iter->status();
} }
else {
counters=table_ptr->GetSstCounters();
t->meta.exp_write_low=counters.Value(eSstCountExpiry1);
t->meta.exp_write_high=counters.Value(eSstCountExpiry2);
t->meta.exp_explicit_high=counters.Value(eSstCountExpiry3);
}
delete iter; delete iter;
}
Log(options_.info_log, "Table #%llu: %d entries %s", Log(options_.info_log, "Table #%llu: %d entries %s",
(unsigned long long) t.meta.number, (unsigned long long) t->meta.number,
counter, counter,
status.ToString().c_str()); status.ToString().c_str());
return status;
if (status.ok()) {
tables_.push_back(t);
} else {
RepairTable(fname, t); // RepairTable archives input file.
}
}
void RepairTable(const std::string& src, TableInfo t) {
// We will copy src contents to a new table and then rename the
// new table over the source.
// Create builder.
std::string copy = TableFileName(dbname_, next_file_number_++);
WritableFile* file;
Status s = env_->NewWritableFile(copy, &file);
if (!s.ok()) {
return;
}
TableBuilder* builder = new TableBuilder(options_, file);
// Copy data.
Iterator* iter = NewTableIterator(t.meta);
int counter = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
builder->Add(iter->key(), iter->value());
counter++;
}
delete iter;
ArchiveFile(src);
if (counter == 0) {
builder->Abandon(); // Nothing to save
} else {
s = builder->Finish();
if (s.ok()) {
t.meta.file_size = builder->FileSize();
}
}
delete builder;
builder = NULL;
if (s.ok()) {
s = file->Close();
}
delete file;
file = NULL;
if (counter > 0 && s.ok()) {
std::string orig = TableFileName(dbname_, t.meta.number);
s = env_->RenameFile(copy, orig);
if (s.ok()) {
Log(options_.info_log, "Table #%llu: %d entries repaired",
(unsigned long long) t.meta.number, counter);
tables_.push_back(t);
}
}
if (!s.ok()) {
env_->DeleteFile(copy);
}
} }
Status WriteDescriptor() { Status WriteDescriptor() {
std::string tmp = TempFileName(dbname_, 1); std::string tmp = TempFileName(dbname_, 1);
WritableFile* file; WritableFile* file;
Status status = env_->NewWritableFile(tmp, &file); Status status = env_->NewWritableFile(tmp, &file, 4096);
if (!status.ok()) { if (!status.ok()) {
return status; return status;
} }
SequenceNumber max_sequence = 0; SequenceNumber max_sequence = 0;
for (size_t i = 0; i < tables_.size(); i++) { for (int level=0; level<config::kNumLevels;++level)
if (max_sequence < tables_[i].max_sequence) { {
max_sequence = tables_[i].max_sequence; std::vector<TableInfo> * table_ptr;
} std::vector<TableInfo>::const_iterator i;
table_ptr=&tables_[level];
for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
if (max_sequence < i->max_sequence) {
max_sequence = i->max_sequence;
} }
} // for
} // for
edit_.SetComparatorName(icmp_.user_comparator()->Name()); edit_.SetComparatorName(icmp_.user_comparator()->Name());
edit_.SetLogNumber(0); edit_.SetLogNumber(0);
edit_.SetNextFile(next_file_number_); edit_.SetNextFile(next_file_number_);
edit_.SetLastSequence(max_sequence); edit_.SetLastSequence(max_sequence);
for (size_t i = 0; i < tables_.size(); i++) { for (int level=0; level<config::kNumLevels;++level)
// TODO(opt): separate out into multiple levels {
const TableInfo& t = tables_[i]; std::vector<TableInfo> * table_ptr;
edit_.AddFile(0, t.meta.number, t.meta.file_size, std::vector<TableInfo>::const_iterator i;
t.meta.smallest, t.meta.largest);
} table_ptr=&tables_[level];
for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
edit_.AddFile2(level, i->meta.number, i->meta.file_size,
i->meta.smallest, i->meta.largest,
i->meta.exp_write_low, i->meta.exp_write_high, i->meta.exp_explicit_high);
} // for
} // for
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
{ {
log::Writer log(file); log::Writer log(file);
std::string record; std::string record;
edit_.EncodeTo(&record); edit_.EncodeTo(&record); // manifest format is default for release, options_ often incomplete
status = log.AddRecord(record); status = log.AddRecord(record);
} }
if (status.ok()) { if (status.ok()) {
@ -431,21 +483,33 @@ class Repairer {
return status; return status;
} }
void ArchiveFile(const std::string& fname) { void ArchiveFile(const std::string& fname, bool two_levels=false) {
// Move into another directory. E.g., for // Move into another directory. E.g., for
// dir/foo // dir/foo
// rename to // rename to
// dir/lost/foo // dir/lost/foo
const char* slash = strrchr(fname.c_str(), '/'); std::string::size_type slash, slash2;
slash=fname.rfind('/');
if (two_levels && std::string::npos!=slash && 0<slash)
{
slash2=fname.rfind('/',slash-1);
if (std::string::npos==slash2)
slash2=slash;
} // if
else
slash2=slash;
std::string new_dir; std::string new_dir;
if (slash != NULL) {
new_dir.assign(fname.data(), slash - fname.data()); if (std::string::npos != slash2 && 0<slash2)
} new_dir.append(fname,0,slash2);
new_dir.append("/lost"); new_dir.append("/lost");
env_->CreateDir(new_dir); // Ignore error env_->CreateDir(new_dir); // Ignore error
std::string new_file = new_dir; std::string new_file = new_dir;
new_file.append("/"); new_file.append("/");
new_file.append((slash == NULL) ? fname.c_str() : slash + 1); new_file.append((std::string::npos!=slash) ? fname.substr(slash+1) : fname);
Status s = env_->RenameFile(fname, new_file); Status s = env_->RenameFile(fname, new_file);
Log(options_.info_log, "Archiving %s: %s\n", Log(options_.info_log, "Archiving %s: %s\n",
fname.c_str(), s.ToString().c_str()); fname.c_str(), s.ToString().c_str());

View file

@ -1,10 +1,7 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
//
#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_
#define STORAGE_LEVELDB_DB_SKIPLIST_H_
// Thread safety // Thread safety
// ------------- // -------------
// //
@ -55,6 +52,12 @@ class SkipList {
// Returns true iff an entry that compares equal to key is in the list. // Returns true iff an entry that compares equal to key is in the list.
bool Contains(const Key& key) const; bool Contains(const Key& key) const;
// Returns true if all inserts have been sequentially increasing;
// else this SkipList has had keys inserted in non-sequential order
bool InSequentialInsertMode() const {
return sequentialInsertMode_;
}
// Iteration over the contents of a skip list // Iteration over the contents of a skip list
class Iterator { class Iterator {
public: public:
@ -94,8 +97,22 @@ class SkipList {
// Intentionally copyable // Intentionally copyable
}; };
protected:
// Checks the structure of this SkipList object, ensuring the keys are
// properly ordered
//
// This is protected since it is intended for use by unit tests; if a lock
// is used to protect Insert(), then it should be used to protect this
// method as well
bool Valid() const;
// Disables the sequential insert optimizations (used in performance testing)
void DisableSequentialInsertMode() {
sequentialInsertMode_ = false;
}
private: private:
enum { kMaxHeight = 12 }; enum { kMaxHeight = 17 };
// Immutable after construction // Immutable after construction
Comparator const compare_; Comparator const compare_;
@ -115,6 +132,18 @@ class SkipList {
// Read/written only by Insert(). // Read/written only by Insert().
Random rnd_; Random rnd_;
// Points to the last node in the list; modified only by Insert()
Node* tail_;
// Pointers to the nodes previous to the tail node; have max_height_ entries
Node* tailPrev_[kMaxHeight];
// The height of the tail_ node
int tailHeight_;
// We track the tail node until we have a non-sequential insert
bool sequentialInsertMode_;
Node* NewNode(const Key& key, int height); Node* NewNode(const Key& key, int height);
int RandomHeight(); int RandomHeight();
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
@ -129,6 +158,11 @@ class SkipList {
// node at "level" for every level in [0..max_height_-1]. // node at "level" for every level in [0..max_height_-1].
Node* FindGreaterOrEqual(const Key& key, Node** prev) const; Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
// Similar to FindGreaterOrEqual() except it uses the barrier-free
// variant of Next(); this is used only by Insert() and it
// checks the tail_ pointer in case we're doing a sequential insert
Node* NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const;
// Return the latest node with a key < key. // Return the latest node with a key < key.
// Return head_ if there is no such node. // Return head_ if there is no such node.
Node* FindLessThan(const Key& key) const; Node* FindLessThan(const Key& key) const;
@ -280,6 +314,54 @@ typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOr
} }
} }
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const {
int level = GetMaxHeight() - 1;
// If we have only seen sequential inserts up to this point, we can use
// the tail_ node
if ( sequentialInsertMode_ ) {
if (tail_ == NULL) {
// The list is currently empty, so the node being inserted
// will be the new tail_
assert(level == 0);
if (prev != NULL) prev[0] = head_;
return NULL;
}
else if (KeyIsAfterNode(key, tail_)) {
// The new key must be inserted after the current tail_ node
if (prev != NULL) {
int i;
for (i = 0; i < tailHeight_; ++i) {
prev[i] = tail_;
}
for (/*continue with i*/; i <= level; ++i) {
prev[i] = tailPrev_[i];
}
}
return NULL;
}
}
Node* x = head_;
while (true) {
Node* next = x->NoBarrier_Next(level);
if (KeyIsAfterNode(key, next)) {
// Keep searching in this list
x = next;
} else {
if (prev != NULL) prev[level] = x;
if (level == 0) {
return next;
} else {
// Switch to next list
level--;
}
}
}
}
template<typename Key, class Comparator> template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::FindLessThan(const Key& key) const { SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
@ -327,25 +409,41 @@ SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
arena_(arena), arena_(arena),
head_(NewNode(0 /* any key will do */, kMaxHeight)), head_(NewNode(0 /* any key will do */, kMaxHeight)),
max_height_(reinterpret_cast<void*>(1)), max_height_(reinterpret_cast<void*>(1)),
rnd_(0xdeadbeef) { rnd_(0xdeadbeef),
tail_(NULL),
tailHeight_(0),
sequentialInsertMode_(true) {
for (int i = 0; i < kMaxHeight; i++) { for (int i = 0; i < kMaxHeight; i++) {
head_->SetNext(i, NULL); head_->SetNext(i, NULL);
tailPrev_[i] = NULL;
} }
} }
template<typename Key, class Comparator> template<typename Key, class Comparator>
void SkipList<Key,Comparator>::Insert(const Key& key) { void SkipList<Key,Comparator>::Insert(const Key& key) {
// TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual() // We use a barrier-free variant of FindGreaterOrEqual()
// here since Insert() is externally synchronized. // here since Insert() is externally synchronized.
Node* prev[kMaxHeight]; Node* prev[kMaxHeight];
Node* x = FindGreaterOrEqual(key, prev); Node* x = NoBarrier_FindGreaterOrEqual(key, prev);
// If we're still in sequential-insert mode, check if the new node is being
// inserted at the end of the list, which is indicated by x being NULL
if (sequentialInsertMode_) {
if (x != NULL) {
// we have a non-sequential (AKA random) insert, so stop maintaining
// the tail bookkeeping overhead
sequentialInsertMode_ = false;
}
}
// Our data structure does not allow duplicate insertion // Our data structure does not allow duplicate insertion
assert(x == NULL || !Equal(key, x->key)); assert(x == NULL || !Equal(key, x->key));
int height = RandomHeight(); int i, height = RandomHeight();
if (height > GetMaxHeight()) { if (height > GetMaxHeight()) {
for (int i = GetMaxHeight(); i < height; i++) { // We are extending max_height_ which means we need to fill in the blanks
// in prev[] that were not filled in by NoBarrier_FindGreaterOrEqual()
for (i = GetMaxHeight(); i < height; ++i) {
prev[i] = head_; prev[i] = head_;
} }
//fprintf(stderr, "Change height from %d to %d\n", max_height_, height); //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
@ -361,12 +459,37 @@ void SkipList<Key,Comparator>::Insert(const Key& key) {
} }
x = NewNode(key, height); x = NewNode(key, height);
for (int i = 0; i < height; i++) { for (i = 0; i < height; ++i) {
// NoBarrier_SetNext() suffices since we will add a barrier when // NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i]. // we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i)); x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
prev[i]->SetNext(i, x); prev[i]->SetNext(i, x);
} }
// Do we need to update our tail_ pointer?
if (sequentialInsertMode_) {
Node* prevTail = tail_;
int prevTailHeight = tailHeight_;
tail_ = x;
tailHeight_ = height;
// We also need to update our tailPrev_ pointers; first we capture
// the nodes already pointing to the new tail_
for (i = 0; i < height; ++i) {
tailPrev_[i] = prev[i];
}
// If the previous tail node was taller than the new tail node, then
// the prev pointers above the current tail node's height (up to the
// height of the previous tail node) are simply the previous tail node
for (/*continue with i*/; i < prevTailHeight; ++i) {
tailPrev_[i] = prevTail;
}
// NOTE: any prev pointers above prevTailHeight (up to max_height_) were
// already set in tailPrev_ by previous calls to this method
}
} }
template<typename Key, class Comparator> template<typename Key, class Comparator>
@ -379,6 +502,115 @@ bool SkipList<Key,Comparator>::Contains(const Key& key) const {
} }
} }
} // namespace leveldb template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::Valid() const
{
// Note that we can use barrier-free overloads in this method since it is
// protected by the same lock as Insert().
#endif // STORAGE_LEVELDB_DB_SKIPLIST_H_ // Ensure that the list is properly sorted; use an iterator for this check
const Key* pPrevKey = NULL;
typename SkipList<Key, Comparator>::Iterator iter(this);
for ( iter.SeekToFirst(); iter.Valid(); iter.Next() ) {
if ( pPrevKey != NULL ) {
if ( compare_( *pPrevKey, iter.key() ) >= 0 ) {
return false;
}
}
pPrevKey = &iter.key();
}
// Now walk the linked list at each level and ensure it's sorted. Also track
// how many nodes we see at each level; the number of nodes in the linked
// list at level n must not be larger than the number of nodes at level n-1.
std::vector<int> nodeCounts( GetMaxHeight() );
int level;
for ( level = GetMaxHeight() - 1; level >= 0; --level ) {
int nodeCount = 0;
pPrevKey = NULL;
for ( Node* pNode = head_->NoBarrier_Next( level );
pNode != NULL;
pNode = pNode->NoBarrier_Next( level ) ) {
++nodeCount;
if ( pPrevKey != NULL ) {
if ( compare_( *pPrevKey, pNode->key ) >= 0 ) {
return false;
}
}
pPrevKey = &pNode->key;
}
nodeCounts[ level ] = nodeCount;
}
// Ensure the node counts do not increase as we move up the levels
int prevNodeCount = nodeCounts[0];
for ( level = 1; level < GetMaxHeight(); ++level ) {
int currentNodeCount = nodeCounts[ level ];
if ( currentNodeCount > prevNodeCount ) {
return false;
}
prevNodeCount = currentNodeCount;
}
// Ensure that tail_ points to the last node
if ( sequentialInsertMode_ ) {
if ( tail_ == NULL ) {
// tail_ is not set, so the list must be empty
if ( tailPrev_[0] != NULL || head_->NoBarrier_Next(0) != NULL ) {
return false;
}
}
else {
// we have a tail_ node; first ensure that its prev pointer actually
// points to it
if ( tailPrev_[0] == NULL || tailPrev_[0]->NoBarrier_Next(0) != tail_ ) {
return false;
}
if ( compare_( tailPrev_[0]->key, tail_->key ) >= 0 ) {
return false;
}
// now check the rest of the pointers in tailPrev_; up to tailHeight_,
// the next pointer of the node in tailPrev_ should point to tail_; after
// that, the next pointer should be NULL
for ( level = 1; level < GetMaxHeight(); ++level ) {
Node* tailPrev = tailPrev_[ level ];
if ( tailPrev == NULL ) {
return false;
}
if ( level < tailHeight_ ) {
if ( tailPrev->NoBarrier_Next( level ) != tail_ ) {
return false;
}
if ( compare_( tailPrev->key, tail_->key ) >= 0 ) {
return false;
}
}
else {
if ( tailPrev->NoBarrier_Next( level ) != NULL ) {
return false;
}
}
}
// the remainder of the tailPrev_ pointers (above max_height_)
// should be NULL
for ( /*continue with level*/; level < kMaxHeight; ++level ) {
if ( tailPrev_[ level ] != NULL ) {
return false;
}
}
// now ensure that FindLast() returns tail_
Node* lastNode = FindLast();
if ( lastNode != tail_ ) {
return false;
}
}
}
// if we get here, all is good
return true;
}
} // namespace leveldb

View file

@ -2,11 +2,15 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include "db/skiplist.h" #include "db/skiplist.h"
#include <set> #include <set>
#include "leveldb/env.h" #include "leveldb/env.h"
#include "util/arena.h" #include "util/arena.h"
#include "util/hash.h" #include "util/hash.h"
#include "util/mutexlock.h"
#include "util/random.h" #include "util/random.h"
#include "util/testharness.h" #include "util/testharness.h"
@ -26,15 +30,29 @@ struct Comparator {
} }
}; };
// Test-only wrapper around SkipList. It adds nothing of its own; it simply
// offers public Valid() and DisableSequentialInsertMode() members that forward
// to the base class so the tests below can call them on a list object.
template<typename Key, class Comparator>
class SkipListTest : public SkipList<Key, Comparator>
{
  typedef SkipList<Key, Comparator> Base;

public:
  SkipListTest(Comparator cmp, Arena* arena)
    : Base(cmp, arena)
  {
  }

  // Runs the base class's Valid() self-check and returns its verdict.
  bool Valid() const
  {
    return Base::Valid();
  }

  // Asks the base class to leave sequential-insert mode.
  void DisableSequentialInsertMode()
  {
    Base::DisableSequentialInsertMode();
  }
};
class SkipTest { }; class SkipTest { };
TEST(SkipTest, Empty) { TEST(SkipTest, Empty) {
Arena arena; Arena arena;
Comparator cmp; Comparator cmp;
SkipList<Key, Comparator> list(cmp, &arena); SkipListTest<Key, Comparator> list(cmp, &arena);
ASSERT_TRUE(!list.Contains(10)); ASSERT_TRUE(!list.Contains(10));
ASSERT_TRUE(list.Valid());
SkipList<Key, Comparator>::Iterator iter(&list); SkipListTest<Key, Comparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid()); ASSERT_TRUE(!iter.Valid());
iter.SeekToFirst(); iter.SeekToFirst();
ASSERT_TRUE(!iter.Valid()); ASSERT_TRUE(!iter.Valid());
@ -51,13 +69,14 @@ TEST(SkipTest, InsertAndLookup) {
std::set<Key> keys; std::set<Key> keys;
Arena arena; Arena arena;
Comparator cmp; Comparator cmp;
SkipList<Key, Comparator> list(cmp, &arena); SkipListTest<Key, Comparator> list(cmp, &arena);
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
Key key = rnd.Next() % R; Key key = rnd.Next() % R;
if (keys.insert(key).second) { if (keys.insert(key).second) {
list.Insert(key); list.Insert(key);
} }
} }
ASSERT_TRUE(list.Valid());
for (int i = 0; i < R; i++) { for (int i = 0; i < R; i++) {
if (list.Contains(i)) { if (list.Contains(i)) {
@ -69,7 +88,7 @@ TEST(SkipTest, InsertAndLookup) {
// Simple iterator tests // Simple iterator tests
{ {
SkipList<Key, Comparator>::Iterator iter(&list); SkipListTest<Key, Comparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid()); ASSERT_TRUE(!iter.Valid());
iter.Seek(0); iter.Seek(0);
@ -87,7 +106,7 @@ TEST(SkipTest, InsertAndLookup) {
// Forward iteration test // Forward iteration test
for (int i = 0; i < R; i++) { for (int i = 0; i < R; i++) {
SkipList<Key, Comparator>::Iterator iter(&list); SkipListTest<Key, Comparator>::Iterator iter(&list);
iter.Seek(i); iter.Seek(i);
// Compare against model iterator // Compare against model iterator
@ -107,7 +126,7 @@ TEST(SkipTest, InsertAndLookup) {
// Backward iteration test // Backward iteration test
{ {
SkipList<Key, Comparator>::Iterator iter(&list); SkipListTest<Key, Comparator>::Iterator iter(&list);
iter.SeekToLast(); iter.SeekToLast();
// Compare against model iterator // Compare against model iterator
@ -250,7 +269,7 @@ class ConcurrentTest {
// Note that generation 0 is never inserted, so it is ok if // Note that generation 0 is never inserted, so it is ok if
// <*,0,*> is missing. // <*,0,*> is missing.
ASSERT_TRUE((gen(pos) == 0) || ASSERT_TRUE((gen(pos) == 0) ||
(gen(pos) > static_cast<Key>(initial_state.Get(key(pos)))) (gen(pos) > initial_state.Get(key(pos)))
) << "key: " << key(pos) ) << "key: " << key(pos)
<< "; gen: " << gen(pos) << "; gen: " << gen(pos)
<< "; initgen: " << "; initgen: "
@ -313,18 +332,16 @@ class TestState {
state_cv_(&mu_) {} state_cv_(&mu_) {}
void Wait(ReaderState s) { void Wait(ReaderState s) {
mu_.Lock(); MutexLock lock(&mu_);
while (state_ != s) { while (state_ != s) {
state_cv_.Wait(); state_cv_.Wait();
} }
mu_.Unlock();
} }
void Change(ReaderState s) { void Change(ReaderState s) {
mu_.Lock(); MutexLock lock(&mu_);
state_ = s; state_ = s;
state_cv_.Signal(); state_cv_.Signal();
mu_.Unlock();
} }
private: private:
@ -371,6 +388,211 @@ TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); } TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); } TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
static void
RunSequentialInsert(
const int NumKeys,
bool AcquireLock,
bool ReverseInsert,
bool SequentialInsertModeEnabled )
{
const int loopCount = 5; // repeat the whole process this many times and average the time spent
std::vector<uint64_t> timeSpent;
port::Mutex mutex;
Env* env = Env::Default();
fprintf( stderr,
"Sequentially inserting %d keys in %s order,\n"
" seqential insert mode is initially %sabled,\n"
" %sacquiring a lock for each insert (averaging over %d runs)\n",
NumKeys, ReverseInsert ? "reverse" : "forward",
SequentialInsertModeEnabled ? "en" : "dis",
AcquireLock ? "" : "not ", loopCount );
int k;
for ( k = 0; k < loopCount; ++k ) {
int j;
Arena arena;
Comparator cmp;
SkipListTest<Key, Comparator> list( cmp, &arena );
// initially the SkipList should be in sequential mode
ASSERT_TRUE( list.InSequentialInsertMode() );
// were we instructed to disable sequential insert mode?
if ( !SequentialInsertModeEnabled ) {
list.DisableSequentialInsertMode();
ASSERT_TRUE( !list.InSequentialInsertMode() );
}
uint64_t start = env->NowMicros();
for ( j = 0; j < NumKeys; ++j ) {
Key key = ReverseInsert ? NumKeys - 1 - j : j;
if ( AcquireLock ) mutex.Lock();
list.Insert( key );
if ( AcquireLock ) mutex.Unlock();
}
uint64_t stop = env->NowMicros();
timeSpent.push_back( stop - start );
//fprintf( stderr, " Time for run %d: %llu\n", k, timeSpent[k] );
// if SequentialInsertModeEnabled is true, the SkipList should still be
// in sequential mode iff ReverseInsert is false
if ( SequentialInsertModeEnabled ) {
ASSERT_TRUE( list.InSequentialInsertMode() != ReverseInsert );
}
else {
ASSERT_TRUE( !list.InSequentialInsertMode() );
}
// ensure the SkipLlist is properly sorted
if ( AcquireLock ) mutex.Lock();
ASSERT_TRUE( list.Valid() );
if ( AcquireLock ) mutex.Unlock();
// ensure the SkipList contains all the keys we inserted
for ( j = 0; j < NumKeys; ++j ) {
ASSERT_TRUE( list.Contains( j ) );
}
}
// throw out the low and high times and average the rest
uint64_t totalTime, lowTime, highTime;
totalTime = lowTime = highTime = timeSpent[0];
for ( k = 1; k < loopCount; ++k ) {
uint64_t currentTime = timeSpent[k];
totalTime += currentTime;
if ( lowTime > currentTime ) lowTime = currentTime;
if ( highTime < currentTime ) highTime = currentTime;
}
totalTime -= (lowTime + highTime);
uint64_t averageTime = (totalTime / (loopCount - 2));
double timePerKey = (double)averageTime / (double)NumKeys;
fprintf( stderr, " Average insertion time: %" PRIu64 " (%f/key)\n", averageTime, timePerKey );
}
// Ascending inserts with no locking: run once with sequential-insert mode
// left enabled, then once with it disabled.
TEST(SkipTest, SequentialInsert_NoLock_ForwardInsert)
{
  const int numKeys = 100000;
  const bool acquireLock = false;
  const bool reverseInsert = false;

  for ( int pass = 0; pass < 2; ++pass ) {
    // pass 0 -> mode enabled, pass 1 -> mode disabled
    const bool sequentialInsertModeEnabled = ( pass == 0 );
    RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
  }
}
// Ascending inserts with a lock taken around each insert: run once with
// sequential-insert mode left enabled, then once with it disabled.
TEST(SkipTest, SequentialInsert_Lock_ForwardInsert)
{
  const int numKeys = 100000;
  const bool acquireLock = true;
  const bool reverseInsert = false;

  for ( int pass = 0; pass < 2; ++pass ) {
    // pass 0 -> mode enabled, pass 1 -> mode disabled
    const bool sequentialInsertModeEnabled = ( pass == 0 );
    RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
  }
}
// Descending inserts with no locking, starting with sequential-insert mode
// enabled.
TEST(SkipTest, SequentialInsert_NoLock_ReverseInsert)
{
  const int numKeys = 100000;
  const bool acquireLock = false;
  const bool reverseInsert = true;
  const bool sequentialInsertModeEnabled = true;

  RunSequentialInsert( numKeys,
                       acquireLock,
                       reverseInsert,
                       sequentialInsertModeEnabled );
}
// Descending inserts with a lock taken around each insert, starting with
// sequential-insert mode enabled.
TEST(SkipTest, SequentialInsert_Lock_ReverseInsert)
{
  const int numKeys = 100000;
  const bool acquireLock = true;
  const bool reverseInsert = true;
  const bool sequentialInsertModeEnabled = true;

  RunSequentialInsert( numKeys,
                       acquireLock,
                       reverseInsert,
                       sequentialInsertModeEnabled );
}
// Runs the ascending, lock-free insert benchmark at 10,000, 100,000 and
// 1,000,000 keys, each size first with sequential-insert mode enabled and
// then disabled, so the printed per-key times can be compared for an upward
// trend as the key count grows.
TEST(SkipTest, SequentialInsert_IncreasingNumberOfInserts)
{
  const bool acquireLock = false;
  const bool reverseInsert = false;

  // key counts grow by a factor of ten: 10000, 100000, 1000000
  for ( int numKeys = 10000; numKeys <= 1000000; numKeys *= 10 ) {
    for ( int pass = 0; pass < 2; ++pass ) {
      // pass 0 -> mode enabled, pass 1 -> mode disabled
      const bool sequentialInsertModeEnabled = ( pass == 0 );
      RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
    }
  }
}
// Begins with in-order inserts, then deliberately breaks the ordering and
// keeps inserting out of order, checking the mode flag and the list's
// validity at every stage.
TEST(SkipTest, SequentialInsert_MixedInsertionModes)
{
  const int numSequentialKeys = 100000;
  const int numNonSequentialKeys = 100000;
  const int totalNumKeys = numSequentialKeys + numNonSequentialKeys;

  Arena arena;
  Comparator cmp;
  SkipListTest<Key, Comparator> list( cmp, &arena );

  // a new list is expected to start out in sequential mode
  ASSERT_TRUE( list.InSequentialInsertMode() );

  // ascending keys 1 .. numSequentialKeys-1; key 0 is held back so that
  // inserting it afterwards is an out-of-order insert
  for ( int key = 1; key < numSequentialKeys; ++key ) {
    list.Insert( key );
  }

  // nothing out of order yet, so sequential mode should still be on
  ASSERT_TRUE( list.InSequentialInsertMode() );
  ASSERT_TRUE( list.Valid() );

  // the out-of-order insert: the list should drop out of sequential mode
  list.Insert( 0 );
  ASSERT_TRUE( !list.InSequentialInsertMode() );
  ASSERT_TRUE( list.Valid() );

  // fill in the remaining key range in a non-sequential (though not random)
  // pattern: first every other key walking down from the top ...
  for ( int offset = 0; offset < numNonSequentialKeys; offset += 2 ) {
    list.Insert( totalNumKeys - offset - 1 );
  }
  // ... then the keys in between, walking up from numSequentialKeys
  for ( int offset = 0; offset < numNonSequentialKeys; offset += 2 ) {
    list.Insert( numSequentialKeys + offset );
  }

  ASSERT_TRUE( !list.InSequentialInsertMode() );
  ASSERT_TRUE( list.Valid() );

  // every key in 0 .. totalNumKeys-1 must now be present
  for ( int key = 0; key < totalNumKeys; ++key ) {
    ASSERT_TRUE( list.Contains( key ) );
  }
}
} // namespace leveldb } // namespace leveldb
int main(int argc, char** argv) { int main(int argc, char** argv) {

View file

@ -5,7 +5,6 @@
#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_ #ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
#define STORAGE_LEVELDB_DB_SNAPSHOT_H_ #define STORAGE_LEVELDB_DB_SNAPSHOT_H_
#include "db/dbformat.h"
#include "leveldb/db.h" #include "leveldb/db.h"
namespace leveldb { namespace leveldb {

View file

@ -5,22 +5,26 @@
#include "db/table_cache.h" #include "db/table_cache.h"
#include "db/filename.h" #include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/version_edit.h"
#include "leveldb/env.h" #include "leveldb/env.h"
#include "leveldb/table.h" #include "leveldb/table.h"
#include "util/coding.h" #include "util/coding.h"
#include "leveldb/perf_count.h"
namespace leveldb { namespace leveldb {
struct TableAndFile {
RandomAccessFile* file;
Table* table;
};
static void DeleteEntry(const Slice& key, void* value) { static void DeleteEntry(const Slice& key, void* value) {
TableAndFile* tf = reinterpret_cast<TableAndFile*>(value); TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
if (0==dec_and_fetch(&tf->user_count))
{
if (NULL!=tf->doublecache)
tf->doublecache->SubFileSize(tf->table->GetFileSize());
delete tf->table; delete tf->table;
delete tf->file; delete tf->file;
delete tf; delete tf;
} // if
} }
static void UnrefEntry(void* arg1, void* arg2) { static void UnrefEntry(void* arg1, void* arg2) {
@ -31,37 +35,38 @@ static void UnrefEntry(void* arg1, void* arg2) {
TableCache::TableCache(const std::string& dbname, TableCache::TableCache(const std::string& dbname,
const Options* options, const Options* options,
int entries) Cache * file_cache,
DoubleCache & doublecache)
: env_(options->env), : env_(options->env),
dbname_(dbname), dbname_(dbname),
options_(options), options_(options),
cache_(NewLRUCache(entries)) { cache_(file_cache),
doublecache_(doublecache)
{
} }
TableCache::~TableCache() { TableCache::~TableCache() {
delete cache_;
} }
Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, int level,
Cache::Handle** handle) { Cache::Handle** handle, bool is_compaction,
bool for_iterator) {
Status s; Status s;
char buf[sizeof(file_number)]; char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number); EncodeFixed64(buf, file_number);
Slice key(buf, sizeof(buf)); Slice key(buf, sizeof(buf));
*handle = cache_->Lookup(key); *handle = cache_->Lookup(key);
if (*handle == NULL) { if (*handle == NULL) {
std::string fname = TableFileName(dbname_, file_number); std::string fname = TableFileName(*options_, file_number, level);
RandomAccessFile* file = NULL; RandomAccessFile* file = NULL;
Table* table = NULL; Table* table = NULL;
s = env_->NewRandomAccessFile(fname, &file); s = env_->NewRandomAccessFile(fname, &file);
if (!s.ok()) {
std::string old_fname = SSTTableFileName(dbname_, file_number);
if (env_->NewRandomAccessFile(old_fname, &file).ok()) {
s = Status::OK();
}
}
if (s.ok()) { if (s.ok()) {
s = Table::Open(*options_, file, file_size, &table); s = Table::Open(*options_, file, file_size, &table);
// Riak: support opportunity to manage Linux page cache
if (is_compaction)
file->SetForCompaction(file_size);
} }
if (!s.ok()) { if (!s.ok()) {
@ -73,22 +78,74 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
TableAndFile* tf = new TableAndFile; TableAndFile* tf = new TableAndFile;
tf->file = file; tf->file = file;
tf->table = table; tf->table = table;
*handle = cache_->Insert(key, tf, 1, &DeleteEntry); tf->doublecache = &doublecache_;
tf->file_number = file_number;
tf->level = level;
*handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry);
gPerfCounters->Inc(ePerfTableOpened);
doublecache_.AddFileSize(table->GetFileSize());
// temporary hardcoding to match number of levels defined as
// overlapped in version_set.cc
if (level<config::kNumOverlapLevels)
cache_->Addref(*handle);
} }
} }
else
{
Table *table = reinterpret_cast<TableAndFile*>(cache_->Value(*handle))->table;
// this is NOT first access, see if bloom filter can load now
if (!for_iterator && table->ReadFilter())
{
// TableAndFile now going to be present in two cache entries
// 1. retrieve old entry within file cache
TableAndFile* tf = reinterpret_cast<TableAndFile*>(cache_->Value(*handle));
inc_and_fetch(&tf->user_count);
// 2. must clean file size, do not want double count
if (NULL!=tf->doublecache)
tf->doublecache->SubFileSize(tf->table->GetFileSize());
// 3. release current reference (and possible special overlap reference)
cache_->Release(*handle);
if (tf->level<config::kNumOverlapLevels)
cache_->Release(*handle);
// 4. create second table cache entry using TableObjectSize that now includes
// bloom filter size
*handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry);
// 5. set double reference if an overlapped file (prevents from being flushed)
if (level<config::kNumOverlapLevels)
cache_->Addref(*handle);
} // if
// for Linux, let fadvise start precaching
if (is_compaction)
{
RandomAccessFile *file = reinterpret_cast<TableAndFile*>(cache_->Value(*handle))->file;
file->SetForCompaction(file_size);
} // if
gPerfCounters->Inc(ePerfTableCached);
} // else
return s; return s;
} }
Iterator* TableCache::NewIterator(const ReadOptions& options, Iterator* TableCache::NewIterator(const ReadOptions& options,
uint64_t file_number, uint64_t file_number,
uint64_t file_size, uint64_t file_size,
int level,
Table** tableptr) { Table** tableptr) {
if (tableptr != NULL) { if (tableptr != NULL) {
*tableptr = NULL; *tableptr = NULL;
} }
Cache::Handle* handle = NULL; Cache::Handle* handle = NULL;
Status s = FindTable(file_number, file_size, &handle); Status s = FindTable(file_number, file_size, level, &handle, options.IsCompaction(), true);
if (!s.ok()) { if (!s.ok()) {
return NewErrorIterator(s); return NewErrorIterator(s);
} }
@ -105,11 +162,13 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
Status TableCache::Get(const ReadOptions& options, Status TableCache::Get(const ReadOptions& options,
uint64_t file_number, uint64_t file_number,
uint64_t file_size, uint64_t file_size,
int level,
const Slice& k, const Slice& k,
void* arg, void* arg,
void (*saver)(void*, const Slice&, const Slice&)) { bool (*saver)(void*, const Slice&, const Slice&)) {
Cache::Handle* handle = NULL; Cache::Handle* handle = NULL;
Status s = FindTable(file_number, file_size, &handle); Status s = FindTable(file_number, file_size, level, &handle);
if (s.ok()) { if (s.ok()) {
Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table; Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
s = t->InternalGet(options, k, arg, saver); s = t->InternalGet(options, k, arg, saver);
@ -118,10 +177,60 @@ Status TableCache::Get(const ReadOptions& options,
return s; return s;
} }
void TableCache::Evict(uint64_t file_number) { void TableCache::Evict(uint64_t file_number, bool is_overlapped) {
char buf[sizeof(file_number)]; char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number); EncodeFixed64(buf, file_number);
// overlapped files have extra reference to prevent their purge,
// release that reference now
if (is_overlapped)
{
Cache::Handle *handle;
// the Lookup call adds a reference too, back out both
handle=cache_->Lookup(Slice(buf, sizeof(buf)));
// with multiple background threads, file might already be
// evicted
if (NULL!=handle)
{
cache_->Release(handle); // release for Lookup() call just made
cache_->Release(handle); // release for extra reference
} // if
} // if
cache_->Erase(Slice(buf, sizeof(buf))); cache_->Erase(Slice(buf, sizeof(buf)));
} }
/**
 * Riak specific: fetch one statistics counter for a table file, but ONLY
 * when that file already has an entry in the file cache.  The table is
 * never opened on behalf of this call; an uncached file yields 0.
 *
 * file_number  number of the table file to look up
 * Index        counter index handed to the table's SstCounters::Value()
 */
uint64_t
TableCache::GetStatisticValue(
    uint64_t file_number,
    unsigned Index)
{
  // cache entries are keyed by the fixed64 encoding of the file number
  char buf[sizeof(file_number)];
  EncodeFixed64(buf, file_number);
  Slice key(buf, sizeof(buf));

  Cache::Handle* handle = cache_->Lookup(key);
  if (NULL == handle)
  {
    return 0;
  }

  TableAndFile* tf = reinterpret_cast<TableAndFile*>(cache_->Value(handle));
  const uint64_t ret_val = tf->table->GetSstCounters().Value(Index);

  // give back the handle obtained from Lookup() above
  cache_->Release(handle);

  return ret_val;
} // TableCache::GetStatisticValue
} // namespace leveldb } // namespace leveldb

View file

@ -13,6 +13,7 @@
#include "leveldb/cache.h" #include "leveldb/cache.h"
#include "leveldb/table.h" #include "leveldb/table.h"
#include "port/port.h" #include "port/port.h"
#include "util/cache2.h"
namespace leveldb { namespace leveldb {
@ -20,8 +21,10 @@ class Env;
class TableCache { class TableCache {
public: public:
TableCache(const std::string& dbname, const Options* options, int entries); // clean up note: file_cache is redundant to GetFileCache available from doublecache
~TableCache(); TableCache(const std::string& dbname, const Options* options, Cache * file_cache,
DoubleCache & doublecache);
virtual ~TableCache();
// Return an iterator for the specified file number (the corresponding // Return an iterator for the specified file number (the corresponding
// file length must be exactly "file_size" bytes). If "tableptr" is // file length must be exactly "file_size" bytes). If "tableptr" is
@ -33,6 +36,7 @@ class TableCache {
Iterator* NewIterator(const ReadOptions& options, Iterator* NewIterator(const ReadOptions& options,
uint64_t file_number, uint64_t file_number,
uint64_t file_size, uint64_t file_size,
int level,
Table** tableptr = NULL); Table** tableptr = NULL);
// If a seek to internal key "k" in specified file finds an entry, // If a seek to internal key "k" in specified file finds an entry,
@ -40,22 +44,65 @@ class TableCache {
Status Get(const ReadOptions& options, Status Get(const ReadOptions& options,
uint64_t file_number, uint64_t file_number,
uint64_t file_size, uint64_t file_size,
int level,
const Slice& k, const Slice& k,
void* arg, void* arg,
void (*handle_result)(void*, const Slice&, const Slice&)); bool (*handle_result)(void*, const Slice&, const Slice&));
// Evict any entry for the specified file number // Evict any entry for the specified file number
void Evict(uint64_t file_number); void Evict(uint64_t file_number, bool is_overlapped);
private: // Riak specific: return table statistic ONLY if table in cache, otherwise zero
uint64_t GetStatisticValue(uint64_t file_number, unsigned Index);
// access for testing tools, not for public access
// Test hook: forwards to FindTable(), leaving its is_compaction and
// for_iterator parameters at their declared defaults.
Status TEST_FindTable(uint64_t file_number, uint64_t file_size, int level, Cache::Handle** handle)
{
  return FindTable(file_number, file_size, level, handle);
}
// Test hook: exposes the cache_ pointer this TableCache was given.
Cache* TEST_GetInternalCache()
{
  return cache_;
}
// Hands "handle" back to the underlying cache_ via Cache::Release().
void Release(Cache::Handle * handle)
{
  cache_->Release(handle);
}
// routine called if Options::cache_object_warming is true.
// Writes list of all file names currently in file cache to disk.
Status SaveOpenFileList();
// routine called if Options::cache_object_warming is true.
// Reads file created by SaveOpenFileList() and attempts to open
// every file.
Status PreloadTableCache();
// was private, now protected to allow easy unit test overrides
protected:
Env* const env_; Env* const env_;
const std::string dbname_; const std::string dbname_;
const Options* options_; const Options* options_;
Cache* cache_; Cache * cache_;
DoubleCache & doublecache_;
Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**); // virtual to enable unit test overrides
virtual Status FindTable(uint64_t file_number, uint64_t file_size, int level,
Cache::Handle**, bool is_compaction=false,
bool for_iterator=false);
}; };
// Value object stored in the file cache for one table file: the open file,
// its Table, and the bookkeeping FindTable() / DeleteEntry() rely on.
struct TableAndFile {
  RandomAccessFile* file;        // file handle; deleted together with this object
  Table* table;                  // table object; deleted together with this object
  DoubleCache * doublecache;     // when non-NULL, receives SubFileSize() for this table's file size
  uint64_t file_number;          // saved for cache object warming
  int level;                     // saved for cache object warming
  volatile uint32_t user_count;  // starts at 1; FindTable() increments it when it re-inserts
                                 // this object, DeleteEntry() decrements it and frees at 0

  TableAndFile()
    : file(NULL),
      table(NULL),
      doublecache(NULL),
      file_number(0),
      level(0),
      user_count(1)
  {
  }
};
} // namespace leveldb } // namespace leveldb
#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_ #endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_

View file

@ -9,20 +9,6 @@
namespace leveldb { namespace leveldb {
// Tag numbers for serialized VersionEdit. These numbers are written to
// disk and should not be changed.
enum Tag {
kComparator = 1,
kLogNumber = 2,
kNextFileNumber = 3,
kLastSequence = 4,
kCompactPointer = 5,
kDeletedFile = 6,
kNewFile = 7,
// 8 was used for large value refs
kPrevLogNumber = 9
};
void VersionEdit::Clear() { void VersionEdit::Clear() {
comparator_.clear(); comparator_.clear();
log_number_ = 0; log_number_ = 0;
@ -34,11 +20,21 @@ void VersionEdit::Clear() {
has_prev_log_number_ = false; has_prev_log_number_ = false;
has_next_file_number_ = false; has_next_file_number_ = false;
has_last_sequence_ = false; has_last_sequence_ = false;
has_f1_files_ = false;
has_f2_files_ = false;
deleted_files_.clear(); deleted_files_.clear();
new_files_.clear(); new_files_.clear();
} }
void VersionEdit::EncodeTo(std::string* dst) const { /**
* EncodeTo serializes the VersionEdit object
* to the "dst" string parameter. "format2" flag
* indicates whether serialization should use original
* Google format for file objects (false) or Basho's updated
* file2 format for expiry enabled file objects (true)
*/
void VersionEdit::EncodeTo(std::string* dst, bool format2) const {
if (has_comparator_) { if (has_comparator_) {
PutVarint32(dst, kComparator); PutVarint32(dst, kComparator);
PutLengthPrefixedSlice(dst, comparator_); PutLengthPrefixedSlice(dst, comparator_);
@ -76,12 +72,21 @@ void VersionEdit::EncodeTo(std::string* dst) const {
for (size_t i = 0; i < new_files_.size(); i++) { for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second; const FileMetaData& f = new_files_[i].second;
if (format2)
PutVarint32(dst, kNewFile2);
else
PutVarint32(dst, kNewFile); PutVarint32(dst, kNewFile);
PutVarint32(dst, new_files_[i].first); // level PutVarint32(dst, new_files_[i].first); // level
PutVarint64(dst, f.number); PutVarint64(dst, f.number);
PutVarint64(dst, f.file_size); PutVarint64(dst, f.file_size);
PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.smallest.Encode());
PutLengthPrefixedSlice(dst, f.largest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode());
if (format2)
{
PutVarint64(dst, f.exp_write_low);
PutVarint64(dst, f.exp_write_high);
PutVarint64(dst, f.exp_explicit_high);
}
} }
} }
@ -98,7 +103,7 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) {
static bool GetLevel(Slice* input, int* level) { static bool GetLevel(Slice* input, int* level) {
uint32_t v; uint32_t v;
if (GetVarint32(input, &v) && if (GetVarint32(input, &v) &&
v < config::kNumLevels) { v < (unsigned)config::kNumLevels) {
*level = v; *level = v;
return true; return true;
} else { } else {
@ -185,13 +190,34 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
GetVarint64(&input, &f.number) && GetVarint64(&input, &f.number) &&
GetVarint64(&input, &f.file_size) && GetVarint64(&input, &f.file_size) &&
GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.smallest) &&
GetInternalKey(&input, &f.largest)) { GetInternalKey(&input, &f.largest))
{
has_f1_files_ = true;
f.level=level;
new_files_.push_back(std::make_pair(level, f)); new_files_.push_back(std::make_pair(level, f));
} else { } else {
msg = "new-file entry"; msg = "new-file entry";
} }
break; break;
case kNewFile2:
if (GetLevel(&input, &level) &&
GetVarint64(&input, &f.number) &&
GetVarint64(&input, &f.file_size) &&
GetInternalKey(&input, &f.smallest) &&
GetInternalKey(&input, &f.largest) &&
GetVarint64(&input, &f.exp_write_low) &&
GetVarint64(&input, &f.exp_write_high) &&
GetVarint64(&input, &f.exp_explicit_high))
{
has_f2_files_ = true;
f.level=level;
new_files_.push_back(std::make_pair(level, f));
} else {
msg = "new-file2 entry";
}
break;
default: default:
msg = "unknown tag"; msg = "unknown tag";
break; break;
@ -258,6 +284,12 @@ std::string VersionEdit::DebugString() const {
r.append(f.smallest.DebugString()); r.append(f.smallest.DebugString());
r.append(" .. "); r.append(" .. ");
r.append(f.largest.DebugString()); r.append(f.largest.DebugString());
r.append(" ");
AppendNumberTo(&r, f.exp_write_low);
r.append(" ");
AppendNumberTo(&r, f.exp_write_high);
r.append(" ");
AppendNumberTo(&r, f.exp_explicit_high);
} }
r.append("\n}\n"); r.append("\n}\n");
return r; return r;

View file

@ -16,15 +16,41 @@ class VersionSet;
struct FileMetaData { struct FileMetaData {
int refs; int refs;
int allowed_seeks; // Seeks allowed until compaction // int allowed_seeks; // Seeks allowed until compaction
uint64_t number; uint64_t number;
uint64_t file_size; // File size in bytes uint64_t file_size; // File size in bytes
uint64_t num_entries; // count of values in .sst file, only valid during table build
InternalKey smallest; // Smallest internal key served by table InternalKey smallest; // Smallest internal key served by table
InternalKey largest; // Largest internal key served by table InternalKey largest; // Largest internal key served by table
int level;
ExpiryTimeMicros exp_write_low; // oldest write time in file:
// 0 - non-expiry keys exist too
// ULLONG_MAX - no write time expiry & no plain keys
ExpiryTimeMicros exp_write_high; // most recent write time in file
ExpiryTimeMicros exp_explicit_high; // most recent/furthest into future explicit expiry
FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { } FileMetaData()
: refs(0), /*allowed_seeks(1 << 30),*/ file_size(0),
num_entries(0), level(-1), exp_write_low(0), exp_write_high(0), exp_explicit_high(0)
{ }
}; };
// Strict-weak-ordering functor over FileMetaData pointers: file1 sorts before
// file2 when the user key of file1's smallest internal key compares lower
// under the supplied Comparator.
class FileMetaDataPtrCompare
{
protected:
  const Comparator * comparator_;   // not owned by this object

public:
  explicit FileMetaDataPtrCompare(const Comparator * Comparer)
    : comparator_(Comparer)
  {
  }

  bool operator() (const FileMetaData * file1, const FileMetaData * file2) const
  {
    const int order = comparator_->Compare(file1->smallest.user_key(),
                                           file2->smallest.user_key());
    return order < 0;
  }
}; // class FileMetaDataPtrCompare
class VersionEdit { class VersionEdit {
public: public:
VersionEdit() { Clear(); } VersionEdit() { Clear(); }
@ -59,6 +85,7 @@ class VersionEdit {
// Add the specified file at the specified number. // Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
#if 0
void AddFile(int level, uint64_t file, void AddFile(int level, uint64_t file,
uint64_t file_size, uint64_t file_size,
const InternalKey& smallest, const InternalKey& smallest,
@ -68,6 +95,27 @@ class VersionEdit {
f.file_size = file_size; f.file_size = file_size;
f.smallest = smallest; f.smallest = smallest;
f.largest = largest; f.largest = largest;
f.level = level;
new_files_.push_back(std::make_pair(level, f));
}
#endif
void AddFile2(int level, uint64_t file,
uint64_t file_size,
const InternalKey& smallest,
const InternalKey& largest,
uint64_t exp_write_low,
uint64_t exp_write_high,
uint64_t exp_explicit_high) {
FileMetaData f;
f.number = file;
f.file_size = file_size;
f.smallest = smallest;
f.largest = largest;
f.level = level;
f.exp_write_low = exp_write_low;
f.exp_write_high = exp_write_high;
f.exp_explicit_high = exp_explicit_high;
new_files_.push_back(std::make_pair(level, f)); new_files_.push_back(std::make_pair(level, f));
} }
@ -75,16 +123,37 @@ class VersionEdit {
void DeleteFile(int level, uint64_t file) { void DeleteFile(int level, uint64_t file) {
deleted_files_.insert(std::make_pair(level, file)); deleted_files_.insert(std::make_pair(level, file));
} }
// Number of (level, file number) entries currently recorded in deleted_files_.
size_t DeletedFileCount() const
{
  return deleted_files_.size();
}
void EncodeTo(std::string* dst) const; void EncodeTo(std::string* dst, bool format2=true) const;
Status DecodeFrom(const Slice& src); Status DecodeFrom(const Slice& src);
// unit test access to validate file entries' format types
// True once DecodeFrom() has parsed at least one kNewFile (format 1) entry.
bool HasF1Files() const
{
  return has_f1_files_;
}
// True once DecodeFrom() has parsed at least one kNewFile2 (format 2) entry.
bool HasF2Files() const
{
  return has_f2_files_;
}
std::string DebugString() const; std::string DebugString() const;
// Tag numbers for serialized VersionEdit. These numbers are written to
// disk and should not be changed.
// Each serialized field is introduced by one of these tags, written as a
// varint32 by EncodeTo() and dispatched on by DecodeFrom().
enum Tag {
kComparator = 1,
kLogNumber = 2,
kNextFileNumber = 3,
kLastSequence = 4,
kCompactPointer = 5,
kDeletedFile = 6,
kNewFile = 7, // original file entry: level, number, size, smallest key, largest key
// 8 was used for large value refs
kPrevLogNumber = 9,
kFileCacheObject = 10, // NOTE(review): no encoder/decoder for this tag is visible here -- confirm where it is used
kNewFile2 = 11 // expiry capable file: kNewFile fields plus exp_write_low, exp_write_high, exp_explicit_high
};
private: private:
friend class VersionSet; friend class VersionSet;
typedef std::set< std::pair<int, uint64_t> > DeletedFileSet; USED_BY_NESTED_FRIEND2(typedef std::set< std::pair<int, uint64_t> > DeletedFileSet)
std::string comparator_; std::string comparator_;
uint64_t log_number_; uint64_t log_number_;
@ -96,10 +165,13 @@ class VersionEdit {
bool has_prev_log_number_; bool has_prev_log_number_;
bool has_next_file_number_; bool has_next_file_number_;
bool has_last_sequence_; bool has_last_sequence_;
// following should be mutually exclusive, but tested independently to be sure
bool has_f1_files_; // manifest uses format 1 (for unit tests)
bool has_f2_files_; // manifest uses format 2 (for unit tests)
std::vector< std::pair<int, InternalKey> > compact_pointers_; USED_BY_NESTED_FRIEND2(std::vector< std::pair<int, InternalKey> > compact_pointers_)
DeletedFileSet deleted_files_; USED_BY_NESTED_FRIEND(DeletedFileSet deleted_files_)
std::vector< std::pair<int, FileMetaData> > new_files_; USED_BY_NESTED_FRIEND2(std::vector< std::pair<int, FileMetaData> > new_files_)
}; };
} // namespace leveldb } // namespace leveldb

View file

@ -7,14 +7,22 @@
namespace leveldb { namespace leveldb {
static void TestEncodeDecode(const VersionEdit& edit) { static void TestEncodeDecode(
const VersionEdit& edit,
bool format2=false) {
std::string encoded, encoded2; std::string encoded, encoded2;
edit.EncodeTo(&encoded); edit.EncodeTo(&encoded,format2);
VersionEdit parsed; VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded); Status s = parsed.DecodeFrom(encoded);
ASSERT_TRUE(s.ok()) << s.ToString(); ASSERT_TRUE(s.ok()) << s.ToString();
parsed.EncodeTo(&encoded2); parsed.EncodeTo(&encoded2,format2);
ASSERT_EQ(encoded, encoded2); ASSERT_EQ(encoded, encoded2);
if (parsed.HasF1Files() || parsed.HasF2Files())
{
ASSERT_EQ(parsed.HasF1Files(), !format2);
ASSERT_EQ(parsed.HasF2Files(), format2);
} // if
} }
class VersionEditTest { }; class VersionEditTest { };
@ -25,11 +33,12 @@ TEST(VersionEditTest, EncodeDecode) {
VersionEdit edit; VersionEdit edit;
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
TestEncodeDecode(edit); TestEncodeDecode(edit);
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, edit.AddFile2(3, kBig + 300 + i, kBig + 400 + i,
InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("foo", 0, kBig + 500 + i, kTypeValue),
InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); InternalKey("zoo", 0, kBig + 600 + i, kTypeDeletion),
0,0,0);
edit.DeleteFile(4, kBig + 700 + i); edit.DeleteFile(4, kBig + 700 + i);
edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); edit.SetCompactPointer(i, InternalKey("x", 0, kBig + 900 + i, kTypeValue));
} }
edit.SetComparatorName("foo"); edit.SetComparatorName("foo");
@ -39,6 +48,29 @@ TEST(VersionEditTest, EncodeDecode) {
TestEncodeDecode(edit); TestEncodeDecode(edit);
} }
// Builds a VersionEdit whose file entries carry expiry metadata.  The
// partially built edit is round-tripped with format2=false on every pass,
// and the completed edit is round-tripped once more with format2=true.
TEST(VersionEditTest, EncodeDecodeExpiry) {
  static const uint64_t kBig = 1ull << 25;

  // Expiry values handed to AddFile2; identical for every file added.
  static const uint64_t kExpWriteLow = 10203040;
  static const uint64_t kExpWriteHigh = 123456789;
  static const uint64_t kExpExplicitHigh = 987654321;

  VersionEdit edit;
  for (int pass = 0; pass < 4; ++pass) {
    // Check the edit as accumulated so far before adding more to it.
    TestEncodeDecode(edit, false);

    const InternalKey smallest("foo", 700 + pass, kBig + 500 + pass,
                               kTypeValueExplicitExpiry);
    const InternalKey largest("zoo", 800 + pass, kBig + 600 + pass,
                              kTypeDeletion);
    edit.AddFile2(3, kBig + 300 + pass, kBig + 400 + pass,
                  smallest, largest,
                  kExpWriteLow, kExpWriteHigh, kExpExplicitHigh);

    edit.DeleteFile(4, kBig + 700 + pass);

    const InternalKey pointer("x", 0, kBig + 900 + pass, kTypeValue);
    edit.SetCompactPointer(pass, pointer);
  }

  edit.SetComparatorName("foo");
  edit.SetLogNumber(kBig + 100);
  edit.SetNextFile(kBig + 200);
  edit.SetLastSequence(kBig + 1000);

  TestEncodeDecode(edit, true);
}
} // namespace leveldb } // namespace leveldb
int main(int argc, char** argv) { int main(int argc, char** argv) {

File diff suppressed because it is too large Load diff

View file

@ -21,7 +21,9 @@
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/version_edit.h" #include "db/version_edit.h"
#include "port/port.h" #include "port/port.h"
#include "port/thread_annotations.h" #include "leveldb/atomics.h"
#include "leveldb/env.h"
#include "util/throttle.h"
namespace leveldb { namespace leveldb {
@ -70,7 +72,7 @@ class Version {
FileMetaData* seek_file; FileMetaData* seek_file;
int seek_file_level; int seek_file_level;
}; };
Status Get(const ReadOptions&, const LookupKey& key, std::string* val, Status Get(const ReadOptions&, const LookupKey& key, Value* val,
GetStats* stats); GetStats* stats);
// Adds "stats" into the current state. Returns true if a new // Adds "stats" into the current state. Returns true if a new
@ -78,12 +80,6 @@ class Version {
// REQUIRES: lock is held // REQUIRES: lock is held
bool UpdateStats(const GetStats& stats); bool UpdateStats(const GetStats& stats);
// Record a sample of bytes read at the specified internal key.
// Samples are taken approximately once every config::kReadBytesPeriod
// bytes. Returns true if a new compaction may need to be triggered.
// REQUIRES: lock is held
bool RecordReadSample(Slice key);
// Reference count management (so Versions do not disappear out from // Reference count management (so Versions do not disappear out from
// under live iterators) // under live iterators)
void Ref(); void Ref();
@ -101,43 +97,47 @@ class Version {
// largest_user_key==NULL represents a key largest than all keys in the DB. // largest_user_key==NULL represents a key largest than all keys in the DB.
bool OverlapInLevel(int level, bool OverlapInLevel(int level,
const Slice* smallest_user_key, const Slice* smallest_user_key,
const Slice* largest_user_key); const Slice* largest_user_key) const;
// Return the level at which we should place a new memtable compaction // Return the level at which we should place a new memtable compaction
// result that covers the range [smallest_user_key,largest_user_key]. // result that covers the range [smallest_user_key,largest_user_key].
int PickLevelForMemTableOutput(const Slice& smallest_user_key, int PickLevelForMemTableOutput(const Slice& smallest_user_key,
const Slice& largest_user_key); const Slice& largest_user_key,
const int level_limit);
int NumFiles(int level) const { return files_[level].size(); } virtual size_t NumFiles(int level) const { return files_[level].size(); }
const VersionSet * GetVersionSet() const { return vset_; }
typedef std::vector<FileMetaData*> FileMetaDataVector_t;
virtual const std::vector<FileMetaData*> & GetFileList(int level) const {return files_[level];};
volatile int WritePenalty() const {return write_penalty_; }
// Riak specific repair routine
bool VerifyLevels(int & level, InternalKey & begin, InternalKey & end);
// Return a human readable string that describes this version's contents. // Return a human readable string that describes this version's contents.
std::string DebugString() const; std::string DebugString() const;
private: protected:
friend class Compaction; friend class Compaction;
friend class VersionSet; friend class VersionSet;
class LevelFileNumIterator; class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
// Call func(arg, level, f) for every file that overlaps user_key in
// order from newest to oldest. If an invocation of func returns
// false, makes no more calls.
//
// REQUIRES: user portion of internal_key == user_key.
void ForEachOverlapping(Slice user_key, Slice internal_key,
void* arg,
bool (*func)(void*, int, FileMetaData*));
VersionSet* vset_; // VersionSet to which this Version belongs VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list Version* prev_; // Previous version in linked list
int refs_; // Number of live refs to this version int refs_; // Number of live refs to this version
// List of files per level // List of files per level
std::vector<FileMetaData*> files_[config::kNumLevels]; USED_BY_NESTED_FRIEND(std::vector<FileMetaData*> files_[config::kNumLevels];)
// Next file to compact based on seek stats. protected:
// Next file to compact based on seek stats (or Riak delete test)
FileMetaData* file_to_compact_; FileMetaData* file_to_compact_;
int file_to_compact_level_; int file_to_compact_level_;
@ -146,17 +146,29 @@ class Version {
// are initialized by Finalize(). // are initialized by Finalize().
double compaction_score_; double compaction_score_;
int compaction_level_; int compaction_level_;
bool compaction_grooming_;
bool compaction_no_move_;
bool compaction_expirefile_;
volatile int write_penalty_;
protected:
// make the ctor/dtor protected, so that a unit test can subclass
explicit Version(VersionSet* vset) explicit Version(VersionSet* vset)
: vset_(vset), next_(this), prev_(this), refs_(0), : vset_(vset), next_(this), prev_(this), refs_(0),
file_to_compact_(NULL), file_to_compact_(NULL),
file_to_compact_level_(-1), file_to_compact_level_(-1),
compaction_score_(-1), compaction_score_(-1),
compaction_level_(-1) { compaction_level_(-1),
compaction_grooming_(false),
compaction_no_move_(false),
compaction_expirefile_(false),
write_penalty_(0)
{
} }
~Version(); virtual ~Version();
private:
// No copying allowed // No copying allowed
Version(const Version&); Version(const Version&);
void operator=(const Version&); void operator=(const Version&);
@ -175,11 +187,10 @@ class VersionSet {
// current version. Will release *mu while actually writing to the file. // current version. Will release *mu while actually writing to the file.
// REQUIRES: *mu is held on entry. // REQUIRES: *mu is held on entry.
// REQUIRES: no other thread concurrently calls LogAndApply() // REQUIRES: no other thread concurrently calls LogAndApply()
Status LogAndApply(VersionEdit* edit, port::Mutex* mu) Status LogAndApply(VersionEdit* edit, port::Mutex* mu);
EXCLUSIVE_LOCKS_REQUIRED(mu);
// Recover the last saved descriptor from persistent storage. // Recover the last saved descriptor from persistent storage.
Status Recover(bool *save_manifest); Status Recover();
// Return the current version. // Return the current version.
Version* current() const { return current_; } Version* current() const { return current_; }
@ -188,19 +199,29 @@ class VersionSet {
uint64_t ManifestFileNumber() const { return manifest_file_number_; } uint64_t ManifestFileNumber() const { return manifest_file_number_; }
// Allocate and return a new file number // Allocate and return a new file number
uint64_t NewFileNumber() { return next_file_number_++; } // (-1 is to "duplicate" old post-increment logic while maintaining
// some threading integrity ... next_file_number_ used naked a bunch)
uint64_t NewFileNumber() { return(inc_and_fetch(&next_file_number_) -1); }
// Arrange to reuse "file_number" unless a newer file number has // Arrange to reuse "file_number" unless a newer file number has
// already been allocated. // already been allocated.
// REQUIRES: "file_number" was returned by a call to NewFileNumber(). // REQUIRES: "file_number" was returned by a call to NewFileNumber().
// (disabled due to threading concerns ... and desire NOT to use mutex, matthewv)
void ReuseFileNumber(uint64_t file_number) { void ReuseFileNumber(uint64_t file_number) {
if (next_file_number_ == file_number + 1) { // if (next_file_number_ == file_number + 1) {
next_file_number_ = file_number; // next_file_number_ = file_number;
} // }
} }
// Return the number of Table files at the specified level. // Return the number of Table files at the specified level.
int NumLevelFiles(int level) const; size_t NumLevelFiles(int level) const;
// is the specified level overlapped (or if false->sorted)
static bool IsLevelOverlapped(int level);
static uint64_t DesiredBytesForLevel(int level);
static uint64_t MaxBytesForLevel(int level);
static uint64_t MaxFileSizeForLevel(int level);
// Return the combined file size of all files at the specified level. // Return the combined file size of all files at the specified level.
int64_t NumLevelBytes(int level) const; int64_t NumLevelBytes(int level) const;
@ -224,11 +245,36 @@ class VersionSet {
// being compacted, or zero if there is no such log file. // being compacted, or zero if there is no such log file.
uint64_t PrevLogNumber() const { return prev_log_number_; } uint64_t PrevLogNumber() const { return prev_log_number_; }
// Computes the write throttle (microseconds, per the function name) from the
// current version's write penalty and the throttle write rate:
//   - no penalty: the rate from GetThrottleWriteRate(), except that a rate
//     of exactly 1 yields 0;
//   - penalty present: penalty * rate, where a rate of exactly 1 is first
//     replaced by GetUnadjustedThrottleWriteRate().
// The result saturates at the largest value an int can hold.  Previously the
// 64-bit product was narrowed to int implicitly (the old "(int)" cast bound
// only to the penalty operand), so a large product could come back wrapped
// or negative.
// "active_compaction" is not used by this routine.
int WriteThrottleUsec(bool active_compaction)
{
    // Largest int value (UINT_MAX >> 1), computed locally so that no
    // additional header is needed.
    const uint64_t max_usec = static_cast<unsigned int>(-1) >> 1;

    const uint64_t penalty = current_->write_penalty_;
    uint64_t throttle = GetThrottleWriteRate();
    uint64_t usec = 0;

    if (0 != penalty)
    {
        if (1 == throttle)
            throttle = GetUnadjustedThrottleWriteRate();

        // Saturate rather than let the multiplication wrap.
        if (0 != throttle && penalty > max_usec / throttle)
            usec = max_usec;
        else
            usec = penalty * throttle;
    }
    else if (1 != throttle)
    {
        usec = throttle;
    }

    // Covers the no-penalty path, where the rate alone may exceed int range.
    if (usec > max_usec)
        usec = max_usec;

    return static_cast<int>(usec);
}
// Pick level and inputs for a new compaction. // Pick level and inputs for a new compaction.
// Returns NULL if there is no compaction to be done. // Returns NULL if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that // Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result. // describes the compaction. Caller should delete the result.
Compaction* PickCompaction(); //
// Riak October 2013: Pick Compaction now posts work directly
// to hot_thread pools
void PickCompaction(class DBImpl * db_impl);
// Return a compaction object for compacting the range [begin,end] in // Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns NULL if there is nothing in that // the specified level. Returns NULL if there is nothing in that
@ -267,16 +313,42 @@ class VersionSet {
char buffer[100]; char buffer[100];
}; };
const char* LevelSummary(LevelSummaryStorage* scratch) const; const char* LevelSummary(LevelSummaryStorage* scratch) const;
const char* CompactionSummary(LevelSummaryStorage* scratch) const;
private: TableCache* GetTableCache() {return(table_cache_);};
const Options * GetOptions() const {return(options_);};
// Reports whether "level" is currently flagged as submitted.
bool IsCompactionSubmitted(int level)
{
    return m_CompactionStatus[level].m_Submitted;
}

// Flags "level" as submitted.
void SetCompactionSubmitted(int level)
{
    m_CompactionStatus[level].m_Submitted = true;
}

// Flags "level" as having a compaction running.
void SetCompactionRunning(int level)
{
    m_CompactionStatus[level].m_Running = true;
}
// Marks the compaction of "level" as finished: clears its running and
// submitted flags and records "Now" as its completion time.
void SetCompactionDone(int level, uint64_t Now)
{
    const int destination = level + 1;

    m_CompactionStatus[level].m_Running = false;
    m_CompactionStatus[level].m_Submitted = false;
    m_CompactionStatus[level].m_LastCompaction = Now;

    // The destination level is stamped with the same time.  Otherwise it
    // might immediately decide that it needs a timed grooming too, which
    // would defeat the idea of spreading the groomings out.
    if (destination < config::kNumLevels)
        m_CompactionStatus[destination].m_LastCompaction = Now;
}
bool NeighborCompactionsQuiet(int level);
protected:
class Builder; class Builder;
friend class Compaction; friend class Compaction;
friend class Version; friend class Version;
bool ReuseManifest(const std::string& dscname, const std::string& dscbase); bool Finalize(Version* v);
void UpdatePenalty(Version *v);
void Finalize(Version* v);
void GetRange(const std::vector<FileMetaData*>& inputs, void GetRange(const std::vector<FileMetaData*>& inputs,
InternalKey* smallest, InternalKey* smallest,
@ -299,7 +371,7 @@ class VersionSet {
const Options* const options_; const Options* const options_;
TableCache* const table_cache_; TableCache* const table_cache_;
const InternalKeyComparator icmp_; const InternalKeyComparator icmp_;
uint64_t next_file_number_; volatile uint64_t next_file_number_;
uint64_t manifest_file_number_; uint64_t manifest_file_number_;
uint64_t last_sequence_; uint64_t last_sequence_;
uint64_t log_number_; uint64_t log_number_;
@ -315,11 +387,44 @@ class VersionSet {
// Either an empty string, or a valid InternalKey. // Either an empty string, or a valid InternalKey.
std::string compact_pointer_[config::kNumLevels]; std::string compact_pointer_[config::kNumLevels];
// Riak allows multiple compaction threads, this mutex allows
// only one to write to manifest at a time. Only used in LogAndApply
port::Mutex manifest_mutex_;
volatile uint64_t last_penalty_minutes_;
volatile int prev_write_penalty_;
struct CompactionStatus_s
{
bool m_Submitted; //!< level submitted to hot thread pool
bool m_Running; //!< thread actually running compaction
uint64_t m_LastCompaction; //!<NowMicros() when last compaction completed
CompactionStatus_s()
: m_Submitted(false), m_Running(false), m_LastCompaction(0)
{};
} m_CompactionStatus[config::kNumLevels];
private:
// No copying allowed // No copying allowed
VersionSet(const VersionSet&); VersionSet(const VersionSet&);
void operator=(const VersionSet&); void operator=(const VersionSet&);
}; };
//
// allows routing of compaction request to
// diverse processing routines via common
// BackgroundCall2 thread entry
//
// Kind of work a Compaction object represents; exposed per compaction
// through Compaction::GetCompactionType().
// NOTE(review): kExpiryFileCompaction presumably marks the whole-file expiry
// path (compare Version::compaction_expirefile_) -- confirm with the code
// that constructs Compaction objects.
enum CompactionType
{
kNormalCompaction = 0x0,
kExpiryFileCompaction = 0x1
}; // CompactionType
// A Compaction encapsulates information about a compaction. // A Compaction encapsulates information about a compaction.
class Compaction { class Compaction {
public: public:
@ -329,6 +434,9 @@ class Compaction {
// and "level+1" will be merged to produce a set of "level+1" files. // and "level+1" will be merged to produce a set of "level+1" files.
int level() const { return level_; } int level() const { return level_; }
// Return parent Version object
const Version * version() const { return input_version_; }
// Return the object that holds the edits to the descriptor done // Return the object that holds the edits to the descriptor done
// by this compaction. // by this compaction.
VersionEdit* edit() { return &edit_; } VersionEdit* edit() { return &edit_; }
@ -356,32 +464,47 @@ class Compaction {
// Returns true iff we should stop building the current output // Returns true iff we should stop building the current output
// before processing "internal_key". // before processing "internal_key".
bool ShouldStopBefore(const Slice& internal_key); bool ShouldStopBefore(const Slice& internal_key, size_t key_count);
// Release the input version for the compaction, once the compaction // Release the input version for the compaction, once the compaction
// is successful. // is successful.
void ReleaseInputs(); void ReleaseInputs();
// Riak specific: get summary statistics from compaction inputs
void CalcInputStats(TableCache & tables);
// Read-only views of the summary statistics held by this compaction.
size_t TotalUserDataSize() const { return tot_user_data_; }
size_t TotalIndexKeys() const { return tot_index_keys_; }
size_t AverageValueSize() const { return avg_value_size_; }
size_t AverageKeySize() const { return avg_key_size_; }
size_t AverageBlockSize() const { return avg_block_size_; }
bool IsCompressible() const { return compressible_; }

// Riak specific: a move is acceptable unless no_move_ has been set.
bool IsMoveOk() const { return !no_move_; }

// Kind of compaction this object describes.
enum CompactionType GetCompactionType() const { return compaction_type_; }
private: private:
friend class Version; friend class Version;
friend class VersionSet; friend class VersionSet;
Compaction(const Options* options, int level); explicit Compaction(int level);
int level_; int level_;
uint64_t max_output_file_size_; uint64_t max_output_file_size_;
Version* input_version_; Version* input_version_;
VersionEdit edit_; VersionEdit edit_;
CompactionType compaction_type_;
// Each compaction reads inputs from "level_" and "level_+1" // Each compaction reads inputs from "level_" and "level_+1"
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
// State used to check for number of overlapping grandparent files // State used to check for number of of overlapping grandparent files
// (parent == level_ + 1, grandparent == level_ + 2) // (parent == level_ + 1, grandparent == level_ + 2)
std::vector<FileMetaData*> grandparents_; std::vector<FileMetaData*> grandparents_;
size_t grandparent_index_; // Index in grandparent_starts_ size_t grandparent_index_; // Index in grandparent_starts_
bool seen_key_; // Some output key has been seen bool seen_key_; // Some output key has been seen
int64_t overlapped_bytes_; // Bytes of overlap between current output uint64_t overlapped_bytes_; // Bytes of overlap between current output
// and grandparent files // and grandparent files
// State for implementing IsBaseLevelForKey // State for implementing IsBaseLevelForKey
@ -391,6 +514,16 @@ class Compaction {
// higher level than the ones involved in this compaction (i.e. for // higher level than the ones involved in this compaction (i.e. for
// all L >= level_ + 2). // all L >= level_ + 2).
size_t level_ptrs_[config::kNumLevels]; size_t level_ptrs_[config::kNumLevels];
// Riak specific: output statistics from CalcInputStats
size_t tot_user_data_;
size_t tot_index_keys_;
size_t avg_value_size_;
size_t avg_key_size_;
size_t avg_block_size_;
bool compressible_;
bool stats_done_;
bool no_move_;
}; };
} // namespace leveldb } // namespace leveldb

View file

@ -27,13 +27,13 @@ class FindFileTest {
SequenceNumber largest_seq = 100) { SequenceNumber largest_seq = 100) {
FileMetaData* f = new FileMetaData; FileMetaData* f = new FileMetaData;
f->number = files_.size() + 1; f->number = files_.size() + 1;
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); f->smallest = InternalKey(smallest, 0, smallest_seq, kTypeValue);
f->largest = InternalKey(largest, largest_seq, kTypeValue); f->largest = InternalKey(largest, 0, largest_seq, kTypeValue);
files_.push_back(f); files_.push_back(f);
} }
int Find(const char* key) { int Find(const char* key) {
InternalKey target(key, 100, kTypeValue); InternalKey target(key, 0, 100, kTypeValue);
InternalKeyComparator cmp(BytewiseComparator()); InternalKeyComparator cmp(BytewiseComparator());
return FindFile(cmp, files_, target.Encode()); return FindFile(cmp, files_, target.Encode());
} }

View file

@ -13,13 +13,17 @@
// len: varint32 // len: varint32
// data: uint8[len] // data: uint8[len]
#include "leveldb/write_batch.h" #include <stdint.h>
#include "leveldb/db.h" #include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/expiry.h"
#include "leveldb/write_batch.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/memtable.h" #include "db/memtable.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/throttle.h"
namespace leveldb { namespace leveldb {
@ -47,16 +51,17 @@ Status WriteBatch::Iterate(Handler* handler) const {
input.remove_prefix(kHeader); input.remove_prefix(kHeader);
Slice key, value; Slice key, value;
ExpiryTimeMicros expiry;
int found = 0; int found = 0;
while (!input.empty()) { while (!input.empty()) {
found++; found++;
char tag = input[0]; ValueType tag = (ValueType)input[0];
input.remove_prefix(1); input.remove_prefix(1);
switch (tag) { switch (tag) {
case kTypeValue: case kTypeValue:
if (GetLengthPrefixedSlice(&input, &key) && if (GetLengthPrefixedSlice(&input, &key) &&
GetLengthPrefixedSlice(&input, &value)) { GetLengthPrefixedSlice(&input, &value)) {
handler->Put(key, value); handler->Put(key, value, kTypeValue, 0);
} else { } else {
return Status::Corruption("bad WriteBatch Put"); return Status::Corruption("bad WriteBatch Put");
} }
@ -68,6 +73,16 @@ Status WriteBatch::Iterate(Handler* handler) const {
return Status::Corruption("bad WriteBatch Delete"); return Status::Corruption("bad WriteBatch Delete");
} }
break; break;
case kTypeValueWriteTime:
case kTypeValueExplicitExpiry:
if (GetLengthPrefixedSlice(&input, &key) &&
GetVarint64(&input, &expiry) &&
GetLengthPrefixedSlice(&input, &value)) {
handler->Put(key, value, tag, expiry);
} else {
return Status::Corruption("bad WriteBatch Expiry");
}
break;
default: default:
return Status::Corruption("unknown WriteBatch tag"); return Status::Corruption("unknown WriteBatch tag");
} }
@ -95,10 +110,20 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
EncodeFixed64(&b->rep_[0], seq); EncodeFixed64(&b->rep_[0], seq);
} }
void WriteBatch::Put(const Slice& key, const Slice& value) { void WriteBatch::Put(const Slice& key, const Slice& value, const KeyMetaData * meta) {
KeyMetaData local_meta;
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeValue)); if (NULL!=meta)
local_meta=*meta;
rep_.push_back(static_cast<char>(local_meta.m_Type));
PutLengthPrefixedSlice(&rep_, key); PutLengthPrefixedSlice(&rep_, key);
if (kTypeValueExplicitExpiry==local_meta.m_Type
|| kTypeValueWriteTime==local_meta.m_Type)
{
if (kTypeValueWriteTime==local_meta.m_Type && 0==local_meta.m_Expiry)
local_meta.m_Expiry=GetCachedTimeMicros();
PutVarint64(&rep_, local_meta.m_Expiry);
} // if
PutLengthPrefixedSlice(&rep_, value); PutLengthPrefixedSlice(&rep_, value);
} }
@ -113,23 +138,33 @@ class MemTableInserter : public WriteBatch::Handler {
public: public:
SequenceNumber sequence_; SequenceNumber sequence_;
MemTable* mem_; MemTable* mem_;
const Options * options_;
virtual void Put(const Slice& key, const Slice& value) { MemTableInserter() : mem_(NULL), options_(NULL) {};
mem_->Add(sequence_, kTypeValue, key, value);
virtual void Put(const Slice& key, const Slice& value, const ValueType &type, const ExpiryTimeMicros &expiry) {
ValueType type_use(type);
ExpiryTimeMicros expiry_use(expiry);
if (NULL!=options_ && options_->ExpiryActivated())
options_->expiry_module->MemTableInserterCallback(key, value, type_use, expiry_use);
mem_->Add(sequence_, (ValueType)type_use, key, value, expiry_use);
sequence_++; sequence_++;
} }
virtual void Delete(const Slice& key) { virtual void Delete(const Slice& key) {
mem_->Add(sequence_, kTypeDeletion, key, Slice()); mem_->Add(sequence_, kTypeDeletion, key, Slice(), 0);
sequence_++; sequence_++;
} }
}; };
} // namespace } // namespace
Status WriteBatchInternal::InsertInto(const WriteBatch* b, Status WriteBatchInternal::InsertInto(const WriteBatch* b,
MemTable* memtable) { MemTable* memtable,
const Options * options) {
MemTableInserter inserter; MemTableInserter inserter;
inserter.sequence_ = WriteBatchInternal::Sequence(b); inserter.sequence_ = WriteBatchInternal::Sequence(b);
inserter.mem_ = memtable; inserter.mem_ = memtable;
inserter.options_ = options;
return b->Iterate(&inserter); return b->Iterate(&inserter);
} }

View file

@ -5,7 +5,6 @@
#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ #ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ #define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
#include "db/dbformat.h"
#include "leveldb/write_batch.h" #include "leveldb/write_batch.h"
namespace leveldb { namespace leveldb {
@ -22,10 +21,10 @@ class WriteBatchInternal {
// Set the count for the number of entries in the batch. // Set the count for the number of entries in the batch.
static void SetCount(WriteBatch* batch, int n); static void SetCount(WriteBatch* batch, int n);
// Return the sequence number for the start of this batch. // Return the seqeunce number for the start of this batch.
static SequenceNumber Sequence(const WriteBatch* batch); static SequenceNumber Sequence(const WriteBatch* batch);
// Store the specified number as the sequence number for the start of // Store the specified number as the seqeunce number for the start of
// this batch. // this batch.
static void SetSequence(WriteBatch* batch, SequenceNumber seq); static void SetSequence(WriteBatch* batch, SequenceNumber seq);
@ -39,7 +38,7 @@ class WriteBatchInternal {
static void SetContents(WriteBatch* batch, const Slice& contents); static void SetContents(WriteBatch* batch, const Slice& contents);
static Status InsertInto(const WriteBatch* batch, MemTable* memtable); static Status InsertInto(const WriteBatch* batch, MemTable* memtable, const Options * options);
static void Append(WriteBatch* dst, const WriteBatch* src); static void Append(WriteBatch* dst, const WriteBatch* src);
}; };

View file

@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <sstream>
#include "leveldb/db.h" #include "leveldb/db.h"
#include "db/memtable.h" #include "db/memtable.h"
@ -17,11 +18,12 @@ static std::string PrintContents(WriteBatch* b) {
MemTable* mem = new MemTable(cmp); MemTable* mem = new MemTable(cmp);
mem->Ref(); mem->Ref();
std::string state; std::string state;
Status s = WriteBatchInternal::InsertInto(b, mem); Status s = WriteBatchInternal::InsertInto(b, mem, NULL);
int count = 0; int count = 0;
Iterator* iter = mem->NewIterator(); Iterator* iter = mem->NewIterator();
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey ikey; ParsedInternalKey ikey;
std::stringstream sstr;
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
switch (ikey.type) { switch (ikey.type) {
case kTypeValue: case kTypeValue:
@ -32,6 +34,28 @@ static std::string PrintContents(WriteBatch* b) {
state.append(")"); state.append(")");
count++; count++;
break; break;
case kTypeValueWriteTime:
state.append("PutWT(");
state.append(ikey.user_key.ToString());
state.append(", ");
sstr << ikey.expiry;
state.append(sstr.str());
state.append(", ");
state.append(iter->value().ToString());
state.append(")");
count++;
break;
case kTypeValueExplicitExpiry:
state.append("PutEE(");
state.append(ikey.user_key.ToString());
state.append(", ");
sstr << ikey.expiry;
state.append(sstr.str());
state.append(", ");
state.append(iter->value().ToString());
state.append(")");
count++;
break;
case kTypeDeletion: case kTypeDeletion:
state.append("Delete("); state.append("Delete(");
state.append(ikey.user_key.ToString()); state.append(ikey.user_key.ToString());
@ -74,6 +98,32 @@ TEST(WriteBatchTest, Multiple) {
PrintContents(&batch)); PrintContents(&batch));
} }
// Mixes plain puts, explicit-expiry puts and a delete in a single batch,
// then checks the batch's sequence, its entry count and the contents
// reported by PrintContents().
TEST(WriteBatchTest, MultipleExpiry) {
WriteBatch batch;
KeyMetaData meta;
// Put without metadata: appears as a plain "Put(...)" in the expected output.
batch.Put(Slice("Mary"), Slice("Lamb"));
// Put with explicit-expiry metadata: appears as "PutEE(key, expiry, value)".
meta.m_Type=kTypeValueExplicitExpiry;
meta.m_Expiry=2347;
batch.Put(Slice("Adam"), Slice("Ant"), &meta);
//batch.PutExplicitExpiry(Slice("Adam"), Slice("Ant"), 2347);
batch.Put(Slice("Frosty"), Slice("Snowman"));
batch.Put(Slice("Tip"), Slice("ONeal"));
batch.Delete(Slice("Frosty"));
// m_Type is unchanged from the assignment above; only the expiry differs.
meta.m_Type=kTypeValueExplicitExpiry;
meta.m_Expiry=987654321;
batch.Put(Slice("The"), Slice("Fonz"), &meta);
WriteBatchInternal::SetSequence(&batch, 200);
ASSERT_EQ(200, WriteBatchInternal::Sequence(&batch));
// Six operations were added: five puts and one delete.
ASSERT_EQ(6, WriteBatchInternal::Count(&batch));
// The "@N" suffixes start at the sequence set above (200) and follow the
// order in which the operations were added.  Entries are listed by user key;
// for the repeated key "Frosty" the later Delete (@204) is listed ahead of
// the earlier Put (@202).
ASSERT_EQ("PutEE(Adam, 2347, Ant)@201"
"Delete(Frosty)@204"
"Put(Frosty, Snowman)@202"
"Put(Mary, Lamb)@200"
"PutEE(The, 987654321, Fonz)@205"
"Put(Tip, ONeal)@203",
PrintContents(&batch));
}
TEST(WriteBatchTest, Corruption) { TEST(WriteBatchTest, Corruption) {
WriteBatch batch; WriteBatch batch;
batch.Put(Slice("foo"), Slice("bar")); batch.Put(Slice("foo"), Slice("bar"));

View file

@ -618,7 +618,7 @@ class Benchmark {
ErrorCheck(status); ErrorCheck(status);
// Execute read statement // Execute read statement
while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW) {} while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW);
StepErrorCheck(status); StepErrorCheck(status);
// Reset SQLite statement for another use // Reset SQLite statement for another use

View file

@ -338,7 +338,7 @@ class Benchmark {
bool write_sync = false; bool write_sync = false;
if (name == Slice("fillseq")) { if (name == Slice("fillseq")) {
Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1); Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
DBSynchronize(db_);
} else if (name == Slice("fillrandom")) { } else if (name == Slice("fillrandom")) {
Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1); Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1);
DBSynchronize(db_); DBSynchronize(db_);

89
src/leveldb/doc/doc.css Normal file
View file

@ -0,0 +1,89 @@
/* Stylesheet shared by the leveldb HTML documentation pages. */

/* Page frame: half-inch side margins, black text on white. */
body {
  margin-left: 0.5in;
  margin-right: 0.5in;
  background: white;
  color: black;
}

/* Headings: h1 is pulled left of the body text by a negative margin. */
h1 {
  margin-left: -0.2in;
  font-size: 14pt;
}
h2 {
  margin-left: -0in;
  font-size: 12pt;
}
h3 {
  margin-left: -0in;
}
h4 {
  margin-left: -0in;
}
hr {
  margin-left: -0in;
}

/* Definition lists: definition term bold */
dt {
  font-weight: bold;
}

address {
  text-align: center;
}

/* Inline code-like elements. */
code,samp,var {
  color: blue;
}
kbd {
  color: #600000;
}

/* Floating side note box. */
div.note p {
  float: right;
  width: 3in;
  margin-right: 0%;
  padding: 1px;
  border: 2px solid #6060a0;
  background-color: #fffff0;
}

/* Lists: no extra vertical margin. */
ul {
  margin-top: -0em;
  margin-bottom: -0em;
}
ol {
  margin-top: -0em;
  margin-bottom: -0em;
}
UL.nobullets {
  list-style-type: none;
  list-style-image: none;
  margin-left: -1em;
}

p {
  margin: 1em 0 1em 0;
  padding: 0 0 0 0;
}

/* Code samples. */
pre {
  line-height: 1.3em;
  padding: 0.4em 0 0.8em 0;
  margin: 0 0 0 0;
  /* Was "border: 0 0 0 0", which is not a valid border shorthand and is
     dropped by browsers; "border: 0" states the no-border intent validly. */
  border: 0;
  color: blue;
}

/* Centered data tables with right-aligned cells. */
.datatable {
  margin-left: auto;
  margin-right: auto;
  margin-top: 2em;
  margin-bottom: 2em;
  border: 1px solid;
}
/* Both selectors are scoped to .datatable; the previous ".datatable td,th"
   grouping also matched every th element outside a data table. */
.datatable td,
.datatable th {
  padding: 0 0.5em 0 0.5em;
  text-align: right;
}

213
src/leveldb/doc/impl.html Normal file
View file

@ -0,0 +1,213 @@
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" type="text/css" href="doc.css" />
<title>Leveldb file layout and compactions</title>
</head>
<body>
<h1>Files</h1>
The implementation of leveldb is similar in spirit to the
representation of a single
<a href="http://labs.google.com/papers/bigtable.html">
Bigtable tablet (section 5.3)</a>.
However the organization of the files that make up the representation
is somewhat different and is explained below.
<p>
Each database is represented by a set of files stored in a directory.
There are several different types of files as documented below:
<p>
<h2>Log files</h2>
<p>
A log file (*.log) stores a sequence of recent updates. Each update
is appended to the current log file. When the log file reaches a
pre-determined size (approximately 4MB by default), it is converted
to a sorted table (see below) and a new log file is created for future
updates.
<p>
A copy of the current log file is kept in an in-memory structure (the
<code>memtable</code>). This copy is consulted on every read so that read
operations reflect all logged updates.
<p>
<h2>Sorted tables</h2>
<p>
A sorted table (*.sst) stores a sequence of entries sorted by key.
Each entry is either a value for the key, or a deletion marker for the
key. (Deletion markers are kept around to hide obsolete values
present in older sorted tables).
<p>
The set of sorted tables is organized into a sequence of levels. The
sorted table generated from a log file is placed in a special <code>young</code>
level (also called level-0). When the number of young files exceeds a
certain threshold (currently four), all of the young files are merged
together with all of the overlapping level-1 files to produce a
sequence of new level-1 files (we create a new level-1 file for every
2MB of data.)
<p>
Files in the young level may contain overlapping keys. However files
in other levels have distinct non-overlapping key ranges. Consider
level number L where L >= 1. When the combined size of files in
level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2,
...), one file in level-L, and all of the overlapping files in
level-(L+1) are merged to form a set of new files for level-(L+1).
These merges have the effect of gradually migrating new updates from
the young level to the largest level using only bulk reads and writes
(i.e., minimizing expensive seeks).
<h2>Manifest</h2>
<p>
A MANIFEST file lists the set of sorted tables that make up each
level, the corresponding key ranges, and other important metadata.
A new MANIFEST file (with a new number embedded in the file name)
is created whenever the database is reopened. The MANIFEST file is
formatted as a log, and changes made to the serving state (as files
are added or removed) are appended to this log.
<p>
<h2>Current</h2>
<p>
CURRENT is a simple text file that contains the name of the latest
MANIFEST file.
<p>
<h2>Info logs</h2>
<p>
Informational messages are printed to files named LOG and LOG.old.
<p>
<h2>Others</h2>
<p>
Other files used for miscellaneous purposes may also be present
(LOCK, *.dbtmp).
<h1>Level 0</h1>
When the log file grows above a certain size (1MB by default):
<ul>
<li>Create a brand new memtable and log file and direct future updates here
<li>In the background:
<ul>
<li>Write the contents of the previous memtable to an sstable
<li>Discard the memtable
<li>Delete the old log file and the old memtable
<li>Add the new sstable to the young (level-0) level.
</ul>
</ul>
<h1>Compactions</h1>
<p>
When the size of level L exceeds its limit, we compact it in a
background thread. The compaction picks a file from level L and all
overlapping files from the next level L+1. Note that if a level-L
file overlaps only part of a level-(L+1) file, the entire file at
level-(L+1) is used as an input to the compaction and will be
discarded after the compaction. Aside: because level-0 is special
(files in it may overlap each other), we treat compactions from
level-0 to level-1 specially: a level-0 compaction may pick more than
one level-0 file in case some of these files overlap each other.
<p>
A compaction merges the contents of the picked files to produce a
sequence of level-(L+1) files. We switch to producing a new
level-(L+1) file after the current output file has reached the target
file size (2MB). We also switch to a new output file when the key
range of the current output file has grown enough to overlap more than
ten level-(L+2) files. This last rule ensures that a later compaction
of a level-(L+1) file will not pick up too much data from level-(L+2).
<p>
The old files are discarded and the new files are added to the serving
state.
<p>
Compactions for a particular level rotate through the key space. In
more detail, for each level L, we remember the ending key of the last
compaction at level L. The next compaction for level L will pick the
first file that starts after this key (wrapping around to the
beginning of the key space if there is no such file).
<p>
Compactions drop overwritten values. They also drop deletion markers
if there are no higher numbered levels that contain a file whose range
overlaps the current key.
<h2>Timing</h2>
Level-0 compactions will read up to four 1MB files from level-0, and
at worst all the level-1 files (10MB). I.e., we will read 14MB and
write 14MB.
<p>
Other than the special level-0 compactions, we will pick one 2MB file
from level L. In the worst case, this will overlap ~ 12 files from
level L+1 (10 because level-(L+1) is ten times the size of level-L,
and another two at the boundaries since the file ranges at level-L
will usually not be aligned with the file ranges at level-L+1). The
compaction will therefore read 26MB and write 26MB. Assuming a disk
IO rate of 100MB/s (ballpark range for modern drives), the worst
compaction cost will be approximately 0.5 second.
<p>
If we throttle the background writing to something small, say 10% of
the full 100MB/s speed, a compaction may take up to 5 seconds. If the
user is writing at 10MB/s, we might build up lots of level-0 files
(~50 to hold the 5*10MB). This may significantly increase the cost of
reads due to the overhead of merging more files together on every
read.
<p>
Solution 1: To reduce this problem, we might want to increase the log
switching threshold when the number of level-0 files is large. Though
the downside is that the larger this threshold, the more memory we will
need to hold the corresponding memtable.
<p>
Solution 2: We might want to decrease write rate artificially when the
number of level-0 files goes up.
<p>
Solution 3: We work on reducing the cost of very wide merges.
Perhaps most of the level-0 files will have their blocks sitting
uncompressed in the cache and we will only need to worry about the
O(N) complexity in the merging iterator.
<h2>Number of files</h2>
Instead of always making 2MB files, we could make larger files for
larger levels to reduce the total file count, though at the expense of
more bursty compactions. Alternatively, we could shard the set of
files into multiple directories.
<p>
An experiment on an <code>ext3</code> filesystem on Feb 04, 2011 shows
the following timings to do 100K file opens in directories with
varying number of files:
<table class="datatable">
<tr><th>Files in directory</th><th>Microseconds to open a file</th></tr>
<tr><td>1000</td><td>9</td>
<tr><td>10000</td><td>10</td>
<tr><td>100000</td><td>16</td>
</table>
So maybe even the sharding is not necessary on modern filesystems?
<h1>Recovery</h1>
<ul>
<li> Read CURRENT to find name of the latest committed MANIFEST
<li> Read the named MANIFEST file
<li> Clean up stale files
<li> We could open all sstables here, but it is probably better to be lazy...
<li> Convert log chunk to a new level-0 sstable
<li> Start directing new writes to a new log file with recovered sequence#
</ul>
<h1>Garbage collection of files</h1>
<code>DeleteObsoleteFiles()</code> is called at the end of every
compaction and at the end of recovery. It finds the names of all
files in the database. It deletes all log files that are not the
current log file. It deletes all table files that are not referenced
from some level and are not the output of an active compaction.
</body>
</html>

View file

@ -1,170 +0,0 @@
## Files
The implementation of leveldb is similar in spirit to the representation of a
single [Bigtable tablet (section 5.3)](http://research.google.com/archive/bigtable.html).
However the organization of the files that make up the representation is
somewhat different and is explained below.
Each database is represented by a set of files stored in a directory. There are
several different types of files as documented below:
### Log files
A log file (*.log) stores a sequence of recent updates. Each update is appended
to the current log file. When the log file reaches a pre-determined size
(approximately 4MB by default), it is converted to a sorted table (see below)
and a new log file is created for future updates.
A copy of the current log file is kept in an in-memory structure (the
`memtable`). This copy is consulted on every read so that read operations
reflect all logged updates.
## Sorted tables
A sorted table (*.ldb) stores a sequence of entries sorted by key. Each entry is
either a value for the key, or a deletion marker for the key. (Deletion markers
are kept around to hide obsolete values present in older sorted tables).
The set of sorted tables is organized into a sequence of levels. The sorted
table generated from a log file is placed in a special **young** level (also
called level-0). When the number of young files exceeds a certain threshold
(currently four), all of the young files are merged together with all of the
overlapping level-1 files to produce a sequence of new level-1 files (we create
a new level-1 file for every 2MB of data.)
Files in the young level may contain overlapping keys. However files in other
levels have distinct non-overlapping key ranges. Consider level number L where
L >= 1. When the combined size of files in level-L exceeds (10^L) MB (i.e., 10MB
for level-1, 100MB for level-2, ...), one file in level-L, and all of the
overlapping files in level-(L+1) are merged to form a set of new files for
level-(L+1). These merges have the effect of gradually migrating new updates
from the young level to the largest level using only bulk reads and writes
(i.e., minimizing expensive seeks).
### Manifest
A MANIFEST file lists the set of sorted tables that make up each level, the
corresponding key ranges, and other important metadata. A new MANIFEST file
(with a new number embedded in the file name) is created whenever the database
is reopened. The MANIFEST file is formatted as a log, and changes made to the
serving state (as files are added or removed) are appended to this log.
### Current
CURRENT is a simple text file that contains the name of the latest MANIFEST
file.
### Info logs
Informational messages are printed to files named LOG and LOG.old.
### Others
Other files used for miscellaneous purposes may also be present (LOCK, *.dbtmp).
## Level 0
When the log file grows above a certain size (1MB by default):
Create a brand new memtable and log file and direct future updates here
In the background:
Write the contents of the previous memtable to an sstable
Discard the memtable
Delete the old log file and the old memtable
Add the new sstable to the young (level-0) level.
## Compactions
When the size of level L exceeds its limit, we compact it in a background
thread. The compaction picks a file from level L and all overlapping files from
the next level L+1. Note that if a level-L file overlaps only part of a
level-(L+1) file, the entire file at level-(L+1) is used as an input to the
compaction and will be discarded after the compaction. Aside: because level-0
is special (files in it may overlap each other), we treat compactions from
level-0 to level-1 specially: a level-0 compaction may pick more than one
level-0 file in case some of these files overlap each other.
A compaction merges the contents of the picked files to produce a sequence of
level-(L+1) files. We switch to producing a new level-(L+1) file after the
current output file has reached the target file size (2MB). We also switch to a
new output file when the key range of the current output file has grown enough
to overlap more than ten level-(L+2) files. This last rule ensures that a later
compaction of a level-(L+1) file will not pick up too much data from
level-(L+2).
The old files are discarded and the new files are added to the serving state.
Compactions for a particular level rotate through the key space. In more detail,
for each level L, we remember the ending key of the last compaction at level L.
The next compaction for level L will pick the first file that starts after this
key (wrapping around to the beginning of the key space if there is no such
file).
Compactions drop overwritten values. They also drop deletion markers if there
are no higher numbered levels that contain a file whose range overlaps the
current key.
### Timing
Level-0 compactions will read up to four 1MB files from level-0, and at worst
all the level-1 files (10MB). I.e., we will read 14MB and write 14MB.
Other than the special level-0 compactions, we will pick one 2MB file from level
L. In the worst case, this will overlap ~ 12 files from level L+1 (10 because
level-(L+1) is ten times the size of level-L, and another two at the boundaries
since the file ranges at level-L will usually not be aligned with the file
ranges at level-L+1). The compaction will therefore read 26MB and write 26MB.
Assuming a disk IO rate of 100MB/s (ballpark range for modern drives), the worst
compaction cost will be approximately 0.5 second.
If we throttle the background writing to something small, say 10% of the full
100MB/s speed, a compaction may take up to 5 seconds. If the user is writing at
10MB/s, we might build up lots of level-0 files (~50 to hold the 5*10MB). This
may significantly increase the cost of reads due to the overhead of merging more
files together on every read.
Solution 1: To reduce this problem, we might want to increase the log switching
threshold when the number of level-0 files is large. Though the downside is that
the larger this threshold, the more memory we will need to hold the
corresponding memtable.
Solution 2: We might want to decrease write rate artificially when the number of
level-0 files goes up.
Solution 3: We work on reducing the cost of very wide merges. Perhaps most of
the level-0 files will have their blocks sitting uncompressed in the cache and
we will only need to worry about the O(N) complexity in the merging iterator.
### Number of files
Instead of always making 2MB files, we could make larger files for larger levels
to reduce the total file count, though at the expense of more bursty
compactions. Alternatively, we could shard the set of files into multiple
directories.
An experiment on an ext3 filesystem on Feb 04, 2011 shows the following timings
to do 100K file opens in directories with varying number of files:
| Files in directory | Microseconds to open a file |
|-------------------:|----------------------------:|
| 1000 | 9 |
| 10000 | 10 |
| 100000 | 16 |
So maybe even the sharding is not necessary on modern filesystems?
## Recovery
* Read CURRENT to find name of the latest committed MANIFEST
* Read the named MANIFEST file
* Clean up stale files
* We could open all sstables here, but it is probably better to be lazy...
* Convert log chunk to a new level-0 sstable
* Start directing new writes to a new log file with recovered sequence#
## Garbage collection of files
`DeleteObsoleteFiles()` is called at the end of every compaction and at the end
of recovery. It finds the names of all files in the database. It deletes all log
files that are not the current log file. It deletes all table files that are not
referenced from some level and are not the output of an active compaction.

549
src/leveldb/doc/index.html Normal file
View file

@ -0,0 +1,549 @@
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" type="text/css" href="doc.css" />
<title>Leveldb</title>
</head>
<body>
<h1>Leveldb</h1>
<address>Jeff Dean, Sanjay Ghemawat</address>
<p>
The <code>leveldb</code> library provides a persistent key value store. Keys and
values are arbitrary byte arrays. The keys are ordered within the key
value store according to a user-specified comparator function.
<p>
<h1>Opening A Database</h1>
<p>
A <code>leveldb</code> database has a name which corresponds to a file system
directory. All of the contents of the database are stored in this
directory. The following example shows how to open a database,
creating it if necessary:
<p>
<pre>
#include &lt;cassert&gt;
#include "leveldb/db.h"
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
assert(status.ok());
...
</pre>
If you want to raise an error if the database already exists, add
the following line before the <code>leveldb::DB::Open</code> call:
<pre>
options.error_if_exists = true;
</pre>
<h1>Status</h1>
<p>
You may have noticed the <code>leveldb::Status</code> type above. Values of this
type are returned by most functions in <code>leveldb</code> that may encounter an
error. You can check if such a result is ok, and also print an
associated error message:
<p>
<pre>
leveldb::Status s = ...;
if (!s.ok()) cerr &lt;&lt; s.ToString() &lt;&lt; endl;
</pre>
<h1>Closing A Database</h1>
<p>
When you are done with a database, just delete the database object.
Example:
<p>
<pre>
... open the db as described above ...
... do something with db ...
delete db;
</pre>
<h1>Reads And Writes</h1>
<p>
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
modify/query the database. For example, the following code
moves the value stored under key1 to key2.
<pre>
std::string value;
leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
if (s.ok()) s = db-&gt;Put(leveldb::WriteOptions(), key2, value);
if (s.ok()) s = db-&gt;Delete(leveldb::WriteOptions(), key1);
</pre>
<h1>Atomic Updates</h1>
<p>
Note that if the process dies after the Put of key2 but before the
delete of key1, the same value may be left stored under multiple keys.
Such problems can be avoided by using the <code>WriteBatch</code> class to
atomically apply a set of updates:
<p>
<pre>
#include "leveldb/write_batch.h"
...
std::string value;
leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
if (s.ok()) {
leveldb::WriteBatch batch;
batch.Delete(key1);
batch.Put(key2, value);
s = db-&gt;Write(leveldb::WriteOptions(), &amp;batch);
}
</pre>
The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
and these edits within the batch are applied in order. Note that we
called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
we do not end up erroneously dropping the value entirely.
<p>
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
speed up bulk updates by placing lots of individual mutations into the
same batch.
<h1>Synchronous Writes</h1>
By default, each write to <code>leveldb</code> is asynchronous: it
returns after pushing the write from the process into the operating
system. The transfer from operating system memory to the underlying
persistent storage happens asynchronously. The <code>sync</code> flag
can be turned on for a particular write to make the write operation
not return until the data being written has been pushed all the way to
persistent storage. (On Posix systems, this is implemented by calling
either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
<code>msync(..., MS_SYNC)</code> before the write operation returns.)
<pre>
leveldb::WriteOptions write_options;
write_options.sync = true;
db-&gt;Put(write_options, ...);
</pre>
Asynchronous writes are often more than a thousand times as fast as
synchronous writes. The downside of asynchronous writes is that a
crash of the machine may cause the last few updates to be lost. Note
that a crash of just the writing process (i.e., not a reboot) will not
cause any loss since even when <code>sync</code> is false, an update
is pushed from the process memory into the operating system before it
is considered done.
<p>
Asynchronous writes can often be used safely. For example, when
loading a large amount of data into the database you can handle lost
updates by restarting the bulk load after a crash. A hybrid scheme is
also possible where every Nth write is synchronous, and in the event
of a crash, the bulk load is restarted just after the last synchronous
write finished by the previous run. (The synchronous write can update
a marker that describes where to restart on a crash.)
<p>
<code>WriteBatch</code> provides an alternative to asynchronous writes.
Multiple updates may be placed in the same <code>WriteBatch</code> and
applied together using a synchronous write (i.e.,
<code>write_options.sync</code> is set to true). The extra cost of
the synchronous write will be amortized across all of the writes in
the batch.
<p>
<h1>Concurrency</h1>
<p>
A database may only be opened by one process at a time.
The <code>leveldb</code> implementation acquires a lock from the
operating system to prevent misuse. Within a single process, the
same <code>leveldb::DB</code> object may be safely shared by multiple
concurrent threads. I.e., different threads may write into or fetch
iterators or call <code>Get</code> on the same database without any
external synchronization (the leveldb implementation will
automatically do the required synchronization). However other objects
(like Iterator and WriteBatch) may require external synchronization.
If two threads share such an object, they must protect access to it
using their own locking protocol. More details are available in
the public header files.
<p>
<h1>Iteration</h1>
<p>
The following example demonstrates how to print all key,value pairs
in a database.
<p>
<pre>
leveldb::Iterator* it = db-&gt;NewIterator(leveldb::ReadOptions());
for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
cout &lt;&lt; it-&gt;key().ToString() &lt;&lt; ": " &lt;&lt; it-&gt;value().ToString() &lt;&lt; endl;
}
assert(it-&gt;status().ok()); // Check for any errors found during the scan
delete it;
</pre>
The following variation shows how to process just the keys in the
range <code>[start,limit)</code>:
<p>
<pre>
for (it-&gt;Seek(start);
it-&gt;Valid() &amp;&amp; it-&gt;key().ToString() &lt; limit;
it-&gt;Next()) {
...
}
</pre>
You can also process entries in reverse order. (Caveat: reverse
iteration may be somewhat slower than forward iteration.)
<p>
<pre>
for (it-&gt;SeekToLast(); it-&gt;Valid(); it-&gt;Prev()) {
...
}
</pre>
<h1>Snapshots</h1>
<p>
Snapshots provide consistent read-only views over the entire state of
the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate
that a read should operate on a particular version of the DB state.
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
implicit snapshot of the current state.
<p>
Snapshots are created by the DB::GetSnapshot() method:
<p>
<pre>
leveldb::ReadOptions options;
options.snapshot = db-&gt;GetSnapshot();
... apply some updates to db ...
leveldb::Iterator* iter = db-&gt;NewIterator(options);
... read using iter to view the state when the snapshot was created ...
delete iter;
db-&gt;ReleaseSnapshot(options.snapshot);
</pre>
Note that when a snapshot is no longer needed, it should be released
using the DB::ReleaseSnapshot interface. This allows the
implementation to get rid of state that was being maintained just to
support reading as of that snapshot.
<h1>Slice</h1>
<p>
The return value of the <code>it-&gt;key()</code> and <code>it-&gt;value()</code> calls above
are instances of the <code>leveldb::Slice</code> type. <code>Slice</code> is a simple
structure that contains a length and a pointer to an external byte
array. Returning a <code>Slice</code> is a cheaper alternative to returning a
<code>std::string</code> since we do not need to copy potentially large keys and
values. In addition, <code>leveldb</code> methods do not return null-terminated
C-style strings since <code>leveldb</code> keys and values are allowed to
contain '\0' bytes.
<p>
C++ strings and null-terminated C-style strings can be easily converted
to a Slice:
<p>
<pre>
leveldb::Slice s1 = "hello";
std::string str("world");
leveldb::Slice s2 = str;
</pre>
A Slice can be easily converted back to a C++ string:
<pre>
std::string str = s1.ToString();
assert(str == std::string("hello"));
</pre>
Be careful when using Slices since it is up to the caller to ensure that
the external byte array into which the Slice points remains live while
the Slice is in use. For example, the following is buggy:
<p>
<pre>
leveldb::Slice slice;
if (...) {
std::string str = ...;
slice = str;
}
Use(slice);
</pre>
When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
backing storage for <code>slice</code> will disappear.
<p>
<h1>Comparators</h1>
<p>
The preceding examples used the default ordering function for key,
which orders bytes lexicographically. You can however supply a custom
comparator when opening a database. For example, suppose each
database key consists of two numbers and we should sort by the first
number, breaking ties by the second number. First, define a proper
subclass of <code>leveldb::Comparator</code> that expresses these rules:
<p>
<pre>
class TwoPartComparator : public leveldb::Comparator {
public:
// Three-way comparison function:
// if a &lt; b: negative result
// if a &gt; b: positive result
// else: zero result
int Compare(const leveldb::Slice&amp; a, const leveldb::Slice&amp; b) const {
int a1, a2, b1, b2;
ParseKey(a, &amp;a1, &amp;a2);
ParseKey(b, &amp;b1, &amp;b2);
if (a1 &lt; b1) return -1;
if (a1 &gt; b1) return +1;
if (a2 &lt; b2) return -1;
if (a2 &gt; b2) return +1;
return 0;
}
// Ignore the following methods for now:
const char* Name() const { return "TwoPartComparator"; }
void FindShortestSeparator(std::string*, const leveldb::Slice&amp;) const { }
void FindShortSuccessor(std::string*) const { }
};
</pre>
Now create a database using this custom comparator:
<p>
<pre>
TwoPartComparator cmp;
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
options.comparator = &amp;cmp;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
...
</pre>
<h2>Backwards compatibility</h2>
<p>
The result of the comparator's <code>Name</code> method is attached to the
database when it is created, and is checked on every subsequent
database open. If the name changes, the <code>leveldb::DB::Open</code> call will
fail. Therefore, change the name if and only if the new key format
and comparison function are incompatible with existing databases, and
it is ok to discard the contents of all existing databases.
<p>
You can however still gradually evolve your key format over time with
a little bit of pre-planning. For example, you could store a version
number at the end of each key (one byte should suffice for most uses).
When you wish to switch to a new key format (e.g., adding an optional
third part to the keys processed by <code>TwoPartComparator</code>),
(a) keep the same comparator name (b) increment the version number
for new keys (c) change the comparator function so it uses the
version numbers found in the keys to decide how to interpret them.
<p>
<h1>Performance</h1>
<p>
Performance can be tuned by changing the default values of the
types defined in <code>include/leveldb/options.h</code>.
<p>
<h2>Block size</h2>
<p>
<code>leveldb</code> groups adjacent keys together into the same block and such a
block is the unit of transfer to and from persistent storage. The
default block size is approximately 4096 uncompressed bytes.
Applications that mostly do bulk scans over the contents of the
database may wish to increase this size. Applications that do a lot
of point reads of small values may wish to switch to a smaller block
size if performance measurements indicate an improvement. There isn't
much benefit in using blocks smaller than one kilobyte, or larger than
a few megabytes. Also note that compression will be more effective
with larger block sizes.
<p>
<h2>Compression</h2>
<p>
Each block is individually compressed before being written to
persistent storage. Compression is on by default since the default
compression method is very fast, and is automatically disabled for
uncompressible data. In rare cases, applications may want to disable
compression entirely, but should only do so if benchmarks show a
performance improvement:
<p>
<pre>
leveldb::Options options;
options.compression = leveldb::kNoCompression;
... leveldb::DB::Open(options, name, ...) ....
</pre>
<h2>Cache</h2>
<p>
The contents of the database are stored in a set of files in the
filesystem and each file stores a sequence of compressed blocks. If
<code>options.cache</code> is non-NULL, it is used to cache frequently used
uncompressed block contents.
<p>
<pre>
#include "leveldb/cache.h"
leveldb::Options options;
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache
leveldb::DB* db;
leveldb::DB::Open(options, name, &amp;db);
... use the db ...
delete db;
delete options.cache;
</pre>
Note that the cache holds uncompressed data, and therefore it should
be sized according to application level data sizes, without any
reduction from compression. (Caching of compressed blocks is left to
the operating system buffer cache, or any custom <code>Env</code>
implementation provided by the client.)
<p>
When performing a bulk read, the application may wish to disable
caching so that the data processed by the bulk read does not end up
displacing most of the cached contents. A per-iterator option can be
used to achieve this:
<p>
<pre>
leveldb::ReadOptions options;
options.fill_cache = false;
leveldb::Iterator* it = db-&gt;NewIterator(options);
for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
...
}
</pre>
<h2>Key Layout</h2>
<p>
Note that the unit of disk transfer and caching is a block. Adjacent
keys (according to the database sort order) will usually be placed in
the same block. Therefore the application can improve its performance
by placing keys that are accessed together near each other and placing
infrequently used keys in a separate region of the key space.
<p>
For example, suppose we are implementing a simple file system on top
of <code>leveldb</code>. The types of entries we might wish to store are:
<p>
<pre>
filename -&gt; permission-bits, length, list of file_block_ids
file_block_id -&gt; data
</pre>
We might want to prefix <code>filename</code> keys with one letter (say '/') and the
<code>file_block_id</code> keys with a different letter (say '0') so that scans
over just the metadata do not force us to fetch and cache bulky file
contents.
<p>
<h2>Filters</h2>
<p>
Because of the way <code>leveldb</code> data is organized on disk,
a single <code>Get()</code> call may involve multiple reads from disk.
The optional <code>FilterPolicy</code> mechanism can be used to reduce
the number of disk reads substantially.
<pre>
leveldb::Options options;
options.filter_policy = NewBloomFilter(10);
leveldb::DB* db;
leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
... use the database ...
delete db;
delete options.filter_policy;
</pre>
The preceding code associates a
<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
based filtering policy with the database. Bloom filter based
filtering relies on keeping some number of bits of data in memory per
key (in this case 10 bits per key since that is the argument we passed
to NewBloomFilter). This filter will reduce the number of unnecessary
disk reads needed for <code>Get()</code> calls by a factor of
approximately 100. Increasing the bits per key will lead to a
larger reduction at the cost of more memory usage. We recommend that
applications whose working set does not fit in memory and that do a
lot of random reads set a filter policy.
<p>
If you are using a custom comparator, you should ensure that the filter
policy you are using is compatible with your comparator. For example,
consider a comparator that ignores trailing spaces when comparing keys.
<code>NewBloomFilter</code> must not be used with such a comparator.
Instead, the application should provide a custom filter policy that
also ignores trailing spaces. For example:
<pre>
class CustomFilterPolicy : public leveldb::FilterPolicy {
private:
FilterPolicy* builtin_policy_;
public:
CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
~CustomFilterPolicy() { delete builtin_policy_; }
const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
// Use builtin bloom filter code after removing trailing spaces
std::vector&lt;Slice&gt; trimmed(n);
for (int i = 0; i &lt; n; i++) {
trimmed[i] = RemoveTrailingSpaces(keys[i]);
}
return builtin_policy_-&gt;CreateFilter(&amp;trimmed[0], n, dst);
}
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
// Use builtin bloom filter code after removing trailing spaces
return builtin_policy_-&gt;KeyMayMatch(RemoveTrailingSpaces(key), filter);
}
};
</pre>
<p>
Advanced applications may provide a filter policy that does not use
a bloom filter but uses some other mechanism for summarizing a set
of keys. See <code>leveldb/filter_policy.h</code> for detail.
<p>
<h1>Checksums</h1>
<p>
<code>leveldb</code> associates checksums with all data it stores in the file system.
There are two separate controls provided over how aggressively these
checksums are verified:
<p>
<ul>
<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
checksum verification of all data that is read from the file system on
behalf of a particular read. By default, no such verification is
done.
<p>
<li> <code>Options::paranoid_checks</code> may be set to true before opening a
database to make the database implementation raise an error as soon as
it detects an internal corruption. Depending on which portion of the
database has been corrupted, the error may be raised when the database
is opened, or later by another database operation. By default,
paranoid checking is off so that the database can be used even if
parts of its persistent storage have been corrupted.
<p>
If a database is corrupted (perhaps it cannot be opened when
paranoid checking is turned on), the <code>leveldb::RepairDB</code> function
may be used to recover as much of the data as possible.
<p>
</ul>
<h1>Approximate Sizes</h1>
<p>
The <code>GetApproximateSizes</code> method can be used to get the approximate
number of bytes of file system space used by one or more key ranges.
<p>
<pre>
leveldb::Range ranges[2];
ranges[0] = leveldb::Range("a", "c");
ranges[1] = leveldb::Range("x", "z");
uint64_t sizes[2];
leveldb::Status s = db-&gt;GetApproximateSizes(ranges, 2, sizes);
</pre>
The preceding call will set <code>sizes[0]</code> to the approximate number of
bytes of file system space used by the key range <code>[a..c)</code> and
<code>sizes[1]</code> to the approximate number of bytes used by the key range
<code>[x..z)</code>.
<p>
<h1>Environment</h1>
<p>
All file operations (and other operating system calls) issued by the
<code>leveldb</code> implementation are routed through a <code>leveldb::Env</code> object.
Sophisticated clients may wish to provide their own <code>Env</code>
implementation to get better control. For example, an application may
introduce artificial delays in the file IO paths to limit the impact
of <code>leveldb</code> on other activities in the system.
<p>
<pre>
class SlowEnv : public leveldb::Env {
... implementation of the Env interface ...
};
SlowEnv env;
leveldb::Options options;
options.env = &amp;env;
Status s = leveldb::DB::Open(options, ...);
</pre>
<h1>Porting</h1>
<p>
<code>leveldb</code> may be ported to a new platform by providing platform
specific implementations of the types/methods/functions exported by
<code>leveldb/port/port.h</code>. See <code>leveldb/port/port_example.h</code> for more
details.
<p>
In addition, the new platform may need a new default <code>leveldb::Env</code>
implementation. See <code>leveldb/util/env_posix.h</code> for an example.
<h1>Other Information</h1>
<p>
Details about the <code>leveldb</code> implementation may be found in
the following documents:
<ul>
<li> <a href="impl.html">Implementation notes</a>
<li> <a href="table_format.txt">Format of an immutable Table file</a>
<li> <a href="log_format.txt">Format of a log file</a>
</ul>
</body>
</html>

View file

@ -1,523 +0,0 @@
leveldb
=======
_Jeff Dean, Sanjay Ghemawat_
The leveldb library provides a persistent key value store. Keys and values are
arbitrary byte arrays. The keys are ordered within the key value store
according to a user-specified comparator function.
## Opening A Database
A leveldb database has a name which corresponds to a file system directory. All
of the contents of the database are stored in this directory. The following example
shows how to open a database, creating it if necessary:
```c++
#include <cassert>
#include "leveldb/db.h"
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
assert(status.ok());
...
```
If you want to raise an error if the database already exists, add the following
line before the `leveldb::DB::Open` call:
```c++
options.error_if_exists = true;
```
## Status
You may have noticed the `leveldb::Status` type above. Values of this type are
returned by most functions in leveldb that may encounter an error. You can check
if such a result is ok, and also print an associated error message:
```c++
leveldb::Status s = ...;
if (!s.ok()) cerr << s.ToString() << endl;
```
## Closing A Database
When you are done with a database, just delete the database object. Example:
```c++
... open the db as described above ...
... do something with db ...
delete db;
```
## Reads And Writes
The database provides Put, Delete, and Get methods to modify/query the database.
For example, the following code moves the value stored under key1 to key2.
```c++
std::string value;
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
```
## Atomic Updates
Note that if the process dies after the Put of key2 but before the delete of
key1, the same value may be left stored under multiple keys. Such problems can
be avoided by using the `WriteBatch` class to atomically apply a set of updates:
```c++
#include "leveldb/write_batch.h"
...
std::string value;
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
if (s.ok()) {
leveldb::WriteBatch batch;
batch.Delete(key1);
batch.Put(key2, value);
s = db->Write(leveldb::WriteOptions(), &batch);
}
```
The `WriteBatch` holds a sequence of edits to be made to the database, and these
edits within the batch are applied in order. Note that we called Delete before
Put so that if key1 is identical to key2, we do not end up erroneously dropping
the value entirely.
Apart from its atomicity benefits, `WriteBatch` may also be used to speed up
bulk updates by placing lots of individual mutations into the same batch.
## Synchronous Writes
By default, each write to leveldb is asynchronous: it returns after pushing the
write from the process into the operating system. The transfer from operating
system memory to the underlying persistent storage happens asynchronously. The
sync flag can be turned on for a particular write to make the write operation
not return until the data being written has been pushed all the way to
persistent storage. (On Posix systems, this is implemented by calling either
`fsync(...)` or `fdatasync(...)` or `msync(..., MS_SYNC)` before the write
operation returns.)
```c++
leveldb::WriteOptions write_options;
write_options.sync = true;
db->Put(write_options, ...);
```
Asynchronous writes are often more than a thousand times as fast as synchronous
writes. The downside of asynchronous writes is that a crash of the machine may
cause the last few updates to be lost. Note that a crash of just the writing
process (i.e., not a reboot) will not cause any loss since even when sync is
false, an update is pushed from the process memory into the operating system
before it is considered done.
Asynchronous writes can often be used safely. For example, when loading a large
amount of data into the database you can handle lost updates by restarting the
bulk load after a crash. A hybrid scheme is also possible where every Nth write
is synchronous, and in the event of a crash, the bulk load is restarted just
after the last synchronous write finished by the previous run. (The synchronous
write can update a marker that describes where to restart on a crash.)
`WriteBatch` provides an alternative to asynchronous writes. Multiple updates
may be placed in the same WriteBatch and applied together using a synchronous
write (i.e., `write_options.sync` is set to true). The extra cost of the
synchronous write will be amortized across all of the writes in the batch.
## Concurrency
A database may only be opened by one process at a time. The leveldb
implementation acquires a lock from the operating system to prevent misuse.
Within a single process, the same `leveldb::DB` object may be safely shared by
multiple concurrent threads. I.e., different threads may write into or fetch
iterators or call Get on the same database without any external synchronization
(the leveldb implementation will automatically do the required synchronization).
However other objects (like Iterator and `WriteBatch`) may require external
synchronization. If two threads share such an object, they must protect access
to it using their own locking protocol. More details are available in the public
header files.
## Iteration
The following example demonstrates how to print all key,value pairs in a
database.
```c++
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
for (it->SeekToFirst(); it->Valid(); it->Next()) {
cout << it->key().ToString() << ": " << it->value().ToString() << endl;
}
assert(it->status().ok()); // Check for any errors found during the scan
delete it;
```
The following variation shows how to process just the keys in the range
[start,limit):
```c++
for (it->Seek(start);
it->Valid() && it->key().ToString() < limit;
it->Next()) {
...
}
```
You can also process entries in reverse order. (Caveat: reverse iteration may be
somewhat slower than forward iteration.)
```c++
for (it->SeekToLast(); it->Valid(); it->Prev()) {
...
}
```
## Snapshots
Snapshots provide consistent read-only views over the entire state of the
key-value store. `ReadOptions::snapshot` may be non-NULL to indicate that a
read should operate on a particular version of the DB state. If
`ReadOptions::snapshot` is NULL, the read will operate on an implicit snapshot
of the current state.
Snapshots are created by the `DB::GetSnapshot()` method:
```c++
leveldb::ReadOptions options;
options.snapshot = db->GetSnapshot();
... apply some updates to db ...
leveldb::Iterator* iter = db->NewIterator(options);
... read using iter to view the state when the snapshot was created ...
delete iter;
db->ReleaseSnapshot(options.snapshot);
```
Note that when a snapshot is no longer needed, it should be released using the
`DB::ReleaseSnapshot` interface. This allows the implementation to get rid of
state that was being maintained just to support reading as of that snapshot.
## Slice
The return values of the `it->key()` and `it->value()` calls above are instances
of the `leveldb::Slice` type. Slice is a simple structure that contains a length
and a pointer to an external byte array. Returning a Slice is a cheaper
alternative to returning a `std::string` since we do not need to copy
potentially large keys and values. In addition, leveldb methods do not return
null-terminated C-style strings since leveldb keys and values are allowed to
contain `'\0'` bytes.
C++ strings and null-terminated C-style strings can be easily converted to a
Slice:
```c++
leveldb::Slice s1 = "hello";
std::string str("world");
leveldb::Slice s2 = str;
```
A Slice can be easily converted back to a C++ string:
```c++
std::string str = s1.ToString();
assert(str == std::string("hello"));
```
Be careful when using Slices since it is up to the caller to ensure that the
external byte array into which the Slice points remains live while the Slice is
in use. For example, the following is buggy:
```c++
leveldb::Slice slice;
if (...) {
std::string str = ...;
slice = str;
}
Use(slice);
```
When the if statement goes out of scope, str will be destroyed and the backing
storage for slice will disappear.
## Comparators
The preceding examples used the default ordering function for key, which orders
bytes lexicographically. You can however supply a custom comparator when opening
a database. For example, suppose each database key consists of two numbers and
we should sort by the first number, breaking ties by the second number. First,
define a proper subclass of `leveldb::Comparator` that expresses these rules:
```c++
class TwoPartComparator : public leveldb::Comparator {
public:
// Three-way comparison function:
// if a < b: negative result
// if a > b: positive result
// else: zero result
int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
int a1, a2, b1, b2;
ParseKey(a, &a1, &a2);
ParseKey(b, &b1, &b2);
if (a1 < b1) return -1;
if (a1 > b1) return +1;
if (a2 < b2) return -1;
if (a2 > b2) return +1;
return 0;
}
// Ignore the following methods for now:
const char* Name() const { return "TwoPartComparator"; }
void FindShortestSeparator(std::string*, const leveldb::Slice&) const {}
void FindShortSuccessor(std::string*) const {}
};
```
Now create a database using this custom comparator:
```c++
TwoPartComparator cmp;
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
options.comparator = &cmp;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
...
```
### Backwards compatibility
The result of the comparator's Name method is attached to the database when it
is created, and is checked on every subsequent database open. If the name
changes, the `leveldb::DB::Open` call will fail. Therefore, change the name if
and only if the new key format and comparison function are incompatible with
existing databases, and it is ok to discard the contents of all existing
databases.
You can however still gradually evolve your key format over time with a little
bit of pre-planning. For example, you could store a version number at the end of
each key (one byte should suffice for most uses). When you wish to switch to a
new key format (e.g., adding an optional third part to the keys processed by
`TwoPartComparator`), (a) keep the same comparator name (b) increment the
version number for new keys (c) change the comparator function so it uses the
version numbers found in the keys to decide how to interpret them.
## Performance
Performance can be tuned by changing the default values of the types defined in
`include/leveldb/options.h`.
### Block size
leveldb groups adjacent keys together into the same block and such a block is
the unit of transfer to and from persistent storage. The default block size is
approximately 4096 uncompressed bytes. Applications that mostly do bulk scans
over the contents of the database may wish to increase this size. Applications
that do a lot of point reads of small values may wish to switch to a smaller
block size if performance measurements indicate an improvement. There isn't much
benefit in using blocks smaller than one kilobyte, or larger than a few
megabytes. Also note that compression will be more effective with larger block
sizes.
### Compression
Each block is individually compressed before being written to persistent
storage. Compression is on by default since the default compression method is
very fast, and is automatically disabled for uncompressible data. In rare cases,
applications may want to disable compression entirely, but should only do so if
benchmarks show a performance improvement:
```c++
leveldb::Options options;
options.compression = leveldb::kNoCompression;
... leveldb::DB::Open(options, name, ...) ....
```
### Cache
The contents of the database are stored in a set of files in the filesystem and
each file stores a sequence of compressed blocks. If options.cache is non-NULL,
it is used to cache frequently used uncompressed block contents.
```c++
#include "leveldb/cache.h"
leveldb::Options options;
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache
leveldb::DB* db;
leveldb::DB::Open(options, name, &db);
... use the db ...
delete db;
delete options.cache;
```
Note that the cache holds uncompressed data, and therefore it should be sized
according to application level data sizes, without any reduction from
compression. (Caching of compressed blocks is left to the operating system
buffer cache, or any custom Env implementation provided by the client.)
When performing a bulk read, the application may wish to disable caching so that
the data processed by the bulk read does not end up displacing most of the
cached contents. A per-iterator option can be used to achieve this:
```c++
leveldb::ReadOptions options;
options.fill_cache = false;
leveldb::Iterator* it = db->NewIterator(options);
for (it->SeekToFirst(); it->Valid(); it->Next()) {
...
}
```
### Key Layout
Note that the unit of disk transfer and caching is a block. Adjacent keys
(according to the database sort order) will usually be placed in the same block.
Therefore the application can improve its performance by placing keys that are
accessed together near each other and placing infrequently used keys in a
separate region of the key space.
For example, suppose we are implementing a simple file system on top of leveldb.
The types of entries we might wish to store are:
filename -> permission-bits, length, list of file_block_ids
file_block_id -> data
We might want to prefix filename keys with one letter (say '/') and the
`file_block_id` keys with a different letter (say '0') so that scans over just
the metadata do not force us to fetch and cache bulky file contents.
### Filters
Because of the way leveldb data is organized on disk, a single `Get()` call may
involve multiple reads from disk. The optional FilterPolicy mechanism can be
used to reduce the number of disk reads substantially.
```c++
leveldb::Options options;
options.filter_policy = NewBloomFilterPolicy(10);
leveldb::DB* db;
leveldb::DB::Open(options, "/tmp/testdb", &db);
... use the database ...
delete db;
delete options.filter_policy;
```
The preceding code associates a Bloom filter based filtering policy with the
database. Bloom filter based filtering relies on keeping some number of bits of
data in memory per key (in this case 10 bits per key since that is the argument
we passed to `NewBloomFilterPolicy`). This filter will reduce the number of
unnecessary disk reads needed for Get() calls by a factor of approximately
100. Increasing the bits per key will lead to a larger reduction at the cost
of more memory usage. We recommend that applications whose working set does not
fit in memory and that do a lot of random reads set a filter policy.
If you are using a custom comparator, you should ensure that the filter policy
you are using is compatible with your comparator. For example, consider a
comparator that ignores trailing spaces when comparing keys.
`NewBloomFilterPolicy` must not be used with such a comparator. Instead, the
application should provide a custom filter policy that also ignores trailing
spaces. For example:
```c++
class CustomFilterPolicy : public leveldb::FilterPolicy {
private:
FilterPolicy* builtin_policy_;
public:
CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) {}
~CustomFilterPolicy() { delete builtin_policy_; }
const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
// Use builtin bloom filter code after removing trailing spaces
std::vector<Slice> trimmed(n);
for (int i = 0; i < n; i++) {
trimmed[i] = RemoveTrailingSpaces(keys[i]);
}
return builtin_policy_->CreateFilter(&trimmed[0], n, dst);
}
};
```
Advanced applications may provide a filter policy that does not use a bloom
filter but uses some other mechanism for summarizing a set of keys. See
`leveldb/filter_policy.h` for detail.
## Checksums
leveldb associates checksums with all data it stores in the file system. There
are two separate controls provided over how aggressively these checksums are
verified:
`ReadOptions::verify_checksums` may be set to true to force checksum
verification of all data that is read from the file system on behalf of a
particular read. By default, no such verification is done.
`Options::paranoid_checks` may be set to true before opening a database to make
the database implementation raise an error as soon as it detects an internal
corruption. Depending on which portion of the database has been corrupted, the
error may be raised when the database is opened, or later by another database
operation. By default, paranoid checking is off so that the database can be used
even if parts of its persistent storage have been corrupted.
If a database is corrupted (perhaps it cannot be opened when paranoid checking
is turned on), the `leveldb::RepairDB` function may be used to recover as much
of the data as possible.
## Approximate Sizes
The `GetApproximateSizes` method can be used to get the approximate number of bytes
of file system space used by one or more key ranges.
```c++
leveldb::Range ranges[2];
ranges[0] = leveldb::Range("a", "c");
ranges[1] = leveldb::Range("x", "z");
uint64_t sizes[2];
leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
```
The preceding call will set `sizes[0]` to the approximate number of bytes of
file system space used by the key range `[a..c)` and `sizes[1]` to the
approximate number of bytes used by the key range `[x..z)`.
## Environment
All file operations (and other operating system calls) issued by the leveldb
implementation are routed through a `leveldb::Env` object. Sophisticated clients
may wish to provide their own Env implementation to get better control.
For example, an application may introduce artificial delays in the file IO
paths to limit the impact of leveldb on other activities in the system.
```c++
class SlowEnv : public leveldb::Env {
... implementation of the Env interface ...
};
SlowEnv env;
leveldb::Options options;
options.env = &env;
Status s = leveldb::DB::Open(options, ...);
```
## Porting
leveldb may be ported to a new platform by providing platform specific
implementations of the types/methods/functions exported by
`leveldb/port/port.h`. See `leveldb/port/port_example.h` for more details.
In addition, the new platform may need a new default `leveldb::Env`
implementation. See `leveldb/util/env_posix.h` for an example.
## Other Information
Details about the leveldb implementation may be found in the following
documents:
1. [Implementation notes](impl.md)
2. [Format of an immutable Table file](table_format.md)
3. [Format of a log file](log_format.md)

View file

@ -1,75 +0,0 @@
leveldb Log format
==================
The log file contents are a sequence of 32KB blocks. The only exception is that
the tail of the file may contain a partial block.
Each block consists of a sequence of records:
block := record* trailer?
record :=
checksum: uint32 // crc32c of type and data[] ; little-endian
length: uint16 // little-endian
type: uint8 // One of FULL, FIRST, MIDDLE, LAST
data: uint8[length]
A record never starts within the last six bytes of a block (since it won't fit).
Any leftover bytes here form the trailer, which must consist entirely of zero
bytes and must be skipped by readers.
Aside: if exactly seven bytes are left in the current block, and a new non-zero
length record is added, the writer must emit a FIRST record (which contains zero
bytes of user data) to fill up the trailing seven bytes of the block and then
emit all of the user data in subsequent blocks.
More types may be added in the future. Some Readers may skip record types they
do not understand, others may report that some data was skipped.
FULL == 1
FIRST == 2
MIDDLE == 3
LAST == 4
The FULL record contains the contents of an entire user record.
FIRST, MIDDLE, LAST are types used for user records that have been split into
multiple fragments (typically because of block boundaries). FIRST is the type
of the first fragment of a user record, LAST is the type of the last fragment of
a user record, and MIDDLE is the type of all interior fragments of a user
record.
Example: consider a sequence of user records:
A: length 1000
B: length 97270
C: length 8000
**A** will be stored as a FULL record in the first block.
**B** will be split into three fragments: first fragment occupies the rest of
the first block, second fragment occupies the entirety of the second block, and
the third fragment occupies a prefix of the third block. This will leave six
bytes free in the third block, which will be left empty as the trailer.
**C** will be stored as a FULL record in the fourth block.
----
## Some benefits over the recordio format:
1. We do not need any heuristics for resyncing - just go to next block boundary
and scan. If there is a corruption, skip to the next block. As a
side-benefit, we do not get confused when part of the contents of one log
file are embedded as a record inside another log file.
2. Splitting at approximate boundaries (e.g., for mapreduce) is simple: find the
next block boundary and skip records until we hit a FULL or FIRST record.
3. We do not need extra buffering for large records.
## Some downsides compared to recordio format:
1. No packing of tiny records. This could be fixed by adding a new record type,
so it is a shortcoming of the current implementation, not necessarily the
format.
2. No compression. Again, this could be fixed by adding new record types.

View file

@ -0,0 +1,75 @@
The log file contents are a sequence of 32KB blocks. The only
exception is that the tail of the file may contain a partial block.
Each block consists of a sequence of records:
block := record* trailer?
record :=
checksum: uint32 // crc32c of type and data[]
length: uint16
type: uint8 // One of FULL, FIRST, MIDDLE, LAST
data: uint8[length]
A record never starts within the last six bytes of a block (since it
won't fit). Any leftover bytes here form the trailer, which must
consist entirely of zero bytes and must be skipped by readers.
Aside: if exactly seven bytes are left in the current block, and a new
non-zero length record is added, the writer must emit a FIRST record
(which contains zero bytes of user data) to fill up the trailing seven
bytes of the block and then emit all of the user data in subsequent
blocks.
More types may be added in the future. Some Readers may skip record
types they do not understand, others may report that some data was
skipped.
FULL == 1
FIRST == 2
MIDDLE == 3
LAST == 4
The FULL record contains the contents of an entire user record.
FIRST, MIDDLE, LAST are types used for user records that have been
split into multiple fragments (typically because of block boundaries).
FIRST is the type of the first fragment of a user record, LAST is the
type of the last fragment of a user record, and MIDDLE is the type of all
interior fragments of a user record.
Example: consider a sequence of user records:
A: length 1000
B: length 97270
C: length 8000
A will be stored as a FULL record in the first block.
B will be split into three fragments: first fragment occupies the rest
of the first block, second fragment occupies the entirety of the
second block, and the third fragment occupies a prefix of the third
block. This will leave six bytes free in the third block, which will
be left empty as the trailer.
C will be stored as a FULL record in the fourth block.
===================
Some benefits over the recordio format:
(1) We do not need any heuristics for resyncing - just go to next
block boundary and scan. If there is a corruption, skip to the next
block. As a side-benefit, we do not get confused when part of the
contents of one log file are embedded as a record inside another log
file.
(2) Splitting at approximate boundaries (e.g., for mapreduce) is
simple: find the next block boundary and skip records until we
hit a FULL or FIRST record.
(3) We do not need extra buffering for large records.
Some downsides compared to recordio format:
(1) No packing of tiny records. This could be fixed by adding a new
record type, so it is a shortcoming of the current implementation,
not necessarily the format.
(2) No compression. Again, this could be fixed by adding new record types.

View file

@ -1,107 +0,0 @@
leveldb File format
===================
<beginning_of_file>
[data block 1]
[data block 2]
...
[data block N]
[meta block 1]
...
[meta block K]
[metaindex block]
[index block]
[Footer] (fixed size; starts at file_size - sizeof(Footer))
<end_of_file>
The file contains internal pointers. Each such pointer is called
a BlockHandle and contains the following information:
offset: varint64
size: varint64
See [varints](https://developers.google.com/protocol-buffers/docs/encoding#varints)
for an explanation of varint64 format.
1. The sequence of key/value pairs in the file are stored in sorted
order and partitioned into a sequence of data blocks. These blocks
come one after another at the beginning of the file. Each data block
is formatted according to the code in `block_builder.cc`, and then
optionally compressed.
2. After the data blocks we store a bunch of meta blocks. The
supported meta block types are described below. More meta block types
may be added in the future. Each meta block is again formatted using
`block_builder.cc` and then optionally compressed.
3. A "metaindex" block. It contains one entry for every other meta
block where the key is the name of the meta block and the value is a
BlockHandle pointing to that meta block.
4. An "index" block. This block contains one entry per data block,
where the key is a string >= last key in that data block and before
the first key in the successive data block. The value is the
BlockHandle for the data block.
5. At the very end of the file is a fixed length footer that contains
the BlockHandle of the metaindex and index blocks as well as a magic number.
metaindex_handle: char[p]; // Block handle for metaindex
index_handle: char[q]; // Block handle for index
padding: char[40-p-q];// zeroed bytes to make fixed length
// (40==2*BlockHandle::kMaxEncodedLength)
magic: fixed64; // == 0xdb4775248b80fb57 (little-endian)
## "filter" Meta Block
If a `FilterPolicy` was specified when the database was opened, a
filter block is stored in each table. The "metaindex" block contains
an entry that maps from `filter.<N>` to the BlockHandle for the filter
block where `<N>` is the string returned by the filter policy's
`Name()` method.
The filter block stores a sequence of filters, where filter i contains
the output of `FilterPolicy::CreateFilter()` on all keys that are stored
in a block whose file offset falls within the range
[ i*base ... (i+1)*base-1 ]
Currently, "base" is 2KB. So for example, if blocks X and Y start in
the range `[ 0KB .. 2KB-1 ]`, all of the keys in X and Y will be
converted to a filter by calling `FilterPolicy::CreateFilter()`, and the
resulting filter will be stored as the first filter in the filter
block.
The filter block is formatted as follows:
[filter 0]
[filter 1]
[filter 2]
...
[filter N-1]
[offset of filter 0] : 4 bytes
[offset of filter 1] : 4 bytes
[offset of filter 2] : 4 bytes
...
[offset of filter N-1] : 4 bytes
[offset of beginning of offset array] : 4 bytes
lg(base) : 1 byte
The offset array at the end of the filter block allows efficient
mapping from a data block offset to the corresponding filter.
## "stats" Meta Block
This meta block contains a bunch of stats. The key is the name
of the statistic. The value contains the statistic.
TODO(postrelease): record following stats.
data size
index size
key size (uncompressed)
value size (uncompressed)
number of entries
number of data blocks

View file

@ -0,0 +1,102 @@
File format
===========
<beginning_of_file>
[data block 1]
[data block 2]
...
[data block N]
[meta block 1]
...
[meta block K]
[metaindex block]
[index block]
[Footer] (fixed size; starts at file_size - sizeof(Footer))
<end_of_file>
The file contains internal pointers. Each such pointer is called
a BlockHandle and contains the following information:
offset: varint64
size: varint64
(1) The sequence of key/value pairs in the file are stored in sorted
order and partitioned into a sequence of data blocks. These blocks
come one after another at the beginning of the file. Each data block
is formatted according to the code in block_builder.cc, and then
optionally compressed.
(2) After the data blocks we store a bunch of meta blocks. The
supported meta block types are described below. More meta block types
may be added in the future. Each meta block is again formatted using
block_builder.cc and then optionally compressed.
(3) A "metaindex" block. It contains one entry for every other meta
block where the key is the name of the meta block and the value is a
BlockHandle pointing to that meta block.
(4) An "index" block. This block contains one entry per data block,
where the key is a string >= last key in that data block and before
the first key in the successive data block. The value is the
BlockHandle for the data block.
(5) At the very end of the file is a fixed length footer that contains
the BlockHandle of the metaindex and index blocks as well as a magic number.
metaindex_handle: char[p]; // Block handle for metaindex
index_handle: char[q]; // Block handle for index
padding: char[40-p-q]; // zeroed bytes to make fixed length
// (40==2*BlockHandle::kMaxEncodedLength)
magic: fixed64; // == 0xdb4775248b80fb57
"filter" Meta Block
-------------------
If a "FilterPolicy" was specified when the database was opened, a
filter block is stored in each table. The "metaindex" block contains
an entry that maps from "filter.<N>" to the BlockHandle for the filter
block where "<N>" is the string returned by the filter policy's
"Name()" method.
The filter block stores a sequence of filters, where filter i contains
the output of FilterPolicy::CreateFilter() on all keys that are stored
in a block whose file offset falls within the range
[ i*base ... (i+1)*base-1 ]
Currently, "base" is 2KB. So for example, if blocks X and Y start in
the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be
converted to a filter by calling FilterPolicy::CreateFilter(), and the
resulting filter will be stored as the first filter in the filter
block.
The filter block is formatted as follows:
[filter 0]
[filter 1]
[filter 2]
...
[filter N-1]
[offset of filter 0] : 4 bytes
[offset of filter 1] : 4 bytes
[offset of filter 2] : 4 bytes
...
[offset of filter N-1] : 4 bytes
[offset of beginning of offset array] : 4 bytes
lg(base) : 1 byte
The offset array at the end of the filter block allows efficient
mapping from a data block offset to the corresponding filter.
"stats" Meta Block
------------------
This meta block contains a bunch of stats. The key is the name
of the statistic. The value contains the statistic.
TODO(postrelease): record following stats.
data size
index size
key size (uncompressed)
value size (uncompressed)
number of entries
number of data blocks

View file

@ -55,15 +55,14 @@ class FileState {
} }
const uint64_t available = size_ - offset; const uint64_t available = size_ - offset;
if (n > available) { if (n > available) {
n = static_cast<size_t>(available); n = available;
} }
if (n == 0) { if (n == 0) {
*result = Slice(); *result = Slice();
return Status::OK(); return Status::OK();
} }
assert(offset / kBlockSize <= SIZE_MAX); size_t block = offset / kBlockSize;
size_t block = static_cast<size_t>(offset / kBlockSize);
size_t block_offset = offset % kBlockSize; size_t block_offset = offset % kBlockSize;
if (n <= kBlockSize - block_offset) { if (n <= kBlockSize - block_offset) {
@ -168,7 +167,7 @@ class SequentialFileImpl : public SequentialFile {
if (pos_ > file_->Size()) { if (pos_ > file_->Size()) {
return Status::IOError("pos_ > file_->Size()"); return Status::IOError("pos_ > file_->Size()");
} }
const uint64_t available = file_->Size() - pos_; const size_t available = file_->Size() - pos_;
if (n > available) { if (n > available) {
n = available; n = available;
} }
@ -176,10 +175,9 @@ class SequentialFileImpl : public SequentialFile {
return Status::OK(); return Status::OK();
} }
virtual std::string GetName() const { return "[memenv]"; }
private: private:
FileState* file_; FileState* file_;
uint64_t pos_; size_t pos_;
}; };
class RandomAccessFileImpl : public RandomAccessFile { class RandomAccessFileImpl : public RandomAccessFile {
@ -197,7 +195,6 @@ class RandomAccessFileImpl : public RandomAccessFile {
return file_->Read(offset, n, result, scratch); return file_->Read(offset, n, result, scratch);
} }
virtual std::string GetName() const { return "[memenv]"; }
private: private:
FileState* file_; FileState* file_;
}; };
@ -220,16 +217,10 @@ class WritableFileImpl : public WritableFile {
virtual Status Flush() { return Status::OK(); } virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); } virtual Status Sync() { return Status::OK(); }
virtual std::string GetName() const { return "[memenv]"; }
private: private:
FileState* file_; FileState* file_;
}; };
class NoOpLogger : public Logger {
public:
virtual void Logv(const char* format, va_list ap) { }
};
class InMemoryEnv : public EnvWrapper { class InMemoryEnv : public EnvWrapper {
public: public:
explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { } explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
@ -266,7 +257,7 @@ class InMemoryEnv : public EnvWrapper {
} }
virtual Status NewWritableFile(const std::string& fname, virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) { WritableFile** result, size_t) {
MutexLock lock(&mutex_); MutexLock lock(&mutex_);
if (file_map_.find(fname) != file_map_.end()) { if (file_map_.find(fname) != file_map_.end()) {
DeleteFileInternal(fname); DeleteFileInternal(fname);
@ -280,19 +271,6 @@ class InMemoryEnv : public EnvWrapper {
return Status::OK(); return Status::OK();
} }
virtual Status NewAppendableFile(const std::string& fname,
WritableFile** result) {
MutexLock lock(&mutex_);
FileState** sptr = &file_map_[fname];
FileState* file = *sptr;
if (file == NULL) {
file = new FileState();
file->Ref();
}
*result = new WritableFileImpl(file);
return Status::OK();
}
virtual bool FileExists(const std::string& fname) { virtual bool FileExists(const std::string& fname) {
MutexLock lock(&mutex_); MutexLock lock(&mutex_);
return file_map_.find(fname) != file_map_.end(); return file_map_.find(fname) != file_map_.end();
@ -380,11 +358,6 @@ class InMemoryEnv : public EnvWrapper {
return Status::OK(); return Status::OK();
} }
virtual Status NewLogger(const std::string& fname, Logger** result) {
*result = new NoOpLogger;
return Status::OK();
}
private: private:
// Map from filenames to FileState objects, representing a simple file system. // Map from filenames to FileState objects, representing a simple file system.
typedef std::map<std::string, FileState*> FileSystem; typedef std::map<std::string, FileState*> FileSystem;

View file

@ -29,68 +29,61 @@ TEST(MemEnvTest, Basics) {
uint64_t file_size; uint64_t file_size;
WritableFile* writable_file; WritableFile* writable_file;
std::vector<std::string> children; std::vector<std::string> children;
std::string dbname;
ASSERT_OK(env_->CreateDir("/dir")); dbname=test::TmpDir();
ASSERT_OK(env_->CreateDir(dbname.c_str()));
// Check that the directory is empty. // Check that the directory is empty.
ASSERT_TRUE(!env_->FileExists("/dir/non_existent")); ASSERT_TRUE(!env_->FileExists(dbname + "/non_existent"));
ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok()); ASSERT_TRUE(!env_->GetFileSize(dbname + "/non_existent", &file_size).ok());
ASSERT_OK(env_->GetChildren("/dir", &children)); ASSERT_OK(env_->GetChildren(dbname + "", &children));
ASSERT_EQ(0, children.size()); ASSERT_EQ(0, children.size());
// Create a file. // Create a file.
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
ASSERT_EQ(0, file_size);
delete writable_file; delete writable_file;
// Check that the file exists. // Check that the file exists.
ASSERT_TRUE(env_->FileExists("/dir/f")); ASSERT_TRUE(env_->FileExists(dbname + "/f"));
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size));
ASSERT_EQ(0, file_size); ASSERT_EQ(0, file_size);
ASSERT_OK(env_->GetChildren("/dir", &children)); ASSERT_OK(env_->GetChildren(dbname + "", &children));
ASSERT_EQ(1, children.size()); ASSERT_EQ(1, children.size());
ASSERT_EQ("f", children[0]); ASSERT_EQ("f", children[0]);
// Write to the file. // Write to the file.
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
ASSERT_OK(writable_file->Append("abc")); ASSERT_OK(writable_file->Append("abc"));
delete writable_file; delete writable_file;
// Check that append works.
ASSERT_OK(env_->NewAppendableFile("/dir/f", &writable_file));
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
ASSERT_EQ(3, file_size);
ASSERT_OK(writable_file->Append("hello"));
delete writable_file;
// Check for expected size. // Check for expected size.
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size));
ASSERT_EQ(8, file_size); ASSERT_EQ(3, file_size);
// Check that renaming works. // Check that renaming works.
ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok()); ASSERT_TRUE(!env_->RenameFile(dbname + "/non_existent", dbname + "/g").ok());
ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g")); ASSERT_OK(env_->RenameFile(dbname + "/f", dbname + "/g"));
ASSERT_TRUE(!env_->FileExists("/dir/f")); ASSERT_TRUE(!env_->FileExists(dbname + "/f"));
ASSERT_TRUE(env_->FileExists("/dir/g")); ASSERT_TRUE(env_->FileExists(dbname + "/g"));
ASSERT_OK(env_->GetFileSize("/dir/g", &file_size)); ASSERT_OK(env_->GetFileSize(dbname + "/g", &file_size));
ASSERT_EQ(8, file_size); ASSERT_EQ(3, file_size);
// Check that opening non-existent file fails. // Check that opening non-existent file fails.
SequentialFile* seq_file; SequentialFile* seq_file;
RandomAccessFile* rand_file; RandomAccessFile* rand_file;
ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file).ok()); ASSERT_TRUE(!env_->NewSequentialFile(dbname + "/non_existent", &seq_file).ok());
ASSERT_TRUE(!seq_file); ASSERT_TRUE(!seq_file);
ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file).ok()); ASSERT_TRUE(!env_->NewRandomAccessFile(dbname + "/non_existent", &rand_file).ok());
ASSERT_TRUE(!rand_file); ASSERT_TRUE(!rand_file);
// Check that deleting works. // Check that deleting works.
ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok()); ASSERT_TRUE(!env_->DeleteFile(dbname + "/non_existent").ok());
ASSERT_OK(env_->DeleteFile("/dir/g")); ASSERT_OK(env_->DeleteFile(dbname + "/g"));
ASSERT_TRUE(!env_->FileExists("/dir/g")); ASSERT_TRUE(!env_->FileExists(dbname + "/g"));
ASSERT_OK(env_->GetChildren("/dir", &children)); ASSERT_OK(env_->GetChildren(dbname + "", &children));
ASSERT_EQ(0, children.size()); ASSERT_EQ(0, children.size());
ASSERT_OK(env_->DeleteDir("/dir")); ASSERT_OK(env_->DeleteDir(dbname + ""));
} }
TEST(MemEnvTest, ReadWrite) { TEST(MemEnvTest, ReadWrite) {
@ -99,16 +92,19 @@ TEST(MemEnvTest, ReadWrite) {
RandomAccessFile* rand_file; RandomAccessFile* rand_file;
Slice result; Slice result;
char scratch[100]; char scratch[100];
std::string dbname;
ASSERT_OK(env_->CreateDir("/dir")); dbname=test::TmpDir();
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); ASSERT_OK(env_->CreateDir(dbname + ""));
ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
ASSERT_OK(writable_file->Append("hello ")); ASSERT_OK(writable_file->Append("hello "));
ASSERT_OK(writable_file->Append("world")); ASSERT_OK(writable_file->Append("world"));
delete writable_file; delete writable_file;
// Read sequentially. // Read sequentially.
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file)); ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file));
ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
ASSERT_EQ(0, result.compare("hello")); ASSERT_EQ(0, result.compare("hello"));
ASSERT_OK(seq_file->Skip(1)); ASSERT_OK(seq_file->Skip(1));
@ -122,7 +118,7 @@ TEST(MemEnvTest, ReadWrite) {
delete seq_file; delete seq_file;
// Random reads. // Random reads.
ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file)); ASSERT_OK(env_->NewRandomAccessFile(dbname + "/f", &rand_file));
ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
ASSERT_EQ(0, result.compare("world")); ASSERT_EQ(0, result.compare("world"));
ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
@ -149,7 +145,7 @@ TEST(MemEnvTest, Misc) {
ASSERT_TRUE(!test_dir.empty()); ASSERT_TRUE(!test_dir.empty());
WritableFile* writable_file; WritableFile* writable_file;
ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file)); ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, 2<<20));
// These are no-ops, but we test they return success. // These are no-ops, but we test they return success.
ASSERT_OK(writable_file->Sync()); ASSERT_OK(writable_file->Sync());
@ -161,6 +157,9 @@ TEST(MemEnvTest, Misc) {
TEST(MemEnvTest, LargeWrite) { TEST(MemEnvTest, LargeWrite) {
const size_t kWriteSize = 300 * 1024; const size_t kWriteSize = 300 * 1024;
char* scratch = new char[kWriteSize * 2]; char* scratch = new char[kWriteSize * 2];
std::string dbname;
dbname=test::TmpDir();
std::string write_data; std::string write_data;
for (size_t i = 0; i < kWriteSize; ++i) { for (size_t i = 0; i < kWriteSize; ++i) {
@ -168,14 +167,14 @@ TEST(MemEnvTest, LargeWrite) {
} }
WritableFile* writable_file; WritableFile* writable_file;
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
ASSERT_OK(writable_file->Append("foo")); ASSERT_OK(writable_file->Append("foo"));
ASSERT_OK(writable_file->Append(write_data)); ASSERT_OK(writable_file->Append(write_data));
delete writable_file; delete writable_file;
SequentialFile* seq_file; SequentialFile* seq_file;
Slice result; Slice result;
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file)); ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file));
ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
ASSERT_EQ(0, result.compare("foo")); ASSERT_EQ(0, result.compare("foo"));
@ -190,17 +189,21 @@ TEST(MemEnvTest, LargeWrite) {
delete seq_file; delete seq_file;
delete [] scratch; delete [] scratch;
} }
#if 0
TEST(MemEnvTest, DBTest) { TEST(MemEnvTest, DBTest) {
Options options; Options options;
options.create_if_missing = true; options.create_if_missing = true;
options.env = env_; options.env = env_;
DB* db; DB* db;
std::string dbname;
dbname=test::TmpDir();
ASSERT_OK(env_->CreateDir(dbname+ "/db"));
const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
ASSERT_OK(DB::Open(options, "/dir/db", &db)); ASSERT_OK(DB::Open(options, dbname + "/db", &db));
for (size_t i = 0; i < 3; ++i) { for (size_t i = 0; i < 3; ++i) {
ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
} }
@ -233,7 +236,7 @@ TEST(MemEnvTest, DBTest) {
delete db; delete db;
} }
#endif
} // namespace leveldb } // namespace leveldb
int main(int argc, char** argv) { int main(int argc, char** argv) {

View file

@ -0,0 +1,227 @@
// -------------------------------------------------------------------
//
// atomics.h: portable atomic operations for leveldb/eleveldb (http://code.google.com/p/leveldb/)
//
// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// -------------------------------------------------------------------
/// Copied from basho/eleveldb/c_src/detail.hpp September 8, 2013
#ifndef LEVELDB_ATOMIC_H
#define LEVELDB_ATOMIC_H 1
#include <stdint.h>
#include <stddef.h>
/* Platform detection: LEVELDB_IS_SOLARIS is defined to 1 when any of the
 * OS_SOLARIS, SOLARIS or sun macros is defined, and is explicitly undefined
 * otherwise. The functions in this header test it to choose between the
 * Solaris atomic_* routines and the __sync_* compiler builtins.
 * Hopefully these checks can be replaced with constexpr or a compile-time
 * assert later. */
#if defined(OS_SOLARIS) || defined(SOLARIS) || defined(sun)
#define LEVELDB_IS_SOLARIS 1
#else
#undef LEVELDB_IS_SOLARIS
#endif
/* <atomic.h> is only included when the Solaris detection above succeeded. */
#ifdef LEVELDB_IS_SOLARIS
#include <atomic.h>
#endif
namespace leveldb {
/**
 * Atomic compare-and-swap.
 *
 * Writes exchange_val into *ptr only when *ptr currently holds comp_val.
 *
 * @param ptr           location to update
 * @param comp_val      value *ptr is expected to hold
 * @param exchange_val  value to store when the expectation is met
 * @return true when the exchange happened, false when *ptr held another value
 */

// Primary template: declared first so the uint32_t specialization can precede
// the generic definition further down.
template <typename PtrT, typename ValueT>
inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val);

// 32-bit specialization (Solaris needs the dedicated atomic_cas_32 call).
template <>
inline bool compare_and_swap(volatile uint32_t *ptr, const int& comp_val, const int& exchange_val)
{
#if LEVELDB_IS_SOLARIS
    // atomic_cas_32 hands back the previous contents; the swap succeeded
    // exactly when that previous value equals the expected one.
    const bool swapped = ((uint32_t) comp_val == atomic_cas_32(ptr, comp_val, exchange_val));
#else
    const bool swapped = __sync_bool_compare_and_swap(ptr, comp_val, exchange_val);
#endif
    return swapped;
}

// Generic definition, used for every other type (written with pointers in mind).
template <typename PtrT, typename ValueT>
inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val)
{
#if LEVELDB_IS_SOLARIS
    // Same success test as above, based on the value atomic_cas_ptr returns.
    const bool swapped = (comp_val == atomic_cas_ptr(ptr, comp_val, exchange_val));
#else
    const bool swapped = __sync_bool_compare_and_swap(ptr, comp_val, exchange_val);
#endif
    return swapped;
}
/**
 * Atomic increment.
 *
 * Adds one to *ptr as a single atomic step and returns the resulting
 * (post-increment) value. Only the specializations below are defined; the
 * primary template is a declaration only.
 */
template <typename ValueT>
inline ValueT inc_and_fetch(volatile ValueT *ptr);

// 64-bit counter.
template <>
inline uint64_t inc_and_fetch(volatile uint64_t *ptr)
{
#if LEVELDB_IS_SOLARIS
    const uint64_t updated = atomic_inc_64_nv(ptr);
#else
    const uint64_t updated = __sync_add_and_fetch(ptr, 1);
#endif
    return updated;
}

// 32-bit counter.
template <>
inline uint32_t inc_and_fetch(volatile uint32_t *ptr)
{
#if LEVELDB_IS_SOLARIS
    const uint32_t updated = atomic_inc_32_nv(ptr);
#else
    const uint32_t updated = __sync_add_and_fetch(ptr, 1);
#endif
    return updated;
}

// size_t counter, provided only for the platforms listed in the guard.
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
template <>
inline size_t inc_and_fetch(volatile size_t *ptr)
{
    const size_t updated = __sync_add_and_fetch(ptr, 1);
    return updated;
}
#endif
/**
 * Atomic decrement.
 *
 * Subtracts one from *ptr as a single atomic step and returns the resulting
 * (post-decrement) value. Only the specializations below are defined; the
 * primary template is a declaration only.
 */
template <typename ValueT>
inline ValueT dec_and_fetch(volatile ValueT *ptr);

// 64-bit counter.
template <>
inline uint64_t dec_and_fetch(volatile uint64_t *ptr)
{
#if LEVELDB_IS_SOLARIS
    const uint64_t updated = atomic_dec_64_nv(ptr);
#else
    const uint64_t updated = __sync_sub_and_fetch(ptr, 1);
#endif
    return updated;
}

// 32-bit counter.
template <>
inline uint32_t dec_and_fetch(volatile uint32_t *ptr)
{
#if LEVELDB_IS_SOLARIS
    const uint32_t updated = atomic_dec_32_nv(ptr);
#else
    const uint32_t updated = __sync_sub_and_fetch(ptr, 1);
#endif
    return updated;
}

// size_t counter, provided only for the platforms listed in the guard.
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
template <>
inline size_t dec_and_fetch(volatile size_t *ptr)
{
    const size_t updated = __sync_sub_and_fetch(ptr, 1);
    return updated;
}
#endif
/**
 * Atomic add.
 *
 * Adds val to *ptr as a single atomic step and returns the resulting sum.
 * Only the specializations below are defined; the primary template is a
 * declaration only.
 */
template <typename ValueT>
inline ValueT add_and_fetch(volatile ValueT *ptr, ValueT val);

// 64-bit operand.
template <>
inline uint64_t add_and_fetch(volatile uint64_t *ptr, uint64_t val)
{
#if LEVELDB_IS_SOLARIS
    const uint64_t total = atomic_add_64_nv(ptr, val);
#else
    const uint64_t total = __sync_add_and_fetch(ptr, val);
#endif
    return total;
}

// 32-bit operand.
template <>
inline uint32_t add_and_fetch(volatile uint32_t *ptr, uint32_t val)
{
#if LEVELDB_IS_SOLARIS
    const uint32_t total = atomic_add_32_nv(ptr, val);
#else
    const uint32_t total = __sync_add_and_fetch(ptr, val);
#endif
    return total;
}

// size_t operand, provided only for the platforms listed in the guard.
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
template <>
inline size_t add_and_fetch(volatile size_t *ptr, size_t val)
{
    const size_t total = __sync_add_and_fetch(ptr, val);
    return total;
}
#endif
/**
 * Atomic subtract.
 *
 * Subtracts val from *ptr as a single atomic step and returns the resulting
 * difference. Only the specializations below are defined; the primary
 * template is a declaration only.
 */
template <typename ValueT>
inline ValueT sub_and_fetch(volatile ValueT *ptr, ValueT val);

// 64-bit operand.
template <>
inline uint64_t sub_and_fetch(volatile uint64_t *ptr, uint64_t val)
{
#if LEVELDB_IS_SOLARIS
    // Subtraction is done by adding the two's complement of val, which also
    // sidesteps sign warnings.
    uint64_t negated = (~val) + 1;
    const uint64_t remaining = atomic_add_64_nv(ptr, negated);
#else
    const uint64_t remaining = __sync_sub_and_fetch(ptr, val);
#endif
    return remaining;
}

// 32-bit operand.
template <>
inline uint32_t sub_and_fetch(volatile uint32_t *ptr, uint32_t val)
{
#if LEVELDB_IS_SOLARIS
    // Subtraction is done by adding the two's complement of val, which also
    // sidesteps sign warnings.
    uint32_t negated = (~val) + 1;
    const uint32_t remaining = atomic_add_32_nv(ptr, negated);
#else
    const uint32_t remaining = __sync_sub_and_fetch(ptr, val);
#endif
    return remaining;
}

// size_t operand, provided only for the platforms listed in the guard.
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
template <>
inline size_t sub_and_fetch(volatile size_t *ptr, size_t val)
{
    const size_t remaining = __sync_sub_and_fetch(ptr, val);
    return remaining;
}
#endif
} // namespace leveldb
#endif

View file

@ -9,6 +9,7 @@
Does not support: Does not support:
. getters for the option types . getters for the option types
. custom comparators that implement key shortening . custom comparators that implement key shortening
. capturing post-write-snapshot
. custom iter, db, env, cache implementations using just the C bindings . custom iter, db, env, cache implementations using just the C bindings
Some conventions: Some conventions:
@ -27,7 +28,6 @@
be true on entry: be true on entry:
*errptr == NULL *errptr == NULL
*errptr points to a malloc()ed null-terminated error message *errptr points to a malloc()ed null-terminated error message
(On Windows, *errptr must have been malloc()-ed by this library.)
On success, a leveldb routine leaves *errptr unchanged. On success, a leveldb routine leaves *errptr unchanged.
On failure, leveldb frees the old value of *errptr and On failure, leveldb frees the old value of *errptr and
set *errptr to a malloc()ed error message. set *errptr to a malloc()ed error message.
@ -66,7 +66,7 @@ typedef struct leveldb_snapshot_t leveldb_snapshot_t;
typedef struct leveldb_writablefile_t leveldb_writablefile_t; typedef struct leveldb_writablefile_t leveldb_writablefile_t;
typedef struct leveldb_writebatch_t leveldb_writebatch_t; typedef struct leveldb_writebatch_t leveldb_writebatch_t;
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t; typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;
typedef struct leveldb_keymetadata_t leveldb_keymetadata_t;
/* DB operations */ /* DB operations */
extern leveldb_t* leveldb_open( extern leveldb_t* leveldb_open(
@ -83,6 +83,14 @@ extern void leveldb_put(
const char* val, size_t vallen, const char* val, size_t vallen,
char** errptr); char** errptr);
extern void leveldb_put2(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr,
const leveldb_keymetadata_t * metadata);
extern void leveldb_delete( extern void leveldb_delete(
leveldb_t* db, leveldb_t* db,
const leveldb_writeoptions_t* options, const leveldb_writeoptions_t* options,
@ -104,6 +112,14 @@ extern char* leveldb_get(
size_t* vallen, size_t* vallen,
char** errptr); char** errptr);
extern char* leveldb_get2(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr,
leveldb_keymetadata_t * metadata);
extern leveldb_iterator_t* leveldb_create_iterator( extern leveldb_iterator_t* leveldb_create_iterator(
leveldb_t* db, leveldb_t* db,
const leveldb_readoptions_t* options); const leveldb_readoptions_t* options);
@ -156,6 +172,7 @@ extern void leveldb_iter_next(leveldb_iterator_t*);
extern void leveldb_iter_prev(leveldb_iterator_t*); extern void leveldb_iter_prev(leveldb_iterator_t*);
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen); extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen); extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
extern const void leveldb_iter_keymetadata(const leveldb_iterator_t *, leveldb_keymetadata_t *);
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr); extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
/* Write batch */ /* Write batch */
@ -167,13 +184,19 @@ extern void leveldb_writebatch_put(
leveldb_writebatch_t*, leveldb_writebatch_t*,
const char* key, size_t klen, const char* key, size_t klen,
const char* val, size_t vlen); const char* val, size_t vlen);
extern void leveldb_writebatch_put2(
leveldb_writebatch_t*,
const char* key, size_t klen,
const char* val, size_t vlen,
const leveldb_keymetadata_t * meta);
extern void leveldb_writebatch_delete( extern void leveldb_writebatch_delete(
leveldb_writebatch_t*, leveldb_writebatch_t*,
const char* key, size_t klen); const char* key, size_t klen);
extern void leveldb_writebatch_iterate( extern void leveldb_writebatch_iterate(
leveldb_writebatch_t*, leveldb_writebatch_t*,
void* state, void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen,
const int & type, const uint64_t & expiry),
void (*deleted)(void*, const char* k, size_t klen)); void (*deleted)(void*, const char* k, size_t klen));
/* Options */ /* Options */
@ -192,6 +215,8 @@ extern void leveldb_options_set_error_if_exists(
leveldb_options_t*, unsigned char); leveldb_options_t*, unsigned char);
extern void leveldb_options_set_paranoid_checks( extern void leveldb_options_set_paranoid_checks(
leveldb_options_t*, unsigned char); leveldb_options_t*, unsigned char);
extern void leveldb_options_set_verify_compactions(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*); extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*); extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t); extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
@ -199,6 +224,7 @@ extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*); extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t); extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int); extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
extern void leveldb_options_set_total_leveldb_mem(leveldb_options_t*, size_t);
enum { enum {
leveldb_no_compression = 0, leveldb_no_compression = 0,
@ -267,20 +293,20 @@ extern void leveldb_cache_destroy(leveldb_cache_t* cache);
extern leveldb_env_t* leveldb_create_default_env(); extern leveldb_env_t* leveldb_create_default_env();
extern void leveldb_env_destroy(leveldb_env_t*); extern void leveldb_env_destroy(leveldb_env_t*);
extern void leveldb_env_shutdown();
/* Utility */ /* Util */
/* Calls free(ptr). /**
REQUIRES: ptr was malloc()-ed and returned by one of the routines * CAUTION: this call is only for char * objects returned by
in this file. Note that in certain cases (typically on Windows), you * functions like leveldb_get and leveldb_property_value.
may need to call this routine instead of free(ptr) to dispose of * Also used to release errptr strings.
malloc()-ed memory returned by this library. */ */
extern void leveldb_free(void* ptr); extern void leveldb_free(void* ptr);
/* Return the major version number for this release. */ /* Version */
extern int leveldb_major_version();
/* Return the minor version number for this release. */ extern int leveldb_major_version();
extern int leveldb_minor_version(); extern int leveldb_minor_version();
#ifdef __cplusplus #ifdef __cplusplus

View file

@ -29,6 +29,11 @@ class Cache;
// of Cache uses a least-recently-used eviction policy. // of Cache uses a least-recently-used eviction policy.
extern Cache* NewLRUCache(size_t capacity); extern Cache* NewLRUCache(size_t capacity);
// Riak customization - just like NewLRUCache except the underlying
// structure is NOT sharded. Better for file cache.
extern Cache* NewLRUCache2(size_t capacity);
class Cache { class Cache {
public: public:
Cache() { } Cache() { }
@ -81,16 +86,17 @@ class Cache {
// its cache keys. // its cache keys.
virtual uint64_t NewId() = 0; virtual uint64_t NewId() = 0;
// Remove all cache entries that are not actively in use. Memory-constrained // Return size, if any, of per entry overhead for item placed in cache.
// applications may wish to call this method to reduce memory usage. // Allows more accurate tracking of "charge" against each cache item.
// Default implementation of Prune() does nothing. Subclasses are strongly virtual size_t EntryOverheadSize() {return(0);};
// encouraged to override the default implementation. A future release of
// leveldb may change Prune() to a pure abstract method.
virtual void Prune() {}
// Return an estimate of the combined charges of all elements stored in the // Riak specific: Add a reference to cache object to help hold it
// cache. // in memory
virtual size_t TotalCharge() const = 0; virtual void Addref(Handle* e) = 0;
// Riak specific: walk contents of entire cache, calling functor Acc
// with the "value" for each cache entry. Locks cache throughout call.
virtual bool WalkCache(class CacheAccumulator & Acc) {return(true);};
private: private:
void LRU_Remove(Handle* e); void LRU_Remove(Handle* e);
@ -107,4 +113,4 @@ class Cache {
} // namespace leveldb } // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_CACHE_H_ #endif // STORAGE_LEVELDB_UTIL_CACHE_H_

View file

@ -58,6 +58,10 @@ class Comparator {
// must not be deleted. // must not be deleted.
extern const Comparator* BytewiseComparator(); extern const Comparator* BytewiseComparator();
// Riak specific: cleans up the default comparator to make
// valgrind results clean
extern void ComparatorShutdown();
} // namespace leveldb } // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ #endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_

View file

@ -14,7 +14,7 @@ namespace leveldb {
// Update Makefile if you change these // Update Makefile if you change these
static const int kMajorVersion = 1; static const int kMajorVersion = 1;
static const int kMinorVersion = 20; static const int kMinorVersion = 9;
struct Options; struct Options;
struct ReadOptions; struct ReadOptions;
@ -38,6 +38,17 @@ struct Range {
Range(const Slice& s, const Slice& l) : start(s), limit(l) { } Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
}; };
// Abstract holder for a DB value.
// This allows callers to manage their own value buffers and have
// DB values copied directly into those buffers.
class Value {
public:
// Receives `size` bytes starting at `data`. Pure virtual: how (and where)
// the bytes are stored is left entirely to the implementing class.
// NOTE(review): the returned Value& is presumably *this -- confirm against
// the concrete implementations.
virtual Value& assign(const char* data, size_t size) = 0;
protected:
// Protected, so code outside the class hierarchy cannot delete an object
// through a Value*. Only declared at this point (no inline body).
virtual ~Value();
};
// A DB is a persistent ordered map from keys to values. // A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without // A DB is safe for concurrent access from multiple threads without
// any external synchronization. // any external synchronization.
@ -60,7 +71,8 @@ class DB {
// Note: consider setting options.sync = true. // Note: consider setting options.sync = true.
virtual Status Put(const WriteOptions& options, virtual Status Put(const WriteOptions& options,
const Slice& key, const Slice& key,
const Slice& value) = 0; const Slice& value,
const KeyMetaData * meta=NULL) = 0;
// Remove the database entry (if any) for "key". Returns OK on // Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key" // success, and a non-OK status on error. It is not an error if "key"
@ -81,7 +93,11 @@ class DB {
// //
// May return some other Status on an error. // May return some other Status on an error.
virtual Status Get(const ReadOptions& options, virtual Status Get(const ReadOptions& options,
const Slice& key, std::string* value) = 0; const Slice& key, std::string* value,
KeyMetaData * meta=NULL) = 0;
virtual Status Get(const ReadOptions& options,
const Slice& key, Value* value,
KeyMetaData * meta=NULL) = 0;
// Return a heap-allocated iterator over the contents of the database. // Return a heap-allocated iterator over the contents of the database.
// The result of NewIterator() is initially invalid (caller must // The result of NewIterator() is initially invalid (caller must
@ -115,8 +131,6 @@ class DB {
// about the internal operation of the DB. // about the internal operation of the DB.
// "leveldb.sstables" - returns a multi-line string that describes all // "leveldb.sstables" - returns a multi-line string that describes all
// of the sstables that make up the db contents. // of the sstables that make up the db contents.
// "leveldb.approximate-memory-usage" - returns the approximate number of
// bytes of memory in use by the DB.
virtual bool GetProperty(const Slice& property, std::string* value) = 0; virtual bool GetProperty(const Slice& property, std::string* value) = 0;
// For each i in [0,n-1], store in "sizes[i]", the approximate // For each i in [0,n-1], store in "sizes[i]", the approximate
@ -142,6 +156,21 @@ class DB {
// db->CompactRange(NULL, NULL); // db->CompactRange(NULL, NULL);
virtual void CompactRange(const Slice* begin, const Slice* end) = 0; virtual void CompactRange(const Slice* begin, const Slice* end) = 0;
// Riak specific function: Verify that no .sst files overlap
// within the levels that expect non-overlapping files. Run
// compactions as necessary to correct. Assumes DB opened
// with Options.is_repair=true
virtual Status VerifyLevels();
// Riak specific function: Request database check for
// available compactions. This is to stimulate retry of
// grooming that might have been offered and rejected previously
virtual void CheckAvailableCompactions();
// Riak specific function: Give external code, namely
// eleveldb, access to leveldb's logging routines.
virtual Logger* GetLogger() const { return NULL; }
private: private:
// No copying allowed // No copying allowed
DB(const DB&); DB(const DB&);

View file

@ -1,25 +0,0 @@
// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
#define STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
#include <string>
#include "leveldb/env.h"
#include "leveldb/status.h"
namespace leveldb {
// Dump the contents of the file named by fname in text format to
// *dst. Makes a sequence of dst->Append() calls; each call is passed
// the newline-terminated text corresponding to a single item found
// in the file.
//
// Returns a non-OK result if fname does not name a leveldb storage
// file, or if the file cannot be read.
Status DumpFile(Env* env, const std::string& fname, WritableFile* dst);
} // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_

View file

@ -13,15 +13,19 @@
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ #ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
#define STORAGE_LEVELDB_INCLUDE_ENV_H_ #define STORAGE_LEVELDB_INCLUDE_ENV_H_
#include <cstdarg>
#include <pthread.h>
#include <string> #include <string>
#include <vector> #include <vector>
#include <stdarg.h>
#include <stdint.h> #include <stdint.h>
#include "leveldb/perf_count.h"
#include "leveldb/status.h" #include "leveldb/status.h"
namespace leveldb { namespace leveldb {
class AppendableFile;
class FileLock; class FileLock;
struct Options;
class Logger; class Logger;
class RandomAccessFile; class RandomAccessFile;
class SequentialFile; class SequentialFile;
@ -40,6 +44,11 @@ class Env {
// The result of Default() belongs to leveldb and must never be deleted. // The result of Default() belongs to leveldb and must never be deleted.
static Env* Default(); static Env* Default();
// Riak specific: Shutdown background work threads and other objects
// to get clean environment for valgrind memory test. No restart supported
// after this call. Not thread safe.
static void Shutdown();
// Create a brand new sequentially-readable file with the specified name. // Create a brand new sequentially-readable file with the specified name.
// On success, stores a pointer to the new file in *result and returns OK. // On success, stores a pointer to the new file in *result and returns OK.
// On failure stores NULL in *result and returns non-OK. If the file does // On failure stores NULL in *result and returns non-OK. If the file does
@ -67,22 +76,31 @@ class Env {
// //
// The returned file will only be accessed by one thread at a time. // The returned file will only be accessed by one thread at a time.
virtual Status NewWritableFile(const std::string& fname, virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) = 0; WritableFile** result,
size_t map_size) = 0;
// Create an object that either appends to an existing file, or // Riak specific:
// writes to a new file (if the file does not exist to begin with). // Derived from NewWritableFile. One change: if the file exists,
// On success, stores a pointer to the new file in *result and // move to the end of the file and continue writing.
// returns OK. On failure stores NULL in *result and returns // new file. On success, stores a pointer to the open file in
// non-OK. // *result and returns OK. On failure stores NULL in *result and
// returns non-OK.
// //
// The returned file will only be accessed by one thread at a time. // The returned file will only be accessed by one thread at a time.
//
// May return an IsNotSupportedError error if this Env does
// not allow appending to an existing file. Users of Env (including
// the leveldb implementation) must be prepared to deal with
// an Env that does not support appending.
virtual Status NewAppendableFile(const std::string& fname, virtual Status NewAppendableFile(const std::string& fname,
WritableFile** result); WritableFile** result,
size_t map_size) = 0;
// Riak specific:
// Allows for virtualized version of NewWritableFile that enables write
// and close operations to execute on background threads
// (where platform supported).
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewWriteOnlyFile(const std::string& fname,
WritableFile** result,
size_t map_size)
{return(NewWritableFile(fname, result, map_size));};
// Returns true iff the named file exists. // Returns true iff the named file exists.
virtual bool FileExists(const std::string& fname) = 0; virtual bool FileExists(const std::string& fname) = 0;
@ -142,7 +160,7 @@ class Env {
// Start a new thread, invoking "function(arg)" within the new thread. // Start a new thread, invoking "function(arg)" within the new thread.
// When "function(arg)" returns, the thread will be destroyed. // When "function(arg)" returns, the thread will be destroyed.
virtual void StartThread(void (*function)(void* arg), void* arg) = 0; virtual pthread_t StartThread(void (*function)(void* arg), void* arg) = 0;
// *path is set to a temporary directory that can be used for testing. It may // *path is set to a temporary directory that can be used for testing. It may
// or many not have just been created. The directory may or may not differ // or many not have just been created. The directory may or may not differ
@ -157,9 +175,16 @@ class Env {
// useful for computing deltas of time. // useful for computing deltas of time.
virtual uint64_t NowMicros() = 0; virtual uint64_t NowMicros() = 0;
// Sleep/delay the thread for the prescribed number of micro-seconds. // Sleep/delay the thread for the perscribed number of micro-seconds.
virtual void SleepForMicroseconds(int micros) = 0; virtual void SleepForMicroseconds(int micros) = 0;
// Riak specific: Get object that is tracking various software counters
virtual PerformanceCounters * GetPerformanceCounters() {return(gPerfCounters);};
// Riak specific: Request size of recovery memory map, potentially using
// Options data for the decision. Default 2Mbyte is Google's original size.
virtual size_t RecoveryMmapSize(const struct Options *) const {return(2*1024*1024L);};
private: private:
// No copying allowed // No copying allowed
Env(const Env&); Env(const Env&);
@ -190,14 +215,6 @@ class SequentialFile {
// //
// REQUIRES: External synchronization // REQUIRES: External synchronization
virtual Status Skip(uint64_t n) = 0; virtual Status Skip(uint64_t n) = 0;
// Get a name for the file, only for error reporting
virtual std::string GetName() const = 0;
private:
// No copying allowed
SequentialFile(const SequentialFile&);
void operator=(const SequentialFile&);
}; };
// A file abstraction for randomly reading the contents of a file. // A file abstraction for randomly reading the contents of a file.
@ -218,13 +235,11 @@ class RandomAccessFile {
virtual Status Read(uint64_t offset, size_t n, Slice* result, virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0; char* scratch) const = 0;
// Get a name for the file, only for error reporting // Riak optimization: allows advising Linux page cache
virtual std::string GetName() const = 0; virtual void SetForCompaction(uint64_t file_size) {};
private: // Riak addition: size of this structure in bytes
// No copying allowed virtual size_t ObjectSize() {return(sizeof(RandomAccessFile));};
RandomAccessFile(const RandomAccessFile&);
void operator=(const RandomAccessFile&);
}; };
// A file abstraction for sequential writing. The implementation // A file abstraction for sequential writing. The implementation
@ -240,8 +255,10 @@ class WritableFile {
virtual Status Flush() = 0; virtual Status Flush() = 0;
virtual Status Sync() = 0; virtual Status Sync() = 0;
// Get a name for the file, only for error reporting // Riak specific:
virtual std::string GetName() const = 0; // Provide hint where key/value data ends and metadata starts
// in an .sst table file.
virtual void SetMetadataOffset(uint64_t) {};
private: private:
// No copying allowed // No copying allowed
@ -249,12 +266,30 @@ class WritableFile {
void operator=(const WritableFile&); void operator=(const WritableFile&);
}; };
// A file abstraction for sequential writing at end of existing file.
// Derives from WritableFile and declares no additional virtual operations
// of its own; it is a distinct type layered on the WritableFile interface.
class AppendableFile: public WritableFile {
public:
AppendableFile() { }
// Declared only at this point (no inline body).
virtual ~AppendableFile();
private:
// No copying allowed: copy constructor and assignment are private and
// given no body here.
AppendableFile(const AppendableFile&);
void operator=(const AppendableFile&);
};
// An interface for writing log messages. // An interface for writing log messages.
class Logger { class Logger {
public: public:
Logger() { } Logger() { }
virtual ~Logger(); virtual ~Logger();
// Riak specific function for hot backup.
// hot_backup.cc assumes that it can rotate the LOG file
// via standard Env routines if this function returns a
// non-zero value.
virtual long LogSize() {return(0);};
// Write an entry to the log file with the specified format. // Write an entry to the log file with the specified format.
virtual void Logv(const char* format, va_list ap) = 0; virtual void Logv(const char* format, va_list ap) = 0;
@ -310,11 +345,14 @@ class EnvWrapper : public Env {
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) { Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
return target_->NewRandomAccessFile(f, r); return target_->NewRandomAccessFile(f, r);
} }
Status NewWritableFile(const std::string& f, WritableFile** r) { Status NewWritableFile(const std::string& f, WritableFile** r, size_t s=0) {
return target_->NewWritableFile(f, r); return target_->NewWritableFile(f, r, s);
} }
Status NewAppendableFile(const std::string& f, WritableFile** r) { Status NewAppendableFile(const std::string& f, WritableFile** r, size_t s=0) {
return target_->NewAppendableFile(f, r); return target_->NewAppendableFile(f, r, s);
}
Status NewWriteOnlyFile(const std::string& f, WritableFile** r, size_t s=0) {
return target_->NewWriteOnlyFile(f, r, s);
} }
bool FileExists(const std::string& f) { return target_->FileExists(f); } bool FileExists(const std::string& f) { return target_->FileExists(f); }
Status GetChildren(const std::string& dir, std::vector<std::string>* r) { Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
@ -336,7 +374,7 @@ class EnvWrapper : public Env {
void Schedule(void (*f)(void*), void* a) { void Schedule(void (*f)(void*), void* a) {
return target_->Schedule(f, a); return target_->Schedule(f, a);
} }
void StartThread(void (*f)(void*), void* a) { pthread_t StartThread(void (*f)(void*), void* a) {
return target_->StartThread(f, a); return target_->StartThread(f, a);
} }
virtual Status GetTestDirectory(std::string* path) { virtual Status GetTestDirectory(std::string* path) {
@ -355,6 +393,12 @@ class EnvWrapper : public Env {
Env* target_; Env* target_;
}; };
// Riak specific hack to allow runtime change
// of mapping size
extern volatile size_t gMapSize;
extern bool gFadviseWillNeed;
} // namespace leveldb } // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_ #endif // STORAGE_LEVELDB_INCLUDE_ENV_H_

View file

@ -0,0 +1,135 @@
// -------------------------------------------------------------------
//
// expiry.h: background expiry management for Basho's modified leveldb
//
// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// -------------------------------------------------------------------
#ifndef EXPIRY_H
#define EXPIRY_H
#include <limits.h>
#include <stdint.h>
#include "leveldb/env.h"
#include "leveldb/options.h"
#include "util/refobject_base.h"
namespace leveldb {
class Compaction;
class Logger;
struct ParsedInternalKey;
class Slice;
class SstCounters;
class Version;
class VersionEdit;
struct FileMetaData;
// Action codes passed as the first argument of an EleveldbRouter_t callback.
// Only one action is defined so far.
enum EleveldbRouterActions_t
{
eGetBucketProperties=1
}; // enum EleveldbRouterActions_t
// Signature of the router callback accepted by
// ExpiryModule::CreateExpiryModule(): an action code, the number of entries
// in Params, and an array of untyped parameter pointers.
// NOTE(review): the meaning of each Params entry and of the bool result is
// not visible in this header -- presumably action specific; confirm with the
// eleveldb side.
typedef bool (* EleveldbRouter_t)(EleveldbRouterActions_t Action, int ParamCount, const void ** Params);
// Base class for expiry handling.  Every hook below has a default body that
// reports "nothing expired / no error", so this class by itself leaves
// expiry switched off; derived classes override the hooks they care about.
class ExpiryModule : public RefObjectBase
{
public:
    virtual ~ExpiryModule() {}

    // Writes the expiry options to the LOG file.
    virtual void Dump(Logger * log) const
    {
        Log(log," Expiry: (none)");
    }

    // Cheap check that lets manifest logic (and similar code) find out
    // whether the additional expiry logic needs to be consulted at all.
    virtual bool ExpiryActivated() const
    {
        return false;
    }

    // Called by db/write_batch.cc MemTableInserter::Put().
    // Result: false signals an internal error.
    virtual bool MemTableInserterCallback(
        const Slice & Key,                // in: user's key about to be written
        const Slice & Value,              // in: user's value object
        ValueType & ValType,              // in/out: key type, the call may change it
        ExpiryTimeMicros & Expiry) const  // in/out: 0 or a specific expiry, the call may change it
    {
        return true;
    }

    // Called by db/dbformat.cc KeyRetirement::operator() and by
    // db/version_set.cc SaveValue().
    // Result: true when the key is expired, false when it is not.
    virtual bool KeyRetirementCallback(
        const ParsedInternalKey & Ikey) const
    {
        return false;
    }

    // Called by table/table_builder.cc TableBuilder::Add().
    // Result: false signals an internal error.
    virtual bool TableBuilderCallback(
        const Slice & Key,                // in: internal key
        SstCounters & Counters) const     // in/out: counters for the new sst table
    {
        return true;
    }

    // Called by db/memtable.cc MemTable::Get().
    // Result: true when the type/expiry is expired, false when it is not.
    virtual bool MemTableCallback(
        const Slice & Key) const          // in: leveldb internal key
    {
        return false;
    }

    // Called by db/version_set.cc VersionSet::Finalize() when no other
    // compaction was selected for a level.
    // Result: true when an expiry compaction is eligible.
    virtual bool CompactionFinalizeCallback(
        bool WantAll,                     // in: true - examine all expired files
        const Version & Ver,              // in: database state for examination
        int Level,                        // in: level to review for expiry
        VersionEdit * Edit) const         // out: NULL or destination of the delete list
    {
        return false;
    }

    // The module object itself can be subject to expiry, mostly for
    // bucket level properties in Riak EE.
    virtual uint64_t ExpiryModuleExpiryMicros()
    {
        return 0;
    }

    // Creates the derived ExpiryModule object matching the compile time
    // switch for open source or Basho enterprise edition features.
    static ExpiryModule * CreateExpiryModule(EleveldbRouter_t Router);

    // Cleans up global objects related to expiry.
    static void ShutdownExpiryModule();

    // Riak EE: stash a user created module with settings.
    virtual void NoteUserExpirySettings() {}

protected:
    // Construction is reserved for derived classes.
    ExpiryModule() {}

private:
    // Copying is disabled: declared private and given no body here.
    ExpiryModule(const ExpiryModule &);
    ExpiryModule & operator=(const ExpiryModule &);
}; // ExpiryModule
typedef RefPtr<class ExpiryModule> ExpiryPtr_t;
} // namespace leveldb
#endif // ifndef

View file

@ -23,9 +23,21 @@ namespace leveldb {
class Slice; class Slice;
class FilterPolicy { class FilterPolicy {
public: protected:
mutable const FilterPolicy * m_Next; // used by FilterInventory
public:
FilterPolicy()
: m_Next(NULL)
{};
virtual ~FilterPolicy(); virtual ~FilterPolicy();
// list pointer accessors
const FilterPolicy * GetNext() const {return(m_Next);};
void SetNext(const FilterPolicy * Next) const {m_Next=Next;};
// Return the name of this policy. Note that if the filter encoding // Return the name of this policy. Note that if the filter encoding
// changes in an incompatible way, the name returned by this method // changes in an incompatible way, the name returned by this method
// must be changed. Otherwise, old incompatible filters may be // must be changed. Otherwise, old incompatible filters may be
@ -47,6 +59,7 @@ class FilterPolicy {
// This method may return true or false if the key was not on the // This method may return true or false if the key was not on the
// list, but it should aim to return false with a high probability. // list, but it should aim to return false with a high probability.
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
}; };
// Return a new filter policy that uses a bloom filter with approximately // Return a new filter policy that uses a bloom filter with approximately
@ -64,7 +77,29 @@ class FilterPolicy {
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore // FilterPolicy (like NewBloomFilterPolicy) that does not ignore
// trailing spaces in keys. // trailing spaces in keys.
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
extern const FilterPolicy* NewBloomFilterPolicy2(int bits_per_key);
}
// Singly linked list of filter policies, chained through
// FilterPolicy::SetNext().
class FilterInventory
{
public:
    // Head of the list.  MUST be a static variable so that it is initialized
    // before any static objects have their initializers called.
    static const FilterPolicy * ListHead;

    // Pushes Filter onto the front of the list; a NULL Filter is ignored.
    // This might be called prior to the singleton FilterInventory object
    // being initialized.  NOT THREAD SAFE.
    static void AddFilterToInventory(const FilterPolicy * Filter)
    {
        if (NULL==Filter)
            return;

        Filter->SetNext(ListHead);
        ListHead=Filter;
    }
}; // class FilterInventory
} // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_ #endif // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_

View file

@ -17,6 +17,7 @@
#include "leveldb/slice.h" #include "leveldb/slice.h"
#include "leveldb/status.h" #include "leveldb/status.h"
#include "leveldb/options.h"
namespace leveldb { namespace leveldb {
@ -37,7 +38,7 @@ class Iterator {
// Valid() after this call iff the source is not empty. // Valid() after this call iff the source is not empty.
virtual void SeekToLast() = 0; virtual void SeekToLast() = 0;
// Position at the first key in the source that is at or past target. // Position at the first key in the source that at or past target
// The iterator is Valid() after this call iff the source contains // The iterator is Valid() after this call iff the source contains
// an entry that comes at or past target. // an entry that comes at or past target.
virtual void Seek(const Slice& target) = 0; virtual void Seek(const Slice& target) = 0;
@ -61,9 +62,13 @@ class Iterator {
// Return the value for the current entry. The underlying storage for // Return the value for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of // the returned slice is valid only until the next modification of
// the iterator. // the iterator.
// REQUIRES: Valid() // REQUIRES: !AtEnd() && !AtStart()
virtual Slice value() const = 0; virtual Slice value() const = 0;
// Riak specific: if a database iterator, returns key meta data
// REQUIRES: Valid()
virtual KeyMetaData & keymetadata() const {return(keymetadata_); };
// If an error has occurred, return it. Else return an ok status. // If an error has occurred, return it. Else return an ok status.
virtual Status status() const = 0; virtual Status status() const = 0;
@ -75,6 +80,10 @@ class Iterator {
typedef void (*CleanupFunction)(void* arg1, void* arg2); typedef void (*CleanupFunction)(void* arg1, void* arg2);
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
protected:
// mutable so reusable by derived classes
mutable KeyMetaData keymetadata_;
private: private:
struct Cleanup { struct Cleanup {
CleanupFunction function; CleanupFunction function;

View file

@ -6,15 +6,23 @@
#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ #define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
#include <stddef.h> #include <stddef.h>
#include <stdint.h>
#include <string>
#include <memory>
namespace leveldb { namespace leveldb {
class Cache; class Cache;
class Comparator; class Comparator;
class Env; class Env;
class ExpiryModule;
class FilterPolicy; class FilterPolicy;
class Logger; class Logger;
class Snapshot; class Snapshot;
namespace log
{
class Writer;
} // namespace log
// DB contents are stored in a set of blocks, each of which holds a // DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs. Each block may be compressed before // sequence of key,value pairs. Each block may be compressed before
@ -24,9 +32,34 @@ enum CompressionType {
// NOTE: do not change the values of existing entries, as these are // NOTE: do not change the values of existing entries, as these are
// part of the persistent format on disk. // part of the persistent format on disk.
kNoCompression = 0x0, kNoCompression = 0x0,
kSnappyCompression = 0x1 kSnappyCompression = 0x1,
kLZ4Compression = 0x2,
kNoCompressionAutomated = 0x3
}; };
// Originally located in db/dbformat.h. Now available publicly.
// Value types encoded as the last component of internal keys.
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
// data structures.
// NOTE(review): the per-value notes below are inferred from the enumerator
// names only -- confirm against db/dbformat.* before relying on them.
enum ValueType {
kTypeDeletion = 0x0, // presumably marks a deleted key
kTypeValue = 0x1, // presumably a plain value with no expiry information
kTypeValueWriteTime = 0x2, // presumably a value that carries its write time
kTypeValueExplicitExpiry = 0x3 // presumably a value with an explicit expiry time
};
// Originally located in db/dbformat.h
typedef uint64_t SequenceNumber;
typedef uint64_t ExpiryTimeMicros;
}; // namespace leveldb
//
// must follow ValueType declaration
#include "leveldb/expiry.h"
namespace leveldb {
// Options to control the behavior of a database (passed to DB::Open) // Options to control the behavior of a database (passed to DB::Open)
struct Options { struct Options {
// ------------------- // -------------------
@ -56,6 +89,14 @@ struct Options {
// Default: false // Default: false
bool paranoid_checks; bool paranoid_checks;
// Riak specific: this variable replaces paranoid_checks at
// one place in the code. This variable alone controls whether or not
// compaction read operations check CRC values. Riak needs
// the compaction CRC check, but not other paranoid_checks ... so
// this independent control.
// Default: true
bool verify_compactions;
// Use the specified object to interact with the environment, // Use the specified object to interact with the environment,
// e.g. to read/write files, schedule background work, etc. // e.g. to read/write files, schedule background work, etc.
// Default: Env::Default() // Default: Env::Default()
@ -85,7 +126,7 @@ struct Options {
// Number of open files that can be used by the DB. You may need to // Number of open files that can be used by the DB. You may need to
// increase this if your database has a large working set (budget // increase this if your database has a large working set (budget
// one open file per 2MB of working set). // one open file per 2MB of working set).
// // RIAK: NO LONGER USED
// Default: 1000 // Default: 1000
int max_open_files; int max_open_files;
@ -105,6 +146,15 @@ struct Options {
// Default: 4K // Default: 4K
size_t block_size; size_t block_size;
// Riak specific: non-zero value activates code to automatically
// increase block_size as needed to ensure maximum number of files
// are available in the file cache. The value indicates how many
// incremental increases to use between the original block_size
// and largest, reasonable block_size.
//
// Default: 16
int block_size_steps;
// Number of keys between restart points for delta encoding of keys. // Number of keys between restart points for delta encoding of keys.
// This parameter can be changed dynamically. Most clients should // This parameter can be changed dynamically. Most clients should
// leave this parameter alone. // leave this parameter alone.
@ -112,18 +162,6 @@ struct Options {
// Default: 16 // Default: 16
int block_restart_interval; int block_restart_interval;
// Leveldb will write up to this amount of bytes to a file before
// switching to a new one.
// Most clients should leave this parameter alone. However if your
// filesystem is more efficient with larger files, you could
// consider increasing the value. The downside will be longer
// compactions and hence longer latency/performance hiccups.
// Another reason to increase this parameter might be when you are
// initially populating a large database.
//
// Default: 2MB
size_t max_file_size;
// Compress blocks using the specified compression algorithm. This // Compress blocks using the specified compression algorithm. This
// parameter can be changed dynamically. // parameter can be changed dynamically.
// //
@ -140,12 +178,6 @@ struct Options {
// efficiently detect that and will switch to uncompressed mode. // efficiently detect that and will switch to uncompressed mode.
CompressionType compression; CompressionType compression;
// EXPERIMENTAL: If true, append to existing MANIFEST and log files
// when a database is opened. This can significantly speed up open.
//
// Default: currently false, but may become true later.
bool reuse_logs;
// If non-NULL, use the specified filter policy to reduce disk reads. // If non-NULL, use the specified filter policy to reduce disk reads.
// Many applications will benefit from passing the result of // Many applications will benefit from passing the result of
// NewBloomFilterPolicy() here. // NewBloomFilterPolicy() here.
@ -153,8 +185,84 @@ struct Options {
// Default: NULL // Default: NULL
const FilterPolicy* filter_policy; const FilterPolicy* filter_policy;
// Riak specific flag used to indicate when database is open
// as part of a Repair operation. Default is false
bool is_repair;
// Riak specific flag to mark Riak internal database versus
// user database. (User database gets larger cache resources.)
bool is_internal_db;
// Riak replacement for max_open_files and block_cache. This is
// TOTAL memory to be used by leveldb across ALL DATABASES.
// Most recent value seen upon database open, wins. Zero for default.
uint64_t total_leveldb_mem;
// Riak specific option specifying block cache space that cannot
// be released for page cache use. The space may still be
// released for file cache.
uint64_t block_cache_threshold;
// Riak option to override most memory modeling and create
// smaller memory footprint for developers. Helps when
// running large number of databases and multiple VMs. Do
// NOT use this option if making performance measurements.
// Default: false
bool limited_developer_mem;
// The size of each mmap'ed file; choose 0 for the default (20M)
uint64_t mmap_size;
// Riak option to adjust aggressive delete behavior.
// - zero disables aggressive delete
// - positive value indicates how many deletes must exist
// in a file for it to be compacted due to deletes
uint64_t delete_threshold;
// Riak specific flag used to indicate when fadvise() management
// should default to WILLNEED instead of DONTNEED. Default is false
bool fadvise_willneed;
// *****
// Riak specific options for establishing two tiers of disk arrays.
// All three tier options must be valid for the option to activate.
// When active, leveldb directories are constructed using either
// the fast or slow prefix followed by the database name given
// in the DB::Open call. (a synonym for "prefix" is "mount")
// *****
// Riak specific option setting the level number at which the
// "tiered_slow_prefix" should be used. Default is zero which
// disables the option. Valid values are 1 to 6. 3 or 4 recommended.
unsigned tiered_slow_level;
// Riak specific option with the path prefix used for "fast" disk
// array. levels 0 to tiered_slow_level-1 use this path prefix
std::string tiered_fast_prefix;
// Riak specific option with the path prefix used for "slow" disk
// array. levels tiered_slow_level through 6 use this path prefix
std::string tiered_slow_prefix;
// Riak specific option that writes a list of open table files
// to disk on close then automatically opens same files again
// upon restart.
bool cache_object_warming;
// Riak specific object that defines expiry policy for data
// written to leveldb.
ExpiryPtr_t expiry_module;
// Create an Options object with default values for all fields. // Create an Options object with default values for all fields.
Options(); Options();
void Dump(Logger * log) const;
bool ExpiryActivated() const
{return(NULL!=expiry_module.get() && expiry_module->ExpiryActivated());};
private:
}; };
// Options that control read operations // Options that control read operations
@ -171,16 +279,57 @@ struct ReadOptions {
// If "snapshot" is non-NULL, read as of the supplied snapshot // If "snapshot" is non-NULL, read as of the supplied snapshot
// (which must belong to the DB that is being read and which must // (which must belong to the DB that is being read and which must
// not have been released). If "snapshot" is NULL, use an implicit // not have been released). If "snapshot" is NULL, use an implicit
// snapshot of the state at the beginning of this read operation. // snapshot of the state at the beginning of this read operation.
// Default: NULL // Default: NULL
const Snapshot* snapshot; const Snapshot* snapshot;
// Riak specific flag, currently used within Erlang adaptor
// to enable automatic delete and new of fresh snapshot
// and database iterator objects for long running iterations
// (only supports iterator NEXT operations).
// Default: false
bool iterator_refresh;
ReadOptions() ReadOptions()
: verify_checksums(false), : verify_checksums(true),
fill_cache(true), fill_cache(true),
snapshot(NULL) { snapshot(NULL),
iterator_refresh(false),
is_compaction(false),
env(NULL),
info_log(NULL)
{
} }
// accessors to the private data
bool IsCompaction() const {return(is_compaction);};
Logger * GetInfoLog() const {return(info_log);};
const std::string & GetDBName() const {return(dbname);};
Env * GetEnv() const {return(env);};
// The items below are internal options, not for external manipulation.
// They are populated by VersionSet::MakeInputIterator only during compaction operations
private:
friend class VersionSet;
// true when used on background compaction
bool is_compaction;
// Database name for potential creation of bad blocks file
std::string dbname;
// Needed for file operations if creating bad blocks file
Env * env;
// Open log file for error notifications
// Only valid when is_compaction==true
Logger* info_log;
}; };
// Options that control write operations // Options that control write operations
@ -208,6 +357,22 @@ struct WriteOptions {
} }
}; };
// Riak-specific record of per-key metadata that can be handed back to
// the caller during a get or iterate operation.
struct KeyMetaData {
  ValueType m_Type;           // entry kind; constructed as kTypeValue
  SequenceNumber m_Sequence;  // output only, leveldb internal
  ExpiryTimeMicros m_Expiry;  // microseconds since Epoch, UTC

  // Default state: ordinary value, sequence 0, expiry 0.
  KeyMetaData()
      : m_Type(kTypeValue),
        m_Sequence(0),
        m_Expiry(0) {}
};  // struct KeyMetaData
// Returns a C string; going by the name it presumably describes the
// options this library was compiled with -- TODO confirm against the
// definition, which is not part of this header.
const char * CompileOptionsString();
} // namespace leveldb } // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ #endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_

View file

@ -0,0 +1,329 @@
// -------------------------------------------------------------------
//
// perf_count.h: performance counters LevelDB
//
// Copyright (c) 2012-2016 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// -------------------------------------------------------------------
#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
#define STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
#include <stdint.h>
#include <string>
#include "leveldb/status.h"
namespace leveldb {
// Names for the slots of the per-.sst-file counter array
// (SstCounters::m_Counter is sized by eSstCountEnumSize). Values are
// numbered explicitly; do not renumber existing entries.
enum SstCountEnum
{
//
// array index values/names
//
eSstCountKeys=0, //!< how many keys in this sst
eSstCountBlocks=1, //!< how many blocks in this sst
eSstCountCompressAborted=2,//!< how many blocks attempted compression and aborted use
eSstCountKeySize=3, //!< byte count of all keys
eSstCountValueSize=4, //!< byte count of all values
eSstCountBlockSize=5, //!< byte count of all blocks (pre-compression)
eSstCountBlockWriteSize=6, //!< post-compression size, or BlockSize if no compression
eSstCountIndexKeys=7, //!< how many keys in the index block
eSstCountKeyLargest=8, //!< largest key in sst
eSstCountKeySmallest=9, //!< smallest key in sst
eSstCountValueLargest=10, //!< largest value in sst
eSstCountValueSmallest=11, //!< smallest value in sst
eSstCountDeleteKey=12, //!< tombstone count
eSstCountBlockSizeUsed=13, //!< Options::block_size used with this file
eSstCountUserDataSize=14, //!< post-compression size of non-metadata (user keys/values/block overhead)
eSstCountExpiry1=15, //!< undocumented expiry counter 1
eSstCountExpiry2=16, //!< undocumented expiry counter 2
eSstCountExpiry3=17, //!< undocumented expiry counter 3
eSstCountSequence=18, //!< highest sequence number in file
// must follow last index name to represent size of array
eSstCountEnumSize, //!< size of the array described by the enum values
// Not an array index: a version tag declared after eSstCountEnumSize so
// it does not change the array size. NOTE(review): presumably matched
// against SstCounters::m_Version -- confirm in perf_count.cc.
eSstCountVersion=1
}; // enum SstCountEnum
// Fixed-size set of 64-bit counters describing one .sst file; slots are
// addressed with SstCountEnum values.
class SstCounters {
 protected:
  bool m_IsReadOnly;       //!< set when data decoded from a file
  uint32_t m_Version;      //!< object revision identification
  uint32_t m_CounterSize;  //!< number of objects in m_Counter
  uint64_t m_Counter[eSstCountEnumSize];

 public:
  // Constructor (defined out of line).
  SstCounters();

  // Serialize the counters into Dst in their on-disk form.
  void EncodeTo(std::string& Dst) const;

  // Rebuild the member data from a block previously produced by EncodeTo.
  Status DecodeFrom(const Slice& src);

  // Increment the counter at Index.
  uint64_t Inc(unsigned Index);

  // Add Amount to the counter at Index.
  uint64_t Add(unsigned Index, uint64_t Amount);

  // Read the counter at Index.
  uint64_t Value(unsigned Index) const;

  // Store a value into the counter at Index.
  void Set(unsigned Index, uint64_t);

  // Number of counters held by this object.
  uint32_t Size() const { return m_CounterSize; }

  // printf all values.
  void Dump() const;
};  // class SstCounters
// Early declaration of the global counters pointer. The elaborated
// "struct PerformanceCounters" also forward-declares the type, which is
// defined further down in this header.
extern struct PerformanceCounters * gPerfCounters;
// Names for the slots of PerformanceCounters::m_Counter (that array is
// sized by ePerfCountEnumSize). Entries are numbered explicitly; keep the
// existing numbers stable when adding counters.
enum PerformanceCountersEnum
{
//
// array index values/names
// (enum explicitly numbered to allow future edits / moves / inserts)
//
ePerfROFileOpen=0, //!< PosixMmapReadableFile open
ePerfROFileClose=1, //!< closed
ePerfROFileUnmap=2, //!< unmap without close
ePerfRWFileOpen=3, //!< PosixMmapFile open
ePerfRWFileClose=4, //!< closed
ePerfRWFileUnmap=5, //!< unmap without close
ePerfApiOpen=6, //!< Count of DB::Open completions
ePerfApiGet=7, //!< Count of DBImpl::Get completions
ePerfApiWrite=8, //!< Count of DBImpl::Write completions
ePerfWriteSleep=9, //!< DBImpl::MakeRoomForWrite called sleep
ePerfWriteWaitImm=10, //!< DBImpl::MakeRoomForWrite called Wait on Imm compact
ePerfWriteWaitLevel0=11,//!< DBImpl::MakeRoomForWrite called Wait on Level0 compact
ePerfWriteNewMem=12, //!< DBImpl::MakeRoomForWrite created new memory log
ePerfWriteError=13, //!< DBImpl::MakeRoomForWrite saw bg_error_
ePerfWriteNoWait=14, //!< DBImpl::MakeRoomForWrite took no action
ePerfGetMem=15, //!< DBImpl::Get read from memory log
ePerfGetImm=16, //!< DBImpl::Get read from previous memory log
ePerfGetVersion=17, //!< DBImpl::Get read from Version object
// code ASSUMES the levels are in numerical order,
// i.e. based off of ePerfSearchLevel0
ePerfSearchLevel0=18, //!< Version::Get read searched one or more files here
ePerfSearchLevel1=19, //!< Version::Get read searched one or more files here
ePerfSearchLevel2=20, //!< Version::Get read searched one or more files here
ePerfSearchLevel3=21, //!< Version::Get read searched one or more files here
ePerfSearchLevel4=22, //!< Version::Get read searched one or more files here
ePerfSearchLevel5=23, //!< Version::Get read searched one or more files here
ePerfSearchLevel6=24, //!< Version::Get read searched one or more files here
ePerfTableCached=25, //!< TableCache::FindTable found table in cache
ePerfTableOpened=26, //!< TableCache::FindTable had to open table file
ePerfTableGet=27, //!< TableCache::Get used to retrieve a key
ePerfBGCloseUnmap=28, //!< PosixEnv::BGThread started Unmap/Close job
ePerfBGCompactImm=29, //!< PosixEnv::BGThread started compaction of Imm
ePerfBGNormal=30, //!< PosixEnv::BGThread started normal compaction job
ePerfBGCompactLevel0=31,//!< PosixEnv::BGThread started compaction of Level0
ePerfBlockFiltered=32, //!< Table::BlockReader search stopped due to filter
ePerfBlockFilterFalse=33,//!< Table::BlockReader gave a false positive for match
ePerfBlockCached=34, //!< Table::BlockReader found block in cache
ePerfBlockRead=35, //!< Table::BlockReader read block from disk
ePerfBlockFilterRead=36,//!< Table::ReadMeta filter loaded from file
ePerfBlockValidGet=37, //!< Table::InternalGet has valid iterator
ePerfDebug0=38, //!< Developer debug counters, moveable
ePerfDebug1=39, //!< Developer debug counters, moveable
ePerfDebug2=40, //!< Developer debug counters, moveable
ePerfDebug3=41, //!< Developer debug counters, moveable
ePerfDebug4=42, //!< Developer debug counters, moveable
ePerfReadBlockError=43, //!< crc or compression error in ReadBlock (format.cc)
ePerfIterNew=44, //!< Count of DBImpl::NewDBIterator calls
ePerfIterNext=45, //!< Count of DBIter::Next calls
ePerfIterPrev=46, //!< Count of DBIter::Prev calls
ePerfIterSeek=47, //!< Count of DBIter::Seek calls
ePerfIterSeekFirst=48, //!< Count of DBIter::SeekFirst calls
ePerfIterSeekLast=49, //!< Count of DBIter::SeekLast calls
ePerfIterDelete=50, //!< Count of DBIter::~DBIter
ePerfElevelDirect=51, //!< eleveldb's FindWaitingThread went direct to thread
ePerfElevelQueued=52, //!< eleveldb's FindWaitingThread queued work item
ePerfElevelDequeued=53, //!< eleveldb's worker took item from backlog queue
ePerfElevelRefCreate=54,//!< eleveldb RefObject constructed
ePerfElevelRefDelete=55,//!< eleveldb RefObject destructed
ePerfThrottleGauge=56, //!< current throttle value
ePerfThrottleCounter=57,//!< running throttle by seconds
ePerfThrottleMicros0=58,//!< level 0 micros spent compacting
ePerfThrottleKeys0=59, //!< level 0 keys processed
ePerfThrottleBacklog0=60,//!< backlog at time of posting (level0)
ePerfThrottleCompacts0=61,//!< number of level 0 compactions
ePerfThrottleMicros1=62,//!< level 1+ micros spent compacting
ePerfThrottleKeys1=63, //!< level 1+ keys processed
ePerfThrottleBacklog1=64,//!< backlog at time of posting (level1+)
ePerfThrottleCompacts1=65,//!< number of level 1+ compactions
ePerfBGWriteError=66, //!< error in write/close, see syslog
ePerfThrottleWait=67, //!< milliseconds of throttle wait
ePerfThreadError=68, //!< system error on thread related call, no LOG access
ePerfBGImmDirect=69, //!< count Imm compactions happened directly
ePerfBGImmQueued=70, //!< count Imm compactions placed on queue
ePerfBGImmDequeued=71, //!< count Imm compactions removed from queue
ePerfBGImmWeighted=72, //!< total microseconds item spent on queue
ePerfBGUnmapDirect=73, //!< count Unmap operations happened directly
ePerfBGUnmapQueued=74, //!< count Unmap operations placed on queue
ePerfBGUnmapDequeued=75,//!< count Unmap operations removed from queue
ePerfBGUnmapWeighted=76,//!< total microseconds item spent on queue
ePerfBGLevel0Direct=77, //!< count Level0 compactions happened directly
ePerfBGLevel0Queued=78, //!< count Level0 compactions placed on queue
ePerfBGLevel0Dequeued=79,//!< count Level0 compactions removed from queue
ePerfBGLevel0Weighted=80,//!< total microseconds item spent on queue
ePerfBGCompactDirect=81, //!< count generic compactions happened directly
ePerfBGCompactQueued=82, //!< count generic compactions placed on queue
ePerfBGCompactDequeued=83,//!< count generic compactions removed from queue
ePerfBGCompactWeighted=84,//!< total microseconds item spent on queue
ePerfFileCacheInsert=85, //!< total bytes inserted into file cache
ePerfFileCacheRemove=86, //!< total bytes removed from file cache
ePerfBlockCacheInsert=87, //!< total bytes inserted into block cache
ePerfBlockCacheRemove=88, //!< total bytes removed from block cache
ePerfApiDelete=89, //!< Count of DB::Delete
ePerfBGMove=90, //!< compaction was a successful move
ePerfBGMoveFail=91, //!< compaction move failed, regular compaction attempted
ePerfThrottleUnadjusted=92,//!< current unadjusted throttle gauge
// this one was added to the other ePerfElevelXxx counters above when we backported HotThreadPool to eleveldb
ePerfElevelWeighted=93, //!< total microseconds item spent on queue
ePerfExpiredKeys=94, //!< key physically removed because it expired
ePerfExpiredFiles=95, //!< entire file removed because all keys expired
ePerfSyslogWrite=96, //!< logged message to syslog
ePerfBackupStarted=97, //!< hot backup initiated
ePerfBackupError=98, //!< hot backup had an error
ePerfPropCacheHit=99, //!< property cache had data
ePerfPropCacheMiss=100, //!< property cache had to look up data
ePerfPropCacheError=101, //!< no property cache entry built/located
// must follow last index name to represent size of array
// (ASSUMES previous enum is highest value)
ePerfCountEnumSize, //!< size of the array described by the enum values
// The two entries below are constants, not array indices; they are
// declared after ePerfCountEnumSize so they do not affect the array size.
ePerfVersion=1, //!< structure versioning
ePerfKey=41207 //!< random number as shared memory identifier
};
// Static description attached to one performance counter.
struct PerfCounterAttributes {
  const char* m_PerfCounterName;   //!< text description
  const bool m_PerfDiscretionary;  //!< true if ok to disable
};  // PerfCounterAttributes
//
// Block of 64-bit counters addressed by PerformanceCountersEnum values.
//
// Do NOT use virtual functions. This structure will be aligned at different
// locations in multiple processes. Things can get messy with virtuals.
// The order of the non-static data members is kept exactly as declared
// for the same reason.
struct PerformanceCounters {
 public:
  static int m_LastError;

 protected:
  uint32_t m_Version;      //!< object revision identification
  uint32_t m_CounterSize;  //!< number of objects in m_Counter
  volatile uint64_t m_Counter[ePerfCountEnumSize];

  static const PerfCounterAttributes m_PerfCounterAttr[];
  static int m_PerfSharedId;
  static volatile uint64_t m_BogusCounter;  //!< for out of range GetPtr calls

 public:
  // only called for local object, not for shared memory
  PerformanceCounters();

  // Does this executable's idea of the layout match the object's? True
  // when the object holds at least ePerfCountEnumSize counters and its
  // version equals ePerfVersion.
  bool VersionTest() {
    const bool size_ok = ePerfCountEnumSize <= m_CounterSize;
    const bool version_ok = ePerfVersion == m_Version;
    return size_ok && version_ok;
  }

  // Overwrite the stored version and counter count
  // (mostly for perf_count_test.cc).
  void SetVersion(uint32_t Version, uint32_t CounterSize) {
    m_Version = Version;
    m_CounterSize = CounterSize;
  }

  static PerformanceCounters* Init(bool IsReadOnly);
  static int Close(PerformanceCounters* Counts);

  uint64_t Inc(unsigned Index);
  uint64_t Dec(unsigned Index);

  // add value to the counter
  uint64_t Add(unsigned Index, uint64_t Amount);

  // return value of a counter
  uint64_t Value(unsigned Index) const;

  // set a value
  void Set(unsigned Index, uint64_t);

  volatile const uint64_t* GetPtr(unsigned Index) const;
  static const char* GetNamePtr(unsigned Index);
  int LookupCounter(const char* Name);
  void Dump();
};  // struct PerformanceCounters
// Process-wide pointer to the counter block in use.
// NOTE(review): presumably obtained via PerformanceCounters::Init -- confirm.
extern PerformanceCounters * gPerfCounters;
// Global flag; the name suggests counting is skipped while it is true --
// TODO confirm where it is read.
extern volatile bool gPerfCountersDisabled;
} // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_

View file

@ -94,7 +94,7 @@ inline bool operator!=(const Slice& x, const Slice& y) {
} }
inline int Slice::compare(const Slice& b) const { inline int Slice::compare(const Slice& b) const {
const size_t min_len = (size_ < b.size_) ? size_ : b.size_; const int min_len = (size_ < b.size_) ? size_ : b.size_;
int r = memcmp(data_, b.data_, min_len); int r = memcmp(data_, b.data_, min_len);
if (r == 0) { if (r == 0) {
if (size_ < b.size_) r = -1; if (size_ < b.size_) r = -1;

View file

@ -60,12 +60,6 @@ class Status {
// Returns true iff the status indicates an IOError. // Returns true iff the status indicates an IOError.
bool IsIOError() const { return code() == kIOError; } bool IsIOError() const { return code() == kIOError; }
// Returns true iff the status indicates a NotSupportedError.
bool IsNotSupportedError() const { return code() == kNotSupported; }
// Returns true iff the status indicates an InvalidArgument.
bool IsInvalidArgument() const { return code() == kInvalidArgument; }
// Return a string representation of this status suitable for printing. // Return a string representation of this status suitable for printing.
// Returns the string "OK" for success. // Returns the string "OK" for success.
std::string ToString() const; std::string ToString() const;

View file

@ -7,6 +7,7 @@
#include <stdint.h> #include <stdint.h>
#include "leveldb/iterator.h" #include "leveldb/iterator.h"
#include "leveldb/perf_count.h"
namespace leveldb { namespace leveldb {
@ -40,7 +41,7 @@ class Table {
uint64_t file_size, uint64_t file_size,
Table** table); Table** table);
~Table(); virtual ~Table();
// Returns a new iterator over the table contents. // Returns a new iterator over the table contents.
// The result of NewIterator() is initially invalid (caller must // The result of NewIterator() is initially invalid (caller must
@ -55,7 +56,29 @@ class Table {
// be close to the file length. // be close to the file length.
uint64_t ApproximateOffsetOf(const Slice& key) const; uint64_t ApproximateOffsetOf(const Slice& key) const;
private: // return a static copy of the table's counters.
SstCounters GetSstCounters() const;
// riak routine to retrieve total memory footprint of an open table
// object in memory
size_t TableObjectSize();
// riak routine to retrieve disk size of table file
// ("virtual" is for unit test activities)
virtual uint64_t GetFileSize();
// Riak routine to request bloom filter load on
// second read operation (not iterator read)
bool ReadFilter();
// access routines for testing tools, not for public use
Block * TEST_GetIndexBlock();
size_t TEST_TableObjectSize() {return(TableObjectSize());};
size_t TEST_FilterDataSize();
static Iterator* TEST_BlockReader(void* Ptr, const ReadOptions& ROptions, const Slice& SliceReturn)
{return(BlockReader(Ptr, ROptions, SliceReturn));};
protected: // was private, made protected for unit tests
struct Rep; struct Rep;
Rep* rep_; Rep* rep_;
@ -69,11 +92,12 @@ class Table {
Status InternalGet( Status InternalGet(
const ReadOptions&, const Slice& key, const ReadOptions&, const Slice& key,
void* arg, void* arg,
void (*handle_result)(void* arg, const Slice& k, const Slice& v)); bool (*handle_result)(void* arg, const Slice& k, const Slice& v));
void ReadMeta(const Footer& footer); void ReadMeta(const Footer& footer);
void ReadFilter(const Slice& filter_handle_value); void ReadFilter(class BlockHandle & filter_handle_value, const class FilterPolicy * policy);
void ReadSstCounters(const Slice& sst_counters_handle_value);
// No copying allowed // No copying allowed
Table(const Table&); Table(const Table&);

View file

@ -74,6 +74,14 @@ class TableBuilder {
// Finish() call, returns the size of the final generated file. // Finish() call, returns the size of the final generated file.
uint64_t FileSize() const; uint64_t FileSize() const;
// Number of delete tombstones so far.
uint64_t NumDeletes() const;
// Retrieve expiry control values
uint64_t GetExpiryWriteLow() const;
uint64_t GetExpiryWriteHigh() const;
uint64_t GetExpiryExplicitHigh() const;
private: private:
bool ok() const { return status().ok(); } bool ok() const { return status().ok(); }
void WriteBlock(BlockBuilder* block, BlockHandle* handle); void WriteBlock(BlockBuilder* block, BlockHandle* handle);

View file

@ -23,6 +23,7 @@
#include <string> #include <string>
#include "leveldb/status.h" #include "leveldb/status.h"
#include "leveldb/options.h"
namespace leveldb { namespace leveldb {
@ -34,7 +35,7 @@ class WriteBatch {
~WriteBatch(); ~WriteBatch();
// Store the mapping "key->value" in the database. // Store the mapping "key->value" in the database.
void Put(const Slice& key, const Slice& value); void Put(const Slice& key, const Slice& value, const KeyMetaData * meta=NULL);
// If the database contains a mapping for "key", erase it. Else do nothing. // If the database contains a mapping for "key", erase it. Else do nothing.
void Delete(const Slice& key); void Delete(const Slice& key);
@ -46,7 +47,8 @@ class WriteBatch {
class Handler { class Handler {
public: public:
virtual ~Handler(); virtual ~Handler();
virtual void Put(const Slice& key, const Slice& value) = 0; virtual void Put(const Slice& key, const Slice& value,
const ValueType & type, const ExpiryTimeMicros & expiry) = 0;
virtual void Delete(const Slice& key) = 0; virtual void Delete(const Slice& key) = 0;
}; };
Status Iterate(Handler* handler) const; Status Iterate(Handler* handler) const;

View file

@ -1,92 +0,0 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Test for issue 178: a manual compaction causes deleted data to reappear.
#include <iostream>
#include <sstream>
#include <cstdlib>
#include "leveldb/db.h"
#include "leveldb/write_batch.h"
#include "util/testharness.h"
namespace {
// Number of keys the test writes into each of its two key ranges.
const int kNumKeys = 1100000;
// Builds the first-range key for index i: "my_key_" followed by the
// decimal form of i.
std::string Key1(int i) {
  std::string key("my_key_");
  key += std::to_string(i);
  return key;
}
// Builds the second-range key for index i: the matching first-range key
// with "_xxx" appended.
std::string Key2(int i) {
  std::string key = Key1(i);
  key += "_xxx";
  return key;
}
class Issue178 { };
TEST(Issue178, Test) {
// Get rid of any state from an old run.
std::string dbpath = leveldb::test::TmpDir() + "/leveldb_cbug_test";
DestroyDB(dbpath, leveldb::Options());
// Open database. Disable compression since it affects the creation
// of layers and the code below is trying to test against a very
// specific scenario.
leveldb::DB* db;
leveldb::Options db_options;
db_options.create_if_missing = true;
db_options.compression = leveldb::kNoCompression;
ASSERT_OK(leveldb::DB::Open(db_options, dbpath, &db));
// create first key range
leveldb::WriteBatch batch;
for (size_t i = 0; i < kNumKeys; i++) {
batch.Put(Key1(i), "value for range 1 key");
}
ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));
// create second key range
batch.Clear();
for (size_t i = 0; i < kNumKeys; i++) {
batch.Put(Key2(i), "value for range 2 key");
}
ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));
// delete second key range
batch.Clear();
for (size_t i = 0; i < kNumKeys; i++) {
batch.Delete(Key2(i));
}
ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));
// compact database
std::string start_key = Key1(0);
std::string end_key = Key1(kNumKeys - 1);
leveldb::Slice least(start_key.data(), start_key.size());
leveldb::Slice greatest(end_key.data(), end_key.size());
// commenting out the line below causes the example to work correctly
db->CompactRange(&least, &greatest);
// count the keys
leveldb::Iterator* iter = db->NewIterator(leveldb::ReadOptions());
size_t num_keys = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
num_keys++;
}
delete iter;
ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";
// close database
delete db;
DestroyDB(dbpath, leveldb::Options());
}
} // anonymous namespace
// Test-binary entry point: runs the registered tests and uses the
// harness result as the process exit code. The arguments are not used.
int main(int argc, char** argv) {
  const int exit_code = leveldb::test::RunAllTests();
  return exit_code;
}

View file

@ -1,59 +0,0 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Test for issue 200: when iterator switches direction from backward
// to forward, the current key can be yielded unexpectedly if a new
// mutation has been added just before the current key.
#include "leveldb/db.h"
#include "util/testharness.h"
namespace leveldb {
// Empty fixture type named by the TEST macro below.
class Issue200 { };
// Regression test for issue 200: when an iterator switches direction from
// backward to forward, the current key must not be yielded again even if a
// new mutation was added just before it. The order of the statements
// below is the scenario itself.
TEST(Issue200, Test) {
// Get rid of any state from an old run.
std::string dbpath = test::TmpDir() + "/leveldb_issue200_test";
DestroyDB(dbpath, Options());
DB *db;
Options options;
options.create_if_missing = true;
ASSERT_OK(DB::Open(options, dbpath, &db));
// Seed five single-character keys "1".."5".
WriteOptions write_options;
ASSERT_OK(db->Put(write_options, "1", "b"));
ASSERT_OK(db->Put(write_options, "2", "c"));
ASSERT_OK(db->Put(write_options, "3", "d"));
ASSERT_OK(db->Put(write_options, "4", "e"));
ASSERT_OK(db->Put(write_options, "5", "f"));
// The iterator is created before the extra write below.
ReadOptions read_options;
Iterator *iter = db->NewIterator(read_options);
// Add an element that should not be reflected in the iterator.
ASSERT_OK(db->Put(write_options, "25", "cd"));
// Walk backward from "5" to "3", then forward again; "25" must never
// show up between "3" and "4".
iter->Seek("5");
ASSERT_EQ(iter->key().ToString(), "5");
iter->Prev();
ASSERT_EQ(iter->key().ToString(), "4");
iter->Prev();
ASSERT_EQ(iter->key().ToString(), "3");
iter->Next();
ASSERT_EQ(iter->key().ToString(), "4");
iter->Next();
ASSERT_EQ(iter->key().ToString(), "5");
// Release the iterator before the database, then remove the files.
delete iter;
delete db;
DestroyDB(dbpath, options);
}
} // namespace leveldb
// Test-binary entry point: runs the registered tests and uses the
// harness result as the process exit code. The arguments are not used.
int main(int argc, char** argv) {
  const int exit_code = leveldb::test::RunAllTests();
  return exit_code;
}

Some files were not shown because too many files have changed in this diff Show more