Compare commits: master...experiment
4 commits: 1069eb65b5, 83319b7f31, 6b8935718e, 05d89e91cf

209 changed files with 23549 additions and 8105 deletions
@ -475,7 +475,6 @@ lbrycrdd_LDADD = \
|
|||
$(LIBBITCOIN_CONSENSUS) \
|
||||
$(LIBBITCOIN_CRYPTO) \
|
||||
$(LIBLEVELDB) \
|
||||
$(LIBLEVELDB_SSE42) \
|
||||
$(LIBMEMENV) \
|
||||
$(LIBSECP256K1)
|
||||
|
||||
|
@ -573,7 +572,7 @@ $(top_srcdir)/$(subdir)/config/bitcoin-config.h.in: $(am__configure_deps)
|
|||
clean-local:
|
||||
-$(MAKE) -C secp256k1 clean
|
||||
-$(MAKE) -C univalue clean
|
||||
-rm -f leveldb/*/*.gcda leveldb/*/*.gcno leveldb/helpers/memenv/*.gcda leveldb/helpers/memenv/*.gcno
|
||||
-$(MAKE) -C leveldb clean
|
||||
-rm -f config.h
|
||||
-rm -rf test/__pycache__
|
||||
|
||||
|
|
|
@ -42,7 +42,6 @@ bench_bench_bitcoin_LDADD = \
|
|||
$(LIBBITCOIN_CONSENSUS) \
|
||||
$(LIBBITCOIN_CRYPTO) \
|
||||
$(LIBLEVELDB) \
|
||||
$(LIBLEVELDB_SSE42) \
|
||||
$(LIBMEMENV) \
|
||||
$(LIBSECP256K1) \
|
||||
$(LIBUNIVALUE)
|
||||
|
|
|
@ -2,148 +2,23 @@
|
|||
# Distributed under the MIT software license, see the accompanying
|
||||
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
||||
|
||||
SUBDIRS = leveldb
|
||||
|
||||
LIBLEVELDB_INT = leveldb/libleveldb.a
|
||||
LIBMEMENV_INT = leveldb/libmemenv.a
|
||||
LIBLEVELDB_SSE42_INT = leveldb/libleveldb_sse42.a
|
||||
|
||||
EXTRA_LIBRARIES += $(LIBLEVELDB_INT)
|
||||
EXTRA_LIBRARIES += $(LIBMEMENV_INT)
|
||||
EXTRA_LIBRARIES += $(LIBLEVELDB_SSE42_INT)
|
||||
|
||||
LIBLEVELDB += $(LIBLEVELDB_INT)
|
||||
LIBMEMENV += $(LIBMEMENV_INT)
|
||||
LIBLEVELDB_SSE42 = $(LIBLEVELDB_SSE42_INT)
|
||||
|
||||
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/include
|
||||
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/helpers/memenv
|
||||
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb
|
||||
|
||||
LEVELDB_CPPFLAGS_INT =
|
||||
LEVELDB_CPPFLAGS_INT += -I$(srcdir)/leveldb
|
||||
LEVELDB_CPPFLAGS_INT += $(LEVELDB_TARGET_FLAGS)
|
||||
LEVELDB_CPPFLAGS_INT += -DLEVELDB_ATOMIC_PRESENT
|
||||
LEVELDB_CPPFLAGS_INT += -D__STDC_LIMIT_MACROS
|
||||
leveldb/libleveldb.a:
|
||||
$(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb
|
||||
|
||||
if TARGET_WINDOWS
|
||||
LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1
|
||||
else
|
||||
LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_POSIX
|
||||
endif
|
||||
|
||||
leveldb_libleveldb_a_CPPFLAGS = $(AM_CPPFLAGS) $(LEVELDB_CPPFLAGS_INT) $(LEVELDB_CPPFLAGS)
|
||||
leveldb_libleveldb_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
|
||||
|
||||
leveldb_libleveldb_a_SOURCES=
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/port/atomic_pointer.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port_example.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/port/win/stdint.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/port/thread_annotations.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/db.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/options.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/comparator.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/filter_policy.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/slice.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table_builder.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/env.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/c.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/iterator.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/cache.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/dumpfile.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/write_batch.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/status.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/log_format.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch_internal.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/filename.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/builder.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/skiplist.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/snapshot.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/block.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/merger.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/format.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/iterator_wrapper.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix_test_helper.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/arena.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/random.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/posix_logger.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/hash.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/coding.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/testutil.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/mutexlock.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/logging.h
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/testharness.h
|
||||
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/builder.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/c.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/dumpfile.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/filename.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/repair.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/block.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/format.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/iterator.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/merger.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/table_builder.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/table.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/arena.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/bloom.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/cache.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/coding.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/comparator.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/env.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/filter_policy.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/hash.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/logging.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/options.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/status.cc
|
||||
|
||||
if TARGET_WINDOWS
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/util/env_win.cc
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.cc
|
||||
else
|
||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.cc
|
||||
endif
|
||||
|
||||
leveldb_libmemenv_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS)
|
||||
leveldb_libmemenv_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS)
|
||||
leveldb_libmemenv_a_SOURCES = leveldb/helpers/memenv/memenv.cc
|
||||
leveldb_libmemenv_a_SOURCES += leveldb/helpers/memenv/memenv.h
|
||||
|
||||
leveldb_libleveldb_sse42_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS)
|
||||
leveldb_libleveldb_sse42_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS)
|
||||
if ENABLE_HWCRC32
|
||||
leveldb_libleveldb_sse42_a_CPPFLAGS += -DLEVELDB_PLATFORM_POSIX_SSE
|
||||
leveldb_libleveldb_sse42_a_CXXFLAGS += $(SSE42_CXXFLAGS)
|
||||
endif
|
||||
leveldb_libleveldb_sse42_a_SOURCES = leveldb/port/port_posix_sse.cc
|
||||
leveldb/libmemenv.a: leveldb/libleveldb.a
|
||||
$(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb memenv_test
|
||||
|
|
|
@ -408,7 +408,7 @@ endif
|
|||
if ENABLE_ZMQ
|
||||
qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
|
||||
endif
|
||||
qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \
|
||||
qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBMEMENV) \
|
||||
$(BOOST_LIBS) $(QT_LIBS) $(QT_DBUS_LIBS) $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
|
||||
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
|
||||
qt_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)
|
||||
|
|
|
@ -63,7 +63,7 @@ if ENABLE_ZMQ
|
|||
qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
|
||||
endif
|
||||
qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) \
|
||||
$(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
|
||||
$(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
|
||||
$(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
|
||||
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
|
||||
qt_test_test_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)
|
||||
|
|
|
@ -122,7 +122,7 @@ test_test_lbrycrd_LDADD += $(LIBBITCOIN_WALLET)
|
|||
endif
|
||||
|
||||
test_test_lbrycrd_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \
|
||||
$(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
|
||||
$(LIBLEVELDB) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
|
||||
test_test_lbrycrd_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
|
||||
|
||||
test_test_lbrycrd_LDADD += $(LIBBITCOIN_CONSENSUS) $(BDB_LIBS) $(CRYPTO_LIBS) $(ICU_LIBS) $(MINIUPNPC_LIBS)
|
||||
|
|
|
@ -8,8 +8,6 @@
#include <algorithm>
#include <memory>
#include <boost/scoped_ptr.hpp>
static const uint256 one = uint256S("0000000000000000000000000000000000000000000000000000000000000001");
std::vector<unsigned char> heightToVch(int n)
@ -123,13 +121,13 @@ void CClaimTrieData::reorderClaims(const supportEntryType& supports)
claim.nEffectiveAmount += support.nAmount;
}
std::make_heap(claims.begin(), claims.end());
std::sort(claims.rbegin(), claims.rend());
}
CClaimTrie::CClaimTrie(bool fMemory, bool fWipe, int proportionalDelayFactor)
{
nProportionalDelayFactor = proportionalDelayFactor;
db.reset(new CDBWrapper(GetDataDir() / "claimtrie", 100 * 1024 * 1024, fMemory, fWipe, false));
db.reset(new CDBWrapper(GetDataDir() / "claimtrie", 200 * 1024 * 1024, fMemory, fWipe, false));
}
bool CClaimTrie::SyncToDisk()
@ -200,7 +198,7 @@ typename queueNameType::value_type* CClaimTrieCacheBase::getQueueCacheNameRow(co
template <>
typename expirationQueueType::value_type* CClaimTrieCacheBase::getExpirationQueueCacheRow<CClaimValue>(int nHeight, bool createIfNotExists)
{
return getQueue(*(base->db), EXP_QUEUE_ROW, nHeight, expirationQueueCache, createIfNotExists);
return getQueue(*(base->db), CLAIM_EXP_QUEUE_ROW, nHeight, expirationQueueCache, createIfNotExists);
}
template <>
@ -218,8 +216,14 @@ typename expirationQueueType::value_type* CClaimTrieCacheBase::getExpirationQueu
bool CClaimTrieCacheBase::haveClaim(const std::string& name, const COutPoint& outPoint) const
{
auto it = find(name);
return it && it->haveClaim(outPoint);
auto it = nodesToAddOrUpdate.find(name);
if (it && it->haveClaim(outPoint))
return true;
if (it || nodesToDelete.count(name))
return false;
CClaimTrieDataNode node;
node.childrenSerialization = false;
return base->find(name, node) && node.data.haveClaim(outPoint);
}
bool CClaimTrieCacheBase::haveSupport(const std::string& name, const COutPoint& outPoint) const
@ -272,39 +276,63 @@ bool CClaimTrieCacheBase::haveSupportInQueue(const std::string& name, const COut
|
|||
return haveInQueue<CSupportValue>(name, outPoint, nValidAtHeight);
|
||||
}
|
||||
|
||||
std::size_t CClaimTrieCacheBase::getTotalNamesInTrie() const
|
||||
void CClaimTrie::recurseAllHashedNodes(const std::string& name, const CClaimTrieDataNode& current, std::function<void(const std::string&, const CClaimTrieDataNode&)> function) const {
|
||||
function(name, current);
|
||||
for (auto& child: current.children) {
|
||||
CClaimTrieDataNode node;
|
||||
if (find(child.second, node))
|
||||
recurseAllHashedNodes(name + child.first, node, function);
|
||||
}
|
||||
}
|
||||
|
||||
std::size_t CClaimTrie::getTotalNamesInTrie() const
|
||||
{
|
||||
std::size_t count = 0;
|
||||
for (auto it = base->cbegin(); it != base->cend(); ++it)
|
||||
if (!it->empty()) ++count;
|
||||
CClaimTrieDataNode node;
|
||||
if (find("", node))
|
||||
recurseAllHashedNodes("", node, [&count](const std::string&, const CClaimTrieDataNode& node) {
|
||||
count += !node.data.empty();
|
||||
});
|
||||
return count;
|
||||
}
|
||||
|
||||
std::size_t CClaimTrieCacheBase::getTotalClaimsInTrie() const
|
||||
std::size_t CClaimTrie::getTotalClaimsInTrie() const
|
||||
{
|
||||
std::size_t count = 0;
|
||||
for (auto it = base->cbegin(); it != base->cend(); ++it)
|
||||
count += it->claims.size();
|
||||
CClaimTrieDataNode node;
|
||||
if (find("", node))
|
||||
recurseAllHashedNodes("", node, [&count](const std::string&, const CClaimTrieDataNode& node) {
|
||||
count += node.data.claims.size();
|
||||
});
|
||||
return count;
|
||||
}
|
||||
|
||||
CAmount CClaimTrieCacheBase::getTotalValueOfClaimsInTrie(bool fControllingOnly) const
|
||||
CAmount CClaimTrie::getTotalValueOfClaimsInTrie(bool fControllingOnly) const
|
||||
{
|
||||
CAmount value_in_subtrie = 0;
|
||||
for (auto it = base->cbegin(); it != base->cend(); ++it) {
|
||||
for (const auto& claim : it->claims) {
|
||||
std::size_t count = 0;
|
||||
CClaimTrieDataNode node;
|
||||
if (find("", node))
|
||||
recurseAllHashedNodes("", node, [&value_in_subtrie, fControllingOnly](const std::string&, const CClaimTrieDataNode& node) {
|
||||
for (const auto& claim : node.data.claims) {
|
||||
value_in_subtrie += claim.nAmount;
|
||||
if (fControllingOnly)
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
return value_in_subtrie;
|
||||
}
|
||||
|
||||
bool CClaimTrieCacheBase::getInfoForName(const std::string& name, CClaimValue& claim) const
|
||||
{
|
||||
auto it = find(name);
|
||||
return it && it->getBestClaim(claim);
|
||||
auto it = nodesToAddOrUpdate.find(name);
|
||||
if (it && it->getBestClaim(claim))
|
||||
return true;
|
||||
if (it || nodesToDelete.count(name))
|
||||
return false;
|
||||
CClaimTrieDataNode node;
|
||||
node.childrenSerialization = false;
|
||||
return base->find(name, node) && node.data.getBestClaim(claim);
|
||||
}
|
||||
|
||||
CClaimsForNameType CClaimTrieCacheBase::getClaimsForName(const std::string& name) const
|
||||
|
@ -313,10 +341,16 @@ CClaimsForNameType CClaimTrieCacheBase::getClaimsForName(const std::string& name
|
|||
int nLastTakeoverHeight = 0;
|
||||
auto supports = getSupportsForName(name);
|
||||
|
||||
if (auto it = find(name)) {
|
||||
CClaimTrieDataNode node;
|
||||
node.childrenSerialization = false;
|
||||
if (auto it = nodesToAddOrUpdate.find(name)) {
|
||||
claims = it->claims;
|
||||
nLastTakeoverHeight = it->nHeightOfLastTakeover;
|
||||
}
|
||||
else if (!nodesToDelete.count(name) && base->find(name, node)) {
|
||||
claims = node.data.claims;
|
||||
nLastTakeoverHeight = node.data.nHeightOfLastTakeover;
|
||||
}
|
||||
return {std::move(claims), std::move(supports), nLastTakeoverHeight, name};
|
||||
}
|
||||
|
||||
|
@ -381,60 +415,97 @@ uint256 recursiveMerkleHash(TIterator& it, const iCbType<TIterator>& process, co
|
|||
return Hash(vchToHash.begin(), vchToHash.end());
|
||||
}
|
||||
|
||||
bool recursiveCheckConsistency(CClaimTrie::const_iterator& it, std::string& failed)
|
||||
bool CClaimTrie::checkConsistency(const uint256& rootHash) const
|
||||
{
|
||||
struct CRecursiveBreak : public std::exception {};
|
||||
|
||||
using iterator = CClaimTrie::const_iterator;
|
||||
iCbType<iterator> verify = [&failed](iterator& it) {
|
||||
if (!it.hasChildren()) {
|
||||
// we don't allow a situation of no children and no claims; no empty leaf nodes allowed
|
||||
failed = it.key();
|
||||
throw CRecursiveBreak();
|
||||
}
|
||||
};
|
||||
|
||||
iCbType<iterator> process = [&failed, &process, &verify](iterator& it) {
|
||||
if (it->hash != recursiveMerkleHash(it, process, verify)) {
|
||||
failed = it.key();
|
||||
throw CRecursiveBreak();
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
process(it);
|
||||
} catch (const CRecursiveBreak&) {
|
||||
return false;
|
||||
}
|
||||
CClaimTrieDataNode node;
|
||||
if (!find("", node) || node.data.hash != rootHash) {
|
||||
if (rootHash == one)
|
||||
return true;
|
||||
|
||||
return error("Mismatched root claim trie hashes. This may happen when there is not a clean process shutdown. Please run with -reindex.");
|
||||
}
|
||||
|
||||
bool success = true;
|
||||
recurseAllHashedNodes("", node, [&success, this](const std::string& name, const CClaimTrieDataNode& node) {
|
||||
if (!success) return;
|
||||
|
||||
success &= contains(name);
|
||||
|
||||
std::vector<uint8_t> vchToHash;
|
||||
const auto pos = name.size();
|
||||
for (auto& child : node.children) {
|
||||
auto key = name + child.first;
|
||||
auto hash = child.second;
|
||||
completeHash(hash, key, pos);
|
||||
vchToHash.push_back(key[pos]);
|
||||
vchToHash.insert(vchToHash.end(), hash.begin(), hash.end());
|
||||
}
|
||||
|
||||
CClaimValue claim;
|
||||
if (node.data.getBestClaim(claim)) {
|
||||
uint256 valueHash = getValueHash(claim.outPoint, node.data.nHeightOfLastTakeover);
|
||||
vchToHash.insert(vchToHash.end(), valueHash.begin(), valueHash.end());
|
||||
} else {
|
||||
success &= !node.children.empty(); // we disallow leaf nodes without claims
|
||||
}
|
||||
|
||||
success &= node.data.hash == Hash(vchToHash.begin(), vchToHash.end());
|
||||
});
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
bool CClaimTrieCacheBase::checkConsistency() const
|
||||
{
|
||||
if (base->empty())
|
||||
return true;
|
||||
std::vector<std::pair<std::string, CClaimTrieDataNode>> CClaimTrie::nodes(const std::string &key) const {
|
||||
std::vector<std::pair<std::string, CClaimTrieDataNode>> ret;
|
||||
CClaimTrieDataNode node;
|
||||
|
||||
auto it = base->cbegin();
|
||||
std::string failed;
|
||||
auto consistent = recursiveCheckConsistency(it, failed);
|
||||
if (!consistent) {
|
||||
LogPrintf("\nPrinting base tree from its parent:\n");
|
||||
auto basePath = base->nodes(failed);
|
||||
if (basePath.size() > 1) basePath.pop_back();
|
||||
dumpToLog(basePath.back(), false);
|
||||
auto cachePath = nodesToAddOrUpdate.nodes(failed);
|
||||
if (!cachePath.empty()) {
|
||||
LogPrintf("\nPrinting %s's parent from cache:\n", failed);
|
||||
if (cachePath.size() > 1) cachePath.pop_back();
|
||||
dumpToLog(cachePath.back(), false);
|
||||
if (!find("", node))
|
||||
return ret;
|
||||
ret.emplace_back("", node);
|
||||
|
||||
std::string partialKey = key;
|
||||
|
||||
while (!node.children.empty()) {
|
||||
// auto it = node.children.lower_bound(partialKey); // for using a std::map
|
||||
auto it = std::lower_bound(node.children.begin(), node.children.end(), std::make_pair(partialKey, uint256()));
|
||||
if (it != node.children.end() && it->first == partialKey) {
|
||||
// we're completely done
|
||||
if (find(it->second, node))
|
||||
ret.emplace_back(key, node);
|
||||
break;
|
||||
}
|
||||
if (!nodesToDelete.empty()) {
|
||||
std::string joined;
|
||||
for (const auto &piece : nodesToDelete) joined += ", " + piece;
|
||||
LogPrintf("Nodes to be deleted: %s\n", joined.substr(2));
|
||||
if (it != node.children.begin()) --it;
|
||||
const auto count = match(partialKey, it->first);
|
||||
|
||||
if (count != it->first.size()) break;
|
||||
if (count == partialKey.size()) break;
|
||||
partialKey = partialKey.substr(count);
|
||||
if (find(it->second, node))
|
||||
ret.emplace_back(key.substr(0, key.size() - partialKey.size()), node);
|
||||
else break;
|
||||
}
|
||||
}
|
||||
return consistent;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool CClaimTrie::contains(const std::string &key) const {
|
||||
return db->Exists(std::make_pair(TRIE_NODE_BY_NAME, key));
|
||||
}
|
||||
|
||||
bool CClaimTrie::empty() const {
|
||||
return !contains("");
|
||||
}
|
||||
|
||||
bool CClaimTrie::find(const std::string &key, CClaimTrieDataNode &node) const {
|
||||
uint256 hash;
|
||||
if (!db->Read(std::make_pair(TRIE_NODE_BY_NAME, key), hash))
|
||||
return false;
|
||||
auto found = find(hash, node);
|
||||
return found;
|
||||
}
|
||||
|
||||
bool CClaimTrie::find(const uint256 &key, CClaimTrieDataNode &node) const {
|
||||
return db->Read(std::make_pair(TRIE_NODE_BY_HASH, key), node);
|
||||
}
|
||||
|
||||
bool CClaimTrieCacheBase::getClaimById(const uint160& claimId, std::string& name, CClaimValue& claim) const
|
||||
|
@ -486,99 +557,79 @@ bool CClaimTrieCacheBase::flush()
|
|||
|
||||
getMerkleHash();
|
||||
|
||||
std::set<std::string> forDeletion;
|
||||
for (const auto& nodeName : nodesToDelete) {
|
||||
if (nodesToAddOrUpdate.contains(nodeName))
|
||||
continue;
|
||||
// TODO: we don't need to deserialize all the nodes right here
|
||||
// we could be smarter about this and fill in the whole list in removeClaimFromTrie
|
||||
auto nodes = base->nodes(nodeName);
|
||||
base->erase(nodeName);
|
||||
for (auto& node : nodes)
|
||||
if (!node)
|
||||
batch.Erase(std::make_pair(TRIE_NODE, node.key()));
|
||||
forDeletion.insert(node.first);
|
||||
}
|
||||
|
||||
for (auto it = nodesToAddOrUpdate.begin(); it != nodesToAddOrUpdate.end(); ++it) {
|
||||
auto old = base->find(it.key());
|
||||
if (!old || old.data() != it.data()) {
|
||||
base->copy(it);
|
||||
batch.Write(std::make_pair(TRIE_NODE, it.key()), it.data());
|
||||
forDeletion.erase(it.key());
|
||||
if (!dirtyNodes.count(it.key()))
|
||||
continue;
|
||||
|
||||
CClaimTrieDataNode node;
|
||||
node.data = it.data();
|
||||
for (auto &child: it.children()) // ordering here is important
|
||||
node.children.emplace_back(child.key().substr(it.key().size()), child->hash);
|
||||
|
||||
batch.Write(std::make_pair(TRIE_NODE_BY_HASH, it->hash), node);
|
||||
batch.Write(std::make_pair(TRIE_NODE_BY_NAME, it.key()), it->hash);
|
||||
}
|
||||
|
||||
for (auto& name: forDeletion) {
|
||||
batch.Erase(std::make_pair(TRIE_NODE_BY_NAME, name));
|
||||
}
|
||||
|
||||
BatchWriteQueue(batch, SUPPORT, supportCache);
|
||||
|
||||
BatchWriteQueue(batch, CLAIM_QUEUE_ROW, claimQueueCache);
|
||||
BatchWriteQueue(batch, CLAIM_QUEUE_NAME_ROW, claimQueueNameCache);
|
||||
BatchWriteQueue(batch, EXP_QUEUE_ROW, expirationQueueCache);
|
||||
BatchWriteQueue(batch, CLAIM_EXP_QUEUE_ROW, expirationQueueCache);
|
||||
|
||||
BatchWriteQueue(batch, SUPPORT_QUEUE_ROW, supportQueueCache);
|
||||
BatchWriteQueue(batch, SUPPORT_QUEUE_NAME_ROW, supportQueueNameCache);
|
||||
BatchWriteQueue(batch, SUPPORT_EXP_QUEUE_ROW, supportExpirationQueueCache);
|
||||
|
||||
base->nNextHeight = nNextHeight;
|
||||
if (!nodesToAddOrUpdate.empty())
|
||||
LogPrint(BCLog::CLAIMS, "Cache size: %zu from base size: %zu on block %d\n", nodesToAddOrUpdate.height(), base->height(), nNextHeight);
|
||||
if (!nodesToAddOrUpdate.empty() && (LogAcceptCategory(BCLog::CLAIMS) || LogAcceptCategory(BCLog::BENCH))) {
|
||||
LogPrintf("TrieCache size: %zu nodes on block %d, batch writes %zu bytes.\n",
|
||||
nodesToAddOrUpdate.height(), nNextHeight, batch.SizeEstimate(), base->db->DynamicMemoryUsage());
|
||||
}
|
||||
auto ret = base->db->WriteBatch(batch);
|
||||
clear();
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool CClaimTrieCacheBase::ReadFromDisk(const CBlockIndex* tip)
|
||||
bool CClaimTrieCacheBase::validateTrieConsistency(const CBlockIndex* tip)
|
||||
{
|
||||
LogPrintf("Loading the claim trie from disk...\n");
|
||||
|
||||
base->nNextHeight = nNextHeight = tip ? tip->nHeight + 1 : 0;
|
||||
|
||||
clear();
|
||||
base->clear();
|
||||
boost::scoped_ptr<CDBIterator> pcursor(base->db->NewIterator());
|
||||
|
||||
std::vector<std::pair<std::string, uint256>> hashesOnEmptyNodes;
|
||||
|
||||
for (pcursor->SeekToFirst(); pcursor->Valid(); pcursor->Next()) {
|
||||
std::pair<uint8_t, std::string> key;
|
||||
if (!pcursor->GetKey(key) || key.first != TRIE_NODE)
|
||||
continue;
|
||||
|
||||
CClaimTrieData data;
|
||||
if (pcursor->GetValue(data)) {
|
||||
if (data.empty()) {
|
||||
// we have a situation where our old trie had many empty nodes
|
||||
// we don't want to automatically throw those all into our prefix trie
|
||||
hashesOnEmptyNodes.emplace_back(key.second, data.hash);
|
||||
continue;
|
||||
}
|
||||
|
||||
// nEffectiveAmount isn't serialized but it needs to be initialized (as done in reorderClaims):
|
||||
auto supports = getSupportsForName(key.second);
|
||||
data.reorderClaims(supports);
|
||||
base->insert(key.second, std::move(data));
|
||||
} else {
|
||||
return error("%s(): error reading claim trie from disk", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
CDBBatch batch(*(base->db));
|
||||
for (auto& kvp: hashesOnEmptyNodes) {
|
||||
auto hit = base->find(kvp.first);
|
||||
if (hit != base->end())
|
||||
hit->hash = kvp.second;
|
||||
else {
|
||||
// the first time the prefix trie is ran there will be many unused nodes
|
||||
// we need to clean those out so that we can go faster next time
|
||||
batch.Erase(std::make_pair(TRIE_NODE, kvp.first));
|
||||
}
|
||||
}
|
||||
if (!tip || tip->nHeight < 1)
|
||||
return true;
|
||||
|
||||
LogPrintf("Checking claim trie consistency... ");
|
||||
if (checkConsistency()) {
|
||||
if (base->checkConsistency(tip->hashClaimTrie)) {
|
||||
LogPrintf("consistent\n");
|
||||
if (tip && tip->hashClaimTrie != getMerkleHash())
|
||||
return error("%s(): hashes don't match when reading claimtrie from disk", __func__);
|
||||
base->db->WriteBatch(batch);
|
||||
return true;
|
||||
}
|
||||
LogPrintf("inconsistent!\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool CClaimTrieCacheBase::ReadFromDisk(const CBlockIndex* tip)
|
||||
{
|
||||
base->nNextHeight = nNextHeight = tip ? tip->nHeight + 1 : 0;
|
||||
clear();
|
||||
|
||||
if (tip && (base->db->Exists(std::make_pair(TRIE_NODE, std::string())) || !base->db->Exists(std::make_pair(TRIE_NODE_BY_HASH, tip->hashClaimTrie)))) {
|
||||
LogPrintf("The claim trie database contains deprecated data and will need to be rebuilt");
|
||||
return false;
|
||||
}
|
||||
return validateTrieConsistency(tip);
|
||||
}
|
||||
|
||||
CClaimTrieCacheBase::CClaimTrieCacheBase(CClaimTrie* base) : base(base)
|
||||
{
|
||||
assert(base);
|
||||
|
@ -590,9 +641,9 @@ int CClaimTrieCacheBase::expirationTime() const
|
|||
return Params().GetConsensus().nOriginalClaimExpirationTime;
|
||||
}
|
||||
|
||||
uint256 CClaimTrieCacheBase::recursiveComputeMerkleHash(CClaimTrie::iterator& it)
|
||||
uint256 CClaimTrieCacheBase::recursiveComputeMerkleHash(CClaimPrefixTrie::iterator& it)
|
||||
{
|
||||
using iterator = CClaimTrie::iterator;
|
||||
using iterator = CClaimPrefixTrie::iterator;
|
||||
iCbType<iterator> process = [&process](iterator& it) {
|
||||
if (it->hash.IsNull())
|
||||
it->hash = recursiveMerkleHash(it, process);
|
||||
|
@ -604,54 +655,52 @@ uint256 CClaimTrieCacheBase::recursiveComputeMerkleHash(CClaimTrie::iterator& it
|
|||
uint256 CClaimTrieCacheBase::getMerkleHash()
|
||||
{
|
||||
auto it = nodesToAddOrUpdate.begin();
|
||||
if (nodesToAddOrUpdate.empty() && nodesToDelete.empty())
|
||||
it = base->begin();
|
||||
return !it ? one : recursiveComputeMerkleHash(it);
|
||||
if (it)
|
||||
return recursiveComputeMerkleHash(it);
|
||||
if (nodesToDelete.empty() && nodesAlreadyCached.empty()) {
|
||||
CClaimTrieDataNode node;
|
||||
node.childrenSerialization = false;
|
||||
if (base->find("", node))
|
||||
return node.data.hash; // it may be valuable to have base cache its current root hash
|
||||
}
|
||||
return one; // we have no data or we deleted everything
|
||||
}
|
||||
|
||||
CClaimTrie::const_iterator CClaimTrieCacheBase::begin() const
|
||||
CClaimPrefixTrie::const_iterator CClaimTrieCacheBase::begin() const
|
||||
{
|
||||
return nodesToAddOrUpdate.empty() && nodesToDelete.empty() ? base->cbegin() : nodesToAddOrUpdate.begin();
|
||||
return nodesToAddOrUpdate.begin();
|
||||
}
|
||||
|
||||
CClaimTrie::const_iterator CClaimTrieCacheBase::end() const
|
||||
CClaimPrefixTrie::const_iterator CClaimTrieCacheBase::end() const
|
||||
{
|
||||
return nodesToAddOrUpdate.empty() && nodesToDelete.empty() ? base->cend() : nodesToAddOrUpdate.end();
|
||||
}
|
||||
|
||||
CClaimTrie::const_iterator CClaimTrieCacheBase::find(const std::string& name) const
|
||||
{
|
||||
if (auto it = nodesToAddOrUpdate.find(name))
|
||||
return it;
|
||||
return base->find(name);
|
||||
return nodesToAddOrUpdate.end();
|
||||
}
|
||||
|
||||
bool CClaimTrieCacheBase::empty() const
|
||||
{
|
||||
return base->empty() && nodesToAddOrUpdate.empty();
|
||||
return nodesToAddOrUpdate.empty();
|
||||
}
|
||||
|
||||
CClaimTrie::iterator CClaimTrieCacheBase::cacheData(const std::string& name, bool create)
|
||||
CClaimPrefixTrie::iterator CClaimTrieCacheBase::cacheData(const std::string& name, bool create)
|
||||
{
|
||||
// get data from the cache. if no data, create empty one
|
||||
const auto insert = [this](CClaimTrie::iterator& it) {
|
||||
auto& key = it.key();
|
||||
// we only ever cache nodes once per cache instance
|
||||
if (!nodesAlreadyCached.count(key)) {
|
||||
// do not insert nodes that are already present
|
||||
nodesAlreadyCached.insert(key);
|
||||
nodesToAddOrUpdate.insert(key, it.data());
|
||||
}
|
||||
};
|
||||
|
||||
// we need all parent nodes and their one level deep children
|
||||
// to calculate merkle hash
|
||||
auto nodes = base->nodes(name);
|
||||
for (auto& node: nodes) {
|
||||
for (auto& child : node.children())
|
||||
if (!nodesAlreadyCached.count(child.key()))
|
||||
nodesToAddOrUpdate.copy(child);
|
||||
insert(node);
|
||||
if (nodesAlreadyCached.insert(node.first).second) {
|
||||
// do not insert nodes that are already present
|
||||
nodesToAddOrUpdate.insert(node.first, node.second.data);
|
||||
}
|
||||
for (auto& child : node.second.children) {
|
||||
auto childKey = node.first + child.first;
|
||||
if (nodesAlreadyCached.insert(childKey).second) {
|
||||
CClaimTrieDataNode childNode;
|
||||
childNode.childrenSerialization = false;
|
||||
if (base->find(child.second, childNode)) {
|
||||
nodesToAddOrUpdate.insert(childKey, childNode.data);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto it = nodesToAddOrUpdate.find(name);
|
||||
|
@ -677,10 +726,12 @@ bool CClaimTrieCacheBase::getLastTakeoverForName(const std::string& name, uint16
|
|||
std::tie(claimId, takeoverHeight) = cit->second;
|
||||
return true;
|
||||
}
|
||||
if (auto it = base->find(name)) {
|
||||
takeoverHeight = it->nHeightOfLastTakeover;
|
||||
CClaimTrieDataNode data;
|
||||
data.childrenSerialization = false;
|
||||
if (base->find(name, data)) {
|
||||
takeoverHeight = data.data.nHeightOfLastTakeover;
|
||||
CClaimValue claim;
|
||||
if (it->getBestClaim(claim)) {
|
||||
if (data.data.getBestClaim(claim)) {
|
||||
claimId = claim.claimId;
|
||||
return true;
|
||||
}
|
||||
|
@ -690,8 +741,10 @@ bool CClaimTrieCacheBase::getLastTakeoverForName(const std::string& name, uint16
|
|||
|
||||
void CClaimTrieCacheBase::markAsDirty(const std::string& name, bool fCheckTakeover)
|
||||
{
|
||||
for (auto& node : nodesToAddOrUpdate.nodes(name))
|
||||
for (auto& node : nodesToAddOrUpdate.nodes(name)) {
|
||||
dirtyNodes.insert(node.key());
|
||||
node->hash.SetNull();
|
||||
}
|
||||
|
||||
if (fCheckTakeover)
|
||||
namesToCheckForTakeover.insert(name);
|
||||
|
@ -712,7 +765,7 @@ bool CClaimTrieCacheBase::removeClaimFromTrie(const std::string& name, const COu
|
|||
auto it = cacheData(name, false);
|
||||
|
||||
if (!it || !it->removeClaim(outPoint, claim)) {
|
||||
LogPrint(BCLog::CLAIMS, "%s: Removing a claim was unsuccessful. name = %s, txhash = %s, nOut = %d", __func__, name, outPoint.hash.GetHex(), outPoint.n);
|
||||
LogPrint(BCLog::CLAIMS, "%s: Removing a claim was unsuccessful. name = %s, txhash = %s, nOut = %d\n", __func__, name, outPoint.hash.GetHex(), outPoint.n);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -963,11 +1016,14 @@ bool CClaimTrieCacheBase::removeSupportFromMap(const std::string& name, const CO
|
|||
return false;
|
||||
}
|
||||
|
||||
void CClaimTrieCacheBase::dumpToLog(CClaimTrie::const_iterator it, bool diffFromBase) const
|
||||
void CClaimTrieCacheBase::dumpToLog(CClaimPrefixTrie::const_iterator it, bool diffFromBase) const
|
||||
{
|
||||
if (!it) return;
|
||||
|
||||
if (diffFromBase) {
|
||||
auto hit = base->find(it.key());
|
||||
if (hit && hit->hash == it->hash)
|
||||
CClaimTrieDataNode node;
|
||||
node.childrenSerialization = false;
|
||||
if (base->find(it.key(), node) && node.data.hash == it->hash)
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1281,8 +1337,16 @@ int CClaimTrieCacheBase::getNumBlocksOfContinuousOwnership(const std::string& na
|
|||
that->removalWorkaround.erase(hit);
|
||||
return 0;
|
||||
}
|
||||
auto it = find(name);
|
||||
return it && !it->empty() ? nNextHeight - it->nHeightOfLastTakeover : 0;
|
||||
auto it = nodesToAddOrUpdate.find(name);
|
||||
if (it && !it->empty())
|
||||
return nNextHeight - it->nHeightOfLastTakeover;
|
||||
if (it) // we specifically ignore deleted nodes here to allow this to fall into the base lookup in that scenario
|
||||
return 0;
|
||||
CClaimTrieDataNode node;
|
||||
node.childrenSerialization = false;
|
||||
if (base->find(name, node) && !node.data.empty())
|
||||
return nNextHeight - node.data.nHeightOfLastTakeover;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CClaimTrieCacheBase::getDelayForName(const std::string& name) const
|
||||
|
@ -1311,6 +1375,7 @@ bool CClaimTrieCacheBase::clear()
|
|||
{
|
||||
nodesToAddOrUpdate.clear();
|
||||
claimsToAddToByIdIndex.clear();
|
||||
dirtyNodes.clear();
|
||||
supportCache.clear();
|
||||
nodesToDelete.clear();
|
||||
claimsToDeleteFromByIdIndex.clear();
|
||||
|
|
|
@ -18,11 +18,13 @@
#include <unordered_set>
// leveldb keys
#define TRIE_NODE 'n'
#define TRIE_NODE 'n' // deprecated
#define TRIE_NODE_BY_HASH 'h'
#define TRIE_NODE_BY_NAME 'g'
#define CLAIM_BY_ID 'i'
#define CLAIM_QUEUE_ROW 'r'
#define CLAIM_QUEUE_NAME_ROW 'm'
#define EXP_QUEUE_ROW 'e'
#define CLAIM_EXP_QUEUE_ROW 'e'
#define SUPPORT 's'
#define SUPPORT_QUEUE_ROW 'u'
#define SUPPORT_QUEUE_NAME_ROW 'p'
@ -61,6 +63,7 @@ struct CClaimValue
|
|||
READWRITE(nAmount);
|
||||
READWRITE(nHeight);
|
||||
READWRITE(nValidAtHeight);
|
||||
READWRITE(nEffectiveAmount);
|
||||
}
|
||||
|
||||
bool operator<(const CClaimValue& other) const
|
||||
|
@ -157,17 +160,6 @@ struct CClaimTrieData
|
|||
inline void SerializationOp(Stream& s, Operation ser_action)
|
||||
{
|
||||
READWRITE(hash);
|
||||
|
||||
if (ser_action.ForRead()) {
|
||||
if (s.eof()) {
|
||||
claims.clear();
|
||||
nHeightOfLastTakeover = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
else if (claims.empty())
|
||||
return;
|
||||
|
||||
READWRITE(claims);
|
||||
READWRITE(nHeightOfLastTakeover);
|
||||
}
|
||||
|
@ -188,6 +180,30 @@ struct CClaimTrieData
}
};
struct CClaimTrieDataNode {
CClaimTrieData data;
// we're using a vector to avoid RAM thrashing and for faster serialization ops.
// We're assuming its data is inserted in order and never modified.
std::vector<std::pair<std::string, uint256>> children;
bool childrenSerialization = true;
CClaimTrieDataNode() = default;
CClaimTrieDataNode(CClaimTrieDataNode&&) = default;
CClaimTrieDataNode(const CClaimTrieDataNode&) = default;
CClaimTrieDataNode& operator=(CClaimTrieDataNode&&) = default;
CClaimTrieDataNode& operator=(const CClaimTrieDataNode& d) = default;
ADD_SERIALIZE_METHODS;
template <typename Stream, typename Operation>
inline void SerializationOp(Stream& s, Operation ser_action)
{
READWRITE(data);
if (childrenSerialization) // wanting constexpr but hoping the compiler is smart enough anyway
READWRITE(children);
}
};
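
The childrenSerialization flag above lets a reader deserialize only the node's claim data and skip the child list; the cache code in this diff (haveClaim, getInfoForName, getClaimsForName) sets it to false before calling base->find. A minimal sketch of that pattern, assuming the CClaimTrie::find(name, node) declared later in this header; hasAnyClaim is a hypothetical helper, not part of this PR:

// Sketch only: read a node by name without its child links,
// mirroring how haveClaim()/getInfoForName() use the flag in this diff.
bool hasAnyClaim(const CClaimTrie& trie, const std::string& name)
{
    CClaimTrieDataNode node;
    node.childrenSerialization = false; // READWRITE(children) is skipped on read
    return trie.find(name, node) && !node.data.claims.empty();
}
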
struct COutPointHeightType
|
||||
{
|
||||
COutPoint outPoint;
|
||||
|
@ -301,7 +317,7 @@ struct CClaimsForNameType
|
|||
CClaimsForNameType& operator=(const CClaimsForNameType&) = default;
|
||||
};
|
||||
|
||||
class CClaimTrie : public CPrefixTrie<std::string, CClaimTrieData>
|
||||
class CClaimTrie
|
||||
{
|
||||
int nNextHeight = 0;
|
||||
int nProportionalDelayFactor = 0;
|
||||
|
@ -322,6 +338,19 @@ public:
|
|||
friend struct ClaimTrieChainFixture;
|
||||
friend class CClaimTrieCacheExpirationFork;
|
||||
friend class CClaimTrieCacheNormalizationFork;
|
||||
|
||||
std::size_t getTotalNamesInTrie() const;
|
||||
std::size_t getTotalClaimsInTrie() const;
|
||||
CAmount getTotalValueOfClaimsInTrie(bool fControllingOnly) const;
|
||||
bool checkConsistency(const uint256& rootHash) const;
|
||||
|
||||
bool contains(const std::string& key) const;
|
||||
bool empty() const;
|
||||
bool find(const uint256& key, CClaimTrieDataNode& node) const;
|
||||
bool find(const std::string& key, CClaimTrieDataNode& node) const;
|
||||
|
||||
std::vector<std::pair<std::string, CClaimTrieDataNode>> nodes(const std::string& key) const;
|
||||
void recurseAllHashedNodes(const std::string& name, const CClaimTrieDataNode& current, std::function<void(const std::string&, const CClaimTrieDataNode&)> function) const;
|
||||
};
|
||||
|
||||
struct CClaimTrieProofNode
|
||||
|
@ -381,6 +410,8 @@ typedef std::map<int, expirationQueueRowType> expirationQueueType;
|
|||
typedef std::set<CClaimValue> claimIndexClaimListType;
|
||||
typedef std::vector<CClaimIndexElement> claimIndexElementListType;
|
||||
|
||||
typedef CPrefixTrie<std::string, CClaimTrieData> CClaimPrefixTrie;
|
||||
|
||||
class CClaimTrieCacheBase
|
||||
{
|
||||
public:
|
||||
|
@ -388,7 +419,6 @@ public:
|
|||
virtual ~CClaimTrieCacheBase() = default;
|
||||
|
||||
uint256 getMerkleHash();
|
||||
bool checkConsistency() const;
|
||||
|
||||
bool getClaimById(const uint160& claimId, std::string& name, CClaimValue& claim) const;
|
||||
|
||||
|
@ -402,10 +432,6 @@ public:
|
|||
bool haveSupport(const std::string& name, const COutPoint& outPoint) const;
|
||||
bool haveSupportInQueue(const std::string& name, const COutPoint& outPoint, int& nValidAtHeight);
|
||||
|
||||
std::size_t getTotalNamesInTrie() const;
|
||||
std::size_t getTotalClaimsInTrie() const;
|
||||
CAmount getTotalValueOfClaimsInTrie(bool fControllingOnly) const;
|
||||
|
||||
bool addClaim(const std::string& name, const COutPoint& outPoint, const uint160& claimId, CAmount nAmount, int nHeight);
|
||||
bool undoAddClaim(const std::string& name, const COutPoint& outPoint, int nHeight);
|
||||
|
||||
|
@ -441,18 +467,18 @@ public:
|
|||
CAmount getEffectiveAmountForClaim(const std::string& name, const uint160& claimId, std::vector<CSupportValue>* supports = nullptr) const;
|
||||
CAmount getEffectiveAmountForClaim(const CClaimsForNameType& claims, const uint160& claimId, std::vector<CSupportValue>* supports = nullptr) const;
|
||||
|
||||
CClaimTrie::const_iterator begin() const;
|
||||
CClaimTrie::const_iterator end() const;
|
||||
CClaimTrie::const_iterator find(const std::string& name) const;
|
||||
CClaimPrefixTrie::const_iterator begin() const;
|
||||
CClaimPrefixTrie::const_iterator end() const;
|
||||
|
||||
void dumpToLog(CClaimTrie::const_iterator it, bool diffFromBase = true) const;
|
||||
void dumpToLog(CClaimPrefixTrie::const_iterator it, bool diffFromBase = true) const;
|
||||
virtual std::string adjustNameForValidHeight(const std::string& name, int validHeight) const;
|
||||
|
||||
protected:
|
||||
CClaimTrie* base;
|
||||
CClaimTrie nodesToAddOrUpdate; // nodes pulled in from base (and possibly modified thereafter), written to base on flush
|
||||
CClaimPrefixTrie nodesToAddOrUpdate; // nodes pulled in from base (and possibly modified thereafter), written to base on flush
|
||||
std::unordered_set<std::string> namesToCheckForTakeover; // takeover numbers are updated on increment
|
||||
|
||||
uint256 recursiveComputeMerkleHash(CClaimTrie::iterator& it);
|
||||
uint256 recursiveComputeMerkleHash(CClaimPrefixTrie::iterator& it);
|
||||
|
||||
virtual bool insertClaimIntoTrie(const std::string& name, const CClaimValue& claim, bool fCheckTakeover);
|
||||
virtual bool removeClaimFromTrie(const std::string& name, const COutPoint& outPoint, CClaimValue& claim, bool fCheckTakeover);
|
||||
|
@ -460,14 +486,12 @@ protected:
|
|||
virtual bool insertSupportIntoMap(const std::string& name, const CSupportValue& support, bool fCheckTakeover);
|
||||
virtual bool removeSupportFromMap(const std::string& name, const COutPoint& outPoint, CSupportValue& support, bool fCheckTakeover);
|
||||
|
||||
virtual std::string adjustNameForValidHeight(const std::string& name, int validHeight) const;
|
||||
|
||||
supportEntryType getSupportsForName(const std::string& name) const;
|
||||
|
||||
int getDelayForName(const std::string& name) const;
|
||||
virtual int getDelayForName(const std::string& name, const uint160& claimId) const;
|
||||
|
||||
CClaimTrie::iterator cacheData(const std::string& name, bool create = true);
|
||||
CClaimPrefixTrie::iterator cacheData(const std::string& name, bool create = true);
|
||||
|
||||
bool getLastTakeoverForName(const std::string& name, uint160& claimId, int& takeoverHeight) const;
|
||||
|
||||
|
@ -499,6 +523,7 @@ private:
|
|||
std::unordered_set<std::string> nodesAlreadyCached; // set of nodes already pulled into cache from base
|
||||
std::unordered_map<std::string, bool> takeoverWorkaround;
|
||||
std::unordered_set<std::string> removalWorkaround;
|
||||
std::unordered_set<std::string> dirtyNodes;
|
||||
|
||||
bool shouldUseTakeoverWorkaround(const std::string& key) const;
|
||||
void addTakeoverWorkaroundPotential(const std::string& key);
|
||||
|
@ -510,6 +535,8 @@ private:
|
|||
bool removeSupport(const std::string& name, const COutPoint& outPoint, int nHeight, int& nValidAtHeight, bool fCheckTakeover);
|
||||
bool removeClaim(const std::string& name, const COutPoint& outPoint, int nHeight, int& nValidAtHeight, bool fCheckTakeover);
|
||||
|
||||
bool validateTrieConsistency(const CBlockIndex* tip);
|
||||
|
||||
template <typename T>
|
||||
std::pair<const int, std::vector<queueEntryType<T>>>* getQueueCacheRow(int nHeight, bool createIfNotExists = false);
|
||||
|
||||
|
@ -614,6 +641,7 @@ public:
|
|||
bool getProofForName(const std::string& name, CClaimTrieProof& proof) override;
|
||||
bool getInfoForName(const std::string& name, CClaimValue& claim) const override;
|
||||
CClaimsForNameType getClaimsForName(const std::string& name) const override;
|
||||
std::string adjustNameForValidHeight(const std::string& name, int validHeight) const override;
|
||||
|
||||
protected:
|
||||
bool insertClaimIntoTrie(const std::string& name, const CClaimValue& claim, bool fCheckTakeover) override;
|
||||
|
@ -624,8 +652,6 @@ protected:
|
|||
|
||||
int getDelayForName(const std::string& name, const uint160& claimId) const override;
|
||||
|
||||
std::string adjustNameForValidHeight(const std::string& name, int validHeight) const override;
|
||||
|
||||
private:
|
||||
bool overrideInsertNormalization;
|
||||
bool overrideRemoveNormalization;
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#include <boost/locale/conversion.hpp>
|
||||
#include <boost/locale/localization_backend.hpp>
|
||||
#include <boost/scope_exit.hpp>
|
||||
#include <boost/scoped_ptr.hpp>
|
||||
|
||||
CClaimTrieCacheExpirationFork::CClaimTrieCacheExpirationFork(CClaimTrie* base)
|
||||
: CClaimTrieCacheBase(base)
|
||||
|
@ -66,7 +67,7 @@ bool CClaimTrieCacheExpirationFork::forkForExpirationChange(bool increment)
|
|||
if (!pcursor->GetKey(key))
|
||||
continue;
|
||||
int height = key.second;
|
||||
if (key.first == EXP_QUEUE_ROW) {
|
||||
if (key.first == CLAIM_EXP_QUEUE_ROW) {
|
||||
expirationQueueRowType row;
|
||||
if (pcursor->GetValue(row)) {
|
||||
reactivateClaim(row, height, increment);
|
||||
|
@ -160,40 +161,48 @@ bool CClaimTrieCacheNormalizationFork::normalizeAllNamesInTrieIfNecessary(insert
|
|||
// run the one-time upgrade of all names that need to change
|
||||
// it modifies the (cache) trie as it goes, so we need to grab everything to be modified first
|
||||
|
||||
for (auto it = base->begin(); it != base->end(); ++it) {
|
||||
const std::string normalized = normalizeClaimName(it.key(), true);
|
||||
if (normalized == it.key())
|
||||
boost::scoped_ptr<CDBIterator> pcursor(base->db->NewIterator());
|
||||
for (pcursor->SeekToFirst(); pcursor->Valid(); pcursor->Next()) {
|
||||
std::pair<uint8_t, std::string> key;
|
||||
if (!pcursor->GetKey(key) || key.first != TRIE_NODE_BY_NAME)
|
||||
continue;
|
||||
|
||||
auto supports = getSupportsForName(it.key());
|
||||
const auto& name = key.second;
|
||||
const std::string normalized = normalizeClaimName(name, true);
|
||||
if (normalized == key.second)
|
||||
continue;
|
||||
|
||||
auto supports = getSupportsForName(name);
|
||||
for (auto support : supports) {
|
||||
// if it's already going to expire just skip it
|
||||
if (support.nHeight + expirationTime() <= nNextHeight)
|
||||
continue;
|
||||
|
||||
assert(removeSupportFromMap(it.key(), support.outPoint, support, false));
|
||||
expireSupportUndo.emplace_back(it.key(), support);
|
||||
assert(removeSupportFromMap(name, support.outPoint, support, false));
|
||||
expireSupportUndo.emplace_back(name, support);
|
||||
assert(insertSupportIntoMap(normalized, support, false));
|
||||
insertSupportUndo.emplace_back(it.key(), support.outPoint, -1);
|
||||
insertSupportUndo.emplace_back(name, support.outPoint, -1);
|
||||
}
|
||||
|
||||
namesToCheckForTakeover.insert(normalized);
|
||||
|
||||
auto cached = cacheData(it.key(), false);
|
||||
auto cached = cacheData(name, false);
|
||||
if (!cached || cached->empty())
|
||||
continue;
|
||||
|
||||
for (auto claim : it->claims) {
|
||||
auto claimsCopy = cached->claims;
|
||||
auto takeoverHeightCopy = cached->nHeightOfLastTakeover;
|
||||
for (auto claim : claimsCopy) {
|
||||
if (claim.nHeight + expirationTime() <= nNextHeight)
|
||||
continue;
|
||||
|
||||
assert(removeClaimFromTrie(it.key(), claim.outPoint, claim, false));
|
||||
removeUndo.emplace_back(it.key(), claim);
|
||||
assert(removeClaimFromTrie(name, claim.outPoint, claim, false));
|
||||
removeUndo.emplace_back(name, claim);
|
||||
assert(insertClaimIntoTrie(normalized, claim, true));
|
||||
insertUndo.emplace_back(it.key(), claim.outPoint, -1);
|
||||
insertUndo.emplace_back(name, claim.outPoint, -1);
|
||||
}
|
||||
|
||||
takeoverHeightUndo.emplace_back(it.key(), it->nHeightOfLastTakeover);
|
||||
takeoverHeightUndo.emplace_back(name, takeoverHeightCopy);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -97,11 +97,45 @@ static void SetMaxOpenFiles(leveldb::Options *options) {
options->max_open_files, default_open_files);
}
class CappedLenCache: public leveldb::Cache {
leveldb::Cache* inner;
std::size_t maxKeyLen;
public:
CappedLenCache(std::size_t capacity, std::size_t maxKeyLen)
: inner(leveldb::NewLRUCache(capacity)), maxKeyLen(maxKeyLen) {}
~CappedLenCache() override { delete inner; }
Handle* Insert(const leveldb::Slice& key, void* value, size_t charge,
void (*deleter)(const leveldb::Slice& key, void* value)) override {
if (key.size() <= maxKeyLen)
return inner->Insert(key, value, charge, deleter);
deleter(key, value);
return nullptr;
}
Handle* Lookup(const leveldb::Slice& key) override { return inner->Lookup(key); }
void Release(Handle* handle) override { return inner->Release(handle); }
void* Value(Handle* handle) override { return inner->Value(handle); }
void Erase(const leveldb::Slice& key) override {return inner->Erase(key); }
uint64_t NewId() override { return inner->NewId(); }
};
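
CappedLenCache above wraps a regular LRU cache but declines to cache any entry whose key is longer than maxKeyLen (the entry's deleter runs immediately and Insert returns nullptr); everything else is passed through to the inner cache. The diff only wires it in behind a comment in GetOptions() below; a usage sketch, with the 8 MB budget made up purely for illustration:

// Sketch only: mirrors the commented-out line in GetOptions() below.
// Entries whose keys are longer than 6 bytes bypass the block cache.
leveldb::Options options;
options.block_cache = new CappedLenCache(size_t(8) << 20, /*maxKeyLen=*/6);
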
static leveldb::Options GetOptions(size_t nCacheSize)
{
leveldb::Options options;
auto write_cache = std::min(nCacheSize / 4, size_t(16) << 20U); // cap write_cache at 16MB (4x default)
options.filter_policy=leveldb::NewBloomFilterPolicy2(16);
options.write_buffer_size=60 * 1024 * 1024;
options.total_leveldb_mem=2500ULL * 1024ULL * 1024ULL;
options.env=leveldb::Env::Default();
options.compression = leveldb::kNoCompression;
options.info_log = new CBitcoinLevelDBLogger();
return options;
auto write_cache = std::min(nCacheSize / 4, size_t(4 * 1024 * 1024)); // cap write_cache at 4MB (default)
options.block_cache = leveldb::NewLRUCache(nCacheSize - write_cache * 2);
// options.block_cache = new CappedLenCache(nCacheSize - write_cache * 2, 6);
options.write_buffer_size = write_cache; // up to two write buffers may be held in memory simultaneously
options.filter_policy = leveldb::NewBloomFilterPolicy(10);
options.compression = leveldb::kNoCompression;
@ -112,6 +146,7 @@ static leveldb::Options GetOptions(size_t nCacheSize)
options.paranoid_checks = true;
}
SetMaxOpenFiles(&options);
options.max_open_files = 30000;
return options;
}
@ -81,7 +81,7 @@ public:
ssValue.Xor(dbwrapper_private::GetObfuscateKey(parent));
leveldb::Slice slValue(ssValue.data(), ssValue.size());
batch.Put(slKey, slValue);
batch.Put(slKey, slValue, nullptr);
// LevelDB serializes writes as:
// - byte: header
// - varint: key length (1 byte up to 127B, 2 bytes up to 16383B, ...)
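
The varint sizes quoted in the comment above follow from LevelDB's 7-bits-per-byte varint encoding; a quick check of the stated bounds (illustrative only, not part of the diff):

// 7 payload bits per varint byte:
static_assert((1 << 7) - 1 == 127, "1-byte varint covers key lengths up to 127");
static_assert((1 << 14) - 1 == 16383, "2-byte varint covers key lengths up to 16383");
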
@ -1461,7 +1461,7 @@ bool AppInitMain()
pblocktree.reset();
pblocktree.reset(new CBlockTreeDB(nBlockTreeDBCache, false, fReset));
delete pclaimTrie;
pclaimTrie = new CClaimTrie(false, fReindex);
pclaimTrie = new CClaimTrie(false, fReindex || fReindexChainState);
if (fReset) {
pblocktree->WriteReindexing(true);
13  src/leveldb/.gitignore (vendored)
@ -1,13 +0,0 @@
|
|||
build_config.mk
|
||||
*.a
|
||||
*.o
|
||||
*.dylib*
|
||||
*.so
|
||||
*.so.*
|
||||
*_test
|
||||
db_bench
|
||||
leveldbutil
|
||||
Release
|
||||
Debug
|
||||
Benchmark
|
||||
vs2010.*
|
|
@ -6,7 +6,3 @@ Google Inc.
|
|||
# Initial version authors:
|
||||
Jeffrey Dean <jeff@google.com>
|
||||
Sanjay Ghemawat <sanjay@google.com>
|
||||
|
||||
# Partial list of contributors:
|
||||
Kevin Regan <kevin.d.regan@gmail.com>
|
||||
Johan Bilien <jobi@litl.com>
|
||||
|
|
72  src/leveldb/BASHO_RELEASES (new file)
@ -0,0 +1,72 @@
|
|||
github.com tag 2.0.34 - February 15, 2017
-----------------------------------------
mv-hot-backup2: - correct MakeTieredDbname() within db/filename.cc
                  for case where dbname input is blank and fast/slow
                  already populated in options. Corrects issue
                  with hot backup in non-tiered storage situations

github.com tag 2.0.33 - November 21, 2016
-----------------------------------------
mv-bucket-expiry: - partial branch to enable X-Riak-Meta-Expiry-Base-Seconds
                    property within enterprise edition

--- no 2.0.32 tag on leveldb ---

github.com tag 2.0.31 - November 1, 2016
----------------------------------------
- version shipped with Riak 2.2
mv-no-md-expiry: - Riak specific
                 - never convert a key prefix of sext:encoded "{md" to expiry
                 - update sst_scan for dumping Riak formatted keys
mv-tuning8: - rework penalty rules in version_set.cc UpdatePenalty()
            - add unit test framework for UpdatePenalty()

github.com tag 2.0.30 - October 11, 2016
----------------------------------------
mv-delayed-bloom: - when opening an .sst table file, only load the
                    bloom filter on the second Get() operation. Saves time.
                  - correct VersionSet::Finalize() logic for level 1 when
                    level 2 is above desired size
                  - move hot backup to Riak ee build

github.com tag 2.0.29 - September 13, 2016
------------------------------------------
mv-expiry-manifest: only switch to the expiry-enabled manifest format
                    if the expiry function is enabled. Eases downgrade
                    during early Riak releases containing expiry

github.com tag 2.0.28 - September 6, 2016
-----------------------------------------
mv-hot-backup: add externally triggered hot backup feature

github.com tag 2.0.27 - August 22, 2016
---------------------------------------
mv-mem-fences: fix iterator double delete bug in eleveldb and
               build better memory-fenced operations for reference-counted objects.

github.com tag 2.0.26 - August 21, 2016
---------------------------------------
mv-expiry-iter-bug: DBImpl::NewIterator() was not setting the new expiry parameter.

github.com tag 2.0.25 - August 10, 2016
---------------------------------------
Make LZ4 the default compression instead of Snappy.

github.com tag 2.0.24 - August 2, 2016
--------------------------------------
mv-expiry: open source expiry. Supports one expiry policy for all databases.

github.com tag 2.0.23 - July 20, 2016
-------------------------------------
mv-no-semaphore: remove semaphore-controlled thread in hot_threads.cc. Instead use
                 the mutex of thread 0 (only one thread's mutex) to address a known race condition.

github.com tag 2.0.22 - June 22, 2016
-------------------------------------
no change: iterator fix in eleveldb

github.com tag 2.0.21 - June 16, 2016
-------------------------------------
branch mv-iterator-hot-threads: correct condition where eleveldb MoveTask
could hang an iterator. (https://github.com/basho/leveldb/wiki/mv-iterator-hot-threads)
@ -1,36 +0,0 @@
# Contributing

We'd love to accept your code patches! However, before we can take them, we
have to jump a couple of legal hurdles.

## Contributor License Agreements

Please fill out either the individual or corporate Contributor License
Agreement as appropriate.

* If you are an individual writing original source code and you're sure you
own the intellectual property, then sign an [individual CLA](https://developers.google.com/open-source/cla/individual).
* If you work for a company that wants to allow you to contribute your work,
then sign a [corporate CLA](https://developers.google.com/open-source/cla/corporate).

Follow either of the two links above to access the appropriate CLA and
instructions for how to sign and return it.

## Submitting a Patch

1. Sign the contributors license agreement above.
2. Decide which code you want to submit. A submission should be a set of changes
that addresses one issue in the [issue tracker](https://github.com/google/leveldb/issues).
Please don't mix more than one logical change per submission, because it makes
the history hard to follow. If you want to make a change
(e.g. add a sample or feature) that doesn't have a corresponding issue in the
issue tracker, please create one.
3. **Submitting**: When you are ready to submit, send us a Pull Request. Be
sure to include the issue number you fixed and the name you used to sign
the CLA.

## Writing Code ##

If your contribution contains code, please make sure that it follows
[the style guide](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml).
Otherwise we will have to ask you to make changes, and that's no fun for anyone.
@ -2,423 +2,219 @@
|
|||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
# Inherit some settings from environment variables, if available
|
||||
INSTALL_PATH ?= $(CURDIR)
|
||||
|
||||
#-----------------------------------------------
|
||||
# Uncomment exactly one of the lines labelled (A), (B), and (C) below
|
||||
# to switch between compilation modes.
|
||||
# NOTE: targets "debug" and "prof" provide same functionality
|
||||
# NOTE 2: -DNDEBUG disables assert() statements within C code,
|
||||
# i.e. no assert()s in production code
|
||||
|
||||
# (A) Production use (optimized mode)
|
||||
OPT ?= -O2 -DNDEBUG
|
||||
# (B) Debug mode, w/ full line-level debugging symbols
|
||||
# OPT ?= -g2
|
||||
# (C) Profiling mode: opt, but w/debugging symbols
|
||||
# OPT ?= -O2 -g2 -DNDEBUG
|
||||
OPT ?= -O2 -g -DNDEBUG # (A) Production use (optimized mode)
|
||||
# OPT ?= -g2 # (B) Debug mode, w/ full line-level debugging symbols
|
||||
# OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
|
||||
#-----------------------------------------------
|
||||
|
||||
# detect what platform we're building on
|
||||
$(shell CC="$(CC)" CXX="$(CXX)" TARGET_OS="$(TARGET_OS)" \
|
||||
./build_detect_platform build_config.mk ./)
|
||||
ifeq ($(wildcard build_config.mk),)
|
||||
$(shell ./build_detect_platform build_config.mk)
|
||||
endif
|
||||
# this file is generated by the previous line to set build flags and sources
|
||||
include build_config.mk
|
||||
|
||||
TESTS = \
|
||||
db/autocompact_test \
|
||||
db/c_test \
|
||||
db/corruption_test \
|
||||
db/db_test \
|
||||
db/dbformat_test \
|
||||
db/fault_injection_test \
|
||||
db/filename_test \
|
||||
db/log_test \
|
||||
db/recovery_test \
|
||||
db/skiplist_test \
|
||||
db/version_edit_test \
|
||||
db/version_set_test \
|
||||
db/write_batch_test \
|
||||
helpers/memenv/memenv_test \
|
||||
issues/issue178_test \
|
||||
issues/issue200_test \
|
||||
table/filter_block_test \
|
||||
table/table_test \
|
||||
util/arena_test \
|
||||
util/bloom_test \
|
||||
util/cache_test \
|
||||
util/coding_test \
|
||||
util/crc32c_test \
|
||||
util/env_posix_test \
|
||||
util/env_test \
|
||||
util/hash_test
|
||||
|
||||
UTILS = \
|
||||
db/db_bench \
|
||||
db/leveldbutil
|
||||
|
||||
# Put the object files in a subdirectory, but the application at the top of the object dir.
|
||||
PROGNAMES := $(notdir $(TESTS) $(UTILS))
|
||||
|
||||
# On Linux may need libkyotocabinet-dev for dependency.
|
||||
BENCHMARKS = \
|
||||
doc/bench/db_bench_sqlite3 \
|
||||
doc/bench/db_bench_tree_db
|
||||
|
||||
CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
|
||||
CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT)
|
||||
|
||||
LDFLAGS += $(PLATFORM_LDFLAGS)
|
||||
LIBS += $(PLATFORM_LIBS)
|
||||
|
||||
SIMULATOR_OUTDIR=out-ios-x86
|
||||
DEVICE_OUTDIR=out-ios-arm
|
||||
LIBOBJECTS := $(SOURCES:.cc=.o)
|
||||
LIBOBJECTS += util/lz4.o
|
||||
MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
|
||||
DEPEND := $(SOURCES:.cc=.d)
|
||||
|
||||
ifeq ($(PLATFORM), IOS)
|
||||
# Note: iOS should probably be using libtool, not ar.
|
||||
AR=xcrun ar
|
||||
SIMULATORSDK=$(shell xcrun -sdk iphonesimulator --show-sdk-path)
|
||||
DEVICESDK=$(shell xcrun -sdk iphoneos --show-sdk-path)
|
||||
DEVICE_CFLAGS = -isysroot "$(DEVICESDK)" -arch armv6 -arch armv7 -arch armv7s -arch arm64
|
||||
SIMULATOR_CFLAGS = -isysroot "$(SIMULATORSDK)" -arch i686 -arch x86_64
|
||||
STATIC_OUTDIR=out-ios-universal
|
||||
TESTUTIL = ./util/testutil.o
|
||||
TESTHARNESS = ./util/testharness.o $(TESTUTIL)
|
||||
|
||||
TESTS := $(sort $(notdir $(basename $(TEST_SOURCES))))
|
||||
|
||||
TOOLS = \
|
||||
leveldb_repair \
|
||||
perf_dump \
|
||||
sst_rewrite \
|
||||
sst_scan
|
||||
|
||||
PROGRAMS = db_bench $(TESTS) $(TOOLS)
|
||||
BENCHMARKS = db_bench_sqlite3 db_bench_tree_db
|
||||
|
||||
LIBRARY = libleveldb.a
|
||||
MEMENVLIBRARY = libmemenv.a
|
||||
|
||||
#
|
||||
# static link leveldb to tools to simplify platform usage (if Linux)
|
||||
#
|
||||
ifeq ($(PLATFORM),OS_LINUX)
|
||||
LEVEL_LDFLAGS := -L . -Wl,-non_shared -lleveldb -Wl,-call_shared
|
||||
else
|
||||
STATIC_OUTDIR=out-static
|
||||
SHARED_OUTDIR=out-shared
|
||||
STATIC_PROGRAMS := $(addprefix $(STATIC_OUTDIR)/, $(PROGNAMES))
|
||||
SHARED_PROGRAMS := $(addprefix $(SHARED_OUTDIR)/, db_bench)
|
||||
LEVEL_LDFLAGS := -L . -lleveldb
|
||||
endif
|
||||
|
||||
STATIC_LIBOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(SOURCES:.cc=.o))
|
||||
STATIC_MEMENVOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
|
||||
|
||||
DEVICE_LIBOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(SOURCES:.cc=.o))
|
||||
DEVICE_MEMENVOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
|
||||
|
||||
SIMULATOR_LIBOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(SOURCES:.cc=.o))
|
||||
SIMULATOR_MEMENVOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
|
||||
|
||||
SHARED_LIBOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(SOURCES:.cc=.o))
|
||||
SHARED_MEMENVOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
|
||||
|
||||
TESTUTIL := $(STATIC_OUTDIR)/util/testutil.o
|
||||
TESTHARNESS := $(STATIC_OUTDIR)/util/testharness.o $(TESTUTIL)
|
||||
|
||||
STATIC_TESTOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(TESTS)))
|
||||
STATIC_UTILOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(UTILS)))
|
||||
STATIC_ALLOBJS := $(STATIC_LIBOBJECTS) $(STATIC_MEMENVOBJECTS) $(STATIC_TESTOBJS) $(STATIC_UTILOBJS) $(TESTHARNESS)
|
||||
DEVICE_ALLOBJS := $(DEVICE_LIBOBJECTS) $(DEVICE_MEMENVOBJECTS)
|
||||
SIMULATOR_ALLOBJS := $(SIMULATOR_LIBOBJECTS) $(SIMULATOR_MEMENVOBJECTS)
|
||||
|
||||
default: all
|
||||
|
||||
# Should we build shared libraries?
|
||||
ifneq ($(PLATFORM_SHARED_EXT),)
|
||||
|
||||
# Many leveldb test apps use non-exported API's. Only build a subset for testing.
|
||||
SHARED_ALLOBJS := $(SHARED_LIBOBJECTS) $(SHARED_MEMENVOBJECTS) $(TESTHARNESS)
|
||||
|
||||
ifneq ($(PLATFORM_SHARED_VERSIONED),true)
|
||||
SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT)
|
||||
SHARED_LIB2 = $(SHARED_LIB1)
|
||||
SHARED_LIB3 = $(SHARED_LIB1)
|
||||
SHARED_LIBS = $(SHARED_LIB1)
|
||||
SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a
|
||||
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
|
||||
SHARED2 = $(SHARED1)
|
||||
SHARED3 = $(SHARED1)
|
||||
SHARED = $(SHARED1)
|
||||
else
|
||||
# Update db.h if you change these.
|
||||
SHARED_VERSION_MAJOR = 1
|
||||
SHARED_VERSION_MINOR = 20
|
||||
SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT)
|
||||
SHARED_LIB2 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR)
|
||||
SHARED_LIB3 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR).$(SHARED_VERSION_MINOR)
|
||||
SHARED_LIBS = $(SHARED_OUTDIR)/$(SHARED_LIB1) $(SHARED_OUTDIR)/$(SHARED_LIB2) $(SHARED_OUTDIR)/$(SHARED_LIB3)
|
||||
$(SHARED_OUTDIR)/$(SHARED_LIB1): $(SHARED_OUTDIR)/$(SHARED_LIB3)
|
||||
ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB1)
|
||||
$(SHARED_OUTDIR)/$(SHARED_LIB2): $(SHARED_OUTDIR)/$(SHARED_LIB3)
|
||||
ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB2)
|
||||
SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a
|
||||
SHARED_MAJOR = 1
|
||||
SHARED_MINOR = 9
|
||||
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
|
||||
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
|
||||
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
|
||||
SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
|
||||
$(SHARED1): $(SHARED3)
|
||||
ln -fs $(SHARED3) $(SHARED1)
|
||||
$(SHARED2): $(SHARED3)
|
||||
ln -fs $(SHARED3) $(SHARED2)
|
||||
endif
|
||||
|
||||
$(SHARED_OUTDIR)/$(SHARED_LIB3): $(SHARED_LIBOBJECTS)
|
||||
$(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED_LIB2) $(SHARED_LIBOBJECTS) -o $(SHARED_OUTDIR)/$(SHARED_LIB3) $(LIBS)
|
||||
$(SHARED3): $(LIBOBJECTS)
|
||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LIBOBJECTS) -o $(SHARED3) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2)
|
||||
|
||||
endif # PLATFORM_SHARED_EXT
|
||||
|
||||
all: $(SHARED_LIBS) $(SHARED_PROGRAMS) $(STATIC_OUTDIR)/libleveldb.a $(STATIC_OUTDIR)/libmemenv.a $(STATIC_PROGRAMS)
|
||||
all: $(SHARED) $(LIBRARY)
|
||||
|
||||
check: $(STATIC_PROGRAMS)
|
||||
for t in $(notdir $(TESTS)); do echo "***** Running $$t"; $(STATIC_OUTDIR)/$$t || exit 1; done
|
||||
test check: all $(PROGRAMS) $(TESTS)
|
||||
for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
|
||||
|
||||
clean:
|
||||
-rm -rf out-static out-shared out-ios-x86 out-ios-arm out-ios-universal
|
||||
-rm -f build_config.mk
|
||||
-rm -rf ios-x86 ios-arm
|
||||
tools: all $(TOOLS)
|
||||
|
||||
$(STATIC_OUTDIR):
|
||||
mkdir $@
|
||||
|
||||
$(STATIC_OUTDIR)/db: | $(STATIC_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(STATIC_OUTDIR)/helpers/memenv: | $(STATIC_OUTDIR)
|
||||
mkdir -p $@
|
||||
|
||||
$(STATIC_OUTDIR)/port: | $(STATIC_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(STATIC_OUTDIR)/table: | $(STATIC_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(STATIC_OUTDIR)/util: | $(STATIC_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
.PHONY: STATIC_OBJDIRS
|
||||
STATIC_OBJDIRS: \
|
||||
$(STATIC_OUTDIR)/db \
|
||||
$(STATIC_OUTDIR)/port \
|
||||
$(STATIC_OUTDIR)/table \
|
||||
$(STATIC_OUTDIR)/util \
|
||||
$(STATIC_OUTDIR)/helpers/memenv
|
||||
|
||||
$(SHARED_OUTDIR):
|
||||
mkdir $@
|
||||
|
||||
$(SHARED_OUTDIR)/db: | $(SHARED_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(SHARED_OUTDIR)/helpers/memenv: | $(SHARED_OUTDIR)
|
||||
mkdir -p $@
|
||||
|
||||
$(SHARED_OUTDIR)/port: | $(SHARED_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(SHARED_OUTDIR)/table: | $(SHARED_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(SHARED_OUTDIR)/util: | $(SHARED_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
.PHONY: SHARED_OBJDIRS
|
||||
SHARED_OBJDIRS: \
|
||||
$(SHARED_OUTDIR)/db \
|
||||
$(SHARED_OUTDIR)/port \
|
||||
$(SHARED_OUTDIR)/table \
|
||||
$(SHARED_OUTDIR)/util \
|
||||
$(SHARED_OUTDIR)/helpers/memenv
|
||||
|
||||
$(DEVICE_OUTDIR):
|
||||
mkdir $@
|
||||
|
||||
$(DEVICE_OUTDIR)/db: | $(DEVICE_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(DEVICE_OUTDIR)/helpers/memenv: | $(DEVICE_OUTDIR)
|
||||
mkdir -p $@
|
||||
|
||||
$(DEVICE_OUTDIR)/port: | $(DEVICE_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(DEVICE_OUTDIR)/table: | $(DEVICE_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(DEVICE_OUTDIR)/util: | $(DEVICE_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
.PHONY: DEVICE_OBJDIRS
|
||||
DEVICE_OBJDIRS: \
|
||||
$(DEVICE_OUTDIR)/db \
|
||||
$(DEVICE_OUTDIR)/port \
|
||||
$(DEVICE_OUTDIR)/table \
|
||||
$(DEVICE_OUTDIR)/util \
|
||||
$(DEVICE_OUTDIR)/helpers/memenv
|
||||
|
||||
$(SIMULATOR_OUTDIR):
|
||||
mkdir $@
|
||||
|
||||
$(SIMULATOR_OUTDIR)/db: | $(SIMULATOR_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(SIMULATOR_OUTDIR)/helpers/memenv: | $(SIMULATOR_OUTDIR)
|
||||
mkdir -p $@
|
||||
|
||||
$(SIMULATOR_OUTDIR)/port: | $(SIMULATOR_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(SIMULATOR_OUTDIR)/table: | $(SIMULATOR_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
$(SIMULATOR_OUTDIR)/util: | $(SIMULATOR_OUTDIR)
|
||||
mkdir $@
|
||||
|
||||
.PHONY: SIMULATOR_OBJDIRS
|
||||
SIMULATOR_OBJDIRS: \
|
||||
$(SIMULATOR_OUTDIR)/db \
|
||||
$(SIMULATOR_OUTDIR)/port \
|
||||
$(SIMULATOR_OUTDIR)/table \
|
||||
$(SIMULATOR_OUTDIR)/util \
|
||||
$(SIMULATOR_OUTDIR)/helpers/memenv
|
||||
|
||||
$(STATIC_ALLOBJS): | STATIC_OBJDIRS
|
||||
$(DEVICE_ALLOBJS): | DEVICE_OBJDIRS
|
||||
$(SIMULATOR_ALLOBJS): | SIMULATOR_OBJDIRS
|
||||
$(SHARED_ALLOBJS): | SHARED_OBJDIRS
|
||||
|
||||
ifeq ($(PLATFORM), IOS)
|
||||
$(DEVICE_OUTDIR)/libleveldb.a: $(DEVICE_LIBOBJECTS)
|
||||
rm -f $@
|
||||
$(AR) -rs $@ $(DEVICE_LIBOBJECTS)
|
||||
|
||||
$(SIMULATOR_OUTDIR)/libleveldb.a: $(SIMULATOR_LIBOBJECTS)
|
||||
rm -f $@
|
||||
$(AR) -rs $@ $(SIMULATOR_LIBOBJECTS)
|
||||
|
||||
$(DEVICE_OUTDIR)/libmemenv.a: $(DEVICE_MEMENVOBJECTS)
|
||||
rm -f $@
|
||||
$(AR) -rs $@ $(DEVICE_MEMENVOBJECTS)
|
||||
|
||||
$(SIMULATOR_OUTDIR)/libmemenv.a: $(SIMULATOR_MEMENVOBJECTS)
|
||||
rm -f $@
|
||||
$(AR) -rs $@ $(SIMULATOR_MEMENVOBJECTS)
|
||||
|
||||
# For iOS, create universal object libraries to be used on both the simulator and
|
||||
# a device.
|
||||
$(STATIC_OUTDIR)/libleveldb.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a
|
||||
lipo -create $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a -output $@
|
||||
|
||||
$(STATIC_OUTDIR)/libmemenv.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a
|
||||
lipo -create $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a -output $@
|
||||
else
|
||||
$(STATIC_OUTDIR)/libleveldb.a:$(STATIC_LIBOBJECTS)
|
||||
rm -f $@
|
||||
$(AR) -rs $@ $(STATIC_LIBOBJECTS)
|
||||
|
||||
$(STATIC_OUTDIR)/libmemenv.a:$(STATIC_MEMENVOBJECTS)
|
||||
rm -f $@
|
||||
$(AR) -rs $@ $(STATIC_MEMENVOBJECTS)
|
||||
#
|
||||
# command line targets: debug and prof
|
||||
# just like
|
||||
ifneq ($(filter debug,$(MAKECMDGOALS)),)
|
||||
OPT := -g2 # (B) Debug mode, w/ full line-level debugging symbols
|
||||
debug: all
|
||||
endif
|
||||
|
||||
$(SHARED_MEMENVLIB):$(SHARED_MEMENVOBJECTS)
|
||||
ifneq ($(filter prof,$(MAKECMDGOALS)),)
|
||||
OPT := -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
|
||||
prof: all
|
||||
endif
|
||||
|
||||
|
||||
clean:
|
||||
-rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) */*.o */*/*.o */*.d */*/*.d ios-x86/*/*.o ios-arm/*/*.o build_config.mk include/leveldb/ldb_config.h
|
||||
-rm -rf ios-x86/* ios-arm/* *.dSYM
|
||||
|
||||
|
||||
$(LIBRARY): $(LIBOBJECTS)
|
||||
rm -f $@
|
||||
$(AR) -rs $@ $(SHARED_MEMENVOBJECTS)
|
||||
$(AR) -rs $@ $(LIBOBJECTS)
|
||||
|
||||
$(STATIC_OUTDIR)/db_bench:db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ $(LIBS)
|
||||
#
|
||||
# all tools, programs, and tests depend upon the static library
|
||||
$(TESTS) $(PROGRAMS) $(TOOLS) : $(LIBRARY)
|
||||
|
||||
$(STATIC_OUTDIR)/db_bench_sqlite3:doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lsqlite3 $(LIBS)
|
||||
#
|
||||
# all tests depend upon the test harness
|
||||
$(TESTS) : $(TESTHARNESS)
|
||||
|
||||
$(STATIC_OUTDIR)/db_bench_tree_db:doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lkyotocabinet $(LIBS)
|
||||
#
|
||||
# tools, programs, and tests will compile to the root directory
|
||||
# but their .cc source file will be in one of the following subdirectories
|
||||
vpath %.cc db:table:util:leveldb_ee:leveldb_os
|
||||
|
||||
$(STATIC_OUTDIR)/leveldbutil:db/leveldbutil.cc $(STATIC_LIBOBJECTS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/leveldbutil.cc $(STATIC_LIBOBJECTS) -o $@ $(LIBS)
|
||||
# special case for c_test
|
||||
vpath %.c db
|
||||
|
||||
$(STATIC_OUTDIR)/arena_test:util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
db_bench: db/db_bench.o $(LIBRARY) $(TESTUTIL)
|
||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTUTIL) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
|
||||
|
||||
$(STATIC_OUTDIR)/autocompact_test:db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBRARY) $(TESTUTIL)
|
||||
|
||||
$(STATIC_OUTDIR)/bloom_test:util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBRARY) $(TESTUTIL)
|
||||
|
||||
$(STATIC_OUTDIR)/c_test:$(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/cache_test:util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
#
|
||||
# build line taken from lz4 makefile
|
||||
#
|
||||
util/lz4.o: util/lz4.c util/lz4.h
|
||||
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -O3 -std=c99 -Wall -Wextra -Wundef -Wshadow -Wcast-qual -Wcast-align -Wstrict-prototypes -pedantic -DLZ4_VERSION=\"r130\" -c util/lz4.c -o util/lz4.o
|
||||
|
||||
$(STATIC_OUTDIR)/coding_test:util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
#
|
||||
# memory env
|
||||
#
|
||||
$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
|
||||
rm -f $@
|
||||
$(AR) -rs $@ $(MEMENVOBJECTS)
|
||||
|
||||
$(STATIC_OUTDIR)/corruption_test:db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
|
||||
$(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS)
|
||||
|
||||
$(STATIC_OUTDIR)/crc32c_test:util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
#
|
||||
# IOS build
|
||||
#
|
||||
ifeq ($(PLATFORM), IOS)
|
||||
# For iOS, create universal object files to be used on both the simulator and
|
||||
# a device.
|
||||
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
|
||||
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
|
||||
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
|
||||
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
|
||||
|
||||
$(STATIC_OUTDIR)/db_test:db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
.cc.o:
|
||||
mkdir -p ios-x86/$(dir $@)
|
||||
$(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
|
||||
mkdir -p ios-arm/$(dir $@)
|
||||
$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
|
||||
lipo ios-x86/$@ ios-arm/$@ -create -output $@
|
||||
|
||||
$(STATIC_OUTDIR)/dbformat_test:db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
.c.o:
|
||||
mkdir -p ios-x86/$(dir $@)
|
||||
$(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
|
||||
mkdir -p ios-arm/$(dir $@)
|
||||
$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
|
||||
lipo ios-x86/$@ ios-arm/$@ -create -output $@
|
||||
|
||||
$(STATIC_OUTDIR)/env_posix_test:util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/env_test:util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/fault_injection_test:db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/filename_test:db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/filter_block_test:table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/hash_test:util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/issue178_test:issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/issue200_test:issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/log_test:db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/recovery_test:db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/table_test:table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/skiplist_test:db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/version_edit_test:db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/version_set_test:db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/write_batch_test:db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(STATIC_OUTDIR)/memenv_test:$(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS)
|
||||
$(XCRUN) $(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS) -o $@ $(LIBS)
|
||||
|
||||
$(SHARED_OUTDIR)/db_bench:$(SHARED_OUTDIR)/db/db_bench.o $(SHARED_LIBS) $(TESTUTIL)
|
||||
$(XCRUN) $(CXX) $(LDFLAGS) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SHARED_OUTDIR)/db/db_bench.o $(TESTUTIL) $(SHARED_OUTDIR)/$(SHARED_LIB3) -o $@ $(LIBS)
|
||||
|
||||
.PHONY: run-shared
|
||||
run-shared: $(SHARED_OUTDIR)/db_bench
|
||||
LD_LIBRARY_PATH=$(SHARED_OUTDIR) $(SHARED_OUTDIR)/db_bench
|
||||
|
||||
$(SIMULATOR_OUTDIR)/%.o: %.cc
|
||||
xcrun -sdk iphonesimulator $(CXX) $(CXXFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@
|
||||
|
||||
$(DEVICE_OUTDIR)/%.o: %.cc
|
||||
xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) $(DEVICE_CFLAGS) -c $< -o $@
|
||||
|
||||
$(SIMULATOR_OUTDIR)/%.o: %.c
|
||||
xcrun -sdk iphonesimulator $(CC) $(CFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@
|
||||
|
||||
$(DEVICE_OUTDIR)/%.o: %.c
|
||||
xcrun -sdk iphoneos $(CC) $(CFLAGS) $(DEVICE_CFLAGS) -c $< -o $@
|
||||
|
||||
$(STATIC_OUTDIR)/%.o: %.cc
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
$(STATIC_OUTDIR)/%.o: %.c
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
$(SHARED_OUTDIR)/%.o: %.cc
|
||||
else
|
||||
#
|
||||
# build for everything NOT IOS
|
||||
#
|
||||
.cc.o:
|
||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
|
||||
|
||||
$(SHARED_OUTDIR)/%.o: %.c
|
||||
.c.o:
|
||||
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
|
||||
|
||||
$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
|
||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
|
||||
## @echo -- Creating dependency file for $<
|
||||
%.d: %.cc
|
||||
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM -E -MT $(basename $@).d -MT $(basename $@).o -MF $@ $<
|
||||
@echo $(basename $@).o: $(basename $@).d >>$@
|
||||
|
||||
$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
|
||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
|
||||
# generic build for command line tests
|
||||
%: %.cc
|
||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
|
||||
|
||||
%: db/%.c
|
||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
|
||||
|
||||
# for tools, omits test harness
|
||||
%: tools/%.cc
|
||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
|
||||
|
||||
endif
|
||||
|
||||
#
|
||||
# load dependency files
|
||||
#
|
||||
ifeq ($(filter tar clean allclean distclean,$(MAKECMDGOALS)),)
|
||||
-include $(DEPEND)
|
||||
endif
|
||||
|
|
83  src/leveldb/README  Normal file
@ -0,0 +1,83 @@
leveldb: A key-value store
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)

The original Google README is now README.GOOGLE.

** Introduction

This repository contains the Google source code as modified to benefit
the Riak environment. The typical Riak environment has two attributes
that necessitate leveldb adjustments, both in options and code:

- production servers: Riak often runs in heavy Internet environments:
  servers with many CPU cores, lots of memory, and 24x7 disk activity.
  Basho's leveldb takes advantage of the environment by adding
  hardware CRC calculation, increasing Bloom filter accuracy, and
  defaulting to integrity checking enabled.

- multiple databases open: Riak opens 8 to 128 databases
  simultaneously. Google's leveldb supports this, but its background
  compaction thread can fall behind. leveldb will "stall" new user
  writes whenever the compaction thread gets too far behind. Basho's
  leveldb modifications include multiple thread blocks that each
  contain prioritized threads for specific compaction activities.

Details for Basho's customizations exist in the leveldb wiki:

  http://github.com/basho/leveldb/wiki

** Branch pattern

This repository follows the Basho standard for branch management
as of November 28, 2013. The standard is found here:

  https://github.com/basho/riak/wiki/Basho-repository-management

In summary, the "develop" branch contains the most recently reviewed
engineering work. The "master" branch contains the most recently
released work, i.e. the code distributed as part of a Riak release.

** Basic options needed

Those wishing to truly savor the benefits of Basho's modifications
need to initialize a new leveldb::Options structure similar to the
following before each call to leveldb::DB::Open:

    leveldb::Options * options;

    options = new leveldb::Options;

    options->filter_policy = leveldb::NewBloomFilterPolicy2(16);
    options->write_buffer_size = 62914560;      // 60 Mbytes
    options->total_leveldb_mem = 2684354560;    // 2.5 Gbytes (details below)
    options->env = leveldb::Env::Default();
** Memory plan

Basho's leveldb dramatically departed from Google's original internal
memory allotment plan with Riak 2.0. Basho's leveldb uses a methodology
called flexcache. The technical details are here:

  https://github.com/basho/leveldb/wiki/mv-flexcache

The key points are:

- options.total_leveldb_mem is an allocation for the entire process,
  not a single database

- giving different values to options.total_leveldb_mem on subsequent Open
  calls causes memory to be rearranged to the current value across all databases

- the recommended minimum for Basho's leveldb is 340 Mbytes per database

- performance improves rapidly from 340 Mbytes to 2.5 Gbytes per database
  (3.0 Gbytes if using Riak's active anti-entropy). Even more is nice, but
  not as helpful.

- never assign more than 75% of available RAM to total_leveldb_mem. There is
  too much unaccounted memory overhead (worse if you use the tcmalloc library).

- options.max_open_files and options.block_cache should not be used.
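A minimal, self-contained sketch tying the options above together (editor's
illustration, not part of the repository: the database path is a placeholder,
NewBloomFilterPolicy2 and total_leveldb_mem are the Basho-specific pieces named
in this README, and everything else is the standard leveldb API):

    // open_example.cc -- illustrative only
    #include <cassert>
    #include "leveldb/db.h"
    #include "leveldb/env.h"
    #include "leveldb/filter_policy.h"

    int main() {
      leveldb::Options options;
      options.create_if_missing = true;
      options.filter_policy = leveldb::NewBloomFilterPolicy2(16);  // Basho bloom filter
      options.write_buffer_size = 62914560;                        // 60 Mbytes
      options.total_leveldb_mem = 2684354560;                      // 2.5 Gbytes, shared by all
                                                                   // databases in the process
      options.env = leveldb::Env::Default();

      leveldb::DB * db = NULL;
      leveldb::Status status = leveldb::DB::Open(options, "/tmp/example_db", &db);
      assert(status.ok());

      // ... reads and writes go here ...

      delete db;                       // close the database
      delete options.filter_policy;    // the caller owns the filter policy object
      return 0;
    }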
51  src/leveldb/README.GOOGLE  Normal file
@ -0,0 +1,51 @@
leveldb: A key-value store
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)

The code under this directory implements a system for maintaining a
persistent key/value store.

See doc/index.html for more explanation.
See doc/impl.html for a brief overview of the implementation.

The public interface is in include/*.h. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.

Guide to header files:

include/db.h
    Main interface to the DB: Start here

include/options.h
    Control over the behavior of an entire database, and also
    control over the behavior of individual reads and writes.

include/comparator.h
    Abstraction for user-specified comparison function. If you want
    just bytewise comparison of keys, you can use the default comparator,
    but clients can write their own comparator implementations if they
    want custom ordering (e.g. to handle different character
    encodings, etc.)

include/iterator.h
    Interface for iterating over data. You can get an iterator
    from a DB object.

include/write_batch.h
    Interface for atomically applying multiple updates to a database.

include/slice.h
    A simple module for maintaining a pointer and a length into some
    other byte array.

include/status.h
    Status is returned from many of the public interfaces and is used
    to report success and various kinds of errors.

include/env.h
    Abstraction of the OS environment. A posix implementation of
    this interface is in util/env_posix.cc

include/table.h
include/table_builder.h
    Lower-level modules that most clients probably won't use directly
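To make the header guide above concrete, here is a short editor's sketch that
exercises db.h, slice.h, status.h, write_batch.h and iterator.h through the
long-standing public API (the path /tmp/testdb is a placeholder):

    #include <cassert>
    #include <string>
    #include "leveldb/db.h"
    #include "leveldb/write_batch.h"

    int main() {
      leveldb::Options options;
      options.create_if_missing = true;

      leveldb::DB * db = NULL;
      leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
      assert(s.ok());

      // db.h / slice.h / status.h: single write and read
      s = db->Put(leveldb::WriteOptions(), "key1", "value1");
      std::string value;
      if (s.ok()) s = db->Get(leveldb::ReadOptions(), "key1", &value);

      // write_batch.h: apply several updates atomically
      leveldb::WriteBatch batch;
      batch.Delete("key1");
      batch.Put("key2", value);
      if (s.ok()) s = db->Write(leveldb::WriteOptions(), &batch);

      // iterator.h: scan the whole key space in sorted order
      leveldb::Iterator * it = db->NewIterator(leveldb::ReadOptions());
      for (it->SeekToFirst(); it->Valid(); it->Next()) {
        // it->key() and it->value() are Slices into internal storage
      }
      assert(it->status().ok());
      delete it;

      delete db;
      return 0;
    }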
@ -1,174 +0,0 @@
**LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.**

[![Build Status](https://travis-ci.org/google/leveldb.svg?branch=master)](https://travis-ci.org/google/leveldb)

Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)

# Features
  * Keys and values are arbitrary byte arrays.
  * Data is stored sorted by key.
  * Callers can provide a custom comparison function to override the sort order.
  * The basic operations are `Put(key,value)`, `Get(key)`, `Delete(key)`.
  * Multiple changes can be made in one atomic batch.
  * Users can create a transient snapshot to get a consistent view of data.
  * Forward and backward iteration is supported over the data.
  * Data is automatically compressed using the [Snappy compression library](http://google.github.io/snappy/).
  * External activity (file system operations etc.) is relayed through a virtual interface so users can customize the operating system interactions.

# Documentation
[LevelDB library documentation](https://github.com/google/leveldb/blob/master/doc/index.md) is online and bundled with the source code.

# Limitations
  * This is not a SQL database. It does not have a relational data model, it does not support SQL queries, and it has no support for indexes.
  * Only a single process (possibly multi-threaded) can access a particular database at a time.
  * There is no client-server support builtin to the library. An application that needs such support will have to wrap their own server around the library.

# Contributing to the leveldb Project
The leveldb project welcomes contributions. leveldb's primary goal is to be
a reliable and fast key/value store. Changes that are in line with the
features/limitations outlined above, and meet the requirements below,
will be considered.

Contribution requirements:

1. **POSIX only**. We _generally_ will only accept changes that are both
   compiled, and tested on a POSIX platform - usually Linux. Very small
   changes will sometimes be accepted, but consider that more of an
   exception than the rule.

2. **Stable API**. We strive very hard to maintain a stable API. Changes that
   require changes for projects using leveldb _might_ be rejected without
   sufficient benefit to the project.

3. **Tests**: All changes must be accompanied by a new (or changed) test, or
   a sufficient explanation as to why a new (or changed) test is not required.

## Submitting a Pull Request
Before any pull request will be accepted the author must first sign a
Contributor License Agreement (CLA) at https://cla.developers.google.com/.

In order to keep the commit timeline linear
[squash](https://git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Squashing-Commits)
your changes down to a single commit and [rebase](https://git-scm.com/docs/git-rebase)
on google/leveldb/master. This keeps the commit timeline linear and more easily sync'ed
with the internal repository at Google. More information at GitHub's
[About Git rebase](https://help.github.com/articles/about-git-rebase/) page.

# Performance

Here is a performance report (with explanations) from the run of the
included db_bench program. The results are somewhat noisy, but should
be enough to get a ballpark performance estimate.

## Setup

We use a database with a million entries. Each entry has a 16 byte
key, and a 100 byte value. Values used by the benchmark compress to
about half their original size.

    LevelDB:    version 1.1
    Date:       Sun May 1 12:11:26 2011
    CPU:        4 x Intel(R) Core(TM)2 Quad CPU Q6600 @ 2.40GHz
    CPUCache:   4096 KB
    Keys:       16 bytes each
    Values:     100 bytes each (50 bytes after compression)
    Entries:    1000000
    Raw Size:   110.6 MB (estimated)
    File Size:  62.9 MB (estimated)

## Write performance

The "fill" benchmarks create a brand new database, in either
sequential, or random order. The "fillsync" benchmark flushes data
from the operating system to the disk after every operation; the other
write operations leave the data sitting in the operating system buffer
cache for a while. The "overwrite" benchmark does random writes that
update existing keys in the database.

    fillseq      :       1.765 micros/op;   62.7 MB/s
    fillsync     :     268.409 micros/op;    0.4 MB/s (10000 ops)
    fillrandom   :       2.460 micros/op;   45.0 MB/s
    overwrite    :       2.380 micros/op;   46.5 MB/s

Each "op" above corresponds to a write of a single key/value pair.
I.e., a random write benchmark goes at approximately 400,000 writes per second.

Each "fillsync" operation costs much less (0.3 millisecond)
than a disk seek (typically 10 milliseconds). We suspect that this is
because the hard disk itself is buffering the update in its memory and
responding before the data has been written to the platter. This may
or may not be safe based on whether or not the hard disk has enough
power to save its memory in the event of a power failure.

## Read performance

We list the performance of reading sequentially in both the forward
and reverse direction, and also the performance of a random lookup.
Note that the database created by the benchmark is quite small.
Therefore the report characterizes the performance of leveldb when the
working set fits in memory. The cost of reading a piece of data that
is not present in the operating system buffer cache will be dominated
by the one or two disk seeks needed to fetch the data from disk.
Write performance will be mostly unaffected by whether or not the
working set fits in memory.

    readrandom  : 16.677 micros/op;  (approximately 60,000 reads per second)
    readseq     :  0.476 micros/op;  232.3 MB/s
    readreverse :  0.724 micros/op;  152.9 MB/s

LevelDB compacts its underlying storage data in the background to
improve read performance. The results listed above were done
immediately after a lot of random writes. The results after
compactions (which are usually triggered automatically) are better.

    readrandom  : 11.602 micros/op;  (approximately 85,000 reads per second)
    readseq     :  0.423 micros/op;  261.8 MB/s
    readreverse :  0.663 micros/op;  166.9 MB/s

Some of the high cost of reads comes from repeated decompression of blocks
read from disk. If we supply enough cache to the leveldb so it can hold the
uncompressed blocks in memory, the read performance improves again:

    readrandom  : 9.775 micros/op;  (approximately 100,000 reads per second before compaction)
    readrandom  : 5.215 micros/op;  (approximately 190,000 reads per second after compaction)

## Repository contents

See [doc/index.md](doc/index.md) for more explanation. See
[doc/impl.md](doc/impl.md) for a brief overview of the implementation.

The public interface is in include/*.h. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.

Guide to header files:

* **include/db.h**: Main interface to the DB: Start here

* **include/options.h**: Control over the behavior of an entire database,
and also control over the behavior of individual reads and writes.

* **include/comparator.h**: Abstraction for user-specified comparison function.
If you want just bytewise comparison of keys, you can use the default
comparator, but clients can write their own comparator implementations if they
want custom ordering (e.g. to handle different character encodings, etc.)

* **include/iterator.h**: Interface for iterating over data. You can get
an iterator from a DB object.

* **include/write_batch.h**: Interface for atomically applying multiple
updates to a database.

* **include/slice.h**: A simple module for maintaining a pointer and a
length into some other byte array.

* **include/status.h**: Status is returned from many of the public interfaces
and is used to report success and various kinds of errors.

* **include/env.h**:
Abstraction of the OS environment. A posix implementation of this interface is
in util/env_posix.cc

* **include/table.h, include/table_builder.h**: Lower-level modules that most
clients probably won't use directly
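The comparator.h entries above note that clients may supply their own key
ordering. As an editor's sketch only (the class name and the inverted ordering
are illustrative, not taken from this repository), a custom comparator can be
as small as:

    #include <string>
    #include "leveldb/comparator.h"
    #include "leveldb/slice.h"

    // Sorts keys in descending bytewise order by inverting the default comparison.
    class ReverseBytewiseComparator : public leveldb::Comparator {
     public:
      int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
        return -leveldb::BytewiseComparator()->Compare(a, b);
      }
      const char* Name() const { return "example.ReverseBytewiseComparator"; }
      // These only affect internal index-key shortening; no-ops are valid.
      void FindShortestSeparator(std::string* start, const leveldb::Slice& limit) const {}
      void FindShortSuccessor(std::string* key) const {}
    };

    // Usage: the comparator object must outlive the database.
    //   static ReverseBytewiseComparator cmp;
    //   options.comparator = &cmp;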
@ -7,7 +7,6 @@ db
  within [start_key..end_key]? For Chrome, deletion of obsolete
  object stores, etc. can be done in the background anyway, so
  probably not that important.
- There have been requests for MultiGet.

After a range is completely deleted, what gets rid of the
corresponding files if we do no future changes to that range. Make
@ -1,39 +0,0 @@
# Building LevelDB On Windows

## Prereqs

Install the [Windows Software Development Kit version 7.1](http://www.microsoft.com/downloads/dlx/en-us/listdetailsview.aspx?FamilyID=6b6c21d2-2006-4afa-9702-529fa782d63b).

Download and extract the [Snappy source distribution](http://snappy.googlecode.com/files/snappy-1.0.5.tar.gz)

1. Open the "Windows SDK 7.1 Command Prompt":
   Start Menu -> "Microsoft Windows SDK v7.1" > "Windows SDK 7.1 Command Prompt"
2. Change the directory to the leveldb project

## Building the Static lib

* 32 bit Version

      setenv /x86
      msbuild.exe /p:Configuration=Release /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5

* 64 bit Version

      setenv /x64
      msbuild.exe /p:Configuration=Release /p:Platform=x64 /p:Snappy=..\snappy-1.0.5

## Building and Running the Benchmark app

* 32 bit Version

      setenv /x86
      msbuild.exe /p:Configuration=Benchmark /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5
      Benchmark\leveldb.exe

* 64 bit Version

      setenv /x64
      msbuild.exe /p:Configuration=Benchmark /p:Platform=x64 /p:Snappy=..\snappy-1.0.5
      x64\Benchmark\leveldb.exe
@ -7,11 +7,8 @@
|
|||
# CC C Compiler path
|
||||
# CXX C++ Compiler path
|
||||
# PLATFORM_LDFLAGS Linker flags
|
||||
# PLATFORM_LIBS Libraries flags
|
||||
# PLATFORM_SHARED_EXT Extension for shared libraries
|
||||
# PLATFORM_SHARED_LDFLAGS Flags for building shared library
|
||||
# This flag is embedded just before the name
|
||||
# of the shared library without intervening spaces
|
||||
# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library
|
||||
# PLATFORM_CCFLAGS C compiler flags
|
||||
# PLATFORM_CXXFLAGS C++ compiler flags. Will contain:
|
||||
|
@ -20,15 +17,14 @@
|
|||
#
|
||||
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
|
||||
#
|
||||
# -DLEVELDB_ATOMIC_PRESENT if <atomic> is present
|
||||
# -DLEVELDB_CSTDATOMIC_PRESENT if <cstdatomic> is present
|
||||
# -DLEVELDB_PLATFORM_POSIX for Posix-based platforms
|
||||
# -DSNAPPY if the Snappy library is present
|
||||
#
|
||||
|
||||
OUTPUT=$1
|
||||
PREFIX=$2
|
||||
if test -z "$OUTPUT" || test -z "$PREFIX"; then
|
||||
echo "usage: $0 <output-filename> <directory_prefix>" >&2
|
||||
if test -z "$OUTPUT"; then
|
||||
echo "usage: $0 <output-filename>" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -44,10 +40,6 @@ if test -z "$CXX"; then
|
|||
CXX=g++
|
||||
fi
|
||||
|
||||
if test -z "$TMPDIR"; then
|
||||
TMPDIR=/tmp
|
||||
fi
|
||||
|
||||
# Detect OS
|
||||
if test -z "$TARGET_OS"; then
|
||||
TARGET_OS=`uname -s`
|
||||
|
@ -58,119 +50,77 @@ CROSS_COMPILE=
|
|||
PLATFORM_CCFLAGS=
|
||||
PLATFORM_CXXFLAGS=
|
||||
PLATFORM_LDFLAGS=
|
||||
PLATFORM_LIBS=
|
||||
PLATFORM_SHARED_EXT="so"
|
||||
PLATFORM_SHARED_EXT=
|
||||
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
|
||||
PLATFORM_SHARED_CFLAGS="-fPIC"
|
||||
PLATFORM_SHARED_VERSIONED=true
|
||||
PLATFORM_SSEFLAGS=
|
||||
|
||||
MEMCMP_FLAG=
|
||||
if [ "$CXX" = "g++" ]; then
|
||||
# Use libc's memcmp instead of GCC's memcmp. This results in ~40%
|
||||
# performance improvement on readrandom under gcc 4.4.3 on Linux/x86.
|
||||
MEMCMP_FLAG="-fno-builtin-memcmp"
|
||||
if test -n "$LEVELDB_VSN"; then
|
||||
VERSION_FLAGS="$VERSION_FLAGS -DLEVELDB_VSN=\"$LEVELDB_VSN\""
|
||||
fi
|
||||
|
||||
# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
|
||||
case "$TARGET_OS" in
|
||||
CYGWIN_*)
|
||||
PLATFORM=OS_LINUX
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN"
|
||||
PLATFORM_LDFLAGS="-lpthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
Darwin)
|
||||
PLATFORM=OS_MACOSX
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
|
||||
PLATFORM_SHARED_EXT=dylib
|
||||
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
|
||||
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/"
|
||||
oIFS="$IFS"; IFS=.
|
||||
set `uname -r`
|
||||
IFS="$oIFS"
|
||||
if [ "$1" -ge 13 ]; then
|
||||
# assume clang compiler
|
||||
COMMON_FLAGS="-mmacosx-version-min=10.8 -DOS_MACOSX -stdlib=libc++"
|
||||
PLATFORM_LDFLAGS="-mmacosx-version-min=10.8"
|
||||
else
|
||||
COMMON_FLAGS="-fno-builtin-memcmp -DOS_MACOSX"
|
||||
fi
|
||||
PLATFORM_SHARED_EXT=
|
||||
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
Linux)
|
||||
PLATFORM=OS_LINUX
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX"
|
||||
PLATFORM_LDFLAGS="-pthread"
|
||||
COMMON_FLAGS="-fno-builtin-memcmp -pthread -DOS_LINUX"
|
||||
PLATFORM_LDFLAGS="-pthread -lrt"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
SunOS)
|
||||
PLATFORM=OS_SOLARIS
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS"
|
||||
PLATFORM_LIBS="-lpthread -lrt"
|
||||
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS -m64"
|
||||
PLATFORM_LDFLAGS="-lpthread -lrt"
|
||||
PLATFORM_SHARED_EXT=
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
FreeBSD)
|
||||
CC=cc
|
||||
CXX=c++
|
||||
PLATFORM=OS_FREEBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD"
|
||||
PLATFORM_LIBS="-lpthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
GNU/kFreeBSD)
|
||||
PLATFORM=OS_KFREEBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_KFREEBSD"
|
||||
PLATFORM_LIBS="-lpthread"
|
||||
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
|
||||
PLATFORM_LDFLAGS="-lpthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
;;
|
||||
NetBSD)
|
||||
PLATFORM=OS_NETBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"
|
||||
PLATFORM_LIBS="-lpthread -lgcc_s"
|
||||
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
|
||||
PLATFORM_LDFLAGS="-lpthread -lgcc_s"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
OpenBSD)
|
||||
PLATFORM=OS_OPENBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD"
|
||||
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
|
||||
PLATFORM_LDFLAGS="-pthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
DragonFly)
|
||||
PLATFORM=OS_DRAGONFLYBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD"
|
||||
PLATFORM_LIBS="-lpthread"
|
||||
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
|
||||
PLATFORM_LDFLAGS="-lpthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
OS_ANDROID_CROSSCOMPILE)
|
||||
PLATFORM=OS_ANDROID
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
|
||||
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
|
||||
PLATFORM_LDFLAGS="" # All pthread features are in the Android C library
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
CROSS_COMPILE=true
|
||||
;;
|
||||
HP-UX)
|
||||
PLATFORM=OS_HPUX
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
|
||||
PLATFORM_LDFLAGS="-pthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
# man ld: +h internal_name
|
||||
PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
|
||||
;;
|
||||
IOS)
|
||||
PLATFORM=IOS
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
|
||||
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
PLATFORM_SHARED_EXT=
|
||||
PLATFORM_SHARED_LDFLAGS=
|
||||
PLATFORM_SHARED_CFLAGS=
|
||||
PLATFORM_SHARED_VERSIONED=
|
||||
;;
|
||||
OS_WINDOWS_CROSSCOMPILE | NATIVE_WINDOWS)
|
||||
PLATFORM=OS_WINDOWS
|
||||
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_WINDOWS -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1"
|
||||
PLATFORM_SOURCES="util/env_win.cc"
|
||||
PLATFORM_LIBS="-lshlwapi"
|
||||
PORT_FILE=port/port_win.cc
|
||||
CROSS_COMPILE=true
|
||||
;;
|
||||
*)
|
||||
|
@ -182,78 +132,106 @@ esac
|
|||
# except for the test and benchmark files. By default, find will output a list
|
||||
# of all files matching either rule, so we need to append -print to make the
|
||||
# prune take effect.
|
||||
DIRS="$PREFIX/db $PREFIX/util $PREFIX/table"
|
||||
|
||||
if [ -f leveldb_ee/README.md ]; then
|
||||
DIRS="util db table leveldb_ee"
|
||||
else
|
||||
DIRS="util db table leveldb_os"
|
||||
fi
|
||||
set -f # temporarily disable globbing so that our patterns aren't expanded
|
||||
PRUNE_TEST="-name *test*.cc -prune"
|
||||
PRUNE_BENCH="-name *_bench.cc -prune"
|
||||
PRUNE_TOOL="-name leveldbutil.cc -prune"
|
||||
PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_TOOL -o -name '*.cc' -print | sort | sed "s,^$PREFIX/,," | tr "\n" " "`
|
||||
|
||||
PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
|
||||
TESTS=`find $DIRS -name '*_test.c*' -print | sort | tr "\n" " "`
|
||||
set +f # re-enable globbing
|
||||
|
||||
# The sources consist of the portable files, plus the platform-specific port
|
||||
# file.
|
||||
echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT
|
||||
echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
|
||||
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
|
||||
echo "TEST_SOURCES=$TESTS" >>$OUTPUT
|
||||
|
||||
if [ "$CROSS_COMPILE" = "true" ]; then
|
||||
# Cross-compiling; do not try any compilation tests.
|
||||
true
|
||||
else
|
||||
CXXOUTPUT="${TMPDIR}/leveldb_build_detect_platform-cxx.$$"
|
||||
|
||||
# If -std=c++0x works, use <atomic> as fallback for when memory barriers
|
||||
# are not available.
|
||||
$CXX $CXXFLAGS -std=c++0x -x c++ - -o $CXXOUTPUT 2>/dev/null <<EOF
|
||||
#include <atomic>
|
||||
# If -std=c++0x works, use <cstdatomic>. Otherwise use port_posix.h.
|
||||
$CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||
#include <cstdatomic>
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_ATOMIC_PRESENT"
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT"
|
||||
PLATFORM_CXXFLAGS="-std=c++0x"
|
||||
else
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
|
||||
fi
|
||||
|
||||
# Test whether Snappy library is installed
|
||||
# http://code.google.com/p/snappy/
|
||||
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||
#include <snappy.h>
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
|
||||
if [ "$PLATFORM" = "OS_LINUX" ]; then
|
||||
# Basho: switching to static snappy library to make tools more portable
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -Wl,-non_shared -lsnappy -Wl,-call_shared"
|
||||
else
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Test whether tcmalloc is available
|
||||
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -ltcmalloc 2>/dev/null <<EOF
|
||||
$CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
PLATFORM_LIBS="$PLATFORM_LIBS -ltcmalloc"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
|
||||
fi
|
||||
|
||||
rm -f $CXXOUTPUT 2>/dev/null
|
||||
|
||||
# Test if gcc SSE 4.2 is supported
|
||||
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null <<EOF
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
PLATFORM_SSEFLAGS="-msse4.2"
|
||||
fi
|
||||
|
||||
rm -f $CXXOUTPUT 2>/dev/null
|
||||
fi
|
||||
|
||||
# Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them.
|
||||
if [ -n "$PLATFORM_SSEFLAGS" ]; then
|
||||
PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE"
|
||||
fi
|
||||
|
||||
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
|
||||
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
|
||||
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS $VERSION_FLAGS"
|
||||
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS $VERSION_FLAGS"
|
||||
|
||||
echo "CC=$CC" >> $OUTPUT
|
||||
echo "CXX=$CXX" >> $OUTPUT
|
||||
echo "PLATFORM=$PLATFORM" >> $OUTPUT
|
||||
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
|
||||
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
|
||||
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
|
||||
|
||||
#
|
||||
# Basho extension to place -D variable in include/leveldb/ldb_config.h
|
||||
#
|
||||
|
||||
LDB_CONFIG="include/leveldb/ldb_config.h"
|
||||
|
||||
# Delete existing output, if it exists
|
||||
rm -f $LDB_CONFIG
|
||||
|
||||
write_config_h()
|
||||
{
|
||||
for param in $@
|
||||
do
|
||||
prefix=$(expr -- $param : "\(..\)")
|
||||
if [ X$prefix = "X-D" ]
|
||||
then
|
||||
echo "" >>$LDB_CONFIG
|
||||
echo "#ifndef $(expr -- $param : '..\(.*\)')" >>$LDB_CONFIG
|
||||
echo " #define $(expr -- $param : '..\(.*\)')" >>$LDB_CONFIG
|
||||
echo "#endif" >>$LDB_CONFIG
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
echo "/** This file is generated by build_detect_platform." >$LDB_CONFIG
|
||||
echo " * It saves the state of compile flags. This benefits the reuse" >>$LDB_CONFIG
|
||||
echo " * of internal include files outside of a leveldb build." >>$LDB_CONFIG
|
||||
echo " */" >>$LDB_CONFIG
|
||||
|
||||
write_config_h $COMMON_FLAGS
|
||||
|
|
|
@ -1,118 +0,0 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "leveldb/db.h"
#include "db/db_impl.h"
#include "leveldb/cache.h"
#include "util/testharness.h"
#include "util/testutil.h"

namespace leveldb {

class AutoCompactTest {
public:
std::string dbname_;
Cache* tiny_cache_;
Options options_;
DB* db_;

AutoCompactTest() {
dbname_ = test::TmpDir() + "/autocompact_test";
tiny_cache_ = NewLRUCache(100);
options_.block_cache = tiny_cache_;
DestroyDB(dbname_, options_);
options_.create_if_missing = true;
options_.compression = kNoCompression;
ASSERT_OK(DB::Open(options_, dbname_, &db_));
}

~AutoCompactTest() {
delete db_;
DestroyDB(dbname_, Options());
delete tiny_cache_;
}

std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key%06d", i);
return std::string(buf);
}

uint64_t Size(const Slice& start, const Slice& limit) {
Range r(start, limit);
uint64_t size;
db_->GetApproximateSizes(&r, 1, &size);
return size;
}

void DoReads(int n);
};

static const int kValueSize = 200 * 1024;
static const int kTotalSize = 100 * 1024 * 1024;
static const int kCount = kTotalSize / kValueSize;

// Read through the first n keys repeatedly and check that they get
// compacted (verified by checking the size of the key space).
void AutoCompactTest::DoReads(int n) {
std::string value(kValueSize, 'x');
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);

// Fill database
for (int i = 0; i < kCount; i++) {
ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
}
ASSERT_OK(dbi->TEST_CompactMemTable());

// Delete everything
for (int i = 0; i < kCount; i++) {
ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
}
ASSERT_OK(dbi->TEST_CompactMemTable());

// Get initial measurement of the space we will be reading.
const int64_t initial_size = Size(Key(0), Key(n));
const int64_t initial_other_size = Size(Key(n), Key(kCount));

// Read until size drops significantly.
std::string limit_key = Key(n);
for (int read = 0; true; read++) {
ASSERT_LT(read, 100) << "Taking too long to compact";
Iterator* iter = db_->NewIterator(ReadOptions());
for (iter->SeekToFirst();
iter->Valid() && iter->key().ToString() < limit_key;
iter->Next()) {
// Drop data
}
delete iter;
// Wait a little bit to allow any triggered compactions to complete.
Env::Default()->SleepForMicroseconds(1000000);
uint64_t size = Size(Key(0), Key(n));
fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n",
read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0);
if (size <= initial_size/10) {
break;
}
}

// Verify that the size of the key space not touched by the reads
// is pretty much unchanged.
const int64_t final_other_size = Size(Key(n), Key(kCount));
ASSERT_LE(final_other_size, initial_other_size + 1048576);
ASSERT_GE(final_other_size, initial_other_size/5 - 1048576);
}

TEST(AutoCompactTest, ReadAll) {
DoReads(kCount);
}

TEST(AutoCompactTest, ReadHalf) {
DoReads(kCount/2);
}

} // namespace leveldb

int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

@ -2,12 +2,16 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#define __STDC_FORMAT_MACROS
#include <inttypes.h>

#include "db/builder.h"

#include "db/filename.h"
#include "db/dbformat.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/version_set.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/iterator.h"

@ -17,27 +21,51 @@ namespace leveldb {
Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
const Comparator * user_comparator,
TableCache* table_cache,
Iterator* iter,
FileMetaData* meta) {
FileMetaData* meta,
SequenceNumber smallest_snapshot) {
Status s;
size_t keys_seen, keys_retired;

keys_seen=0;
keys_retired=0;

meta->file_size = 0;
iter->SeekToFirst();

std::string fname = TableFileName(dbname, meta->number);
KeyRetirement retire(user_comparator, smallest_snapshot, &options);

std::string fname = TableFileName(options, meta->number, meta->level);
if (iter->Valid()) {
WritableFile* file;
s = env->NewWritableFile(fname, &file);

s = env->NewWritableFile(fname, &file,
env->RecoveryMmapSize(&options));
if (!s.ok()) {
return s;
}

// tune fadvise to keep all of this lower level file in page cache
// (compaction of unsorted files causes severe cache misses)
file->SetMetadataOffset(1);

TableBuilder* builder = new TableBuilder(options, file);
meta->smallest.DecodeFrom(iter->key());
for (; iter->Valid(); iter->Next()) {
++keys_seen;
Slice key = iter->key();
if (!retire(key))
{
meta->largest.DecodeFrom(key);
builder->Add(key, iter->value());
++meta->num_entries;
} // if
else
{
++keys_retired;
} // else
}

// Finish and check for builder errors

@ -45,6 +73,9 @@ Status BuildTable(const std::string& dbname,
s = builder->Finish();
if (s.ok()) {
meta->file_size = builder->FileSize();
meta->exp_write_low = builder->GetExpiryWriteLow();
meta->exp_write_high = builder->GetExpiryWriteHigh();
meta->exp_explicit_high = builder->GetExpiryExplicitHigh();
assert(meta->file_size > 0);
}
} else {

@ -64,10 +95,20 @@ Status BuildTable(const std::string& dbname,

if (s.ok()) {
// Verify that the table is usable
Table * table_ptr;
Iterator* it = table_cache->NewIterator(ReadOptions(),
meta->number,
meta->file_size);
meta->file_size,
meta->level,
&table_ptr);
s = it->status();

// Riak specific: bloom filter is no longer read by default,
// force read on highly used overlapped table files
if (s.ok() && VersionSet::IsLevelOverlapped(meta->level))
table_ptr->ReadFilter();

// table_ptr is owned by it and therefore invalidated by this delete
delete it;
}
}

@ -79,6 +120,11 @@ Status BuildTable(const std::string& dbname,

if (s.ok() && meta->file_size > 0) {
// Keep it
if (0!=keys_retired)
{
Log(options.info_log, "Level-0 table #%" PRIu64 ": %zd keys seen, %zd keys retired, %zd keys expired",
meta->number, keys_seen, retire.GetDroppedCount(), retire.GetExpiredCount());
} // if
} else {
env->DeleteFile(fname);
}

@ -6,6 +6,7 @@
#define STORAGE_LEVELDB_DB_BUILDER_H_

#include "leveldb/status.h"
#include "db/dbformat.h"

namespace leveldb {

@ -25,9 +26,11 @@ class VersionEdit;
extern Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
const Comparator * user_comparator,
TableCache* table_cache,
Iterator* iter,
FileMetaData* meta);
FileMetaData* meta,
SequenceNumber smallest_snapshot);

} // namespace leveldb

@ -6,6 +6,7 @@

#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include "leveldb/cache.h"
#include "leveldb/comparator.h"
#include "leveldb/db.h"

@ -40,6 +41,8 @@ using leveldb::Status;
using leveldb::WritableFile;
using leveldb::WriteBatch;
using leveldb::WriteOptions;
using leveldb::KeyMetaData;
using leveldb::ValueType;

extern "C" {

@ -49,6 +52,7 @@ struct leveldb_writebatch_t { WriteBatch rep; };
struct leveldb_snapshot_t { const Snapshot* rep; };
struct leveldb_readoptions_t { ReadOptions rep; };
struct leveldb_writeoptions_t { WriteOptions rep; };
struct leveldb_keymetadata_t { KeyMetaData rep; };
struct leveldb_options_t { Options rep; };
struct leveldb_cache_t { Cache* rep; };
struct leveldb_seqfile_t { SequentialFile* rep; };

@ -173,8 +177,19 @@ void leveldb_put(
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr) {
return(leveldb_put2(db, options, key, keylen, val, vallen, errptr, NULL));
}

void leveldb_put2(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr,
const leveldb_keymetadata_t * metadata) {
SaveError(errptr,
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen),
(NULL==metadata ? NULL : &metadata->rep)));
}

void leveldb_delete(

@ -200,9 +215,21 @@ char* leveldb_get(
const char* key, size_t keylen,
size_t* vallen,
char** errptr) {

return(leveldb_get2(db, options, key, keylen, vallen, errptr, NULL));
}

char* leveldb_get2(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr,
leveldb_keymetadata_t * metadata) {
char* result = NULL;
std::string tmp;
Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp,
(NULL==metadata ? NULL : &metadata->rep));
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);

@ -330,6 +357,15 @@ const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
return s.data();
}

const void leveldb_iter_keymetadata(const leveldb_iterator_t* iter,
leveldb_keymetadata_t * meta)
{
if (NULL!=iter && NULL!=meta)
{
meta->rep=iter->rep->keymetadata();
} // if
}

void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
SaveError(errptr, iter->rep->status());
}

@ -350,7 +386,16 @@ void leveldb_writebatch_put(
leveldb_writebatch_t* b,
const char* key, size_t klen,
const char* val, size_t vlen) {
b->rep.Put(Slice(key, klen), Slice(val, vlen));
leveldb_writebatch_put2(b, key, klen, val, vlen,NULL);
}

void leveldb_writebatch_put2(
leveldb_writebatch_t* b,
const char* key, size_t klen,
const char* val, size_t vlen,
const leveldb_keymetadata_t * metadata) {
b->rep.Put(Slice(key, klen), Slice(val, vlen),
(NULL==metadata ? NULL : &metadata->rep));
}

void leveldb_writebatch_delete(

@ -362,15 +407,20 @@ void leveldb_writebatch_delete(
void leveldb_writebatch_iterate(
leveldb_writebatch_t* b,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen,
const int & type, const uint64_t & expiry),
void (*deleted)(void*, const char* k, size_t klen)) {
class H : public WriteBatch::Handler {
public:
void* state_;
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen,
const int & type, const uint64_t & expiry);
void (*deleted_)(void*, const char* k, size_t klen);
virtual void Put(const Slice& key, const Slice& value) {
(*put_)(state_, key.data(), key.size(), value.data(), value.size());
virtual void Put(const Slice& key, const Slice& value,
const leveldb::ValueType & type,
const leveldb::ExpiryTimeMicros & expiry)
{
(*put_)(state_, key.data(), key.size(), value.data(), value.size(), (int)type, (uint64_t)expiry);
}
virtual void Delete(const Slice& key) {
(*deleted_)(state_, key.data(), key.size());

@ -418,6 +468,11 @@ void leveldb_options_set_paranoid_checks(
opt->rep.paranoid_checks = v;
}

void leveldb_options_set_verify_compactions(
leveldb_options_t* opt, unsigned char v) {
opt->rep.verify_compactions = v;
}

void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
opt->rep.env = (env ? env->rep : NULL);
}

@ -450,6 +505,10 @@ void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
opt->rep.compression = static_cast<CompressionType>(t);
}

void leveldb_options_set_total_leveldb_mem(leveldb_options_t* opt, size_t s) {
opt->rep.total_leveldb_mem = s;
}

leveldb_comparator_t* leveldb_comparator_create(
void* state,
void (*destructor)(void*),

@ -580,7 +639,17 @@ void leveldb_env_destroy(leveldb_env_t* env) {
delete env;
}

void leveldb_env_shutdown() {
Env::Shutdown();
}

/**
* CAUTION: this call is only for char * objects returned by
* functions like leveldb_get and leveldb_property_value.
* Also used to release errptr strings.
*/
void leveldb_free(void* ptr) {
if (NULL!=ptr)
free(ptr);
}

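As a quick orientation to the new expiry-aware C entry points added above, here is a minimal usage sketch. It mirrors the pattern used in the db/c_test.c changes that follow; the database handle, option objects, and key/value strings are placeholders, and like the test it leans on the C++ definitions of KeyMetaData and the value-type enums, so treat it as an illustration rather than a pure-C example.

// Sketch only: assumes an already-opened leveldb_t* db plus the usual
// leveldb_writeoptions_t / leveldb_readoptions_t objects from the C API.
#include "leveldb/c.h"
#include "leveldb/options.h"

struct leveldb_keymetadata_t { leveldb::KeyMetaData rep; };  // same trick as db/c_test.c

static void put_and_get_with_expiry(leveldb_t* db,
                                    const leveldb_writeoptions_t* woptions,
                                    const leveldb_readoptions_t* roptions) {
  char* err = NULL;
  leveldb_keymetadata_t meta;

  // Write "time" -> "ismoney" with an explicit expiry timestamp.
  meta.rep.m_Type = leveldb::kTypeValueExplicitExpiry;
  meta.rep.m_Expiry = 221199887766;   // placeholder expiry value
  leveldb_put2(db, woptions, "time", 4, "ismoney", 7, &err, &meta);

  // Read it back; leveldb_get2 also fills in the key's metadata.
  size_t vallen = 0;
  leveldb_keymetadata_t read_meta;
  char* val = leveldb_get2(db, roptions, "time", 4, &vallen, &err, &read_meta);
  // read_meta.rep.m_Type / read_meta.rep.m_Expiry now describe the stored key.
  leveldb_free(val);
  leveldb_free(err);
}
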
@ -3,6 +3,8 @@
|
|||
found in the LICENSE file. See the AUTHORS file for names of contributors. */
|
||||
|
||||
#include "leveldb/c.h"
|
||||
#include "leveldb/options.h"
|
||||
#include "port/port.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
@ -11,8 +13,13 @@
|
|||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
using leveldb::ValueType;
|
||||
|
||||
struct leveldb_keymetadata_t { leveldb::KeyMetaData rep; };
|
||||
|
||||
const char* phase = "";
|
||||
static char dbname[200];
|
||||
static leveldb::ExpiryTimeMicros gStartTime;
|
||||
|
||||
static void StartPhase(const char* name) {
|
||||
fprintf(stderr, "=== Test %s\n", name);
|
||||
|
@ -49,7 +56,7 @@ static void CheckEqual(const char* expected, const char* v, size_t n) {
|
|||
fprintf(stderr, "%s: expected '%s', got '%s'\n",
|
||||
phase,
|
||||
(expected ? expected : "(null)"),
|
||||
(v ? v : "(null"));
|
||||
(v ? v : "(null)"));
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
@ -112,6 +119,117 @@ static void CheckDel(void* ptr, const char* k, size_t klen) {
|
|||
(*state)++;
|
||||
}
|
||||
|
||||
// (expiry enabled)
|
||||
static void CheckGet2(
|
||||
leveldb_t* db,
|
||||
const leveldb_readoptions_t* options,
|
||||
const char* key,
|
||||
const char* expected,
|
||||
ValueType type,
|
||||
uint64_t expiry) {
|
||||
char* err = NULL;
|
||||
size_t val_len;
|
||||
char* val;
|
||||
leveldb_keymetadata_t meta;
|
||||
|
||||
val = leveldb_get2(db, options, key, strlen(key), &val_len, &err, &meta);
|
||||
CheckNoError(err);
|
||||
CheckEqual(expected, val, val_len);
|
||||
CheckCondition(type==meta.rep.m_Type);
|
||||
if (0==expiry && leveldb::kTypeValueWriteTime==type)
|
||||
{
|
||||
leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros();
|
||||
CheckCondition(gStartTime<=meta.rep.m_Expiry && meta.rep.m_Expiry<=now);
|
||||
} // if
|
||||
else
|
||||
{CheckCondition(expiry==meta.rep.m_Expiry);}
|
||||
|
||||
Free(&val);
|
||||
}
|
||||
|
||||
// (expiry enabled)
|
||||
static void CheckIter2(leveldb_iterator_t* iter,
|
||||
const char* key, const char* val,
|
||||
const leveldb::KeyMetaData & meta) {
|
||||
size_t len;
|
||||
const char* str;
|
||||
leveldb_keymetadata_t it_meta;
|
||||
|
||||
str = leveldb_iter_key(iter, &len);
|
||||
CheckEqual(key, str, len);
|
||||
str = leveldb_iter_value(iter, &len);
|
||||
CheckEqual(val, str, len);
|
||||
|
||||
leveldb_iter_keymetadata(iter, &it_meta);
|
||||
CheckCondition(meta.m_Type==it_meta.rep.m_Type);
|
||||
if (0==meta.m_Expiry && leveldb::kTypeValueWriteTime==meta.m_Type)
|
||||
{
|
||||
leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros();
|
||||
CheckCondition(gStartTime<=it_meta.rep.m_Expiry && it_meta.rep.m_Expiry<=now);
|
||||
} // if
|
||||
else
|
||||
{CheckCondition(meta.m_Expiry==it_meta.rep.m_Expiry);}
|
||||
|
||||
}
|
||||
|
||||
// Callback from leveldb_writebatch_iterate()
|
||||
// (expiry enabled)
|
||||
struct CheckPut2Data
|
||||
{
|
||||
const char * m_Key;
|
||||
const char * m_Value;
|
||||
ValueType m_Type;
|
||||
uint64_t m_Expiry;
|
||||
};
|
||||
|
||||
static struct CheckPut2Data gCheckPut2Data[]=
|
||||
{
|
||||
{"foo","hello_put2",leveldb::kTypeValue,0},
|
||||
{"box","c_put2",leveldb::kTypeValue,0},
|
||||
{"disney","cartoon_put2",leveldb::kTypeValueWriteTime, 0},
|
||||
{"money","lotsof_put2",leveldb::kTypeValueWriteTime, 9988776655},
|
||||
{"time","ismoney_put2",leveldb::kTypeValueExplicitExpiry, 221199887766}
|
||||
};
|
||||
|
||||
static struct CheckPut2Data gCheckPut2ItrData[]=
|
||||
{
|
||||
{"bar","b",leveldb::kTypeValue,0},
|
||||
{"box","c",leveldb::kTypeValue,0},
|
||||
{"bar","",leveldb::kTypeDeletion,0},
|
||||
{"mom","texas",leveldb::kTypeValueWriteTime,0},
|
||||
{"dad","poland",leveldb::kTypeValueExplicitExpiry,22446688}
|
||||
};
|
||||
|
||||
static void CheckPut2(void* ptr,
|
||||
const char* k, size_t klen,
|
||||
const char* v, size_t vlen,
|
||||
const int & type_int,
|
||||
const uint64_t & expiry) {
|
||||
int* state = (int*) ptr;
|
||||
CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0])));
|
||||
struct CheckPut2Data * test;
|
||||
|
||||
test=&gCheckPut2ItrData[*state];
|
||||
CheckEqual(test->m_Key, k, klen);
|
||||
CheckEqual(test->m_Value, v, vlen);
|
||||
CheckCondition((int)test->m_Type==type_int);
|
||||
if (leveldb::kTypeValueWriteTime!=test->m_Type)
|
||||
{CheckCondition((uint64_t)test->m_Expiry==expiry);}
|
||||
(*state)++;
|
||||
}
|
||||
|
||||
// Callback from leveldb_writebatch_iterate()
|
||||
// (expiry enabled)
|
||||
static void CheckDel2(void* ptr, const char* k, size_t klen) {
|
||||
int* state = (int*) ptr;
|
||||
CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0])));
|
||||
struct CheckPut2Data * test;
|
||||
|
||||
test=&gCheckPut2ItrData[*state];
|
||||
CheckEqual(test->m_Key, k, klen);
|
||||
(*state)++;
|
||||
}
|
||||
|
||||
static void CmpDestroy(void* arg) { }
|
||||
|
||||
static int CmpCompare(void* arg, const char* a, size_t alen,
|
||||
|
@ -141,7 +259,7 @@ static char* FilterCreate(
|
|||
int num_keys,
|
||||
size_t* filter_length) {
|
||||
*filter_length = 4;
|
||||
char* result = malloc(4);
|
||||
char* result = (char*)malloc(4);
|
||||
memcpy(result, "fake", 4);
|
||||
return result;
|
||||
}
|
||||
|
@ -167,6 +285,7 @@ int main(int argc, char** argv) {
|
|||
|
||||
CheckCondition(leveldb_major_version() >= 1);
|
||||
CheckCondition(leveldb_minor_version() >= 1);
|
||||
gStartTime=leveldb::port::TimeMicros();
|
||||
|
||||
snprintf(dbname, sizeof(dbname),
|
||||
"%s/leveldb_c_test-%d",
|
||||
|
@ -207,12 +326,6 @@ int main(int argc, char** argv) {
|
|||
CheckCondition(err != NULL);
|
||||
Free(&err);
|
||||
|
||||
StartPhase("leveldb_free");
|
||||
db = leveldb_open(options, dbname, &err);
|
||||
CheckCondition(err != NULL);
|
||||
leveldb_free(err);
|
||||
err = NULL;
|
||||
|
||||
StartPhase("open");
|
||||
leveldb_options_set_create_if_missing(options, 1);
|
||||
db = leveldb_open(options, dbname, &err);
|
||||
|
@ -234,42 +347,74 @@ int main(int argc, char** argv) {
|
|||
|
||||
StartPhase("writebatch");
|
||||
{
|
||||
leveldb_keymetadata_t meta;
|
||||
leveldb_writebatch_t* wb = leveldb_writebatch_create();
|
||||
leveldb_writebatch_put(wb, "foo", 3, "a", 1);
|
||||
leveldb_writebatch_clear(wb);
|
||||
leveldb_writebatch_put(wb, "bar", 3, "b", 1);
|
||||
leveldb_writebatch_put(wb, "box", 3, "c", 1);
|
||||
leveldb_writebatch_delete(wb, "bar", 3);
|
||||
meta.rep.m_Type=leveldb::kTypeValueWriteTime;
|
||||
meta.rep.m_Expiry=0;
|
||||
leveldb_writebatch_put2(wb, "mom", 3, "texas", 5, &meta);
|
||||
meta.rep.m_Type=leveldb::kTypeValueExplicitExpiry;
|
||||
meta.rep.m_Expiry=22446688;
|
||||
leveldb_writebatch_put2(wb, "dad", 3, "poland", 6, &meta);
|
||||
leveldb_write(db, woptions, wb, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "foo", "hello");
|
||||
CheckGet(db, roptions, "bar", NULL);
|
||||
CheckGet(db, roptions, "box", "c");
|
||||
CheckGet2(db, roptions, "dad", "poland", leveldb::kTypeValueExplicitExpiry, 22446688);
|
||||
CheckGet2(db, roptions, "mom", "texas", leveldb::kTypeValueWriteTime, 0);
|
||||
int pos = 0;
|
||||
leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
|
||||
CheckCondition(pos == 3);
|
||||
leveldb_writebatch_iterate(wb, &pos, CheckPut2, CheckDel2);
|
||||
CheckCondition(pos == 5);
|
||||
leveldb_writebatch_destroy(wb);
|
||||
}
|
||||
|
||||
// reminder: keymetadata not supported on backward iteration
|
||||
StartPhase("iter");
|
||||
{
|
||||
leveldb::KeyMetaData meta;
|
||||
leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
|
||||
CheckCondition(!leveldb_iter_valid(iter));
|
||||
leveldb_iter_seek_to_first(iter);
|
||||
CheckCondition(leveldb_iter_valid(iter));
|
||||
CheckIter(iter, "box", "c");
|
||||
meta.m_Type=leveldb::kTypeValue;
|
||||
meta.m_Expiry=0;
|
||||
CheckIter2(iter, "box", "c", meta);
|
||||
|
||||
meta.m_Type=leveldb::kTypeValueExplicitExpiry;
|
||||
meta.m_Expiry=22446688;
|
||||
leveldb_iter_next(iter);
|
||||
CheckIter2(iter, "dad", "poland", meta);
|
||||
leveldb_iter_next(iter);
|
||||
CheckIter(iter, "foo", "hello");
|
||||
leveldb_iter_prev(iter);
|
||||
CheckIter(iter, "dad", "poland");
|
||||
leveldb_iter_prev(iter);
|
||||
CheckIter(iter, "box", "c");
|
||||
leveldb_iter_prev(iter);
|
||||
CheckCondition(!leveldb_iter_valid(iter));
|
||||
leveldb_iter_seek_to_last(iter);
|
||||
CheckIter(iter, "foo", "hello");
|
||||
CheckIter(iter, "mom", "texas");
|
||||
leveldb_iter_seek(iter, "b", 1);
|
||||
CheckIter(iter, "box", "c");
|
||||
leveldb_iter_get_error(iter, &err);
|
||||
CheckNoError(err);
|
||||
|
||||
meta.m_Type=leveldb::kTypeValue;
|
||||
meta.m_Expiry=0;
|
||||
CheckIter2(iter, "box", "c", meta);
|
||||
leveldb_iter_seek(iter, "m", 1);
|
||||
meta.m_Type=leveldb::kTypeValueWriteTime;
|
||||
meta.m_Expiry=0;
|
||||
CheckIter2(iter, "mom", "texas", meta);
|
||||
leveldb_iter_get_error(iter, &err);
|
||||
CheckNoError(err);
|
||||
|
||||
leveldb_iter_destroy(iter);
|
||||
}
|
||||
|
||||
|
@ -335,6 +480,70 @@ int main(int argc, char** argv) {
|
|||
leveldb_options_set_error_if_exists(options, 1);
|
||||
}
|
||||
|
||||
StartPhase("put expiry");
|
||||
{
|
||||
leveldb_keymetadata_t meta;
|
||||
int loop, count;
|
||||
|
||||
count = sizeof(gCheckPut2Data) / sizeof(gCheckPut2Data[0]);
|
||||
|
||||
for (loop=0; loop<count; ++loop)
|
||||
{
|
||||
size_t klen, vlen;
|
||||
leveldb_keymetadata_t meta;
|
||||
struct CheckPut2Data * test;
|
||||
|
||||
test=&gCheckPut2Data[loop];
|
||||
klen=strlen(test->m_Key);
|
||||
vlen=strlen(test->m_Value);
|
||||
meta.rep.m_Type=test->m_Type;
|
||||
meta.rep.m_Expiry=test->m_Expiry;
|
||||
|
||||
leveldb_put2(db, woptions, test->m_Key, klen,
|
||||
test->m_Value, vlen, &err,
|
||||
&meta);
|
||||
CheckNoError(err);
|
||||
} // for
|
||||
|
||||
// testing memtable right now
|
||||
for (loop=0; loop<count; ++loop)
|
||||
{
|
||||
size_t klen, vlen;
|
||||
leveldb_keymetadata_t meta;
|
||||
struct CheckPut2Data * test;
|
||||
|
||||
test=&gCheckPut2Data[loop];
|
||||
klen=strlen(test->m_Key);
|
||||
vlen=strlen(test->m_Value);
|
||||
|
||||
CheckGet2(db, roptions, test->m_Key, test->m_Value,
|
||||
test->m_Type, test->m_Expiry);
|
||||
} // for
|
||||
|
||||
// close and open to force memory table into .sst upon open
|
||||
leveldb_close(db);
|
||||
leveldb_options_set_error_if_exists(options, 0);
|
||||
db = leveldb_open(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
|
||||
// now testing get from a level-0 .sst file
|
||||
for (loop=0; loop<count; ++loop)
|
||||
{
|
||||
size_t klen, vlen;
|
||||
leveldb_keymetadata_t meta;
|
||||
struct CheckPut2Data * test;
|
||||
|
||||
test=&gCheckPut2Data[loop];
|
||||
klen=strlen(test->m_Key);
|
||||
vlen=strlen(test->m_Value);
|
||||
|
||||
CheckGet2(db, roptions, test->m_Key, test->m_Value,
|
||||
test->m_Type, test->m_Expiry);
|
||||
} // for
|
||||
}
|
||||
|
||||
//
|
||||
// This screws up "options" for real database work. execute last.
|
||||
StartPhase("filter");
|
||||
for (run = 0; run < 2; run++) {
|
||||
// First run uses custom filter, second run uses bloom filter
|
||||
|
@ -376,6 +585,8 @@ int main(int argc, char** argv) {
|
|||
leveldb_filterpolicy_destroy(policy);
|
||||
}
|
||||
|
||||
|
||||
|
||||
StartPhase("cleanup");
|
||||
leveldb_close(db);
|
||||
leveldb_options_destroy(options);
|
||||
|
@ -386,5 +597,7 @@ int main(int argc, char** argv) {
|
|||
leveldb_env_destroy(env);
|
||||
|
||||
fprintf(stderr, "PASS\n");
|
||||
|
||||
leveldb_env_shutdown();
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -35,8 +35,8 @@ class CorruptionTest {
|
|||
CorruptionTest() {
|
||||
tiny_cache_ = NewLRUCache(100);
|
||||
options_.env = &env_;
|
||||
options_.block_cache = tiny_cache_;
|
||||
dbname_ = test::TmpDir() + "/corruption_test";
|
||||
dbname_ = test::TmpDir() + "/db_test";
|
||||
dbname_ = MakeTieredDbname(dbname_, options_);
|
||||
DestroyDB(dbname_, options_);
|
||||
|
||||
db_ = NULL;
|
||||
|
@ -51,14 +51,17 @@ class CorruptionTest {
|
|||
delete tiny_cache_;
|
||||
}
|
||||
|
||||
Status TryReopen() {
|
||||
Status TryReopen(Options* options = NULL) {
|
||||
delete db_;
|
||||
db_ = NULL;
|
||||
return DB::Open(options_, dbname_, &db_);
|
||||
Options opt = (options ? *options : options_);
|
||||
opt.env = &env_;
|
||||
opt.block_cache = tiny_cache_;
|
||||
return DB::Open(opt, dbname_, &db_);
|
||||
}
|
||||
|
||||
void Reopen() {
|
||||
ASSERT_OK(TryReopen());
|
||||
void Reopen(Options* options = NULL) {
|
||||
ASSERT_OK(TryReopen(options));
|
||||
}
|
||||
|
||||
void RepairDB() {
|
||||
|
@ -75,13 +78,7 @@ class CorruptionTest {
|
|||
Slice key = Key(i, &key_space);
|
||||
batch.Clear();
|
||||
batch.Put(key, Value(i, &value_space));
|
||||
WriteOptions options;
|
||||
// Corrupt() doesn't work without this sync on windows; stat reports 0 for
|
||||
// the file size.
|
||||
if (i == n - 1) {
|
||||
options.sync = true;
|
||||
}
|
||||
ASSERT_OK(db_->Write(options, &batch));
|
||||
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -96,10 +93,6 @@ class CorruptionTest {
|
|||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
uint64_t key;
|
||||
Slice in(iter->key());
|
||||
if (in == "" || in == "~") {
|
||||
// Ignore boundary keys.
|
||||
continue;
|
||||
}
|
||||
if (!ConsumeDecimalNumber(&in, &key) ||
|
||||
!in.empty() ||
|
||||
key < next_expected) {
|
||||
|
@ -123,19 +116,26 @@ class CorruptionTest {
|
|||
ASSERT_GE(max_expected, correct);
|
||||
}
|
||||
|
||||
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
|
||||
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt, int level=0) {
|
||||
// Pick file to corrupt
|
||||
std::vector<std::string> filenames;
|
||||
ASSERT_OK(env_.GetChildren(dbname_, &filenames));
|
||||
std::string dirname;
|
||||
if (leveldb::kTableFile!=filetype)
|
||||
dirname=dbname_;
|
||||
else
|
||||
dirname=MakeDirName2(options_, level, "sst");
|
||||
|
||||
ASSERT_OK(env_.GetChildren(dirname, &filenames));
|
||||
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
std::string fname;
|
||||
int picked_number = -1;
|
||||
for (size_t i = 0; i < filenames.size(); i++) {
|
||||
for (int i = 0; i < filenames.size(); i++) {
|
||||
if (ParseFileName(filenames[i], &number, &type) &&
|
||||
type == filetype &&
|
||||
int(number) > picked_number) { // Pick latest file
|
||||
fname = dbname_ + "/" + filenames[i];
|
||||
fname = dirname + "/" + filenames[i];
|
||||
picked_number = number;
|
||||
}
|
||||
}
|
||||
|
@ -222,12 +222,14 @@ TEST(CorruptionTest, NewFileErrorDuringWrite) {
|
|||
const int num = 3 + (Options().write_buffer_size / kValueSize);
|
||||
std::string value_storage;
|
||||
Status s;
|
||||
for (int i = 0; s.ok() && i < num; i++) {
|
||||
for (int i = 0;
|
||||
s.ok() && i < num && 0==env_.num_writable_file_errors_;
|
||||
i++) {
|
||||
WriteBatch batch;
|
||||
batch.Put("a", Value(100, &value_storage));
|
||||
s = db_->Write(WriteOptions(), &batch);
|
||||
}
|
||||
ASSERT_TRUE(!s.ok());
|
||||
// ASSERT_TRUE(!s.ok()); Background write thread will never report this
|
||||
ASSERT_GE(env_.num_writable_file_errors_, 1);
|
||||
env_.writable_file_error_ = false;
|
||||
Reopen();
|
||||
|
@ -240,34 +242,18 @@ TEST(CorruptionTest, TableFile) {
|
|||
dbi->TEST_CompactRange(0, NULL, NULL);
|
||||
dbi->TEST_CompactRange(1, NULL, NULL);
|
||||
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
Check(90, 99);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, TableFileRepair) {
|
||||
options_.block_size = 2 * kValueSize; // Limit scope of corruption
|
||||
options_.paranoid_checks = true;
|
||||
Reopen();
|
||||
Build(100);
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_CompactMemTable();
|
||||
dbi->TEST_CompactRange(0, NULL, NULL);
|
||||
dbi->TEST_CompactRange(1, NULL, NULL);
|
||||
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
RepairDB();
|
||||
Reopen();
|
||||
Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);
|
||||
Check(95, 99);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, TableFileIndexData) {
|
||||
Build(10000); // Enough to build multiple Tables
|
||||
Build(100000); // Enough to build multiple Tables
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_CompactMemTable();
|
||||
|
||||
Corrupt(kTableFile, -2000, 500);
|
||||
Corrupt(kTableFile, -2000, 500, config::kMaxMemCompactLevel);
|
||||
Reopen();
|
||||
Check(5000, 9999);
|
||||
Check(50000, 99999);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, MissingDescriptor) {
|
||||
|
@ -319,10 +305,10 @@ TEST(CorruptionTest, CompactionInputError) {
|
|||
Build(10);
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_CompactMemTable();
|
||||
const int last = config::kMaxMemCompactLevel;
|
||||
const int last = config::kMaxMemCompactLevel; // Riak does not "move" files
|
||||
ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
|
||||
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
Corrupt(kTableFile, 100, 1, last);
|
||||
Check(5, 9);
|
||||
|
||||
// Force compactions by writing lots of values
|
||||
|
@ -331,23 +317,42 @@ TEST(CorruptionTest, CompactionInputError) {
|
|||
}
|
||||
|
||||
TEST(CorruptionTest, CompactionInputErrorParanoid) {
|
||||
options_.paranoid_checks = true;
|
||||
options_.write_buffer_size = 512 << 10;
|
||||
Reopen();
|
||||
Options options;
|
||||
options.paranoid_checks = true;
|
||||
options.write_buffer_size = 1048576;
|
||||
Reopen(&options);
|
||||
|
||||
int current_corruption=Property("leveldb.ReadBlockError");
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
|
||||
// Make multiple inputs so we need to compact.
|
||||
for (int i = 0; i < 2; i++) {
|
||||
// Fill levels >= 1 so memtable compaction outputs to level 1
|
||||
// matthewv 1/10/14 - what does "levels" have to do with this,
|
||||
// switching to compaction trigger.
|
||||
// 7/10/14 - compaction starts between 4 and 6 files ... assume 4 and 1 move
|
||||
// (will make a new, descriptive constant for 4)
|
||||
for (int level = Property("leveldb.num-files-at-level0")+1;
|
||||
level < config::kL0_GroomingTrigger; level++) {
|
||||
dbi->Put(WriteOptions(), "", "begin");
|
||||
dbi->Put(WriteOptions(), "~", "end");
|
||||
dbi->TEST_CompactMemTable();
|
||||
}
|
||||
|
||||
Build(10);
|
||||
dbi->TEST_CompactMemTable();
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
env_.SleepForMicroseconds(100000);
|
||||
}
|
||||
dbi->CompactRange(NULL, NULL);
|
||||
ASSERT_TRUE(1 < Property("leveldb.num-files-at-level0"));
|
||||
|
||||
// Write must fail because of corrupted table
|
||||
Corrupt(kTableFile, 100, 1, 0);
|
||||
Check(5, 9);
|
||||
|
||||
// Write must eventually fail because of corrupted table
|
||||
Status s;
|
||||
std::string tmp1, tmp2;
|
||||
Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2));
|
||||
for (int i = 0; i < 10000 && s.ok(); i++) {
|
||||
s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
|
||||
}
|
||||
if (s.ok())
|
||||
ASSERT_NE(current_corruption, Property("leveldb.ReadBlockError")) << "no ReadBlockError seen";
|
||||
else
|
||||
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
|
||||
}
|
||||
|
||||
|
@ -355,7 +360,7 @@ TEST(CorruptionTest, UnrelatedKeys) {
|
|||
Build(10);
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_CompactMemTable();
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);
|
||||
|
||||
std::string tmp1, tmp2;
|
||||
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
|
||||
|
|
|
@ -33,7 +33,6 @@
|
|||
// readmissing -- read N missing keys in random order
|
||||
// readhot -- read N times in random order from 1% section of DB
|
||||
// seekrandom -- N random seeks
|
||||
// open -- cost of opening a DB
|
||||
// crc32c -- repeated crc32c of 4K of data
|
||||
// acquireload -- load N*1000 times
|
||||
// Meta operations:
|
||||
|
@ -84,14 +83,6 @@ static bool FLAGS_histogram = false;
|
|||
// (initialized to default value by "main")
|
||||
static int FLAGS_write_buffer_size = 0;
|
||||
|
||||
// Number of bytes written to each file.
|
||||
// (initialized to default value by "main")
|
||||
static int FLAGS_max_file_size = 0;
|
||||
|
||||
// Approximate size of user data packed per block (before compression.
|
||||
// (initialized to default value by "main")
|
||||
static int FLAGS_block_size = 0;
|
||||
|
||||
// Number of bytes to use as a cache of uncompressed data.
|
||||
// Negative means use default settings.
|
||||
static int FLAGS_cache_size = -1;
|
||||
|
@ -103,21 +94,26 @@ static int FLAGS_open_files = 0;
|
|||
// Negative means use default settings.
|
||||
static int FLAGS_bloom_bits = -1;
|
||||
|
||||
// Riak bloom adaptation
|
||||
static int FLAGS_bloom2_bits = -1;
|
||||
|
||||
// Riak param for total memory allocation (flex_cache)
|
||||
static uint64_t FLAGS_leveldb_memory = -1;
|
||||
|
||||
// Riak param for compression setting
|
||||
static int FLAGS_compression = 2;
|
||||
|
||||
// If true, do not destroy the existing database. If you set this
|
||||
// flag and also specify a benchmark that wants a fresh database, that
|
||||
// benchmark will fail.
|
||||
static bool FLAGS_use_existing_db = false;
|
||||
|
||||
// If true, reuse existing log/MANIFEST files when re-opening a database.
|
||||
static bool FLAGS_reuse_logs = false;
|
||||
|
||||
// Use the db with the following name.
|
||||
static const char* FLAGS_db = NULL;
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
namespace {
|
||||
leveldb::Env* g_env = NULL;
|
||||
|
||||
// Helper for quickly generating random data.
|
||||
class RandomGenerator {
|
||||
|
@ -141,7 +137,7 @@ class RandomGenerator {
|
|||
pos_ = 0;
|
||||
}
|
||||
|
||||
Slice Generate(size_t len) {
|
||||
Slice Generate(int len) {
|
||||
if (pos_ + len > data_.size()) {
|
||||
pos_ = 0;
|
||||
assert(len < data_.size());
|
||||
|
@ -151,19 +147,17 @@ class RandomGenerator {
|
|||
}
|
||||
};
|
||||
|
||||
#if defined(__linux)
|
||||
static Slice TrimSpace(Slice s) {
|
||||
size_t start = 0;
|
||||
int start = 0;
|
||||
while (start < s.size() && isspace(s[start])) {
|
||||
start++;
|
||||
}
|
||||
size_t limit = s.size();
|
||||
int limit = s.size();
|
||||
while (limit > start && isspace(s[limit-1])) {
|
||||
limit--;
|
||||
}
|
||||
return Slice(s.data() + start, limit - start);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void AppendWithSpace(std::string* str, Slice msg) {
|
||||
if (msg.empty()) return;
|
||||
|
@ -195,7 +189,7 @@ class Stats {
|
|||
done_ = 0;
|
||||
bytes_ = 0;
|
||||
seconds_ = 0;
|
||||
start_ = g_env->NowMicros();
|
||||
start_ = Env::Default()->NowMicros();
|
||||
finish_ = start_;
|
||||
message_.clear();
|
||||
}
|
||||
|
@ -213,7 +207,7 @@ class Stats {
|
|||
}
|
||||
|
||||
void Stop() {
|
||||
finish_ = g_env->NowMicros();
|
||||
finish_ = Env::Default()->NowMicros();
|
||||
seconds_ = (finish_ - start_) * 1e-6;
|
||||
}
|
||||
|
||||
|
@ -223,7 +217,7 @@ class Stats {
|
|||
|
||||
void FinishedSingleOp() {
|
||||
if (FLAGS_histogram) {
|
||||
double now = g_env->NowMicros();
|
||||
double now = Env::Default()->NowMicros();
|
||||
double micros = now - last_op_finish_;
|
||||
hist_.Add(micros);
|
||||
if (micros > 20000) {
|
||||
|
@ -405,7 +399,7 @@ class Benchmark {
|
|||
: cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
|
||||
filter_policy_(FLAGS_bloom_bits >= 0
|
||||
? NewBloomFilterPolicy(FLAGS_bloom_bits)
|
||||
: NULL),
|
||||
: (FLAGS_bloom2_bits >=0 ? NewBloomFilterPolicy2(FLAGS_bloom2_bits) : NULL)),
|
||||
db_(NULL),
|
||||
num_(FLAGS_num),
|
||||
value_size_(FLAGS_value_size),
|
||||
|
@ -413,10 +407,10 @@ class Benchmark {
|
|||
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
|
||||
heap_counter_(0) {
|
||||
std::vector<std::string> files;
|
||||
g_env->GetChildren(FLAGS_db, &files);
|
||||
for (size_t i = 0; i < files.size(); i++) {
|
||||
Env::Default()->GetChildren(FLAGS_db, &files);
|
||||
for (int i = 0; i < files.size(); i++) {
|
||||
if (Slice(files[i]).starts_with("heap-")) {
|
||||
g_env->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
|
||||
Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
|
||||
}
|
||||
}
|
||||
if (!FLAGS_use_existing_db) {
|
||||
|
@ -446,7 +440,7 @@ class Benchmark {
|
|||
benchmarks = sep + 1;
|
||||
}
|
||||
|
||||
// Reset parameters that may be overridden below
|
||||
// Reset parameters that may be overriddden bwlow
|
||||
num_ = FLAGS_num;
|
||||
reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
|
||||
value_size_ = FLAGS_value_size;
|
||||
|
@ -457,11 +451,7 @@ class Benchmark {
|
|||
bool fresh_db = false;
|
||||
int num_threads = FLAGS_threads;
|
||||
|
||||
if (name == Slice("open")) {
|
||||
method = &Benchmark::OpenBench;
|
||||
num_ /= 10000;
|
||||
if (num_ < 1) num_ = 1;
|
||||
} else if (name == Slice("fillseq")) {
|
||||
if (name == Slice("fillseq")) {
|
||||
fresh_db = true;
|
||||
method = &Benchmark::WriteSeq;
|
||||
} else if (name == Slice("fillbatch")) {
|
||||
|
@ -553,6 +543,7 @@ class Benchmark {
|
|||
SharedState* shared;
|
||||
ThreadState* thread;
|
||||
void (Benchmark::*method)(ThreadState*);
|
||||
pthread_t thread_id;
|
||||
};
|
||||
|
||||
static void ThreadBody(void* v) {
|
||||
|
@ -598,7 +589,8 @@ class Benchmark {
|
|||
arg[i].shared = &shared;
|
||||
arg[i].thread = new ThreadState(i);
|
||||
arg[i].thread->shared = &shared;
|
||||
g_env->StartThread(ThreadBody, &arg[i]);
|
||||
arg[i].thread_id=Env::Default()->StartThread(ThreadBody, &arg[i]);
|
||||
pthread_detach(arg[i].thread_id);
|
||||
}
|
||||
|
||||
shared.mu.Lock();
|
||||
|
@ -709,15 +701,12 @@ class Benchmark {
|
|||
void Open() {
|
||||
assert(db_ == NULL);
|
||||
Options options;
|
||||
options.env = g_env;
|
||||
options.create_if_missing = !FLAGS_use_existing_db;
|
||||
options.block_cache = cache_;
|
||||
options.write_buffer_size = FLAGS_write_buffer_size;
|
||||
options.max_file_size = FLAGS_max_file_size;
|
||||
options.block_size = FLAGS_block_size;
|
||||
options.max_open_files = FLAGS_open_files;
|
||||
options.filter_policy = filter_policy_;
|
||||
options.reuse_logs = FLAGS_reuse_logs;
|
||||
options.compression = (leveldb::CompressionType)FLAGS_compression;
|
||||
options.total_leveldb_mem = FLAGS_leveldb_memory;
|
||||
Status s = DB::Open(options, FLAGS_db, &db_);
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
|
||||
|
@ -725,14 +714,6 @@ class Benchmark {
|
|||
}
|
||||
}
|
||||
|
||||
void OpenBench(ThreadState* thread) {
|
||||
for (int i = 0; i < num_; i++) {
|
||||
delete db_;
|
||||
Open();
|
||||
thread->stats.FinishedSingleOp();
|
||||
}
|
||||
}
|
||||
|
||||
void WriteSeq(ThreadState* thread) {
|
||||
DoWrite(thread, true);
|
||||
}
|
||||
|
@ -842,6 +823,7 @@ class Benchmark {
|
|||
|
||||
void SeekRandom(ThreadState* thread) {
|
||||
ReadOptions options;
|
||||
std::string value;
|
||||
int found = 0;
|
||||
for (int i = 0; i < reads_; i++) {
|
||||
Iterator* iter = db_->NewIterator(options);
|
||||
|
@ -937,7 +919,7 @@ class Benchmark {
|
|||
char fname[100];
|
||||
snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_);
|
||||
WritableFile* file;
|
||||
Status s = g_env->NewWritableFile(fname, &file);
|
||||
Status s = Env::Default()->NewWritableFile(fname, &file, 2<<20);
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "%s\n", s.ToString().c_str());
|
||||
return;
|
||||
|
@ -946,7 +928,7 @@ class Benchmark {
|
|||
delete file;
|
||||
if (!ok) {
|
||||
fprintf(stderr, "heap profiling not supported\n");
|
||||
g_env->DeleteFile(fname);
|
||||
Env::Default()->DeleteFile(fname);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -955,14 +937,14 @@ class Benchmark {
|
|||
|
||||
int main(int argc, char** argv) {
|
||||
FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
|
||||
FLAGS_max_file_size = leveldb::Options().max_file_size;
|
||||
FLAGS_block_size = leveldb::Options().block_size;
|
||||
FLAGS_open_files = leveldb::Options().max_open_files;
|
||||
FLAGS_leveldb_memory = 25000000000LL;
|
||||
std::string default_db_path;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
double d;
|
||||
int n;
|
||||
uint64_t u;
|
||||
char junk;
|
||||
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
|
||||
FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
|
||||
|
@ -974,9 +956,6 @@ int main(int argc, char** argv) {
|
|||
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
|
||||
(n == 0 || n == 1)) {
|
||||
FLAGS_use_existing_db = n;
|
||||
} else if (sscanf(argv[i], "--reuse_logs=%d%c", &n, &junk) == 1 &&
|
||||
(n == 0 || n == 1)) {
|
||||
FLAGS_reuse_logs = n;
|
||||
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_num = n;
|
||||
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
|
||||
|
@ -987,16 +966,18 @@ int main(int argc, char** argv) {
|
|||
FLAGS_value_size = n;
|
||||
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_write_buffer_size = n;
|
||||
} else if (sscanf(argv[i], "--max_file_size=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_max_file_size = n;
|
||||
} else if (sscanf(argv[i], "--block_size=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_block_size = n;
|
||||
} else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_cache_size = n;
|
||||
} else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_bloom_bits = n;
|
||||
} else if (sscanf(argv[i], "--bloom_bits2=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_bloom2_bits = n;
|
||||
} else if (sscanf(argv[i], "--leveldb_memory=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_leveldb_memory = n * 1024 * 1024LL;
|
||||
} else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_open_files = n;
|
||||
} else if (sscanf(argv[i], "--compression=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_compression = n;
|
||||
} else if (strncmp(argv[i], "--db=", 5) == 0) {
|
||||
FLAGS_db = argv[i] + 5;
|
||||
} else {
|
||||
|
@ -1005,16 +986,20 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
leveldb::g_env = leveldb::Env::Default();
|
||||
|
||||
// Choose a location for the test database if none given with --db=<path>
|
||||
if (FLAGS_db == NULL) {
|
||||
leveldb::g_env->GetTestDirectory(&default_db_path);
|
||||
leveldb::Env::Default()->GetTestDirectory(&default_db_path);
|
||||
default_db_path += "/dbbench";
|
||||
FLAGS_db = default_db_path.c_str();
|
||||
}
|
||||
|
||||
// benchmark class needs to destruct before Shutdown call
|
||||
{
|
||||
leveldb::Benchmark benchmark;
|
||||
benchmark.Run();
|
||||
}
|
||||
|
||||
leveldb::Env::Shutdown();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
@ -13,7 +13,7 @@
|
|||
#include "leveldb/db.h"
|
||||
#include "leveldb/env.h"
|
||||
#include "port/port.h"
|
||||
#include "port/thread_annotations.h"
|
||||
#include "util/cache2.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
|
@ -29,26 +29,37 @@ class DBImpl : public DB {
|
|||
virtual ~DBImpl();
|
||||
|
||||
// Implementations of the DB interface
|
||||
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
|
||||
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value, const KeyMetaData * meta=NULL);
|
||||
virtual Status Delete(const WriteOptions&, const Slice& key);
|
||||
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
std::string* value);
|
||||
std::string* value,
|
||||
KeyMetaData * meta=NULL);
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
Value* value,
|
||||
KeyMetaData * meta=NULL);
|
||||
virtual Iterator* NewIterator(const ReadOptions&);
|
||||
virtual const Snapshot* GetSnapshot();
|
||||
virtual void ReleaseSnapshot(const Snapshot* snapshot);
|
||||
virtual bool GetProperty(const Slice& property, std::string* value);
|
||||
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
|
||||
virtual void CompactRange(const Slice* begin, const Slice* end);
|
||||
virtual Status VerifyLevels();
|
||||
virtual void CheckAvailableCompactions();
|
||||
virtual Logger* GetLogger() const { return options_.info_log; }
|
||||
|
||||
// Extra methods (for testing) that are not in the public DB interface
|
||||
|
||||
const Options & GetOptions() const { return options_; };
|
||||
|
||||
// Compact any files in the named level that overlap [*begin,*end]
|
||||
void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
|
||||
|
||||
// Force current memtable contents to be compacted.
|
||||
Status TEST_CompactMemTable();
|
||||
// Force current memtable contents to be compacted, waits for completion
|
||||
Status CompactMemTableSynchronous();
|
||||
Status TEST_CompactMemTable(); // wraps CompactMemTableSynchronous (historical)
|
||||
|
||||
// Return an internal iterator over the current state of the database.
|
||||
// The keys of this iterator are internal keys (see format.h).
|
||||
|
@ -59,64 +70,82 @@ class DBImpl : public DB {
|
|||
// file at a level >= 1.
|
||||
int64_t TEST_MaxNextLevelOverlappingBytes();
|
||||
|
||||
// Record a sample of bytes read at the specified internal key.
|
||||
// Samples are taken approximately once every config::kReadBytesPeriod
|
||||
// bytes.
|
||||
void RecordReadSample(Slice key);
|
||||
// These are routines that DBListImpl calls across all open databases
|
||||
void ResizeCaches() {double_cache.ResizeCaches();};
|
||||
size_t GetCacheCapacity() {return(double_cache.GetCapacity(false));}
|
||||
void PurgeExpiredFileCache() {double_cache.PurgeExpiredFiles();};
|
||||
|
||||
private:
|
||||
// in util/hot_backup.cc
|
||||
void HotBackup();
|
||||
bool PurgeWriteBuffer();
|
||||
bool WriteBackupManifest();
|
||||
bool CreateBackupLinks(Version * Version, Options & BackupOptions);
|
||||
bool CopyLOGSegment(long FileEnd);
|
||||
void HotBackupComplete();
|
||||
|
||||
void BackgroundCall2(Compaction * Compact);
|
||||
void BackgroundImmCompactCall();
|
||||
bool IsCompactionScheduled();
|
||||
uint32_t RunningCompactionCount() {mutex_.AssertHeld(); return(running_compactions_);};
|
||||
|
||||
protected:
|
||||
friend class DB;
|
||||
struct CompactionState;
|
||||
struct Writer;
|
||||
|
||||
Iterator* NewInternalIterator(const ReadOptions&,
|
||||
SequenceNumber* latest_snapshot,
|
||||
uint32_t* seed);
|
||||
SequenceNumber* latest_snapshot);
|
||||
|
||||
Status NewDB();
|
||||
|
||||
// Recover the descriptor from persistent storage. May do a significant
|
||||
// amount of work to recover recently logged updates. Any changes to
|
||||
// be made to the descriptor are added to *edit.
|
||||
Status Recover(VersionEdit* edit, bool* save_manifest)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
||||
Status Recover(VersionEdit* edit);
|
||||
|
||||
// Riak routine: pause DB::Open if too many compactions
|
||||
// stacked up immediately. Happens in some repairs and
|
||||
// some Riak upgrades
|
||||
void CheckCompactionState();
|
||||
|
||||
void MaybeIgnoreError(Status* s) const;
|
||||
|
||||
// Delete any unneeded files and stale in-memory entries.
|
||||
void DeleteObsoleteFiles();
|
||||
void KeepOrDelete(const std::string & Filename, int level, const std::set<uint64_t> & Live);
|
||||
|
||||
// Compact the in-memory write buffer to disk. Switches to a new
|
||||
// log-file/memtable and writes a new descriptor iff successful.
|
||||
// Errors are recorded in bg_error_.
|
||||
void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
||||
Status CompactMemTable();
|
||||
|
||||
Status RecoverLogFile(uint64_t log_number, bool last_log, bool* save_manifest,
|
||||
VersionEdit* edit, SequenceNumber* max_sequence)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
||||
Status RecoverLogFile(uint64_t log_number,
|
||||
VersionEdit* edit,
|
||||
SequenceNumber* max_sequence);
|
||||
|
||||
Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
||||
Status WriteLevel0Table(volatile MemTable* mem, VersionEdit* edit, Version* base);
|
||||
|
||||
Status MakeRoomForWrite(bool force /* TRUE forces memtable rotation to disk (for testing) */);
|
||||
Status NewRecoveryLog(uint64_t NewLogNumber);
|
||||
|
||||
Status MakeRoomForWrite(bool force /* compact even if there is room? */)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
||||
WriteBatch* BuildBatchGroup(Writer** last_writer);
|
||||
|
||||
void RecordBackgroundError(const Status& s);
|
||||
void MaybeScheduleCompaction();
|
||||
|
||||
void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
||||
static void BGWork(void* db);
|
||||
void BackgroundCall();
|
||||
void BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
||||
void CleanupCompaction(CompactionState* compact)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
||||
Status DoCompactionWork(CompactionState* compact)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
||||
Status BackgroundCompaction(Compaction * Compact=NULL);
|
||||
Status BackgroundExpiry(Compaction * Compact=NULL);
|
||||
|
||||
Status OpenCompactionOutputFile(CompactionState* compact);
|
||||
void CleanupCompaction(CompactionState* compact);
|
||||
Status DoCompactionWork(CompactionState* compact);
|
||||
int64_t PrioritizeWork(bool IsLevel0);
|
||||
|
||||
Status OpenCompactionOutputFile(CompactionState* compact, size_t sample_value_size);
|
||||
bool Send2PageCache(CompactionState * compact);
|
||||
size_t MaybeRaiseBlockSize(Compaction & CompactionStuff, size_t SampleValueSize);
|
||||
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
|
||||
Status InstallCompactionResults(CompactionState* compact)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
||||
Status InstallCompactionResults(CompactionState* compact);
|
||||
|
||||
// initialized before options so its block_cache is available
|
||||
class DoubleCache double_cache;
|
||||
|
||||
// Constant after construction
|
||||
Env* const env_;
|
||||
|
@ -130,20 +159,22 @@ class DBImpl : public DB {
|
|||
// table_cache_ provides its own synchronization
|
||||
TableCache* table_cache_;
|
||||
|
||||
|
||||
// Lock over the persistent DB state. Non-NULL iff successfully acquired.
|
||||
FileLock* db_lock_;
|
||||
|
||||
// State below is protected by mutex_
|
||||
port::Mutex mutex_;
|
||||
port::Mutex throttle_mutex_; // used by write throttle to force sequential waits on callers
|
||||
port::AtomicPointer shutting_down_;
|
||||
|
||||
port::CondVar bg_cv_; // Signalled when background work finishes
|
||||
MemTable* mem_;
|
||||
MemTable* imm_; // Memtable being compacted
|
||||
volatile MemTable* imm_; // Memtable being compacted
|
||||
port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_
|
||||
WritableFile* logfile_;
|
||||
uint64_t logfile_number_;
|
||||
log::Writer* log_;
|
||||
uint32_t seed_; // For sampling.
|
||||
|
||||
// Queue of writers.
|
||||
std::deque<Writer*> writers_;
|
||||
|
@ -155,9 +186,6 @@ class DBImpl : public DB {
|
|||
// part of ongoing compactions.
|
||||
std::set<uint64_t> pending_outputs_;
|
||||
|
||||
// Has a background compaction been scheduled or is running?
|
||||
bool bg_compaction_scheduled_;
|
||||
|
||||
// Information for a manual compaction
|
||||
struct ManualCompaction {
|
||||
int level;
|
||||
|
@ -166,7 +194,7 @@ class DBImpl : public DB {
|
|||
const InternalKey* end; // NULL means end of key range
|
||||
InternalKey tmp_storage; // Used to keep track of compaction progress
|
||||
};
|
||||
ManualCompaction* manual_compaction_;
|
||||
volatile ManualCompaction* manual_compaction_;
|
||||
|
||||
VersionSet* versions_;
|
||||
|
||||
|
@ -190,6 +218,18 @@ class DBImpl : public DB {
|
|||
};
|
||||
CompactionStats stats_[config::kNumLevels];
|
||||
|
||||
volatile uint64_t throttle_end;
|
||||
volatile uint32_t running_compactions_;
|
||||
volatile size_t current_block_size_; // last dynamic block size computed
|
||||
volatile uint64_t block_size_changed_; // NowMicros() when block size computed
|
||||
volatile uint64_t last_low_mem_; // NowMicros() when low memory last seen
|
||||
|
||||
// accessor to new, dynamic block_cache
|
||||
Cache * block_cache() {return(double_cache.GetBlockCache());};
|
||||
Cache * file_cache() {return(double_cache.GetFileCache());};
|
||||
|
||||
volatile bool hotbackup_pending_;
|
||||
|
||||
// No copying allowed
|
||||
DBImpl(const DBImpl&);
|
||||
void operator=(const DBImpl&);
|
||||
|
@ -204,7 +244,8 @@ class DBImpl : public DB {
|
|||
extern Options SanitizeOptions(const std::string& db,
|
||||
const InternalKeyComparator* icmp,
|
||||
const InternalFilterPolicy* ipolicy,
|
||||
const Options& src);
|
||||
const Options& src,
|
||||
Cache * block_cache);
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
|
|
|
@@ -5,14 +5,14 @@

#include "db/db_iter.h"

#include "db/filename.h"
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "leveldb/env.h"
#include "leveldb/expiry.h"
#include "leveldb/iterator.h"
#include "leveldb/perf_count.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/random.h"

namespace leveldb {

@@ -48,18 +48,20 @@ class DBIter: public Iterator {
kReverse
};

DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s,
uint32_t seed)
: db_(db),
DBIter(const std::string* dbname, Env* env,
const Comparator* cmp, Iterator* iter, SequenceNumber s,
const ExpiryModule * expiry)
: dbname_(dbname),
env_(env),
user_comparator_(cmp),
iter_(iter),
sequence_(s),
direction_(kForward),
valid_(false),
rnd_(seed),
bytes_counter_(RandomPeriod()) {
expiry_(expiry) {
}
virtual ~DBIter() {
gPerfCounters->Inc(ePerfIterDelete);
delete iter_;
}
virtual bool Valid() const { return valid_; }

@@ -71,6 +73,26 @@ class DBIter: public Iterator {
assert(valid_);
return (direction_ == kForward) ? iter_->value() : saved_value_;
}
// Riak specific: if a database iterator, returns key meta data
// REQUIRES: Valid() and forward iteration
//  (reverse iteration is possible, just needs code)
virtual KeyMetaData & keymetadata() const
{
assert(valid_ && kForward==direction_);
if (kForward==direction_)
{
ParsedInternalKey parsed;
// this initialization clears a warning. ParsedInternalKey says
//  it is not initializing for performance reasons ... oh well
parsed.type=kTypeValue; parsed.sequence=0; parsed.expiry=0;
ParseInternalKey(iter_->key(), &parsed);
keymetadata_.m_Type=parsed.type;
keymetadata_.m_Sequence=parsed.sequence;
keymetadata_.m_Expiry=parsed.expiry;
}
return(keymetadata_);
}

virtual Status status() const {
if (status_.ok()) {
return iter_->status();

@@ -103,12 +125,8 @@ class DBIter: public Iterator {
}
}

// Pick next gap with average value of config::kReadBytesPeriod.
ssize_t RandomPeriod() {
return rnd_.Uniform(2*config::kReadBytesPeriod);
}

DBImpl* db_;
const std::string* const dbname_;
Env* const env_;
const Comparator* const user_comparator_;
Iterator* const iter_;
SequenceNumber const sequence_;

@@ -118,9 +136,7 @@ class DBIter: public Iterator {
std::string saved_value_;   // == current raw value when direction_==kReverse
Direction direction_;
bool valid_;

Random rnd_;
ssize_t bytes_counter_;
const ExpiryModule * expiry_;

// No copying allowed
DBIter(const DBIter&);

@@ -128,14 +144,7 @@ class DBIter: public Iterator {
};

inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
Slice k = iter_->key();
ssize_t n = k.size() + iter_->value().size();
bytes_counter_ -= n;
while (bytes_counter_ < 0) {
bytes_counter_ += RandomPeriod();
db_->RecordReadSample(k);
}
if (!ParseInternalKey(k, ikey)) {
if (!ParseInternalKey(iter_->key(), ikey)) {
status_ = Status::Corruption("corrupted internal key in DBIter");
return false;
} else {

@@ -146,6 +155,7 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
void DBIter::Next() {
assert(valid_);

gPerfCounters->Inc(ePerfIterNext);
if (direction_ == kReverse) {  // Switch directions?
direction_ = kForward;
// iter_ is pointing just before the entries for this->key(),

@@ -161,13 +171,12 @@ void DBIter::Next() {
saved_key_.clear();
return;
}
// saved_key_ already contains the key to skip past.
} else {
// Store in saved_key_ the current key so we skip it below.
SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
}

FindNextUserEntry(true, &saved_key_);
// Temporarily use saved_key_ as storage for key to skip.
std::string* skip = &saved_key_;
SaveKey(ExtractUserKey(iter_->key()), skip);
FindNextUserEntry(true, skip);
}

void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {

@@ -177,6 +186,9 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
do {
ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
if (IsExpiryKey(ikey.type) && NULL!=expiry_
&& expiry_->KeyRetirementCallback(ikey))
ikey.type=kTypeDeletion;
switch (ikey.type) {
case kTypeDeletion:
// Arrange to skip all upcoming entries for this key since

@@ -184,6 +196,9 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
SaveKey(ikey.user_key, skip);
skipping = true;
break;

case kTypeValueWriteTime:
case kTypeValueExplicitExpiry:
case kTypeValue:
if (skipping &&
user_comparator_->Compare(ikey.user_key, *skip) <= 0) {

@@ -205,6 +220,7 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
void DBIter::Prev() {
assert(valid_);

gPerfCounters->Inc(ePerfIterPrev);
if (direction_ == kForward) {  // Switch directions?
// iter_ is pointing at the current entry.  Scan backwards until
// the key changes so we can use the normal reverse scanning code.

@@ -242,6 +258,10 @@ void DBIter::FindPrevUserEntry() {
// We encountered a non-deleted value in entries for previous keys,
break;
}
if (IsExpiryKey(ikey.type) && NULL!=expiry_
&& expiry_->KeyRetirementCallback(ikey))
ikey.type=kTypeDeletion;

value_type = ikey.type;
if (value_type == kTypeDeletion) {
saved_key_.clear();

@@ -272,11 +292,12 @@ void DBIter::FindPrevUserEntry() {
}

void DBIter::Seek(const Slice& target) {
gPerfCounters->Inc(ePerfIterSeek);
direction_ = kForward;
ClearSavedValue();
saved_key_.clear();
AppendInternalKey(
&saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
&saved_key_, ParsedInternalKey(target, 0, sequence_, kValueTypeForSeek));
iter_->Seek(saved_key_);
if (iter_->Valid()) {
FindNextUserEntry(false, &saved_key_ /* temporary storage */);

@@ -286,6 +307,7 @@ void DBIter::Seek(const Slice& target) {
}

void DBIter::SeekToFirst() {
gPerfCounters->Inc(ePerfIterSeekFirst);
direction_ = kForward;
ClearSavedValue();
iter_->SeekToFirst();

@@ -297,6 +319,7 @@ void DBIter::SeekToFirst() {
}

void DBIter::SeekToLast() {
gPerfCounters->Inc(ePerfIterSeekLast);
direction_ = kReverse;
ClearSavedValue();
iter_->SeekToLast();

@@ -306,12 +329,13 @@ void DBIter::SeekToLast() {
}  // anonymous namespace

Iterator* NewDBIterator(
DBImpl* db,
const std::string* dbname,
Env* env,
const Comparator* user_key_comparator,
Iterator* internal_iter,
SequenceNumber sequence,
uint32_t seed) {
return new DBIter(db, user_key_comparator, internal_iter, sequence, seed);
const SequenceNumber& sequence,
const ExpiryModule * expiry) {
return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence, expiry);
}

} // namespace leveldb
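
(Illustrative aside, not part of the diff: the expiry handling added to FindNextUserEntry()/FindPrevUserEntry() above boils down to one check — an expiry-tagged key that the ExpiryModule says is past its lifetime is treated exactly like a tombstone. A minimal C++ sketch of that check, assuming the ExpiryModule, ParsedInternalKey, IsExpiryKey() and kTypeDeletion declarations introduced elsewhere in this change; the helper name is made up for illustration.)

// Sketch only -- mirrors the retirement check used by the iterator above.
static inline void MaybeRetireKey(const ExpiryModule* expiry,
                                  ParsedInternalKey& ikey) {
  // Expired keys are downgraded to deletions so the existing
  // skip-deleted-entries logic hides them from the user.
  if (IsExpiryKey(ikey.type) && NULL != expiry &&
      expiry->KeyRetirementCallback(ikey)) {
    ikey.type = kTypeDeletion;
  }
}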

@@ -7,21 +7,21 @@

#include <stdint.h>
#include "leveldb/db.h"
#include "leveldb/expiry.h"
#include "db/dbformat.h"

namespace leveldb {

class DBImpl;

// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number
// into appropriate user keys.
extern Iterator* NewDBIterator(
DBImpl* db,
const std::string* dbname,
Env* env,
const Comparator* user_key_comparator,
Iterator* internal_iter,
SequenceNumber sequence,
uint32_t seed);
const SequenceNumber& sequence,
const ExpiryModule * expiry=NULL);

} // namespace leveldb

@@ -33,11 +33,8 @@ class AtomicCounter {
public:
AtomicCounter() : count_(0) { }
void Increment() {
IncrementBy(1);
}
void IncrementBy(int count) {
MutexLock l(&mu_);
count_ += count;
count_++;
}
int Read() {
MutexLock l(&mu_);

@@ -48,20 +45,13 @@ class AtomicCounter {
count_ = 0;
}
};

void DelayMilliseconds(int millis) {
Env::Default()->SleepForMicroseconds(millis * 1000);
}
}

// Special Env used to delay background operations
class SpecialEnv : public EnvWrapper {
public:
// sstable/log Sync() calls are blocked while this pointer is non-NULL.
port::AtomicPointer delay_data_sync_;

// sstable/log Sync() calls return an error.
port::AtomicPointer data_sync_error_;
// sstable Sync() calls are blocked while this pointer is non-NULL.
port::AtomicPointer delay_sstable_sync_;

// Simulate no-space errors while this pointer is non-NULL.
port::AtomicPointer no_space_;

@@ -69,37 +59,30 @@ class SpecialEnv : public EnvWrapper {
// Simulate non-writable file system while this pointer is non-NULL
port::AtomicPointer non_writable_;

// Force sync of manifest files to fail while this pointer is non-NULL
port::AtomicPointer manifest_sync_error_;

// Force write to manifest files to fail while this pointer is non-NULL
port::AtomicPointer manifest_write_error_;

bool count_random_reads_;
AtomicCounter random_read_counter_;

AtomicCounter sleep_counter_;

explicit SpecialEnv(Env* base) : EnvWrapper(base) {
delay_data_sync_.Release_Store(NULL);
data_sync_error_.Release_Store(NULL);
delay_sstable_sync_.Release_Store(NULL);
no_space_.Release_Store(NULL);
non_writable_.Release_Store(NULL);
count_random_reads_ = false;
manifest_sync_error_.Release_Store(NULL);
manifest_write_error_.Release_Store(NULL);
}

Status NewWritableFile(const std::string& f, WritableFile** r) {
class DataFile : public WritableFile {
Status NewWritableFile(const std::string& f, WritableFile** r, size_t map_size) {
class SSTableFile : public WritableFile {
private:
SpecialEnv* env_;
WritableFile* base_;

public:
DataFile(SpecialEnv* env, WritableFile* base)
SSTableFile(SpecialEnv* env, WritableFile* base)
: env_(env),
base_(base) {
}
~DataFile() { delete base_; }
~SSTableFile() { delete base_; }
Status Append(const Slice& data) {
if (env_->no_space_.Acquire_Load() != NULL) {
// Drop writes on the floor

@@ -111,51 +94,21 @@ class SpecialEnv : public EnvWrapper {
Status Close() { return base_->Close(); }
Status Flush() { return base_->Flush(); }
Status Sync() {
if (env_->data_sync_error_.Acquire_Load() != NULL) {
return Status::IOError("simulated data sync error");
}
while (env_->delay_data_sync_.Acquire_Load() != NULL) {
DelayMilliseconds(100);
while (env_->delay_sstable_sync_.Acquire_Load() != NULL) {
env_->SleepForMicroseconds(100000);
}
return base_->Sync();
}
};
class ManifestFile : public WritableFile {
private:
SpecialEnv* env_;
WritableFile* base_;
public:
ManifestFile(SpecialEnv* env, WritableFile* b) : env_(env), base_(b) { }
~ManifestFile() { delete base_; }
Status Append(const Slice& data) {
if (env_->manifest_write_error_.Acquire_Load() != NULL) {
return Status::IOError("simulated writer error");
} else {
return base_->Append(data);
}
}
Status Close() { return base_->Close(); }
Status Flush() { return base_->Flush(); }
Status Sync() {
if (env_->manifest_sync_error_.Acquire_Load() != NULL) {
return Status::IOError("simulated sync error");
} else {
return base_->Sync();
}
}
};

if (non_writable_.Acquire_Load() != NULL) {
return Status::IOError("simulated write error");
}

Status s = target()->NewWritableFile(f, r);
Status s = target()->NewWritableFile(f, r, 2<<20);
if (s.ok()) {
if (strstr(f.c_str(), ".ldb") != NULL ||
strstr(f.c_str(), ".log") != NULL) {
*r = new DataFile(this, *r);
} else if (strstr(f.c_str(), "MANIFEST") != NULL) {
*r = new ManifestFile(this, *r);
if (strstr(f.c_str(), ".sst") != NULL) {
*r = new SSTableFile(this, *r);
}
}
return s;

@@ -184,6 +137,11 @@ class SpecialEnv : public EnvWrapper {
}
return s;
}

virtual void SleepForMicroseconds(int micros) {
sleep_counter_.Increment();
target()->SleepForMicroseconds(micros);
}
};

class DBTest {

@@ -193,7 +151,6 @@ class DBTest {
// Sequence of option configurations to try
enum OptionConfig {
kDefault,
kReuse,
kFilter,
kUncompressed,
kEnd

@@ -209,7 +166,7 @@ class DBTest {

DBTest() : option_config_(kDefault),
env_(new SpecialEnv(Env::Default())) {
filter_policy_ = NewBloomFilterPolicy(10);
filter_policy_ = NewBloomFilterPolicy2(16);
dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, Options());
db_ = NULL;

@@ -238,11 +195,7 @@ class DBTest {
// Return the current option configuration.
Options CurrentOptions() {
Options options;
options.reuse_logs = false;
switch (option_config_) {
case kReuse:
options.reuse_logs = true;
break;
case kFilter:
options.filter_policy = filter_policy_;
break;

@@ -290,6 +243,23 @@ class DBTest {
return DB::Open(opts, dbname_, &db_);
}

Status DoubleOpen(Options* options = NULL) {
DB * db_fail;
delete db_;
db_ = NULL;
Options opts, opts2;
if (options != NULL) {
opts = *options;
} else {
opts = CurrentOptions();
opts.create_if_missing = true;
}
last_options_ = opts;

DB::Open(opts, dbname_, &db_);
return DB::Open(opts2, dbname_, &db_fail);
}

Status Put(const std::string& k, const std::string& v) {
return db_->Put(WriteOptions(), k, v);
}

@@ -311,6 +281,20 @@ class DBTest {
return result;
}

std::string GetNoCache(const std::string& k, const Snapshot* snapshot = NULL) {
ReadOptions options;
options.snapshot = snapshot;
options.fill_cache=false;
std::string result;
Status s = db_->Get(options, k, &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}

// Return a string that contains all key,value pairs in order,
// formatted like "(k1->v1)(k2->v2)".
std::string Contents() {

@@ -326,7 +310,7 @@ class DBTest {
}

// Check reverse iteration results are the reverse of forward results
size_t matched = 0;
int matched = 0;
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
ASSERT_LT(matched, forward.size());
ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);

@@ -340,7 +324,7 @@ class DBTest {

std::string AllEntriesFor(const Slice& user_key) {
Iterator* iter = dbfull()->TEST_NewInternalIterator();
InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
InternalKey target(user_key, 0, kMaxSequenceNumber, kTypeValue);
iter->Seek(target.Encode());
std::string result;
if (!iter->status().ok()) {

@@ -361,6 +345,8 @@ class DBTest {
}
first = false;
switch (ikey.type) {
case kTypeValueWriteTime:
case kTypeValueExplicitExpiry:
case kTypeValue:
result += iter->value().ToString();
break;

@@ -474,38 +460,6 @@ class DBTest {
}
return result;
}

bool DeleteAnSSTFile() {
std::vector<std::string> filenames;
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
ASSERT_OK(env_->DeleteFile(TableFileName(dbname_, number)));
return true;
}
}
return false;
}

// Returns number of files renamed.
int RenameLDBToSST() {
std::vector<std::string> filenames;
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
uint64_t number;
FileType type;
int files_renamed = 0;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
const std::string from = TableFileName(dbname_, number);
const std::string to = SSTTableFileName(dbname_, number);
ASSERT_OK(env_->RenameFile(from, to));
files_renamed++;
}
}
return files_renamed;
}
};

TEST(DBTest, Empty) {

@@ -515,6 +469,11 @@ TEST(DBTest, Empty) {
} while (ChangeOptions());
}

TEST(DBTest, DoubleOpen)
{
ASSERT_NOTOK(DoubleOpen());
}

TEST(DBTest, ReadWrite) {
do {
ASSERT_OK(Put("foo", "v1"));

@@ -547,11 +506,11 @@ TEST(DBTest, GetFromImmutableLayer) {
ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo"));

env_->delay_data_sync_.Release_Store(env_);      // Block sync calls
env_->delay_sstable_sync_.Release_Store(env_);   // Block sync calls
Put("k1", std::string(100000, 'x'));             // Fill memtable
Put("k2", std::string(100000, 'y'));             // Trigger compaction
ASSERT_EQ("v1", Get("foo"));
env_->delay_data_sync_.Release_Store(NULL);      // Release sync calls
env_->delay_sstable_sync_.Release_Store(NULL);   // Release sync calls
} while (ChangeOptions());
}

@@ -563,17 +522,6 @@ TEST(DBTest, GetFromVersions) {
} while (ChangeOptions());
}

TEST(DBTest, GetMemUsage) {
do {
ASSERT_OK(Put("foo", "v1"));
std::string val;
ASSERT_TRUE(db_->GetProperty("leveldb.approximate-memory-usage", &val));
int mem_usage = atoi(val.c_str());
ASSERT_GT(mem_usage, 0);
ASSERT_LT(mem_usage, 5*1024*1024);
} while (ChangeOptions());
}

TEST(DBTest, GetSnapshot) {
do {
// Try with both a short key and a long key

@@ -634,6 +582,9 @@ TEST(DBTest, GetPicksCorrectFile) {
} while (ChangeOptions());
}

#if 0
// riak does not execute compaction due to reads

TEST(DBTest, GetEncountersEmptyLevel) {
do {
// Arrange for the following to happen:

@@ -642,7 +593,7 @@ TEST(DBTest, GetEncountersEmptyLevel) {
//   * sstable B in level 2
// Then do enough Get() calls to arrange for an automatic compaction
// of sstable A.  A bug would cause the compaction to be marked as
// occurring at level 1 (instead of the correct level 0).
// occuring at level 1 (instead of the correct level 0).

// Step 1: First place sstables in levels 0 and 2
int compaction_count = 0;

@@ -667,11 +618,12 @@ TEST(DBTest, GetEncountersEmptyLevel) {
}

// Step 4: Wait for compaction to finish
DelayMilliseconds(1000);
env_->SleepForMicroseconds(1000000);

ASSERT_EQ(NumTableFilesAtLevel(0), 0);
} while (ChangeOptions());
}
#endif

TEST(DBTest, IterEmpty) {
Iterator* iter = db_->NewIterator(ReadOptions());

@@ -996,7 +948,8 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
dbfull()->TEST_CompactRange(0, NULL, NULL);

ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GT(NumTableFilesAtLevel(1), 1);
// not riak  ASSERT_GT(NumTableFilesAtLevel(1), 1);
ASSERT_EQ(NumTableFilesAtLevel(1), 1);   // yes riak
for (int i = 0; i < 80; i++) {
ASSERT_EQ(Get(Key(i)), values[i]);
}

@@ -1010,7 +963,8 @@ TEST(DBTest, RepeatedWritesToSameKey) {

// We must have at most one file per level except for level-0,
// which may have up to kL0_StopWritesTrigger files.
const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger;
// ... basho adds *2 since level-1 is now overlapped too
const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger*2;

Random rnd(301);
std::string value = RandomString(&rnd, 2 * options.write_buffer_size);

@@ -1054,11 +1008,13 @@ TEST(DBTest, SparseMerge) {

// Compactions should not cause us to create a situation where
// a file overlaps too much data at the next level.
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
// 07/10/14 matthewv - we overlap first two levels. sparse test not appropriate there,
//  and we set overlaps into 100s of megabytes as "normal"
// ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
// ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
dbfull()->TEST_CompactRange(1, NULL, NULL);
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
// ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
}

static bool Between(uint64_t val, uint64_t low, uint64_t high) {

@@ -1096,14 +1052,6 @@ TEST(DBTest, ApproximateSizes) {
// 0 because GetApproximateSizes() does not account for memtable space
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));

if (options.reuse_logs) {
// Recovery will reuse memtable, and GetApproximateSizes() does not
// account for memtable usage;
Reopen(&options);
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
continue;
}

// Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) {
Reopen(&options);

@@ -1147,11 +1095,6 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));

if (options.reuse_logs) {
// Need to force a memtable compaction since recovery does not do so.
ASSERT_OK(dbfull()->TEST_CompactMemTable());
}

// Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) {
Reopen(&options);

@@ -1223,7 +1166,7 @@ TEST(DBTest, Snapshot) {
ASSERT_EQ("v4", Get("foo"));
} while (ChangeOptions());
}

#if 0 // trouble under Riak due to assumed file sizes
TEST(DBTest, HiddenValuesAreRemoved) {
do {
Random rnd(301);

@@ -1254,7 +1197,7 @@ TEST(DBTest, HiddenValuesAreRemoved) {
ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
} while (ChangeOptions());
}

#endif
TEST(DBTest, DeletionMarkers1) {
Put("foo", "v1");
ASSERT_OK(dbfull()->TEST_CompactMemTable());

@@ -1271,13 +1214,14 @@ TEST(DBTest, DeletionMarkers1) {
Delete("foo");
Put("foo", "v2");
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level last-2
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable());  // stays at level 0
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");  // riak 1.3, DEL merged out by BuildTable
Slice z("z");
dbfull()->TEST_CompactRange(last-2, NULL, &z);
dbfull()->TEST_CompactRange(0, NULL, &z);
dbfull()->TEST_CompactRange(1, NULL, &z);
// DEL eliminated, but v1 remains because we aren't compacting that level
// (DEL can be eliminated because v2 hides v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");  // Riak 1.4 has merged to level 1
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
// Merging last-1 w/ last, so we are the base level for "foo", so
// DEL is removed.  (as is v1).

@@ -1289,39 +1233,47 @@ TEST(DBTest, DeletionMarkers2) {
ASSERT_OK(dbfull()->TEST_CompactMemTable());
const int last = config::kMaxMemCompactLevel;
ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo => v1 is now in last level
dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo => v1 is now in last level
ASSERT_EQ(NumTableFilesAtLevel(last-1), 0);

// Place a table at level last-1 to prevent merging with preceding mutation
Put("a", "begin");
Put("z", "end");
dbfull()->TEST_CompactMemTable();
ASSERT_EQ(NumTableFilesAtLevel(last), 1);
dbfull()->TEST_CompactMemTable();   // goes to last-1
ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);

Delete("foo");
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level last-2
ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level 0
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(last-2, NULL, NULL);
dbfull()->TEST_CompactRange(0, NULL, NULL);   // Riak overlaps level 1
// DEL kept: "last" file overlaps
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
// Merging last-1 w/ last, so we are the base level for "foo", so
// DEL is removed.  (as is v1).
dbfull()->TEST_CompactRange(1, NULL, NULL);
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");

dbfull()->TEST_CompactRange(2, NULL, NULL);
ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
}

TEST(DBTest, OverlapInLevel0) {
do {
ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
ASSERT_EQ(config::kMaxMemCompactLevel, 3) << "Fix test to match config";

// Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
ASSERT_OK(Put("100", "v100"));
ASSERT_OK(Put("999", "v999"));
dbfull()->TEST_CompactMemTable();
dbfull()->TEST_CompactRange(0, NULL, NULL);
dbfull()->TEST_CompactRange(1, NULL, NULL);
ASSERT_OK(Delete("100"));
ASSERT_OK(Delete("999"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("0,1,1", FilesPerLevel());
dbfull()->TEST_CompactRange(0, NULL, NULL);
ASSERT_EQ("0,0,1,1", FilesPerLevel());

// Make files spanning the following ranges in level-0:
//  files[0]  200 .. 900

@@ -1334,7 +1286,7 @@ TEST(DBTest, OverlapInLevel0) {
ASSERT_OK(Put("600", "v600"));
ASSERT_OK(Put("900", "v900"));
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("2,1,1", FilesPerLevel());
ASSERT_EQ("2,0,1,1", FilesPerLevel());

// Compact away the placeholder files we created initially
dbfull()->TEST_CompactRange(1, NULL, NULL);

@@ -1364,7 +1316,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_a) {
Reopen();
Reopen();
ASSERT_EQ("(a->v)", Contents());
DelayMilliseconds(1000);  // Wait for compaction to finish
env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
ASSERT_EQ("(a->v)", Contents());
}

@@ -1380,7 +1332,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
Put("","");
Reopen();
Put("","");
DelayMilliseconds(1000);  // Wait for compaction to finish
env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
Reopen();
Put("d","dv");
Reopen();

@@ -1390,7 +1342,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
Delete("b");
Reopen();
ASSERT_EQ("(->)(c->cv)", Contents());
DelayMilliseconds(1000);  // Wait for compaction to finish
env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
ASSERT_EQ("(->)(c->cv)", Contents());
}

@@ -1473,37 +1425,37 @@ TEST(DBTest, CustomComparator) {
}

TEST(DBTest, ManualCompaction) {
ASSERT_EQ(config::kMaxMemCompactLevel, 2)
ASSERT_EQ(config::kMaxMemCompactLevel, 3)
<< "Need to update this test to match kMaxMemCompactLevel";

MakeTables(3, "p", "q");
ASSERT_EQ("1,1,1", FilesPerLevel());
ASSERT_EQ("1,0,1,1", FilesPerLevel());

// Compaction range falls before files
Compact("", "c");
ASSERT_EQ("1,1,1", FilesPerLevel());
ASSERT_EQ("0,1,1,1", FilesPerLevel());

// Compaction range falls after files
Compact("r", "z");
ASSERT_EQ("1,1,1", FilesPerLevel());
ASSERT_EQ("0,1,1,1", FilesPerLevel());

// Compaction range overlaps files
Compact("p1", "p9");
ASSERT_EQ("0,0,1", FilesPerLevel());
ASSERT_EQ("0,0,0,1", FilesPerLevel());

// Populate a different range
MakeTables(3, "c", "e");
ASSERT_EQ("1,1,2", FilesPerLevel());
ASSERT_EQ("1,0,1,2", FilesPerLevel());

// Compact just the new range
Compact("b", "f");
ASSERT_EQ("0,0,2", FilesPerLevel());
ASSERT_EQ("0,0,0,2", FilesPerLevel());

// Compact all
MakeTables(1, "a", "z");
ASSERT_EQ("0,1,2", FilesPerLevel());
ASSERT_EQ("0,0,1,2", FilesPerLevel());
db_->CompactRange(NULL, NULL);
ASSERT_EQ("0,0,1", FilesPerLevel());
ASSERT_EQ("0,0,0,1", FilesPerLevel());
}

TEST(DBTest, DBOpen_Options) {

@@ -1545,12 +1497,6 @@ TEST(DBTest, DBOpen_Options) {
db = NULL;
}

TEST(DBTest, Locking) {
DB* db2 = NULL;
Status s = DB::Open(CurrentOptions(), dbname_, &db2);
ASSERT_TRUE(!s.ok()) << "Locking did not prevent re-opening db";
}

// Check that number of files does not grow when we are out of space
TEST(DBTest, NoSpace) {
Options options = CurrentOptions();

@@ -1562,15 +1508,19 @@ TEST(DBTest, NoSpace) {
Compact("a", "z");
const int num_files = CountFiles();
env_->no_space_.Release_Store(env_);   // Force out-of-space errors
for (int i = 0; i < 10; i++) {
env_->sleep_counter_.Reset();
for (int i = 0; i < 5; i++) {
for (int level = 0; level < config::kNumLevels-1; level++) {
dbfull()->TEST_CompactRange(level, NULL, NULL);
}
}
env_->no_space_.Release_Store(NULL);
ASSERT_LT(CountFiles(), num_files + 3);
}

// Check that compaction attempts slept after errors
ASSERT_GE(env_->sleep_counter_.Read(), 5);
}
#if 0
TEST(DBTest, NonWritableFileSystem) {
Options options = CurrentOptions();
options.write_buffer_size = 1000;

@@ -1584,119 +1534,13 @@ TEST(DBTest, NonWritableFileSystem) {
fprintf(stderr, "iter %d; errors %d\n", i, errors);
if (!Put("foo", big).ok()) {
errors++;
DelayMilliseconds(100);
env_->SleepForMicroseconds(100000);
}
}
ASSERT_GT(errors, 0);
env_->non_writable_.Release_Store(NULL);
}

TEST(DBTest, WriteSyncError) {
// Check that log sync errors cause the DB to disallow future writes.

// (a) Cause log sync calls to fail
Options options = CurrentOptions();
options.env = env_;
Reopen(&options);
env_->data_sync_error_.Release_Store(env_);

// (b) Normal write should succeed
WriteOptions w;
ASSERT_OK(db_->Put(w, "k1", "v1"));
ASSERT_EQ("v1", Get("k1"));

// (c) Do a sync write; should fail
w.sync = true;
ASSERT_TRUE(!db_->Put(w, "k2", "v2").ok());
ASSERT_EQ("v1", Get("k1"));
ASSERT_EQ("NOT_FOUND", Get("k2"));

// (d) make sync behave normally
env_->data_sync_error_.Release_Store(NULL);

// (e) Do a non-sync write; should fail
w.sync = false;
ASSERT_TRUE(!db_->Put(w, "k3", "v3").ok());
ASSERT_EQ("v1", Get("k1"));
ASSERT_EQ("NOT_FOUND", Get("k2"));
ASSERT_EQ("NOT_FOUND", Get("k3"));
}

TEST(DBTest, ManifestWriteError) {
// Test for the following problem:
// (a) Compaction produces file F
// (b) Log record containing F is written to MANIFEST file, but Sync() fails
// (c) GC deletes F
// (d) After reopening DB, reads fail since deleted F is named in log record

// We iterate twice.  In the second iteration, everything is the
// same except the log record never makes it to the MANIFEST file.
for (int iter = 0; iter < 2; iter++) {
port::AtomicPointer* error_type = (iter == 0)
? &env_->manifest_sync_error_
: &env_->manifest_write_error_;

// Insert foo=>bar mapping
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
options.error_if_exists = false;
DestroyAndReopen(&options);
ASSERT_OK(Put("foo", "bar"));
ASSERT_EQ("bar", Get("foo"));

// Memtable compaction (will succeed)
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("bar", Get("foo"));
const int last = config::kMaxMemCompactLevel;
ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo=>bar is now in last level

// Merging compaction (will fail)
error_type->Release_Store(env_);
dbfull()->TEST_CompactRange(last, NULL, NULL);  // Should fail
ASSERT_EQ("bar", Get("foo"));

// Recovery: should not lose data
error_type->Release_Store(NULL);
Reopen(&options);
ASSERT_EQ("bar", Get("foo"));
}
}

TEST(DBTest, MissingSSTFile) {
ASSERT_OK(Put("foo", "bar"));
ASSERT_EQ("bar", Get("foo"));

// Dump the memtable to disk.
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("bar", Get("foo"));

Close();
ASSERT_TRUE(DeleteAnSSTFile());
Options options = CurrentOptions();
options.paranoid_checks = true;
Status s = TryReopen(&options);
ASSERT_TRUE(!s.ok());
ASSERT_TRUE(s.ToString().find("issing") != std::string::npos)
<< s.ToString();
}

TEST(DBTest, StillReadSST) {
ASSERT_OK(Put("foo", "bar"));
ASSERT_EQ("bar", Get("foo"));

// Dump the memtable to disk.
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("bar", Get("foo"));
Close();
ASSERT_GT(RenameLDBToSST(), 0);
Options options = CurrentOptions();
options.paranoid_checks = true;
Status s = TryReopen(&options);
ASSERT_TRUE(s.ok());
ASSERT_EQ("bar", Get("foo"));
}

#endif
TEST(DBTest, FilesDeletedAfterCompaction) {
ASSERT_OK(Put("foo", "v2"));
Compact("a", "z");

@@ -1713,7 +1557,7 @@ TEST(DBTest, BloomFilter) {
Options options = CurrentOptions();
options.env = env_;
options.block_cache = NewLRUCache(0);  // Prevent cache hits
options.filter_policy = NewBloomFilterPolicy(10);
options.filter_policy = NewBloomFilterPolicy2(16);
Reopen(&options);

// Populate multiple layers

@@ -1728,12 +1572,12 @@ TEST(DBTest, BloomFilter) {
dbfull()->TEST_CompactMemTable();

// Prevent auto compactions triggered by seeks
env_->delay_data_sync_.Release_Store(env_);
env_->delay_sstable_sync_.Release_Store(env_);

// Lookup present keys.  Should rarely read from small sstable.
env_->random_read_counter_.Reset();
for (int i = 0; i < N; i++) {
ASSERT_EQ(Key(i), Get(Key(i)));
ASSERT_EQ(Key(i), GetNoCache(Key(i)));
}
int reads = env_->random_read_counter_.Read();
fprintf(stderr, "%d present => %d reads\n", N, reads);

@@ -1743,13 +1587,13 @@ TEST(DBTest, BloomFilter) {
// Lookup present keys.  Should rarely read from either sstable.
env_->random_read_counter_.Reset();
for (int i = 0; i < N; i++) {
ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing"));
ASSERT_EQ("NOT_FOUND", GetNoCache(Key(i) + ".missing"));
}
reads = env_->random_read_counter_.Read();
fprintf(stderr, "%d missing => %d reads\n", N, reads);
ASSERT_LE(reads, 3*N/100);

env_->delay_data_sync_.Release_Store(NULL);
env_->delay_sstable_sync_.Release_Store(NULL);
Close();
delete options.block_cache;
delete options.filter_policy;

@@ -1809,7 +1653,7 @@ static void MTThreadBody(void* arg) {
ASSERT_EQ(k, key);
ASSERT_GE(w, 0);
ASSERT_LT(w, kNumThreads);
ASSERT_LE(static_cast<uintptr_t>(c), reinterpret_cast<uintptr_t>(
ASSERT_LE(c, reinterpret_cast<uintptr_t>(
t->state->counter[w].Acquire_Load()));
}
}

@@ -1834,27 +1678,35 @@ TEST(DBTest, MultiThreaded) {

// Start threads
MTThread thread[kNumThreads];
pthread_t tid;
for (int id = 0; id < kNumThreads; id++) {
thread[id].state = &mt;
thread[id].id = id;
env_->StartThread(MTThreadBody, &thread[id]);
tid=env_->StartThread(MTThreadBody, &thread[id]);
pthread_detach(tid);
}

// Let them run for a while
DelayMilliseconds(kTestSeconds * 1000);
env_->SleepForMicroseconds(kTestSeconds * 1000000);

// Stop the threads and wait for them to finish
mt.stop.Release_Store(&mt);
for (int id = 0; id < kNumThreads; id++) {
while (mt.thread_done[id].Acquire_Load() == NULL) {
DelayMilliseconds(100);
env_->SleepForMicroseconds(100000);
}
}
} while (ChangeOptions());
}

namespace {
typedef std::map<std::string, std::string> KVMap;
struct KVEntry
{
std::string m_Value;
KeyMetaData m_Meta;
};

typedef std::map<std::string, KVEntry> KVMap;
}

class ModelDB: public DB {

@@ -1866,14 +1718,21 @@ class ModelDB: public DB {

explicit ModelDB(const Options& options): options_(options) { }
~ModelDB() { }
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
return DB::Put(o, k, v);
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v, const KeyMetaData * meta=NULL) {
return DB::Put(o, k, v, meta);
}
virtual Status Delete(const WriteOptions& o, const Slice& key) {
return DB::Delete(o, key);
}
virtual Status Get(const ReadOptions& options,
const Slice& key, std::string* value) {
const Slice& key, std::string* value,
KeyMetaData * meta = NULL) {
assert(false);      // Not implemented
return Status::NotFound(key);
}
virtual Status Get(const ReadOptions& options,
const Slice& key, Value* value,
KeyMetaData * meta = NULL) {
assert(false);      // Not implemented
return Status::NotFound(key);
}

@@ -1901,8 +1760,13 @@ class ModelDB: public DB {
class Handler : public WriteBatch::Handler {
public:
KVMap* map_;
virtual void Put(const Slice& key, const Slice& value) {
(*map_)[key.ToString()] = value.ToString();
virtual void Put(const Slice& key, const Slice& value,
const ValueType & type, const ExpiryTimeMicros & expiry) {
KVEntry ent;
ent.m_Value=value.ToString();
ent.m_Meta.m_Type=type;
ent.m_Meta.m_Expiry=expiry;
(*map_)[key.ToString()] = ent;
}
virtual void Delete(const Slice& key) {
map_->erase(key.ToString());

@@ -1948,7 +1812,7 @@ class ModelDB: public DB {
virtual void Next() { ++iter_; }
virtual void Prev() { --iter_; }
virtual Slice key() const { return iter_->first; }
virtual Slice value() const { return iter_->second; }
virtual Slice value() const { return iter_->second.m_Value; }
virtual Status status() const { return Status::OK(); }
private:
const KVMap* const map_;

@@ -2085,6 +1949,44 @@ TEST(DBTest, Randomized) {
} while (ChangeOptions());
}

class SimpleBugs
{
// need a class for the test harness
};

TEST(SimpleBugs, TieredRecoveryLog)
{
// DB::Open created first recovery log directly
//  which lead to it NOT being in tiered storage location.
// nope  std::string dbname = test::TmpDir() + "/leveldb_nontiered";
std::string dbname = "leveldb";
std::string fastname = test::TmpDir() + "/leveldb_fast";
std::string slowname = test::TmpDir() + "/leveldb_slow";
std::string combined;

DB* db = NULL;
Options opts;

opts.tiered_slow_level = 4;
opts.tiered_fast_prefix = fastname;
opts.tiered_slow_prefix = slowname;
opts.create_if_missing = true;

Env::Default()->CreateDir(fastname);
Env::Default()->CreateDir(slowname);

Status s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != NULL);

delete db;
DestroyDB(dbname, opts);

}   // TieredRecoveryLog

std::string MakeKey(unsigned int num) {
char buf[30];
snprintf(buf, sizeof(buf), "%016u", num);

@@ -2113,14 +2015,13 @@ void BM_LogAndApply(int iters, int num_base_files) {
InternalKeyComparator cmp(BytewiseComparator());
Options options;
VersionSet vset(dbname, &options, NULL, &cmp);
bool save_manifest;
ASSERT_OK(vset.Recover(&save_manifest));
ASSERT_OK(vset.Recover());
VersionEdit vbase;
uint64_t fnum = 1;
for (int i = 0; i < num_base_files; i++) {
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion);
vbase.AddFile2(2, fnum++, 1 /* file size */, start, limit, 0,0,0);
}
ASSERT_OK(vset.LogAndApply(&vbase, &mu));

@@ -2129,9 +2030,9 @@ void BM_LogAndApply(int iters, int num_base_files) {
for (int i = 0; i < iters; i++) {
VersionEdit vedit;
vedit.DeleteFile(2, fnum);
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion);
vedit.AddFile2(2, fnum++, 1 /* file size */, start, limit, 0,0,0);
vset.LogAndApply(&vedit, &mu);
}
uint64_t stop_micros = env->NowMicros();

@@ -3,7 +3,9 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <stdio.h>
//#include "leveldb/expiry.h"
#include "db/dbformat.h"
#include "db/version_set.h"
#include "port/port.h"
#include "util/coding.h"

@@ -11,26 +13,66 @@ namespace leveldb {

static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
assert(seq <= kMaxSequenceNumber);
assert(t <= kValueTypeForSeek);
//  assert(t <= kValueTypeForSeek);  requires revisit once expiry live
assert(t <= kTypeValueExplicitExpiry); // temp replacement for above
return (seq << 8) | t;
}

void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
result->append(key.user_key.data(), key.user_key.size());
if (IsExpiryKey(key.type))
PutFixed64(result, key.expiry);
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}

std::string ParsedInternalKey::DebugString() const {
char buf[50];
if (IsExpiryKey(type))
snprintf(buf, sizeof(buf), "' @ %llu %llu : %d",
(unsigned long long) expiry,
(unsigned long long) sequence,
int(type));
else
snprintf(buf, sizeof(buf), "' @ %llu : %d",
(unsigned long long) sequence,
int(type));
std::string result = "'";
result += EscapeString(user_key.ToString());
result += HexString(user_key.ToString());
result += buf;
return result;
}

std::string ParsedInternalKey::DebugStringHex() const {
char buf[50];
if (IsExpiryKey(type))
snprintf(buf, sizeof(buf), "' @ %llu %llu : %d",
(unsigned long long) expiry,
(unsigned long long) sequence,
int(type));
else
snprintf(buf, sizeof(buf), "' @ %llu : %d",
(unsigned long long) sequence,
int(type));
std::string result = "'";
result += HexString(user_key);
result += buf;
return result;
}

const char * KeyTypeString(ValueType val_type) {
const char * ret_ptr;
switch(val_type)
{
case kTypeDeletion: ret_ptr="kTypeDelete"; break;
case kTypeValue: ret_ptr="kTypeValue"; break;
case kTypeValueWriteTime: ret_ptr="kTypeValueWriteTime"; break;
case kTypeValueExplicitExpiry: ret_ptr="kTypeValueExplicitExpiry"; break;
default: ret_ptr="(unknown ValueType)"; break;
}   // switch
return(ret_ptr);
}

std::string InternalKey::DebugString() const {
std::string result;
ParsedInternalKey parsed;

@@ -54,8 +96,10 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
if (r == 0) {
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
if (IsExpiryKey((ValueType)*(unsigned char *)&anum)) *(unsigned char*)&anum=(unsigned char)kTypeValue;
if (IsExpiryKey((ValueType)*(unsigned char *)&bnum)) *(unsigned char*)&bnum=(unsigned char)kTypeValue;
if (anum > bnum) {
r = -1;
} else if (anum < bnum) {

@@ -118,7 +162,8 @@ bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
}

LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s, KeyMetaData * meta) {
meta_=meta;
size_t usize = user_key.size();
size_t needed = usize + 13;  // A conservative estimate
char* dst;

@@ -137,4 +182,109 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
end_ = dst;
}

KeyRetirement::KeyRetirement(
const Comparator * Comparator,
SequenceNumber SmallestSnapshot,
const Options * Opts,
Compaction * const Compaction)
: has_current_user_key(false), last_sequence_for_key(kMaxSequenceNumber),
user_comparator(Comparator), smallest_snapshot(SmallestSnapshot),
options(Opts), compaction(Compaction),
valid(false), dropped(0), expired(0)
{
// NULL is ok for compaction
valid=(NULL!=user_comparator);

return;
}   // KeyRetirement::KeyRetirement

KeyRetirement::~KeyRetirement()
{
if (0!=expired)
gPerfCounters->Add(ePerfExpiredKeys, expired);
}   // KeyRetirement::~KeyRetirement

bool
KeyRetirement::operator()(
Slice & key)
{
ParsedInternalKey ikey;
bool drop = false, expire_flag;

if (valid)
{
if (!ParseInternalKey(key, &ikey))
{
// Do not hide error keys
current_user_key.clear();
has_current_user_key = false;
last_sequence_for_key = kMaxSequenceNumber;
}   // else
else
{
if (!has_current_user_key ||
user_comparator->Compare(ikey.user_key,
Slice(current_user_key)) != 0)
{
// First occurrence of this user key
current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
has_current_user_key = true;
last_sequence_for_key = kMaxSequenceNumber;
}   // if

if (last_sequence_for_key <= smallest_snapshot)
{
// Hidden by an newer entry for same user key
drop = true;    // (A)
}   // if

else
{
expire_flag=false;
if (NULL!=options && options->ExpiryActivated())
expire_flag=options->expiry_module->KeyRetirementCallback(ikey);

if ((ikey.type == kTypeDeletion || expire_flag)
&& ikey.sequence <= smallest_snapshot
&& NULL!=compaction // mem to level0 ignores this test
&& compaction->IsBaseLevelForKey(ikey.user_key))
{
// For this user key:
// (1) there is no data in higher levels
// (2) data in lower levels will have larger sequence numbers
// (3) data in layers that are being compacted here and have
//     smaller sequence numbers will be dropped in the next
//     few iterations of this loop (by rule (A) above).
// Therefore this deletion marker is obsolete and can be dropped.
drop = true;

if (expire_flag)
++expired;
else
++dropped;
}   // if
}   // else

last_sequence_for_key = ikey.sequence;
}   // else
}   // if

#if 0
// needs clean up to be used again
Log(options_.info_log,
"  Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
"%d smallest_snapshot: %d",
ikey.user_key.ToString().c_str(),
(int)ikey.sequence, ikey.type, kTypeValue, drop,
compact->compaction->IsBaseLevelForKey(ikey.user_key),
(int)last_sequence_for_key, (int)compact->smallest_snapshot);
#endif
return(drop);

}   // KeyRetirement::operator(Slice & )

} // namespace leveldb
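
(Illustrative aside, not part of the diff: AppendInternalKey() above defines the internal-key layout this change relies on — the user key bytes, an optional fixed64 expiry field for the expiry value types, then a fixed64 packing (sequence << 8) | type. A minimal C++ sketch of that encoding, reusing only names already referenced in this file (Slice, PutFixed64, IsExpiryKey, ExpiryTimeMicros, SequenceNumber, ValueType); the function name itself is made up for illustration.)

// Sketch only -- builds the key layout implied by AppendInternalKey().
std::string MakeInternalKeySketch(const Slice& user_key,
                                  ExpiryTimeMicros expiry,
                                  SequenceNumber seq,
                                  ValueType type) {
  std::string result;
  result.append(user_key.data(), user_key.size());
  if (IsExpiryKey(type)) {
    PutFixed64(&result, expiry);            // extra 8-byte expiry field
  }
  PutFixed64(&result, (seq << 8) | type);   // packed sequence + type
  return result;                            // suffix is 8 or 16 bytes
}

KeySuffixSize() in the header change below returns sizeof(SequenceNumber) or sizeof(SequenceNumber)+sizeof(ExpiryTimeMicros) to match this layout.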
|
||||
|
|
|
@ -2,13 +2,14 @@
|
|||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_DBFORMAT_H_
|
||||
#define STORAGE_LEVELDB_DB_DBFORMAT_H_
|
||||
#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
|
||||
#define STORAGE_LEVELDB_DB_FORMAT_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include "leveldb/comparator.h"
|
||||
#include "leveldb/db.h"
|
||||
#include "leveldb/filter_policy.h"
|
||||
#include "leveldb/options.h"
|
||||
#include "leveldb/slice.h"
|
||||
#include "leveldb/table_builder.h"
|
||||
#include "util/coding.h"
|
||||
|
@ -16,19 +17,33 @@
|
|||
|
||||
namespace leveldb {
|
||||
|
||||
class Compaction;
|
||||
|
||||
// Grouping of constants. We may want to make some of these
|
||||
// parameters set via options.
|
||||
namespace config {
|
||||
static const int kNumLevels = 7;
|
||||
static const int kNumOverlapLevels = 2;
|
||||
|
||||
// Level-0 compaction is started when we hit this many files.
|
||||
static const int kL0_CompactionTrigger = 4;
|
||||
// Google: static const size_t kL0_CompactionTrigger = 4;
|
||||
static const size_t kL0_CompactionTrigger = 6;
|
||||
|
||||
// Level-0 (any overlapped level) number of files where a grooming
|
||||
// compaction could start
|
||||
static const size_t kL0_GroomingTrigger = 4;
|
||||
static const size_t kL0_GroomingTrigger10min = 2;
|
||||
static const size_t kL0_GroomingTrigger20min = 1;
|
||||
|
||||
// ... time limits in microseconds
|
||||
static const size_t kL0_Grooming10minMicros = 10 * 60 * 1000000;
|
||||
static const size_t kL0_Grooming20minMicros = 20 * 60 * 1000000;
|
||||
|
||||
// Soft limit on number of level-0 files. We slow down writes at this point.
|
||||
static const int kL0_SlowdownWritesTrigger = 8;
|
||||
static const size_t kL0_SlowdownWritesTrigger = 8;
|
||||
|
||||
// Maximum number of level-0 files. We stop writes at this point.
|
||||
static const int kL0_StopWritesTrigger = 12;
|
||||
static const size_t kL0_StopWritesTrigger = 12;
|
||||
|
||||
// Maximum level to which a new compacted memtable is pushed if it
|
||||
// does not create overlap. We try to push to level 2 to avoid the
|
||||
|
@ -36,31 +51,28 @@ static const int kL0_StopWritesTrigger = 12;
|
|||
// expensive manifest file operations. We do not push all the way to
|
||||
// the largest level since that can generate a lot of wasted disk
|
||||
// space if the same key space is being repeatedly overwritten.
|
||||
static const int kMaxMemCompactLevel = 2;
|
||||
|
||||
// Approximate gap in bytes between samples of data read during iteration.
|
||||
static const int kReadBytesPeriod = 1048576;
|
||||
// Basho: push to kNumOverlapLevels +1 ... beyond "landing level"
|
||||
static const unsigned kMaxMemCompactLevel = kNumOverlapLevels+1;
|
||||
|
||||
} // namespace config
|
||||
|
||||
class InternalKey;
|
||||
|
||||
// Value types encoded as the last component of internal keys.
|
||||
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
|
||||
// data structures.
|
||||
enum ValueType {
|
||||
kTypeDeletion = 0x0,
|
||||
kTypeValue = 0x1
|
||||
};
|
||||
// kValueTypeForSeek defines the ValueType that should be passed when
|
||||
// constructing a ParsedInternalKey object for seeking to a particular
|
||||
// sequence number (since we sort sequence numbers in decreasing order
|
||||
// and the value type is embedded as the low 8 bits in the sequence
|
||||
// number in internal keys, we need to use the highest-numbered
|
||||
// ValueType, not the lowest).
|
||||
// Riak note: kValueTypeForSeek is placed within temporary keys
|
||||
// for comparisons. Using kTypeValueExplicitExpiry would
|
||||
// force more code changes to increase internal key size.
|
||||
// But ValueTypeForSeek is redundant to sequence number for
|
||||
// disambiguaty. Therefore going for easiest path and NOT changing.
|
||||
static const ValueType kValueTypeForSeek = kTypeValue;
|
||||
|
||||
typedef uint64_t SequenceNumber;
|
||||
typedef uint64_t ExpiryTimeMicros;
|
||||
|
||||
// We leave eight bits empty at the bottom so a type and sequence#
|
||||
// can be packed together into 64-bits.
|
||||
|
@ -69,20 +81,17 @@ static const SequenceNumber kMaxSequenceNumber =
|
|||
|
||||
struct ParsedInternalKey {
|
||||
Slice user_key;
|
||||
ExpiryTimeMicros expiry;
|
||||
SequenceNumber sequence;
|
||||
ValueType type;
|
||||
|
||||
ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
|
||||
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
|
||||
: user_key(u), sequence(seq), type(t) { }
|
||||
ParsedInternalKey(const Slice& u, const ExpiryTimeMicros & exp, const SequenceNumber& seq, ValueType t)
|
||||
: user_key(u), expiry(exp), sequence(seq), type(t) { }
|
||||
std::string DebugString() const;
|
||||
std::string DebugStringHex() const;
|
||||
};
|
||||
|
||||
// Return the length of the encoding of "key".
|
||||
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
|
||||
return key.user_key.size() + 8;
|
||||
}
|
||||
|
||||
// Append the serialization of "key" to *result.
|
||||
extern void AppendInternalKey(std::string* result,
|
||||
const ParsedInternalKey& key);
|
||||
|
@ -94,20 +103,76 @@ extern void AppendInternalKey(std::string* result,
|
|||
extern bool ParseInternalKey(const Slice& internal_key,
|
||||
ParsedInternalKey* result);
|
||||
|
||||
// Returns the user key portion of an internal key.
|
||||
inline Slice ExtractUserKey(const Slice& internal_key) {
|
||||
assert(internal_key.size() >= 8);
|
||||
return Slice(internal_key.data(), internal_key.size() - 8);
|
||||
}
|
||||
|
||||
inline ValueType ExtractValueType(const Slice& internal_key) {
|
||||
assert(internal_key.size() >= 8);
|
||||
const size_t n = internal_key.size();
|
||||
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
|
||||
unsigned char c = num & 0xff;
|
||||
unsigned char c = DecodeLeastFixed64(internal_key.data() + n - sizeof(SequenceNumber));
|
||||
return static_cast<ValueType>(c);
|
||||
}
|
||||
|
||||
inline size_t KeySuffixSize(ValueType val_type) {
|
||||
size_t ret_val;
|
||||
switch(val_type)
|
||||
{
|
||||
case kTypeDeletion:
|
||||
case kTypeValue:
|
||||
ret_val=sizeof(SequenceNumber);
|
||||
break;
|
||||
|
||||
case kTypeValueWriteTime:
|
||||
case kTypeValueExplicitExpiry:
|
||||
ret_val=sizeof(SequenceNumber) + sizeof(ExpiryTimeMicros);
|
||||
break;
|
||||
|
||||
default:
|
||||
// assert(0); cannot use because bloom filter block's name is passed as internal key
|
||||
ret_val=sizeof(SequenceNumber);
|
||||
break;
|
||||
} // switch
|
||||
return(ret_val);
|
||||
}
|
||||
|
||||
const char * KeyTypeString(ValueType val_type);
|
||||
|
||||
inline size_t KeySuffixSize(const Slice & internal_key) {
|
||||
return(KeySuffixSize(ExtractValueType(internal_key)));
|
||||
}
|
||||
|
||||
// Returns the user key portion of an internal key.
|
||||
inline Slice ExtractUserKey(const Slice& internal_key) {
|
||||
assert(internal_key.size() >= 8);
|
||||
return Slice(internal_key.data(), internal_key.size() - KeySuffixSize(internal_key));
|
||||
}
|
||||
|
||||
// Returns the sequence number with ValueType removed
|
||||
inline SequenceNumber ExtractSequenceNumber(const Slice& internal_key) {
|
||||
assert(internal_key.size() >= 8);
|
||||
return(DecodeFixed64(internal_key.data() + internal_key.size() - 8)>>8);
|
||||
}
|
||||
|
||||
// Return the length of the encoding of "key".
|
||||
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
|
||||
return key.user_key.size() + KeySuffixSize(key.type);
|
||||
}
|
||||
|
||||
// Riak: is this an expiry key and therefore contain extra ExpiryTime field
|
||||
inline bool IsExpiryKey(ValueType val_type) {
|
||||
return(kTypeValueWriteTime==val_type || kTypeValueExplicitExpiry==val_type);
|
||||
}
|
||||
|
||||
// Riak: is this an expiry key and therefore contain extra ExpiryTime field
|
||||
inline bool IsExpiryKey(const Slice & internal_key) {
|
||||
return(internal_key.size()>=KeySuffixSize(kTypeValueWriteTime)
|
||||
&& IsExpiryKey(ExtractValueType(internal_key)));
|
||||
}
|
||||
|
||||
// Riak: extracts expiry value
|
||||
inline ExpiryTimeMicros ExtractExpiry(const Slice& internal_key) {
|
||||
assert(internal_key.size() >= KeySuffixSize(kTypeValueWriteTime));
|
||||
assert(IsExpiryKey(internal_key));
|
||||
return(DecodeFixed64(internal_key.data() + internal_key.size() - KeySuffixSize(kTypeValueWriteTime)));
|
||||
}
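As a reading aid, the key layouts implied by KeySuffixSize() and the extractors above look roughly like the comment diagram below; the dump helper is the editor's illustration and assumes db/dbformat.h plus <stdio.h> are included, it is not code from this tree.

// Plain key   (kTypeDeletion / kTypeValue):
//   | user_key bytes ... | 8-byte tag = (sequence << 8) | type |
// Expiry key  (kTypeValueWriteTime / kTypeValueExplicitExpiry):
//   | user_key bytes ... | 8-byte expiry (micros) | 8-byte tag |
static void DumpInternalKey(const Slice& ikey) {
  Slice user = ExtractUserKey(ikey);               // strips the 8- or 16-byte suffix
  SequenceNumber seq = ExtractSequenceNumber(ikey);
  ValueType type = ExtractValueType(ikey);
  ExpiryTimeMicros exp = IsExpiryKey(ikey) ? ExtractExpiry(ikey) : 0;
  fprintf(stderr, "user=%s seq=%llu type=%d expiry=%llu\n",
          user.ToString().c_str(), (unsigned long long)seq,
          (int)type, (unsigned long long)exp);
}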
|
||||
|
||||
// A comparator for internal keys that uses a specified comparator for
|
||||
// the user key portion and breaks ties by decreasing sequence number.
|
||||
class InternalKeyComparator : public Comparator {
|
||||
|
@ -129,7 +194,7 @@ class InternalKeyComparator : public Comparator {
|
|||
|
||||
// Filter policy wrapper that converts from internal keys to user keys
|
||||
class InternalFilterPolicy : public FilterPolicy {
|
||||
private:
|
||||
protected:
|
||||
const FilterPolicy* const user_policy_;
|
||||
public:
|
||||
explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
|
||||
|
@ -138,6 +203,12 @@ class InternalFilterPolicy : public FilterPolicy {
|
|||
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
|
||||
};
|
||||
|
||||
class InternalFilterPolicy2 : public InternalFilterPolicy {
|
||||
public:
|
||||
explicit InternalFilterPolicy2(const FilterPolicy* p) : InternalFilterPolicy(p) { }
|
||||
virtual ~InternalFilterPolicy2() {delete user_policy_;};
|
||||
};
|
||||
|
||||
// Modules in this directory should keep internal keys wrapped inside
|
||||
// the following class instead of plain strings so that we do not
|
||||
// incorrectly use string comparisons instead of an InternalKeyComparator.
|
||||
|
@ -146,8 +217,8 @@ class InternalKey {
|
|||
std::string rep_;
|
||||
public:
|
||||
InternalKey() { } // Leave rep_ as empty to indicate it is invalid
|
||||
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
|
||||
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
|
||||
InternalKey(const Slice& user_key, ExpiryTimeMicros exp, SequenceNumber s, ValueType t) {
|
||||
AppendInternalKey(&rep_, ParsedInternalKey(user_key, exp, s, t));
|
||||
}
|
||||
|
||||
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
|
||||
|
@ -157,6 +228,7 @@ class InternalKey {
|
|||
}
|
||||
|
||||
Slice user_key() const { return ExtractUserKey(rep_); }
|
||||
Slice internal_key() const { return Slice(rep_); }
|
||||
|
||||
void SetFrom(const ParsedInternalKey& p) {
|
||||
rep_.clear();
|
||||
|
@ -181,8 +253,12 @@ inline bool ParseInternalKey(const Slice& internal_key,
|
|||
unsigned char c = num & 0xff;
|
||||
result->sequence = num >> 8;
|
||||
result->type = static_cast<ValueType>(c);
|
||||
result->user_key = Slice(internal_key.data(), n - 8);
|
||||
return (c <= static_cast<unsigned char>(kTypeValue));
|
||||
if (IsExpiryKey((ValueType)c))
|
||||
result->expiry=DecodeFixed64(internal_key.data() + n - KeySuffixSize((ValueType)c));
|
||||
else
|
||||
result->expiry=0;
|
||||
result->user_key = Slice(internal_key.data(), n - KeySuffixSize((ValueType)c));
|
||||
return (c <= static_cast<unsigned char>(kTypeValueExplicitExpiry));
|
||||
}
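A hypothetical round trip through the encoders above, using only names declared in this header; the sample values are made up, and the sketch assumes db/dbformat.h, <assert.h>, and <string> are included.

static void ExampleRoundTrip() {
  std::string buf;
  ParsedInternalKey in(Slice("user-key"), /*expiry*/ 1469000000000000ULL,
                       /*sequence*/ 42, kTypeValueExplicitExpiry);
  AppendInternalKey(&buf, in);                     // user_key + expiry + tag

  ParsedInternalKey out("", 0, 0, kTypeValue);
  bool ok = ParseInternalKey(Slice(buf), &out);
  assert(ok);
  assert(out.sequence == 42 && out.expiry == 1469000000000000ULL);
  assert(out.type == kTypeValueExplicitExpiry);
  (void)ok;
}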
|
||||
|
||||
// A helper class useful for DBImpl::Get()
|
||||
|
@ -190,7 +266,7 @@ class LookupKey {
|
|||
public:
|
||||
// Initialize *this for looking up user_key at a snapshot with
|
||||
// the specified sequence number.
|
||||
LookupKey(const Slice& user_key, SequenceNumber sequence);
|
||||
LookupKey(const Slice& user_key, SequenceNumber sequence, KeyMetaData * meta=NULL);
|
||||
|
||||
~LookupKey();
|
||||
|
||||
|
@ -201,12 +277,38 @@ class LookupKey {
|
|||
Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
|
||||
|
||||
// Return the user key
|
||||
Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
|
||||
Slice user_key() const
|
||||
{ return Slice(kstart_, end_ - kstart_ - KeySuffixSize(internal_key())); }
|
||||
|
||||
// did requestor have metadata object?
|
||||
bool WantsKeyMetaData() const {return(NULL!=meta_);};
|
||||
|
||||
void SetKeyMetaData(ValueType type, SequenceNumber seq, ExpiryTimeMicros expiry) const
|
||||
{if (NULL!=meta_)
|
||||
{
|
||||
meta_->m_Type=type;
|
||||
meta_->m_Sequence=seq;
|
||||
meta_->m_Expiry=expiry;
|
||||
} // if
|
||||
};
|
||||
|
||||
void SetKeyMetaData(const ParsedInternalKey & pi_key) const
|
||||
{if (NULL!=meta_)
|
||||
{
|
||||
meta_->m_Type=pi_key.type;
|
||||
meta_->m_Sequence=pi_key.sequence;
|
||||
meta_->m_Expiry=pi_key.expiry;
|
||||
} // if
|
||||
};
|
||||
|
||||
void SetKeyMetaData(const KeyMetaData & meta) const
|
||||
{if (NULL!=meta_) *meta_=meta;};
|
||||
|
||||
private:
|
||||
// We construct a char array of the form:
|
||||
// klength varint32 <-- start_
|
||||
// userkey char[klength] <-- kstart_
|
||||
// optional uint64
|
||||
// tag uint64
|
||||
// <-- end_
|
||||
// The array is a suitable MemTable key.
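To make the layout above concrete, the accessors map onto that buffer roughly as sketched here; memtable_key() is the upstream accessor for the whole buffer and is assumed by the editor, it does not appear in this hunk.

//   start_               kstart_                                end_
//     | varint32 klength  |  user key bytes  [expiry?]  tag(8)   |
static void LookupKeyViews(const Slice& user_key, SequenceNumber snapshot) {
  KeyMetaData meta;                                // filled via SetKeyMetaData() by readers
  LookupKey lkey(user_key, snapshot, &meta);
  Slice mkey = lkey.memtable_key();                // length prefix + user key + tag
  Slice ikey = lkey.internal_key();                // user key + tag, no length prefix
  Slice ukey = lkey.user_key();                    // user key only
  (void)mkey; (void)ikey; (void)ukey;
}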
|
||||
|
@ -216,6 +318,9 @@ class LookupKey {
|
|||
const char* end_;
|
||||
char space_[200]; // Avoid allocation for short keys
|
||||
|
||||
// allow code that finds the key to place metadata here, even if 'const'
|
||||
mutable KeyMetaData * meta_;
|
||||
|
||||
// No copying allowed
|
||||
LookupKey(const LookupKey&);
|
||||
void operator=(const LookupKey&);
|
||||
|
@ -223,8 +328,47 @@ class LookupKey {
|
|||
|
||||
inline LookupKey::~LookupKey() {
|
||||
if (start_ != space_) delete[] start_;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// this class was constructed from code with DBImpl::DoCompactionWork (db_impl.cc)
|
||||
// so it could be shared within BuildTable (and thus reduce Level 0 bloating)
|
||||
class KeyRetirement
|
||||
{
|
||||
protected:
|
||||
// "state" from previous key reviewed
|
||||
std::string current_user_key;
|
||||
bool has_current_user_key;
|
||||
SequenceNumber last_sequence_for_key;
|
||||
|
||||
// database values needed for processing
|
||||
const Comparator * user_comparator;
|
||||
SequenceNumber smallest_snapshot;
|
||||
const Options * options;
|
||||
Compaction * const compaction;
|
||||
|
||||
bool valid;
|
||||
size_t dropped; // tombstone or old version dropped
|
||||
size_t expired; // expired dropped
|
||||
|
||||
public:
|
||||
KeyRetirement(const Comparator * UserComparator, SequenceNumber SmallestSnapshot,
|
||||
const Options * Opts, Compaction * const Compaction=NULL);
|
||||
|
||||
virtual ~KeyRetirement();
|
||||
|
||||
bool operator()(Slice & key);
|
||||
|
||||
size_t GetDroppedCount() const {return(dropped);};
|
||||
size_t GetExpiredCount() const {return(expired);};
|
||||
|
||||
private:
|
||||
KeyRetirement();
|
||||
KeyRetirement(const KeyRetirement &);
|
||||
const KeyRetirement & operator=(const KeyRetirement &);
|
||||
|
||||
}; // class KeyRetirement
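A sketch of how this functor is meant to be driven from a compaction-style loop; the surrounding loop, the wrapper function, and the TableBuilder usage are the editor's illustration (assuming the usual leveldb headers), not code taken from this diff.

static void DropObsoleteKeys(Iterator* input, TableBuilder* builder,
                             const Options& options,
                             SequenceNumber smallest_snapshot,
                             Compaction* compaction) {
  KeyRetirement retire(options.comparator, smallest_snapshot, &options, compaction);
  for (input->SeekToFirst(); input->Valid(); input->Next()) {
    Slice key = input->key();
    if (retire(key)) {
      continue;                                    // tombstone, shadowed version, or expired
    }
    builder->Add(key, input->value());
  }
  Log(options.info_log, "dropped %zu, expired %zu",
      retire.GetDroppedCount(), retire.GetExpiredCount());
}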
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_DB_DBFORMAT_H_
|
||||
#endif // STORAGE_LEVELDB_DB_FORMAT_H_
|
||||
|
|
|
@ -9,10 +9,11 @@
|
|||
namespace leveldb {
|
||||
|
||||
static std::string IKey(const std::string& user_key,
|
||||
ExpiryTimeMicros exp,
|
||||
uint64_t seq,
|
||||
ValueType vt) {
|
||||
std::string encoded;
|
||||
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
|
||||
AppendInternalKey(&encoded, ParsedInternalKey(user_key, exp, seq, vt));
|
||||
return encoded;
|
||||
}
|
||||
|
||||
|
@ -29,12 +30,13 @@ static std::string ShortSuccessor(const std::string& s) {
|
|||
}
|
||||
|
||||
static void TestKey(const std::string& key,
|
||||
ExpiryTimeMicros exp,
|
||||
uint64_t seq,
|
||||
ValueType vt) {
|
||||
std::string encoded = IKey(key, seq, vt);
|
||||
std::string encoded = IKey(key, exp, seq, vt);
|
||||
|
||||
Slice in(encoded);
|
||||
ParsedInternalKey decoded("", 0, kTypeValue);
|
||||
ParsedInternalKey decoded("", 0, 0, kTypeValue);
|
||||
|
||||
ASSERT_TRUE(ParseInternalKey(in, &decoded));
|
||||
ASSERT_EQ(key, decoded.user_key.ToString());
|
||||
|
@ -56,53 +58,53 @@ TEST(FormatTest, InternalKey_EncodeDecode) {
|
|||
};
|
||||
for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
|
||||
for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
|
||||
TestKey(keys[k], seq[s], kTypeValue);
|
||||
TestKey("hello", 1, kTypeDeletion);
|
||||
TestKey(keys[k], 0, seq[s], kTypeValue);
|
||||
TestKey("hello", 0, 1, kTypeDeletion);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(FormatTest, InternalKeyShortSeparator) {
|
||||
// When user keys are same
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 99, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 101, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 100, kTypeDeletion)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("foo", 0, 99, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("foo", 0, 101, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("foo", 0, 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("foo", 0, 100, kTypeDeletion)));
|
||||
|
||||
// When user keys are misordered
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("bar", 99, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("bar", 0, 99, kTypeValue)));
|
||||
|
||||
// When user keys are different, but correctly ordered
|
||||
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("hello", 200, kTypeValue)));
|
||||
ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("hello", 0, 200, kTypeValue)));
|
||||
|
||||
// When start user key is prefix of limit user key
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foobar", 200, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||
IKey("foobar", 0, 200, kTypeValue)));
|
||||
|
||||
// When limit user key is prefix of start user key
|
||||
ASSERT_EQ(IKey("foobar", 100, kTypeValue),
|
||||
Shorten(IKey("foobar", 100, kTypeValue),
|
||||
IKey("foo", 200, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foobar", 0, 100, kTypeValue),
|
||||
Shorten(IKey("foobar", 0, 100, kTypeValue),
|
||||
IKey("foo", 0, 200, kTypeValue)));
|
||||
}
|
||||
|
||||
TEST(FormatTest, InternalKeyShortestSuccessor) {
|
||||
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
|
||||
ShortSuccessor(IKey("foo", 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
|
||||
ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek),
|
||||
ShortSuccessor(IKey("foo", 0, 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("\xff\xff", 0, 100, kTypeValue),
|
||||
ShortSuccessor(IKey("\xff\xff", 0, 100, kTypeValue)));
|
||||
}
|
||||
|
||||
} // namespace leveldb
@ -1,554 +0,0 @@
|
|||
// Copyright 2014 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
// This test uses a custom Env to keep track of the state of a filesystem as of
|
||||
// the last "sync". It then checks for data loss errors by purposely dropping
|
||||
// file data (or entire files) not protected by a "sync".
|
||||
|
||||
#include "leveldb/db.h"
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include "db/db_impl.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/log_format.h"
|
||||
#include "db/version_set.h"
|
||||
#include "leveldb/cache.h"
|
||||
#include "leveldb/env.h"
|
||||
#include "leveldb/table.h"
|
||||
#include "leveldb/write_batch.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
static const int kValueSize = 1000;
|
||||
static const int kMaxNumValues = 2000;
|
||||
static const size_t kNumIterations = 3;
|
||||
|
||||
class FaultInjectionTestEnv;
|
||||
|
||||
namespace {
|
||||
|
||||
// Assume a filename, and not a directory name like "/foo/bar/"
|
||||
static std::string GetDirName(const std::string filename) {
|
||||
size_t found = filename.find_last_of("/\\");
|
||||
if (found == std::string::npos) {
|
||||
return "";
|
||||
} else {
|
||||
return filename.substr(0, found);
|
||||
}
|
||||
}
|
||||
|
||||
Status SyncDir(const std::string& dir) {
|
||||
// As this is a test it isn't required to *actually* sync this directory.
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// A basic file truncation function suitable for this test.
|
||||
Status Truncate(const std::string& filename, uint64_t length) {
|
||||
leveldb::Env* env = leveldb::Env::Default();
|
||||
|
||||
SequentialFile* orig_file;
|
||||
Status s = env->NewSequentialFile(filename, &orig_file);
|
||||
if (!s.ok())
|
||||
return s;
|
||||
|
||||
char* scratch = new char[length];
|
||||
leveldb::Slice result;
|
||||
s = orig_file->Read(length, &result, scratch);
|
||||
delete orig_file;
|
||||
if (s.ok()) {
|
||||
std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
|
||||
WritableFile* tmp_file;
|
||||
s = env->NewWritableFile(tmp_name, &tmp_file);
|
||||
if (s.ok()) {
|
||||
s = tmp_file->Append(result);
|
||||
delete tmp_file;
|
||||
if (s.ok()) {
|
||||
s = env->RenameFile(tmp_name, filename);
|
||||
} else {
|
||||
env->DeleteFile(tmp_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delete[] scratch;
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
struct FileState {
|
||||
std::string filename_;
|
||||
ssize_t pos_;
|
||||
ssize_t pos_at_last_sync_;
|
||||
ssize_t pos_at_last_flush_;
|
||||
|
||||
FileState(const std::string& filename)
|
||||
: filename_(filename),
|
||||
pos_(-1),
|
||||
pos_at_last_sync_(-1),
|
||||
pos_at_last_flush_(-1) { }
|
||||
|
||||
FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
|
||||
|
||||
bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
|
||||
|
||||
Status DropUnsyncedData() const;
|
||||
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
// A wrapper around WritableFile which informs another Env whenever this file
|
||||
// is written to or sync'ed.
|
||||
class TestWritableFile : public WritableFile {
|
||||
public:
|
||||
TestWritableFile(const FileState& state,
|
||||
WritableFile* f,
|
||||
FaultInjectionTestEnv* env);
|
||||
virtual ~TestWritableFile();
|
||||
virtual Status Append(const Slice& data);
|
||||
virtual Status Close();
|
||||
virtual Status Flush();
|
||||
virtual Status Sync();
|
||||
|
||||
private:
|
||||
FileState state_;
|
||||
WritableFile* target_;
|
||||
bool writable_file_opened_;
|
||||
FaultInjectionTestEnv* env_;
|
||||
|
||||
Status SyncParent();
|
||||
};
|
||||
|
||||
class FaultInjectionTestEnv : public EnvWrapper {
|
||||
public:
|
||||
FaultInjectionTestEnv() : EnvWrapper(Env::Default()), filesystem_active_(true) {}
|
||||
virtual ~FaultInjectionTestEnv() { }
|
||||
virtual Status NewWritableFile(const std::string& fname,
|
||||
WritableFile** result);
|
||||
virtual Status NewAppendableFile(const std::string& fname,
|
||||
WritableFile** result);
|
||||
virtual Status DeleteFile(const std::string& f);
|
||||
virtual Status RenameFile(const std::string& s, const std::string& t);
|
||||
|
||||
void WritableFileClosed(const FileState& state);
|
||||
Status DropUnsyncedFileData();
|
||||
Status DeleteFilesCreatedAfterLastDirSync();
|
||||
void DirWasSynced();
|
||||
bool IsFileCreatedSinceLastDirSync(const std::string& filename);
|
||||
void ResetState();
|
||||
void UntrackFile(const std::string& f);
|
||||
// Setting the filesystem to inactive is the test equivalent to simulating a
|
||||
// system reset. Setting to inactive will freeze our saved filesystem state so
|
||||
// that it will stop being recorded. It can then be reset back to the state at
|
||||
// the time of the reset.
|
||||
bool IsFilesystemActive() const { return filesystem_active_; }
|
||||
void SetFilesystemActive(bool active) { filesystem_active_ = active; }
|
||||
|
||||
private:
|
||||
port::Mutex mutex_;
|
||||
std::map<std::string, FileState> db_file_state_;
|
||||
std::set<std::string> new_files_since_last_dir_sync_;
|
||||
bool filesystem_active_; // Record flushes, syncs, writes
|
||||
};
|
||||
|
||||
TestWritableFile::TestWritableFile(const FileState& state,
|
||||
WritableFile* f,
|
||||
FaultInjectionTestEnv* env)
|
||||
: state_(state),
|
||||
target_(f),
|
||||
writable_file_opened_(true),
|
||||
env_(env) {
|
||||
assert(f != NULL);
|
||||
}
|
||||
|
||||
TestWritableFile::~TestWritableFile() {
|
||||
if (writable_file_opened_) {
|
||||
Close();
|
||||
}
|
||||
delete target_;
|
||||
}
|
||||
|
||||
Status TestWritableFile::Append(const Slice& data) {
|
||||
Status s = target_->Append(data);
|
||||
if (s.ok() && env_->IsFilesystemActive()) {
|
||||
state_.pos_ += data.size();
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status TestWritableFile::Close() {
|
||||
writable_file_opened_ = false;
|
||||
Status s = target_->Close();
|
||||
if (s.ok()) {
|
||||
env_->WritableFileClosed(state_);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status TestWritableFile::Flush() {
|
||||
Status s = target_->Flush();
|
||||
if (s.ok() && env_->IsFilesystemActive()) {
|
||||
state_.pos_at_last_flush_ = state_.pos_;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status TestWritableFile::SyncParent() {
|
||||
Status s = SyncDir(GetDirName(state_.filename_));
|
||||
if (s.ok()) {
|
||||
env_->DirWasSynced();
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status TestWritableFile::Sync() {
|
||||
if (!env_->IsFilesystemActive()) {
|
||||
return Status::OK();
|
||||
}
|
||||
// Ensure new files referred to by the manifest are in the filesystem.
|
||||
Status s = target_->Sync();
|
||||
if (s.ok()) {
|
||||
state_.pos_at_last_sync_ = state_.pos_;
|
||||
}
|
||||
if (env_->IsFileCreatedSinceLastDirSync(state_.filename_)) {
|
||||
Status ps = SyncParent();
|
||||
if (s.ok() && !ps.ok()) {
|
||||
s = ps;
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status FaultInjectionTestEnv::NewWritableFile(const std::string& fname,
|
||||
WritableFile** result) {
|
||||
WritableFile* actual_writable_file;
|
||||
Status s = target()->NewWritableFile(fname, &actual_writable_file);
|
||||
if (s.ok()) {
|
||||
FileState state(fname);
|
||||
state.pos_ = 0;
|
||||
*result = new TestWritableFile(state, actual_writable_file, this);
|
||||
// NewWritableFile doesn't append to files, so if the same file is
|
||||
// opened again then it will be truncated - so forget our saved
|
||||
// state.
|
||||
UntrackFile(fname);
|
||||
MutexLock l(&mutex_);
|
||||
new_files_since_last_dir_sync_.insert(fname);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status FaultInjectionTestEnv::NewAppendableFile(const std::string& fname,
|
||||
WritableFile** result) {
|
||||
WritableFile* actual_writable_file;
|
||||
Status s = target()->NewAppendableFile(fname, &actual_writable_file);
|
||||
if (s.ok()) {
|
||||
FileState state(fname);
|
||||
state.pos_ = 0;
|
||||
{
|
||||
MutexLock l(&mutex_);
|
||||
if (db_file_state_.count(fname) == 0) {
|
||||
new_files_since_last_dir_sync_.insert(fname);
|
||||
} else {
|
||||
state = db_file_state_[fname];
|
||||
}
|
||||
}
|
||||
*result = new TestWritableFile(state, actual_writable_file, this);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status FaultInjectionTestEnv::DropUnsyncedFileData() {
|
||||
Status s;
|
||||
MutexLock l(&mutex_);
|
||||
for (std::map<std::string, FileState>::const_iterator it =
|
||||
db_file_state_.begin();
|
||||
s.ok() && it != db_file_state_.end(); ++it) {
|
||||
const FileState& state = it->second;
|
||||
if (!state.IsFullySynced()) {
|
||||
s = state.DropUnsyncedData();
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
void FaultInjectionTestEnv::DirWasSynced() {
|
||||
MutexLock l(&mutex_);
|
||||
new_files_since_last_dir_sync_.clear();
|
||||
}
|
||||
|
||||
bool FaultInjectionTestEnv::IsFileCreatedSinceLastDirSync(
|
||||
const std::string& filename) {
|
||||
MutexLock l(&mutex_);
|
||||
return new_files_since_last_dir_sync_.find(filename) !=
|
||||
new_files_since_last_dir_sync_.end();
|
||||
}
|
||||
|
||||
void FaultInjectionTestEnv::UntrackFile(const std::string& f) {
|
||||
MutexLock l(&mutex_);
|
||||
db_file_state_.erase(f);
|
||||
new_files_since_last_dir_sync_.erase(f);
|
||||
}
|
||||
|
||||
Status FaultInjectionTestEnv::DeleteFile(const std::string& f) {
|
||||
Status s = EnvWrapper::DeleteFile(f);
|
||||
ASSERT_OK(s);
|
||||
if (s.ok()) {
|
||||
UntrackFile(f);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status FaultInjectionTestEnv::RenameFile(const std::string& s,
|
||||
const std::string& t) {
|
||||
Status ret = EnvWrapper::RenameFile(s, t);
|
||||
|
||||
if (ret.ok()) {
|
||||
MutexLock l(&mutex_);
|
||||
if (db_file_state_.find(s) != db_file_state_.end()) {
|
||||
db_file_state_[t] = db_file_state_[s];
|
||||
db_file_state_.erase(s);
|
||||
}
|
||||
|
||||
if (new_files_since_last_dir_sync_.erase(s) != 0) {
|
||||
assert(new_files_since_last_dir_sync_.find(t) ==
|
||||
new_files_since_last_dir_sync_.end());
|
||||
new_files_since_last_dir_sync_.insert(t);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void FaultInjectionTestEnv::ResetState() {
|
||||
// Since we are not destroying the database, the existing files
|
||||
// should keep their recorded synced/flushed state. Therefore
|
||||
// we do not reset db_file_state_ and new_files_since_last_dir_sync_.
|
||||
MutexLock l(&mutex_);
|
||||
SetFilesystemActive(true);
|
||||
}
|
||||
|
||||
Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() {
|
||||
// Because DeleteFile access this container make a copy to avoid deadlock
|
||||
mutex_.Lock();
|
||||
std::set<std::string> new_files(new_files_since_last_dir_sync_.begin(),
|
||||
new_files_since_last_dir_sync_.end());
|
||||
mutex_.Unlock();
|
||||
Status s;
|
||||
std::set<std::string>::const_iterator it;
|
||||
for (it = new_files.begin(); s.ok() && it != new_files.end(); ++it) {
|
||||
s = DeleteFile(*it);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) {
|
||||
MutexLock l(&mutex_);
|
||||
db_file_state_[state.filename_] = state;
|
||||
}
|
||||
|
||||
Status FileState::DropUnsyncedData() const {
|
||||
ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
|
||||
return Truncate(filename_, sync_pos);
|
||||
}
|
||||
|
||||
class FaultInjectionTest {
|
||||
public:
|
||||
enum ExpectedVerifResult { VAL_EXPECT_NO_ERROR, VAL_EXPECT_ERROR };
|
||||
enum ResetMethod { RESET_DROP_UNSYNCED_DATA, RESET_DELETE_UNSYNCED_FILES };
|
||||
|
||||
FaultInjectionTestEnv* env_;
|
||||
std::string dbname_;
|
||||
Cache* tiny_cache_;
|
||||
Options options_;
|
||||
DB* db_;
|
||||
|
||||
FaultInjectionTest()
|
||||
: env_(new FaultInjectionTestEnv),
|
||||
tiny_cache_(NewLRUCache(100)),
|
||||
db_(NULL) {
|
||||
dbname_ = test::TmpDir() + "/fault_test";
|
||||
DestroyDB(dbname_, Options()); // Destroy any db from earlier run
|
||||
options_.reuse_logs = true;
|
||||
options_.env = env_;
|
||||
options_.paranoid_checks = true;
|
||||
options_.block_cache = tiny_cache_;
|
||||
options_.create_if_missing = true;
|
||||
}
|
||||
|
||||
~FaultInjectionTest() {
|
||||
CloseDB();
|
||||
DestroyDB(dbname_, Options());
|
||||
delete tiny_cache_;
|
||||
delete env_;
|
||||
}
|
||||
|
||||
void ReuseLogs(bool reuse) {
|
||||
options_.reuse_logs = reuse;
|
||||
}
|
||||
|
||||
void Build(int start_idx, int num_vals) {
|
||||
std::string key_space, value_space;
|
||||
WriteBatch batch;
|
||||
for (int i = start_idx; i < start_idx + num_vals; i++) {
|
||||
Slice key = Key(i, &key_space);
|
||||
batch.Clear();
|
||||
batch.Put(key, Value(i, &value_space));
|
||||
WriteOptions options;
|
||||
ASSERT_OK(db_->Write(options, &batch));
|
||||
}
|
||||
}
|
||||
|
||||
Status ReadValue(int i, std::string* val) const {
|
||||
std::string key_space, value_space;
|
||||
Slice key = Key(i, &key_space);
|
||||
Value(i, &value_space);
|
||||
ReadOptions options;
|
||||
return db_->Get(options, key, val);
|
||||
}
|
||||
|
||||
Status Verify(int start_idx, int num_vals,
|
||||
ExpectedVerifResult expected) const {
|
||||
std::string val;
|
||||
std::string value_space;
|
||||
Status s;
|
||||
for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
|
||||
Value(i, &value_space);
|
||||
s = ReadValue(i, &val);
|
||||
if (expected == VAL_EXPECT_NO_ERROR) {
|
||||
if (s.ok()) {
|
||||
ASSERT_EQ(value_space, val);
|
||||
}
|
||||
} else if (s.ok()) {
|
||||
fprintf(stderr, "Expected an error at %d, but was OK\n", i);
|
||||
s = Status::IOError(dbname_, "Expected value error:");
|
||||
} else {
|
||||
s = Status::OK(); // An expected error
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
// Return the ith key
|
||||
Slice Key(int i, std::string* storage) const {
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "%016d", i);
|
||||
storage->assign(buf, strlen(buf));
|
||||
return Slice(*storage);
|
||||
}
|
||||
|
||||
// Return the value to associate with the specified key
|
||||
Slice Value(int k, std::string* storage) const {
|
||||
Random r(k);
|
||||
return test::RandomString(&r, kValueSize, storage);
|
||||
}
|
||||
|
||||
Status OpenDB() {
|
||||
delete db_;
|
||||
db_ = NULL;
|
||||
env_->ResetState();
|
||||
return DB::Open(options_, dbname_, &db_);
|
||||
}
|
||||
|
||||
void CloseDB() {
|
||||
delete db_;
|
||||
db_ = NULL;
|
||||
}
|
||||
|
||||
void DeleteAllData() {
|
||||
Iterator* iter = db_->NewIterator(ReadOptions());
|
||||
WriteOptions options;
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
|
||||
}
|
||||
|
||||
delete iter;
|
||||
}
|
||||
|
||||
void ResetDBState(ResetMethod reset_method) {
|
||||
switch (reset_method) {
|
||||
case RESET_DROP_UNSYNCED_DATA:
|
||||
ASSERT_OK(env_->DropUnsyncedFileData());
|
||||
break;
|
||||
case RESET_DELETE_UNSYNCED_FILES:
|
||||
ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
|
||||
DeleteAllData();
|
||||
Build(0, num_pre_sync);
|
||||
db_->CompactRange(NULL, NULL);
|
||||
Build(num_pre_sync, num_post_sync);
|
||||
}
|
||||
|
||||
void PartialCompactTestReopenWithFault(ResetMethod reset_method,
|
||||
int num_pre_sync,
|
||||
int num_post_sync) {
|
||||
env_->SetFilesystemActive(false);
|
||||
CloseDB();
|
||||
ResetDBState(reset_method);
|
||||
ASSERT_OK(OpenDB());
|
||||
ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::VAL_EXPECT_NO_ERROR));
|
||||
ASSERT_OK(Verify(num_pre_sync, num_post_sync, FaultInjectionTest::VAL_EXPECT_ERROR));
|
||||
}
|
||||
|
||||
void NoWriteTestPreFault() {
|
||||
}
|
||||
|
||||
void NoWriteTestReopenWithFault(ResetMethod reset_method) {
|
||||
CloseDB();
|
||||
ResetDBState(reset_method);
|
||||
ASSERT_OK(OpenDB());
|
||||
}
|
||||
|
||||
void DoTest() {
|
||||
Random rnd(0);
|
||||
ASSERT_OK(OpenDB());
|
||||
for (size_t idx = 0; idx < kNumIterations; idx++) {
|
||||
int num_pre_sync = rnd.Uniform(kMaxNumValues);
|
||||
int num_post_sync = rnd.Uniform(kMaxNumValues);
|
||||
|
||||
PartialCompactTestPreFault(num_pre_sync, num_post_sync);
|
||||
PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA,
|
||||
num_pre_sync,
|
||||
num_post_sync);
|
||||
|
||||
NoWriteTestPreFault();
|
||||
NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA);
|
||||
|
||||
PartialCompactTestPreFault(num_pre_sync, num_post_sync);
|
||||
// No new files created so we expect all values since no files will be
|
||||
// dropped.
|
||||
PartialCompactTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES,
|
||||
num_pre_sync + num_post_sync,
|
||||
0);
|
||||
|
||||
NoWriteTestPreFault();
|
||||
NoWriteTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
TEST(FaultInjectionTest, FaultTestNoLogReuse) {
|
||||
ReuseLogs(false);
|
||||
DoTest();
|
||||
}
|
||||
|
||||
TEST(FaultInjectionTest, FaultTestWithLogReuse) {
|
||||
ReuseLogs(true);
|
||||
DoTest();
|
||||
}
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return leveldb::test::RunAllTests();
|
||||
}
|
|
@ -4,9 +4,14 @@
|
|||
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include "db/filename.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/version_set.h"
|
||||
#include "leveldb/env.h"
|
||||
#include "leveldb/status.h"
|
||||
#include "util/logging.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
@ -24,19 +29,50 @@ static std::string MakeFileName(const std::string& name, uint64_t number,
|
|||
return name + buf;
|
||||
}
|
||||
|
||||
static std::string MakeFileName2(const Options & options, uint64_t number,
|
||||
int level, const char* suffix) {
|
||||
char buf[100];
|
||||
if (0<=level)
|
||||
snprintf(buf, sizeof(buf), "/%s_%-d/%06llu.%s",
|
||||
suffix, level,
|
||||
static_cast<unsigned long long>(number),
|
||||
suffix);
|
||||
else if (-1==level)
|
||||
snprintf(buf, sizeof(buf), "/%s/%06llu.%s",
|
||||
suffix,
|
||||
static_cast<unsigned long long>(number),
|
||||
suffix);
|
||||
else if (-2==level)
|
||||
snprintf(buf, sizeof(buf), "/%06llu.%s",
|
||||
static_cast<unsigned long long>(number),
|
||||
suffix);
|
||||
|
||||
return((level<(int)options.tiered_slow_level ?
|
||||
options.tiered_fast_prefix : options.tiered_slow_prefix) + buf);
|
||||
}
|
||||
|
||||
std::string MakeDirName2(const Options & options,
|
||||
int level, const char* suffix) {
|
||||
char buf[100];
|
||||
if (-1!=level)
|
||||
snprintf(buf, sizeof(buf), "/%s_%-d",
|
||||
suffix, level);
|
||||
else
|
||||
snprintf(buf, sizeof(buf), "/%s",
|
||||
suffix);
|
||||
|
||||
return((level<(int)options.tiered_slow_level ?
|
||||
options.tiered_fast_prefix : options.tiered_slow_prefix) + buf);
|
||||
}
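For reference, the path shapes these helpers produce; the values below mirror the cases exercised in filename_test.cc later in this diff, and the wrapper function is the editor's own (assumes db/filename.h and leveldb/options.h).

static void TieredNameExamples() {
  Options opt;
  opt.tiered_slow_level  = 4;
  opt.tiered_fast_prefix = "fast";
  opt.tiered_slow_prefix = "slow";
  std::string a = TableFileName(opt, 500, 3);      // "fast/sst_3/000500.sst"  (level < slow level)
  std::string b = TableFileName(opt, 600, 4);      // "slow/sst_4/000600.sst"  (level >= slow level)
  std::string d = MakeDirName2(opt, 2, "sst");     // "fast/sst_2"
  (void)a; (void)b; (void)d;
}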
|
||||
|
||||
std::string LogFileName(const std::string& name, uint64_t number) {
|
||||
assert(number > 0);
|
||||
return MakeFileName(name, number, "log");
|
||||
}
|
||||
|
||||
std::string TableFileName(const std::string& name, uint64_t number) {
|
||||
std::string TableFileName(const Options & options, uint64_t number, int level) {
|
||||
assert(number > 0);
|
||||
return MakeFileName(name, number, "ldb");
|
||||
}
|
||||
|
||||
std::string SSTTableFileName(const std::string& name, uint64_t number) {
|
||||
assert(number > 0);
|
||||
return MakeFileName(name, number, "sst");
|
||||
return MakeFileName2(options, number, level, "sst");
|
||||
}
|
||||
|
||||
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
|
||||
|
@ -69,6 +105,36 @@ std::string OldInfoLogFileName(const std::string& dbname) {
|
|||
return dbname + "/LOG.old";
|
||||
}
|
||||
|
||||
//
|
||||
std::string CowFileName(const std::string& dbname) {
|
||||
return dbname + "/COW";
|
||||
}
|
||||
|
||||
|
||||
// Append appropriate "backup" string to input path
|
||||
std::string BackupPath(const std::string& dbname, int backup_num) {
|
||||
std::string dirname;
|
||||
|
||||
char buf[100];
|
||||
if (0 != backup_num)
|
||||
snprintf(buf, sizeof(buf), "/backup.%-d", backup_num);
|
||||
else
|
||||
snprintf(buf, sizeof(buf), "/backup");
|
||||
|
||||
return(dbname + buf);
|
||||
}
|
||||
|
||||
|
||||
// update tiered_fast_prefix and tiered_slow_prefix members of
|
||||
// given Options object to point to desired backup path
|
||||
bool SetBackupPaths(Options & options, int backup_num) {
|
||||
|
||||
options.tiered_fast_prefix = BackupPath(options.tiered_fast_prefix, backup_num);
|
||||
options.tiered_slow_prefix = BackupPath(options.tiered_slow_prefix, backup_num);
|
||||
|
||||
return(true);
|
||||
}
|
||||
|
||||
|
||||
// Owned filenames have the form:
|
||||
// dbname/CURRENT
|
||||
|
@ -76,7 +142,8 @@ std::string OldInfoLogFileName(const std::string& dbname) {
|
|||
// dbname/LOG
|
||||
// dbname/LOG.old
|
||||
// dbname/MANIFEST-[0-9]+
|
||||
// dbname/[0-9]+.(log|sst|ldb)
|
||||
// dbname/[0-9]+.(log|sst)
|
||||
// dbname/COW
|
||||
bool ParseFileName(const std::string& fname,
|
||||
uint64_t* number,
|
||||
FileType* type) {
|
||||
|
@ -84,6 +151,9 @@ bool ParseFileName(const std::string& fname,
|
|||
if (rest == "CURRENT") {
|
||||
*number = 0;
|
||||
*type = kCurrentFile;
|
||||
} else if (rest == "COW") {
|
||||
*number = 0;
|
||||
*type = kCacheWarming;
|
||||
} else if (rest == "LOCK") {
|
||||
*number = 0;
|
||||
*type = kDBLockFile;
|
||||
|
@ -111,7 +181,7 @@ bool ParseFileName(const std::string& fname,
|
|||
Slice suffix = rest;
|
||||
if (suffix == Slice(".log")) {
|
||||
*type = kLogFile;
|
||||
} else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) {
|
||||
} else if (suffix == Slice(".sst")) {
|
||||
*type = kTableFile;
|
||||
} else if (suffix == Slice(".dbtmp")) {
|
||||
*type = kTempFile;
|
||||
|
@ -141,4 +211,99 @@ Status SetCurrentFile(Env* env, const std::string& dbname,
|
|||
return s;
|
||||
}
|
||||
|
||||
|
||||
Status
|
||||
MakeLevelDirectories(Env * env, const Options & options)
|
||||
{
|
||||
Status ret_stat;
|
||||
int level;
|
||||
std::string dirname;
|
||||
|
||||
for (level=0; level<config::kNumLevels && ret_stat.ok(); ++level)
|
||||
{
|
||||
dirname=MakeDirName2(options, level, "sst");
|
||||
|
||||
// ignoring error since no way to tell if "bad" error, or "already exists" error
|
||||
env->CreateDir(dirname.c_str());
|
||||
} // for
|
||||
|
||||
return(ret_stat);
|
||||
|
||||
} // MakeLevelDirectories
|
||||
|
||||
|
||||
bool
|
||||
TestForLevelDirectories(
|
||||
Env * env,
|
||||
const Options & options,
|
||||
Version * version)
|
||||
{
|
||||
bool ret_flag, again;
|
||||
int level;
|
||||
std::string dirname;
|
||||
|
||||
ret_flag=true;
|
||||
again=true;
|
||||
|
||||
// walk backwards, fault will be in higher levels if partial conversion
|
||||
for (level=config::kNumLevels-1; 0<=level && again; --level)
|
||||
{
|
||||
again=false;
|
||||
|
||||
// does directory exist
|
||||
dirname=MakeDirName2(options, level, "sst");
|
||||
ret_flag=env->FileExists(dirname.c_str());
|
||||
|
||||
// do all files exist in level
|
||||
if (ret_flag)
|
||||
{
|
||||
const std::vector<FileMetaData*> & level_files(version->GetFileList(level));
|
||||
std::vector<FileMetaData*>::const_iterator it;
|
||||
std::string table_name;
|
||||
Status s;
|
||||
|
||||
for (it=level_files.begin(); level_files.end()!=it && ret_flag; ++it)
|
||||
{
|
||||
table_name=TableFileName(options, (*it)->number, level);
|
||||
ret_flag=env->FileExists(table_name.c_str());
|
||||
} // for
|
||||
|
||||
again=ret_flag && 0==level_files.size();
|
||||
} // if
|
||||
} // for
|
||||
|
||||
return(ret_flag);
|
||||
|
||||
} // TestForLevelDirectories
|
||||
|
||||
std::string // replacement dbname ... potentially tiered
|
||||
MakeTieredDbname(
|
||||
const std::string & dbname, // input ... original dbname from DBImpl constructor
|
||||
Options & options) // input/output ... writable Options, tiered values changed
|
||||
{
|
||||
// case for "", used with internal calls to DestroyDB
|
||||
if (0==dbname.size() && 0!=options.tiered_fast_prefix.size())
|
||||
{
|
||||
// do NOTHING ... options already initialized
|
||||
} // if
|
||||
else if (0<(int)options.tiered_slow_level && (int)options.tiered_slow_level<config::kNumLevels
|
||||
&& 0!=options.tiered_fast_prefix.size() && 0!=options.tiered_slow_prefix.size())
|
||||
{
|
||||
options.tiered_fast_prefix.append("/");
|
||||
options.tiered_fast_prefix.append(dbname);
|
||||
|
||||
options.tiered_slow_prefix.append("/");
|
||||
options.tiered_slow_prefix.append(dbname);
|
||||
} // else if
|
||||
else
|
||||
{
|
||||
options.tiered_slow_level=0;
|
||||
options.tiered_fast_prefix=dbname; // duplicate as is
|
||||
options.tiered_slow_prefix=dbname;
|
||||
} // else
|
||||
|
||||
return(options.tiered_fast_prefix);
|
||||
|
||||
} // MakeTieredDbname
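A worked example of the branches above; the expected strings match the assertions in filename_test.cc later in this diff, and the wrapper function is the editor's illustration.

static void TieredDbnameExample() {
  Options opt;
  opt.tiered_slow_level  = 4;
  opt.tiered_fast_prefix = "//mnt/fast";
  opt.tiered_slow_prefix = "//mnt/slow";
  std::string dbname = MakeTieredDbname("riak/data/leveldb", opt);
  // dbname                 == "//mnt/fast/riak/data/leveldb"
  // opt.tiered_fast_prefix == "//mnt/fast/riak/data/leveldb"
  // opt.tiered_slow_prefix == "//mnt/slow/riak/data/leveldb"
  //
  // With tiering disabled (tiered_slow_level == 0), both prefixes simply
  // become the dbname that was passed in.
  (void)dbname;
}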
|
||||
|
||||
} // namespace leveldb
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include "leveldb/options.h"
|
||||
#include "leveldb/slice.h"
|
||||
#include "leveldb/status.h"
|
||||
#include "port/port.h"
|
||||
|
@ -16,6 +17,7 @@
|
|||
namespace leveldb {
|
||||
|
||||
class Env;
|
||||
class Version;
|
||||
|
||||
enum FileType {
|
||||
kLogFile,
|
||||
|
@ -24,9 +26,24 @@ enum FileType {
|
|||
kDescriptorFile,
|
||||
kCurrentFile,
|
||||
kTempFile,
|
||||
kInfoLogFile // Either the current one, or an old one
|
||||
kInfoLogFile, // Either the current one, or an old one
|
||||
kCacheWarming
|
||||
};
|
||||
|
||||
// Riak specific routine to help create sst_? subdirectory names
|
||||
std::string MakeDirName2(const Options & options,
|
||||
int level, const char* suffix);
|
||||
|
||||
// Riak specific routine to help create sst_? subdirectories
|
||||
Status MakeLevelDirectories(Env * env, const Options & options);
|
||||
|
||||
// Riak specific routine to test if sst_? subdirectories exist
|
||||
bool TestForLevelDirectories(Env * env, const Options & options, class Version *);
|
||||
|
||||
// Riak specific routine to standardize conversion of dbname and
|
||||
// Options' tiered directories (options parameter is MODIFIED)
|
||||
std::string MakeTieredDbname(const std::string &dbname, Options & options_rw);
|
||||
|
||||
// Return the name of the log file with the specified number
|
||||
// in the db named by "dbname". The result will be prefixed with
|
||||
// "dbname".
|
||||
|
@ -35,12 +52,8 @@ extern std::string LogFileName(const std::string& dbname, uint64_t number);
|
|||
// Return the name of the sstable with the specified number
|
||||
// in the db named by "dbname". The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string TableFileName(const std::string& dbname, uint64_t number);
|
||||
|
||||
// Return the legacy file name for an sstable with the specified number
|
||||
// in the db named by "dbname". The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string SSTTableFileName(const std::string& dbname, uint64_t number);
|
||||
extern std::string TableFileName(const Options & options, uint64_t number,
|
||||
int level);
|
||||
|
||||
// Return the name of the descriptor file for the db named by
|
||||
// "dbname" and the specified incarnation number. The result will be
|
||||
|
@ -67,10 +80,21 @@ extern std::string InfoLogFileName(const std::string& dbname);
|
|||
// Return the name of the old info log file for "dbname".
|
||||
extern std::string OldInfoLogFileName(const std::string& dbname);
|
||||
|
||||
// Return the name of the cache object file for the db named by
|
||||
// "dbname". The result will be prefixed with "dbname".
|
||||
extern std::string CowFileName(const std::string& dbname);
|
||||
|
||||
// Append appropriate "backup" string to input path
|
||||
extern std::string BackupPath(const std::string& dbname, int backup_num);
|
||||
|
||||
// update tiered_fast_prefix and tiered_slow_prefix members of
|
||||
// given Options object to point to backup path
|
||||
extern bool SetBackupPaths(Options & options, int backup_num);
|
||||
|
||||
// If filename is a leveldb file, store the type of the file in *type.
|
||||
// The number encoded in the filename is stored in *number. If the
|
||||
// filename was successfully parsed, returns true. Else return false.
|
||||
extern bool ParseFileName(const std::string& filename,
|
||||
extern bool ParseFileName(const std::string& tiered_filename,
|
||||
uint64_t* number,
|
||||
FileType* type);
|
||||
|
||||
|
|
|
@ -27,7 +27,6 @@ TEST(FileNameTest, Parse) {
|
|||
{ "100.log", 100, kLogFile },
|
||||
{ "0.log", 0, kLogFile },
|
||||
{ "0.sst", 0, kTableFile },
|
||||
{ "0.ldb", 0, kTableFile },
|
||||
{ "CURRENT", 0, kCurrentFile },
|
||||
{ "LOCK", 0, kDBLockFile },
|
||||
{ "MANIFEST-2", 2, kDescriptorFile },
|
||||
|
@ -71,13 +70,14 @@ TEST(FileNameTest, Parse) {
|
|||
for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
|
||||
std::string f = errors[i];
|
||||
ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
TEST(FileNameTest, Construction) {
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
std::string fname;
|
||||
Options options;
|
||||
|
||||
fname = CurrentFileName("foo");
|
||||
ASSERT_EQ("foo/", std::string(fname.data(), 4));
|
||||
|
@ -97,12 +97,40 @@ TEST(FileNameTest, Construction) {
|
|||
ASSERT_EQ(192, number);
|
||||
ASSERT_EQ(kLogFile, type);
|
||||
|
||||
fname = TableFileName("bar", 200);
|
||||
options.tiered_fast_prefix="bar";
|
||||
options.tiered_slow_prefix="bar";
|
||||
fname = TableFileName(options, 200, 1);
|
||||
ASSERT_EQ("bar/", std::string(fname.data(), 4));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||
ASSERT_EQ("sst_1/", std::string(fname.substr(4,6)));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type));
|
||||
ASSERT_EQ(200, number);
|
||||
ASSERT_EQ(kTableFile, type);
|
||||
|
||||
fname = TableFileName(options, 400, 4);
|
||||
ASSERT_EQ("bar/", std::string(fname.data(), 4));
|
||||
ASSERT_EQ("sst_4/", std::string(fname.substr(4,6)));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type));
|
||||
ASSERT_EQ(400, number);
|
||||
ASSERT_EQ(kTableFile, type);
|
||||
|
||||
options.tiered_slow_level=4;
|
||||
options.tiered_fast_prefix="fast";
|
||||
options.tiered_slow_prefix="slow";
|
||||
fname = TableFileName(options, 500, 3);
|
||||
ASSERT_EQ("fast/", std::string(fname.data(), 5));
|
||||
ASSERT_EQ("sst_3/", std::string(fname.substr(5,6)));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type));
|
||||
ASSERT_EQ(500, number);
|
||||
ASSERT_EQ(kTableFile, type);
|
||||
|
||||
fname = TableFileName(options, 600, 4);
|
||||
ASSERT_EQ("slow/", std::string(fname.data(), 5));
|
||||
ASSERT_EQ("sst_4/", std::string(fname.substr(5,6)));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type));
|
||||
ASSERT_EQ(600, number);
|
||||
ASSERT_EQ(kTableFile, type);
|
||||
|
||||
|
||||
fname = DescriptorFileName("bar", 100);
|
||||
ASSERT_EQ("bar/", std::string(fname.data(), 4));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||
|
@ -114,6 +142,48 @@ TEST(FileNameTest, Construction) {
|
|||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||
ASSERT_EQ(999, number);
|
||||
ASSERT_EQ(kTempFile, type);
|
||||
|
||||
fname = CowFileName("/what/goes/moo");
|
||||
ASSERT_EQ("/what/goes/moo/COW", fname);
|
||||
|
||||
fname = BackupPath("/var/db/riak/data/leveldb/0",0);
|
||||
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup", fname);
|
||||
|
||||
fname = BackupPath("/var/db/riak/data/leveldb/0",1);
|
||||
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.1", fname);
|
||||
|
||||
fname = BackupPath("/var/db/riak/data/leveldb/0",5);
|
||||
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.5", fname);
|
||||
|
||||
options.tiered_slow_level=4;
|
||||
options.tiered_fast_prefix="fast";
|
||||
options.tiered_slow_prefix="slow";
|
||||
fname = SetBackupPaths(options,0);
|
||||
ASSERT_EQ("fast/backup", options.tiered_fast_prefix);
|
||||
ASSERT_EQ("slow/backup", options.tiered_slow_prefix);
|
||||
|
||||
options.tiered_slow_level=4;
|
||||
options.tiered_fast_prefix="fast";
|
||||
options.tiered_slow_prefix="slow";
|
||||
fname = SetBackupPaths(options,3);
|
||||
ASSERT_EQ("fast/backup.3", options.tiered_fast_prefix);
|
||||
ASSERT_EQ("slow/backup.3", options.tiered_slow_prefix);
|
||||
|
||||
|
||||
options.tiered_slow_level=4;
|
||||
options.tiered_fast_prefix="//mnt/fast";
|
||||
options.tiered_slow_prefix="//mnt/slow";
|
||||
fname=MakeTieredDbname("riak/data/leveldb", options);
|
||||
ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname);
|
||||
ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix);
|
||||
ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix);
|
||||
|
||||
// special case with no dbname given, should have no changes
|
||||
fname=MakeTieredDbname("", options);
|
||||
ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname);
|
||||
ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix);
|
||||
ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix);
|
||||
|
||||
}
|
||||
|
||||
} // namespace leveldb
@ -1,65 +0,0 @@
|
|||
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <stdio.h>
|
||||
#include "leveldb/dumpfile.h"
|
||||
#include "leveldb/env.h"
|
||||
#include "leveldb/status.h"
|
||||
|
||||
namespace leveldb {
|
||||
namespace {
|
||||
|
||||
class StdoutPrinter : public WritableFile {
|
||||
public:
|
||||
virtual Status Append(const Slice& data) {
|
||||
fwrite(data.data(), 1, data.size(), stdout);
|
||||
return Status::OK();
|
||||
}
|
||||
virtual Status Close() { return Status::OK(); }
|
||||
virtual Status Flush() { return Status::OK(); }
|
||||
virtual Status Sync() { return Status::OK(); }
|
||||
virtual std::string GetName() const { return "[stdout]"; }
|
||||
};
|
||||
|
||||
bool HandleDumpCommand(Env* env, char** files, int num) {
|
||||
StdoutPrinter printer;
|
||||
bool ok = true;
|
||||
for (int i = 0; i < num; i++) {
|
||||
Status s = DumpFile(env, files[i], &printer);
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "%s\n", s.ToString().c_str());
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace leveldb
|
||||
|
||||
static void Usage() {
|
||||
fprintf(
|
||||
stderr,
|
||||
"Usage: leveldbutil command...\n"
|
||||
" dump files... -- dump contents of specified files\n"
|
||||
);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
leveldb::Env* env = leveldb::Env::Default();
|
||||
bool ok = true;
|
||||
if (argc < 2) {
|
||||
Usage();
|
||||
ok = false;
|
||||
} else {
|
||||
std::string command = argv[1];
|
||||
if (command == "dump") {
|
||||
ok = leveldb::HandleDumpCommand(env, argv+2, argc-2);
|
||||
} else {
|
||||
Usage();
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
return (ok ? 0 : 1);
|
||||
}
|
|
@@ -3,7 +3,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Log format information shared by reader and writer.
// See ../doc/log_format.md for more detail.
// See ../doc/log_format.txt for more detail.

#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
@@ -26,8 +26,8 @@ static const int kMaxRecordType = kLastType;

static const int kBlockSize = 32768;

// Header is checksum (4 bytes), length (2 bytes), type (1 byte).
static const int kHeaderSize = 4 + 2 + 1;
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
static const int kHeaderSize = 4 + 1 + 2;

} // namespace log
} // namespace leveldb
|
||||
|
|
|
@ -25,8 +25,7 @@ Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
|
|||
eof_(false),
|
||||
last_record_offset_(0),
|
||||
end_of_buffer_offset_(0),
|
||||
initial_offset_(initial_offset),
|
||||
resyncing_(initial_offset > 0) {
|
||||
initial_offset_(initial_offset) {
|
||||
}
|
||||
|
||||
Reader::~Reader() {
|
||||
|
@ -73,25 +72,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
|
|||
|
||||
Slice fragment;
|
||||
while (true) {
|
||||
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
|
||||
const unsigned int record_type = ReadPhysicalRecord(&fragment);
|
||||
|
||||
// ReadPhysicalRecord may have only had an empty trailer remaining in its
|
||||
// internal buffer. Calculate the offset of the next physical record now
|
||||
// that it has returned, properly accounting for its header size.
|
||||
uint64_t physical_record_offset =
|
||||
end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size();
|
||||
|
||||
if (resyncing_) {
|
||||
if (record_type == kMiddleType) {
|
||||
continue;
|
||||
} else if (record_type == kLastType) {
|
||||
resyncing_ = false;
|
||||
continue;
|
||||
} else {
|
||||
resyncing_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
switch (record_type) {
|
||||
case kFullType:
|
||||
if (in_fragmented_record) {
|
||||
|
@ -151,9 +133,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
|
|||
|
||||
case kEof:
|
||||
if (in_fragmented_record) {
|
||||
// This can be caused by the writer dying immediately after
|
||||
// writing a physical record but before completing the next; don't
|
||||
// treat it as a corruption, just ignore the entire logical record.
|
||||
ReportCorruption(scratch->size(), "partial record without end(3)");
|
||||
scratch->clear();
|
||||
}
|
||||
return false;
|
||||
|
@ -185,20 +165,20 @@ uint64_t Reader::LastRecordOffset() {
|
|||
return last_record_offset_;
|
||||
}
|
||||
|
||||
void Reader::ReportCorruption(uint64_t bytes, const char* reason) {
|
||||
ReportDrop(bytes, Status::Corruption(reason, file_->GetName()));
|
||||
void Reader::ReportCorruption(size_t bytes, const char* reason) {
|
||||
ReportDrop(bytes, Status::Corruption(reason));
|
||||
}
|
||||
|
||||
void Reader::ReportDrop(uint64_t bytes, const Status& reason) {
|
||||
void Reader::ReportDrop(size_t bytes, const Status& reason) {
|
||||
if (reporter_ != NULL &&
|
||||
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
|
||||
reporter_->Corruption(static_cast<size_t>(bytes), reason);
|
||||
reporter_->Corruption(bytes, reason);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
||||
while (true) {
|
||||
if (buffer_.size() < kHeaderSize) {
|
||||
if (buffer_.size() < (size_t)kHeaderSize) {
|
||||
if (!eof_) {
|
||||
// Last read was a full read, so this is a trailer to skip
|
||||
buffer_.clear();
|
||||
|
@ -209,16 +189,17 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
|||
ReportDrop(kBlockSize, status);
|
||||
eof_ = true;
|
||||
return kEof;
|
||||
} else if (buffer_.size() < kBlockSize) {
|
||||
} else if (buffer_.size() < (size_t)kBlockSize) {
|
||||
eof_ = true;
|
||||
}
|
||||
continue;
|
||||
} else if (buffer_.size() == 0) {
|
||||
// End of file
|
||||
return kEof;
|
||||
} else {
|
||||
// Note that if buffer_ is non-empty, we have a truncated header at the
|
||||
// end of the file, which can be caused by the writer crashing in the
|
||||
// middle of writing the header. Instead of considering this an error,
|
||||
// just report EOF.
|
||||
size_t drop_size = buffer_.size();
|
||||
buffer_.clear();
|
||||
ReportCorruption(drop_size, "truncated record at end of file");
|
||||
return kEof;
|
||||
}
|
||||
}
|
||||
|
@ -232,15 +213,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
|||
if (kHeaderSize + length > buffer_.size()) {
|
||||
size_t drop_size = buffer_.size();
|
||||
buffer_.clear();
|
||||
if (!eof_) {
|
||||
ReportCorruption(drop_size, "bad record length");
|
||||
return kBadRecord;
|
||||
}
|
||||
// If the end of the file has been reached without reading |length| bytes
|
||||
// of payload, assume the writer died in the middle of writing the record.
|
||||
// Don't report a corruption.
|
||||
return kEof;
|
||||
}
|
||||
|
||||
if (type == kZeroType && length == 0) {
|
||||
// Skip zero length record without reporting any drops since
|
||||
|
|
|
@@ -73,11 +73,6 @@ class Reader {
// Offset at which to start looking for the first record to return
uint64_t const initial_offset_;

// True if we are resynchronizing after a seek (initial_offset_ > 0). In
// particular, a run of kMiddleType and kLastType records can be silently
// skipped in this mode
bool resyncing_;

// Extend record types with the following special values
enum {
kEof = kMaxRecordType + 1,
@@ -99,8 +94,8 @@ class Reader {

// Reports dropped bytes to the reporter.
// buffer_ must be updated to remove the dropped bytes prior to invocation.
void ReportCorruption(uint64_t bytes, const char* reason);
void ReportDrop(uint64_t bytes, const Status& reason);
void ReportCorruption(size_t bytes, const char* reason);
void ReportDrop(size_t bytes, const Status& reason);

// No copying allowed
Reader(const Reader&);
||||
|
|
|
@ -79,7 +79,7 @@ class LogTest {
|
|||
virtual Status Skip(uint64_t n) {
|
||||
if (n > contents_.size()) {
|
||||
contents_.clear();
|
||||
return Status::NotFound("in-memory file skipped past end");
|
||||
return Status::NotFound("in-memory file skipepd past end");
|
||||
}
|
||||
|
||||
contents_.remove_prefix(n);
|
||||
|
@ -104,34 +104,23 @@ class LogTest {
|
|||
StringSource source_;
|
||||
ReportCollector report_;
|
||||
bool reading_;
|
||||
Writer* writer_;
|
||||
Reader* reader_;
|
||||
Writer writer_;
|
||||
Reader reader_;
|
||||
|
||||
// Record metadata for testing initial offset functionality
|
||||
static size_t initial_offset_record_sizes_[];
|
||||
static uint64_t initial_offset_last_record_offsets_[];
|
||||
static int num_initial_offset_records_;
|
||||
|
||||
public:
|
||||
LogTest() : reading_(false),
|
||||
writer_(new Writer(&dest_)),
|
||||
reader_(new Reader(&source_, &report_, true/*checksum*/,
|
||||
0/*initial_offset*/)) {
|
||||
}
|
||||
|
||||
~LogTest() {
|
||||
delete writer_;
|
||||
delete reader_;
|
||||
}
|
||||
|
||||
void ReopenForAppend() {
|
||||
delete writer_;
|
||||
writer_ = new Writer(&dest_, dest_.contents_.size());
|
||||
writer_(&dest_),
|
||||
reader_(&source_, &report_, true/*checksum*/,
|
||||
0/*initial_offset*/) {
|
||||
}
|
||||
|
||||
void Write(const std::string& msg) {
|
||||
ASSERT_TRUE(!reading_) << "Write() after starting to read";
|
||||
writer_->AddRecord(Slice(msg));
|
||||
writer_.AddRecord(Slice(msg));
|
||||
}
|
||||
|
||||
size_t WrittenBytes() const {
|
||||
|
@ -145,7 +134,7 @@ class LogTest {
|
|||
}
|
||||
std::string scratch;
|
||||
Slice record;
|
||||
if (reader_->ReadRecord(&record, &scratch)) {
|
||||
if (reader_.ReadRecord(&record, &scratch)) {
|
||||
return record.ToString();
|
||||
} else {
|
||||
return "EOF";
|
||||
|
@ -193,18 +182,13 @@ class LogTest {
|
|||
}
|
||||
|
||||
void WriteInitialOffsetLog() {
|
||||
for (int i = 0; i < num_initial_offset_records_; i++) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
std::string record(initial_offset_record_sizes_[i],
|
||||
static_cast<char>('a' + i));
|
||||
Write(record);
|
||||
}
|
||||
}
|
||||
|
||||
void StartReadingAt(uint64_t initial_offset) {
|
||||
delete reader_;
|
||||
reader_ = new Reader(&source_, &report_, true/*checksum*/, initial_offset);
|
||||
}
|
||||
|
||||
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
|
||||
WriteInitialOffsetLog();
|
||||
reading_ = true;
|
||||
|
@ -224,11 +208,6 @@ class LogTest {
|
|||
source_.contents_ = Slice(dest_.contents_);
|
||||
Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
|
||||
initial_offset);
|
||||
|
||||
// Read all records from expected_record_offset through the last one.
|
||||
ASSERT_LT(expected_record_offset, num_initial_offset_records_);
|
||||
for (; expected_record_offset < num_initial_offset_records_;
|
||||
++expected_record_offset) {
|
||||
Slice record;
|
||||
std::string scratch;
|
||||
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
|
||||
|
@ -237,35 +216,24 @@ class LogTest {
|
|||
ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
|
||||
offset_reader->LastRecordOffset());
|
||||
ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
|
||||
}
|
||||
delete offset_reader;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
size_t LogTest::initial_offset_record_sizes_[] =
|
||||
{10000, // Two sizable records in first block
|
||||
10000,
|
||||
2 * log::kBlockSize - 1000, // Span three blocks
|
||||
1,
|
||||
13716, // Consume all but two bytes of block 3.
|
||||
log::kBlockSize - kHeaderSize, // Consume the entirety of block 4.
|
||||
};
|
||||
1};
|
||||
|
||||
uint64_t LogTest::initial_offset_last_record_offsets_[] =
|
||||
{0,
|
||||
kHeaderSize + 10000,
|
||||
2 * (kHeaderSize + 10000),
|
||||
2 * (kHeaderSize + 10000) +
|
||||
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
|
||||
2 * (kHeaderSize + 10000) +
|
||||
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize
|
||||
+ kHeaderSize + 1,
|
||||
3 * log::kBlockSize,
|
||||
};
|
||||
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
|
||||
|
||||
// LogTest::initial_offset_last_record_offsets_ must be defined before this.
|
||||
int LogTest::num_initial_offset_records_ =
|
||||
sizeof(LogTest::initial_offset_last_record_offsets_)/sizeof(uint64_t);
|
||||
|
||||
TEST(LogTest, Empty) {
|
||||
ASSERT_EQ("EOF", Read());
|
||||
|
@ -350,15 +318,6 @@ TEST(LogTest, AlignedEof) {
|
|||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST(LogTest, OpenForAppend) {
|
||||
Write("hello");
|
||||
ReopenForAppend();
|
||||
Write("world");
|
||||
ASSERT_EQ("hello", Read());
|
||||
ASSERT_EQ("world", Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST(LogTest, RandomRead) {
|
||||
const int N = 500;
|
||||
Random write_rnd(301);
|
||||
|
@ -392,32 +351,20 @@ TEST(LogTest, BadRecordType) {
|
|||
ASSERT_EQ("OK", MatchError("unknown record type"));
|
||||
}
|
||||
|
||||
TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
|
||||
TEST(LogTest, TruncatedTrailingRecord) {
|
||||
Write("foo");
|
||||
ShrinkSize(4); // Drop all payload as well as a header byte
|
||||
ASSERT_EQ("EOF", Read());
|
||||
// Truncated last record is ignored, not treated as an error.
|
||||
ASSERT_EQ(0, DroppedBytes());
|
||||
ASSERT_EQ("", ReportMessage());
|
||||
ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("truncated record at end of file"));
|
||||
}
|
||||
|
||||
TEST(LogTest, BadLength) {
|
||||
const int kPayloadSize = kBlockSize - kHeaderSize;
|
||||
Write(BigString("bar", kPayloadSize));
|
||||
Write("foo");
|
||||
// Least significant size byte is stored in header[4].
|
||||
IncrementByte(4, 1);
|
||||
ASSERT_EQ("foo", Read());
|
||||
ASSERT_EQ(kBlockSize, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("bad record length"));
|
||||
}
|
||||
|
||||
TEST(LogTest, BadLengthAtEndIsIgnored) {
|
||||
Write("foo");
|
||||
ShrinkSize(1);
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(0, DroppedBytes());
|
||||
ASSERT_EQ("", ReportMessage());
|
||||
ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("bad record length"));
|
||||
}
|
||||
|
||||
TEST(LogTest, ChecksumMismatch) {
|
||||
|
@ -468,40 +415,6 @@ TEST(LogTest, UnexpectedFirstType) {
|
|||
ASSERT_EQ("OK", MatchError("partial record without end"));
|
||||
}
|
||||
|
||||
TEST(LogTest, MissingLastIsIgnored) {
|
||||
Write(BigString("bar", kBlockSize));
|
||||
// Remove the LAST block, including header.
|
||||
ShrinkSize(14);
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ("", ReportMessage());
|
||||
ASSERT_EQ(0, DroppedBytes());
|
||||
}
|
||||
|
||||
TEST(LogTest, PartialLastIsIgnored) {
|
||||
Write(BigString("bar", kBlockSize));
|
||||
// Cause a bad record length in the LAST block.
|
||||
ShrinkSize(1);
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ("", ReportMessage());
|
||||
ASSERT_EQ(0, DroppedBytes());
|
||||
}
|
||||
|
||||
TEST(LogTest, SkipIntoMultiRecord) {
|
||||
// Consider a fragmented record:
|
||||
// first(R1), middle(R1), last(R1), first(R2)
|
||||
// If initial_offset points to a record after first(R1) but before first(R2)
|
||||
// incomplete fragment errors are not actual errors, and must be suppressed
|
||||
// until a new first or full record is encountered.
|
||||
Write(BigString("foo", 3*kBlockSize));
|
||||
Write("correct");
|
||||
StartReadingAt(kBlockSize);
|
||||
|
||||
ASSERT_EQ("correct", Read());
|
||||
ASSERT_EQ("", ReportMessage());
|
||||
ASSERT_EQ(0, DroppedBytes());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST(LogTest, ErrorJoinsRecords) {
|
||||
// Consider two fragmented records:
|
||||
// first(R1) last(R1) first(R2) last(R2)
|
||||
|
@ -520,7 +433,7 @@ TEST(LogTest, ErrorJoinsRecords) {
|
|||
|
||||
ASSERT_EQ("correct", Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
const size_t dropped = DroppedBytes();
|
||||
const int dropped = DroppedBytes();
|
||||
ASSERT_LE(dropped, 2*kBlockSize + 100);
|
||||
ASSERT_GE(dropped, 2*kBlockSize);
|
||||
}
|
||||
|
@ -571,10 +484,6 @@ TEST(LogTest, ReadFourthStart) {
|
|||
3);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadInitialOffsetIntoBlockPadding) {
|
||||
CheckInitialOffsetRecord(3 * log::kBlockSize - 3, 5);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadEnd) {
|
||||
CheckOffsetPastEndReturnsNoRecords(0);
|
||||
}
|
||||
|
|
|
@ -12,22 +12,13 @@
|
|||
namespace leveldb {
|
||||
namespace log {
|
||||
|
||||
static void InitTypeCrc(uint32_t* type_crc) {
|
||||
for (int i = 0; i <= kMaxRecordType; i++) {
|
||||
char t = static_cast<char>(i);
|
||||
type_crc[i] = crc32c::Value(&t, 1);
|
||||
}
|
||||
}
|
||||
|
||||
Writer::Writer(WritableFile* dest)
|
||||
: dest_(dest),
|
||||
block_offset_(0) {
|
||||
InitTypeCrc(type_crc_);
|
||||
}
|
||||
|
||||
Writer::Writer(WritableFile* dest, uint64_t dest_length)
|
||||
: dest_(dest), block_offset_(dest_length % kBlockSize) {
|
||||
InitTypeCrc(type_crc_);
|
||||
for (int i = 0; i <= kMaxRecordType; i++) {
|
||||
char t = static_cast<char>(i);
|
||||
type_crc_[i] = crc32c::Value(&t, 1);
|
||||
}
|
||||
}
|
||||
|
||||
Writer::~Writer() {
|
||||
|
@ -83,7 +74,7 @@ Status Writer::AddRecord(const Slice& slice) {
|
|||
|
||||
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
|
||||
assert(n <= 0xffff); // Must fit in two bytes
|
||||
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
|
||||
assert(block_offset_ + kHeaderSize + (int)n <= kBlockSize);
|
||||
|
||||
// Format the header
|
||||
char buf[kHeaderSize];
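The two-argument Writer constructor introduced above supports reopening an existing log for append: block_offset_ is seeded with dest_length % kBlockSize, so record framing resumes wherever the previous writer stopped inside the current 32 KiB block. A small worked sketch (kBlockSize taken from db/log_format.h; the helper itself is illustrative):

// Illustrative only: shows how the reopened Writer's starting block offset is
// derived from the existing file length, as in Writer(dest, dest_length).
#include <cstdint>
#include <cstdio>

namespace {
const uint64_t kBlockSize = 32768;  // matches db/log_format.h

uint64_t InitialBlockOffset(uint64_t dest_length) {
  // A log file is a sequence of fixed-size blocks; only the tail block can be
  // partially filled, so the resume point is the remainder modulo kBlockSize.
  return dest_length % kBlockSize;
}
}  // namespace

int main() {
  // e.g. a 70,000-byte log: two full blocks plus 4,464 bytes into block 3.
  std::printf("%llu\n",
              static_cast<unsigned long long>(InitialBlockOffset(70000)));
  return 0;
}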
|
||||
|
|
|
@ -9,11 +9,10 @@
|
|||
#include "db/log_format.h"
|
||||
#include "leveldb/slice.h"
|
||||
#include "leveldb/status.h"
|
||||
#include "leveldb/env.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
class WritableFile;
|
||||
|
||||
namespace log {
|
||||
|
||||
class Writer {
|
||||
|
@ -22,16 +21,12 @@ class Writer {
|
|||
// "*dest" must be initially empty.
|
||||
// "*dest" must remain live while this Writer is in use.
|
||||
explicit Writer(WritableFile* dest);
|
||||
|
||||
// Create a writer that will append data to "*dest".
|
||||
// "*dest" must have initial length "dest_length".
|
||||
// "*dest" must remain live while this Writer is in use.
|
||||
Writer(WritableFile* dest, uint64_t dest_length);
|
||||
|
||||
~Writer();
|
||||
|
||||
Status AddRecord(const Slice& slice);
|
||||
|
||||
void Close() {delete dest_; dest_=NULL;};
|
||||
|
||||
private:
|
||||
WritableFile* dest_;
|
||||
int block_offset_; // Current offset in block
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
#include "db/dbformat.h"
|
||||
#include "leveldb/comparator.h"
|
||||
#include "leveldb/env.h"
|
||||
#include "leveldb/expiry.h"
|
||||
#include "leveldb/iterator.h"
|
||||
#include "util/coding.h"
|
||||
|
||||
|
@ -63,6 +64,8 @@ class MemTableIterator: public Iterator {
|
|||
Slice key_slice = GetLengthPrefixedSlice(iter_.key());
|
||||
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
|
||||
}
|
||||
virtual KeyMetaData & keymetadata() const
|
||||
{MemTable::DecodeKeyMetaData(iter_.key(), keymetadata_); return(keymetadata_);};
|
||||
|
||||
virtual Status status() const { return Status::OK(); }
|
||||
|
||||
|
@ -81,7 +84,8 @@ Iterator* MemTable::NewIterator() {
|
|||
|
||||
void MemTable::Add(SequenceNumber s, ValueType type,
|
||||
const Slice& key,
|
||||
const Slice& value) {
|
||||
const Slice& value,
|
||||
const ExpiryTimeMicros & expiry) {
|
||||
// Format of an entry is concatenation of:
|
||||
// key_size : varint32 of internal_key.size()
|
||||
// key bytes : char[internal_key.size()]
|
||||
|
@ -89,7 +93,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
|
|||
// value bytes : char[value.size()]
|
||||
size_t key_size = key.size();
|
||||
size_t val_size = value.size();
|
||||
size_t internal_key_size = key_size + 8;
|
||||
size_t internal_key_size = key_size + KeySuffixSize(type);
|
||||
const size_t encoded_len =
|
||||
VarintLength(internal_key_size) + internal_key_size +
|
||||
VarintLength(val_size) + val_size;
|
||||
|
@@ -97,15 +101,22 @@ void MemTable::Add(SequenceNumber s, ValueType type,
char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size);
p += key_size;
if (IsExpiryKey(type))
{
EncodeFixed64(p, expiry);
p+=8;
}
EncodeFixed64(p, (s << 8) | type);
p += 8;
p = EncodeVarint32(p, val_size);
memcpy(p, value.data(), val_size);
assert(p + val_size == buf + encoded_len);
assert((size_t)((p + val_size) - buf) == encoded_len);
table_.Insert(buf);
}
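The hunk above inserts an optional 8-byte expiry stamp between the user key and the tag, and widens internal_key_size via KeySuffixSize(type) accordingly. A worked sketch of the buffer-size arithmetic (varint_len() is a stand-in for VarintLength() in util/coding.h; the numbers are an illustrative example, not values from this diff):

// Illustrative size calculation for one memtable entry, mirroring the
// encoded_len expression in MemTable::Add().
#include <cstddef>
#include <cstdio>

static size_t varint_len(size_t v) {
  size_t n = 1;
  while (v >= 128) { v >>= 7; ++n; }
  return n;
}

int main() {
  const size_t key_size = 20;    // user key bytes
  const size_t suffix = 16;      // 8-byte expiry + 8-byte tag (expiry-typed key)
  const size_t val_size = 300;   // value bytes
  const size_t internal_key_size = key_size + suffix;                    // 36
  const size_t encoded_len = varint_len(internal_key_size) + internal_key_size +
                             varint_len(val_size) + val_size;            // 1 + 36 + 2 + 300 = 339
  std::printf("encoded_len = %zu\n", encoded_len);
  return 0;
}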
|
||||
|
||||
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
|
||||
bool MemTable::Get(const LookupKey& key, Value* value, Status* s,
|
||||
const Options * options) {
|
||||
bool ret_flag(false);
|
||||
Slice memkey = key.memtable_key();
|
||||
Table::Iterator iter(&table_);
|
||||
iter.Seek(memkey.data());
|
||||
|
@ -113,6 +124,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
|
|||
// entry format is:
|
||||
// klength varint32
|
||||
// userkey char[klength]
|
||||
// optional uint64
|
||||
// tag uint64
|
||||
// vlength varint32
|
||||
// value char[vlength]
|
||||
|
@ -122,24 +134,66 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
|
|||
const char* entry = iter.key();
|
||||
uint32_t key_length;
|
||||
const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length);
|
||||
Slice internal_key(key_ptr, key_length);
|
||||
if (comparator_.comparator.user_comparator()->Compare(
|
||||
Slice(key_ptr, key_length - 8),
|
||||
ExtractUserKey(internal_key),
|
||||
key.user_key()) == 0) {
|
||||
// Correct user key
|
||||
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
||||
switch (static_cast<ValueType>(tag & 0xff)) {
|
||||
case kTypeValue: {
|
||||
KeyMetaData meta;
|
||||
DecodeKeyMetaData(entry, meta);
|
||||
|
||||
switch (meta.m_Type) {
|
||||
case kTypeValueWriteTime:
|
||||
case kTypeValueExplicitExpiry:
|
||||
{
|
||||
bool expired=false;
|
||||
if (NULL!=options && options->ExpiryActivated())
|
||||
expired=options->expiry_module->MemTableCallback(internal_key);
|
||||
if (expired)
|
||||
{
|
||||
// like kTypeDeletion
|
||||
*s = Status::NotFound(Slice());
|
||||
ret_flag=true;
|
||||
break;
|
||||
} // if
|
||||
//otherwise fall into kTypeValue code
|
||||
} // case
|
||||
|
||||
case kTypeValue:
|
||||
{
|
||||
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
||||
value->assign(v.data(), v.size());
|
||||
return true;
|
||||
ret_flag=true;
|
||||
break;
|
||||
}
|
||||
case kTypeDeletion:
|
||||
*s = Status::NotFound(Slice());
|
||||
return true;
|
||||
ret_flag=true;
|
||||
break;
|
||||
} // switch
|
||||
|
||||
// only unpack metadata if requested
|
||||
if (key.WantsKeyMetaData())
|
||||
key.SetKeyMetaData(meta);
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
return ret_flag;
|
||||
}
|
||||
|
||||
// this is a static function
|
||||
void MemTable::DecodeKeyMetaData(
|
||||
const char * key,
|
||||
KeyMetaData & meta)
|
||||
{
|
||||
Slice key_slice = GetLengthPrefixedSlice(key);
|
||||
|
||||
meta.m_Type=ExtractValueType(key_slice);
|
||||
meta.m_Sequence=ExtractSequenceNumber(key_slice);
|
||||
if (IsExpiryKey(meta.m_Type))
|
||||
meta.m_Expiry=ExtractExpiry(key_slice);
|
||||
else
|
||||
meta.m_Expiry=0;
|
||||
|
||||
} // DecodeKeyMetaData
|
||||
|
||||
} // namespace leveldb
|
||||
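DecodeKeyMetaData() above reads back the layout written by MemTable::Add(): user key bytes, an optional 8-byte expiry, then the 8-byte tag holding (sequence << 8) | type. A hedged sketch of that decode; KeyMeta, LoadFixed64 and DecodeMeta are illustrative stand-ins, not this fork's real helpers:

// Sketch of what the metadata decode amounts to, assuming the entry layout
// shown in the Add() hunk and a little-endian host (the case DecodeFixed64
// optimizes for).
#include <cstdint>
#include <cstring>
#include <string>

struct KeyMeta {
  uint8_t type;
  uint64_t sequence;
  uint64_t expiry;  // 0 when the key carries no expiry field
};

inline uint64_t LoadFixed64(const char* p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}

// `internal_key` excludes the varint32 length prefix stored in the skiplist.
KeyMeta DecodeMeta(const std::string& internal_key, bool has_expiry) {
  KeyMeta meta{};
  const char* end = internal_key.data() + internal_key.size();
  const uint64_t tag = LoadFixed64(end - 8);
  meta.type = static_cast<uint8_t>(tag & 0xff);
  meta.sequence = tag >> 8;
  meta.expiry = has_expiry ? LoadFixed64(end - 16) : 0;
  return meta;
}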
|
|
|
@ -24,10 +24,10 @@ class MemTable {
|
|||
explicit MemTable(const InternalKeyComparator& comparator);
|
||||
|
||||
// Increase reference count.
|
||||
void Ref() { ++refs_; }
|
||||
void Ref() volatile { ++refs_; }
|
||||
|
||||
// Drop reference count. Delete if no more references exist.
|
||||
void Unref() {
|
||||
void Unref() volatile {
|
||||
--refs_;
|
||||
assert(refs_ >= 0);
|
||||
if (refs_ <= 0) {
|
||||
|
@ -36,7 +36,10 @@ class MemTable {
|
|||
}
|
||||
|
||||
// Returns an estimate of the number of bytes of data in use by this
|
||||
// data structure. It is safe to call when MemTable is being modified.
|
||||
// data structure.
|
||||
//
|
||||
// REQUIRES: external synchronization to prevent simultaneous
|
||||
// operations on the same MemTable.
|
||||
size_t ApproximateMemoryUsage();
|
||||
|
||||
// Return an iterator that yields the contents of the memtable.
|
||||
|
@ -52,13 +55,17 @@ class MemTable {
|
|||
// Typically value will be empty if type==kTypeDeletion.
|
||||
void Add(SequenceNumber seq, ValueType type,
|
||||
const Slice& key,
|
||||
const Slice& value);
|
||||
const Slice& value,
|
||||
const ExpiryTimeMicros& expiry=0);
|
||||
|
||||
// If memtable contains a value for key, store it in *value and return true.
|
||||
// If memtable contains a deletion for key, store a NotFound() error
|
||||
// in *status and return true.
|
||||
// Else, return false.
|
||||
bool Get(const LookupKey& key, std::string* value, Status* s);
|
||||
bool Get(const LookupKey& key, Value* value, Status* s, const Options * options);
|
||||
|
||||
// parse keymetadata from skiplist key string
|
||||
static void DecodeKeyMetaData(const char * key, KeyMetaData & meta);
|
||||
|
||||
private:
|
||||
~MemTable(); // Private since only Unref() should be used to delete it
|
||||
|
@ -69,7 +76,7 @@ class MemTable {
|
|||
int operator()(const char* a, const char* b) const;
|
||||
};
|
||||
friend class MemTableIterator;
|
||||
friend class MemTableBackwardIterator;
|
||||
friend class MemTableBackwardIterator; // does not exist
|
||||
|
||||
typedef SkipList<const char*, KeyComparator> Table;
|
||||
|
||||
|
|
248 src/leveldb/db/penalty_test.cc Normal file
|
@ -0,0 +1,248 @@
|
|||
// -------------------------------------------------------------------
|
||||
//
|
||||
// penalty_test.cc
|
||||
//
|
||||
// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
|
||||
//
|
||||
// This file is provided to you under the Apache License,
|
||||
// Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain
|
||||
// a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
//
|
||||
// -------------------------------------------------------------------
|
||||
|
||||
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
|
||||
#include "leveldb/comparator.h"
|
||||
|
||||
#include "db/version_set.h"
|
||||
|
||||
/**
|
||||
* Execution routine
|
||||
*/
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
return leveldb::test::RunAllTests();
|
||||
}
|
||||
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
class TestVersion : public Version
|
||||
{
|
||||
public:
|
||||
TestVersion()
|
||||
: Version(NULL)
|
||||
{
|
||||
int loop;
|
||||
|
||||
for (loop=0; loop<config::kNumLevels; ++loop)
|
||||
{
|
||||
m_FalseFile[loop].file_size=0;
|
||||
m_LevelFileCount[loop]=0;
|
||||
} // for
|
||||
};
|
||||
|
||||
virtual size_t NumFiles(int level) const {return(m_LevelFileCount[level]);};
|
||||
|
||||
virtual const std::vector<FileMetaData*> & GetFileList(int level) const
|
||||
{
|
||||
m_FalseVector.clear();
|
||||
m_FalseVector.push_back(&m_FalseFile[level]);
|
||||
return(m_FalseVector);
|
||||
};
|
||||
|
||||
mutable std::vector<FileMetaData*> m_FalseVector;
|
||||
mutable FileMetaData m_FalseFile[config::kNumLevels];
|
||||
|
||||
size_t m_LevelFileCount[config::kNumLevels];
|
||||
|
||||
}; // class TestVersion
|
||||
|
||||
/**
|
||||
* Wrapper class for tests. Holds working variables
|
||||
* and helper functions.
|
||||
*/
|
||||
class PenaltyTester : public VersionSet
|
||||
{
|
||||
public:
|
||||
PenaltyTester()
|
||||
: m_IntCompare(m_Options.comparator), VersionSet("", &m_Options, NULL, &m_IntCompare)
|
||||
{
|
||||
};
|
||||
|
||||
~PenaltyTester()
|
||||
{
|
||||
};
|
||||
|
||||
Options m_Options;
|
||||
InternalKeyComparator m_IntCompare;
|
||||
|
||||
}; // class PenaltyTester
|
||||
|
||||
|
||||
/*******************
|
||||
* Form note:
|
||||
* using ASSERT_TRUE(0==version.WritePenalty());
|
||||
* instead of ASSERT_EQ / ASSERT_NE because WritePenalty
|
||||
* returns a volatile int, which older compilers believe is
|
||||
* not an equivalent type to a constant. RedHat 5, Solaris,
|
||||
* and SmartOS were giving grief.
|
||||
*******************/
|
||||
|
||||
/**
|
||||
* Debug 1
|
||||
*/
|
||||
#if 0
|
||||
TEST(PenaltyTester, Debug1)
|
||||
{
|
||||
TestVersion version;
|
||||
int penalty;
|
||||
|
||||
m_Options.write_buffer_size=46416847;
|
||||
|
||||
version.m_FalseFile[2].file_size=1075676398;
|
||||
version.m_LevelFileCount[1]=1;
|
||||
|
||||
UpdatePenalty(&version);
|
||||
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
} // test Debug1
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* No penalty scenarios
|
||||
*/
|
||||
TEST(PenaltyTester, NoPenalty)
|
||||
{
|
||||
TestVersion version;
|
||||
int level;
|
||||
|
||||
m_Options.write_buffer_size=46416847;
|
||||
|
||||
// nothing
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
/**
|
||||
* Level 0
|
||||
* (overlapped level, penalty is count based)
|
||||
*/
|
||||
// no penalty
|
||||
version.m_LevelFileCount[0]=config::kL0_CompactionTrigger;
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger;
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
#if 0 // needs rewrite to be time based
|
||||
// threshold reached ... some penalty
|
||||
version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger+1;
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0!=version.WritePenalty());
|
||||
|
||||
// clean up
|
||||
version.m_LevelFileCount[0]=0;
|
||||
|
||||
/**
|
||||
* Level 1
|
||||
* (overlapped level, penalty is count based)
|
||||
*/
|
||||
// no penalty
|
||||
version.m_LevelFileCount[1]=config::kL0_CompactionTrigger;
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger;
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
// threshold reached ... some penalty
|
||||
version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger+1;
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0!=version.WritePenalty());
|
||||
|
||||
// clean up
|
||||
version.m_LevelFileCount[1]=0;
|
||||
|
||||
/**
|
||||
* Level 2
|
||||
* (landing level, penalty size based)
|
||||
*/
|
||||
// no penalty
|
||||
version.m_FalseFile[2].file_size=0;
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
version.m_FalseFile[2].file_size=VersionSet::DesiredBytesForLevel(2);
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1;
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2);
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0!=version.WritePenalty());
|
||||
|
||||
// interaction rule with level 1
|
||||
version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1;
|
||||
version.m_LevelFileCount[1]=config::kL0_CompactionTrigger/2;
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0!=version.WritePenalty());
|
||||
|
||||
// clean up
|
||||
version.m_LevelFileCount[1]=0;
|
||||
version.m_FalseFile[2].file_size=0;
|
||||
|
||||
/**
|
||||
* Level 3+
|
||||
* (landing level, penalty size based)
|
||||
*/
|
||||
for (level=3; level<config::kNumLevels; ++level)
|
||||
{
|
||||
// no penalty
|
||||
version.m_FalseFile[level].file_size=0;
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
version.m_FalseFile[level].file_size=VersionSet::DesiredBytesForLevel(level);
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
version.m_FalseFile[level].file_size=VersionSet::MaxBytesForLevel(level)-1;
|
||||
UpdatePenalty(&version);
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
version.m_FalseFile[level].file_size=VersionSet::MaxBytesForLevel(level);
|
||||
UpdatePenalty(&version);
|
||||
if ((config::kNumLevels-1)!=level)
|
||||
ASSERT_TRUE(0!=version.WritePenalty());
|
||||
else
|
||||
ASSERT_TRUE(0==version.WritePenalty());
|
||||
|
||||
// clean up
|
||||
version.m_FalseFile[level].file_size=0;
|
||||
} // for
|
||||
#endif
|
||||
} // test NoPenalty
|
||||
|
||||
|
||||
|
||||
} // namespace leveldb
|
|
@ -1,324 +0,0 @@
|
|||
// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/db_impl.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/version_set.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "leveldb/db.h"
|
||||
#include "leveldb/env.h"
|
||||
#include "leveldb/write_batch.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
class RecoveryTest {
|
||||
public:
|
||||
RecoveryTest() : env_(Env::Default()), db_(NULL) {
|
||||
dbname_ = test::TmpDir() + "/recovery_test";
|
||||
DestroyDB(dbname_, Options());
|
||||
Open();
|
||||
}
|
||||
|
||||
~RecoveryTest() {
|
||||
Close();
|
||||
DestroyDB(dbname_, Options());
|
||||
}
|
||||
|
||||
DBImpl* dbfull() const { return reinterpret_cast<DBImpl*>(db_); }
|
||||
Env* env() const { return env_; }
|
||||
|
||||
bool CanAppend() {
|
||||
WritableFile* tmp;
|
||||
Status s = env_->NewAppendableFile(CurrentFileName(dbname_), &tmp);
|
||||
delete tmp;
|
||||
if (s.IsNotSupportedError()) {
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
void Close() {
|
||||
delete db_;
|
||||
db_ = NULL;
|
||||
}
|
||||
|
||||
void Open(Options* options = NULL) {
|
||||
Close();
|
||||
Options opts;
|
||||
if (options != NULL) {
|
||||
opts = *options;
|
||||
} else {
|
||||
opts.reuse_logs = true; // TODO(sanjay): test both ways
|
||||
opts.create_if_missing = true;
|
||||
}
|
||||
if (opts.env == NULL) {
|
||||
opts.env = env_;
|
||||
}
|
||||
ASSERT_OK(DB::Open(opts, dbname_, &db_));
|
||||
ASSERT_EQ(1, NumLogs());
|
||||
}
|
||||
|
||||
Status Put(const std::string& k, const std::string& v) {
|
||||
return db_->Put(WriteOptions(), k, v);
|
||||
}
|
||||
|
||||
std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
|
||||
std::string result;
|
||||
Status s = db_->Get(ReadOptions(), k, &result);
|
||||
if (s.IsNotFound()) {
|
||||
result = "NOT_FOUND";
|
||||
} else if (!s.ok()) {
|
||||
result = s.ToString();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string ManifestFileName() {
|
||||
std::string current;
|
||||
ASSERT_OK(ReadFileToString(env_, CurrentFileName(dbname_), &current));
|
||||
size_t len = current.size();
|
||||
if (len > 0 && current[len-1] == '\n') {
|
||||
current.resize(len - 1);
|
||||
}
|
||||
return dbname_ + "/" + current;
|
||||
}
|
||||
|
||||
std::string LogName(uint64_t number) {
|
||||
return LogFileName(dbname_, number);
|
||||
}
|
||||
|
||||
size_t DeleteLogFiles() {
|
||||
std::vector<uint64_t> logs = GetFiles(kLogFile);
|
||||
for (size_t i = 0; i < logs.size(); i++) {
|
||||
ASSERT_OK(env_->DeleteFile(LogName(logs[i]))) << LogName(logs[i]);
|
||||
}
|
||||
return logs.size();
|
||||
}
|
||||
|
||||
uint64_t FirstLogFile() {
|
||||
return GetFiles(kLogFile)[0];
|
||||
}
|
||||
|
||||
std::vector<uint64_t> GetFiles(FileType t) {
|
||||
std::vector<std::string> filenames;
|
||||
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
|
||||
std::vector<uint64_t> result;
|
||||
for (size_t i = 0; i < filenames.size(); i++) {
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
if (ParseFileName(filenames[i], &number, &type) && type == t) {
|
||||
result.push_back(number);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int NumLogs() {
|
||||
return GetFiles(kLogFile).size();
|
||||
}
|
||||
|
||||
int NumTables() {
|
||||
return GetFiles(kTableFile).size();
|
||||
}
|
||||
|
||||
uint64_t FileSize(const std::string& fname) {
|
||||
uint64_t result;
|
||||
ASSERT_OK(env_->GetFileSize(fname, &result)) << fname;
|
||||
return result;
|
||||
}
|
||||
|
||||
void CompactMemTable() {
|
||||
dbfull()->TEST_CompactMemTable();
|
||||
}
|
||||
|
||||
// Directly construct a log file that sets key to val.
|
||||
void MakeLogFile(uint64_t lognum, SequenceNumber seq, Slice key, Slice val) {
|
||||
std::string fname = LogFileName(dbname_, lognum);
|
||||
WritableFile* file;
|
||||
ASSERT_OK(env_->NewWritableFile(fname, &file));
|
||||
log::Writer writer(file);
|
||||
WriteBatch batch;
|
||||
batch.Put(key, val);
|
||||
WriteBatchInternal::SetSequence(&batch, seq);
|
||||
ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch)));
|
||||
ASSERT_OK(file->Flush());
|
||||
delete file;
|
||||
}
|
||||
|
||||
private:
|
||||
std::string dbname_;
|
||||
Env* env_;
|
||||
DB* db_;
|
||||
};
|
||||
|
||||
TEST(RecoveryTest, ManifestReused) {
|
||||
if (!CanAppend()) {
|
||||
fprintf(stderr, "skipping test because env does not support appending\n");
|
||||
return;
|
||||
}
|
||||
ASSERT_OK(Put("foo", "bar"));
|
||||
Close();
|
||||
std::string old_manifest = ManifestFileName();
|
||||
Open();
|
||||
ASSERT_EQ(old_manifest, ManifestFileName());
|
||||
ASSERT_EQ("bar", Get("foo"));
|
||||
Open();
|
||||
ASSERT_EQ(old_manifest, ManifestFileName());
|
||||
ASSERT_EQ("bar", Get("foo"));
|
||||
}
|
||||
|
||||
TEST(RecoveryTest, LargeManifestCompacted) {
|
||||
if (!CanAppend()) {
|
||||
fprintf(stderr, "skipping test because env does not support appending\n");
|
||||
return;
|
||||
}
|
||||
ASSERT_OK(Put("foo", "bar"));
|
||||
Close();
|
||||
std::string old_manifest = ManifestFileName();
|
||||
|
||||
// Pad with zeroes to make manifest file very big.
|
||||
{
|
||||
uint64_t len = FileSize(old_manifest);
|
||||
WritableFile* file;
|
||||
ASSERT_OK(env()->NewAppendableFile(old_manifest, &file));
|
||||
std::string zeroes(3*1048576 - static_cast<size_t>(len), 0);
|
||||
ASSERT_OK(file->Append(zeroes));
|
||||
ASSERT_OK(file->Flush());
|
||||
delete file;
|
||||
}
|
||||
|
||||
Open();
|
||||
std::string new_manifest = ManifestFileName();
|
||||
ASSERT_NE(old_manifest, new_manifest);
|
||||
ASSERT_GT(10000, FileSize(new_manifest));
|
||||
ASSERT_EQ("bar", Get("foo"));
|
||||
|
||||
Open();
|
||||
ASSERT_EQ(new_manifest, ManifestFileName());
|
||||
ASSERT_EQ("bar", Get("foo"));
|
||||
}
|
||||
|
||||
TEST(RecoveryTest, NoLogFiles) {
|
||||
ASSERT_OK(Put("foo", "bar"));
|
||||
ASSERT_EQ(1, DeleteLogFiles());
|
||||
Open();
|
||||
ASSERT_EQ("NOT_FOUND", Get("foo"));
|
||||
Open();
|
||||
ASSERT_EQ("NOT_FOUND", Get("foo"));
|
||||
}
|
||||
|
||||
TEST(RecoveryTest, LogFileReuse) {
|
||||
if (!CanAppend()) {
|
||||
fprintf(stderr, "skipping test because env does not support appending\n");
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < 2; i++) {
|
||||
ASSERT_OK(Put("foo", "bar"));
|
||||
if (i == 0) {
|
||||
// Compact to ensure current log is empty
|
||||
CompactMemTable();
|
||||
}
|
||||
Close();
|
||||
ASSERT_EQ(1, NumLogs());
|
||||
uint64_t number = FirstLogFile();
|
||||
if (i == 0) {
|
||||
ASSERT_EQ(0, FileSize(LogName(number)));
|
||||
} else {
|
||||
ASSERT_LT(0, FileSize(LogName(number)));
|
||||
}
|
||||
Open();
|
||||
ASSERT_EQ(1, NumLogs());
|
||||
ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
|
||||
ASSERT_EQ("bar", Get("foo"));
|
||||
Open();
|
||||
ASSERT_EQ(1, NumLogs());
|
||||
ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
|
||||
ASSERT_EQ("bar", Get("foo"));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(RecoveryTest, MultipleMemTables) {
|
||||
// Make a large log.
|
||||
const int kNum = 1000;
|
||||
for (int i = 0; i < kNum; i++) {
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "%050d", i);
|
||||
ASSERT_OK(Put(buf, buf));
|
||||
}
|
||||
ASSERT_EQ(0, NumTables());
|
||||
Close();
|
||||
ASSERT_EQ(0, NumTables());
|
||||
ASSERT_EQ(1, NumLogs());
|
||||
uint64_t old_log_file = FirstLogFile();
|
||||
|
||||
// Force creation of multiple memtables by reducing the write buffer size.
|
||||
Options opt;
|
||||
opt.reuse_logs = true;
|
||||
opt.write_buffer_size = (kNum*100) / 2;
|
||||
Open(&opt);
|
||||
ASSERT_LE(2, NumTables());
|
||||
ASSERT_EQ(1, NumLogs());
|
||||
ASSERT_NE(old_log_file, FirstLogFile()) << "must not reuse log";
|
||||
for (int i = 0; i < kNum; i++) {
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "%050d", i);
|
||||
ASSERT_EQ(buf, Get(buf));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(RecoveryTest, MultipleLogFiles) {
|
||||
ASSERT_OK(Put("foo", "bar"));
|
||||
Close();
|
||||
ASSERT_EQ(1, NumLogs());
|
||||
|
||||
// Make a bunch of uncompacted log files.
|
||||
uint64_t old_log = FirstLogFile();
|
||||
MakeLogFile(old_log+1, 1000, "hello", "world");
|
||||
MakeLogFile(old_log+2, 1001, "hi", "there");
|
||||
MakeLogFile(old_log+3, 1002, "foo", "bar2");
|
||||
|
||||
// Recover and check that all log files were processed.
|
||||
Open();
|
||||
ASSERT_LE(1, NumTables());
|
||||
ASSERT_EQ(1, NumLogs());
|
||||
uint64_t new_log = FirstLogFile();
|
||||
ASSERT_LE(old_log+3, new_log);
|
||||
ASSERT_EQ("bar2", Get("foo"));
|
||||
ASSERT_EQ("world", Get("hello"));
|
||||
ASSERT_EQ("there", Get("hi"));
|
||||
|
||||
// Test that previous recovery produced recoverable state.
|
||||
Open();
|
||||
ASSERT_LE(1, NumTables());
|
||||
ASSERT_EQ(1, NumLogs());
|
||||
if (CanAppend()) {
|
||||
ASSERT_EQ(new_log, FirstLogFile());
|
||||
}
|
||||
ASSERT_EQ("bar2", Get("foo"));
|
||||
ASSERT_EQ("world", Get("hello"));
|
||||
ASSERT_EQ("there", Get("hi"));
|
||||
|
||||
// Check that introducing an older log file does not cause it to be re-read.
|
||||
Close();
|
||||
MakeLogFile(old_log+1, 2000, "hello", "stale write");
|
||||
Open();
|
||||
ASSERT_LE(1, NumTables());
|
||||
ASSERT_EQ(1, NumLogs());
|
||||
if (CanAppend()) {
|
||||
ASSERT_EQ(new_log, FirstLogFile());
|
||||
}
|
||||
ASSERT_EQ("bar2", Get("foo"));
|
||||
ASSERT_EQ("world", Get("hello"));
|
||||
ASSERT_EQ("there", Get("hi"));
|
||||
}
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return leveldb::test::RunAllTests();
|
||||
}
|
|
@ -45,30 +45,56 @@ namespace {
|
|||
class Repairer {
|
||||
public:
|
||||
Repairer(const std::string& dbname, const Options& options)
|
||||
: dbname_(dbname),
|
||||
: double_cache_(options),
|
||||
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options, double_cache_.GetBlockCache())),
|
||||
org_options_(options),
|
||||
dbname_(options_.tiered_fast_prefix),
|
||||
org_dbname_(dbname),
|
||||
env_(options.env),
|
||||
icmp_(options.comparator),
|
||||
ipolicy_(options.filter_policy),
|
||||
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
|
||||
owns_info_log_(options_.info_log != options.info_log),
|
||||
owns_cache_(options_.block_cache != options.block_cache),
|
||||
next_file_number_(1) {
|
||||
db_lock_(NULL),
|
||||
next_file_number_(1)
|
||||
{
|
||||
// TableCache can be small since we expect each table to be opened once.
|
||||
table_cache_ = new TableCache(dbname_, &options_, 10);
|
||||
table_cache_ = new TableCache(dbname_, &options_, double_cache_.GetFileCache(), double_cache_);
|
||||
|
||||
}
|
||||
|
||||
~Repairer() {
|
||||
delete table_cache_;
|
||||
if (owns_info_log_) {
|
||||
delete options_.info_log;
|
||||
}
|
||||
if (owns_cache_) {
|
||||
delete options_.block_cache;
|
||||
}
|
||||
// if (owns_cache_) {
|
||||
// delete options_.block_cache;
|
||||
// }
|
||||
|
||||
// must remove second ref counter that keeps overlapped files locked
|
||||
// table cache
|
||||
bool is_overlap;
|
||||
for (int level = 0; level < config::kNumLevels; level++) {
|
||||
{
|
||||
is_overlap=(level < leveldb::config::kNumOverlapLevels);
|
||||
for (size_t i = 0; i < table_numbers_[level].size(); i++) {
|
||||
table_cache_->Evict(table_numbers_[level][i], is_overlap);
|
||||
} // for
|
||||
} // if
|
||||
} // for
|
||||
|
||||
delete table_cache_;
|
||||
}
|
||||
|
||||
Status Run() {
|
||||
Status status = FindFiles();
|
||||
Status status;
|
||||
|
||||
status = env_->LockFile(LockFileName(dbname_), &db_lock_);
|
||||
|
||||
if (status.ok())
|
||||
status = MakeLevelDirectories(env_, options_);
|
||||
|
||||
if (status.ok()) {
|
||||
status = FindFiles();
|
||||
if (status.ok()) {
|
||||
ConvertLogFilesToTables();
|
||||
ExtractMetaData();
|
||||
|
@ -76,18 +102,56 @@ class Repairer {
|
|||
}
|
||||
if (status.ok()) {
|
||||
unsigned long long bytes = 0;
|
||||
for (size_t i = 0; i < tables_.size(); i++) {
|
||||
bytes += tables_[i].meta.file_size;
|
||||
unsigned long long files = 0;
|
||||
|
||||
// calculate size for log information
|
||||
for (int level=0; level<config::kNumLevels;++level)
|
||||
{
|
||||
std::vector<TableInfo> * table_ptr;
|
||||
std::vector<TableInfo>::const_iterator i;
|
||||
|
||||
table_ptr=&tables_[level];
|
||||
files+=table_ptr->size();
|
||||
|
||||
for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
|
||||
bytes += i->meta.file_size;
|
||||
}
|
||||
} // for
|
||||
|
||||
Log(options_.info_log,
|
||||
"**** Repaired leveldb %s; "
|
||||
"recovered %d files; %llu bytes. "
|
||||
"Some data may have been lost. "
|
||||
"****",
|
||||
dbname_.c_str(),
|
||||
static_cast<int>(tables_.size()),
|
||||
static_cast<int>(files),
|
||||
bytes);
|
||||
}
|
||||
if (db_lock_ != NULL) {
|
||||
env_->UnlockFile(db_lock_);
|
||||
}
|
||||
}
|
||||
|
||||
// perform Riak specific scan for overlapping .sst files
|
||||
// within a level
|
||||
if (status.ok())
|
||||
{
|
||||
leveldb::DB * db_ptr;
|
||||
Options options;
|
||||
|
||||
db_ptr=NULL;
|
||||
options=org_options_;
|
||||
// options.block_cache=NULL; // not reusing for fear of edge cases
|
||||
options.is_repair=true;
|
||||
options.error_if_exists=false;
|
||||
status=leveldb::DB::Open(options, org_dbname_, &db_ptr);
|
||||
|
||||
if (status.ok())
|
||||
status=db_ptr->VerifyLevels();
|
||||
|
||||
delete db_ptr;
|
||||
|
||||
} // if
|
||||
return status;
|
||||
}
|
||||
|
||||
|
@ -97,34 +161,36 @@ class Repairer {
|
|||
SequenceNumber max_sequence;
|
||||
};
|
||||
|
||||
std::string const dbname_;
|
||||
DoubleCache double_cache_;
|
||||
Options const options_, org_options_;
|
||||
std::string const dbname_, org_dbname_;
|
||||
Env* const env_;
|
||||
InternalKeyComparator const icmp_;
|
||||
InternalFilterPolicy const ipolicy_;
|
||||
Options const options_;
|
||||
bool owns_info_log_;
|
||||
bool owns_cache_;
|
||||
FileLock* db_lock_;
|
||||
TableCache* table_cache_;
|
||||
VersionEdit edit_;
|
||||
|
||||
std::vector<std::string> manifests_;
|
||||
std::vector<uint64_t> table_numbers_;
|
||||
std::vector<uint64_t> table_numbers_[config::kNumLevels];
|
||||
std::vector<uint64_t> logs_;
|
||||
std::vector<TableInfo> tables_;
|
||||
std::vector<TableInfo> tables_[config::kNumLevels];
|
||||
uint64_t next_file_number_;
|
||||
|
||||
Status FindFiles() {
|
||||
Status FindFiles()
|
||||
{
|
||||
std::vector<std::string> filenames;
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
int level;
|
||||
|
||||
// base directory
|
||||
Status status = env_->GetChildren(dbname_, &filenames);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
if (filenames.empty()) {
|
||||
return Status::IOError(dbname_, "repair found no files");
|
||||
}
|
||||
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
for (size_t i = 0; i < filenames.size(); i++) {
|
||||
if (ParseFileName(filenames[i], &number, &type)) {
|
||||
if (type == kDescriptorFile) {
|
||||
|
@ -136,13 +202,38 @@ class Repairer {
|
|||
if (type == kLogFile) {
|
||||
logs_.push_back(number);
|
||||
} else if (type == kTableFile) {
|
||||
table_numbers_.push_back(number);
|
||||
table_numbers_[0].push_back(number);
|
||||
} else {
|
||||
// Ignore other files
|
||||
} // else
|
||||
} // else
|
||||
} // if
|
||||
} // for
|
||||
|
||||
for (level=0; level < config::kNumLevels; ++level)
|
||||
{
|
||||
std::string dirname;
|
||||
|
||||
filenames.clear();
|
||||
dirname=MakeDirName2(options_, level, "sst");
|
||||
Status status = env_->GetChildren(dirname, &filenames);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < filenames.size(); i++) {
|
||||
if (ParseFileName(filenames[i], &number, &type)) {
|
||||
if (number + 1 > next_file_number_) {
|
||||
next_file_number_ = number + 1;
|
||||
}
|
||||
|
||||
if (type == kTableFile) {
|
||||
table_numbers_[level].push_back(number);
|
||||
}
|
||||
}
|
||||
} // if
|
||||
} // for
|
||||
} // for
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
@ -186,7 +277,7 @@ class Repairer {
|
|||
reporter.env = env_;
|
||||
reporter.info_log = options_.info_log;
|
||||
reporter.lognum = log;
|
||||
// We intentionally make log::Reader do checksumming so that
|
||||
// We intentially make log::Reader do checksumming so that
|
||||
// corruptions cause entire commits to be skipped instead of
|
||||
// propagating bad information (like overly large sequence
|
||||
// numbers).
|
||||
|
@ -203,11 +294,11 @@ class Repairer {
|
|||
while (reader.ReadRecord(&record, &scratch)) {
|
||||
if (record.size() < 12) {
|
||||
reporter.Corruption(
|
||||
record.size(), Status::Corruption("log record too small", logname));
|
||||
record.size(), Status::Corruption("log record too small"));
|
||||
continue;
|
||||
}
|
||||
WriteBatchInternal::SetContents(&batch, record);
|
||||
status = WriteBatchInternal::InsertInto(&batch, mem);
|
||||
status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
|
||||
if (status.ok()) {
|
||||
counter += WriteBatchInternal::Count(&batch);
|
||||
} else {
|
||||
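The record.size() < 12 guard above reflects the WriteBatch wire format: every batch starts with a 12-byte header, an 8-byte starting sequence number followed by a 4-byte entry count, so anything shorter cannot be a valid batch. A minimal sketch of that check (the constant name is illustrative; the 8 + 4 split mirrors leveldb's write_batch.cc):

// Sketch of the fixed-header check applied when replaying log records into
// WriteBatch objects during repair.
#include <cstddef>

namespace {
const size_t kBatchHeaderSize = 8 + 4;  // sequence number + entry count

bool LooksLikeWriteBatch(size_t record_size) {
  // Records shorter than the header are reported as corruption and skipped
  // rather than being handed to WriteBatchInternal.
  return record_size >= kBatchHeaderSize;
}
}  // namespace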
|
@ -223,14 +314,15 @@ class Repairer {
|
|||
// since ExtractMetaData() will also generate edits.
|
||||
FileMetaData meta;
|
||||
meta.number = next_file_number_++;
|
||||
meta.level = 0;
|
||||
Iterator* iter = mem->NewIterator();
|
||||
status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
|
||||
status = BuildTable(dbname_, env_, options_, icmp_.user_comparator(), table_cache_, iter, &meta, 0);
|
||||
delete iter;
|
||||
mem->Unref();
|
||||
mem = NULL;
|
||||
if (status.ok()) {
|
||||
if (meta.file_size > 0) {
|
||||
table_numbers_.push_back(meta.number);
|
||||
table_numbers_[0].push_back(meta.number);
|
||||
}
|
||||
}
|
||||
Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
|
||||
|
@ -242,52 +334,48 @@ class Repairer {
|
|||
}
|
||||
|
||||
void ExtractMetaData() {
|
||||
for (size_t i = 0; i < table_numbers_.size(); i++) {
|
||||
ScanTable(table_numbers_[i]);
|
||||
}
|
||||
}
|
||||
for (int level=0; level < config::kNumLevels; ++level)
|
||||
{
|
||||
std::vector<uint64_t> * number_ptr;
|
||||
std::vector<uint64_t>::const_iterator i;
|
||||
|
||||
Iterator* NewTableIterator(const FileMetaData& meta) {
|
||||
// Same as compaction iterators: if paranoid_checks are on, turn
|
||||
// on checksum verification.
|
||||
ReadOptions r;
|
||||
r.verify_checksums = options_.paranoid_checks;
|
||||
return table_cache_->NewIterator(r, meta.number, meta.file_size);
|
||||
}
|
||||
|
||||
void ScanTable(uint64_t number) {
|
||||
number_ptr=&table_numbers_[level];
|
||||
for (i = number_ptr->begin(); number_ptr->end()!= i; ++i) {
|
||||
TableInfo t;
|
||||
t.meta.number = number;
|
||||
std::string fname = TableFileName(dbname_, number);
|
||||
Status status = env_->GetFileSize(fname, &t.meta.file_size);
|
||||
if (!status.ok()) {
|
||||
// Try alternate file name.
|
||||
fname = SSTTableFileName(dbname_, number);
|
||||
Status s2 = env_->GetFileSize(fname, &t.meta.file_size);
|
||||
if (s2.ok()) {
|
||||
status = Status::OK();
|
||||
}
|
||||
}
|
||||
if (!status.ok()) {
|
||||
ArchiveFile(TableFileName(dbname_, number));
|
||||
ArchiveFile(SSTTableFileName(dbname_, number));
|
||||
Log(options_.info_log, "Table #%llu: dropped: %s",
|
||||
t.meta.number = *i;
|
||||
t.meta.level = level;
|
||||
Status status = ScanTable(&t);
|
||||
if (!status.ok())
|
||||
{
|
||||
std::string fname = TableFileName(options_, t.meta.number, t.meta.level);
|
||||
Log(options_.info_log, "Table #%llu: ignoring %s",
|
||||
(unsigned long long) t.meta.number,
|
||||
status.ToString().c_str());
|
||||
return;
|
||||
ArchiveFile(fname, true);
|
||||
} else {
|
||||
tables_[level].push_back(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Extract metadata by scanning through table.
|
||||
Status ScanTable(TableInfo* t) {
|
||||
Table * table_ptr;
|
||||
SstCounters counters;
|
||||
std::string fname = TableFileName(options_, t->meta.number, t->meta.level);
|
||||
int counter = 0;
|
||||
Iterator* iter = NewTableIterator(t.meta);
|
||||
Status status = env_->GetFileSize(fname, &t->meta.file_size);
|
||||
if (status.ok()) {
|
||||
Iterator* iter = table_cache_->NewIterator(
|
||||
ReadOptions(), t->meta.number, t->meta.file_size, t->meta.level, &table_ptr);
|
||||
bool empty = true;
|
||||
ParsedInternalKey parsed;
|
||||
t.max_sequence = 0;
|
||||
t->max_sequence = 0;
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
Slice key = iter->key();
|
||||
if (!ParseInternalKey(key, &parsed)) {
|
||||
Log(options_.info_log, "Table #%llu: unparsable key %s",
|
||||
(unsigned long long) t.meta.number,
|
||||
(unsigned long long) t->meta.number,
|
||||
EscapeString(key).c_str());
|
||||
continue;
|
||||
}
|
||||
|
@ -295,115 +383,79 @@ class Repairer {
|
|||
counter++;
|
||||
if (empty) {
|
||||
empty = false;
|
||||
t.meta.smallest.DecodeFrom(key);
|
||||
t->meta.smallest.DecodeFrom(key);
|
||||
}
|
||||
t.meta.largest.DecodeFrom(key);
|
||||
if (parsed.sequence > t.max_sequence) {
|
||||
t.max_sequence = parsed.sequence;
|
||||
t->meta.largest.DecodeFrom(key);
|
||||
if (parsed.sequence > t->max_sequence) {
|
||||
t->max_sequence = parsed.sequence;
|
||||
}
|
||||
}
|
||||
if (!iter->status().ok()) {
|
||||
status = iter->status();
|
||||
}
|
||||
else {
|
||||
counters=table_ptr->GetSstCounters();
|
||||
t->meta.exp_write_low=counters.Value(eSstCountExpiry1);
|
||||
t->meta.exp_write_high=counters.Value(eSstCountExpiry2);
|
||||
t->meta.exp_explicit_high=counters.Value(eSstCountExpiry3);
|
||||
}
|
||||
delete iter;
|
||||
}
|
||||
Log(options_.info_log, "Table #%llu: %d entries %s",
|
||||
(unsigned long long) t.meta.number,
|
||||
(unsigned long long) t->meta.number,
|
||||
counter,
|
||||
status.ToString().c_str());
|
||||
|
||||
if (status.ok()) {
|
||||
tables_.push_back(t);
|
||||
} else {
|
||||
RepairTable(fname, t); // RepairTable archives input file.
|
||||
}
|
||||
}
|
||||
|
||||
void RepairTable(const std::string& src, TableInfo t) {
|
||||
// We will copy src contents to a new table and then rename the
|
||||
// new table over the source.
|
||||
|
||||
// Create builder.
|
||||
std::string copy = TableFileName(dbname_, next_file_number_++);
|
||||
WritableFile* file;
|
||||
Status s = env_->NewWritableFile(copy, &file);
|
||||
if (!s.ok()) {
|
||||
return;
|
||||
}
|
||||
TableBuilder* builder = new TableBuilder(options_, file);
|
||||
|
||||
// Copy data.
|
||||
Iterator* iter = NewTableIterator(t.meta);
|
||||
int counter = 0;
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
builder->Add(iter->key(), iter->value());
|
||||
counter++;
|
||||
}
|
||||
delete iter;
|
||||
|
||||
ArchiveFile(src);
|
||||
if (counter == 0) {
|
||||
builder->Abandon(); // Nothing to save
|
||||
} else {
|
||||
s = builder->Finish();
|
||||
if (s.ok()) {
|
||||
t.meta.file_size = builder->FileSize();
|
||||
}
|
||||
}
|
||||
delete builder;
|
||||
builder = NULL;
|
||||
|
||||
if (s.ok()) {
|
||||
s = file->Close();
|
||||
}
|
||||
delete file;
|
||||
file = NULL;
|
||||
|
||||
if (counter > 0 && s.ok()) {
|
||||
std::string orig = TableFileName(dbname_, t.meta.number);
|
||||
s = env_->RenameFile(copy, orig);
|
||||
if (s.ok()) {
|
||||
Log(options_.info_log, "Table #%llu: %d entries repaired",
|
||||
(unsigned long long) t.meta.number, counter);
|
||||
tables_.push_back(t);
|
||||
}
|
||||
}
|
||||
if (!s.ok()) {
|
||||
env_->DeleteFile(copy);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
Status WriteDescriptor() {
|
||||
std::string tmp = TempFileName(dbname_, 1);
|
||||
WritableFile* file;
|
||||
Status status = env_->NewWritableFile(tmp, &file);
|
||||
Status status = env_->NewWritableFile(tmp, &file, 4096);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
SequenceNumber max_sequence = 0;
|
||||
for (size_t i = 0; i < tables_.size(); i++) {
|
||||
if (max_sequence < tables_[i].max_sequence) {
|
||||
max_sequence = tables_[i].max_sequence;
|
||||
}
|
||||
for (int level=0; level<config::kNumLevels;++level)
|
||||
{
|
||||
std::vector<TableInfo> * table_ptr;
|
||||
std::vector<TableInfo>::const_iterator i;
|
||||
|
||||
table_ptr=&tables_[level];
|
||||
|
||||
for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
|
||||
if (max_sequence < i->max_sequence) {
|
||||
max_sequence = i->max_sequence;
|
||||
}
|
||||
} // for
|
||||
} // for
|
||||
|
||||
edit_.SetComparatorName(icmp_.user_comparator()->Name());
|
||||
edit_.SetLogNumber(0);
|
||||
edit_.SetNextFile(next_file_number_);
|
||||
edit_.SetLastSequence(max_sequence);
|
||||
|
||||
for (size_t i = 0; i < tables_.size(); i++) {
|
||||
// TODO(opt): separate out into multiple levels
|
||||
const TableInfo& t = tables_[i];
|
||||
edit_.AddFile(0, t.meta.number, t.meta.file_size,
|
||||
t.meta.smallest, t.meta.largest);
|
||||
}
|
||||
for (int level=0; level<config::kNumLevels;++level)
|
||||
{
|
||||
std::vector<TableInfo> * table_ptr;
|
||||
std::vector<TableInfo>::const_iterator i;
|
||||
|
||||
table_ptr=&tables_[level];
|
||||
|
||||
for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
|
||||
edit_.AddFile2(level, i->meta.number, i->meta.file_size,
|
||||
i->meta.smallest, i->meta.largest,
|
||||
i->meta.exp_write_low, i->meta.exp_write_high, i->meta.exp_explicit_high);
|
||||
|
||||
} // for
|
||||
} // for
|
||||
|
||||
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
|
||||
{
|
||||
log::Writer log(file);
|
||||
std::string record;
|
||||
edit_.EncodeTo(&record);
|
||||
edit_.EncodeTo(&record); // manifest format is default for release, options_ often incomplete
|
||||
status = log.AddRecord(record);
|
||||
}
|
||||
if (status.ok()) {
|
||||
|
@ -431,21 +483,33 @@ class Repairer {
|
|||
return status;
|
||||
}
|
||||
|
||||
void ArchiveFile(const std::string& fname) {
|
||||
void ArchiveFile(const std::string& fname, bool two_levels=false) {
|
||||
// Move into another directory. E.g., for
|
||||
// dir/foo
|
||||
// rename to
|
||||
// dir/lost/foo
|
||||
const char* slash = strrchr(fname.c_str(), '/');
|
||||
std::string::size_type slash, slash2;
|
||||
|
||||
slash=fname.rfind('/');
|
||||
if (two_levels && std::string::npos!=slash && 0<slash)
|
||||
{
|
||||
slash2=fname.rfind('/',slash-1);
|
||||
if (std::string::npos==slash2)
|
||||
slash2=slash;
|
||||
} // if
|
||||
else
|
||||
slash2=slash;
|
||||
|
||||
std::string new_dir;
|
||||
if (slash != NULL) {
|
||||
new_dir.assign(fname.data(), slash - fname.data());
|
||||
}
|
||||
|
||||
if (std::string::npos != slash2 && 0<slash2)
|
||||
new_dir.append(fname,0,slash2);
|
||||
|
||||
new_dir.append("/lost");
|
||||
env_->CreateDir(new_dir); // Ignore error
|
||||
std::string new_file = new_dir;
|
||||
new_file.append("/");
|
||||
new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
|
||||
new_file.append((std::string::npos!=slash) ? fname.substr(slash+1) : fname);
|
||||
Status s = env_->RenameFile(fname, new_file);
|
||||
Log(options_.info_log, "Archiving %s: %s\n",
|
||||
fname.c_str(), s.ToString().c_str());
|
||||
|
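ArchiveFile() above gains a two_levels flag so that a file living in a per-level subdirectory, for example dbname/sst_3/000123.sst, is archived into the database-level lost/ directory rather than into sst_3/lost/. A standalone sketch of that path computation; the dbname/sst_<level>/file layout and the example paths are assumptions for illustration, not something the diff states:

// Illustrative re-implementation of the two-level archive-path logic above,
// using std::string only.
#include <iostream>
#include <string>

std::string LostPathFor(const std::string& fname, bool two_levels) {
  std::string::size_type slash = fname.rfind('/');
  std::string::size_type slash2 = slash;
  if (two_levels && slash != std::string::npos && slash > 0) {
    // Step over the per-level directory so "lost" lands beside it.
    slash2 = fname.rfind('/', slash - 1);
    if (slash2 == std::string::npos) slash2 = slash;
  }
  std::string new_dir;
  if (slash2 != std::string::npos && slash2 > 0) new_dir.append(fname, 0, slash2);
  new_dir.append("/lost");
  std::string new_file = new_dir + "/";
  new_file.append(slash != std::string::npos ? fname.substr(slash + 1) : fname);
  return new_file;
}

int main() {
  std::cout << LostPathFor("db/sst_3/000123.sst", true) << "\n";   // db/lost/000123.sst
  std::cout << LostPathFor("db/sst_3/000123.sst", false) << "\n";  // db/sst_3/lost/000123.sst
  return 0;
}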
|
|
@ -1,10 +1,7 @@
|
|||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_
|
||||
#define STORAGE_LEVELDB_DB_SKIPLIST_H_
|
||||
|
||||
//
|
||||
// Thread safety
|
||||
// -------------
|
||||
//
|
||||
|
@ -55,6 +52,12 @@ class SkipList {
|
|||
// Returns true iff an entry that compares equal to key is in the list.
|
||||
bool Contains(const Key& key) const;
|
||||
|
||||
// Returns true if all inserts have been sequentially increasing;
|
||||
// else this SkipList has had keys inserted in non-sequential order
|
||||
bool InSequentialInsertMode() const {
|
||||
return sequentialInsertMode_;
|
||||
}
|
||||
|
||||
// Iteration over the contents of a skip list
|
||||
class Iterator {
|
||||
public:
|
||||
|
@ -94,8 +97,22 @@ class SkipList {
|
|||
// Intentionally copyable
|
||||
};
|
||||
|
||||
protected:
|
||||
// Checks the structure of this SkipList object, ensuring the keys are
|
||||
// properly ordered
|
||||
//
|
||||
// This is protected since it is intended for use by unit tests; if a lock
|
||||
// is used to protect Insert(), then it should be used to protect this
|
||||
// method as well
|
||||
bool Valid() const;
|
||||
|
||||
// Disables the sequential insert optimizations (used in performance testing)
|
||||
  void DisableSequentialInsertMode() {
    sequentialInsertMode_ = false;
  }

 private:
  enum { kMaxHeight = 12 };
  enum { kMaxHeight = 17 };

  // Immutable after construction
  Comparator const compare_;
@ -115,6 +132,18 @@ class SkipList {
  // Read/written only by Insert().
  Random rnd_;

  // Points to the last node in the list; modified only by Insert()
  Node* tail_;

  // Pointers to the nodes previous to the tail node; have max_height_ entries
  Node* tailPrev_[kMaxHeight];

  // The height of the tail_ node
  int tailHeight_;

  // We track the tail node until we have a non-sequential insert
  bool sequentialInsertMode_;

  Node* NewNode(const Key& key, int height);
  int RandomHeight();
  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
@ -129,6 +158,11 @@ class SkipList {
  // node at "level" for every level in [0..max_height_-1].
  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;

  // Similar to FindGreaterOrEqual() except it uses the barrier-free
  // variant of Next(); this is used only by Insert() and it
  // checks the tail_ pointer in case we're doing a sequential insert
  Node* NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const;

  // Return the latest node with a key < key.
  // Return head_ if there is no such node.
  Node* FindLessThan(const Key& key) const;
@ -280,6 +314,54 @@ typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOr
  }
}

template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const {
  int level = GetMaxHeight() - 1;

  // If we have only seen sequential inserts up to this point, we can use
  // the tail_ node
  if ( sequentialInsertMode_ ) {
    if (tail_ == NULL) {
      // The list is currently empty, so the node being inserted
      // will be the new tail_
      assert(level == 0);
      if (prev != NULL) prev[0] = head_;
      return NULL;
    }
    else if (KeyIsAfterNode(key, tail_)) {
      // The new key must be inserted after the current tail_ node
      if (prev != NULL) {
        int i;
        for (i = 0; i < tailHeight_; ++i) {
          prev[i] = tail_;
        }
        for (/*continue with i*/; i <= level; ++i) {
          prev[i] = tailPrev_[i];
        }
      }
      return NULL;
    }
  }

  Node* x = head_;
  while (true) {
    Node* next = x->NoBarrier_Next(level);
    if (KeyIsAfterNode(key, next)) {
      // Keep searching in this list
      x = next;
    } else {
      if (prev != NULL) prev[level] = x;
      if (level == 0) {
        return next;
      } else {
        // Switch to next list
        level--;
      }
    }
  }
}

template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
@ -327,25 +409,41 @@ SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
      arena_(arena),
      head_(NewNode(0 /* any key will do */, kMaxHeight)),
      max_height_(reinterpret_cast<void*>(1)),
      rnd_(0xdeadbeef) {
      rnd_(0xdeadbeef),
      tail_(NULL),
      tailHeight_(0),
      sequentialInsertMode_(true) {
  for (int i = 0; i < kMaxHeight; i++) {
    head_->SetNext(i, NULL);
    tailPrev_[i] = NULL;
  }
}

template<typename Key, class Comparator>
void SkipList<Key,Comparator>::Insert(const Key& key) {
  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
  // We use a barrier-free variant of FindGreaterOrEqual()
  // here since Insert() is externally synchronized.
  Node* prev[kMaxHeight];
  Node* x = FindGreaterOrEqual(key, prev);
  Node* x = NoBarrier_FindGreaterOrEqual(key, prev);

  // If we're still in sequential-insert mode, check if the new node is being
  // inserted at the end of the list, which is indicated by x being NULL
  if (sequentialInsertMode_) {
    if (x != NULL) {
      // we have a non-sequential (AKA random) insert, so stop maintaining
      // the tail bookkeeping overhead
      sequentialInsertMode_ = false;
    }
  }

  // Our data structure does not allow duplicate insertion
  assert(x == NULL || !Equal(key, x->key));

  int height = RandomHeight();
  int i, height = RandomHeight();
  if (height > GetMaxHeight()) {
    for (int i = GetMaxHeight(); i < height; i++) {
    // We are extending max_height_ which means we need to fill in the blanks
    // in prev[] that were not filled in by NoBarrier_FindGreaterOrEqual()
    for (i = GetMaxHeight(); i < height; ++i) {
      prev[i] = head_;
    }
    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
@ -361,12 +459,37 @@ void SkipList<Key,Comparator>::Insert(const Key& key) {
  }

  x = NewNode(key, height);
  for (int i = 0; i < height; i++) {
  for (i = 0; i < height; ++i) {
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
    prev[i]->SetNext(i, x);
  }

  // Do we need to update our tail_ pointer?
  if (sequentialInsertMode_) {
    Node* prevTail = tail_;
    int prevTailHeight = tailHeight_;

    tail_ = x;
    tailHeight_ = height;

    // We also need to update our tailPrev_ pointers; first we capture
    // the nodes already pointing to the new tail_
    for (i = 0; i < height; ++i) {
      tailPrev_[i] = prev[i];
    }

    // If the previous tail node was taller than the new tail node, then
    // the prev pointers above the current tail node's height (up to the
    // height of the previous tail node) are simply the previous tail node
    for (/*continue with i*/; i < prevTailHeight; ++i) {
      tailPrev_[i] = prevTail;
    }

    // NOTE: any prev pointers above prevTailHeight (up to max_height_) were
    // already set in tailPrev_ by previous calls to this method
  }
}

template<typename Key, class Comparator>
@ -379,6 +502,115 @@ bool SkipList<Key,Comparator>::Contains(const Key& key) const {
  }
}

} // namespace leveldb
template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::Valid() const
{
  // Note that we can use barrier-free overloads in this method since it is
  // protected by the same lock as Insert().

#endif // STORAGE_LEVELDB_DB_SKIPLIST_H_
  // Ensure that the list is properly sorted; use an iterator for this check
  const Key* pPrevKey = NULL;
  typename SkipList<Key, Comparator>::Iterator iter(this);
  for ( iter.SeekToFirst(); iter.Valid(); iter.Next() ) {
    if ( pPrevKey != NULL ) {
      if ( compare_( *pPrevKey, iter.key() ) >= 0 ) {
        return false;
      }
    }
    pPrevKey = &iter.key();
  }

  // Now walk the linked list at each level and ensure it's sorted. Also track
  // how many nodes we see at each level; the number of nodes in the linked
  // list at level n must not be larger than the number of nodes at level n-1.
  std::vector<int> nodeCounts( GetMaxHeight() );
  int level;
  for ( level = GetMaxHeight() - 1; level >= 0; --level ) {
    int nodeCount = 0;
    pPrevKey = NULL;
    for ( Node* pNode = head_->NoBarrier_Next( level );
          pNode != NULL;
          pNode = pNode->NoBarrier_Next( level ) ) {
      ++nodeCount;
      if ( pPrevKey != NULL ) {
        if ( compare_( *pPrevKey, pNode->key ) >= 0 ) {
          return false;
        }
      }
      pPrevKey = &pNode->key;
    }
    nodeCounts[ level ] = nodeCount;
  }

  // Ensure the node counts do not increase as we move up the levels
  int prevNodeCount = nodeCounts[0];
  for ( level = 1; level < GetMaxHeight(); ++level ) {
    int currentNodeCount = nodeCounts[ level ];
    if ( currentNodeCount > prevNodeCount ) {
      return false;
    }
    prevNodeCount = currentNodeCount;
  }

  // Ensure that tail_ points to the last node
  if ( sequentialInsertMode_ ) {
    if ( tail_ == NULL ) {
      // tail_ is not set, so the list must be empty
      if ( tailPrev_[0] != NULL || head_->NoBarrier_Next(0) != NULL ) {
        return false;
      }
    }
    else {
      // we have a tail_ node; first ensure that its prev pointer actually
      // points to it
      if ( tailPrev_[0] == NULL || tailPrev_[0]->NoBarrier_Next(0) != tail_ ) {
        return false;
      }
      if ( compare_( tailPrev_[0]->key, tail_->key ) >= 0 ) {
        return false;
      }

      // now check the rest of the pointers in tailPrev_; up to tailHeight_,
      // the next pointer of the node in tailPrev_ should point to tail_; after
      // that, the next pointer should be NULL
      for ( level = 1; level < GetMaxHeight(); ++level ) {
        Node* tailPrev = tailPrev_[ level ];
        if ( tailPrev == NULL ) {
          return false;
        }
        if ( level < tailHeight_ ) {
          if ( tailPrev->NoBarrier_Next( level ) != tail_ ) {
            return false;
          }
          if ( compare_( tailPrev->key, tail_->key ) >= 0 ) {
            return false;
          }
        }
        else {
          if ( tailPrev->NoBarrier_Next( level ) != NULL ) {
            return false;
          }
        }
      }

      // the remainder of the tailPrev_ pointers (above max_height_)
      // should be NULL
      for ( /*continue with level*/; level < kMaxHeight; ++level ) {
        if ( tailPrev_[ level ] != NULL ) {
          return false;
        }
      }

      // now ensure that FindLast() returns tail_
      Node* lastNode = FindLast();
      if ( lastNode != tail_ ) {
        return false;
      }
    }
  }

  // if we get here, all is good
  return true;
}

} // namespace leveldb
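The hunks above special-case strictly increasing inserts: as long as each new key lands past the remembered tail_, Insert() can skip the per-level search and splice the node in right behind the tail. A minimal standalone sketch of the same idea on a plain sorted singly linked list (illustrative only, not the SkipList code above; Node, SortedList and main are invented for the example, and nodes are deliberately leaked to keep it short):

    #include <cassert>
    #include <cstdio>

    struct Node {
      int key;
      Node* next;
    };

    class SortedList {
     public:
      SortedList() : head_(nullptr), tail_(nullptr), sequential_(true) {}

      void Insert(int key) {
        // Fast path: still in sequential mode and the key belongs after tail_.
        if (sequential_ && tail_ != nullptr && key > tail_->key) {
          tail_ = tail_->next = new Node{key, nullptr};
          return;
        }
        // Slow path: ordinary sorted insert; once we see a non-sequential
        // insert we stop trusting tail_ (mirrors sequentialInsertMode_ above).
        Node** where = &head_;
        while (*where != nullptr && (*where)->key < key) {
          where = &(*where)->next;
        }
        assert(*where == nullptr || (*where)->key != key);  // no duplicates
        Node* n = new Node{key, *where};
        *where = n;
        if (n->next == nullptr) tail_ = n;                  // inserted at the end
        else if (tail_ != nullptr) sequential_ = false;     // random insert seen
        if (tail_ == nullptr) tail_ = n;                    // first element
      }

      bool Valid() const {
        // Same spirit as SkipList::Valid(): sorted order plus a correct tail_.
        const Node* last = nullptr;
        for (const Node* n = head_; n != nullptr; n = n->next) {
          if (last != nullptr && last->key >= n->key) return false;
          last = n;
        }
        return last == tail_;
      }

     private:
      Node* head_;
      Node* tail_;
      bool sequential_;
    };

    int main() {
      SortedList list;
      for (int i = 0; i < 1000; ++i) list.Insert(i);   // all fast-path inserts
      list.Insert(-1);                                 // falls back to the slow path
      std::printf("valid: %d\n", list.Valid() ? 1 : 0);
      return 0;
    }

The SkipList version has to do the extra tailPrev_ bookkeeping because a new node can be taller or shorter than the old tail, but the fast-path test itself is the same single key comparison.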
@ -2,11 +2,15 @@
|
|||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "db/skiplist.h"
|
||||
#include <set>
|
||||
#include "leveldb/env.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/hash.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/random.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
|
@ -26,15 +30,29 @@ struct Comparator {
|
|||
}
|
||||
};
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
class SkipListTest : public SkipList<Key, Comparator>
|
||||
{
|
||||
public:
|
||||
SkipListTest(Comparator cmp, Arena* arena) : SkipList<Key, Comparator>(cmp, arena) {}
|
||||
|
||||
// check the validity of this SkipList object by calling the Valid() method
|
||||
// in the base class
|
||||
bool Valid() const { return SkipList<Key, Comparator>::Valid(); }
|
||||
|
||||
void DisableSequentialInsertMode() { SkipList<Key, Comparator>::DisableSequentialInsertMode(); }
|
||||
};
|
||||
|
||||
class SkipTest { };
|
||||
|
||||
TEST(SkipTest, Empty) {
|
||||
Arena arena;
|
||||
Comparator cmp;
|
||||
SkipList<Key, Comparator> list(cmp, &arena);
|
||||
SkipListTest<Key, Comparator> list(cmp, &arena);
|
||||
ASSERT_TRUE(!list.Contains(10));
|
||||
ASSERT_TRUE(list.Valid());
|
||||
|
||||
SkipList<Key, Comparator>::Iterator iter(&list);
|
||||
SkipListTest<Key, Comparator>::Iterator iter(&list);
|
||||
ASSERT_TRUE(!iter.Valid());
|
||||
iter.SeekToFirst();
|
||||
ASSERT_TRUE(!iter.Valid());
|
||||
|
@ -51,13 +69,14 @@ TEST(SkipTest, InsertAndLookup) {
|
|||
std::set<Key> keys;
|
||||
Arena arena;
|
||||
Comparator cmp;
|
||||
SkipList<Key, Comparator> list(cmp, &arena);
|
||||
SkipListTest<Key, Comparator> list(cmp, &arena);
|
||||
for (int i = 0; i < N; i++) {
|
||||
Key key = rnd.Next() % R;
|
||||
if (keys.insert(key).second) {
|
||||
list.Insert(key);
|
||||
}
|
||||
}
|
||||
ASSERT_TRUE(list.Valid());
|
||||
|
||||
for (int i = 0; i < R; i++) {
|
||||
if (list.Contains(i)) {
|
||||
|
@ -69,7 +88,7 @@ TEST(SkipTest, InsertAndLookup) {
|
|||
|
||||
// Simple iterator tests
|
||||
{
|
||||
SkipList<Key, Comparator>::Iterator iter(&list);
|
||||
SkipListTest<Key, Comparator>::Iterator iter(&list);
|
||||
ASSERT_TRUE(!iter.Valid());
|
||||
|
||||
iter.Seek(0);
|
||||
|
@ -87,7 +106,7 @@ TEST(SkipTest, InsertAndLookup) {
|
|||
|
||||
// Forward iteration test
|
||||
for (int i = 0; i < R; i++) {
|
||||
SkipList<Key, Comparator>::Iterator iter(&list);
|
||||
SkipListTest<Key, Comparator>::Iterator iter(&list);
|
||||
iter.Seek(i);
|
||||
|
||||
// Compare against model iterator
|
||||
|
@ -107,7 +126,7 @@ TEST(SkipTest, InsertAndLookup) {
|
|||
|
||||
// Backward iteration test
|
||||
{
|
||||
SkipList<Key, Comparator>::Iterator iter(&list);
|
||||
SkipListTest<Key, Comparator>::Iterator iter(&list);
|
||||
iter.SeekToLast();
|
||||
|
||||
// Compare against model iterator
|
||||
|
@ -250,7 +269,7 @@ class ConcurrentTest {
|
|||
// Note that generation 0 is never inserted, so it is ok if
|
||||
// <*,0,*> is missing.
|
||||
ASSERT_TRUE((gen(pos) == 0) ||
|
||||
(gen(pos) > static_cast<Key>(initial_state.Get(key(pos))))
|
||||
(gen(pos) > initial_state.Get(key(pos)))
|
||||
) << "key: " << key(pos)
|
||||
<< "; gen: " << gen(pos)
|
||||
<< "; initgen: "
|
||||
|
@ -313,18 +332,16 @@ class TestState {
|
|||
state_cv_(&mu_) {}
|
||||
|
||||
void Wait(ReaderState s) {
|
||||
mu_.Lock();
|
||||
MutexLock lock(&mu_);
|
||||
while (state_ != s) {
|
||||
state_cv_.Wait();
|
||||
}
|
||||
mu_.Unlock();
|
||||
}
|
||||
|
||||
void Change(ReaderState s) {
|
||||
mu_.Lock();
|
||||
MutexLock lock(&mu_);
|
||||
state_ = s;
|
||||
state_cv_.Signal();
|
||||
mu_.Unlock();
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -371,6 +388,211 @@ TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
|
|||
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
|
||||
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
|
||||
|
||||
static void
|
||||
RunSequentialInsert(
|
||||
const int NumKeys,
|
||||
bool AcquireLock,
|
||||
bool ReverseInsert,
|
||||
bool SequentialInsertModeEnabled )
|
||||
{
|
||||
const int loopCount = 5; // repeat the whole process this many times and average the time spent
|
||||
std::vector<uint64_t> timeSpent;
|
||||
|
||||
port::Mutex mutex;
|
||||
Env* env = Env::Default();
|
||||
|
||||
fprintf( stderr,
|
||||
"Sequentially inserting %d keys in %s order,\n"
|
||||
" seqential insert mode is initially %sabled,\n"
|
||||
" %sacquiring a lock for each insert (averaging over %d runs)\n",
|
||||
NumKeys, ReverseInsert ? "reverse" : "forward",
|
||||
SequentialInsertModeEnabled ? "en" : "dis",
|
||||
AcquireLock ? "" : "not ", loopCount );
|
||||
|
||||
int k;
|
||||
for ( k = 0; k < loopCount; ++k ) {
|
||||
int j;
|
||||
Arena arena;
|
||||
Comparator cmp;
|
||||
SkipListTest<Key, Comparator> list( cmp, &arena );
|
||||
|
||||
// initially the SkipList should be in sequential mode
|
||||
ASSERT_TRUE( list.InSequentialInsertMode() );
|
||||
|
||||
// were we instructed to disable sequential insert mode?
|
||||
if ( !SequentialInsertModeEnabled ) {
|
||||
list.DisableSequentialInsertMode();
|
||||
ASSERT_TRUE( !list.InSequentialInsertMode() );
|
||||
}
|
||||
|
||||
uint64_t start = env->NowMicros();
|
||||
for ( j = 0; j < NumKeys; ++j ) {
|
||||
Key key = ReverseInsert ? NumKeys - 1 - j : j;
|
||||
|
||||
if ( AcquireLock ) mutex.Lock();
|
||||
list.Insert( key );
|
||||
if ( AcquireLock ) mutex.Unlock();
|
||||
}
|
||||
uint64_t stop = env->NowMicros();
|
||||
timeSpent.push_back( stop - start );
|
||||
//fprintf( stderr, " Time for run %d: %llu\n", k, timeSpent[k] );
|
||||
|
||||
// if SequentialInsertModeEnabled is true, the SkipList should still be
|
||||
// in sequential mode iff ReverseInsert is false
|
||||
if ( SequentialInsertModeEnabled ) {
|
||||
ASSERT_TRUE( list.InSequentialInsertMode() != ReverseInsert );
|
||||
}
|
||||
else {
|
||||
ASSERT_TRUE( !list.InSequentialInsertMode() );
|
||||
}
|
||||
|
||||
// ensure the SkipList is properly sorted
|
||||
if ( AcquireLock ) mutex.Lock();
|
||||
ASSERT_TRUE( list.Valid() );
|
||||
if ( AcquireLock ) mutex.Unlock();
|
||||
|
||||
// ensure the SkipList contains all the keys we inserted
|
||||
for ( j = 0; j < NumKeys; ++j ) {
|
||||
ASSERT_TRUE( list.Contains( j ) );
|
||||
}
|
||||
}
|
||||
|
||||
  // throw out the low and high times and average the rest
  uint64_t totalTime, lowTime, highTime;
  totalTime = lowTime = highTime = timeSpent[0];
  for ( k = 1; k < loopCount; ++k ) {
    uint64_t currentTime = timeSpent[k];
    totalTime += currentTime;
    if ( lowTime > currentTime ) lowTime = currentTime;
    if ( highTime < currentTime ) highTime = currentTime;
  }

  totalTime -= (lowTime + highTime);

  uint64_t averageTime = (totalTime / (loopCount - 2));
  double timePerKey = (double)averageTime / (double)NumKeys;
  fprintf( stderr, "  Average insertion time: %" PRIu64 " (%f/key)\n", averageTime, timePerKey );
}

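The averaging step above drops the single fastest and slowest run and averages the rest. If more benchmarks need it, the same computation factors out into a tiny helper; a sketch, assuming at least three samples (TrimmedMeanMicros is a name invented here, not part of this test file):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // Average of the samples with the lowest and highest value removed,
    // mirroring the loop in RunSequentialInsert() above.
    uint64_t TrimmedMeanMicros(const std::vector<uint64_t>& samples) {
      assert(samples.size() >= 3);
      uint64_t total = std::accumulate(samples.begin(), samples.end(), uint64_t(0));
      total -= *std::min_element(samples.begin(), samples.end());
      total -= *std::max_element(samples.begin(), samples.end());
      return total / (samples.size() - 2);
    }
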
TEST(SkipTest, SequentialInsert_NoLock_ForwardInsert)
|
||||
{
|
||||
int numKeys = 100000;
|
||||
bool acquireLock = false;
|
||||
bool reverseInsert = false;
|
||||
bool sequentialInsertModeEnabled = true;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
|
||||
sequentialInsertModeEnabled = false;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
}
|
||||
|
||||
TEST(SkipTest, SequentialInsert_Lock_ForwardInsert)
|
||||
{
|
||||
int numKeys = 100000;
|
||||
bool acquireLock = true;
|
||||
bool reverseInsert = false;
|
||||
bool sequentialInsertModeEnabled = true;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
|
||||
sequentialInsertModeEnabled = false;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
}
|
||||
|
||||
TEST(SkipTest, SequentialInsert_NoLock_ReverseInsert)
|
||||
{
|
||||
int numKeys = 100000;
|
||||
bool acquireLock = false;
|
||||
bool reverseInsert = true;
|
||||
bool sequentialInsertModeEnabled = true;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
}
|
||||
|
||||
TEST(SkipTest, SequentialInsert_Lock_ReverseInsert)
|
||||
{
|
||||
int numKeys = 100000;
|
||||
bool acquireLock = true;
|
||||
bool reverseInsert = true;
|
||||
bool sequentialInsertModeEnabled = true;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
}
|
||||
|
||||
TEST(SkipTest, SequentialInsert_IncreasingNumberOfInserts)
|
||||
{
|
||||
// test with increasing numbers of keys, with sequential-insert mode both
|
||||
// enabled and disabled; we're looking to see if per-key insertion times
|
||||
// trend upward as the number of keys increases
|
||||
int numKeys = 10000;
|
||||
bool acquireLock = false;
|
||||
bool reverseInsert = false;
|
||||
bool sequentialInsertModeEnabled = true;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
|
||||
sequentialInsertModeEnabled = false;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
|
||||
numKeys = 100000;
|
||||
sequentialInsertModeEnabled = true;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
|
||||
sequentialInsertModeEnabled = false;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
|
||||
numKeys = 1000000;
|
||||
sequentialInsertModeEnabled = true;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
|
||||
sequentialInsertModeEnabled = false;
|
||||
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||
}
|
||||
|
||||
TEST(SkipTest, SequentialInsert_MixedInsertionModes)
|
||||
{
|
||||
// start inserting sequentially, then switch to non-sequential inserts,
|
||||
// ensuring all works as intended
|
||||
int j, numSequentialKeys = 100000, numNonSequentialKeys = 100000;
|
||||
int totalNumKeys = numSequentialKeys + numNonSequentialKeys;
|
||||
Arena arena;
|
||||
Comparator cmp;
|
||||
SkipListTest<Key, Comparator> list( cmp, &arena );
|
||||
|
||||
// initially the SkipList should be in sequential mode
|
||||
ASSERT_TRUE( list.InSequentialInsertMode() );
|
||||
|
||||
// start inserting at key=1; when we insert 0 below, the list should switch
|
||||
// out of sequential insert mode
|
||||
for ( j = 1; j < numSequentialKeys; ++j ) {
|
||||
list.Insert( j );
|
||||
}
|
||||
|
||||
// the SkipList should still be in sequential mode
|
||||
ASSERT_TRUE( list.InSequentialInsertMode() );
|
||||
ASSERT_TRUE( list.Valid() );
|
||||
|
||||
list.Insert( 0 );
|
||||
ASSERT_TRUE( !list.InSequentialInsertMode() );
|
||||
ASSERT_TRUE( list.Valid() );
|
||||
|
||||
// now insert the remaining keys in non-sequential order (they're not
|
||||
// random, but that doesn't matter here; just ensure we switch to
|
||||
// non-sequential mode and that all continues to work)
|
||||
for ( j = 0; j < numNonSequentialKeys; j += 2 ) {
|
||||
int key = totalNumKeys - j - 1;
|
||||
list.Insert( key );
|
||||
}
|
||||
for ( j = 0; j < numNonSequentialKeys; j += 2 ) {
|
||||
int key = numSequentialKeys + j;
|
||||
list.Insert( key );
|
||||
}
|
||||
|
||||
ASSERT_TRUE( !list.InSequentialInsertMode() );
|
||||
ASSERT_TRUE( list.Valid() );
|
||||
|
||||
// ensure the SkipList contains all the keys we inserted
|
||||
for ( j = 0; j < totalNumKeys; ++j ) {
|
||||
ASSERT_TRUE( list.Contains( j ) );
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
|
||||
#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "leveldb/db.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
|
|
@ -5,22 +5,26 @@
|
|||
#include "db/table_cache.h"
|
||||
|
||||
#include "db/filename.h"
|
||||
#include "db/log_reader.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "leveldb/env.h"
|
||||
#include "leveldb/table.h"
|
||||
#include "util/coding.h"
|
||||
#include "leveldb/perf_count.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
struct TableAndFile {
|
||||
RandomAccessFile* file;
|
||||
Table* table;
|
||||
};
|
||||
|
||||
static void DeleteEntry(const Slice& key, void* value) {
|
||||
TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
|
||||
if (0==dec_and_fetch(&tf->user_count))
|
||||
{
|
||||
if (NULL!=tf->doublecache)
|
||||
tf->doublecache->SubFileSize(tf->table->GetFileSize());
|
||||
delete tf->table;
|
||||
delete tf->file;
|
||||
delete tf;
|
||||
} // if
|
||||
}
|
||||
|
||||
static void UnrefEntry(void* arg1, void* arg2) {
|
||||
|
@ -31,37 +35,38 @@ static void UnrefEntry(void* arg1, void* arg2) {
|
|||
|
||||
TableCache::TableCache(const std::string& dbname,
|
||||
const Options* options,
|
||||
int entries)
|
||||
Cache * file_cache,
|
||||
DoubleCache & doublecache)
|
||||
: env_(options->env),
|
||||
dbname_(dbname),
|
||||
options_(options),
|
||||
cache_(NewLRUCache(entries)) {
|
||||
cache_(file_cache),
|
||||
doublecache_(doublecache)
|
||||
{
|
||||
}
|
||||
|
||||
TableCache::~TableCache() {
|
||||
delete cache_;
|
||||
}
|
||||
|
||||
Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
|
||||
Cache::Handle** handle) {
|
||||
Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, int level,
|
||||
Cache::Handle** handle, bool is_compaction,
|
||||
bool for_iterator) {
|
||||
Status s;
|
||||
char buf[sizeof(file_number)];
|
||||
EncodeFixed64(buf, file_number);
|
||||
Slice key(buf, sizeof(buf));
|
||||
*handle = cache_->Lookup(key);
|
||||
if (*handle == NULL) {
|
||||
std::string fname = TableFileName(dbname_, file_number);
|
||||
std::string fname = TableFileName(*options_, file_number, level);
|
||||
RandomAccessFile* file = NULL;
|
||||
Table* table = NULL;
|
||||
s = env_->NewRandomAccessFile(fname, &file);
|
||||
if (!s.ok()) {
|
||||
std::string old_fname = SSTTableFileName(dbname_, file_number);
|
||||
if (env_->NewRandomAccessFile(old_fname, &file).ok()) {
|
||||
s = Status::OK();
|
||||
}
|
||||
}
|
||||
if (s.ok()) {
|
||||
s = Table::Open(*options_, file, file_size, &table);
|
||||
|
||||
// Riak: support opportunity to manage Linux page cache
|
||||
if (is_compaction)
|
||||
file->SetForCompaction(file_size);
|
||||
}
|
||||
|
||||
if (!s.ok()) {
|
||||
|
@ -73,22 +78,74 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
|
|||
TableAndFile* tf = new TableAndFile;
|
||||
tf->file = file;
|
||||
tf->table = table;
|
||||
*handle = cache_->Insert(key, tf, 1, &DeleteEntry);
|
||||
tf->doublecache = &doublecache_;
|
||||
tf->file_number = file_number;
|
||||
tf->level = level;
|
||||
|
||||
*handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry);
|
||||
gPerfCounters->Inc(ePerfTableOpened);
|
||||
doublecache_.AddFileSize(table->GetFileSize());
|
||||
|
||||
// temporary hardcoding to match number of levels defined as
|
||||
// overlapped in version_set.cc
|
||||
if (level<config::kNumOverlapLevels)
|
||||
cache_->Addref(*handle);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
Table *table = reinterpret_cast<TableAndFile*>(cache_->Value(*handle))->table;
|
||||
|
||||
// this is NOT first access, see if bloom filter can load now
|
||||
if (!for_iterator && table->ReadFilter())
|
||||
{
|
||||
// TableAndFile now going to be present in two cache entries
|
||||
// 1. retrieve old entry within file cache
|
||||
TableAndFile* tf = reinterpret_cast<TableAndFile*>(cache_->Value(*handle));
|
||||
inc_and_fetch(&tf->user_count);
|
||||
|
||||
// 2. must clean file size, do not want double count
|
||||
if (NULL!=tf->doublecache)
|
||||
tf->doublecache->SubFileSize(tf->table->GetFileSize());
|
||||
|
||||
// 3. release current reference (and possible special overlap reference)
|
||||
cache_->Release(*handle);
|
||||
if (tf->level<config::kNumOverlapLevels)
|
||||
cache_->Release(*handle);
|
||||
|
||||
// 4. create second table cache entry using TableObjectSize that now includes
|
||||
// bloom filter size
|
||||
*handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry);
|
||||
|
||||
// 5. set double reference if an overlapped file (prevents from being flushed)
|
||||
if (level<config::kNumOverlapLevels)
|
||||
cache_->Addref(*handle);
|
||||
} // if
|
||||
|
||||
    // for Linux, let fadvise start precaching
    if (is_compaction)
    {
      RandomAccessFile *file = reinterpret_cast<TableAndFile*>(cache_->Value(*handle))->file;
      file->SetForCompaction(file_size);
    } // if

    gPerfCounters->Inc(ePerfTableCached);
  } // else
  return s;
}

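The second Insert() in the bloom-filter branch above leans on a property of the leveldb Cache interface: inserting under an existing key makes the new entry current, while the old entry only survives until its outstanding handles are released, so an entry's charge can effectively be grown after the fact (here, to account for the freshly loaded filter). A reduced sketch against the public leveldb Cache API with the table, filter, and overlap-level specifics left out; UpdateCharge, the key string, and the sizes are invented for the example, and the fork's extra Addref bookkeeping is not shown:

    #include <cstdio>
    #include "leveldb/cache.h"
    #include "leveldb/slice.h"

    static void NoopDeleter(const leveldb::Slice& /*key*/, void* /*value*/) {}

    // Re-insert the same key with a larger charge, then drop the old handle.
    // Afterwards the cache accounts new_charge units against this entry.
    leveldb::Cache::Handle* UpdateCharge(leveldb::Cache* cache,
                                         const leveldb::Slice& key,
                                         leveldb::Cache::Handle* old_handle,
                                         size_t new_charge) {
      void* value = cache->Value(old_handle);
      leveldb::Cache::Handle* fresh =
          cache->Insert(key, value, new_charge, &NoopDeleter);
      cache->Release(old_handle);  // old entry disappears once unreferenced
      return fresh;
    }

    int main() {
      leveldb::Cache* cache = leveldb::NewLRUCache(1 << 20);
      int payload = 42;
      leveldb::Cache::Handle* h =
          cache->Insert("sst-17", &payload, /*charge=*/1, &NoopDeleter);
      h = UpdateCharge(cache, "sst-17", h, /*new_charge=*/4096);
      std::printf("value: %d\n", *static_cast<int*>(cache->Value(h)));
      cache->Release(h);
      delete cache;
      return 0;
    }
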
Iterator* TableCache::NewIterator(const ReadOptions& options,
|
||||
uint64_t file_number,
|
||||
uint64_t file_size,
|
||||
int level,
|
||||
Table** tableptr) {
|
||||
if (tableptr != NULL) {
|
||||
*tableptr = NULL;
|
||||
}
|
||||
|
||||
Cache::Handle* handle = NULL;
|
||||
Status s = FindTable(file_number, file_size, &handle);
|
||||
Status s = FindTable(file_number, file_size, level, &handle, options.IsCompaction(), true);
|
||||
|
||||
if (!s.ok()) {
|
||||
return NewErrorIterator(s);
|
||||
}
|
||||
|
@ -105,11 +162,13 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
|
|||
Status TableCache::Get(const ReadOptions& options,
|
||||
uint64_t file_number,
|
||||
uint64_t file_size,
|
||||
int level,
|
||||
const Slice& k,
|
||||
void* arg,
|
||||
void (*saver)(void*, const Slice&, const Slice&)) {
|
||||
bool (*saver)(void*, const Slice&, const Slice&)) {
|
||||
Cache::Handle* handle = NULL;
|
||||
Status s = FindTable(file_number, file_size, &handle);
|
||||
Status s = FindTable(file_number, file_size, level, &handle);
|
||||
|
||||
if (s.ok()) {
|
||||
Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
|
||||
s = t->InternalGet(options, k, arg, saver);
|
||||
|
@ -118,10 +177,60 @@ Status TableCache::Get(const ReadOptions& options,
|
|||
return s;
|
||||
}
|
||||
|
||||
void TableCache::Evict(uint64_t file_number) {
|
||||
void TableCache::Evict(uint64_t file_number, bool is_overlapped) {
|
||||
char buf[sizeof(file_number)];
|
||||
EncodeFixed64(buf, file_number);
|
||||
|
||||
// overlapped files have extra reference to prevent their purge,
|
||||
// release that reference now
|
||||
if (is_overlapped)
|
||||
{
|
||||
Cache::Handle *handle;
|
||||
|
||||
// the Lookup call adds a reference too, back out both
|
||||
handle=cache_->Lookup(Slice(buf, sizeof(buf)));
|
||||
|
||||
// with multiple background threads, file might already be
|
||||
// evicted
|
||||
if (NULL!=handle)
|
||||
{
|
||||
cache_->Release(handle); // release for Lookup() call just made
|
||||
cache_->Release(handle); // release for extra reference
|
||||
} // if
|
||||
} // if
|
||||
|
||||
cache_->Erase(Slice(buf, sizeof(buf)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Riak specific routine to return table statistic ONLY if table metadata
|
||||
* already within cache ... otherwise return 0.
|
||||
*/
|
||||
uint64_t
|
||||
TableCache::GetStatisticValue(
|
||||
uint64_t file_number,
|
||||
unsigned Index)
|
||||
{
|
||||
uint64_t ret_val;
|
||||
char buf[sizeof(file_number)];
|
||||
Cache::Handle *handle;
|
||||
|
||||
ret_val=0;
|
||||
EncodeFixed64(buf, file_number);
|
||||
Slice key(buf, sizeof(buf));
|
||||
handle = cache_->Lookup(key);
|
||||
|
||||
if (NULL != handle)
|
||||
{
|
||||
TableAndFile * tf;
|
||||
|
||||
tf=reinterpret_cast<TableAndFile*>(cache_->Value(handle));
|
||||
ret_val=tf->table->GetSstCounters().Value(Index);
|
||||
cache_->Release(handle);
|
||||
} // if
|
||||
|
||||
return(ret_val);
|
||||
|
||||
} // TableCache::GetStatisticValue
|
||||
|
||||
} // namespace leveldb
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include "leveldb/cache.h"
|
||||
#include "leveldb/table.h"
|
||||
#include "port/port.h"
|
||||
#include "util/cache2.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
|
@ -20,8 +21,10 @@ class Env;
|
|||
|
||||
class TableCache {
|
||||
public:
|
||||
TableCache(const std::string& dbname, const Options* options, int entries);
|
||||
~TableCache();
|
||||
// clean up note: file_cache is redundant to GetFileCache available from doublecache
|
||||
TableCache(const std::string& dbname, const Options* options, Cache * file_cache,
|
||||
DoubleCache & doublecache);
|
||||
virtual ~TableCache();
|
||||
|
||||
// Return an iterator for the specified file number (the corresponding
|
||||
// file length must be exactly "file_size" bytes). If "tableptr" is
|
||||
|
@ -33,6 +36,7 @@ class TableCache {
|
|||
Iterator* NewIterator(const ReadOptions& options,
|
||||
uint64_t file_number,
|
||||
uint64_t file_size,
|
||||
int level,
|
||||
Table** tableptr = NULL);
|
||||
|
||||
// If a seek to internal key "k" in specified file finds an entry,
|
||||
|
@ -40,22 +44,65 @@ class TableCache {
|
|||
Status Get(const ReadOptions& options,
|
||||
uint64_t file_number,
|
||||
uint64_t file_size,
|
||||
int level,
|
||||
const Slice& k,
|
||||
void* arg,
|
||||
void (*handle_result)(void*, const Slice&, const Slice&));
|
||||
bool (*handle_result)(void*, const Slice&, const Slice&));
|
||||
|
||||
// Evict any entry for the specified file number
|
||||
void Evict(uint64_t file_number);
|
||||
void Evict(uint64_t file_number, bool is_overlapped);
|
||||
|
||||
private:
|
||||
// Riak specific: return table statistic ONLY if table in cache, otherwise zero
|
||||
uint64_t GetStatisticValue(uint64_t file_number, unsigned Index);
|
||||
|
||||
|
||||
// access for testing tools, not for public access
|
||||
Status TEST_FindTable(uint64_t file_number, uint64_t file_size, int level, Cache::Handle** handle)
|
||||
{return( FindTable(file_number, file_size, level, handle));};
|
||||
|
||||
Cache* TEST_GetInternalCache() {return(cache_);};
|
||||
|
||||
void Release(Cache::Handle * handle) {cache_->Release(handle);};
|
||||
|
||||
// routine called if Options::cache_object_warming is true.
|
||||
// Writes list of all file names currently in file cache to disk.
|
||||
Status SaveOpenFileList();
|
||||
|
||||
// routine called if Options::cache_object_warming is true.
|
||||
// Reads file created by SaveOpenFileList() and attempts to open
|
||||
// every file.
|
||||
Status PreloadTableCache();
|
||||
|
||||
// was private, now protected to allow easy unit test overrides
|
||||
protected:
|
||||
Env* const env_;
|
||||
const std::string dbname_;
|
||||
const Options* options_;
|
||||
Cache* cache_;
|
||||
Cache * cache_;
|
||||
DoubleCache & doublecache_;
|
||||
|
||||
Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
|
||||
// virtual to enable unit test overrides
|
||||
virtual Status FindTable(uint64_t file_number, uint64_t file_size, int level,
|
||||
Cache::Handle**, bool is_compaction=false,
|
||||
bool for_iterator=false);
|
||||
};
|
||||
|
||||
|
||||
struct TableAndFile {
|
||||
RandomAccessFile* file;
|
||||
Table* table;
|
||||
DoubleCache * doublecache;
|
||||
uint64_t file_number; // saved for cache object warming
|
||||
int level; // saved for cache object warming
|
||||
volatile uint32_t user_count;
|
||||
|
||||
TableAndFile()
|
||||
: file(NULL), table(NULL), doublecache(NULL),
|
||||
file_number(0), level(0), user_count(1)
|
||||
{};
|
||||
};
|
||||
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
|
||||
|
|
|
@ -9,20 +9,6 @@
|
|||
|
||||
namespace leveldb {
|
||||
|
||||
// Tag numbers for serialized VersionEdit. These numbers are written to
|
||||
// disk and should not be changed.
|
||||
enum Tag {
|
||||
kComparator = 1,
|
||||
kLogNumber = 2,
|
||||
kNextFileNumber = 3,
|
||||
kLastSequence = 4,
|
||||
kCompactPointer = 5,
|
||||
kDeletedFile = 6,
|
||||
kNewFile = 7,
|
||||
// 8 was used for large value refs
|
||||
kPrevLogNumber = 9
|
||||
};
|
||||
|
||||
void VersionEdit::Clear() {
|
||||
comparator_.clear();
|
||||
log_number_ = 0;
|
||||
|
@ -34,11 +20,21 @@ void VersionEdit::Clear() {
|
|||
has_prev_log_number_ = false;
|
||||
has_next_file_number_ = false;
|
||||
has_last_sequence_ = false;
|
||||
has_f1_files_ = false;
|
||||
has_f2_files_ = false;
|
||||
|
||||
deleted_files_.clear();
|
||||
new_files_.clear();
|
||||
}
|
||||
|
||||
void VersionEdit::EncodeTo(std::string* dst) const {
|
||||
/**
|
||||
* EncodeTo serializes the VersionEdit object
|
||||
* to the "dst" string parameter. "format2" flag
|
||||
* indicates whether serialization should use original
|
||||
* Google format for file objects (false) or Basho's updated
|
||||
* file2 format for expiry enabled file objects (true)
|
||||
*/
|
||||
void VersionEdit::EncodeTo(std::string* dst, bool format2) const {
|
||||
if (has_comparator_) {
|
||||
PutVarint32(dst, kComparator);
|
||||
PutLengthPrefixedSlice(dst, comparator_);
|
||||
|
@ -76,12 +72,21 @@ void VersionEdit::EncodeTo(std::string* dst) const {

  for (size_t i = 0; i < new_files_.size(); i++) {
    const FileMetaData& f = new_files_[i].second;
    if (format2)
      PutVarint32(dst, kNewFile2);
    else
      PutVarint32(dst, kNewFile);
    PutVarint32(dst, new_files_[i].first);  // level
    PutVarint64(dst, f.number);
    PutVarint64(dst, f.file_size);
    PutLengthPrefixedSlice(dst, f.smallest.Encode());
    PutLengthPrefixedSlice(dst, f.largest.Encode());
    if (format2)
    {
      PutVarint64(dst, f.exp_write_low);
      PutVarint64(dst, f.exp_write_high);
      PutVarint64(dst, f.exp_explicit_high);
    }
  }
}

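For reference, the byte layout of one format-2 file entry is simply the field order above, each piece varint- or length-prefix-encoded. A self-contained sketch that assembles such a record with a minimal varint writer; the encoder here is a stand-in for util/coding.h and the key strings stand in for InternalKey::Encode() output, so only the framing is meant to match:

    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Minimal LEB128-style varint writers, same wire format as util/coding.h.
    static void PutVarint64(std::string* dst, uint64_t v) {
      while (v >= 0x80) {
        dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
        v >>= 7;
      }
      dst->push_back(static_cast<char>(v));
    }
    static void PutVarint32(std::string* dst, uint32_t v) { PutVarint64(dst, v); }
    static void PutLengthPrefixedSlice(std::string* dst, const std::string& s) {
      PutVarint32(dst, static_cast<uint32_t>(s.size()));
      dst->append(s);
    }

    int main() {
      const uint32_t kNewFile2 = 11;   // tag value from the enum in version_edit.h
      std::string rec;
      PutVarint32(&rec, kNewFile2);    // tag
      PutVarint32(&rec, 3);            // level
      PutVarint64(&rec, 12345);        // file number
      PutVarint64(&rec, 2 << 20);      // file size
      PutLengthPrefixedSlice(&rec, "smallest-internal-key");
      PutLengthPrefixedSlice(&rec, "largest-internal-key");
      PutVarint64(&rec, 10203040);     // exp_write_low
      PutVarint64(&rec, 123456789);    // exp_write_high
      PutVarint64(&rec, 987654321);    // exp_explicit_high
      std::printf("encoded %zu bytes\n", rec.size());
      return 0;
    }
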
@ -98,7 +103,7 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) {
|
|||
static bool GetLevel(Slice* input, int* level) {
|
||||
uint32_t v;
|
||||
if (GetVarint32(input, &v) &&
|
||||
v < config::kNumLevels) {
|
||||
v < (unsigned)config::kNumLevels) {
|
||||
*level = v;
|
||||
return true;
|
||||
} else {
|
||||
|
@ -185,13 +190,34 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
|
|||
GetVarint64(&input, &f.number) &&
|
||||
GetVarint64(&input, &f.file_size) &&
|
||||
GetInternalKey(&input, &f.smallest) &&
|
||||
GetInternalKey(&input, &f.largest)) {
|
||||
GetInternalKey(&input, &f.largest))
|
||||
{
|
||||
has_f1_files_ = true;
|
||||
f.level=level;
|
||||
new_files_.push_back(std::make_pair(level, f));
|
||||
} else {
|
||||
msg = "new-file entry";
|
||||
}
|
||||
break;
|
||||
|
||||
case kNewFile2:
|
||||
if (GetLevel(&input, &level) &&
|
||||
GetVarint64(&input, &f.number) &&
|
||||
GetVarint64(&input, &f.file_size) &&
|
||||
GetInternalKey(&input, &f.smallest) &&
|
||||
GetInternalKey(&input, &f.largest) &&
|
||||
GetVarint64(&input, &f.exp_write_low) &&
|
||||
GetVarint64(&input, &f.exp_write_high) &&
|
||||
GetVarint64(&input, &f.exp_explicit_high))
|
||||
{
|
||||
has_f2_files_ = true;
|
||||
f.level=level;
|
||||
new_files_.push_back(std::make_pair(level, f));
|
||||
} else {
|
||||
msg = "new-file2 entry";
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
msg = "unknown tag";
|
||||
break;
|
||||
|
@ -258,6 +284,12 @@ std::string VersionEdit::DebugString() const {
|
|||
r.append(f.smallest.DebugString());
|
||||
r.append(" .. ");
|
||||
r.append(f.largest.DebugString());
|
||||
r.append(" ");
|
||||
AppendNumberTo(&r, f.exp_write_low);
|
||||
r.append(" ");
|
||||
AppendNumberTo(&r, f.exp_write_high);
|
||||
r.append(" ");
|
||||
AppendNumberTo(&r, f.exp_explicit_high);
|
||||
}
|
||||
r.append("\n}\n");
|
||||
return r;
|
||||
|
|
|
@ -16,15 +16,41 @@ class VersionSet;
|
|||
|
||||
struct FileMetaData {
|
||||
int refs;
|
||||
int allowed_seeks; // Seeks allowed until compaction
|
||||
// int allowed_seeks; // Seeks allowed until compaction
|
||||
uint64_t number;
|
||||
uint64_t file_size; // File size in bytes
|
||||
uint64_t num_entries; // count of values in .sst file, only valid during table build
|
||||
InternalKey smallest; // Smallest internal key served by table
|
||||
InternalKey largest; // Largest internal key served by table
|
||||
int level;
|
||||
ExpiryTimeMicros exp_write_low; // oldest write time in file:
|
||||
// 0 - non-expiry keys exist too
|
||||
// ULLONG_MAX - no write time expiry & no plain keys
|
||||
ExpiryTimeMicros exp_write_high; // most recent write time in file
|
||||
ExpiryTimeMicros exp_explicit_high; // most recent/furthest into future explicit expiry
|
||||
|
||||
FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { }
|
||||
FileMetaData()
|
||||
: refs(0), /*allowed_seeks(1 << 30),*/ file_size(0),
|
||||
num_entries(0), level(-1), exp_write_low(0), exp_write_high(0), exp_explicit_high(0)
|
||||
{ }
|
||||
};
|
||||
|
||||
|
||||
class FileMetaDataPtrCompare
|
||||
{
|
||||
protected:
|
||||
const Comparator * comparator_;
|
||||
|
||||
public:
|
||||
explicit FileMetaDataPtrCompare(const Comparator * Comparer)
|
||||
: comparator_(Comparer) {};
|
||||
|
||||
bool operator() (const FileMetaData * file1, const FileMetaData * file2) const
|
||||
{
|
||||
return(comparator_->Compare(file1->smallest.user_key(), file2->smallest.user_key()) < 0);
|
||||
}
|
||||
}; // class FileMetaDataPtrCompare
|
||||
|
||||
class VersionEdit {
|
||||
public:
|
||||
VersionEdit() { Clear(); }
|
||||
|
@ -59,6 +85,7 @@ class VersionEdit {
|
|||
// Add the specified file at the specified number.
|
||||
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
|
||||
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
|
||||
#if 0
|
||||
void AddFile(int level, uint64_t file,
|
||||
uint64_t file_size,
|
||||
const InternalKey& smallest,
|
||||
|
@ -68,6 +95,27 @@ class VersionEdit {
|
|||
f.file_size = file_size;
|
||||
f.smallest = smallest;
|
||||
f.largest = largest;
|
||||
f.level = level;
|
||||
new_files_.push_back(std::make_pair(level, f));
|
||||
}
|
||||
#endif
|
||||
|
||||
void AddFile2(int level, uint64_t file,
|
||||
uint64_t file_size,
|
||||
const InternalKey& smallest,
|
||||
const InternalKey& largest,
|
||||
uint64_t exp_write_low,
|
||||
uint64_t exp_write_high,
|
||||
uint64_t exp_explicit_high) {
|
||||
FileMetaData f;
|
||||
f.number = file;
|
||||
f.file_size = file_size;
|
||||
f.smallest = smallest;
|
||||
f.largest = largest;
|
||||
f.level = level;
|
||||
f.exp_write_low = exp_write_low;
|
||||
f.exp_write_high = exp_write_high;
|
||||
f.exp_explicit_high = exp_explicit_high;
|
||||
new_files_.push_back(std::make_pair(level, f));
|
||||
}
|
||||
|
||||
|
@ -75,16 +123,37 @@ class VersionEdit {
|
|||
void DeleteFile(int level, uint64_t file) {
|
||||
deleted_files_.insert(std::make_pair(level, file));
|
||||
}
|
||||
size_t DeletedFileCount() const {return(deleted_files_.size());};
|
||||
|
||||
void EncodeTo(std::string* dst) const;
|
||||
void EncodeTo(std::string* dst, bool format2=true) const;
|
||||
Status DecodeFrom(const Slice& src);
|
||||
|
||||
// unit test access to validate file entries' format types
|
||||
bool HasF1Files() const {return(has_f1_files_);};
|
||||
bool HasF2Files() const {return(has_f2_files_);};
|
||||
|
||||
std::string DebugString() const;
|
||||
|
||||
// Tag numbers for serialized VersionEdit. These numbers are written to
|
||||
// disk and should not be changed.
|
||||
enum Tag {
|
||||
kComparator = 1,
|
||||
kLogNumber = 2,
|
||||
kNextFileNumber = 3,
|
||||
kLastSequence = 4,
|
||||
kCompactPointer = 5,
|
||||
kDeletedFile = 6,
|
||||
kNewFile = 7,
|
||||
// 8 was used for large value refs
|
||||
kPrevLogNumber = 9,
|
||||
kFileCacheObject = 10,
|
||||
kNewFile2 = 11 // expiry capable file
|
||||
};
|
||||
|
||||
private:
|
||||
friend class VersionSet;
|
||||
|
||||
typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
|
||||
USED_BY_NESTED_FRIEND2(typedef std::set< std::pair<int, uint64_t> > DeletedFileSet)
|
||||
|
||||
std::string comparator_;
|
||||
uint64_t log_number_;
|
||||
|
@ -96,10 +165,13 @@ class VersionEdit {
|
|||
bool has_prev_log_number_;
|
||||
bool has_next_file_number_;
|
||||
bool has_last_sequence_;
|
||||
// following should be mutually exclusive, but tested independently to be sure
|
||||
bool has_f1_files_; // manifest uses format 1 (for unit tests)
|
||||
bool has_f2_files_; // manifest uses format 2 (for unit tests)
|
||||
|
||||
std::vector< std::pair<int, InternalKey> > compact_pointers_;
|
||||
DeletedFileSet deleted_files_;
|
||||
std::vector< std::pair<int, FileMetaData> > new_files_;
|
||||
USED_BY_NESTED_FRIEND2(std::vector< std::pair<int, InternalKey> > compact_pointers_)
|
||||
USED_BY_NESTED_FRIEND(DeletedFileSet deleted_files_)
|
||||
USED_BY_NESTED_FRIEND2(std::vector< std::pair<int, FileMetaData> > new_files_)
|
||||
};
|
||||
|
||||
} // namespace leveldb
|
||||
|
|
|
@ -7,14 +7,22 @@
|
|||
|
||||
namespace leveldb {
|
||||
|
||||
static void TestEncodeDecode(const VersionEdit& edit) {
|
||||
static void TestEncodeDecode(
|
||||
const VersionEdit& edit,
|
||||
bool format2=false) {
|
||||
std::string encoded, encoded2;
|
||||
edit.EncodeTo(&encoded);
|
||||
edit.EncodeTo(&encoded,format2);
|
||||
VersionEdit parsed;
|
||||
Status s = parsed.DecodeFrom(encoded);
|
||||
ASSERT_TRUE(s.ok()) << s.ToString();
|
||||
parsed.EncodeTo(&encoded2);
|
||||
parsed.EncodeTo(&encoded2,format2);
|
||||
ASSERT_EQ(encoded, encoded2);
|
||||
|
||||
if (parsed.HasF1Files() || parsed.HasF2Files())
|
||||
{
|
||||
ASSERT_EQ(parsed.HasF1Files(), !format2);
|
||||
ASSERT_EQ(parsed.HasF2Files(), format2);
|
||||
} // if
|
||||
}
|
||||
|
||||
class VersionEditTest { };
|
||||
|
@ -25,11 +33,12 @@ TEST(VersionEditTest, EncodeDecode) {
|
|||
VersionEdit edit;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
TestEncodeDecode(edit);
|
||||
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
|
||||
InternalKey("foo", kBig + 500 + i, kTypeValue),
|
||||
InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
|
||||
edit.AddFile2(3, kBig + 300 + i, kBig + 400 + i,
|
||||
InternalKey("foo", 0, kBig + 500 + i, kTypeValue),
|
||||
InternalKey("zoo", 0, kBig + 600 + i, kTypeDeletion),
|
||||
0,0,0);
|
||||
edit.DeleteFile(4, kBig + 700 + i);
|
||||
edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
|
||||
edit.SetCompactPointer(i, InternalKey("x", 0, kBig + 900 + i, kTypeValue));
|
||||
}
|
||||
|
||||
edit.SetComparatorName("foo");
|
||||
|
@ -39,6 +48,29 @@ TEST(VersionEditTest, EncodeDecode) {
|
|||
TestEncodeDecode(edit);
|
||||
}
|
||||
|
||||
TEST(VersionEditTest, EncodeDecodeExpiry) {
|
||||
static const uint64_t kBig = 1ull << 25;
|
||||
|
||||
VersionEdit edit;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
TestEncodeDecode(edit, false); // only testing for s.ok()
|
||||
edit.AddFile2(3, kBig + 300 + i, kBig + 400 + i,
|
||||
InternalKey("foo", 700+i, kBig + 500 + i, kTypeValueExplicitExpiry),
|
||||
InternalKey("zoo", 800+i, kBig + 600 + i, kTypeDeletion),
|
||||
10203040,
|
||||
123456789,
|
||||
987654321);
|
||||
edit.DeleteFile(4, kBig + 700 + i);
|
||||
edit.SetCompactPointer(i, InternalKey("x", 0, kBig + 900 + i, kTypeValue));
|
||||
}
|
||||
|
||||
edit.SetComparatorName("foo");
|
||||
edit.SetLogNumber(kBig + 100);
|
||||
edit.SetNextFile(kBig + 200);
|
||||
edit.SetLastSequence(kBig + 1000);
|
||||
TestEncodeDecode(edit, true);
|
||||
}
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -21,7 +21,9 @@
|
|||
#include "db/dbformat.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "port/port.h"
|
||||
#include "port/thread_annotations.h"
|
||||
#include "leveldb/atomics.h"
|
||||
#include "leveldb/env.h"
|
||||
#include "util/throttle.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
|
@ -70,7 +72,7 @@ class Version {
|
|||
FileMetaData* seek_file;
|
||||
int seek_file_level;
|
||||
};
|
||||
Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
|
||||
Status Get(const ReadOptions&, const LookupKey& key, Value* val,
|
||||
GetStats* stats);
|
||||
|
||||
// Adds "stats" into the current state. Returns true if a new
|
||||
|
@ -78,12 +80,6 @@ class Version {
|
|||
// REQUIRES: lock is held
|
||||
bool UpdateStats(const GetStats& stats);
|
||||
|
||||
// Record a sample of bytes read at the specified internal key.
|
||||
// Samples are taken approximately once every config::kReadBytesPeriod
|
||||
// bytes. Returns true if a new compaction may need to be triggered.
|
||||
// REQUIRES: lock is held
|
||||
bool RecordReadSample(Slice key);
|
||||
|
||||
// Reference count management (so Versions do not disappear out from
|
||||
// under live iterators)
|
||||
void Ref();
|
||||
|
@ -101,43 +97,47 @@ class Version {
|
|||
// largest_user_key==NULL represents a key larger than all keys in the DB.
|
||||
bool OverlapInLevel(int level,
|
||||
const Slice* smallest_user_key,
|
||||
const Slice* largest_user_key);
|
||||
const Slice* largest_user_key) const;
|
||||
|
||||
// Return the level at which we should place a new memtable compaction
|
||||
// result that covers the range [smallest_user_key,largest_user_key].
|
||||
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
|
||||
const Slice& largest_user_key);
|
||||
const Slice& largest_user_key,
|
||||
const int level_limit);
|
||||
|
||||
int NumFiles(int level) const { return files_[level].size(); }
|
||||
virtual size_t NumFiles(int level) const { return files_[level].size(); }
|
||||
|
||||
const VersionSet * GetVersionSet() const { return vset_; }
|
||||
|
||||
typedef std::vector<FileMetaData*> FileMetaDataVector_t;
|
||||
|
||||
virtual const std::vector<FileMetaData*> & GetFileList(int level) const {return files_[level];};
|
||||
|
||||
volatile int WritePenalty() const {return write_penalty_; }
|
||||
|
||||
// Riak specific repair routine
|
||||
bool VerifyLevels(int & level, InternalKey & begin, InternalKey & end);
|
||||
|
||||
// Return a human readable string that describes this version's contents.
|
||||
std::string DebugString() const;
|
||||
|
||||
private:
|
||||
protected:
|
||||
friend class Compaction;
|
||||
friend class VersionSet;
|
||||
|
||||
class LevelFileNumIterator;
|
||||
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
|
||||
|
||||
// Call func(arg, level, f) for every file that overlaps user_key in
|
||||
// order from newest to oldest. If an invocation of func returns
|
||||
// false, makes no more calls.
|
||||
//
|
||||
// REQUIRES: user portion of internal_key == user_key.
|
||||
void ForEachOverlapping(Slice user_key, Slice internal_key,
|
||||
void* arg,
|
||||
bool (*func)(void*, int, FileMetaData*));
|
||||
|
||||
VersionSet* vset_; // VersionSet to which this Version belongs
|
||||
Version* next_; // Next version in linked list
|
||||
Version* prev_; // Previous version in linked list
|
||||
int refs_; // Number of live refs to this version
|
||||
|
||||
// List of files per level
|
||||
std::vector<FileMetaData*> files_[config::kNumLevels];
|
||||
USED_BY_NESTED_FRIEND(std::vector<FileMetaData*> files_[config::kNumLevels];)
|
||||
|
||||
// Next file to compact based on seek stats.
|
||||
protected:
|
||||
// Next file to compact based on seek stats (or Riak delete test)
|
||||
FileMetaData* file_to_compact_;
|
||||
int file_to_compact_level_;
|
||||
|
||||
|
@ -146,17 +146,29 @@ class Version {
|
|||
// are initialized by Finalize().
|
||||
double compaction_score_;
|
||||
int compaction_level_;
|
||||
bool compaction_grooming_;
|
||||
bool compaction_no_move_;
|
||||
bool compaction_expirefile_;
|
||||
volatile int write_penalty_;
|
||||
|
||||
protected:
|
||||
// make the ctor/dtor protected, so that a unit test can subclass
|
||||
explicit Version(VersionSet* vset)
|
||||
: vset_(vset), next_(this), prev_(this), refs_(0),
|
||||
file_to_compact_(NULL),
|
||||
file_to_compact_level_(-1),
|
||||
compaction_score_(-1),
|
||||
compaction_level_(-1) {
|
||||
compaction_level_(-1),
|
||||
compaction_grooming_(false),
|
||||
compaction_no_move_(false),
|
||||
compaction_expirefile_(false),
|
||||
write_penalty_(0)
|
||||
{
|
||||
}
|
||||
|
||||
~Version();
|
||||
virtual ~Version();
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
Version(const Version&);
|
||||
void operator=(const Version&);
|
||||
|
@ -175,11 +187,10 @@ class VersionSet {
|
|||
// current version. Will release *mu while actually writing to the file.
|
||||
// REQUIRES: *mu is held on entry.
|
||||
// REQUIRES: no other thread concurrently calls LogAndApply()
|
||||
Status LogAndApply(VersionEdit* edit, port::Mutex* mu)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(mu);
|
||||
Status LogAndApply(VersionEdit* edit, port::Mutex* mu);
|
||||
|
||||
// Recover the last saved descriptor from persistent storage.
|
||||
Status Recover(bool *save_manifest);
|
||||
Status Recover();
|
||||
|
||||
// Return the current version.
|
||||
Version* current() const { return current_; }
|
||||
|
@ -188,19 +199,29 @@ class VersionSet {
|
|||
  uint64_t ManifestFileNumber() const { return manifest_file_number_; }

  // Allocate and return a new file number
  uint64_t NewFileNumber() { return next_file_number_++; }
  // (-1 is to "duplicate" old post-increment logic while maintaining
  // some threading integrity ... next_file_number_ used naked a bunch)
  uint64_t NewFileNumber() { return(inc_and_fetch(&next_file_number_) -1); }
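The -1 is only about return-value convention: the old code handed back the value before the increment, while inc_and_fetch returns the value after it. With std::atomic the two equivalent forms look like the sketch below (the leveldb build itself predates C++11, so this is only to show the arithmetic; the counter's starting value is arbitrary):

    #include <atomic>
    #include <cstdint>

    std::atomic<uint64_t> next_file_number(2);

    // Both return the pre-increment value and leave the counter bumped by one.
    uint64_t NewFileNumberA() { return next_file_number.fetch_add(1); }  // old value directly
    uint64_t NewFileNumberB() { return ++next_file_number - 1; }         // new value minus one
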
|
||||
// Arrange to reuse "file_number" unless a newer file number has
|
||||
// already been allocated.
|
||||
// REQUIRES: "file_number" was returned by a call to NewFileNumber().
|
||||
// (disabled due to threading concerns ... and desire NOT to use mutex, matthewv)
|
||||
void ReuseFileNumber(uint64_t file_number) {
|
||||
if (next_file_number_ == file_number + 1) {
|
||||
next_file_number_ = file_number;
|
||||
}
|
||||
// if (next_file_number_ == file_number + 1) {
|
||||
// next_file_number_ = file_number;
|
||||
// }
|
||||
}
|
||||
|
||||
// Return the number of Table files at the specified level.
|
||||
int NumLevelFiles(int level) const;
|
||||
size_t NumLevelFiles(int level) const;
|
||||
|
||||
// is the specified level overlapped (or if false->sorted)
|
||||
static bool IsLevelOverlapped(int level);
|
||||
|
||||
static uint64_t DesiredBytesForLevel(int level);
|
||||
static uint64_t MaxBytesForLevel(int level);
|
||||
static uint64_t MaxFileSizeForLevel(int level);
|
||||
|
||||
// Return the combined file size of all files at the specified level.
|
||||
int64_t NumLevelBytes(int level) const;
|
||||
|
@ -224,11 +245,36 @@ class VersionSet {
|
|||
// being compacted, or zero if there is no such log file.
|
||||
uint64_t PrevLogNumber() const { return prev_log_number_; }
|
||||
|
||||
  int WriteThrottleUsec(bool active_compaction)
  {
    uint64_t penalty, throttle;
    int ret_val;

    penalty=current_->write_penalty_;
    throttle=GetThrottleWriteRate();

    ret_val=0;
    if (0==penalty && 1!=throttle)
      ret_val=(int)throttle;
    else if (0!=penalty)
    {
      if (1==throttle)
        throttle=GetUnadjustedThrottleWriteRate();
      ret_val=(int)penalty * throttle;
    } // else if

    return(ret_val);
  }

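A quick worked example of the branches above, with all numbers invented: no per-version penalty plus a global throttle of 500 usec yields a 500 usec delay per write; a penalty of 3 while the global throttle is idle (== 1) yields 3 times the unadjusted base rate; zero penalty with an idle throttle yields no delay. A stand-alone rendering of just that branch logic:

    #include <cstdint>
    #include <cstdio>

    // Same decision tree as WriteThrottleUsec() above; the 250-usec
    // "unadjusted" base rate is made up for the example.
    static int ThrottleUsec(uint64_t penalty, uint64_t throttle, uint64_t unadjusted) {
      if (penalty == 0 && throttle != 1) return static_cast<int>(throttle);
      if (penalty != 0) {
        if (throttle == 1) throttle = unadjusted;
        return static_cast<int>(penalty * throttle);
      }
      return 0;
    }

    int main() {
      std::printf("%d\n", ThrottleUsec(0, 500, 250));  // 500: global throttle only
      std::printf("%d\n", ThrottleUsec(3, 1, 250));    // 750: penalty times unadjusted rate
      std::printf("%d\n", ThrottleUsec(0, 1, 250));    // 0: nothing to wait for
      return 0;
    }
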
|
||||
|
||||
// Pick level and inputs for a new compaction.
|
||||
// Returns NULL if there is no compaction to be done.
|
||||
// Otherwise returns a pointer to a heap-allocated object that
|
||||
// describes the compaction. Caller should delete the result.
|
||||
Compaction* PickCompaction();
|
||||
//
|
||||
// Riak October 2013: Pick Compaction now posts work directly
|
||||
// to hot_thread pools
|
||||
void PickCompaction(class DBImpl * db_impl);
|
||||
|
||||
// Return a compaction object for compacting the range [begin,end] in
|
||||
// the specified level. Returns NULL if there is nothing in that
|
||||
|
@ -267,16 +313,42 @@ class VersionSet {
|
|||
char buffer[100];
|
||||
};
|
||||
const char* LevelSummary(LevelSummaryStorage* scratch) const;
|
||||
const char* CompactionSummary(LevelSummaryStorage* scratch) const;
|
||||
|
||||
private:
|
||||
TableCache* GetTableCache() {return(table_cache_);};
|
||||
|
||||
const Options * GetOptions() const {return(options_);};
|
||||
|
||||
bool IsCompactionSubmitted(int level)
|
||||
{return(m_CompactionStatus[level].m_Submitted);}
|
||||
|
||||
void SetCompactionSubmitted(int level)
|
||||
{m_CompactionStatus[level].m_Submitted=true;}
|
||||
|
||||
void SetCompactionRunning(int level)
|
||||
{m_CompactionStatus[level].m_Running=true;}
|
||||
|
||||
void SetCompactionDone(int level, uint64_t Now)
|
||||
{ m_CompactionStatus[level].m_Running=false;
|
||||
m_CompactionStatus[level].m_Submitted=false;
|
||||
// must set both source and destination. otherwise
|
||||
// destination might immediately decide it needs a
|
||||
// timed grooming too ... defeating idea to spreadout the groomings
|
||||
m_CompactionStatus[level].m_LastCompaction=Now;
|
||||
if ((level+1)<config::kNumLevels)
|
||||
m_CompactionStatus[level+1].m_LastCompaction=Now;
|
||||
}
|
||||
|
||||
bool NeighborCompactionsQuiet(int level);
|
||||
|
||||
protected:
|
||||
class Builder;
|
||||
|
||||
friend class Compaction;
|
||||
friend class Version;
|
||||
|
||||
bool ReuseManifest(const std::string& dscname, const std::string& dscbase);
|
||||
|
||||
void Finalize(Version* v);
|
||||
bool Finalize(Version* v);
|
||||
void UpdatePenalty(Version *v);
|
||||
|
||||
void GetRange(const std::vector<FileMetaData*>& inputs,
|
||||
InternalKey* smallest,
|
||||
|
@ -299,7 +371,7 @@ class VersionSet {
|
|||
const Options* const options_;
|
||||
TableCache* const table_cache_;
|
||||
const InternalKeyComparator icmp_;
|
||||
uint64_t next_file_number_;
|
||||
volatile uint64_t next_file_number_;
|
||||
uint64_t manifest_file_number_;
|
||||
uint64_t last_sequence_;
|
||||
uint64_t log_number_;
|
||||
|
@ -315,11 +387,44 @@ class VersionSet {
|
|||
// Either an empty string, or a valid InternalKey.
|
||||
std::string compact_pointer_[config::kNumLevels];
|
||||
|
||||
// Riak allows multiple compaction threads, this mutex allows
|
||||
// only one to write to manifest at a time. Only used in LogAndApply
|
||||
port::Mutex manifest_mutex_;
|
||||
|
||||
volatile uint64_t last_penalty_minutes_;
|
||||
volatile int prev_write_penalty_;
|
||||
|
||||
|
||||
|
||||
struct CompactionStatus_s
|
||||
{
|
||||
bool m_Submitted; //!< level submitted to hot thread pool
|
||||
bool m_Running; //!< thread actually running compaction
|
||||
uint64_t m_LastCompaction; //!<NowMicros() when last compaction completed
|
||||
|
||||
CompactionStatus_s()
|
||||
: m_Submitted(false), m_Running(false), m_LastCompaction(0)
|
||||
{};
|
||||
} m_CompactionStatus[config::kNumLevels];
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
VersionSet(const VersionSet&);
|
||||
void operator=(const VersionSet&);
|
||||
};
|
||||
|
||||
//
|
||||
// allows routing of compaction request to
|
||||
// diverse processing routines via common
|
||||
// BackgroundCall2 thread entry
|
||||
//
|
||||
enum CompactionType
|
||||
{
|
||||
kNormalCompaction = 0x0,
|
||||
kExpiryFileCompaction = 0x1
|
||||
}; // CompactionType
|
||||
|
||||
|
||||
// A Compaction encapsulates information about a compaction.
|
||||
class Compaction {
|
||||
public:
|
||||
|
@ -329,6 +434,9 @@ class Compaction {
|
|||
// and "level+1" will be merged to produce a set of "level+1" files.
|
||||
int level() const { return level_; }
|
||||
|
||||
// Return parent Version object
|
||||
const Version * version() const { return input_version_; }
|
||||
|
||||
// Return the object that holds the edits to the descriptor done
|
||||
// by this compaction.
|
||||
VersionEdit* edit() { return &edit_; }
|
||||
|
@ -356,32 +464,47 @@ class Compaction {
|
|||
|
||||
// Returns true iff we should stop building the current output
|
||||
// before processing "internal_key".
|
||||
bool ShouldStopBefore(const Slice& internal_key);
|
||||
bool ShouldStopBefore(const Slice& internal_key, size_t key_count);
|
||||
|
||||
// Release the input version for the compaction, once the compaction
|
||||
// is successful.
|
||||
void ReleaseInputs();
|
||||
|
||||
// Riak specific: get summary statistics from compaction inputs
|
||||
void CalcInputStats(TableCache & tables);
|
||||
size_t TotalUserDataSize() const {return(tot_user_data_);};
|
||||
size_t TotalIndexKeys() const {return(tot_index_keys_);};
|
||||
size_t AverageValueSize() const {return(avg_value_size_);};
|
||||
size_t AverageKeySize() const {return(avg_key_size_);};
|
||||
size_t AverageBlockSize() const {return(avg_block_size_);};
|
||||
bool IsCompressible() const {return(compressible_);};
|
||||
|
||||
// Riak specific: is move operation ok for compaction?
|
||||
bool IsMoveOk() const {return(!no_move_);};
|
||||
|
||||
enum CompactionType GetCompactionType() const {return(compaction_type_);};
|
||||
|
||||
private:
|
||||
friend class Version;
|
||||
friend class VersionSet;
|
||||
|
||||
Compaction(const Options* options, int level);
|
||||
explicit Compaction(int level);
|
||||
|
||||
int level_;
|
||||
uint64_t max_output_file_size_;
|
||||
Version* input_version_;
|
||||
VersionEdit edit_;
|
||||
CompactionType compaction_type_;
|
||||
|
||||
// Each compaction reads inputs from "level_" and "level_+1"
|
||||
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
|
||||
|
||||
// State used to check for number of overlapping grandparent files
|
||||
// State used to check for number of of overlapping grandparent files
|
||||
// (parent == level_ + 1, grandparent == level_ + 2)
|
||||
std::vector<FileMetaData*> grandparents_;
|
||||
size_t grandparent_index_; // Index in grandparent_starts_
|
||||
bool seen_key_; // Some output key has been seen
|
||||
int64_t overlapped_bytes_; // Bytes of overlap between current output
|
||||
uint64_t overlapped_bytes_; // Bytes of overlap between current output
|
||||
// and grandparent files
|
||||
|
||||
// State for implementing IsBaseLevelForKey
|
||||
|
@ -391,6 +514,16 @@ class Compaction {
|
|||
// higher level than the ones involved in this compaction (i.e. for
|
||||
// all L >= level_ + 2).
|
||||
size_t level_ptrs_[config::kNumLevels];
|
||||
|
||||
// Riak specific: output statistics from CalcInputStats
|
||||
size_t tot_user_data_;
|
||||
size_t tot_index_keys_;
|
||||
size_t avg_value_size_;
|
||||
size_t avg_key_size_;
|
||||
size_t avg_block_size_;
|
||||
bool compressible_;
|
||||
bool stats_done_;
|
||||
bool no_move_;
|
||||
};
|
||||
|
||||
} // namespace leveldb
|
||||
|
|
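The Riak-specific accessors declared above (CalcInputStats, the Total*/Average* getters, IsCompressible, IsMoveOk, GetCompactionType) give the scheduler a cheap look at the inputs before any merge work starts. A minimal sketch of how a caller might use them; the RunMove/RunMerge/RunExpiryCompaction helpers and the compression heuristic are illustrative assumptions, not code from this branch:

```c++
#include "db/version_set.h"      // Compaction (internal header)
#include "db/table_cache.h"
#include "leveldb/options.h"

// Hypothetical worker entry points, not part of this diff:
void RunMove(leveldb::Compaction* c);
void RunMerge(leveldb::Compaction* c, const leveldb::Options& opts);
void RunExpiryCompaction(leveldb::Compaction* c, const leveldb::Options& opts);

// Sketch only: "c" is a Compaction picked by VersionSet, "tables" the open TableCache.
void DispatchCompaction(leveldb::Compaction* c, leveldb::TableCache& tables) {
  c->CalcInputStats(tables);                 // summarize keys/values in the inputs

  if (c->IsMoveOk()) {
    RunMove(c);                              // single-file move, no merge needed
    return;
  }

  leveldb::Options opts;
  // Heuristic: skip the compressor when the inputs did not compress well.
  opts.compression = c->IsCompressible() ? leveldb::kSnappyCompression
                                         : leveldb::kNoCompression;

  if (c->GetCompactionType() == leveldb::kExpiryFileCompaction)
    RunExpiryCompaction(c, opts);            // whole-file expiry path
  else
    RunMerge(c, opts);                       // normal merge path
}
```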
|
@ -27,13 +27,13 @@ class FindFileTest {
|
|||
SequenceNumber largest_seq = 100) {
|
||||
FileMetaData* f = new FileMetaData;
|
||||
f->number = files_.size() + 1;
|
||||
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
|
||||
f->largest = InternalKey(largest, largest_seq, kTypeValue);
|
||||
f->smallest = InternalKey(smallest, 0, smallest_seq, kTypeValue);
|
||||
f->largest = InternalKey(largest, 0, largest_seq, kTypeValue);
|
||||
files_.push_back(f);
|
||||
}
|
||||
|
||||
int Find(const char* key) {
|
||||
InternalKey target(key, 100, kTypeValue);
|
||||
InternalKey target(key, 0, 100, kTypeValue);
|
||||
InternalKeyComparator cmp(BytewiseComparator());
|
||||
return FindFile(cmp, files_, target.Encode());
|
||||
}
|
||||
|
|
|
@ -13,13 +13,17 @@
|
|||
// len: varint32
|
||||
// data: uint8[len]
|
||||
|
||||
#include "leveldb/write_batch.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#include "leveldb/db.h"
|
||||
#include "leveldb/env.h"
|
||||
#include "leveldb/expiry.h"
|
||||
#include "leveldb/write_batch.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/memtable.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/throttle.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
|
@ -47,16 +51,17 @@ Status WriteBatch::Iterate(Handler* handler) const {
|
|||
|
||||
input.remove_prefix(kHeader);
|
||||
Slice key, value;
|
||||
ExpiryTimeMicros expiry;
|
||||
int found = 0;
|
||||
while (!input.empty()) {
|
||||
found++;
|
||||
char tag = input[0];
|
||||
ValueType tag = (ValueType)input[0];
|
||||
input.remove_prefix(1);
|
||||
switch (tag) {
|
||||
case kTypeValue:
|
||||
if (GetLengthPrefixedSlice(&input, &key) &&
|
||||
GetLengthPrefixedSlice(&input, &value)) {
|
||||
handler->Put(key, value);
|
||||
handler->Put(key, value, kTypeValue, 0);
|
||||
} else {
|
||||
return Status::Corruption("bad WriteBatch Put");
|
||||
}
|
||||
|
@ -68,6 +73,16 @@ Status WriteBatch::Iterate(Handler* handler) const {
|
|||
return Status::Corruption("bad WriteBatch Delete");
|
||||
}
|
||||
break;
|
||||
      case kTypeValueWriteTime:
      case kTypeValueExplicitExpiry:
        if (GetLengthPrefixedSlice(&input, &key) &&
            GetVarint64(&input, &expiry) &&
            GetLengthPrefixedSlice(&input, &value)) {
          handler->Put(key, value, tag, expiry);
        } else {
          return Status::Corruption("bad WriteBatch Expiry");
        }
        break;
|
||||
default:
|
||||
return Status::Corruption("unknown WriteBatch tag");
|
||||
}
|
||||
|
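For orientation, the expiry-tagged records parsed above use the same length-prefixed framing as stock kTypeValue records, with a varint64 expiry inserted between the key and the value (the encoder side appears in the WriteBatch::Put hunk below). A hedged sketch of that framing using the util/coding.h helpers; AppendExpiryRecord is an illustrative name, not a function in this diff:

```c++
#include <stdint.h>
#include <string>
#include "db/dbformat.h"     // ValueType, kTypeValueWriteTime, kTypeValueExplicitExpiry
#include "leveldb/slice.h"
#include "util/coding.h"     // PutLengthPrefixedSlice, PutVarint64

// One expiry-tagged batch entry:
//   tag    : 1 byte  (kTypeValueWriteTime or kTypeValueExplicitExpiry)
//   key    : varint32 length + bytes
//   expiry : varint64 (microseconds)
//   value  : varint32 length + bytes
void AppendExpiryRecord(std::string* rep, leveldb::ValueType tag,
                        const leveldb::Slice& key, uint64_t expiry_us,
                        const leveldb::Slice& value) {
  rep->push_back(static_cast<char>(tag));
  leveldb::PutLengthPrefixedSlice(rep, key);
  leveldb::PutVarint64(rep, expiry_us);
  leveldb::PutLengthPrefixedSlice(rep, value);
}
```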
@ -95,10 +110,20 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
|
|||
EncodeFixed64(&b->rep_[0], seq);
|
||||
}
|
||||
|
||||
void WriteBatch::Put(const Slice& key, const Slice& value) {
|
||||
void WriteBatch::Put(const Slice& key, const Slice& value, const KeyMetaData * meta) {
|
||||
KeyMetaData local_meta;
|
||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
|
||||
rep_.push_back(static_cast<char>(kTypeValue));
|
||||
if (NULL!=meta)
|
||||
local_meta=*meta;
|
||||
rep_.push_back(static_cast<char>(local_meta.m_Type));
|
||||
PutLengthPrefixedSlice(&rep_, key);
|
||||
if (kTypeValueExplicitExpiry==local_meta.m_Type
|
||||
|| kTypeValueWriteTime==local_meta.m_Type)
|
||||
{
|
||||
if (kTypeValueWriteTime==local_meta.m_Type && 0==local_meta.m_Expiry)
|
||||
local_meta.m_Expiry=GetCachedTimeMicros();
|
||||
PutVarint64(&rep_, local_meta.m_Expiry);
|
||||
} // if
|
||||
PutLengthPrefixedSlice(&rep_, value);
|
||||
}
|
||||
|
||||
|
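The extended Put signature above keeps the old two-argument behaviour when meta is NULL; the MultipleExpiry test later in this diff exercises the three-argument form. A minimal caller-side sketch, assuming KeyMetaData and the expiry tags are visible through the public headers added elsewhere in this branch:

```c++
#include "leveldb/db.h"
#include "leveldb/write_batch.h"
// Assumption: KeyMetaData and kTypeValueExplicitExpiry come from the
// expiry-enabled public headers added elsewhere in this branch.

leveldb::Status WriteWithExpiry(leveldb::DB* db) {
  leveldb::WriteBatch batch;

  // Plain put: encodes kTypeValue, no expiry varint is written.
  batch.Put(leveldb::Slice("Mary"), leveldb::Slice("Lamb"));

  // Expiring put: the expiry is encoded between the key and the value.
  leveldb::KeyMetaData meta;
  meta.m_Type = leveldb::kTypeValueExplicitExpiry;
  meta.m_Expiry = 2347;   // absolute expiry in microseconds (test value)
  batch.Put(leveldb::Slice("Adam"), leveldb::Slice("Ant"), &meta);

  // Both entries are applied atomically by a single Write.
  return db->Write(leveldb::WriteOptions(), &batch);
}
```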
@ -113,23 +138,33 @@ class MemTableInserter : public WriteBatch::Handler {
|
|||
public:
|
||||
SequenceNumber sequence_;
|
||||
MemTable* mem_;
|
||||
const Options * options_;
|
||||
|
||||
virtual void Put(const Slice& key, const Slice& value) {
|
||||
mem_->Add(sequence_, kTypeValue, key, value);
|
||||
MemTableInserter() : mem_(NULL), options_(NULL) {};
|
||||
|
||||
virtual void Put(const Slice& key, const Slice& value, const ValueType &type, const ExpiryTimeMicros &expiry) {
|
||||
ValueType type_use(type);
|
||||
ExpiryTimeMicros expiry_use(expiry);
|
||||
|
||||
if (NULL!=options_ && options_->ExpiryActivated())
|
||||
options_->expiry_module->MemTableInserterCallback(key, value, type_use, expiry_use);
|
||||
mem_->Add(sequence_, (ValueType)type_use, key, value, expiry_use);
|
||||
sequence_++;
|
||||
}
|
||||
virtual void Delete(const Slice& key) {
|
||||
mem_->Add(sequence_, kTypeDeletion, key, Slice());
|
||||
mem_->Add(sequence_, kTypeDeletion, key, Slice(), 0);
|
||||
sequence_++;
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
|
||||
MemTable* memtable) {
|
||||
MemTable* memtable,
|
||||
const Options * options) {
|
||||
MemTableInserter inserter;
|
||||
inserter.sequence_ = WriteBatchInternal::Sequence(b);
|
||||
inserter.mem_ = memtable;
|
||||
inserter.options_ = options;
|
||||
return b->Iterate(&inserter);
|
||||
}
|
||||
|
||||
|
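Since InsertInto now threads the Options through to MemTableInserter, callers have to pass whatever Options the DB was opened with, or NULL to bypass the expiry callback (as the updated write_batch_test below does). A small sketch of the call; ApplyBatchToMemTable is an illustrative wrapper name:

```c++
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "leveldb/options.h"
#include "leveldb/write_batch.h"

// Passing real Options lets an activated expiry_module rewrite the type/expiry
// inside MemTableInserterCallback; passing NULL skips that step entirely.
leveldb::Status ApplyBatchToMemTable(const leveldb::WriteBatch* batch,
                                     leveldb::MemTable* mem,
                                     const leveldb::Options* options /* may be NULL */) {
  return leveldb::WriteBatchInternal::InsertInto(batch, mem, options);
}
```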
|
|
@ -5,7 +5,6 @@
|
|||
#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
|
||||
#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "leveldb/write_batch.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
@ -22,10 +21,10 @@ class WriteBatchInternal {
|
|||
// Set the count for the number of entries in the batch.
|
||||
static void SetCount(WriteBatch* batch, int n);
|
||||
|
||||
// Return the sequence number for the start of this batch.
|
||||
// Return the seqeunce number for the start of this batch.
|
||||
static SequenceNumber Sequence(const WriteBatch* batch);
|
||||
|
||||
// Store the specified number as the sequence number for the start of
|
||||
// Store the specified number as the seqeunce number for the start of
|
||||
// this batch.
|
||||
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
|
||||
|
||||
|
@ -39,7 +38,7 @@ class WriteBatchInternal {
|
|||
|
||||
static void SetContents(WriteBatch* batch, const Slice& contents);
|
||||
|
||||
static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
|
||||
static Status InsertInto(const WriteBatch* batch, MemTable* memtable, const Options * options);
|
||||
|
||||
static void Append(WriteBatch* dst, const WriteBatch* src);
|
||||
};
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <sstream>
|
||||
#include "leveldb/db.h"
|
||||
|
||||
#include "db/memtable.h"
|
||||
|
@ -17,11 +18,12 @@ static std::string PrintContents(WriteBatch* b) {
|
|||
MemTable* mem = new MemTable(cmp);
|
||||
mem->Ref();
|
||||
std::string state;
|
||||
Status s = WriteBatchInternal::InsertInto(b, mem);
|
||||
Status s = WriteBatchInternal::InsertInto(b, mem, NULL);
|
||||
int count = 0;
|
||||
Iterator* iter = mem->NewIterator();
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
ParsedInternalKey ikey;
|
||||
std::stringstream sstr;
|
||||
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
|
||||
switch (ikey.type) {
|
||||
case kTypeValue:
|
||||
|
@ -32,6 +34,28 @@ static std::string PrintContents(WriteBatch* b) {
|
|||
state.append(")");
|
||||
count++;
|
||||
break;
|
||||
case kTypeValueWriteTime:
|
||||
state.append("PutWT(");
|
||||
state.append(ikey.user_key.ToString());
|
||||
state.append(", ");
|
||||
sstr << ikey.expiry;
|
||||
state.append(sstr.str());
|
||||
state.append(", ");
|
||||
state.append(iter->value().ToString());
|
||||
state.append(")");
|
||||
count++;
|
||||
break;
|
||||
case kTypeValueExplicitExpiry:
|
||||
state.append("PutEE(");
|
||||
state.append(ikey.user_key.ToString());
|
||||
state.append(", ");
|
||||
sstr << ikey.expiry;
|
||||
state.append(sstr.str());
|
||||
state.append(", ");
|
||||
state.append(iter->value().ToString());
|
||||
state.append(")");
|
||||
count++;
|
||||
break;
|
||||
case kTypeDeletion:
|
||||
state.append("Delete(");
|
||||
state.append(ikey.user_key.ToString());
|
||||
|
@ -74,6 +98,32 @@ TEST(WriteBatchTest, Multiple) {
|
|||
PrintContents(&batch));
|
||||
}
|
||||
|
||||
TEST(WriteBatchTest, MultipleExpiry) {
  WriteBatch batch;
  KeyMetaData meta;
  batch.Put(Slice("Mary"), Slice("Lamb"));
  meta.m_Type=kTypeValueExplicitExpiry;
  meta.m_Expiry=2347;
  batch.Put(Slice("Adam"), Slice("Ant"), &meta);
  //batch.PutExplicitExpiry(Slice("Adam"), Slice("Ant"), 2347);
  batch.Put(Slice("Frosty"), Slice("Snowman"));
  batch.Put(Slice("Tip"), Slice("ONeal"));
  batch.Delete(Slice("Frosty"));
  meta.m_Type=kTypeValueExplicitExpiry;
  meta.m_Expiry=987654321;
  batch.Put(Slice("The"), Slice("Fonz"), &meta);
  WriteBatchInternal::SetSequence(&batch, 200);
  ASSERT_EQ(200, WriteBatchInternal::Sequence(&batch));
  ASSERT_EQ(6, WriteBatchInternal::Count(&batch));
  ASSERT_EQ("PutEE(Adam, 2347, Ant)@201"
            "Delete(Frosty)@204"
            "Put(Frosty, Snowman)@202"
            "Put(Mary, Lamb)@200"
            "PutEE(The, 987654321, Fonz)@205"
            "Put(Tip, ONeal)@203",
            PrintContents(&batch));
}
|
||||
|
||||
TEST(WriteBatchTest, Corruption) {
|
||||
WriteBatch batch;
|
||||
batch.Put(Slice("foo"), Slice("bar"));
|
||||
|
|
|
@ -618,7 +618,7 @@ class Benchmark {
|
|||
ErrorCheck(status);
|
||||
|
||||
// Execute read statement
|
||||
while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW) {}
|
||||
while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW);
|
||||
StepErrorCheck(status);
|
||||
|
||||
// Reset SQLite statement for another use
|
||||
|
|
|
@ -338,7 +338,7 @@ class Benchmark {
|
|||
bool write_sync = false;
|
||||
if (name == Slice("fillseq")) {
|
||||
Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
|
||||
DBSynchronize(db_);
|
||||
|
||||
} else if (name == Slice("fillrandom")) {
|
||||
Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1);
|
||||
DBSynchronize(db_);
|
||||
|
|
89
src/leveldb/doc/doc.css
Normal file
|
@ -0,0 +1,89 @@
|
|||
body {
|
||||
margin-left: 0.5in;
|
||||
margin-right: 0.5in;
|
||||
background: white;
|
||||
color: black;
|
||||
}
|
||||
|
||||
h1 {
|
||||
margin-left: -0.2in;
|
||||
font-size: 14pt;
|
||||
}
|
||||
h2 {
|
||||
margin-left: -0in;
|
||||
font-size: 12pt;
|
||||
}
|
||||
h3 {
|
||||
margin-left: -0in;
|
||||
}
|
||||
h4 {
|
||||
margin-left: -0in;
|
||||
}
|
||||
hr {
|
||||
margin-left: -0in;
|
||||
}
|
||||
|
||||
/* Definition lists: definition term bold */
|
||||
dt {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
address {
|
||||
text-align: center;
|
||||
}
|
||||
code,samp,var {
|
||||
color: blue;
|
||||
}
|
||||
kbd {
|
||||
color: #600000;
|
||||
}
|
||||
div.note p {
|
||||
float: right;
|
||||
width: 3in;
|
||||
margin-right: 0%;
|
||||
padding: 1px;
|
||||
border: 2px solid #6060a0;
|
||||
background-color: #fffff0;
|
||||
}
|
||||
|
||||
ul {
|
||||
margin-top: -0em;
|
||||
margin-bottom: -0em;
|
||||
}
|
||||
|
||||
ol {
|
||||
margin-top: -0em;
|
||||
margin-bottom: -0em;
|
||||
}
|
||||
|
||||
UL.nobullets {
|
||||
list-style-type: none;
|
||||
list-style-image: none;
|
||||
margin-left: -1em;
|
||||
}
|
||||
|
||||
p {
|
||||
margin: 1em 0 1em 0;
|
||||
padding: 0 0 0 0;
|
||||
}
|
||||
|
||||
pre {
|
||||
line-height: 1.3em;
|
||||
padding: 0.4em 0 0.8em 0;
|
||||
margin: 0 0 0 0;
|
||||
border: 0 0 0 0;
|
||||
color: blue;
|
||||
}
|
||||
|
||||
.datatable {
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
margin-top: 2em;
|
||||
margin-bottom: 2em;
|
||||
border: 1px solid;
|
||||
}
|
||||
|
||||
.datatable td,th {
|
||||
padding: 0 0.5em 0 0.5em;
|
||||
text-align: right;
|
||||
}
|
213
src/leveldb/doc/impl.html
Normal file
|
@ -0,0 +1,213 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<link rel="stylesheet" type="text/css" href="doc.css" />
|
||||
<title>Leveldb file layout and compactions</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<h1>Files</h1>
|
||||
|
||||
The implementation of leveldb is similar in spirit to the
|
||||
representation of a single
|
||||
<a href="http://labs.google.com/papers/bigtable.html">
|
||||
Bigtable tablet (section 5.3)</a>.
|
||||
However the organization of the files that make up the representation
|
||||
is somewhat different and is explained below.
|
||||
|
||||
<p>
|
||||
Each database is represented by a set of files stored in a directory.
|
||||
There are several different types of files as documented below:
|
||||
<p>
|
||||
<h2>Log files</h2>
|
||||
<p>
|
||||
A log file (*.log) stores a sequence of recent updates. Each update
|
||||
is appended to the current log file. When the log file reaches a
|
||||
pre-determined size (approximately 4MB by default), it is converted
|
||||
to a sorted table (see below) and a new log file is created for future
|
||||
updates.
|
||||
<p>
|
||||
A copy of the current log file is kept in an in-memory structure (the
|
||||
<code>memtable</code>). This copy is consulted on every read so that read
|
||||
operations reflect all logged updates.
|
||||
<p>
|
||||
<h2>Sorted tables</h2>
|
||||
<p>
|
||||
A sorted table (*.sst) stores a sequence of entries sorted by key.
|
||||
Each entry is either a value for the key, or a deletion marker for the
|
||||
key. (Deletion markers are kept around to hide obsolete values
|
||||
present in older sorted tables).
|
||||
<p>
|
||||
The set of sorted tables are organized into a sequence of levels. The
|
||||
sorted table generated from a log file is placed in a special <code>young</code>
|
||||
level (also called level-0). When the number of young files exceeds a
|
||||
certain threshold (currently four), all of the young files are merged
|
||||
together with all of the overlapping level-1 files to produce a
|
||||
sequence of new level-1 files (we create a new level-1 file for every
|
||||
2MB of data.)
|
||||
<p>
|
||||
Files in the young level may contain overlapping keys. However files
|
||||
in other levels have distinct non-overlapping key ranges. Consider
|
||||
level number L where L >= 1. When the combined size of files in
|
||||
level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2,
|
||||
...), one file in level-L, and all of the overlapping files in
|
||||
level-(L+1) are merged to form a set of new files for level-(L+1).
|
||||
These merges have the effect of gradually migrating new updates from
|
||||
the young level to the largest level using only bulk reads and writes
|
||||
(i.e., minimizing expensive seeks).
|
||||
|
||||
<h2>Manifest</h2>
|
||||
<p>
|
||||
A MANIFEST file lists the set of sorted tables that make up each
|
||||
level, the corresponding key ranges, and other important metadata.
|
||||
A new MANIFEST file (with a new number embedded in the file name)
|
||||
is created whenever the database is reopened. The MANIFEST file is
|
||||
formatted as a log, and changes made to the serving state (as files
|
||||
are added or removed) are appended to this log.
|
||||
<p>
|
||||
<h2>Current</h2>
|
||||
<p>
|
||||
CURRENT is a simple text file that contains the name of the latest
|
||||
MANIFEST file.
|
||||
<p>
|
||||
<h2>Info logs</h2>
|
||||
<p>
|
||||
Informational messages are printed to files named LOG and LOG.old.
|
||||
<p>
|
||||
<h2>Others</h2>
|
||||
<p>
|
||||
Other files used for miscellaneous purposes may also be present
|
||||
(LOCK, *.dbtmp).
|
||||
|
||||
<h1>Level 0</h1>
|
||||
When the log file grows above a certain size (1MB by default):
|
||||
<ul>
|
||||
<li>Create a brand new memtable and log file and direct future updates here
|
||||
<li>In the background:
|
||||
<ul>
|
||||
<li>Write the contents of the previous memtable to an sstable
|
||||
<li>Discard the memtable
|
||||
<li>Delete the old log file and the old memtable
|
||||
<li>Add the new sstable to the young (level-0) level.
|
||||
</ul>
|
||||
</ul>
|
||||
|
||||
<h1>Compactions</h1>
|
||||
|
||||
<p>
|
||||
When the size of level L exceeds its limit, we compact it in a
|
||||
background thread. The compaction picks a file from level L and all
|
||||
overlapping files from the next level L+1. Note that if a level-L
|
||||
file overlaps only part of a level-(L+1) file, the entire file at
|
||||
level-(L+1) is used as an input to the compaction and will be
|
||||
discarded after the compaction. Aside: because level-0 is special
|
||||
(files in it may overlap each other), we treat compactions from
|
||||
level-0 to level-1 specially: a level-0 compaction may pick more than
|
||||
one level-0 file in case some of these files overlap each other.
|
||||
|
||||
<p>
|
||||
A compaction merges the contents of the picked files to produce a
|
||||
sequence of level-(L+1) files. We switch to producing a new
|
||||
level-(L+1) file after the current output file has reached the target
|
||||
file size (2MB). We also switch to a new output file when the key
|
||||
range of the current output file has grown enough to overlap more than
|
||||
ten level-(L+2) files. This last rule ensures that a later compaction
|
||||
of a level-(L+1) file will not pick up too much data from level-(L+2).
|
||||
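The Compaction class earlier in this diff tracks exactly this rule with grandparents_, grandparent_index_, seen_key_ and overlapped_bytes_. Roughly how stock leveldb enforces it (a sketch; the key_count parameter this branch adds to ShouldStopBefore, and the exact byte limit, are omitted):

```c++
// Sketch: icmp is the database's InternalKeyComparator; the limit is about
// ten target files' worth of bytes in stock leveldb.
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
  // Skip every grandparent file that ends before this key, charging its size
  // against the current output once we have actually emitted a key.
  while (grandparent_index_ < grandparents_.size() &&
         icmp->Compare(internal_key,
                       grandparents_[grandparent_index_]->largest.Encode()) > 0) {
    if (seen_key_)
      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
    grandparent_index_++;
  }
  seen_key_ = true;

  if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) {
    overlapped_bytes_ = 0;   // reset for the next output file
    return true;             // close the current output file before this key
  }
  return false;
}
```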
|
||||
<p>
|
||||
The old files are discarded and the new files are added to the serving
|
||||
state.
|
||||
|
||||
<p>
|
||||
Compactions for a particular level rotate through the key space. In
|
||||
more detail, for each level L, we remember the ending key of the last
|
||||
compaction at level L. The next compaction for level L will pick the
|
||||
first file that starts after this key (wrapping around to the
|
||||
beginning of the key space if there is no such file).
|
||||
|
||||
<p>
|
||||
Compactions drop overwritten values. They also drop deletion markers
|
||||
if there are no higher numbered levels that contain a file whose range
|
||||
overlaps the current key.
|
||||
|
||||
<h2>Timing</h2>
|
||||
|
||||
Level-0 compactions will read up to four 1MB files from level-0, and
|
||||
at worst all the level-1 files (10MB). I.e., we will read 14MB and
|
||||
write 14MB.
|
||||
|
||||
<p>
|
||||
Other than the special level-0 compactions, we will pick one 2MB file
|
||||
from level L. In the worst case, this will overlap ~ 12 files from
|
||||
level L+1 (10 because level-(L+1) is ten times the size of level-L,
|
||||
and another two at the boundaries since the file ranges at level-L
|
||||
will usually not be aligned with the file ranges at level-L+1). The
|
||||
compaction will therefore read 26MB and write 26MB. Assuming a disk
|
||||
IO rate of 100MB/s (ballpark range for modern drives), the worst
|
||||
compaction cost will be approximately 0.5 second.
|
||||
|
||||
<p>
|
||||
If we throttle the background writing to something small, say 10% of
|
||||
the full 100MB/s speed, a compaction may take up to 5 seconds. If the
|
||||
user is writing at 10MB/s, we might build up lots of level-0 files
|
||||
(~50 to hold the 5*10MB). This may significantly increase the cost of
|
||||
reads due to the overhead of merging more files together on every
|
||||
read.
|
||||
|
||||
<p>
|
||||
Solution 1: To reduce this problem, we might want to increase the log
|
||||
switching threshold when the number of level-0 files is large. Though
|
||||
the downside is that the larger this threshold, the more memory we will
|
||||
need to hold the corresponding memtable.
|
||||
|
||||
<p>
|
||||
Solution 2: We might want to decrease write rate artificially when the
|
||||
number of level-0 files goes up.
|
||||
|
||||
<p>
|
||||
Solution 3: We work on reducing the cost of very wide merges.
|
||||
Perhaps most of the level-0 files will have their blocks sitting
|
||||
uncompressed in the cache and we will only need to worry about the
|
||||
O(N) complexity in the merging iterator.
|
||||
|
||||
<h2>Number of files</h2>
|
||||
|
||||
Instead of always making 2MB files, we could make larger files for
|
||||
larger levels to reduce the total file count, though at the expense of
|
||||
more bursty compactions. Alternatively, we could shard the set of
|
||||
files into multiple directories.
|
||||
|
||||
<p>
|
||||
An experiment on an <code>ext3</code> filesystem on Feb 04, 2011 shows
|
||||
the following timings to do 100K file opens in directories with
|
||||
varying number of files:
|
||||
<table class="datatable">
|
||||
<tr><th>Files in directory</th><th>Microseconds to open a file</th></tr>
|
||||
<tr><td>1000</td><td>9</td>
|
||||
<tr><td>10000</td><td>10</td>
|
||||
<tr><td>100000</td><td>16</td>
|
||||
</table>
|
||||
So maybe even the sharding is not necessary on modern filesystems?
|
||||
|
||||
<h1>Recovery</h1>
|
||||
|
||||
<ul>
|
||||
<li> Read CURRENT to find name of the latest committed MANIFEST
|
||||
<li> Read the named MANIFEST file
|
||||
<li> Clean up stale files
|
||||
<li> We could open all sstables here, but it is probably better to be lazy...
|
||||
<li> Convert log chunk to a new level-0 sstable
|
||||
<li> Start directing new writes to a new log file with recovered sequence#
|
||||
</ul>
|
||||
|
||||
<h1>Garbage collection of files</h1>
|
||||
|
||||
<code>DeleteObsoleteFiles()</code> is called at the end of every
|
||||
compaction and at the end of recovery. It finds the names of all
|
||||
files in the database. It deletes all log files that are not the
|
||||
current log file. It deletes all table files that are not referenced
|
||||
from some level and are not the output of an active compaction.
|
||||
|
||||
</body>
|
||||
</html>
|
|
@ -1,170 +0,0 @@
|
|||
## Files
|
||||
|
||||
The implementation of leveldb is similar in spirit to the representation of a
|
||||
single [Bigtable tablet (section 5.3)](http://research.google.com/archive/bigtable.html).
|
||||
However the organization of the files that make up the representation is
|
||||
somewhat different and is explained below.
|
||||
|
||||
Each database is represented by a set of files stored in a directory. There are
|
||||
several different types of files as documented below:
|
||||
|
||||
### Log files
|
||||
|
||||
A log file (*.log) stores a sequence of recent updates. Each update is appended
|
||||
to the current log file. When the log file reaches a pre-determined size
|
||||
(approximately 4MB by default), it is converted to a sorted table (see below)
|
||||
and a new log file is created for future updates.
|
||||
|
||||
A copy of the current log file is kept in an in-memory structure (the
|
||||
`memtable`). This copy is consulted on every read so that read operations
|
||||
reflect all logged updates.
|
||||
|
||||
## Sorted tables
|
||||
|
||||
A sorted table (*.ldb) stores a sequence of entries sorted by key. Each entry is
|
||||
either a value for the key, or a deletion marker for the key. (Deletion markers
|
||||
are kept around to hide obsolete values present in older sorted tables).
|
||||
|
||||
The set of sorted tables are organized into a sequence of levels. The sorted
|
||||
table generated from a log file is placed in a special **young** level (also
|
||||
called level-0). When the number of young files exceeds a certain threshold
|
||||
(currently four), all of the young files are merged together with all of the
|
||||
overlapping level-1 files to produce a sequence of new level-1 files (we create
|
||||
a new level-1 file for every 2MB of data.)
|
||||
|
||||
Files in the young level may contain overlapping keys. However files in other
|
||||
levels have distinct non-overlapping key ranges. Consider level number L where
|
||||
L >= 1. When the combined size of files in level-L exceeds (10^L) MB (i.e., 10MB
|
||||
for level-1, 100MB for level-2, ...), one file in level-L, and all of the
|
||||
overlapping files in level-(L+1) are merged to form a set of new files for
|
||||
level-(L+1). These merges have the effect of gradually migrating new updates
|
||||
from the young level to the largest level using only bulk reads and writes
|
||||
(i.e., minimizing expensive seeks).
|
||||
|
||||
### Manifest
|
||||
|
||||
A MANIFEST file lists the set of sorted tables that make up each level, the
|
||||
corresponding key ranges, and other important metadata. A new MANIFEST file
|
||||
(with a new number embedded in the file name) is created whenever the database
|
||||
is reopened. The MANIFEST file is formatted as a log, and changes made to the
|
||||
serving state (as files are added or removed) are appended to this log.
|
||||
|
||||
### Current
|
||||
|
||||
CURRENT is a simple text file that contains the name of the latest MANIFEST
|
||||
file.
|
||||
|
||||
### Info logs
|
||||
|
||||
Informational messages are printed to files named LOG and LOG.old.
|
||||
|
||||
### Others
|
||||
|
||||
Other files used for miscellaneous purposes may also be present (LOCK, *.dbtmp).
|
||||
|
||||
## Level 0
|
||||
|
||||
When the log file grows above a certain size (1MB by default):
|
||||
Create a brand new memtable and log file and direct future updates here
|
||||
In the background:
|
||||
Write the contents of the previous memtable to an sstable
|
||||
Discard the memtable
|
||||
Delete the old log file and the old memtable
|
||||
Add the new sstable to the young (level-0) level.
|
||||
|
||||
## Compactions
|
||||
|
||||
When the size of level L exceeds its limit, we compact it in a background
|
||||
thread. The compaction picks a file from level L and all overlapping files from
|
||||
the next level L+1. Note that if a level-L file overlaps only part of a
|
||||
level-(L+1) file, the entire file at level-(L+1) is used as an input to the
|
||||
compaction and will be discarded after the compaction. Aside: because level-0
|
||||
is special (files in it may overlap each other), we treat compactions from
|
||||
level-0 to level-1 specially: a level-0 compaction may pick more than one
|
||||
level-0 file in case some of these files overlap each other.
|
||||
|
||||
A compaction merges the contents of the picked files to produce a sequence of
|
||||
level-(L+1) files. We switch to producing a new level-(L+1) file after the
|
||||
current output file has reached the target file size (2MB). We also switch to a
|
||||
new output file when the key range of the current output file has grown enough
|
||||
to overlap more than ten level-(L+2) files. This last rule ensures that a later
|
||||
compaction of a level-(L+1) file will not pick up too much data from
|
||||
level-(L+2).
|
||||
|
||||
The old files are discarded and the new files are added to the serving state.
|
||||
|
||||
Compactions for a particular level rotate through the key space. In more detail,
|
||||
for each level L, we remember the ending key of the last compaction at level L.
|
||||
The next compaction for level L will pick the first file that starts after this
|
||||
key (wrapping around to the beginning of the key space if there is no such
|
||||
file).
|
||||
|
||||
Compactions drop overwritten values. They also drop deletion markers if there
|
||||
are no higher numbered levels that contain a file whose range overlaps the
|
||||
current key.
|
||||
|
||||
### Timing
|
||||
|
||||
Level-0 compactions will read up to four 1MB files from level-0, and at worst
|
||||
all the level-1 files (10MB). I.e., we will read 14MB and write 14MB.
|
||||
|
||||
Other than the special level-0 compactions, we will pick one 2MB file from level
|
||||
L. In the worst case, this will overlap ~ 12 files from level L+1 (10 because
|
||||
level-(L+1) is ten times the size of level-L, and another two at the boundaries
|
||||
since the file ranges at level-L will usually not be aligned with the file
|
||||
ranges at level-L+1). The compaction will therefore read 26MB and write 26MB.
|
||||
Assuming a disk IO rate of 100MB/s (ballpark range for modern drives), the worst
|
||||
compaction cost will be approximately 0.5 second.
|
||||
|
||||
If we throttle the background writing to something small, say 10% of the full
|
||||
100MB/s speed, a compaction may take up to 5 seconds. If the user is writing at
|
||||
10MB/s, we might build up lots of level-0 files (~50 to hold the 5*10MB). This
|
||||
may significantly increase the cost of reads due to the overhead of merging more
|
||||
files together on every read.
|
||||
|
||||
Solution 1: To reduce this problem, we might want to increase the log switching
|
||||
threshold when the number of level-0 files is large. Though the downside is that
|
||||
the larger this threshold, the more memory we will need to hold the
|
||||
corresponding memtable.
|
||||
|
||||
Solution 2: We might want to decrease write rate artificially when the number of
|
||||
level-0 files goes up.
|
||||
|
||||
Solution 3: We work on reducing the cost of very wide merges. Perhaps most of
|
||||
the level-0 files will have their blocks sitting uncompressed in the cache and
|
||||
we will only need to worry about the O(N) complexity in the merging iterator.
|
||||
|
||||
### Number of files
|
||||
|
||||
Instead of always making 2MB files, we could make larger files for larger levels
|
||||
to reduce the total file count, though at the expense of more bursty
|
||||
compactions. Alternatively, we could shard the set of files into multiple
|
||||
directories.
|
||||
|
||||
An experiment on an ext3 filesystem on Feb 04, 2011 shows the following timings
|
||||
to do 100K file opens in directories with varying number of files:
|
||||
|
||||
|
||||
| Files in directory | Microseconds to open a file |
|
||||
|-------------------:|----------------------------:|
|
||||
| 1000 | 9 |
|
||||
| 10000 | 10 |
|
||||
| 100000 | 16 |
|
||||
|
||||
So maybe even the sharding is not necessary on modern filesystems?
|
||||
|
||||
## Recovery
|
||||
|
||||
* Read CURRENT to find name of the latest committed MANIFEST
|
||||
* Read the named MANIFEST file
|
||||
* Clean up stale files
|
||||
* We could open all sstables here, but it is probably better to be lazy...
|
||||
* Convert log chunk to a new level-0 sstable
|
||||
* Start directing new writes to a new log file with recovered sequence#
|
||||
|
||||
## Garbage collection of files
|
||||
|
||||
`DeleteObsoleteFiles()` is called at the end of every compaction and at the end
|
||||
of recovery. It finds the names of all files in the database. It deletes all log
|
||||
files that are not the current log file. It deletes all table files that are not
|
||||
referenced from some level and are not the output of an active compaction.
|
549
src/leveldb/doc/index.html
Normal file
|
@ -0,0 +1,549 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<link rel="stylesheet" type="text/css" href="doc.css" />
|
||||
<title>Leveldb</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<h1>Leveldb</h1>
|
||||
<address>Jeff Dean, Sanjay Ghemawat</address>
|
||||
<p>
|
||||
The <code>leveldb</code> library provides a persistent key value store. Keys and
|
||||
values are arbitrary byte arrays. The keys are ordered within the key
|
||||
value store according to a user-specified comparator function.
|
||||
|
||||
<p>
|
||||
<h1>Opening A Database</h1>
|
||||
<p>
|
||||
A <code>leveldb</code> database has a name which corresponds to a file system
|
||||
directory. All of the contents of database are stored in this
|
||||
directory. The following example shows how to open a database,
|
||||
creating it if necessary:
|
||||
<p>
|
||||
<pre>
|
||||
#include <cassert>
|
||||
#include "leveldb/db.h"
|
||||
|
||||
leveldb::DB* db;
|
||||
leveldb::Options options;
|
||||
options.create_if_missing = true;
|
||||
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
|
||||
assert(status.ok());
|
||||
...
|
||||
</pre>
|
||||
If you want to raise an error if the database already exists, add
|
||||
the following line before the <code>leveldb::DB::Open</code> call:
|
||||
<pre>
|
||||
options.error_if_exists = true;
|
||||
</pre>
|
||||
<h1>Status</h1>
|
||||
<p>
|
||||
You may have noticed the <code>leveldb::Status</code> type above. Values of this
|
||||
type are returned by most functions in <code>leveldb</code> that may encounter an
|
||||
error. You can check if such a result is ok, and also print an
|
||||
associated error message:
|
||||
<p>
|
||||
<pre>
|
||||
leveldb::Status s = ...;
|
||||
if (!s.ok()) cerr << s.ToString() << endl;
|
||||
</pre>
|
||||
<h1>Closing A Database</h1>
|
||||
<p>
|
||||
When you are done with a database, just delete the database object.
|
||||
Example:
|
||||
<p>
|
||||
<pre>
|
||||
... open the db as described above ...
|
||||
... do something with db ...
|
||||
delete db;
|
||||
</pre>
|
||||
<h1>Reads And Writes</h1>
|
||||
<p>
|
||||
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
|
||||
modify/query the database. For example, the following code
|
||||
moves the value stored under key1 to key2.
|
||||
<pre>
|
||||
std::string value;
|
||||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
|
||||
if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
|
||||
if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
|
||||
</pre>
|
||||
|
||||
<h1>Atomic Updates</h1>
|
||||
<p>
|
||||
Note that if the process dies after the Put of key2 but before the
|
||||
delete of key1, the same value may be left stored under multiple keys.
|
||||
Such problems can be avoided by using the <code>WriteBatch</code> class to
|
||||
atomically apply a set of updates:
|
||||
<p>
|
||||
<pre>
|
||||
#include "leveldb/write_batch.h"
|
||||
...
|
||||
std::string value;
|
||||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
|
||||
if (s.ok()) {
|
||||
leveldb::WriteBatch batch;
|
||||
batch.Delete(key1);
|
||||
batch.Put(key2, value);
|
||||
s = db->Write(leveldb::WriteOptions(), &batch);
|
||||
}
|
||||
</pre>
|
||||
The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
|
||||
and these edits within the batch are applied in order. Note that we
|
||||
called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
|
||||
we do not end up erroneously dropping the value entirely.
|
||||
<p>
|
||||
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
|
||||
speed up bulk updates by placing lots of individual mutations into the
|
||||
same batch.
|
||||
|
||||
<h1>Synchronous Writes</h1>
|
||||
By default, each write to <code>leveldb</code> is asynchronous: it
|
||||
returns after pushing the write from the process into the operating
|
||||
system. The transfer from operating system memory to the underlying
|
||||
persistent storage happens asynchronously. The <code>sync</code> flag
|
||||
can be turned on for a particular write to make the write operation
|
||||
not return until the data being written has been pushed all the way to
|
||||
persistent storage. (On Posix systems, this is implemented by calling
|
||||
either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
|
||||
<code>msync(..., MS_SYNC)</code> before the write operation returns.)
|
||||
<pre>
|
||||
leveldb::WriteOptions write_options;
|
||||
write_options.sync = true;
|
||||
db->Put(write_options, ...);
|
||||
</pre>
|
||||
Asynchronous writes are often more than a thousand times as fast as
|
||||
synchronous writes. The downside of asynchronous writes is that a
|
||||
crash of the machine may cause the last few updates to be lost. Note
|
||||
that a crash of just the writing process (i.e., not a reboot) will not
|
||||
cause any loss since even when <code>sync</code> is false, an update
|
||||
is pushed from the process memory into the operating system before it
|
||||
is considered done.
|
||||
|
||||
<p>
|
||||
Asynchronous writes can often be used safely. For example, when
|
||||
loading a large amount of data into the database you can handle lost
|
||||
updates by restarting the bulk load after a crash. A hybrid scheme is
|
||||
also possible where every Nth write is synchronous, and in the event
|
||||
of a crash, the bulk load is restarted just after the last synchronous
|
||||
write finished by the previous run. (The synchronous write can update
|
||||
a marker that describes where to restart on a crash.)
|
||||
|
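A sketch of that hybrid scheme; the marker key and the kSyncEvery constant are illustrative choices, not part of leveldb:

```c++
#include "leveldb/db.h"

// Every kSyncEvery-th record is written with sync=true together with a
// restart marker; after a crash the load resumes from the marker's value.
const int kSyncEvery = 1000;

leveldb::Status LoadOne(leveldb::DB* db, int i,
                        const leveldb::Slice& key, const leveldb::Slice& value) {
  leveldb::WriteOptions wo;
  wo.sync = (i % kSyncEvery == 0);
  leveldb::Status s = db->Put(wo, key, value);
  if (s.ok() && wo.sync) {
    s = db->Put(wo, "!bulk_load_restart", key);   // durable progress marker
  }
  return s;
}
```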
||||
<p>
|
||||
<code>WriteBatch</code> provides an alternative to asynchronous writes.
|
||||
Multiple updates may be placed in the same <code>WriteBatch</code> and
|
||||
applied together using a synchronous write (i.e.,
|
||||
<code>write_options.sync</code> is set to true). The extra cost of
|
||||
the synchronous write will be amortized across all of the writes in
|
||||
the batch.
|
||||
|
||||
<p>
|
||||
<h1>Concurrency</h1>
|
||||
<p>
|
||||
A database may only be opened by one process at a time.
|
||||
The <code>leveldb</code> implementation acquires a lock from the
|
||||
operating system to prevent misuse. Within a single process, the
|
||||
same <code>leveldb::DB</code> object may be safely shared by multiple
|
||||
concurrent threads. I.e., different threads may write into or fetch
|
||||
iterators or call <code>Get</code> on the same database without any
|
||||
external synchronization (the leveldb implementation will
|
||||
automatically do the required synchronization). However other objects
|
||||
(like Iterator and WriteBatch) may require external synchronization.
|
||||
If two threads share such an object, they must protect access to it
|
||||
using their own locking protocol. More details are available in
|
||||
the public header files.
|
||||
<p>
|
||||
<h1>Iteration</h1>
|
||||
<p>
|
||||
The following example demonstrates how to print all key,value pairs
|
||||
in a database.
|
||||
<p>
|
||||
<pre>
|
||||
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
|
||||
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
||||
cout << it->key().ToString() << ": " << it->value().ToString() << endl;
|
||||
}
|
||||
assert(it->status().ok()); // Check for any errors found during the scan
|
||||
delete it;
|
||||
</pre>
|
||||
The following variation shows how to process just the keys in the
|
||||
range <code>[start,limit)</code>:
|
||||
<p>
|
||||
<pre>
|
||||
for (it->Seek(start);
|
||||
it->Valid() && it->key().ToString() < limit;
|
||||
it->Next()) {
|
||||
...
|
||||
}
|
||||
</pre>
|
||||
You can also process entries in reverse order. (Caveat: reverse
|
||||
iteration may be somewhat slower than forward iteration.)
|
||||
<p>
|
||||
<pre>
|
||||
for (it->SeekToLast(); it->Valid(); it->Prev()) {
|
||||
...
|
||||
}
|
||||
</pre>
|
||||
<h1>Snapshots</h1>
|
||||
<p>
|
||||
Snapshots provide consistent read-only views over the entire state of
|
||||
the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate
|
||||
that a read should operate on a particular version of the DB state.
|
||||
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
|
||||
implicit snapshot of the current state.
|
||||
<p>
|
||||
Snapshots are created by the DB::GetSnapshot() method:
|
||||
<p>
|
||||
<pre>
|
||||
leveldb::ReadOptions options;
|
||||
options.snapshot = db->GetSnapshot();
|
||||
... apply some updates to db ...
|
||||
leveldb::Iterator* iter = db->NewIterator(options);
|
||||
... read using iter to view the state when the snapshot was created ...
|
||||
delete iter;
|
||||
db->ReleaseSnapshot(options.snapshot);
|
||||
</pre>
|
||||
Note that when a snapshot is no longer needed, it should be released
|
||||
using the DB::ReleaseSnapshot interface. This allows the
|
||||
implementation to get rid of state that was being maintained just to
|
||||
support reading as of that snapshot.
|
||||
<h1>Slice</h1>
|
||||
<p>
|
||||
The return value of the <code>it->key()</code> and <code>it->value()</code> calls above
|
||||
are instances of the <code>leveldb::Slice</code> type. <code>Slice</code> is a simple
|
||||
structure that contains a length and a pointer to an external byte
|
||||
array. Returning a <code>Slice</code> is a cheaper alternative to returning a
|
||||
<code>std::string</code> since we do not need to copy potentially large keys and
|
||||
values. In addition, <code>leveldb</code> methods do not return null-terminated
|
||||
C-style strings since <code>leveldb</code> keys and values are allowed to
|
||||
contain '\0' bytes.
|
||||
<p>
|
||||
C++ strings and null-terminated C-style strings can be easily converted
|
||||
to a Slice:
|
||||
<p>
|
||||
<pre>
|
||||
leveldb::Slice s1 = "hello";
|
||||
|
||||
std::string str("world");
|
||||
leveldb::Slice s2 = str;
|
||||
</pre>
|
||||
A Slice can be easily converted back to a C++ string:
|
||||
<pre>
|
||||
std::string str = s1.ToString();
|
||||
assert(str == std::string("hello"));
|
||||
</pre>
|
||||
Be careful when using Slices since it is up to the caller to ensure that
|
||||
the external byte array into which the Slice points remains live while
|
||||
the Slice is in use. For example, the following is buggy:
|
||||
<p>
|
||||
<pre>
|
||||
leveldb::Slice slice;
|
||||
if (...) {
|
||||
std::string str = ...;
|
||||
slice = str;
|
||||
}
|
||||
Use(slice);
|
||||
</pre>
|
||||
When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
|
||||
backing storage for <code>slice</code> will disappear.
|
||||
<p>
|
||||
<h1>Comparators</h1>
|
||||
<p>
|
||||
The preceding examples used the default ordering function for key,
|
||||
which orders bytes lexicographically. You can however supply a custom
|
||||
comparator when opening a database. For example, suppose each
|
||||
database key consists of two numbers and we should sort by the first
|
||||
number, breaking ties by the second number. First, define a proper
|
||||
subclass of <code>leveldb::Comparator</code> that expresses these rules:
|
||||
<p>
|
||||
<pre>
|
||||
class TwoPartComparator : public leveldb::Comparator {
|
||||
public:
|
||||
// Three-way comparison function:
|
||||
// if a < b: negative result
|
||||
// if a > b: positive result
|
||||
// else: zero result
|
||||
int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
|
||||
int a1, a2, b1, b2;
|
||||
ParseKey(a, &a1, &a2);
|
||||
ParseKey(b, &b1, &b2);
|
||||
if (a1 < b1) return -1;
|
||||
if (a1 > b1) return +1;
|
||||
if (a2 < b2) return -1;
|
||||
if (a2 > b2) return +1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Ignore the following methods for now:
|
||||
const char* Name() const { return "TwoPartComparator"; }
|
||||
void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
|
||||
void FindShortSuccessor(std::string*) const { }
|
||||
};
|
||||
</pre>
|
||||
Now create a database using this custom comparator:
|
||||
<p>
|
||||
<pre>
|
||||
TwoPartComparator cmp;
|
||||
leveldb::DB* db;
|
||||
leveldb::Options options;
|
||||
options.create_if_missing = true;
|
||||
options.comparator = &cmp;
|
||||
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
|
||||
...
|
||||
</pre>
|
||||
<h2>Backwards compatibility</h2>
|
||||
<p>
|
||||
The result of the comparator's <code>Name</code> method is attached to the
|
||||
database when it is created, and is checked on every subsequent
|
||||
database open. If the name changes, the <code>leveldb::DB::Open</code> call will
|
||||
fail. Therefore, change the name if and only if the new key format
|
||||
and comparison function are incompatible with existing databases, and
|
||||
it is ok to discard the contents of all existing databases.
|
||||
<p>
|
||||
You can however still gradually evolve your key format over time with
|
||||
a little bit of pre-planning. For example, you could store a version
|
||||
number at the end of each key (one byte should suffice for most uses).
|
||||
When you wish to switch to a new key format (e.g., adding an optional
|
||||
third part to the keys processed by <code>TwoPartComparator</code>),
|
||||
(a) keep the same comparator name (b) increment the version number
|
||||
for new keys (c) change the comparator function so it uses the
|
||||
version numbers found in the keys to decide how to interpret them.
|
||||
<p>
|
||||
<h1>Performance</h1>
|
||||
<p>
|
||||
Performance can be tuned by changing the default values of the
|
||||
types defined in <code>include/leveldb/options.h</code>.
|
||||
|
||||
<p>
|
||||
<h2>Block size</h2>
|
||||
<p>
|
||||
<code>leveldb</code> groups adjacent keys together into the same block and such a
|
||||
block is the unit of transfer to and from persistent storage. The
|
||||
default block size is approximately 4096 uncompressed bytes.
|
||||
Applications that mostly do bulk scans over the contents of the
|
||||
database may wish to increase this size. Applications that do a lot
|
||||
of point reads of small values may wish to switch to a smaller block
|
||||
size if performance measurements indicate an improvement. There isn't
|
||||
much benefit in using blocks smaller than one kilobyte, or larger than
|
||||
a few megabytes. Also note that compression will be more effective
|
||||
with larger block sizes.
|
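For example, a scan-heavy application might raise <code>block_size</code> (a plain <code>Options</code> field) before opening the database; 64KB below is just an illustrative value:

```c++
leveldb::Options options;
options.create_if_missing = true;
// Bulk scans: bigger blocks mean fewer index entries and better compression.
// Point-read workloads might instead stay closer to the ~1KB lower bound.
options.block_size = 64 * 1024;   // default is roughly 4KB
leveldb::DB* db;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
```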
||||
<p>
|
||||
<h2>Compression</h2>
|
||||
<p>
|
||||
Each block is individually compressed before being written to
|
||||
persistent storage. Compression is on by default since the default
|
||||
compression method is very fast, and is automatically disabled for
|
||||
uncompressible data. In rare cases, applications may want to disable
|
||||
compression entirely, but should only do so if benchmarks show a
|
||||
performance improvement:
|
||||
<p>
|
||||
<pre>
|
||||
leveldb::Options options;
|
||||
options.compression = leveldb::kNoCompression;
|
||||
... leveldb::DB::Open(options, name, ...) ....
|
||||
</pre>
|
||||
<h2>Cache</h2>
|
||||
<p>
|
||||
The contents of the database are stored in a set of files in the
|
||||
filesystem and each file stores a sequence of compressed blocks. If
|
||||
<code>options.cache</code> is non-NULL, it is used to cache frequently used
|
||||
uncompressed block contents.
|
||||
<p>
|
||||
<pre>
|
||||
#include "leveldb/cache.h"
|
||||
|
||||
leveldb::Options options;
|
||||
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache
|
||||
leveldb::DB* db;
|
||||
leveldb::DB::Open(options, name, &db);
|
||||
... use the db ...
|
||||
delete db;
|
||||
delete options.cache;
|
||||
</pre>
|
||||
Note that the cache holds uncompressed data, and therefore it should
|
||||
be sized according to application level data sizes, without any
|
||||
reduction from compression. (Caching of compressed blocks is left to
|
||||
the operating system buffer cache, or any custom <code>Env</code>
|
||||
implementation provided by the client.)
|
||||
<p>
|
||||
When performing a bulk read, the application may wish to disable
|
||||
caching so that the data processed by the bulk read does not end up
|
||||
displacing most of the cached contents. A per-iterator option can be
|
||||
used to achieve this:
|
||||
<p>
|
||||
<pre>
|
||||
leveldb::ReadOptions options;
|
||||
options.fill_cache = false;
|
||||
leveldb::Iterator* it = db->NewIterator(options);
|
||||
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
||||
...
|
||||
}
|
||||
</pre>
|
||||
<h2>Key Layout</h2>
|
||||
<p>
|
||||
Note that the unit of disk transfer and caching is a block. Adjacent
|
||||
keys (according to the database sort order) will usually be placed in
|
||||
the same block. Therefore the application can improve its performance
|
||||
by placing keys that are accessed together near each other and placing
|
||||
infrequently used keys in a separate region of the key space.
|
||||
<p>
|
||||
For example, suppose we are implementing a simple file system on top
|
||||
of <code>leveldb</code>. The types of entries we might wish to store are:
|
||||
<p>
|
||||
<pre>
|
||||
filename -> permission-bits, length, list of file_block_ids
|
||||
file_block_id -> data
|
||||
</pre>
|
||||
We might want to prefix <code>filename</code> keys with one letter (say '/') and the
|
||||
<code>file_block_id</code> keys with a different letter (say '0') so that scans
|
||||
over just the metadata do not force us to fetch and cache bulky file
|
||||
contents.
|
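A tiny sketch of such a key scheme (the helper names and the zero-padded block-id format are illustrative):

```c++
#include <cstdio>
#include <string>

// '/'-prefixed keys hold metadata, '0'-prefixed keys hold file blocks, so a
// scan over the "/..." range never drags bulky block contents into the cache.
std::string MetadataKey(const std::string& filename) {
  return "/" + filename;
}

std::string BlockKey(unsigned long long file_block_id) {
  char buf[32];
  // Zero-padding keeps bytewise key order identical to numeric order.
  std::snprintf(buf, sizeof(buf), "0%020llu", file_block_id);
  return std::string(buf);
}
```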
||||
<p>
|
||||
<h2>Filters</h2>
|
||||
<p>
|
||||
Because of the way <code>leveldb</code> data is organized on disk,
|
||||
a single <code>Get()</code> call may involve multiple reads from disk.
|
||||
The optional <code>FilterPolicy</code> mechanism can be used to reduce
|
||||
the number of disk reads substantially.
|
||||
<pre>
|
||||
leveldb::Options options;
|
||||
options.filter_policy = NewBloomFilter(10);
|
||||
leveldb::DB* db;
|
||||
leveldb::DB::Open(options, "/tmp/testdb", &db);
|
||||
... use the database ...
|
||||
delete db;
|
||||
delete options.filter_policy;
|
||||
</pre>
|
||||
The preceding code associates a
|
||||
<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
|
||||
based filtering policy with the database. Bloom filter based
|
||||
filtering relies on keeping some number of bits of data in memory per
|
||||
key (in this case 10 bits per key since that is the argument we passed
|
||||
to NewBloomFilter). This filter will reduce the number of unnecessary
|
||||
disk reads needed for <code>Get()</code> calls by a factor of
|
||||
approximately 100. Increasing the bits per key will lead to a
|
||||
larger reduction at the cost of more memory usage. We recommend that
|
||||
applications whose working set does not fit in memory and that do a
|
||||
lot of random reads set a filter policy.
|
||||
<p>
|
||||
If you are using a custom comparator, you should ensure that the filter
|
||||
policy you are using is compatible with your comparator. For example,
|
||||
consider a comparator that ignores trailing spaces when comparing keys.
|
||||
<code>NewBloomFilter</code> must not be used with such a comparator.
|
||||
Instead, the application should provide a custom filter policy that
|
||||
also ignores trailing spaces. For example:
|
||||
<pre>
|
||||
class CustomFilterPolicy : public leveldb::FilterPolicy {
|
||||
private:
|
||||
FilterPolicy* builtin_policy_;
|
||||
public:
|
||||
CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
|
||||
~CustomFilterPolicy() { delete builtin_policy_; }
|
||||
|
||||
const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
|
||||
|
||||
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
|
||||
// Use builtin bloom filter code after removing trailing spaces
|
||||
std::vector<Slice> trimmed(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
trimmed[i] = RemoveTrailingSpaces(keys[i]);
|
||||
}
|
||||
return builtin_policy_->CreateFilter(&trimmed[0], n, dst);
|
||||
}
|
||||
|
||||
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
|
||||
// Use builtin bloom filter code after removing trailing spaces
|
||||
return builtin_policy_->KeyMayMatch(RemoveTrailingSpaces(key), filter);
|
||||
}
|
||||
};
|
||||
</pre>
|
||||
<p>
|
||||
Advanced applications may provide a filter policy that does not use
|
||||
a bloom filter but uses some other mechanism for summarizing a set
|
||||
of keys. See <code>leveldb/filter_policy.h</code> for detail.
|
||||
<p>
|
||||
<h1>Checksums</h1>
|
||||
<p>
|
||||
<code>leveldb</code> associates checksums with all data it stores in the file system.
|
||||
There are two separate controls provided over how aggressively these
|
||||
checksums are verified:
|
||||
<p>
|
||||
<ul>
|
||||
<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
|
||||
checksum verification of all data that is read from the file system on
|
||||
behalf of a particular read. By default, no such verification is
|
||||
done.
|
||||
<p>
|
||||
<li> <code>Options::paranoid_checks</code> may be set to true before opening a
|
||||
database to make the database implementation raise an error as soon as
|
||||
it detects an internal corruption. Depending on which portion of the
|
||||
database has been corrupted, the error may be raised when the database
|
||||
is opened, or later by another database operation. By default,
|
||||
paranoid checking is off so that the database can be used even if
|
||||
parts of its persistent storage have been corrupted.
|
||||
<p>
|
||||
If a database is corrupted (perhaps it cannot be opened when
|
||||
paranoid checking is turned on), the <code>leveldb::RepairDB</code> function
|
||||
may be used to recover as much of the data as possible.
|
||||
<p>
|
||||
</ul>
|
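For example (reusing the <code>db</code> and <code>key1</code> from the earlier snippets; the path is illustrative):

```c++
// Per-read verification:
leveldb::ReadOptions read_options;
read_options.verify_checksums = true;
std::string value;
leveldb::Status s = db->Get(read_options, key1, &value);

// Whole-database paranoia, set before opening:
leveldb::Options options;
options.paranoid_checks = true;

// Last resort if the database no longer opens cleanly:
leveldb::Status repaired = leveldb::RepairDB("/tmp/testdb", leveldb::Options());
```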
||||
<h1>Approximate Sizes</h1>
|
||||
<p>
|
||||
The <code>GetApproximateSizes</code> method can be used to get the approximate
|
||||
number of bytes of file system space used by one or more key ranges.
|
||||
<p>
|
||||
<pre>
|
||||
leveldb::Range ranges[2];
|
||||
ranges[0] = leveldb::Range("a", "c");
|
||||
ranges[1] = leveldb::Range("x", "z");
|
||||
uint64_t sizes[2];
|
||||
leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
|
||||
</pre>
|
||||
The preceding call will set <code>sizes[0]</code> to the approximate number of
|
||||
bytes of file system space used by the key range <code>[a..c)</code> and
|
||||
<code>sizes[1]</code> to the approximate number of bytes used by the key range
|
||||
<code>[x..z)</code>.
|
||||
<p>
|
||||
<h1>Environment</h1>
|
||||
<p>
|
||||
All file operations (and other operating system calls) issued by the
|
||||
<code>leveldb</code> implementation are routed through a <code>leveldb::Env</code> object.
|
||||
Sophisticated clients may wish to provide their own <code>Env</code>
|
||||
implementation to get better control. For example, an application may
|
||||
introduce artificial delays in the file IO paths to limit the impact
|
||||
of <code>leveldb</code> on other activities in the system.
|
||||
<p>
|
||||
<pre>
|
||||
class SlowEnv : public leveldb::Env {
|
||||
.. implementation of the Env interface ...
|
||||
};
|
||||
|
||||
SlowEnv env;
|
||||
leveldb::Options options;
|
||||
options.env = &env;
|
||||
Status s = leveldb::DB::Open(options, ...);
|
||||
</pre>
|
||||
<h1>Porting</h1>
|
||||
<p>
|
||||
<code>leveldb</code> may be ported to a new platform by providing platform
|
||||
specific implementations of the types/methods/functions exported by
|
||||
<code>leveldb/port/port.h</code>. See <code>leveldb/port/port_example.h</code> for more
|
||||
details.
|
||||
<p>
|
||||
In addition, the new platform may need a new default <code>leveldb::Env</code>
|
||||
implementation. See <code>leveldb/util/env_posix.h</code> for an example.
|
||||
|
||||
<h1>Other Information</h1>
|
||||
|
||||
<p>
|
||||
Details about the <code>leveldb</code> implementation may be found in
|
||||
the following documents:
|
||||
<ul>
|
||||
<li> <a href="impl.html">Implementation notes</a>
|
||||
<li> <a href="table_format.txt">Format of an immutable Table file</a>
|
||||
<li> <a href="log_format.txt">Format of a log file</a>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
</html>
|
|
@ -1,523 +0,0 @@
|
|||
leveldb
|
||||
=======
|
||||
|
||||
_Jeff Dean, Sanjay Ghemawat_
|
||||
|
||||
The leveldb library provides a persistent key value store. Keys and values are
|
||||
arbitrary byte arrays. The keys are ordered within the key value store
|
||||
according to a user-specified comparator function.
|
||||
|
||||
## Opening A Database
|
||||
|
||||
A leveldb database has a name which corresponds to a file system directory. All
|
||||
of the contents of database are stored in this directory. The following example
|
||||
shows how to open a database, creating it if necessary:
|
||||
|
||||
```c++
|
||||
#include <cassert>
|
||||
#include "leveldb/db.h"
|
||||
|
||||
leveldb::DB* db;
|
||||
leveldb::Options options;
|
||||
options.create_if_missing = true;
|
||||
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
|
||||
assert(status.ok());
|
||||
...
|
||||
```
|
||||
|
||||
If you want to raise an error if the database already exists, add the following
|
||||
line before the `leveldb::DB::Open` call:
|
||||
|
||||
```c++
|
||||
options.error_if_exists = true;
|
||||
```
|
||||
|
||||
## Status
|
||||
|
||||
You may have noticed the `leveldb::Status` type above. Values of this type are
|
||||
returned by most functions in leveldb that may encounter an error. You can check
|
||||
if such a result is ok, and also print an associated error message:
|
||||
|
||||
```c++
|
||||
leveldb::Status s = ...;
|
||||
if (!s.ok()) cerr << s.ToString() << endl;
|
||||
```
|
||||
|
||||
## Closing A Database
|
||||
|
||||
When you are done with a database, just delete the database object. Example:
|
||||
|
||||
```c++
|
||||
... open the db as described above ...
|
||||
... do something with db ...
|
||||
delete db;
|
||||
```
|
||||
|
||||
## Reads And Writes
|
||||
|
||||
The database provides Put, Delete, and Get methods to modify/query the database.
|
||||
For example, the following code moves the value stored under key1 to key2.
|
||||
|
||||
```c++
|
||||
std::string value;
|
||||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
|
||||
if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
|
||||
if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
|
||||
```
|
||||
|
||||
## Atomic Updates
|
||||
|
||||
Note that if the process dies after the Put of key2 but before the delete of
|
||||
key1, the same value may be left stored under multiple keys. Such problems can
|
||||
be avoided by using the `WriteBatch` class to atomically apply a set of updates:
|
||||
|
||||
```c++
|
||||
#include "leveldb/write_batch.h"
|
||||
...
|
||||
std::string value;
|
||||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
|
||||
if (s.ok()) {
|
||||
leveldb::WriteBatch batch;
|
||||
batch.Delete(key1);
|
||||
batch.Put(key2, value);
|
||||
s = db->Write(leveldb::WriteOptions(), &batch);
|
||||
}
|
||||
```
|
||||
|
||||
The `WriteBatch` holds a sequence of edits to be made to the database, and these
|
||||
edits within the batch are applied in order. Note that we called Delete before
|
||||
Put so that if key1 is identical to key2, we do not end up erroneously dropping
|
||||
the value entirely.
|
||||
|
||||
Apart from its atomicity benefits, `WriteBatch` may also be used to speed up
|
||||
bulk updates by placing lots of individual mutations into the same batch.
|
||||
|
||||
## Synchronous Writes
|
||||
|
||||
By default, each write to leveldb is asynchronous: it returns after pushing the
|
||||
write from the process into the operating system. The transfer from operating
|
||||
system memory to the underlying persistent storage happens asynchronously. The
|
||||
sync flag can be turned on for a particular write to make the write operation
|
||||
not return until the data being written has been pushed all the way to
|
||||
persistent storage. (On Posix systems, this is implemented by calling either
|
||||
`fsync(...)` or `fdatasync(...)` or `msync(..., MS_SYNC)` before the write
|
||||
operation returns.)
|
||||
|
||||
```c++
|
||||
leveldb::WriteOptions write_options;
|
||||
write_options.sync = true;
|
||||
db->Put(write_options, ...);
|
||||
```
|
||||
|
||||
Asynchronous writes are often more than a thousand times as fast as synchronous
|
||||
writes. The downside of asynchronous writes is that a crash of the machine may
|
||||
cause the last few updates to be lost. Note that a crash of just the writing
|
||||
process (i.e., not a reboot) will not cause any loss since even when sync is
|
||||
false, an update is pushed from the process memory into the operating system
|
||||
before it is considered done.
|
||||
|
||||
Asynchronous writes can often be used safely. For example, when loading a large
|
||||
amount of data into the database you can handle lost updates by restarting the
|
||||
bulk load after a crash. A hybrid scheme is also possible where every Nth write
|
||||
is synchronous, and in the event of a crash, the bulk load is restarted just
|
||||
after the last synchronous write finished by the previous run. (The synchronous
|
||||
write can update a marker that describes where to restart on a crash.)
|
||||
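A minimal sketch of that hybrid scheme, assuming a hypothetical `kSyncEvery` interval and a reserved progress-marker key (both are illustrative choices of this example, not part of leveldb):

```c++
#include <string>
#include "leveldb/db.h"

// Every kSyncEvery-th write is synchronous and doubles as a durable
// progress marker, so a crashed bulk load can resume from the marker
// instead of restarting from the beginning.
const int kSyncEvery = 1000;                         // assumed interval
const std::string kMarkerKey = "bulkload-progress";  // assumed marker key

leveldb::Status LoadRecord(leveldb::DB* db, int i,
                           const leveldb::Slice& key,
                           const leveldb::Slice& value) {
  leveldb::WriteOptions options;
  options.sync = (i % kSyncEvery == 0);
  leveldb::Status s = db->Put(options, key, value);
  if (s.ok() && options.sync) {
    // The synchronous write also records how far the load has progressed.
    s = db->Put(options, kMarkerKey, std::to_string(i));
  }
  return s;
}
```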
|
||||
`WriteBatch` provides an alternative to asynchronous writes. Multiple updates
|
||||
may be placed in the same WriteBatch and applied together using a synchronous
|
||||
write (i.e., `write_options.sync` is set to true). The extra cost of the
|
||||
synchronous write will be amortized across all of the writes in the batch.
|
||||
|
||||
## Concurrency
|
||||
|
||||
A database may only be opened by one process at a time. The leveldb
|
||||
implementation acquires a lock from the operating system to prevent misuse.
|
||||
Within a single process, the same `leveldb::DB` object may be safely shared by
|
||||
multiple concurrent threads. I.e., different threads may write into or fetch
|
||||
iterators or call Get on the same database without any external synchronization
|
||||
(the leveldb implementation will automatically do the required synchronization).
|
||||
However other objects (like Iterator and `WriteBatch`) may require external
|
||||
synchronization. If two threads share such an object, they must protect access
|
||||
to it using their own locking protocol. More details are available in the public
|
||||
header files.
|
||||
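For instance, a `WriteBatch` that several threads append to needs a lock of its own, while the `DB` it is eventually written to does not (the mutex and the batching policy below are assumptions of this sketch):

```c++
#include <mutex>
#include "leveldb/db.h"
#include "leveldb/write_batch.h"

// The DB object is internally synchronized, but this shared batch is not,
// so every access to it goes through batch_mutex.
std::mutex batch_mutex;
leveldb::WriteBatch shared_batch;

void QueueUpdate(const leveldb::Slice& key, const leveldb::Slice& value) {
  std::lock_guard<std::mutex> lock(batch_mutex);
  shared_batch.Put(key, value);
}

leveldb::Status FlushUpdates(leveldb::DB* db) {
  std::lock_guard<std::mutex> lock(batch_mutex);
  leveldb::Status s = db->Write(leveldb::WriteOptions(), &shared_batch);
  if (s.ok()) shared_batch.Clear();
  return s;
}
```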
|
||||
## Iteration
|
||||
|
||||
The following example demonstrates how to print all key,value pairs in a
|
||||
database.
|
||||
|
||||
```c++
|
||||
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
|
||||
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
||||
cout << it->key().ToString() << ": " << it->value().ToString() << endl;
|
||||
}
|
||||
assert(it->status().ok()); // Check for any errors found during the scan
|
||||
delete it;
|
||||
```
|
||||
|
||||
The following variation shows how to process just the keys in the range
|
||||
[start,limit):
|
||||
|
||||
```c++
|
||||
for (it->Seek(start);
|
||||
it->Valid() && it->key().ToString() < limit;
|
||||
it->Next()) {
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
You can also process entries in reverse order. (Caveat: reverse iteration may be
|
||||
somewhat slower than forward iteration.)
|
||||
|
||||
```c++
|
||||
for (it->SeekToLast(); it->Valid(); it->Prev()) {
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
## Snapshots
|
||||
|
||||
Snapshots provide consistent read-only views over the entire state of the
|
||||
key-value store. `ReadOptions::snapshot` may be non-NULL to indicate that a
|
||||
read should operate on a particular version of the DB state. If
|
||||
`ReadOptions::snapshot` is NULL, the read will operate on an implicit snapshot
|
||||
of the current state.
|
||||
|
||||
Snapshots are created by the `DB::GetSnapshot()` method:
|
||||
|
||||
```c++
|
||||
leveldb::ReadOptions options;
|
||||
options.snapshot = db->GetSnapshot();
|
||||
... apply some updates to db ...
|
||||
leveldb::Iterator* iter = db->NewIterator(options);
|
||||
... read using iter to view the state when the snapshot was created ...
|
||||
delete iter;
|
||||
db->ReleaseSnapshot(options.snapshot);
|
||||
```
|
||||
|
||||
Note that when a snapshot is no longer needed, it should be released using the
|
||||
`DB::ReleaseSnapshot` interface. This allows the implementation to get rid of
|
||||
state that was being maintained just to support reading as of that snapshot.
|
||||
|
||||
## Slice
|
||||
|
||||
The return value of the `it->key()` and `it->value()` calls above are instances
|
||||
of the `leveldb::Slice` type. Slice is a simple structure that contains a length
|
||||
and a pointer to an external byte array. Returning a Slice is a cheaper
|
||||
alternative to returning a `std::string` since we do not need to copy
|
||||
potentially large keys and values. In addition, leveldb methods do not return
|
||||
null-terminated C-style strings since leveldb keys and values are allowed to
|
||||
contain `'\0'` bytes.
|
||||
|
||||
C++ strings and null-terminated C-style strings can be easily converted to a
|
||||
Slice:
|
||||
|
||||
```c++
|
||||
leveldb::Slice s1 = "hello";
|
||||
|
||||
std::string str("world");
|
||||
leveldb::Slice s2 = str;
|
||||
```
|
||||
|
||||
A Slice can be easily converted back to a C++ string:
|
||||
|
||||
```c++
|
||||
std::string str = s1.ToString();
|
||||
assert(str == std::string("hello"));
|
||||
```
|
||||
|
||||
Be careful when using Slices since it is up to the caller to ensure that the
|
||||
external byte array into which the Slice points remains live while the Slice is
|
||||
in use. For example, the following is buggy:
|
||||
|
||||
```c++
|
||||
leveldb::Slice slice;
|
||||
if (...) {
|
||||
std::string str = ...;
|
||||
slice = str;
|
||||
}
|
||||
Use(slice);
|
||||
```
|
||||
|
||||
When the if statement goes out of scope, str will be destroyed and the backing
|
||||
storage for slice will disappear.
|
||||
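One fix is to give the backing string a lifetime at least as long as the Slice, as in this corrected variant (it reuses the hypothetical `Use()` helper from the snippet above):

```c++
std::string str;       // declared in the enclosing scope,
leveldb::Slice slice;  // so it outlives the Slice that points into it
if (db->Get(leveldb::ReadOptions(), "key1", &str).ok()) {
  slice = str;         // safe: str is still alive when slice is used below
}
Use(slice);
```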
|
||||
## Comparators
|
||||
|
||||
The preceding examples used the default ordering function for keys, which orders
|
||||
bytes lexicographically. You can however supply a custom comparator when opening
|
||||
a database. For example, suppose each database key consists of two numbers and
|
||||
we should sort by the first number, breaking ties by the second number. First,
|
||||
define a proper subclass of `leveldb::Comparator` that expresses these rules:
|
||||
|
||||
```c++
|
||||
class TwoPartComparator : public leveldb::Comparator {
|
||||
public:
|
||||
// Three-way comparison function:
|
||||
// if a < b: negative result
|
||||
// if a > b: positive result
|
||||
// else: zero result
|
||||
int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
|
||||
int a1, a2, b1, b2;
|
||||
ParseKey(a, &a1, &a2);
|
||||
ParseKey(b, &b1, &b2);
|
||||
if (a1 < b1) return -1;
|
||||
if (a1 > b1) return +1;
|
||||
if (a2 < b2) return -1;
|
||||
if (a2 > b2) return +1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Ignore the following methods for now:
|
||||
const char* Name() const { return "TwoPartComparator"; }
|
||||
void FindShortestSeparator(std::string*, const leveldb::Slice&) const {}
|
||||
void FindShortSuccessor(std::string*) const {}
|
||||
};
|
||||
```
|
||||
|
||||
Now create a database using this custom comparator:
|
||||
|
||||
```c++
|
||||
TwoPartComparator cmp;
|
||||
leveldb::DB* db;
|
||||
leveldb::Options options;
|
||||
options.create_if_missing = true;
|
||||
options.comparator = &cmp;
|
||||
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
|
||||
...
|
||||
```
|
||||
|
||||
### Backwards compatibility
|
||||
|
||||
The result of the comparator's Name method is attached to the database when it
|
||||
is created, and is checked on every subsequent database open. If the name
|
||||
changes, the `leveldb::DB::Open` call will fail. Therefore, change the name if
|
||||
and only if the new key format and comparison function are incompatible with
|
||||
existing databases, and it is ok to discard the contents of all existing
|
||||
databases.
|
||||
|
||||
You can however still gradually evolve your key format over time with a little
|
||||
bit of pre-planning. For example, you could store a version number at the end of
|
||||
each key (one byte should suffice for most uses). When you wish to switch to a
|
||||
new key format (e.g., adding an optional third part to the keys processed by
|
||||
`TwoPartComparator`), (a) keep the same comparator name (b) increment the
|
||||
version number for new keys (c) change the comparator function so it uses the
|
||||
version numbers found in the keys to decide how to interpret them.
|
||||
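A sketch of that evolution, assuming a hypothetical one-byte version suffix on every key; `ParseKey` is the same undefined helper as above, here extended to also return the optional third part (0 for old-format keys):

```c++
class VersionedTwoPartComparator : public leveldb::Comparator {
 public:
  int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
    int a1, a2, a3, b1, b2, b3;
    ParseKey(a, &a1, &a2, &a3);  // assumed helper: reads the version byte and
    ParseKey(b, &b1, &b2, &b3);  // fills a3/b3 with 0 for version-1 keys
    if (a1 != b1) return a1 < b1 ? -1 : +1;
    if (a2 != b2) return a2 < b2 ? -1 : +1;
    if (a3 != b3) return a3 < b3 ? -1 : +1;
    return 0;
  }

  // The name is deliberately unchanged so existing databases still open.
  const char* Name() const { return "TwoPartComparator"; }
  void FindShortestSeparator(std::string*, const leveldb::Slice&) const {}
  void FindShortSuccessor(std::string*) const {}
};
```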
|
||||
## Performance
|
||||
|
||||
Performance can be tuned by changing the default values of the types defined in
|
||||
`include/leveldb/options.h`.
|
||||
|
||||
### Block size
|
||||
|
||||
leveldb groups adjacent keys together into the same block and such a block is
|
||||
the unit of transfer to and from persistent storage. The default block size is
|
||||
approximately 4096 uncompressed bytes. Applications that mostly do bulk scans
|
||||
over the contents of the database may wish to increase this size. Applications
|
||||
that do a lot of point reads of small values may wish to switch to a smaller
|
||||
block size if performance measurements indicate an improvement. There isn't much
|
||||
benefit in using blocks smaller than one kilobyte, or larger than a few
|
||||
megabytes. Also note that compression will be more effective with larger block
|
||||
sizes.
|
||||
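For example, a scan-heavy application might raise the block size before opening the database (64KB here is purely an illustrative value; measure before settling on one):

```c++
leveldb::Options options;
options.block_size = 64 * 1024;  // example value: larger blocks favour bulk scans
... leveldb::DB::Open(options, name, ...) ....
```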
|
||||
### Compression
|
||||
|
||||
Each block is individually compressed before being written to persistent
|
||||
storage. Compression is on by default since the default compression method is
|
||||
very fast, and is automatically disabled for uncompressible data. In rare cases,
|
||||
applications may want to disable compression entirely, but should only do so if
|
||||
benchmarks show a performance improvement:
|
||||
|
||||
```c++
|
||||
leveldb::Options options;
|
||||
options.compression = leveldb::kNoCompression;
|
||||
... leveldb::DB::Open(options, name, ...) ....
|
||||
```
|
||||
|
||||
### Cache
|
||||
|
||||
The contents of the database are stored in a set of files in the filesystem and
|
||||
each file stores a sequence of compressed blocks. If options.cache is non-NULL,
|
||||
it is used to cache frequently used uncompressed block contents.
|
||||
|
||||
```c++
|
||||
#include "leveldb/cache.h"
|
||||
|
||||
leveldb::Options options;
|
||||
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache
|
||||
leveldb::DB* db;
|
||||
leveldb::DB::Open(options, name, &db);
|
||||
... use the db ...
|
||||
delete db;
|
||||
delete options.cache;
|
||||
```
|
||||
|
||||
Note that the cache holds uncompressed data, and therefore it should be sized
|
||||
according to application level data sizes, without any reduction from
|
||||
compression. (Caching of compressed blocks is left to the operating system
|
||||
buffer cache, or any custom Env implementation provided by the client.)
|
||||
|
||||
When performing a bulk read, the application may wish to disable caching so that
|
||||
the data processed by the bulk read does not end up displacing most of the
|
||||
cached contents. A per-iterator option can be used to achieve this:
|
||||
|
||||
```c++
|
||||
leveldb::ReadOptions options;
|
||||
options.fill_cache = false;
|
||||
leveldb::Iterator* it = db->NewIterator(options);
|
||||
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
### Key Layout
|
||||
|
||||
Note that the unit of disk transfer and caching is a block. Adjacent keys
|
||||
(according to the database sort order) will usually be placed in the same block.
|
||||
Therefore the application can improve its performance by placing keys that are
|
||||
accessed together near each other and placing infrequently used keys in a
|
||||
separate region of the key space.
|
||||
|
||||
For example, suppose we are implementing a simple file system on top of leveldb.
|
||||
The types of entries we might wish to store are:
|
||||
|
||||
filename -> permission-bits, length, list of file_block_ids
|
||||
file_block_id -> data
|
||||
|
||||
We might want to prefix filename keys with one letter (say '/') and the
|
||||
`file_block_id` keys with a different letter (say '0') so that scans over just
|
||||
the metadata do not force us to fetch and cache bulky file contents.
|
||||
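A sketch of building such keys, where '/' and '0' are the prefixes suggested above and the fixed-width, big-endian block-id encoding is an assumption of this example (it keeps block keys in numeric order under the default comparator):

```c++
#include <stdint.h>
#include <string>

std::string FileNameKey(const std::string& filename) {
  return "/" + filename;   // metadata lives in its own key range
}

std::string FileBlockKey(uint64_t file_block_id) {
  std::string key("0");    // bulky file contents live in a separate range
  for (int shift = 56; shift >= 0; shift -= 8) {
    // big-endian bytes so lexicographic order matches numeric order
    key.push_back(static_cast<char>((file_block_id >> shift) & 0xff));
  }
  return key;
}
```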
|
||||
### Filters
|
||||
|
||||
Because of the way leveldb data is organized on disk, a single `Get()` call may
|
||||
involve multiple reads from disk. The optional FilterPolicy mechanism can be
|
||||
used to reduce the number of disk reads substantially.
|
||||
|
||||
```c++
|
||||
leveldb::Options options;
|
||||
options.filter_policy = NewBloomFilterPolicy(10);
|
||||
leveldb::DB* db;
|
||||
leveldb::DB::Open(options, "/tmp/testdb", &db);
|
||||
... use the database ...
|
||||
delete db;
|
||||
delete options.filter_policy;
|
||||
```
|
||||
|
||||
The preceding code associates a Bloom filter based filtering policy with the
|
||||
database. Bloom filter based filtering relies on keeping some number of bits of
|
||||
data in memory per key (in this case 10 bits per key since that is the argument
|
||||
we passed to `NewBloomFilterPolicy`). This filter will reduce the number of
|
||||
unnecessary disk reads needed for Get() calls by a factor of approximately
|
||||
100. Increasing the bits per key will lead to a larger reduction at the cost
|
||||
of more memory usage. We recommend that applications whose working set does not
|
||||
fit in memory and that do a lot of random reads set a filter policy.
|
||||
|
||||
If you are using a custom comparator, you should ensure that the filter policy
|
||||
you are using is compatible with your comparator. For example, consider a
|
||||
comparator that ignores trailing spaces when comparing keys.
|
||||
`NewBloomFilterPolicy` must not be used with such a comparator. Instead, the
|
||||
application should provide a custom filter policy that also ignores trailing
|
||||
spaces. For example:
|
||||
|
||||
```c++
|
||||
class CustomFilterPolicy : public leveldb::FilterPolicy {
|
||||
private:
|
||||
FilterPolicy* builtin_policy_;
|
||||
|
||||
public:
|
||||
CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) {}
|
||||
~CustomFilterPolicy() { delete builtin_policy_; }
|
||||
|
||||
const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
|
||||
|
||||
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
|
||||
// Use builtin bloom filter code after removing trailing spaces
|
||||
std::vector<Slice> trimmed(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
trimmed[i] = RemoveTrailingSpaces(keys[i]);
|
||||
}
|
||||
return builtin_policy_->CreateFilter(&trimmed[0], n, dst);
|
||||
}
|
||||
};
|
||||
```
|
||||
|
||||
Advanced applications may provide a filter policy that does not use a bloom
|
||||
filter but uses some other mechanism for summarizing a set of keys. See
|
||||
`leveldb/filter_policy.h` for detail.
|
||||
|
||||
## Checksums
|
||||
|
||||
leveldb associates checksums with all data it stores in the file system. There
|
||||
are two separate controls provided over how aggressively these checksums are
|
||||
verified:
|
||||
|
||||
`ReadOptions::verify_checksums` may be set to true to force checksum
|
||||
verification of all data that is read from the file system on behalf of a
|
||||
particular read. By default, no such verification is done.
|
||||
|
||||
`Options::paranoid_checks` may be set to true before opening a database to make
|
||||
the database implementation raise an error as soon as it detects an internal
|
||||
corruption. Depending on which portion of the database has been corrupted, the
|
||||
error may be raised when the database is opened, or later by another database
|
||||
operation. By default, paranoid checking is off so that the database can be used
|
||||
even if parts of its persistent storage have been corrupted.
|
||||
|
||||
If a database is corrupted (perhaps it cannot be opened when paranoid checking
|
||||
is turned on), the `leveldb::RepairDB` function may be used to recover as much
|
||||
of the data as possible.
|
||||
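A sketch combining the two checks with `RepairDB` as a last resort (the database path and key below are illustrative):

```c++
leveldb::Options options;
options.paranoid_checks = true;        // surface corruption as early as possible

leveldb::DB* db;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
if (!status.ok()) {
  // Last resort: salvage what can be recovered, then retry the open.
  status = leveldb::RepairDB("/tmp/testdb", options);
  if (status.ok()) status = leveldb::DB::Open(options, "/tmp/testdb", &db);
}

leveldb::ReadOptions read_options;
read_options.verify_checksums = true;  // verify data read on behalf of this Get
std::string value;
if (status.ok()) status = db->Get(read_options, "some-key", &value);
```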
|
||||
## Approximate Sizes
|
||||
|
||||
The `GetApproximateSizes` method can be used to get the approximate number of bytes
|
||||
of file system space used by one or more key ranges.
|
||||
|
||||
```c++
|
||||
leveldb::Range ranges[2];
|
||||
ranges[0] = leveldb::Range("a", "c");
|
||||
ranges[1] = leveldb::Range("x", "z");
|
||||
uint64_t sizes[2];
|
||||
leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
|
||||
```
|
||||
|
||||
The preceding call will set `sizes[0]` to the approximate number of bytes of
|
||||
file system space used by the key range `[a..c)` and `sizes[1]` to the
|
||||
approximate number of bytes used by the key range `[x..z)`.
|
||||
|
||||
## Environment
|
||||
|
||||
All file operations (and other operating system calls) issued by the leveldb
|
||||
implementation are routed through a `leveldb::Env` object. Sophisticated clients
|
||||
may wish to provide their own Env implementation to get better control.
|
||||
For example, an application may introduce artificial delays in the file IO
|
||||
paths to limit the impact of leveldb on other activities in the system.
|
||||
|
||||
```c++
|
||||
class SlowEnv : public leveldb::Env {
|
||||
... implementation of the Env interface ...
|
||||
};
|
||||
|
||||
SlowEnv env;
|
||||
leveldb::Options options;
|
||||
options.env = &env;
|
||||
Status s = leveldb::DB::Open(options, ...);
|
||||
```
|
||||
|
||||
## Porting
|
||||
|
||||
leveldb may be ported to a new platform by providing platform specific
|
||||
implementations of the types/methods/functions exported by
|
||||
`leveldb/port/port.h`. See `leveldb/port/port_example.h` for more details.
|
||||
|
||||
In addition, the new platform may need a new default `leveldb::Env`
|
||||
implementation. See `leveldb/util/env_posix.h` for an example.
|
||||
|
||||
## Other Information
|
||||
|
||||
Details about the leveldb implementation may be found in the following
|
||||
documents:
|
||||
|
||||
1. [Implementation notes](impl.md)
|
||||
2. [Format of an immutable Table file](table_format.md)
|
||||
3. [Format of a log file](log_format.md)
|
|
@ -1,75 +0,0 @@
|
|||
leveldb Log format
|
||||
==================
|
||||
The log file contents are a sequence of 32KB blocks. The only exception is that
|
||||
the tail of the file may contain a partial block.
|
||||
|
||||
Each block consists of a sequence of records:
|
||||
|
||||
block := record* trailer?
|
||||
record :=
|
||||
checksum: uint32 // crc32c of type and data[] ; little-endian
|
||||
length: uint16 // little-endian
|
||||
type: uint8 // One of FULL, FIRST, MIDDLE, LAST
|
||||
data: uint8[length]
|
||||
|
||||
A record never starts within the last six bytes of a block (since it won't fit).
|
||||
Any leftover bytes here form the trailer, which must consist entirely of zero
|
||||
bytes and must be skipped by readers.
|
||||
|
||||
Aside: if exactly seven bytes are left in the current block, and a new non-zero
|
||||
length record is added, the writer must emit a FIRST record (which contains zero
|
||||
bytes of user data) to fill up the trailing seven bytes of the block and then
|
||||
emit all of the user data in subsequent blocks.
|
||||
|
||||
More types may be added in the future. Some Readers may skip record types they
|
||||
do not understand, others may report that some data was skipped.
|
||||
|
||||
FULL == 1
|
||||
FIRST == 2
|
||||
MIDDLE == 3
|
||||
LAST == 4
|
||||
|
||||
The FULL record contains the contents of an entire user record.
|
||||
|
||||
FIRST, MIDDLE, LAST are types used for user records that have been split into
|
||||
multiple fragments (typically because of block boundaries). FIRST is the type
|
||||
of the first fragment of a user record, LAST is the type of the last fragment of
|
||||
a user record, and MIDDLE is the type of all interior fragments of a user
|
||||
record.
|
||||
|
||||
Example: consider a sequence of user records:
|
||||
|
||||
A: length 1000
|
||||
B: length 97270
|
||||
C: length 8000
|
||||
|
||||
**A** will be stored as a FULL record in the first block.
|
||||
|
||||
**B** will be split into three fragments: first fragment occupies the rest of
|
||||
the first block, second fragment occupies the entirety of the second block, and
|
||||
the third fragment occupies a prefix of the third block. This will leave six
|
||||
bytes free in the third block, which will be left empty as the trailer.
|
||||
|
||||
**C** will be stored as a FULL record in the fourth block.
|
||||
|
||||
----
|
||||
|
||||
## Some benefits over the recordio format:
|
||||
|
||||
1. We do not need any heuristics for resyncing - just go to next block boundary
|
||||
and scan. If there is a corruption, skip to the next block. As a
|
||||
side-benefit, we do not get confused when part of the contents of one log
|
||||
file are embedded as a record inside another log file.
|
||||
|
||||
2. Splitting at approximate boundaries (e.g., for mapreduce) is simple: find the
|
||||
next block boundary and skip records until we hit a FULL or FIRST record.
|
||||
|
||||
3. We do not need extra buffering for large records.
|
||||
|
||||
## Some downsides compared to recordio format:
|
||||
|
||||
1. No packing of tiny records. This could be fixed by adding a new record type,
|
||||
so it is a shortcoming of the current implementation, not necessarily the
|
||||
format.
|
||||
|
||||
2. No compression. Again, this could be fixed by adding new record types.
|
75
src/leveldb/doc/log_format.txt
Normal file
|
@ -0,0 +1,75 @@
|
|||
The log file contents are a sequence of 32KB blocks. The only
|
||||
exception is that the tail of the file may contain a partial block.
|
||||
|
||||
Each block consists of a sequence of records:
|
||||
block := record* trailer?
|
||||
record :=
|
||||
checksum: uint32 // crc32c of type and data[]
|
||||
length: uint16
|
||||
type: uint8 // One of FULL, FIRST, MIDDLE, LAST
|
||||
data: uint8[length]
|
||||
|
||||
A record never starts within the last six bytes of a block (since it
|
||||
won't fit). Any leftover bytes here form the trailer, which must
|
||||
consist entirely of zero bytes and must be skipped by readers.
|
||||
|
||||
Aside: if exactly seven bytes are left in the current block, and a new
|
||||
non-zero length record is added, the writer must emit a FIRST record
|
||||
(which contains zero bytes of user data) to fill up the trailing seven
|
||||
bytes of the block and then emit all of the user data in subsequent
|
||||
blocks.
|
||||
|
||||
More types may be added in the future. Some Readers may skip record
|
||||
types they do not understand, others may report that some data was
|
||||
skipped.
|
||||
|
||||
FULL == 1
|
||||
FIRST == 2
|
||||
MIDDLE == 3
|
||||
LAST == 4
|
||||
|
||||
The FULL record contains the contents of an entire user record.
|
||||
|
||||
FIRST, MIDDLE, LAST are types used for user records that have been
|
||||
split into multiple fragments (typically because of block boundaries).
|
||||
FIRST is the type of the first fragment of a user record, LAST is the
|
||||
type of the last fragment of a user record, and MIDDLE is the type of all
|
||||
interior fragments of a user record.
|
||||
|
||||
Example: consider a sequence of user records:
|
||||
A: length 1000
|
||||
B: length 97270
|
||||
C: length 8000
|
||||
A will be stored as a FULL record in the first block.
|
||||
|
||||
B will be split into three fragments: first fragment occupies the rest
|
||||
of the first block, second fragment occupies the entirety of the
|
||||
second block, and the third fragment occupies a prefix of the third
|
||||
block. This will leave six bytes free in the third block, which will
|
||||
be left empty as the trailer.
|
||||
|
||||
C will be stored as a FULL record in the fourth block.
|
||||
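A sketch of how a writer chooses the record type for one user record,
implied by the rules above (EmitPhysicalRecord, EmitTrailer and the
variables are assumptions of this sketch; kHeaderSize is 7):

    // data/n: remaining user data; left: bytes remaining in the current block.
    bool begin = true;
    while (true) {
      if (left < kHeaderSize) {      // fewer than 7 bytes left: zero-fill trailer
        EmitTrailer(left);           // assumed helper: writes `left` zero bytes
        left = kBlockSize;
      }
      size_t avail = left - kHeaderSize;
      size_t fragment = (n < avail) ? n : avail;  // may be 0 (the seven-byte aside)
      bool end = (fragment == n);
      RecordType type = (begin && end) ? FULL
                        : begin ? FIRST
                        : end ? LAST : MIDDLE;
      EmitPhysicalRecord(type, data, fragment);   // assumed: header + fragment bytes
      data += fragment;
      n -= fragment;
      left -= kHeaderSize + fragment;
      begin = false;
      if (end) break;
    }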
|
||||
===================
|
||||
|
||||
Some benefits over the recordio format:
|
||||
|
||||
(1) We do not need any heuristics for resyncing - just go to next
|
||||
block boundary and scan. If there is a corruption, skip to the next
|
||||
block. As a side-benefit, we do not get confused when part of the
|
||||
contents of one log file are embedded as a record inside another log
|
||||
file.
|
||||
|
||||
(2) Splitting at approximate boundaries (e.g., for mapreduce) is
|
||||
simple: find the next block boundary and skip records until we
|
||||
hit a FULL or FIRST record.
|
||||
|
||||
(3) We do not need extra buffering for large records.
|
||||
|
||||
Some downsides compared to recordio format:
|
||||
|
||||
(1) No packing of tiny records. This could be fixed by adding a new
|
||||
record type, so it is a shortcoming of the current implementation,
|
||||
not necessarily the format.
|
||||
|
||||
(2) No compression. Again, this could be fixed by adding new record types.
|
|
@ -1,107 +0,0 @@
|
|||
leveldb File format
|
||||
===================
|
||||
|
||||
<beginning_of_file>
|
||||
[data block 1]
|
||||
[data block 2]
|
||||
...
|
||||
[data block N]
|
||||
[meta block 1]
|
||||
...
|
||||
[meta block K]
|
||||
[metaindex block]
|
||||
[index block]
|
||||
[Footer] (fixed size; starts at file_size - sizeof(Footer))
|
||||
<end_of_file>
|
||||
|
||||
The file contains internal pointers. Each such pointer is called
|
||||
a BlockHandle and contains the following information:
|
||||
|
||||
offset: varint64
|
||||
size: varint64
|
||||
|
||||
See [varints](https://developers.google.com/protocol-buffers/docs/encoding#varints)
|
||||
for an explanation of varint64 format.
|
||||
|
||||
1. The sequence of key/value pairs in the file are stored in sorted
|
||||
order and partitioned into a sequence of data blocks. These blocks
|
||||
come one after another at the beginning of the file. Each data block
|
||||
is formatted according to the code in `block_builder.cc`, and then
|
||||
optionally compressed.
|
||||
|
||||
2. After the data blocks we store a bunch of meta blocks. The
|
||||
supported meta block types are described below. More meta block types
|
||||
may be added in the future. Each meta block is again formatted using
|
||||
`block_builder.cc` and then optionally compressed.
|
||||
|
||||
3. A "metaindex" block. It contains one entry for every other meta
|
||||
block where the key is the name of the meta block and the value is a
|
||||
BlockHandle pointing to that meta block.
|
||||
|
||||
4. An "index" block. This block contains one entry per data block,
|
||||
where the key is a string >= last key in that data block and before
|
||||
the first key in the successive data block. The value is the
|
||||
BlockHandle for the data block.
|
||||
|
||||
5. At the very end of the file is a fixed length footer that contains
|
||||
the BlockHandle of the metaindex and index blocks as well as a magic number.
|
||||
|
||||
metaindex_handle: char[p]; // Block handle for metaindex
|
||||
index_handle: char[q]; // Block handle for index
|
||||
padding: char[40-p-q];// zeroed bytes to make fixed length
|
||||
// (40==2*BlockHandle::kMaxEncodedLength)
|
||||
magic: fixed64; // == 0xdb4775248b80fb57 (little-endian)
|
||||
|
||||
## "filter" Meta Block
|
||||
|
||||
If a `FilterPolicy` was specified when the database was opened, a
|
||||
filter block is stored in each table. The "metaindex" block contains
|
||||
an entry that maps from `filter.<N>` to the BlockHandle for the filter
|
||||
block where `<N>` is the string returned by the filter policy's
|
||||
`Name()` method.
|
||||
|
||||
The filter block stores a sequence of filters, where filter i contains
|
||||
the output of `FilterPolicy::CreateFilter()` on all keys that are stored
|
||||
in a block whose file offset falls within the range
|
||||
|
||||
[ i*base ... (i+1)*base-1 ]
|
||||
|
||||
Currently, "base" is 2KB. So for example, if blocks X and Y start in
|
||||
the range `[ 0KB .. 2KB-1 ]`, all of the keys in X and Y will be
|
||||
converted to a filter by calling `FilterPolicy::CreateFilter()`, and the
|
||||
resulting filter will be stored as the first filter in the filter
|
||||
block.
|
||||
|
||||
The filter block is formatted as follows:
|
||||
|
||||
[filter 0]
|
||||
[filter 1]
|
||||
[filter 2]
|
||||
...
|
||||
[filter N-1]
|
||||
|
||||
[offset of filter 0] : 4 bytes
|
||||
[offset of filter 1] : 4 bytes
|
||||
[offset of filter 2] : 4 bytes
|
||||
...
|
||||
[offset of filter N-1] : 4 bytes
|
||||
|
||||
[offset of beginning of offset array] : 4 bytes
|
||||
lg(base) : 1 byte
|
||||
|
||||
The offset array at the end of the filter block allows efficient
|
||||
mapping from a data block offset to the corresponding filter.
|
||||
|
||||
## "stats" Meta Block
|
||||
|
||||
This meta block contains a bunch of stats. The key is the name
|
||||
of the statistic. The value contains the statistic.
|
||||
|
||||
TODO(postrelease): record following stats.
|
||||
|
||||
data size
|
||||
index size
|
||||
key size (uncompressed)
|
||||
value size (uncompressed)
|
||||
number of entries
|
||||
number of data blocks
|
102
src/leveldb/doc/table_format.txt
Normal file
|
@ -0,0 +1,102 @@
|
|||
File format
|
||||
===========
|
||||
|
||||
<beginning_of_file>
|
||||
[data block 1]
|
||||
[data block 2]
|
||||
...
|
||||
[data block N]
|
||||
[meta block 1]
|
||||
...
|
||||
[meta block K]
|
||||
[metaindex block]
|
||||
[index block]
|
||||
[Footer] (fixed size; starts at file_size - sizeof(Footer))
|
||||
<end_of_file>
|
||||
|
||||
The file contains internal pointers. Each such pointer is called
|
||||
a BlockHandle and contains the following information:
|
||||
offset: varint64
|
||||
size: varint64
|
||||
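A varint64 stores seven bits per byte, least-significant group first,
with the high bit of each byte set when more bytes follow. A decoding
sketch (an illustrative helper, not the leveldb API):

    const char* GetVarint64(const char* p, uint64_t* value) {
      uint64_t result = 0;
      for (int shift = 0; shift <= 63; shift += 7) {
        uint64_t byte = static_cast<unsigned char>(*p++);
        result |= (byte & 0x7f) << shift;
        if ((byte & 0x80) == 0) break;  // high bit clear: last byte
      }
      *value = result;
      return p;                         // points just past the varint
    }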
|
||||
(1) The sequence of key/value pairs in the file are stored in sorted
|
||||
order and partitioned into a sequence of data blocks. These blocks
|
||||
come one after another at the beginning of the file. Each data block
|
||||
is formatted according to the code in block_builder.cc, and then
|
||||
optionally compressed.
|
||||
|
||||
(2) After the data blocks we store a bunch of meta blocks. The
|
||||
supported meta block types are described below. More meta block types
|
||||
may be added in the future. Each meta block is again formatted using
|
||||
block_builder.cc and then optionally compressed.
|
||||
|
||||
(3) A "metaindex" block. It contains one entry for every other meta
|
||||
block where the key is the name of the meta block and the value is a
|
||||
BlockHandle pointing to that meta block.
|
||||
|
||||
(4) An "index" block. This block contains one entry per data block,
|
||||
where the key is a string >= last key in that data block and before
|
||||
the first key in the successive data block. The value is the
|
||||
BlockHandle for the data block.
|
||||
|
||||
(5) At the very end of the file is a fixed length footer that contains
|
||||
the BlockHandle of the metaindex and index blocks as well as a magic number.
|
||||
metaindex_handle: char[p]; // Block handle for metaindex
|
||||
index_handle: char[q]; // Block handle for index
|
||||
padding: char[40-p-q]; // 0 bytes to make fixed length
|
||||
// (40==2*BlockHandle::kMaxEncodedLength)
|
||||
magic: fixed64; // == 0xdb4775248b80fb57
|
||||
|
||||
"filter" Meta Block
|
||||
-------------------
|
||||
|
||||
If a "FilterPolicy" was specified when the database was opened, a
|
||||
filter block is stored in each table. The "metaindex" block contains
|
||||
an entry that maps from "filter.<N>" to the BlockHandle for the filter
|
||||
block where "<N>" is the string returned by the filter policy's
|
||||
"Name()" method.
|
||||
|
||||
The filter block stores a sequence of filters, where filter i contains
|
||||
the output of FilterPolicy::CreateFilter() on all keys that are stored
|
||||
in a block whose file offset falls within the range
|
||||
|
||||
[ i*base ... (i+1)*base-1 ]
|
||||
|
||||
Currently, "base" is 2KB. So for example, if blocks X and Y start in
|
||||
the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be
|
||||
converted to a filter by calling FilterPolicy::CreateFilter(), and the
|
||||
resulting filter will be stored as the first filter in the filter
|
||||
block.
|
||||
|
||||
The filter block is formatted as follows:
|
||||
|
||||
[filter 0]
|
||||
[filter 1]
|
||||
[filter 2]
|
||||
...
|
||||
[filter N-1]
|
||||
|
||||
[offset of filter 0] : 4 bytes
|
||||
[offset of filter 1] : 4 bytes
|
||||
[offset of filter 2] : 4 bytes
|
||||
...
|
||||
[offset of filter N-1] : 4 bytes
|
||||
|
||||
[offset of beginning of offset array] : 4 bytes
|
||||
lg(base) : 1 byte
|
||||
|
||||
The offset array at the end of the filter block allows efficient
|
||||
mapping from a data block offset to the corresponding filter.
|
||||
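Reader-side sketch of that mapping: with base 2KB, the filter covering a
data block that starts at offset o is entry o >> lg(base) of the offset
array (DecodeFixed32 reads a little-endian uint32; the variable names are
illustrative):

    uint64_t index = block_offset >> lg_base;           // lg_base is the final byte
    uint32_t start = DecodeFixed32(offset_array + index * 4);
    uint32_t limit = DecodeFixed32(offset_array + index * 4 + 4);
    // (For the last filter, entry index+1 is the "offset of beginning of
    //  offset array" field, which equals the end of the filter data.)
    Slice filter(filter_data + start, limit - start);
    bool may_match = policy->KeyMayMatch(key, filter);  // FilterPolicy::KeyMayMatch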
|
||||
"stats" Meta Block
|
||||
------------------
|
||||
|
||||
This meta block contains a bunch of stats. The key is the name
|
||||
of the statistic. The value contains the statistic.
|
||||
TODO(postrelease): record following stats.
|
||||
data size
|
||||
index size
|
||||
key size (uncompressed)
|
||||
value size (uncompressed)
|
||||
number of entries
|
||||
number of data blocks
|
|
@ -55,15 +55,14 @@ class FileState {
|
|||
}
|
||||
const uint64_t available = size_ - offset;
|
||||
if (n > available) {
|
||||
n = static_cast<size_t>(available);
|
||||
n = available;
|
||||
}
|
||||
if (n == 0) {
|
||||
*result = Slice();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
assert(offset / kBlockSize <= SIZE_MAX);
|
||||
size_t block = static_cast<size_t>(offset / kBlockSize);
|
||||
size_t block = offset / kBlockSize;
|
||||
size_t block_offset = offset % kBlockSize;
|
||||
|
||||
if (n <= kBlockSize - block_offset) {
|
||||
|
@ -168,7 +167,7 @@ class SequentialFileImpl : public SequentialFile {
|
|||
if (pos_ > file_->Size()) {
|
||||
return Status::IOError("pos_ > file_->Size()");
|
||||
}
|
||||
const uint64_t available = file_->Size() - pos_;
|
||||
const size_t available = file_->Size() - pos_;
|
||||
if (n > available) {
|
||||
n = available;
|
||||
}
|
||||
|
@ -176,10 +175,9 @@ class SequentialFileImpl : public SequentialFile {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual std::string GetName() const { return "[memenv]"; }
|
||||
private:
|
||||
FileState* file_;
|
||||
uint64_t pos_;
|
||||
size_t pos_;
|
||||
};
|
||||
|
||||
class RandomAccessFileImpl : public RandomAccessFile {
|
||||
|
@ -197,7 +195,6 @@ class RandomAccessFileImpl : public RandomAccessFile {
|
|||
return file_->Read(offset, n, result, scratch);
|
||||
}
|
||||
|
||||
virtual std::string GetName() const { return "[memenv]"; }
|
||||
private:
|
||||
FileState* file_;
|
||||
};
|
||||
|
@ -220,16 +217,10 @@ class WritableFileImpl : public WritableFile {
|
|||
virtual Status Flush() { return Status::OK(); }
|
||||
virtual Status Sync() { return Status::OK(); }
|
||||
|
||||
virtual std::string GetName() const { return "[memenv]"; }
|
||||
private:
|
||||
FileState* file_;
|
||||
};
|
||||
|
||||
class NoOpLogger : public Logger {
|
||||
public:
|
||||
virtual void Logv(const char* format, va_list ap) { }
|
||||
};
|
||||
|
||||
class InMemoryEnv : public EnvWrapper {
|
||||
public:
|
||||
explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
|
||||
|
@ -266,7 +257,7 @@ class InMemoryEnv : public EnvWrapper {
|
|||
}
|
||||
|
||||
virtual Status NewWritableFile(const std::string& fname,
|
||||
WritableFile** result) {
|
||||
WritableFile** result, size_t) {
|
||||
MutexLock lock(&mutex_);
|
||||
if (file_map_.find(fname) != file_map_.end()) {
|
||||
DeleteFileInternal(fname);
|
||||
|
@ -280,19 +271,6 @@ class InMemoryEnv : public EnvWrapper {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status NewAppendableFile(const std::string& fname,
|
||||
WritableFile** result) {
|
||||
MutexLock lock(&mutex_);
|
||||
FileState** sptr = &file_map_[fname];
|
||||
FileState* file = *sptr;
|
||||
if (file == NULL) {
|
||||
file = new FileState();
|
||||
file->Ref();
|
||||
}
|
||||
*result = new WritableFileImpl(file);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual bool FileExists(const std::string& fname) {
|
||||
MutexLock lock(&mutex_);
|
||||
return file_map_.find(fname) != file_map_.end();
|
||||
|
@ -380,11 +358,6 @@ class InMemoryEnv : public EnvWrapper {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status NewLogger(const std::string& fname, Logger** result) {
|
||||
*result = new NoOpLogger;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
// Map from filenames to FileState objects, representing a simple file system.
|
||||
typedef std::map<std::string, FileState*> FileSystem;
|
||||
|
|
|
@ -29,68 +29,61 @@ TEST(MemEnvTest, Basics) {
|
|||
uint64_t file_size;
|
||||
WritableFile* writable_file;
|
||||
std::vector<std::string> children;
|
||||
std::string dbname;
|
||||
|
||||
ASSERT_OK(env_->CreateDir("/dir"));
|
||||
dbname=test::TmpDir();
|
||||
ASSERT_OK(env_->CreateDir(dbname.c_str()));
|
||||
|
||||
// Check that the directory is empty.
|
||||
ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
|
||||
ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
|
||||
ASSERT_OK(env_->GetChildren("/dir", &children));
|
||||
ASSERT_TRUE(!env_->FileExists(dbname + "/non_existent"));
|
||||
ASSERT_TRUE(!env_->GetFileSize(dbname + "/non_existent", &file_size).ok());
|
||||
ASSERT_OK(env_->GetChildren(dbname + "", &children));
|
||||
ASSERT_EQ(0, children.size());
|
||||
|
||||
// Create a file.
|
||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
|
||||
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
|
||||
ASSERT_EQ(0, file_size);
|
||||
ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
|
||||
delete writable_file;
|
||||
|
||||
// Check that the file exists.
|
||||
ASSERT_TRUE(env_->FileExists("/dir/f"));
|
||||
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
|
||||
ASSERT_TRUE(env_->FileExists(dbname + "/f"));
|
||||
ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size));
|
||||
ASSERT_EQ(0, file_size);
|
||||
ASSERT_OK(env_->GetChildren("/dir", &children));
|
||||
ASSERT_OK(env_->GetChildren(dbname + "", &children));
|
||||
ASSERT_EQ(1, children.size());
|
||||
ASSERT_EQ("f", children[0]);
|
||||
|
||||
// Write to the file.
|
||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
|
||||
ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
|
||||
ASSERT_OK(writable_file->Append("abc"));
|
||||
delete writable_file;
|
||||
|
||||
// Check that append works.
|
||||
ASSERT_OK(env_->NewAppendableFile("/dir/f", &writable_file));
|
||||
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
|
||||
ASSERT_EQ(3, file_size);
|
||||
ASSERT_OK(writable_file->Append("hello"));
|
||||
delete writable_file;
|
||||
|
||||
// Check for expected size.
|
||||
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
|
||||
ASSERT_EQ(8, file_size);
|
||||
ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size));
|
||||
ASSERT_EQ(3, file_size);
|
||||
|
||||
// Check that renaming works.
|
||||
ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
|
||||
ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
|
||||
ASSERT_TRUE(!env_->FileExists("/dir/f"));
|
||||
ASSERT_TRUE(env_->FileExists("/dir/g"));
|
||||
ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
|
||||
ASSERT_EQ(8, file_size);
|
||||
ASSERT_TRUE(!env_->RenameFile(dbname + "/non_existent", dbname + "/g").ok());
|
||||
ASSERT_OK(env_->RenameFile(dbname + "/f", dbname + "/g"));
|
||||
ASSERT_TRUE(!env_->FileExists(dbname + "/f"));
|
||||
ASSERT_TRUE(env_->FileExists(dbname + "/g"));
|
||||
ASSERT_OK(env_->GetFileSize(dbname + "/g", &file_size));
|
||||
ASSERT_EQ(3, file_size);
|
||||
|
||||
// Check that opening non-existent file fails.
|
||||
SequentialFile* seq_file;
|
||||
RandomAccessFile* rand_file;
|
||||
ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file).ok());
|
||||
ASSERT_TRUE(!env_->NewSequentialFile(dbname + "/non_existent", &seq_file).ok());
|
||||
ASSERT_TRUE(!seq_file);
|
||||
ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file).ok());
|
||||
ASSERT_TRUE(!env_->NewRandomAccessFile(dbname + "/non_existent", &rand_file).ok());
|
||||
ASSERT_TRUE(!rand_file);
|
||||
|
||||
// Check that deleting works.
|
||||
ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
|
||||
ASSERT_OK(env_->DeleteFile("/dir/g"));
|
||||
ASSERT_TRUE(!env_->FileExists("/dir/g"));
|
||||
ASSERT_OK(env_->GetChildren("/dir", &children));
|
||||
ASSERT_TRUE(!env_->DeleteFile(dbname + "/non_existent").ok());
|
||||
ASSERT_OK(env_->DeleteFile(dbname + "/g"));
|
||||
ASSERT_TRUE(!env_->FileExists(dbname + "/g"));
|
||||
ASSERT_OK(env_->GetChildren(dbname + "", &children));
|
||||
ASSERT_EQ(0, children.size());
|
||||
ASSERT_OK(env_->DeleteDir("/dir"));
|
||||
ASSERT_OK(env_->DeleteDir(dbname + ""));
|
||||
}
|
||||
|
||||
TEST(MemEnvTest, ReadWrite) {
|
||||
|
@ -99,16 +92,19 @@ TEST(MemEnvTest, ReadWrite) {
|
|||
RandomAccessFile* rand_file;
|
||||
Slice result;
|
||||
char scratch[100];
|
||||
std::string dbname;
|
||||
|
||||
ASSERT_OK(env_->CreateDir("/dir"));
|
||||
dbname=test::TmpDir();
|
||||
|
||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
|
||||
ASSERT_OK(env_->CreateDir(dbname + ""));
|
||||
|
||||
ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
|
||||
ASSERT_OK(writable_file->Append("hello "));
|
||||
ASSERT_OK(writable_file->Append("world"));
|
||||
delete writable_file;
|
||||
|
||||
// Read sequentially.
|
||||
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
|
||||
ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file));
|
||||
ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
|
||||
ASSERT_EQ(0, result.compare("hello"));
|
||||
ASSERT_OK(seq_file->Skip(1));
|
||||
|
@ -122,7 +118,7 @@ TEST(MemEnvTest, ReadWrite) {
|
|||
delete seq_file;
|
||||
|
||||
// Random reads.
|
||||
ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file));
|
||||
ASSERT_OK(env_->NewRandomAccessFile(dbname + "/f", &rand_file));
|
||||
ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
|
||||
ASSERT_EQ(0, result.compare("world"));
|
||||
ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
|
||||
|
@ -149,7 +145,7 @@ TEST(MemEnvTest, Misc) {
|
|||
ASSERT_TRUE(!test_dir.empty());
|
||||
|
||||
WritableFile* writable_file;
|
||||
ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file));
|
||||
ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, 2<<20));
|
||||
|
||||
// These are no-ops, but we test they return success.
|
||||
ASSERT_OK(writable_file->Sync());
|
||||
|
@ -161,6 +157,9 @@ TEST(MemEnvTest, Misc) {
|
|||
TEST(MemEnvTest, LargeWrite) {
|
||||
const size_t kWriteSize = 300 * 1024;
|
||||
char* scratch = new char[kWriteSize * 2];
|
||||
std::string dbname;
|
||||
|
||||
dbname=test::TmpDir();
|
||||
|
||||
std::string write_data;
|
||||
for (size_t i = 0; i < kWriteSize; ++i) {
|
||||
|
@ -168,14 +167,14 @@ TEST(MemEnvTest, LargeWrite) {
|
|||
}
|
||||
|
||||
WritableFile* writable_file;
|
||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
|
||||
ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
|
||||
ASSERT_OK(writable_file->Append("foo"));
|
||||
ASSERT_OK(writable_file->Append(write_data));
|
||||
delete writable_file;
|
||||
|
||||
SequentialFile* seq_file;
|
||||
Slice result;
|
||||
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
|
||||
ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file));
|
||||
ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
|
||||
ASSERT_EQ(0, result.compare("foo"));
|
||||
|
||||
|
@ -190,17 +189,21 @@ TEST(MemEnvTest, LargeWrite) {
|
|||
delete seq_file;
|
||||
delete [] scratch;
|
||||
}
|
||||
|
||||
#if 0
|
||||
TEST(MemEnvTest, DBTest) {
|
||||
Options options;
|
||||
options.create_if_missing = true;
|
||||
options.env = env_;
|
||||
DB* db;
|
||||
std::string dbname;
|
||||
|
||||
dbname=test::TmpDir();
|
||||
ASSERT_OK(env_->CreateDir(dbname+ "/db"));
|
||||
|
||||
const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
|
||||
const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
|
||||
|
||||
ASSERT_OK(DB::Open(options, "/dir/db", &db));
|
||||
ASSERT_OK(DB::Open(options, dbname + "/db", &db));
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
|
||||
}
|
||||
|
@ -233,7 +236,7 @@ TEST(MemEnvTest, DBTest) {
|
|||
|
||||
delete db;
|
||||
}
|
||||
|
||||
#endif
|
||||
} // namespace leveldb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
|
227
src/leveldb/include/leveldb/atomics.h
Normal file
|
@ -0,0 +1,227 @@
|
|||
// -------------------------------------------------------------------
|
||||
//
|
||||
// atomics.h: portable atomic operations for leveldb/eleveldb (http://code.google.com/p/leveldb/)
|
||||
//
|
||||
// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved.
|
||||
//
|
||||
// This file is provided to you under the Apache License,
|
||||
// Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain
|
||||
// a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
//
|
||||
// -------------------------------------------------------------------
|
||||
|
||||
/// Copied from basho/eleveldb/c_src/detail.hpp September 8, 2013
|
||||
|
||||
#ifndef LEVELDB_ATOMIC_H
|
||||
#define LEVELDB_ATOMIC_H 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* These can be hopefully-replaced with constexpr or compile-time assert later: */
|
||||
#if defined(OS_SOLARIS) || defined(SOLARIS) || defined(sun)
|
||||
#define LEVELDB_IS_SOLARIS 1
|
||||
#else
|
||||
#undef LEVELDB_IS_SOLARIS
|
||||
#endif
|
||||
|
||||
#ifdef LEVELDB_IS_SOLARIS
|
||||
#include <atomic.h>
|
||||
#endif
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
/**
|
||||
* Compare and swap
|
||||
*/
|
||||
|
||||
// primary template
|
||||
template <typename PtrT, typename ValueT>
|
||||
inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val);
|
||||
|
||||
|
||||
// uint32 size (needed for solaris)
|
||||
template <>
|
||||
inline bool compare_and_swap(volatile uint32_t *ptr, const int& comp_val, const int& exchange_val)
|
||||
{
|
||||
#if LEVELDB_IS_SOLARIS
|
||||
return ((uint32_t) comp_val==atomic_cas_32(ptr, comp_val, exchange_val));
|
||||
#else
|
||||
return __sync_bool_compare_and_swap(ptr, comp_val, exchange_val);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// generic specification ... for pointers
|
||||
template <typename PtrT, typename ValueT>
|
||||
inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val)
|
||||
{
|
||||
#if LEVELDB_IS_SOLARIS
|
||||
return (comp_val==atomic_cas_ptr(ptr, comp_val, exchange_val));
|
||||
#else
|
||||
return __sync_bool_compare_and_swap(ptr, comp_val, exchange_val);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Atomic increment
|
||||
*/
|
||||
|
||||
template <typename ValueT>
|
||||
inline ValueT inc_and_fetch(volatile ValueT *ptr);
|
||||
|
||||
template <>
|
||||
inline uint64_t inc_and_fetch(volatile uint64_t *ptr)
|
||||
{
|
||||
#if LEVELDB_IS_SOLARIS
|
||||
return atomic_inc_64_nv(ptr);
|
||||
#else
|
||||
return __sync_add_and_fetch(ptr, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32_t inc_and_fetch(volatile uint32_t *ptr)
|
||||
{
|
||||
#if LEVELDB_IS_SOLARIS
|
||||
return atomic_inc_32_nv(ptr);
|
||||
#else
|
||||
return __sync_add_and_fetch(ptr, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
|
||||
template <>
|
||||
inline size_t inc_and_fetch(volatile size_t *ptr)
|
||||
{
|
||||
return __sync_add_and_fetch(ptr, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* atomic decrement
|
||||
*/
|
||||
|
||||
template <typename ValueT>
|
||||
inline ValueT dec_and_fetch(volatile ValueT *ptr);
|
||||
|
||||
template <>
|
||||
inline uint64_t dec_and_fetch(volatile uint64_t *ptr)
|
||||
{
|
||||
#if LEVELDB_IS_SOLARIS
|
||||
return atomic_dec_64_nv(ptr);
|
||||
#else
|
||||
return __sync_sub_and_fetch(ptr, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32_t dec_and_fetch(volatile uint32_t *ptr)
|
||||
{
|
||||
#if LEVELDB_IS_SOLARIS
|
||||
return atomic_dec_32_nv(ptr);
|
||||
#else
|
||||
return __sync_sub_and_fetch(ptr, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
|
||||
template <>
|
||||
inline size_t dec_and_fetch(volatile size_t *ptr)
|
||||
{
|
||||
return __sync_sub_and_fetch(ptr, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* Atomic add
|
||||
*/
|
||||
|
||||
|
||||
template <typename ValueT>
|
||||
inline ValueT add_and_fetch(volatile ValueT *ptr, ValueT val);
|
||||
|
||||
template <>
|
||||
inline uint64_t add_and_fetch(volatile uint64_t *ptr, uint64_t val)
|
||||
{
|
||||
#if LEVELDB_IS_SOLARIS
|
||||
return atomic_add_64_nv(ptr, val);
|
||||
#else
|
||||
return __sync_add_and_fetch(ptr, val);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32_t add_and_fetch(volatile uint32_t *ptr, uint32_t val)
|
||||
{
|
||||
#if LEVELDB_IS_SOLARIS
|
||||
return atomic_add_32_nv(ptr, val);
|
||||
#else
|
||||
return __sync_add_and_fetch(ptr, val);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
|
||||
template <>
|
||||
inline size_t add_and_fetch(volatile size_t *ptr, size_t val)
|
||||
{
|
||||
return __sync_add_and_fetch(ptr, val);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* Atomic subtract
|
||||
*/
|
||||
|
||||
template <typename ValueT>
|
||||
inline ValueT sub_and_fetch(volatile ValueT *ptr, ValueT val);
|
||||
|
||||
template <>
|
||||
inline uint64_t sub_and_fetch(volatile uint64_t *ptr, uint64_t val)
|
||||
{
|
||||
#if LEVELDB_IS_SOLARIS
|
||||
uint64_t temp=(~val)+1; // 2's complement, bypass sign warnings
|
||||
return atomic_add_64_nv(ptr, temp);
|
||||
#else
|
||||
return __sync_sub_and_fetch(ptr, val);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32_t sub_and_fetch(volatile uint32_t *ptr, uint32_t val)
|
||||
{
|
||||
#if LEVELDB_IS_SOLARIS
|
||||
uint32_t temp=(~val)+1; // 2's complement, bypass sign warnings
|
||||
return atomic_add_32_nv(ptr, temp);
|
||||
#else
|
||||
return __sync_sub_and_fetch(ptr, val);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
|
||||
template <>
|
||||
inline size_t sub_and_fetch(volatile size_t *ptr, size_t val)
|
||||
{
|
||||
return __sync_sub_and_fetch(ptr, val);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif
|
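A hypothetical usage sketch for these helpers (RefCountedThing is an illustration, not part of atomics.h): maintaining a simple reference count on a heap-allocated object.

#include "leveldb/atomics.h"

// Illustrative only: lifetime managed with the atomic helpers declared above.
struct RefCountedThing {
  volatile uint32_t refs;

  RefCountedThing() : refs(0) {}

  void Ref() { leveldb::inc_and_fetch(&refs); }

  void Unref() {
    if (0 == leveldb::dec_and_fetch(&refs)) {
      delete this;   // last reference released
    }
  }
};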
|
@ -9,6 +9,7 @@
|
|||
Does not support:
|
||||
. getters for the option types
|
||||
. custom comparators that implement key shortening
|
||||
. capturing post-write-snapshot
|
||||
. custom iter, db, env, cache implementations using just the C bindings
|
||||
|
||||
Some conventions:
|
||||
|
@ -27,7 +28,6 @@
|
|||
be true on entry:
|
||||
*errptr == NULL
|
||||
*errptr points to a malloc()ed null-terminated error message
|
||||
(On Windows, *errptr must have been malloc()-ed by this library.)
|
||||
On success, a leveldb routine leaves *errptr unchanged.
|
||||
On failure, leveldb frees the old value of *errptr and
|
||||
sets *errptr to a malloc()ed error message.
|
||||
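A client-side sketch of this convention (the database path is illustrative; assumes <stdio.h>):

  char* err = NULL;
  leveldb_options_t* options = leveldb_options_create();
  leveldb_options_set_create_if_missing(options, 1);
  leveldb_t* db = leveldb_open(options, "/tmp/testdb", &err);
  if (err != NULL) {
    fprintf(stderr, "open failed: %s\n", err);
    leveldb_free(err);   /* release the malloc()ed message */
    err = NULL;          /* reset before passing to another call */
  }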
|
@ -66,7 +66,7 @@ typedef struct leveldb_snapshot_t leveldb_snapshot_t;
|
|||
typedef struct leveldb_writablefile_t leveldb_writablefile_t;
|
||||
typedef struct leveldb_writebatch_t leveldb_writebatch_t;
|
||||
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;
|
||||
|
||||
typedef struct leveldb_keymetadata_t leveldb_keymetadata_t;
|
||||
/* DB operations */
|
||||
|
||||
extern leveldb_t* leveldb_open(
|
||||
|
@ -83,6 +83,14 @@ extern void leveldb_put(
|
|||
const char* val, size_t vallen,
|
||||
char** errptr);
|
||||
|
||||
extern void leveldb_put2(
|
||||
leveldb_t* db,
|
||||
const leveldb_writeoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
const char* val, size_t vallen,
|
||||
char** errptr,
|
||||
const leveldb_keymetadata_t * metadata);
|
||||
|
||||
extern void leveldb_delete(
|
||||
leveldb_t* db,
|
||||
const leveldb_writeoptions_t* options,
|
||||
|
@ -104,6 +112,14 @@ extern char* leveldb_get(
|
|||
size_t* vallen,
|
||||
char** errptr);
|
||||
|
||||
extern char* leveldb_get2(
|
||||
leveldb_t* db,
|
||||
const leveldb_readoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
size_t* vallen,
|
||||
char** errptr,
|
||||
leveldb_keymetadata_t * metadata);
|
||||
|
||||
extern leveldb_iterator_t* leveldb_create_iterator(
|
||||
leveldb_t* db,
|
||||
const leveldb_readoptions_t* options);
|
||||
|
@ -156,6 +172,7 @@ extern void leveldb_iter_next(leveldb_iterator_t*);
|
|||
extern void leveldb_iter_prev(leveldb_iterator_t*);
|
||||
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
|
||||
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
|
||||
extern const void leveldb_iter_keymetadata(const leveldb_iterator_t *, leveldb_keymetadata_t *);
|
||||
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
|
||||
|
||||
/* Write batch */
|
||||
|
@ -167,13 +184,19 @@ extern void leveldb_writebatch_put(
|
|||
leveldb_writebatch_t*,
|
||||
const char* key, size_t klen,
|
||||
const char* val, size_t vlen);
|
||||
extern void leveldb_writebatch_put2(
|
||||
leveldb_writebatch_t*,
|
||||
const char* key, size_t klen,
|
||||
const char* val, size_t vlen,
|
||||
const leveldb_keymetadata_t * meta);
|
||||
extern void leveldb_writebatch_delete(
|
||||
leveldb_writebatch_t*,
|
||||
const char* key, size_t klen);
|
||||
extern void leveldb_writebatch_iterate(
|
||||
leveldb_writebatch_t*,
|
||||
void* state,
|
||||
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
|
||||
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen,
|
||||
const int & type, const uint64_t & expiry),
|
||||
void (*deleted)(void*, const char* k, size_t klen));
|
||||
|
||||
/* Options */
|
||||
|
@ -192,6 +215,8 @@ extern void leveldb_options_set_error_if_exists(
|
|||
leveldb_options_t*, unsigned char);
|
||||
extern void leveldb_options_set_paranoid_checks(
|
||||
leveldb_options_t*, unsigned char);
|
||||
extern void leveldb_options_set_verify_compactions(
|
||||
leveldb_options_t*, unsigned char);
|
||||
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
|
||||
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
|
||||
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
|
||||
|
@ -199,6 +224,7 @@ extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
|
|||
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
|
||||
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
|
||||
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
|
||||
extern void leveldb_options_set_total_leveldb_mem(leveldb_options_t*, size_t);
|
||||
|
||||
enum {
|
||||
leveldb_no_compression = 0,
|
||||
|
@ -267,20 +293,20 @@ extern void leveldb_cache_destroy(leveldb_cache_t* cache);
|
|||
|
||||
extern leveldb_env_t* leveldb_create_default_env();
|
||||
extern void leveldb_env_destroy(leveldb_env_t*);
|
||||
extern void leveldb_env_shutdown();
|
||||
|
||||
/* Utility */
|
||||
/* Util */
|
||||
|
||||
/* Calls free(ptr).
|
||||
REQUIRES: ptr was malloc()-ed and returned by one of the routines
|
||||
in this file. Note that in certain cases (typically on Windows), you
|
||||
may need to call this routine instead of free(ptr) to dispose of
|
||||
malloc()-ed memory returned by this library. */
|
||||
/**
|
||||
* CAUTION: this call is only for char * objects returned by
|
||||
* functions like leveldb_get and leveldb_property_value.
|
||||
* Also used to release errptr strings.
|
||||
*/
|
||||
extern void leveldb_free(void* ptr);
|
||||
|
||||
/* Return the major version number for this release. */
|
||||
extern int leveldb_major_version();
|
||||
/* Version */
|
||||
|
||||
/* Return the minor version number for this release. */
|
||||
extern int leveldb_major_version();
|
||||
extern int leveldb_minor_version();
|
||||
|
||||
#ifdef __cplusplus
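
The c.h hunks above add metadata-aware twins of the existing C entry points (leveldb_put2, leveldb_get2, leveldb_writebatch_put2) plus leveldb_env_shutdown. A hedged sketch of the new calls: the database path and key/value literals are illustrative, and it assumes leveldb_keymetadata_t is a complete struct somewhere in c.h (only its forward typedef appears in this diff); if it is opaque, pass NULL for the metadata argument instead.

    #include <stddef.h>
    #include <stdio.h>
    #include "leveldb/c.h"

    int main() {
        char* err = NULL;
        leveldb_options_t* opts = leveldb_options_create();
        leveldb_options_set_create_if_missing(opts, 1);
        leveldb_t* db = leveldb_open(opts, "/tmp/c_api_demo", &err);
        if (err != NULL) { fprintf(stderr, "open: %s\n", err); leveldb_free(err); return 1; }

        /* write without metadata: the trailing argument may be NULL (assumption) */
        leveldb_writeoptions_t* wopts = leveldb_writeoptions_create();
        leveldb_put2(db, wopts, "k", 1, "v", 1, &err, NULL);

        /* read back and let the library fill in the key metadata */
        leveldb_readoptions_t* ropts = leveldb_readoptions_create();
        leveldb_keymetadata_t meta;              /* layout not shown in this diff */
        size_t vlen = 0;
        char* val = leveldb_get2(db, ropts, "k", 1, &vlen, &err, &meta);
        if (val != NULL) leveldb_free(val);      /* leveldb_free per the Util comment above */

        leveldb_readoptions_destroy(ropts);
        leveldb_writeoptions_destroy(wopts);
        leveldb_close(db);
        leveldb_options_destroy(opts);
        leveldb_env_shutdown();                  /* new global shutdown hook from this hunk */
        return 0;
    }
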
|
||||
|
|
|
@ -29,6 +29,11 @@ class Cache;
|
|||
// of Cache uses a least-recently-used eviction policy.
|
||||
extern Cache* NewLRUCache(size_t capacity);
|
||||
|
||||
// Riak customization - just like NewLRUCache except the underlying
|
||||
// structure is NOT sharded. Better for file cache.
|
||||
extern Cache* NewLRUCache2(size_t capacity);
|
||||
|
||||
|
||||
class Cache {
|
||||
public:
|
||||
Cache() { }
|
||||
|
@ -81,16 +86,17 @@ class Cache {
|
|||
// its cache keys.
|
||||
virtual uint64_t NewId() = 0;
|
||||
|
||||
// Remove all cache entries that are not actively in use. Memory-constrained
|
||||
// applications may wish to call this method to reduce memory usage.
|
||||
// Default implementation of Prune() does nothing. Subclasses are strongly
|
||||
// encouraged to override the default implementation. A future release of
|
||||
// leveldb may change Prune() to a pure abstract method.
|
||||
virtual void Prune() {}
|
||||
// Return size, if any, of per entry overhead for item placed in cache.
|
||||
// Allows more accurate tracking of "charge" against each cache item.
|
||||
virtual size_t EntryOverheadSize() {return(0);};
|
||||
|
||||
// Return an estimate of the combined charges of all elements stored in the
|
||||
// cache.
|
||||
virtual size_t TotalCharge() const = 0;
|
||||
// Riak specific: Add a reference to cache object to help hold it
|
||||
// in memory
|
||||
virtual void Addref(Handle* e) = 0;
|
||||
|
||||
// Riak specific: walk contents of entire cache, calling functor Acc
|
||||
// with the "value" for each cache entry. Locks cache throughout call.
|
||||
virtual bool WalkCache(class CacheAccumulator & Acc) {return(true);};
|
||||
|
||||
private:
|
||||
void LRU_Remove(Handle* e);
|
||||
|
@ -107,4 +113,4 @@ class Cache {
|
|||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_CACHE_H_
|
||||
#endif // STORAGE_LEVELDB_UTIL_CACHE_H_
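
A minimal sketch of the Riak-specific cache constructor added above. NewLRUCache2 returns an unsharded LRU cache, which the comment recommends for file caching; the capacity value here is illustrative.

    #include "leveldb/cache.h"

    void BuildFileCache() {
        // 64 MB, unsharded; better suited to the file cache per the comment above
        leveldb::Cache* file_cache = leveldb::NewLRUCache2(64 << 20);

        uint64_t id = file_cache->NewId();   // base Cache interface, unchanged
        (void)id;

        delete file_cache;                   // caller owns the cache object
    }
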
|
||||
|
|
|
@ -58,6 +58,10 @@ class Comparator {
|
|||
// must not be deleted.
|
||||
extern const Comparator* BytewiseComparator();
|
||||
|
||||
// Riak specific: cleans up the default comparator to make
|
||||
// valgrind results clean
|
||||
extern void ComparatorShutdown();
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
|
||||
|
|
|
@ -14,7 +14,7 @@ namespace leveldb {
|
|||
|
||||
// Update Makefile if you change these
|
||||
static const int kMajorVersion = 1;
|
||||
static const int kMinorVersion = 20;
|
||||
static const int kMinorVersion = 9;
|
||||
|
||||
struct Options;
|
||||
struct ReadOptions;
|
||||
|
@ -38,6 +38,17 @@ struct Range {
|
|||
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
|
||||
};
|
||||
|
||||
// Abstract holder for a DB value.
|
||||
// This allows callers to manage their own value buffers and have
|
||||
// DB values copied directly into those buffers.
|
||||
class Value {
|
||||
public:
|
||||
virtual Value& assign(const char* data, size_t size) = 0;
|
||||
|
||||
protected:
|
||||
virtual ~Value();
|
||||
};
|
||||
|
||||
// A DB is a persistent ordered map from keys to values.
|
||||
// A DB is safe for concurrent access from multiple threads without
|
||||
// any external synchronization.
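
A hedged sketch of the new leveldb::Value holder introduced above: a caller-owned buffer backed by std::string that the DB copies results into via the Get(..., Value*, ...) overload added further down. The class name is illustrative; Basho's tree may already ship a similar adapter.

    #include <string>
    #include "leveldb/db.h"

    class StringValue : public leveldb::Value {
     public:
      explicit StringValue(std::string& s) : s_(s) {}

      virtual leveldb::Value& assign(const char* data, size_t size) {
        s_.assign(data, size);   // copy straight into the caller's buffer
        return *this;
      }

     private:
      std::string& s_;
    };

    // Usage sketch:  std::string out;  StringValue holder(out);
    //                db->Get(leveldb::ReadOptions(), key, &holder);
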
|
||||
|
@ -60,7 +71,8 @@ class DB {
|
|||
// Note: consider setting options.sync = true.
|
||||
virtual Status Put(const WriteOptions& options,
|
||||
const Slice& key,
|
||||
const Slice& value) = 0;
|
||||
const Slice& value,
|
||||
const KeyMetaData * meta=NULL) = 0;
|
||||
|
||||
// Remove the database entry (if any) for "key". Returns OK on
|
||||
// success, and a non-OK status on error. It is not an error if "key"
|
||||
|
@ -81,7 +93,11 @@ class DB {
|
|||
//
|
||||
// May return some other Status on an error.
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
const Slice& key, std::string* value) = 0;
|
||||
const Slice& key, std::string* value,
|
||||
KeyMetaData * meta=NULL) = 0;
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
const Slice& key, Value* value,
|
||||
KeyMetaData * meta=NULL) = 0;
|
||||
|
||||
// Return a heap-allocated iterator over the contents of the database.
|
||||
// The result of NewIterator() is initially invalid (caller must
|
||||
|
@ -115,8 +131,6 @@ class DB {
|
|||
// about the internal operation of the DB.
|
||||
// "leveldb.sstables" - returns a multi-line string that describes all
|
||||
// of the sstables that make up the db contents.
|
||||
// "leveldb.approximate-memory-usage" - returns the approximate number of
|
||||
// bytes of memory in use by the DB.
|
||||
virtual bool GetProperty(const Slice& property, std::string* value) = 0;
|
||||
|
||||
// For each i in [0,n-1], store in "sizes[i]", the approximate
|
||||
|
@ -142,6 +156,21 @@ class DB {
|
|||
// db->CompactRange(NULL, NULL);
|
||||
virtual void CompactRange(const Slice* begin, const Slice* end) = 0;
|
||||
|
||||
// Riak specific function: Verify that no .sst files overlap
|
||||
// within the levels that expect non-overlapping files. Run
|
||||
// compactions as necessary to correct. Assumes DB opened
|
||||
// with Options.is_repair=true
|
||||
virtual Status VerifyLevels();
|
||||
|
||||
// Riak specific function: Request database check for
|
||||
// available compactions. This is to stimulate retry of
|
||||
// grooming that might have been offered and rejected previously
|
||||
virtual void CheckAvailableCompactions();
|
||||
|
||||
// Riak specific function: Give external code, namely
|
||||
// eleveldb, access to leveldb's logging routines.
|
||||
virtual Logger* GetLogger() const { return NULL; }
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
DB(const DB&);
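
The db.h hunks above widen Put and Get to carry an optional KeyMetaData pointer. A hedged sketch of both directions; KeyMetaData, kTypeValueExplicitExpiry and ExpiryTimeMicros come from the options.h hunk later in this diff, and whether a plain Put honours an explicit expiry also depends on Options::expiry_module, so treat this as illustrative rather than guaranteed behaviour.

    #include <string>
    #include "leveldb/db.h"
    #include "leveldb/options.h"

    leveldb::Status PutWithExpiry(leveldb::DB* db, const leveldb::Slice& key,
                                  const leveldb::Slice& value,
                                  leveldb::ExpiryTimeMicros expiry_micros) {
        leveldb::KeyMetaData meta;
        meta.m_Type = leveldb::kTypeValueExplicitExpiry;  // see ValueType in options.h
        meta.m_Expiry = expiry_micros;                    // microseconds since Epoch, UTC
        return db->Put(leveldb::WriteOptions(), key, value, &meta);
    }

    leveldb::Status GetWithMetadata(leveldb::DB* db, const leveldb::Slice& key,
                                    std::string* value, leveldb::KeyMetaData* meta) {
        // on success the library fills meta with type, sequence and expiry
        return db->Get(leveldb::ReadOptions(), key, value, meta);
    }
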
|
||||
|
|
|
@ -1,25 +0,0 @@
|
|||
// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
|
||||
#define STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
|
||||
|
||||
#include <string>
|
||||
#include "leveldb/env.h"
|
||||
#include "leveldb/status.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
// Dump the contents of the file named by fname in text format to
|
||||
// *dst. Makes a sequence of dst->Append() calls; each call is passed
|
||||
// the newline-terminated text corresponding to a single item found
|
||||
// in the file.
|
||||
//
|
||||
// Returns a non-OK result if fname does not name a leveldb storage
|
||||
// file, or if the file cannot be read.
|
||||
Status DumpFile(Env* env, const std::string& fname, WritableFile* dst);
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
|
|
@ -13,15 +13,19 @@
|
|||
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
|
||||
#define STORAGE_LEVELDB_INCLUDE_ENV_H_
|
||||
|
||||
#include <cstdarg>
|
||||
#include <pthread.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdarg.h>
|
||||
#include <stdint.h>
|
||||
#include "leveldb/perf_count.h"
|
||||
#include "leveldb/status.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
class AppendableFile;
|
||||
class FileLock;
|
||||
struct Options;
|
||||
class Logger;
|
||||
class RandomAccessFile;
|
||||
class SequentialFile;
|
||||
|
@ -40,6 +44,11 @@ class Env {
|
|||
// The result of Default() belongs to leveldb and must never be deleted.
|
||||
static Env* Default();
|
||||
|
||||
// Riak specific: Shutdown background work threads and other objects
|
||||
// to get clean environment for valgrind memory test. No restart supported
|
||||
// after this call. Not thread safe.
|
||||
static void Shutdown();
|
||||
|
||||
// Create a brand new sequentially-readable file with the specified name.
|
||||
// On success, stores a pointer to the new file in *result and returns OK.
|
||||
// On failure stores NULL in *result and returns non-OK. If the file does
|
||||
|
@ -67,22 +76,31 @@ class Env {
|
|||
//
|
||||
// The returned file will only be accessed by one thread at a time.
|
||||
virtual Status NewWritableFile(const std::string& fname,
|
||||
WritableFile** result) = 0;
|
||||
WritableFile** result,
|
||||
size_t map_size) = 0;
|
||||
|
||||
// Create an object that either appends to an existing file, or
|
||||
// writes to a new file (if the file does not exist to begin with).
|
||||
// On success, stores a pointer to the new file in *result and
|
||||
// returns OK. On failure stores NULL in *result and returns
|
||||
// non-OK.
|
||||
// Riak specific:
|
||||
// Derived from NewWritableFile. One change: if the file exists,
|
||||
// move to the end of the file and continue writing.
|
||||
// new file. On success, stores a pointer to the open file in
|
||||
// *result and returns OK. On failure stores NULL in *result and
|
||||
// returns non-OK.
|
||||
//
|
||||
// The returned file will only be accessed by one thread at a time.
|
||||
//
|
||||
// May return an IsNotSupportedError error if this Env does
|
||||
// not allow appending to an existing file. Users of Env (including
|
||||
// the leveldb implementation) must be prepared to deal with
|
||||
// an Env that does not support appending.
|
||||
virtual Status NewAppendableFile(const std::string& fname,
|
||||
WritableFile** result);
|
||||
WritableFile** result,
|
||||
size_t map_size) = 0;
|
||||
|
||||
// Riak specific:
|
||||
// Allows for virtualized version of NewWritableFile that enables write
|
||||
// and close operations to execute on background threads
|
||||
// (where platform supported).
|
||||
//
|
||||
// The returned file will only be accessed by one thread at a time.
|
||||
virtual Status NewWriteOnlyFile(const std::string& fname,
|
||||
WritableFile** result,
|
||||
size_t map_size)
|
||||
{return(NewWritableFile(fname, result, map_size));};
|
||||
|
||||
// Returns true iff the named file exists.
|
||||
virtual bool FileExists(const std::string& fname) = 0;
|
||||
|
@ -142,7 +160,7 @@ class Env {
|
|||
|
||||
// Start a new thread, invoking "function(arg)" within the new thread.
|
||||
// When "function(arg)" returns, the thread will be destroyed.
|
||||
virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
|
||||
virtual pthread_t StartThread(void (*function)(void* arg), void* arg) = 0;
|
||||
|
||||
// *path is set to a temporary directory that can be used for testing. It may
|
||||
// or many not have just been created. The directory may or may not differ
|
||||
|
@ -157,9 +175,16 @@ class Env {
|
|||
// useful for computing deltas of time.
|
||||
virtual uint64_t NowMicros() = 0;
|
||||
|
||||
// Sleep/delay the thread for the prescribed number of micro-seconds.
|
||||
// Sleep/delay the thread for the perscribed number of micro-seconds.
|
||||
virtual void SleepForMicroseconds(int micros) = 0;
|
||||
|
||||
// Riak specific: Get object that is tracking various software counters
|
||||
virtual PerformanceCounters * GetPerformanceCounters() {return(gPerfCounters);};
|
||||
|
||||
// Riak specific: Request size of recovery memory map, potentially using
|
||||
// Options data for the decision. Default 2Mbyte is Google's original size.
|
||||
virtual size_t RecoveryMmapSize(const struct Options *) const {return(2*1024*1024L);};
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
Env(const Env&);
|
||||
|
@ -190,14 +215,6 @@ class SequentialFile {
|
|||
//
|
||||
// REQUIRES: External synchronization
|
||||
virtual Status Skip(uint64_t n) = 0;
|
||||
|
||||
// Get a name for the file, only for error reporting
|
||||
virtual std::string GetName() const = 0;
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
SequentialFile(const SequentialFile&);
|
||||
void operator=(const SequentialFile&);
|
||||
};
|
||||
|
||||
// A file abstraction for randomly reading the contents of a file.
|
||||
|
@ -218,13 +235,11 @@ class RandomAccessFile {
|
|||
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
||||
char* scratch) const = 0;
|
||||
|
||||
// Get a name for the file, only for error reporting
|
||||
virtual std::string GetName() const = 0;
|
||||
// Riak optimization: allows advising Linux page cache
|
||||
virtual void SetForCompaction(uint64_t file_size) {};
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
RandomAccessFile(const RandomAccessFile&);
|
||||
void operator=(const RandomAccessFile&);
|
||||
// Riak addition: size of this structure in bytes
|
||||
virtual size_t ObjectSize() {return(sizeof(RandomAccessFile));};
|
||||
};
|
||||
|
||||
// A file abstraction for sequential writing. The implementation
|
||||
|
@ -240,8 +255,10 @@ class WritableFile {
|
|||
virtual Status Flush() = 0;
|
||||
virtual Status Sync() = 0;
|
||||
|
||||
// Get a name for the file, only for error reporting
|
||||
virtual std::string GetName() const = 0;
|
||||
// Riak specific:
|
||||
// Provide hint where key/value data ends and metadata starts
|
||||
// in an .sst table file.
|
||||
virtual void SetMetadataOffset(uint64_t) {};
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
|
@ -249,12 +266,30 @@ class WritableFile {
|
|||
void operator=(const WritableFile&);
|
||||
};
|
||||
|
||||
// A file abstraction for sequential writing at end of existing file.
|
||||
class AppendableFile: public WritableFile {
|
||||
public:
|
||||
AppendableFile() { }
|
||||
virtual ~AppendableFile();
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
AppendableFile(const AppendableFile&);
|
||||
void operator=(const AppendableFile&);
|
||||
};
|
||||
|
||||
// An interface for writing log messages.
|
||||
class Logger {
|
||||
public:
|
||||
Logger() { }
|
||||
virtual ~Logger();
|
||||
|
||||
// Riak specific function for hot backup.
|
||||
// hot_backup.cc assumes that it can rotate the LOG file
|
||||
// via standard Env routines if this function returns a
|
||||
// non-zero value.
|
||||
virtual long LogSize() {return(0);};
|
||||
|
||||
// Write an entry to the log file with the specified format.
|
||||
virtual void Logv(const char* format, va_list ap) = 0;
|
||||
|
||||
|
@ -310,11 +345,14 @@ class EnvWrapper : public Env {
|
|||
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
|
||||
return target_->NewRandomAccessFile(f, r);
|
||||
}
|
||||
Status NewWritableFile(const std::string& f, WritableFile** r) {
|
||||
return target_->NewWritableFile(f, r);
|
||||
Status NewWritableFile(const std::string& f, WritableFile** r, size_t s=0) {
|
||||
return target_->NewWritableFile(f, r, s);
|
||||
}
|
||||
Status NewAppendableFile(const std::string& f, WritableFile** r) {
|
||||
return target_->NewAppendableFile(f, r);
|
||||
Status NewAppendableFile(const std::string& f, WritableFile** r, size_t s=0) {
|
||||
return target_->NewAppendableFile(f, r, s);
|
||||
}
|
||||
Status NewWriteOnlyFile(const std::string& f, WritableFile** r, size_t s=0) {
|
||||
return target_->NewWriteOnlyFile(f, r, s);
|
||||
}
|
||||
bool FileExists(const std::string& f) { return target_->FileExists(f); }
|
||||
Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
|
||||
|
@ -336,7 +374,7 @@ class EnvWrapper : public Env {
|
|||
void Schedule(void (*f)(void*), void* a) {
|
||||
return target_->Schedule(f, a);
|
||||
}
|
||||
void StartThread(void (*f)(void*), void* a) {
|
||||
pthread_t StartThread(void (*f)(void*), void* a) {
|
||||
return target_->StartThread(f, a);
|
||||
}
|
||||
virtual Status GetTestDirectory(std::string* path) {
|
||||
|
@ -355,6 +393,12 @@ class EnvWrapper : public Env {
|
|||
Env* target_;
|
||||
};
|
||||
|
||||
// Riak specific hack to allow runtime change
|
||||
// of mapping size
|
||||
extern volatile size_t gMapSize;
|
||||
|
||||
extern bool gFadviseWillNeed;
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_
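
Several of the Riak additions above exist purely to make valgrind runs clean: Env::Shutdown() here, ComparatorShutdown() in comparator.h earlier, and ExpiryModule::ShutdownExpiryModule() in the new expiry.h below. A hedged sketch of a teardown sequence; the ordering is an assumption, the headers only state that Shutdown() is final, not restartable and not thread safe.

    #include "leveldb/comparator.h"
    #include "leveldb/db.h"
    #include "leveldb/env.h"
    #include "leveldb/expiry.h"

    void CloseForValgrind(leveldb::DB*& db) {
        delete db;                                      // close the database first
        db = NULL;
        leveldb::ExpiryModule::ShutdownExpiryModule();  // global expiry objects
        leveldb::ComparatorShutdown();                  // default comparator singleton
        leveldb::Env::Shutdown();                       // background threads; no restart after this
    }
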
|
||||
|
|
src/leveldb/include/leveldb/expiry.h (new file, 135 lines)
|
@ -0,0 +1,135 @@
|
|||
// -------------------------------------------------------------------
|
||||
//
|
||||
// expiry.h: background expiry management for Basho's modified leveldb
|
||||
//
|
||||
// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
|
||||
//
|
||||
// This file is provided to you under the Apache License,
|
||||
// Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain
|
||||
// a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
//
|
||||
// -------------------------------------------------------------------
|
||||
|
||||
#ifndef EXPIRY_H
|
||||
#define EXPIRY_H
|
||||
|
||||
#include <limits.h>
|
||||
#include <stdint.h>
|
||||
#include "leveldb/env.h"
|
||||
#include "leveldb/options.h"
|
||||
#include "util/refobject_base.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
class Compaction;
|
||||
class Logger;
|
||||
struct ParsedInternalKey;
|
||||
class Slice;
|
||||
class SstCounters;
|
||||
class Version;
|
||||
class VersionEdit;
|
||||
struct FileMetaData;
|
||||
|
||||
|
||||
enum EleveldbRouterActions_t
|
||||
{
|
||||
eGetBucketProperties=1
|
||||
}; // enum EleveldbRouterActions_t
|
||||
|
||||
|
||||
typedef bool (* EleveldbRouter_t)(EleveldbRouterActions_t Action, int ParamCount, const void ** Params);
|
||||
|
||||
|
||||
class ExpiryModule : public RefObjectBase
|
||||
{
|
||||
public:
|
||||
virtual ~ExpiryModule() {};
|
||||
|
||||
// Print expiry options to LOG file
|
||||
virtual void Dump(Logger * log) const
|
||||
{Log(log," Expiry: (none)");};
|
||||
|
||||
// Quick test to allow manifest logic and such know if
|
||||
// extra expiry logic should be checked
|
||||
virtual bool ExpiryActivated() const {return(false);};
|
||||
|
||||
// db/write_batch.cc MemTableInserter::Put() calls this.
|
||||
// returns false on internal error
|
||||
virtual bool MemTableInserterCallback(
|
||||
const Slice & Key, // input: user's key about to be written
|
||||
const Slice & Value, // input: user's value object
|
||||
ValueType & ValType, // input/output: key type. call might change
|
||||
ExpiryTimeMicros & Expiry) const // input/output: 0 or specific expiry. call might change
|
||||
{return(true);};
|
||||
|
||||
// db/dbformat.cc KeyRetirement::operator() calls this.
|
||||
// db/version_set.cc SaveValue() calls this too.
|
||||
// returns true if key is expired, returns false if key not expired
|
||||
virtual bool KeyRetirementCallback(
|
||||
const ParsedInternalKey & Ikey) const
|
||||
{return(false);};
|
||||
|
||||
// table/table_builder.cc TableBuilder::Add() calls this.
|
||||
// returns false on internal error
|
||||
virtual bool TableBuilderCallback(
|
||||
const Slice & Key, // input: internal key
|
||||
SstCounters & Counters) const // input/output: counters for new sst table
|
||||
{return(true);};
|
||||
|
||||
// db/memtable.cc MemTable::Get() calls this.
|
||||
// returns true if type/expiry is expired, returns false if not expired
|
||||
virtual bool MemTableCallback(
|
||||
const Slice & Key) const // input: leveldb internal key
|
||||
{return(false);};
|
||||
|
||||
// db/version_set.cc VersionSet::Finalize() calls this if no
|
||||
// other compaction selected for a level
|
||||
// returns true if there is an expiry compaction eligible
|
||||
virtual bool CompactionFinalizeCallback(
|
||||
bool WantAll, // input: true - examine all expired files
|
||||
const Version & Ver, // input: database state for examination
|
||||
int Level, // input: level to review for expiry
|
||||
VersionEdit * Edit) const // output: NULL or destination of delete list
|
||||
{return(false);};
|
||||
|
||||
// yep, sometimes we want to expiry this expiry module object.
|
||||
// mostly for bucket level properties in Riak EE
|
||||
virtual uint64_t ExpiryModuleExpiryMicros() {return(0);};
|
||||
|
||||
// Creates derived ExpiryModule object that matches compile time
|
||||
// switch for open source or Basho enterprise edition features.
|
||||
static ExpiryModule * CreateExpiryModule(EleveldbRouter_t Router);
|
||||
|
||||
// Cleans up global objects related to expiry
|
||||
// switch for open source or Basho enterprise edition features.
|
||||
static void ShutdownExpiryModule();
|
||||
|
||||
// Riak EE: stash a user created module with settings
|
||||
virtual void NoteUserExpirySettings() {};
|
||||
|
||||
protected:
|
||||
ExpiryModule() {};
|
||||
|
||||
private:
|
||||
ExpiryModule(const ExpiryModule &);
|
||||
ExpiryModule & operator=(const ExpiryModule &);
|
||||
|
||||
}; // ExpiryModule
|
||||
|
||||
|
||||
typedef RefPtr<class ExpiryModule> ExpiryPtr_t;
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // ifndef
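
A hedged sketch of a custom expiry policy built on the interface above: every plain write is stamped with an explicit expiry a fixed interval in the future. Only two of the callbacks are overridden; the wiring comment at the end assumes ExpiryPtr_t (a RefPtr) accepts assignment from a raw pointer, which this hunk does not show.

    #include <stdint.h>
    #include "leveldb/expiry.h"
    #include "leveldb/options.h"

    class FixedTtlExpiry : public leveldb::ExpiryModule {
     public:
      FixedTtlExpiry(leveldb::Env* env, uint64_t ttl_micros)
          : env_(env), ttl_micros_(ttl_micros) {}

      virtual bool ExpiryActivated() const { return true; }

      // called from MemTableInserter::Put(); may rewrite type and expiry
      virtual bool MemTableInserterCallback(const leveldb::Slice& /*key*/,
                                            const leveldb::Slice& /*value*/,
                                            leveldb::ValueType& type,
                                            leveldb::ExpiryTimeMicros& expiry) const {
        if (leveldb::kTypeValue == type) {
          type = leveldb::kTypeValueExplicitExpiry;
          expiry = env_->NowMicros() + ttl_micros_;
        }
        return true;   // false would signal an internal error
      }

     private:
      leveldb::Env* env_;
      uint64_t ttl_micros_;
    };

    // Wiring sketch (assumed RefPtr assignment semantics):
    //   leveldb::Options options;
    //   options.expiry_module = new FixedTtlExpiry(leveldb::Env::Default(), ttl_micros);
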
|
||||
|
|
@ -23,9 +23,21 @@ namespace leveldb {
|
|||
class Slice;
|
||||
|
||||
class FilterPolicy {
|
||||
public:
|
||||
protected:
|
||||
mutable const FilterPolicy * m_Next; // used by FilterInventory
|
||||
|
||||
public:
|
||||
FilterPolicy()
|
||||
: m_Next(NULL)
|
||||
{};
|
||||
|
||||
virtual ~FilterPolicy();
|
||||
|
||||
// list pointer accessors
|
||||
const FilterPolicy * GetNext() const {return(m_Next);};
|
||||
void SetNext(const FilterPolicy * Next) const {m_Next=Next;};
|
||||
|
||||
|
||||
// Return the name of this policy. Note that if the filter encoding
|
||||
// changes in an incompatible way, the name returned by this method
|
||||
// must be changed. Otherwise, old incompatible filters may be
|
||||
|
@ -47,6 +59,7 @@ class FilterPolicy {
|
|||
// This method may return true or false if the key was not on the
|
||||
// list, but it should aim to return false with a high probability.
|
||||
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
|
||||
|
||||
};
|
||||
|
||||
// Return a new filter policy that uses a bloom filter with approximately
|
||||
|
@ -64,7 +77,29 @@ class FilterPolicy {
|
|||
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
|
||||
// trailing spaces in keys.
|
||||
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
|
||||
extern const FilterPolicy* NewBloomFilterPolicy2(int bits_per_key);
|
||||
|
||||
}
|
||||
|
||||
class FilterInventory
|
||||
{
|
||||
public:
|
||||
// MUST be static variable so that it initializes before any static objects
|
||||
// have their initializers called
|
||||
static const FilterPolicy * ListHead;
|
||||
|
||||
// This might be called prior to singleton FilterInventory object
|
||||
// being initialized. NOT THREAD SAFE.
|
||||
static void AddFilterToInventory(const FilterPolicy * Filter)
|
||||
{
|
||||
if (NULL!=Filter)
|
||||
{
|
||||
Filter->SetNext(ListHead);
|
||||
ListHead=Filter;
|
||||
} // if
|
||||
return;
|
||||
}
|
||||
}; // class FilterInventory
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
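
A minimal sketch of the filter inventory added above: register the second-generation bloom filter once at start-up so that .sst files written with it can be matched when reopened. AddFilterToInventory is documented as not thread safe, so this is assumed to run before any database is opened; 16 bits per key is an illustrative choice.

    #include "leveldb/filter_policy.h"

    void RegisterFilters() {
        const leveldb::FilterPolicy* bloom = leveldb::NewBloomFilterPolicy2(16);
        leveldb::FilterInventory::AddFilterToInventory(bloom);
        // Ownership note (assumption): the inventory keeps a raw pointer,
        // so the policy object must outlive every table that references it.
    }
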
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
|
||||
#include "leveldb/slice.h"
|
||||
#include "leveldb/status.h"
|
||||
#include "leveldb/options.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
|
@ -37,7 +38,7 @@ class Iterator {
|
|||
// Valid() after this call iff the source is not empty.
|
||||
virtual void SeekToLast() = 0;
|
||||
|
||||
// Position at the first key in the source that is at or past target.
|
||||
// Position at the first key in the source that at or past target
|
||||
// The iterator is Valid() after this call iff the source contains
|
||||
// an entry that comes at or past target.
|
||||
virtual void Seek(const Slice& target) = 0;
|
||||
|
@ -61,9 +62,13 @@ class Iterator {
|
|||
// Return the value for the current entry. The underlying storage for
|
||||
// the returned slice is valid only until the next modification of
|
||||
// the iterator.
|
||||
// REQUIRES: Valid()
|
||||
// REQUIRES: !AtEnd() && !AtStart()
|
||||
virtual Slice value() const = 0;
|
||||
|
||||
// Riak specific: if a database iterator, returns key meta data
|
||||
// REQUIRES: Valid()
|
||||
virtual KeyMetaData & keymetadata() const {return(keymetadata_); };
|
||||
|
||||
// If an error has occurred, return it. Else return an ok status.
|
||||
virtual Status status() const = 0;
|
||||
|
||||
|
@ -75,6 +80,10 @@ class Iterator {
|
|||
typedef void (*CleanupFunction)(void* arg1, void* arg2);
|
||||
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
|
||||
|
||||
protected:
|
||||
// mutable so reusable by derived classes
|
||||
mutable KeyMetaData keymetadata_;
|
||||
|
||||
private:
|
||||
struct Cleanup {
|
||||
CleanupFunction function;
|
||||
|
|
|
@ -6,15 +6,23 @@
|
|||
#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
class Cache;
|
||||
class Comparator;
|
||||
class Env;
|
||||
class ExpiryModule;
|
||||
class FilterPolicy;
|
||||
class Logger;
|
||||
class Snapshot;
|
||||
namespace log
|
||||
{
|
||||
class Writer;
|
||||
} // namespace log
|
||||
|
||||
// DB contents are stored in a set of blocks, each of which holds a
|
||||
// sequence of key,value pairs. Each block may be compressed before
|
||||
|
@ -24,9 +32,34 @@ enum CompressionType {
|
|||
// NOTE: do not change the values of existing entries, as these are
|
||||
// part of the persistent format on disk.
|
||||
kNoCompression = 0x0,
|
||||
kSnappyCompression = 0x1
|
||||
kSnappyCompression = 0x1,
|
||||
kLZ4Compression = 0x2,
|
||||
kNoCompressionAutomated = 0x3
|
||||
};
|
||||
|
||||
// Originally located in db/dbformat.h. Now available publicly.
|
||||
// Value types encoded as the last component of internal keys.
|
||||
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
|
||||
// data structures.
|
||||
enum ValueType {
|
||||
kTypeDeletion = 0x0,
|
||||
kTypeValue = 0x1,
|
||||
kTypeValueWriteTime = 0x2,
|
||||
kTypeValueExplicitExpiry = 0x3
|
||||
};
|
||||
|
||||
// Originally located in db/dbformat.h
|
||||
typedef uint64_t SequenceNumber;
|
||||
typedef uint64_t ExpiryTimeMicros;
|
||||
|
||||
}; // namespace leveldb
|
||||
|
||||
//
|
||||
// must follow ValueType declaration
|
||||
#include "leveldb/expiry.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
// Options to control the behavior of a database (passed to DB::Open)
|
||||
struct Options {
|
||||
// -------------------
|
||||
|
@ -56,6 +89,14 @@ struct Options {
|
|||
// Default: false
|
||||
bool paranoid_checks;
|
||||
|
||||
// Riak specific: this variable replaces paranoid_checks at one
|
||||
// one place in the code. This variable alone controls whether or not
|
||||
// compaction read operations check CRC values. Riak needs
|
||||
// the compaction CRC check, but not other paranoid_checks ... so
|
||||
// this independent control.
|
||||
// Default: true
|
||||
bool verify_compactions;
|
||||
|
||||
// Use the specified object to interact with the environment,
|
||||
// e.g. to read/write files, schedule background work, etc.
|
||||
// Default: Env::Default()
|
||||
|
@ -85,7 +126,7 @@ struct Options {
|
|||
// Number of open files that can be used by the DB. You may need to
|
||||
// increase this if your database has a large working set (budget
|
||||
// one open file per 2MB of working set).
|
||||
//
|
||||
// RIAK: NO LONGER USED
|
||||
// Default: 1000
|
||||
int max_open_files;
|
||||
|
||||
|
@ -105,6 +146,15 @@ struct Options {
|
|||
// Default: 4K
|
||||
size_t block_size;
|
||||
|
||||
// Riak specific: non-zero value activates code to automatically
|
||||
// increase block_size as needed to ensure maximum number of files
|
||||
// are available in the file cache. The value indicates how many
|
||||
// incremental increases to use between the original block_size
|
||||
// and largest, reasonable block_size.
|
||||
//
|
||||
// Default: 16
|
||||
int block_size_steps;
|
||||
|
||||
// Number of keys between restart points for delta encoding of keys.
|
||||
// This parameter can be changed dynamically. Most clients should
|
||||
// leave this parameter alone.
|
||||
|
@ -112,18 +162,6 @@ struct Options {
|
|||
// Default: 16
|
||||
int block_restart_interval;
|
||||
|
||||
// Leveldb will write up to this amount of bytes to a file before
|
||||
// switching to a new one.
|
||||
// Most clients should leave this parameter alone. However if your
|
||||
// filesystem is more efficient with larger files, you could
|
||||
// consider increasing the value. The downside will be longer
|
||||
// compactions and hence longer latency/performance hiccups.
|
||||
// Another reason to increase this parameter might be when you are
|
||||
// initially populating a large database.
|
||||
//
|
||||
// Default: 2MB
|
||||
size_t max_file_size;
|
||||
|
||||
// Compress blocks using the specified compression algorithm. This
|
||||
// parameter can be changed dynamically.
|
||||
//
|
||||
|
@ -140,12 +178,6 @@ struct Options {
|
|||
// efficiently detect that and will switch to uncompressed mode.
|
||||
CompressionType compression;
|
||||
|
||||
// EXPERIMENTAL: If true, append to existing MANIFEST and log files
|
||||
// when a database is opened. This can significantly speed up open.
|
||||
//
|
||||
// Default: currently false, but may become true later.
|
||||
bool reuse_logs;
|
||||
|
||||
// If non-NULL, use the specified filter policy to reduce disk reads.
|
||||
// Many applications will benefit from passing the result of
|
||||
// NewBloomFilterPolicy() here.
|
||||
|
@ -153,8 +185,84 @@ struct Options {
|
|||
// Default: NULL
|
||||
const FilterPolicy* filter_policy;
|
||||
|
||||
// Riak specific flag used to indicate when database is open
|
||||
// as part of a Repair operation. Default is false
|
||||
bool is_repair;
|
||||
|
||||
// Riak specific flag to mark Riak internal database versus
|
||||
// user database. (User database gets larger cache resources.)
|
||||
bool is_internal_db;
|
||||
|
||||
// Riak replacement for max_open_files and block_cache. This is
|
||||
// TOTAL memory to be used by leveldb across ALL DATABASES.
|
||||
// Most recent value seen upon database open, wins. Zero for default.
|
||||
uint64_t total_leveldb_mem;
|
||||
|
||||
// Riak specific option specifying block cache space that cannot
|
||||
// be released for page cache use. The space may still be
|
||||
// released for file cache.
|
||||
uint64_t block_cache_threshold;
|
||||
|
||||
// Riak option to override most memory modeling and create
|
||||
// smaller memory footprint for developers. Helps when
|
||||
// running large number of databases and multiple VMs. Do
|
||||
// NOT use this option if making performance measurements.
|
||||
// Default: false
|
||||
bool limited_developer_mem;
|
||||
|
||||
// The size of each MMAped file, choose 0 for the default (20M)
|
||||
uint64_t mmap_size;
|
||||
|
||||
// Riak option to adjust aggressive delete behavior.
|
||||
// - zero disables aggressive delete
|
||||
// - positive value indicates how many deletes must exist
|
||||
// in a file for it to be compacted due to deletes
|
||||
uint64_t delete_threshold;
|
||||
|
||||
// Riak specific flag used to indicate when fadvise() management
|
||||
// should default to WILLNEED instead of DONTNEED. Default is false
|
||||
bool fadvise_willneed;
|
||||
|
||||
// *****
|
||||
// Riak specific options for establishing two tiers of disk arrays.
|
||||
// All three tier options must be valid for the option to activate.
|
||||
// When active, leveldb directories are constructed using either
|
||||
// the fast or slow prefix followed by the database name given
|
||||
// in the DB::Open call. (a synonym for "prefix" is "mount")
|
||||
// *****
|
||||
|
||||
// Riak specific option setting the level number at which the
|
||||
// "tiered_slow_prefix" should be used. Default is zero which
|
||||
// disables the option. Valid values are 1 to 6. 3 or 4 recommended.
|
||||
unsigned tiered_slow_level;
|
||||
|
||||
// Riak specific option with the path prefix used for "fast" disk
|
||||
// array. levels 0 to tiered_slow_level-1 use this path prefix
|
||||
std::string tiered_fast_prefix;
|
||||
|
||||
// Riak specific option with the path prefix used for "slow" disk
|
||||
// array. levels tiered_slow_level through 6 use this path prefix
|
||||
std::string tiered_slow_prefix;
|
||||
|
||||
// Riak specific option that writes a list of open table files
|
||||
// to disk on close then automatically opens same files again
|
||||
// upon restart.
|
||||
bool cache_object_warming;
|
||||
|
||||
// Riak specific object that defines expiry policy for data
|
||||
// written to leveldb.
|
||||
ExpiryPtr_t expiry_module;
|
||||
|
||||
// Create an Options object with default values for all fields.
|
||||
Options();
|
||||
|
||||
void Dump(Logger * log) const;
|
||||
|
||||
bool ExpiryActivated() const
|
||||
{return(NULL!=expiry_module.get() && expiry_module->ExpiryActivated());};
|
||||
|
||||
private:
|
||||
|
||||
};
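
A hedged sketch pulling together the Riak-specific Options fields from the hunk above: a global memory budget, two-tier disk prefixes, cache warming and an expiry module. Concrete values and mount paths are illustrative, and the expiry_module assignment assumes the RefPtr wrapper accepts a raw pointer.

    #include "leveldb/db.h"
    #include "leveldb/expiry.h"
    #include "leveldb/options.h"

    leveldb::Status OpenTieredDb(leveldb::DB** db) {
        leveldb::Options options;
        options.create_if_missing = true;
        options.total_leveldb_mem = 2ULL * 1024 * 1024 * 1024;  // budget across ALL databases
        options.tiered_slow_level = 4;                          // levels 0-3 stay on the fast array
        options.tiered_fast_prefix = "/mnt/fast";               // illustrative mount points
        options.tiered_slow_prefix = "/mnt/slow";
        options.cache_object_warming = true;
        options.expiry_module = leveldb::ExpiryModule::CreateExpiryModule(NULL);  // assumed RefPtr assignment

        // With tiering active, the name below is appended to the prefixes.
        return leveldb::DB::Open(options, "riak_data", db);
    }
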
|
||||
|
||||
// Options that control read operations
|
||||
|
@ -171,16 +279,57 @@ struct ReadOptions {
|
|||
|
||||
// If "snapshot" is non-NULL, read as of the supplied snapshot
|
||||
// (which must belong to the DB that is being read and which must
|
||||
// not have been released). If "snapshot" is NULL, use an implicit
|
||||
// not have been released). If "snapshot" is NULL, use an impliicit
|
||||
// snapshot of the state at the beginning of this read operation.
|
||||
// Default: NULL
|
||||
const Snapshot* snapshot;
|
||||
|
||||
// Riak specific flag, currently used within Erlang adaptor
|
||||
// to enable automatic delete and new of fresh snapshot
|
||||
// and database iterator objects for long running iterations
|
||||
// (only supports iterator NEXT operations).
|
||||
// Default: false
|
||||
bool iterator_refresh;
|
||||
|
||||
ReadOptions()
|
||||
: verify_checksums(false),
|
||||
: verify_checksums(true),
|
||||
fill_cache(true),
|
||||
snapshot(NULL) {
|
||||
snapshot(NULL),
|
||||
iterator_refresh(false),
|
||||
is_compaction(false),
|
||||
env(NULL),
|
||||
info_log(NULL)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
// accessors to the private data
|
||||
bool IsCompaction() const {return(is_compaction);};
|
||||
|
||||
Logger * GetInfoLog() const {return(info_log);};
|
||||
|
||||
const std::string & GetDBName() const {return(dbname);};
|
||||
|
||||
Env * GetEnv() const {return(env);};
|
||||
|
||||
// The items below are internal options, not for external manipulation.
|
||||
// They are populated by VersionSet::MakeInputIterator only during compaction operations
|
||||
private:
|
||||
friend class VersionSet;
|
||||
|
||||
// true when used on background compaction
|
||||
bool is_compaction;
|
||||
|
||||
// Database name for potential creation of bad blocks file
|
||||
std::string dbname;
|
||||
|
||||
// Needed for file operations if creating bad blocks file
|
||||
Env * env;
|
||||
|
||||
// Open log file for error notifications
|
||||
// Only valid when is_compaction==true
|
||||
Logger* info_log;
|
||||
|
||||
};
|
||||
|
||||
// Options that control write operations
|
||||
|
@ -208,6 +357,22 @@ struct WriteOptions {
|
|||
}
|
||||
};
|
||||
|
||||
|
||||
// Riak specific object that can return key metadata
|
||||
// during get or iterate operation
|
||||
struct KeyMetaData
|
||||
{
|
||||
ValueType m_Type; // see above
|
||||
SequenceNumber m_Sequence; // output only, leveldb internal
|
||||
ExpiryTimeMicros m_Expiry; // microseconds since Epoch, UTC
|
||||
|
||||
KeyMetaData()
|
||||
: m_Type(kTypeValue), m_Sequence(0), m_Expiry(0)
|
||||
{};
|
||||
}; // struct KeyMetaData
|
||||
|
||||
const char * CompileOptionsString();
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
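
A hedged sketch of a long-running scan using the new ReadOptions flag and the per-key metadata defined just above. keymetadata() comes from the iterator.h hunk earlier in this diff; iterator_refresh is described as an Erlang-adaptor feature, so using it from plain C++ here is an assumption.

    #include "leveldb/db.h"
    #include "leveldb/options.h"

    void ScanWithExpiry(leveldb::DB* db) {
        leveldb::ReadOptions ro;
        ro.fill_cache = false;        // bulk scan: do not churn the block cache
        ro.iterator_refresh = true;   // snapshot/iterator refresh on long NEXT-only scans

        leveldb::Iterator* it = db->NewIterator(ro);
        for (it->SeekToFirst(); it->Valid(); it->Next()) {
            const leveldb::KeyMetaData& meta = it->keymetadata();
            if (meta.m_Type == leveldb::kTypeValueExplicitExpiry && meta.m_Expiry != 0) {
                // key carries an explicit expiry, microseconds since Epoch, UTC
            }
        }
        delete it;
    }
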
|
||||
|
|
src/leveldb/include/leveldb/perf_count.h (new file, 329 lines)
|
@ -0,0 +1,329 @@
|
|||
// -------------------------------------------------------------------
|
||||
//
|
||||
// perf_count.h: performance counters LevelDB
|
||||
//
|
||||
// Copyright (c) 2012-2016 Basho Technologies, Inc. All Rights Reserved.
|
||||
//
|
||||
// This file is provided to you under the Apache License,
|
||||
// Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain
|
||||
// a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
//
|
||||
// -------------------------------------------------------------------
|
||||
|
||||
#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
|
||||
#define STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include "leveldb/status.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
enum SstCountEnum
|
||||
{
|
||||
//
|
||||
// array index values/names
|
||||
//
|
||||
eSstCountKeys=0, //!< how many keys in this sst
|
||||
eSstCountBlocks=1, //!< how many blocks in this sst
|
||||
eSstCountCompressAborted=2,//!< how many blocks attempted compression and aborted use
|
||||
eSstCountKeySize=3, //!< byte count of all keys
|
||||
eSstCountValueSize=4, //!< byte count of all values
|
||||
eSstCountBlockSize=5, //!< byte count of all blocks (pre-compression)
|
||||
eSstCountBlockWriteSize=6, //!< post-compression size, or BlockSize if no compression
|
||||
eSstCountIndexKeys=7, //!< how many keys in the index block
|
||||
eSstCountKeyLargest=8, //!< largest key in sst
|
||||
eSstCountKeySmallest=9, //!< smallest key in sst
|
||||
eSstCountValueLargest=10, //!< largest value in sst
|
||||
eSstCountValueSmallest=11, //!< smallest value in sst
|
||||
eSstCountDeleteKey=12, //!< tombstone count
|
||||
eSstCountBlockSizeUsed=13, //!< Options::block_size used with this file
|
||||
eSstCountUserDataSize=14, //!< post-compression size of non-metadata (user keys/values/block overhead)
|
||||
eSstCountExpiry1=15, //!< undocumented expiry counter 1
|
||||
eSstCountExpiry2=16, //!< undocumented expiry counter 2
|
||||
eSstCountExpiry3=17, //!< undocumented expiry counter 3
|
||||
eSstCountSequence=18, //!< highest sequence number in file
|
||||
|
||||
// must follow last index name to represent size of array
|
||||
eSstCountEnumSize, //!< size of the array described by the enum values
|
||||
|
||||
eSstCountVersion=1
|
||||
|
||||
}; // enum SstCountEnum
|
||||
|
||||
|
||||
class SstCounters
|
||||
{
|
||||
protected:
|
||||
bool m_IsReadOnly; //!< set when data decoded from a file
|
||||
uint32_t m_Version; //!< object revision identification
|
||||
uint32_t m_CounterSize; //!< number of objects in m_Counter
|
||||
|
||||
uint64_t m_Counter[eSstCountEnumSize];
|
||||
|
||||
public:
|
||||
// constructors / destructor
|
||||
SstCounters();
|
||||
|
||||
// Put data into disk form
|
||||
void EncodeTo(std::string & Dst) const;
|
||||
|
||||
// Populate member data from prior EncodeTo block
|
||||
Status DecodeFrom(const Slice& src);
|
||||
|
||||
// increment the counter
|
||||
uint64_t Inc(unsigned Index);
|
||||
|
||||
// add value to the counter
|
||||
uint64_t Add(unsigned Index, uint64_t Amount);
|
||||
|
||||
// return value of a counter
|
||||
uint64_t Value(unsigned Index) const;
|
||||
|
||||
// set a value
|
||||
void Set(unsigned Index, uint64_t);
|
||||
|
||||
// return number of counters
|
||||
uint32_t Size() const {return(m_CounterSize);};
|
||||
|
||||
// printf all values
|
||||
void Dump() const;
|
||||
|
||||
}; // class SstCounters
|
||||
|
||||
|
||||
extern struct PerformanceCounters * gPerfCounters;
|
||||
|
||||
|
||||
enum PerformanceCountersEnum
|
||||
{
|
||||
//
|
||||
// array index values/names
|
||||
// (enum explicitly numbered to allow future edits / moves / inserts)
|
||||
//
|
||||
ePerfROFileOpen=0, //!< PosixMmapReadableFile open
|
||||
ePerfROFileClose=1, //!< closed
|
||||
ePerfROFileUnmap=2, //!< unmap without close
|
||||
|
||||
ePerfRWFileOpen=3, //!< PosixMmapFile open
|
||||
ePerfRWFileClose=4, //!< closed
|
||||
ePerfRWFileUnmap=5, //!< unmap without close
|
||||
|
||||
ePerfApiOpen=6, //!< Count of DB::Open completions
|
||||
ePerfApiGet=7, //!< Count of DBImpl::Get completions
|
||||
ePerfApiWrite=8, //!< Count of DBImpl::Get completions
|
||||
|
||||
ePerfWriteSleep=9, //!< DBImpl::MakeRoomForWrite called sleep
|
||||
ePerfWriteWaitImm=10, //!< DBImpl::MakeRoomForWrite called Wait on Imm compact
|
||||
ePerfWriteWaitLevel0=11,//!< DBImpl::MakeRoomForWrite called Wait on Level0 compact
|
||||
ePerfWriteNewMem=12, //!< DBImpl::MakeRoomForWrite created new memory log
|
||||
ePerfWriteError=13, //!< DBImpl::MakeRoomForWrite saw bg_error_
|
||||
ePerfWriteNoWait=14, //!< DBImpl::MakeRoomForWrite took no action
|
||||
|
||||
ePerfGetMem=15, //!< DBImpl::Get read from memory log
|
||||
ePerfGetImm=16, //!< DBImpl::Get read from previous memory log
|
||||
ePerfGetVersion=17, //!< DBImpl::Get read from Version object
|
||||
|
||||
// code ASSUMES the levels are in numerical order,
|
||||
// i.e. based off of ePerfSearchLevel0
|
||||
ePerfSearchLevel0=18, //!< Version::Get read searched one or more files here
|
||||
ePerfSearchLevel1=19, //!< Version::Get read searched one or more files here
|
||||
ePerfSearchLevel2=20, //!< Version::Get read searched one or more files here
|
||||
ePerfSearchLevel3=21, //!< Version::Get read searched one or more files here
|
||||
ePerfSearchLevel4=22, //!< Version::Get read searched one or more files here
|
||||
ePerfSearchLevel5=23, //!< Version::Get read searched one or more files here
|
||||
ePerfSearchLevel6=24, //!< Version::Get read searched one or more files here
|
||||
|
||||
ePerfTableCached=25, //!< TableCache::FindTable found table in cache
|
||||
ePerfTableOpened=26, //!< TableCache::FindTable had to open table file
|
||||
ePerfTableGet=27, //!< TableCache::Get used to retrieve a key
|
||||
|
||||
ePerfBGCloseUnmap=28, //!< PosixEnv::BGThreaed started Unmap/Close job
|
||||
ePerfBGCompactImm=29, //!< PosixEnv::BGThreaed started compaction of Imm
|
||||
ePerfBGNormal=30, //!< PosixEnv::BGThreaed started normal compaction job
|
||||
ePerfBGCompactLevel0=31,//!< PosixEnv::BGThreaed started compaction of Level0
|
||||
|
||||
ePerfBlockFiltered=32, //!< Table::BlockReader search stopped due to filter
|
||||
ePerfBlockFilterFalse=33,//!< Table::BlockReader gave a false positive for match
|
||||
ePerfBlockCached=34, //!< Table::BlockReader found block in cache
|
||||
ePerfBlockRead=35, //!< Table::BlockReader read block from disk
|
||||
ePerfBlockFilterRead=36,//!< Table::ReadMeta filter loaded from file
|
||||
ePerfBlockValidGet=37, //!< Table::InternalGet has valid iterator
|
||||
|
||||
ePerfDebug0=38, //!< Developer debug counters, moveable
|
||||
ePerfDebug1=39, //!< Developer debug counters, moveable
|
||||
ePerfDebug2=40, //!< Developer debug counters, moveable
|
||||
ePerfDebug3=41, //!< Developer debug counters, moveable
|
||||
ePerfDebug4=42, //!< Developer debug counters, moveable
|
||||
|
||||
ePerfReadBlockError=43, //!< crc or compression error in ReadBlock (format.cc)
|
||||
|
||||
ePerfIterNew=44, //!< Count of DBImpl::NewDBIterator calls
|
||||
ePerfIterNext=45, //!< Count of DBIter::Next calls
|
||||
ePerfIterPrev=46, //!< Count of DBIter::Prev calls
|
||||
ePerfIterSeek=47, //!< Count of DBIter::Seek calls
|
||||
ePerfIterSeekFirst=48, //!< Count of DBIter::SeekFirst calls
|
||||
ePerfIterSeekLast=49, //!< Count of DBIter::SeekLast calls
|
||||
ePerfIterDelete=50, //!< Count of DBIter::~DBIter
|
||||
|
||||
ePerfElevelDirect=51, //!< eleveldb's FindWaitingThread went direct to thread
|
||||
ePerfElevelQueued=52, //!< eleveldb's FindWaitingThread queued work item
|
||||
ePerfElevelDequeued=53, //!< eleveldb's worker took item from backlog queue
|
||||
|
||||
ePerfElevelRefCreate=54,//!< eleveldb RefObject constructed
|
||||
ePerfElevelRefDelete=55,//!< eleveldb RefObject destructed
|
||||
|
||||
ePerfThrottleGauge=56, //!< current throttle value
|
||||
ePerfThrottleCounter=57,//!< running throttle by seconds
|
||||
|
||||
ePerfThrottleMicros0=58,//!< level 0 micros spent compacting
|
||||
ePerfThrottleKeys0=59, //!< level 0 keys processed
|
||||
ePerfThrottleBacklog0=60,//!< backlog at time of posting (level0)
|
||||
ePerfThrottleCompacts0=61,//!< number of level 0 compactions
|
||||
|
||||
ePerfThrottleMicros1=62,//!< level 1+ micros spent compacting
|
||||
ePerfThrottleKeys1=63, //!< level 1+ keys processed
|
||||
ePerfThrottleBacklog1=64,//!< backlog at time of posting (level1+)
|
||||
ePerfThrottleCompacts1=65,//!< number of level 1+ compactions
|
||||
|
||||
ePerfBGWriteError=66, //!< error in write/close, see syslog
|
||||
|
||||
ePerfThrottleWait=67, //!< milliseconds of throttle wait
|
||||
ePerfThreadError=68, //!< system error on thread related call, no LOG access
|
||||
|
||||
ePerfBGImmDirect=69, //!< count Imm compactions happened directly
|
||||
ePerfBGImmQueued=70, //!< count Imm compactions placed on queue
|
||||
ePerfBGImmDequeued=71, //!< count Imm compactions removed from queue
|
||||
ePerfBGImmWeighted=72, //!< total microseconds item spent on queue
|
||||
|
||||
ePerfBGUnmapDirect=73, //!< count Unmap operations happened directly
|
||||
ePerfBGUnmapQueued=74, //!< count Unmap operations placed on queue
|
||||
ePerfBGUnmapDequeued=75,//!< count Unmap operations removed from queue
|
||||
ePerfBGUnmapWeighted=76,//!< total microseconds item spent on queue
|
||||
|
||||
ePerfBGLevel0Direct=77, //!< count Level0 compactions happened directly
|
||||
ePerfBGLevel0Queued=78, //!< count Level0 compactions placed on queue
|
||||
ePerfBGLevel0Dequeued=79,//!< count Level0 compactions removed from queue
|
||||
ePerfBGLevel0Weighted=80,//!< total microseconds item spent on queue
|
||||
|
||||
ePerfBGCompactDirect=81, //!< count generic compactions happened directly
|
||||
ePerfBGCompactQueued=82, //!< count generic compactions placed on queue
|
||||
ePerfBGCompactDequeued=83,//!< count generic compactions removed from queue
|
||||
ePerfBGCompactWeighted=84,//!< total microseconds item spent on queue
|
||||
|
||||
ePerfFileCacheInsert=85, //!< total bytes inserted into file cache
|
||||
ePerfFileCacheRemove=86, //!< total bytes removed from file cache
|
||||
|
||||
ePerfBlockCacheInsert=87, //!< total bytes inserted into block cache
|
||||
ePerfBlockCacheRemove=88, //!< total bytes removed from block cache
|
||||
|
||||
ePerfApiDelete=89, //!< Count of DB::Delete
|
||||
|
||||
ePerfBGMove=90, //!< compaction was a successful move
|
||||
ePerfBGMoveFail=91, //!< compaction move failed, regular compaction attempted
|
||||
|
||||
ePerfThrottleUnadjusted=92,//!< current unadjusted throttle gauge
|
||||
|
||||
// this one was added to the other ePerfElevelXxx counters above when we backported HotThreadPool to eleveldb
|
||||
ePerfElevelWeighted=93, //!< total microseconds item spent on queue
|
||||
|
||||
ePerfExpiredKeys=94, //!< key physically removed because it expired
|
||||
ePerfExpiredFiles=95, //!< entire file removed because all keys expired
|
||||
|
||||
ePerfSyslogWrite=96, //!< logged message to syslog
|
||||
ePerfBackupStarted=97, //!< hot backup initiated
|
||||
ePerfBackupError=98, //!< hot backup had an error
|
||||
|
||||
ePerfPropCacheHit=99, //!< property cache had data
|
||||
ePerfPropCacheMiss=100, //!< property cache had to look up data
|
||||
ePerfPropCacheError=101, //!< no property cache entry built/located
|
||||
|
||||
// must follow last index name to represent size of array
|
||||
// (ASSUMES previous enum is highest value)
|
||||
ePerfCountEnumSize, //!< size of the array described by the enum values
|
||||
|
||||
ePerfVersion=1, //!< structure versioning
|
||||
ePerfKey=41207 //!< random number as shared memory identifier
|
||||
};
|
||||
|
||||
|
||||
struct PerfCounterAttributes
|
||||
{
|
||||
const char * m_PerfCounterName; //!< text description
|
||||
const bool m_PerfDiscretionary; //!< true if ok to disable
|
||||
}; // PerfCounterAttributes
|
||||
|
||||
|
||||
//
|
||||
// Do NOT use virtual functions. This structure will be aligned at different
|
||||
// locations in multiple processes. Things can get messy with virtuals.
|
||||
|
||||
struct PerformanceCounters
|
||||
{
|
||||
public:
|
||||
static int m_LastError;
|
||||
|
||||
protected:
|
||||
uint32_t m_Version; //!< object revision identification
|
||||
uint32_t m_CounterSize; //!< number of objects in m_Counter
|
||||
|
||||
volatile uint64_t m_Counter[ePerfCountEnumSize];
|
||||
|
||||
static const PerfCounterAttributes m_PerfCounterAttr[];
|
||||
static int m_PerfSharedId;
|
||||
static volatile uint64_t m_BogusCounter; //!< for out of range GetPtr calls
|
||||
|
||||
public:
|
||||
// only called for local object, not for shared memory
|
||||
PerformanceCounters();
|
||||
|
||||
//!< does executable's idea of version match shared object?
|
||||
bool VersionTest()
|
||||
{return(ePerfCountEnumSize<=m_CounterSize && ePerfVersion==m_Version);};
|
||||
|
||||
//!< mostly for perf_count_test.cc
|
||||
void SetVersion(uint32_t Version, uint32_t CounterSize)
|
||||
{m_Version=Version; m_CounterSize=CounterSize;};
|
||||
|
||||
static PerformanceCounters * Init(bool IsReadOnly);
|
||||
static int Close(PerformanceCounters * Counts);
|
||||
|
||||
uint64_t Inc(unsigned Index);
|
||||
uint64_t Dec(unsigned Index);
|
||||
|
||||
// add value to the counter
|
||||
uint64_t Add(unsigned Index, uint64_t Amount);
|
||||
|
||||
// return value of a counter
|
||||
uint64_t Value(unsigned Index) const;
|
||||
|
||||
// set a value
|
||||
void Set(unsigned Index, uint64_t);
|
||||
|
||||
volatile const uint64_t * GetPtr(unsigned Index) const;
|
||||
|
||||
static const char * GetNamePtr(unsigned Index);
|
||||
|
||||
int LookupCounter(const char * Name);
|
||||
|
||||
void Dump();
|
||||
|
||||
}; // struct PerformanceCounters
|
||||
|
||||
extern PerformanceCounters * gPerfCounters;
|
||||
|
||||
extern volatile bool gPerfCountersDisabled;
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
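
A minimal sketch of reading the shared performance counters declared above through the global gPerfCounters pointer. The choice of counters and the printf formatting are illustrative.

    #include <inttypes.h>
    #include <stdio.h>
    #include "leveldb/perf_count.h"

    void DumpSelectedCounters() {
        const leveldb::PerformanceCounters* perf = leveldb::gPerfCounters;

        uint64_t gets = perf->Value(leveldb::ePerfApiGet);
        uint64_t writes = perf->Value(leveldb::ePerfApiWrite);
        uint64_t throttle = perf->Value(leveldb::ePerfThrottleGauge);

        printf("%s=%" PRIu64 "\n", leveldb::PerformanceCounters::GetNamePtr(leveldb::ePerfApiGet), gets);
        printf("%s=%" PRIu64 "\n", leveldb::PerformanceCounters::GetNamePtr(leveldb::ePerfApiWrite), writes);
        printf("%s=%" PRIu64 "\n", leveldb::PerformanceCounters::GetNamePtr(leveldb::ePerfThrottleGauge), throttle);
    }
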
|
|
@ -94,7 +94,7 @@ inline bool operator!=(const Slice& x, const Slice& y) {
|
|||
}
|
||||
|
||||
inline int Slice::compare(const Slice& b) const {
|
||||
const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
|
||||
const int min_len = (size_ < b.size_) ? size_ : b.size_;
|
||||
int r = memcmp(data_, b.data_, min_len);
|
||||
if (r == 0) {
|
||||
if (size_ < b.size_) r = -1;
|
||||
|
|
|
@ -60,12 +60,6 @@ class Status {
|
|||
// Returns true iff the status indicates an IOError.
|
||||
bool IsIOError() const { return code() == kIOError; }
|
||||
|
||||
// Returns true iff the status indicates a NotSupportedError.
|
||||
bool IsNotSupportedError() const { return code() == kNotSupported; }
|
||||
|
||||
// Returns true iff the status indicates an InvalidArgument.
|
||||
bool IsInvalidArgument() const { return code() == kInvalidArgument; }
|
||||
|
||||
// Return a string representation of this status suitable for printing.
|
||||
// Returns the string "OK" for success.
|
||||
std::string ToString() const;
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
#include <stdint.h>
|
||||
#include "leveldb/iterator.h"
|
||||
#include "leveldb/perf_count.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
|
@ -40,7 +41,7 @@ class Table {
|
|||
uint64_t file_size,
|
||||
Table** table);
|
||||
|
||||
~Table();
|
||||
virtual ~Table();
|
||||
|
||||
// Returns a new iterator over the table contents.
|
||||
// The result of NewIterator() is initially invalid (caller must
|
||||
|
@ -55,7 +56,29 @@ class Table {
|
|||
// be close to the file length.
|
||||
uint64_t ApproximateOffsetOf(const Slice& key) const;
|
||||
|
||||
private:
|
||||
// return a static copy of the table's counters.
|
||||
SstCounters GetSstCounters() const;
|
||||
|
||||
// riak routine to retrieve total memory footprint of an open table
|
||||
// object in memory
|
||||
size_t TableObjectSize();
|
||||
|
||||
// riak routine to retrieve disk size of table file
|
||||
// ("virtual" is for unit test activites)
|
||||
virtual uint64_t GetFileSize();
|
||||
|
||||
// Riak routine to request bloom filter load on
|
||||
// second read operation (not iterator read)
|
||||
bool ReadFilter();
|
||||
|
||||
// access routines for testing tools, not for public use
|
||||
Block * TEST_GetIndexBlock();
|
||||
size_t TEST_TableObjectSize() {return(TableObjectSize());};
|
||||
size_t TEST_FilterDataSize();
|
||||
static Iterator* TEST_BlockReader(void* Ptr, const ReadOptions& ROptions, const Slice& SliceReturn)
|
||||
{return(BlockReader(Ptr, ROptions, SliceReturn));};
|
||||
|
||||
protected: // was private, made protected for unit tests
|
||||
struct Rep;
|
||||
Rep* rep_;
|
||||
|
||||
|
@ -69,11 +92,12 @@ class Table {
Status InternalGet(
const ReadOptions&, const Slice& key,
void* arg,
void (*handle_result)(void* arg, const Slice& k, const Slice& v));
bool (*handle_result)(void* arg, const Slice& k, const Slice& v));


void ReadMeta(const Footer& footer);
void ReadFilter(const Slice& filter_handle_value);
void ReadFilter(class BlockHandle & filter_handle_value, const class FilterPolicy * policy);
void ReadSstCounters(const Slice& sst_counters_handle_value);

// No copying allowed
Table(const Table&);
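The table.h hunks above add riak-oriented accessors alongside the existing ApproximateOffsetOf(). A sketch of opening one .sst file directly, assuming Table::Open keeps its usual (options, file, file_size, table) signature and that GetFileSize() ends up publicly callable as the hunk suggests; the file path is a placeholder:

    // Sketch only, not code from this diff.
    #include <cstdio>
    #include <string>
    #include "leveldb/env.h"
    #include "leveldb/options.h"
    #include "leveldb/table.h"

    int main() {
      const std::string path = "/tmp/example.sst";     // placeholder path
      leveldb::Env* env = leveldb::Env::Default();

      uint64_t size = 0;
      leveldb::RandomAccessFile* file = NULL;
      if (!env->GetFileSize(path, &size).ok()) return 1;
      if (!env->NewRandomAccessFile(path, &file).ok()) return 1;

      leveldb::Table* table = NULL;
      leveldb::Status s =
          leveldb::Table::Open(leveldb::Options(), file, size, &table);
      if (s.ok()) {
        std::printf("offset of 'k': %llu, file size: %llu\n",
                    (unsigned long long)table->ApproximateOffsetOf("k"),
                    (unsigned long long)table->GetFileSize());
        delete table;
      }
      delete file;
      return 0;
    }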
@ -74,6 +74,14 @@ class TableBuilder {
// Finish() call, returns the size of the final generated file.
uint64_t FileSize() const;

// Number of delete tombstones so far.
uint64_t NumDeletes() const;

// Retrieve expiry control values
uint64_t GetExpiryWriteLow() const;
uint64_t GetExpiryWriteHigh() const;
uint64_t GetExpiryExplicitHigh() const;

private:
bool ok() const { return status().ok(); }
void WriteBlock(BlockBuilder* block, BlockHandle* handle);
@ -23,6 +23,7 @@

#include <string>
#include "leveldb/status.h"
#include "leveldb/options.h"

namespace leveldb {

@ -34,7 +35,7 @@ class WriteBatch {
~WriteBatch();

// Store the mapping "key->value" in the database.
void Put(const Slice& key, const Slice& value);
void Put(const Slice& key, const Slice& value, const KeyMetaData * meta=NULL);

// If the database contains a mapping for "key", erase it. Else do nothing.
void Delete(const Slice& key);
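Because the new meta argument defaults to NULL, existing two-argument Put() call sites compile unchanged. A minimal sketch, not from the diff, with a placeholder database path:

    #include "leveldb/db.h"
    #include "leveldb/write_batch.h"

    int main() {
      leveldb::DB* db = NULL;
      leveldb::Options options;
      options.create_if_missing = true;
      if (!leveldb::DB::Open(options, "/tmp/wb_example", &db).ok()) return 1;

      leveldb::WriteBatch batch;
      batch.Put("k1", "v1");     // meta defaults to NULL, old call shape still works
      batch.Delete("k0");
      leveldb::Status s = db->Write(leveldb::WriteOptions(), &batch);

      delete db;
      return s.ok() ? 0 : 1;
    }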
@ -46,7 +47,8 @@ class WriteBatch {
class Handler {
public:
virtual ~Handler();
virtual void Put(const Slice& key, const Slice& value) = 0;
virtual void Put(const Slice& key, const Slice& value,
const ValueType & type, const ExpiryTimeMicros & expiry) = 0;
virtual void Delete(const Slice& key) = 0;
};
Status Iterate(Handler* handler) const;
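Any concrete Handler now has to implement the four-argument Put(). A sketch of a counting handler, assuming ValueType and ExpiryTimeMicros live in the leveldb namespace and reach this header through the newly added leveldb/options.h include:

    // Sketch only, not code from this diff.
    #include <cstdio>
    #include "leveldb/slice.h"
    #include "leveldb/write_batch.h"

    class CountingHandler : public leveldb::WriteBatch::Handler {
     public:
      CountingHandler() : puts(0), deletes(0) {}
      virtual void Put(const leveldb::Slice& key, const leveldb::Slice& value,
                       const leveldb::ValueType& type,
                       const leveldb::ExpiryTimeMicros& expiry) { ++puts; }
      virtual void Delete(const leveldb::Slice& key) { ++deletes; }
      int puts, deletes;
    };

    int main() {
      leveldb::WriteBatch batch;
      batch.Put("k1", "v1");
      batch.Delete("k2");

      CountingHandler handler;
      leveldb::Status s = batch.Iterate(&handler);
      std::printf("ok=%d puts=%d deletes=%d\n",
                  (int)s.ok(), handler.puts, handler.deletes);
      return 0;
    }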
@ -1,92 +0,0 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

// Test for issue 178: a manual compaction causes deleted data to reappear.
#include <iostream>
#include <sstream>
#include <cstdlib>

#include "leveldb/db.h"
#include "leveldb/write_batch.h"
#include "util/testharness.h"

namespace {

const int kNumKeys = 1100000;

std::string Key1(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "my_key_%d", i);
return buf;
}

std::string Key2(int i) {
return Key1(i) + "_xxx";
}

class Issue178 { };

TEST(Issue178, Test) {
// Get rid of any state from an old run.
std::string dbpath = leveldb::test::TmpDir() + "/leveldb_cbug_test";
DestroyDB(dbpath, leveldb::Options());

// Open database. Disable compression since it affects the creation
// of layers and the code below is trying to test against a very
// specific scenario.
leveldb::DB* db;
leveldb::Options db_options;
db_options.create_if_missing = true;
db_options.compression = leveldb::kNoCompression;
ASSERT_OK(leveldb::DB::Open(db_options, dbpath, &db));

// create first key range
leveldb::WriteBatch batch;
for (size_t i = 0; i < kNumKeys; i++) {
batch.Put(Key1(i), "value for range 1 key");
}
ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));

// create second key range
batch.Clear();
for (size_t i = 0; i < kNumKeys; i++) {
batch.Put(Key2(i), "value for range 2 key");
}
ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));

// delete second key range
batch.Clear();
for (size_t i = 0; i < kNumKeys; i++) {
batch.Delete(Key2(i));
}
ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));

// compact database
std::string start_key = Key1(0);
std::string end_key = Key1(kNumKeys - 1);
leveldb::Slice least(start_key.data(), start_key.size());
leveldb::Slice greatest(end_key.data(), end_key.size());

// commenting out the line below causes the example to work correctly
db->CompactRange(&least, &greatest);

// count the keys
leveldb::Iterator* iter = db->NewIterator(leveldb::ReadOptions());
size_t num_keys = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
num_keys++;
}
delete iter;
ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";

// close database
delete db;
DestroyDB(dbpath, leveldb::Options());
}

} // anonymous namespace

int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}
@ -1,59 +0,0 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

// Test for issue 200: when iterator switches direction from backward
// to forward, the current key can be yielded unexpectedly if a new
// mutation has been added just before the current key.

#include "leveldb/db.h"
#include "util/testharness.h"

namespace leveldb {

class Issue200 { };

TEST(Issue200, Test) {
// Get rid of any state from an old run.
std::string dbpath = test::TmpDir() + "/leveldb_issue200_test";
DestroyDB(dbpath, Options());

DB *db;
Options options;
options.create_if_missing = true;
ASSERT_OK(DB::Open(options, dbpath, &db));

WriteOptions write_options;
ASSERT_OK(db->Put(write_options, "1", "b"));
ASSERT_OK(db->Put(write_options, "2", "c"));
ASSERT_OK(db->Put(write_options, "3", "d"));
ASSERT_OK(db->Put(write_options, "4", "e"));
ASSERT_OK(db->Put(write_options, "5", "f"));

ReadOptions read_options;
Iterator *iter = db->NewIterator(read_options);

// Add an element that should not be reflected in the iterator.
ASSERT_OK(db->Put(write_options, "25", "cd"));

iter->Seek("5");
ASSERT_EQ(iter->key().ToString(), "5");
iter->Prev();
ASSERT_EQ(iter->key().ToString(), "4");
iter->Prev();
ASSERT_EQ(iter->key().ToString(), "3");
iter->Next();
ASSERT_EQ(iter->key().ToString(), "4");
iter->Next();
ASSERT_EQ(iter->key().ToString(), "5");

delete iter;
delete db;
DestroyDB(dbpath, options);
}

} // namespace leveldb

int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}
Some files were not shown because too many files have changed in this diff.