Compare commits
4 commits
master
...
experiment
Author | SHA1 | Date | |
---|---|---|---|
|
1069eb65b5 | ||
|
83319b7f31 | ||
|
6b8935718e | ||
|
05d89e91cf |
209 changed files with 23549 additions and 8105 deletions
|
@ -475,7 +475,6 @@ lbrycrdd_LDADD = \
|
||||||
$(LIBBITCOIN_CONSENSUS) \
|
$(LIBBITCOIN_CONSENSUS) \
|
||||||
$(LIBBITCOIN_CRYPTO) \
|
$(LIBBITCOIN_CRYPTO) \
|
||||||
$(LIBLEVELDB) \
|
$(LIBLEVELDB) \
|
||||||
$(LIBLEVELDB_SSE42) \
|
|
||||||
$(LIBMEMENV) \
|
$(LIBMEMENV) \
|
||||||
$(LIBSECP256K1)
|
$(LIBSECP256K1)
|
||||||
|
|
||||||
|
@ -573,7 +572,7 @@ $(top_srcdir)/$(subdir)/config/bitcoin-config.h.in: $(am__configure_deps)
|
||||||
clean-local:
|
clean-local:
|
||||||
-$(MAKE) -C secp256k1 clean
|
-$(MAKE) -C secp256k1 clean
|
||||||
-$(MAKE) -C univalue clean
|
-$(MAKE) -C univalue clean
|
||||||
-rm -f leveldb/*/*.gcda leveldb/*/*.gcno leveldb/helpers/memenv/*.gcda leveldb/helpers/memenv/*.gcno
|
-$(MAKE) -C leveldb clean
|
||||||
-rm -f config.h
|
-rm -f config.h
|
||||||
-rm -rf test/__pycache__
|
-rm -rf test/__pycache__
|
||||||
|
|
||||||
|
|
|
@ -42,7 +42,6 @@ bench_bench_bitcoin_LDADD = \
|
||||||
$(LIBBITCOIN_CONSENSUS) \
|
$(LIBBITCOIN_CONSENSUS) \
|
||||||
$(LIBBITCOIN_CRYPTO) \
|
$(LIBBITCOIN_CRYPTO) \
|
||||||
$(LIBLEVELDB) \
|
$(LIBLEVELDB) \
|
||||||
$(LIBLEVELDB_SSE42) \
|
|
||||||
$(LIBMEMENV) \
|
$(LIBMEMENV) \
|
||||||
$(LIBSECP256K1) \
|
$(LIBSECP256K1) \
|
||||||
$(LIBUNIVALUE)
|
$(LIBUNIVALUE)
|
||||||
|
|
|
@ -2,148 +2,23 @@
|
||||||
# Distributed under the MIT software license, see the accompanying
|
# Distributed under the MIT software license, see the accompanying
|
||||||
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
||||||
|
|
||||||
|
SUBDIRS = leveldb
|
||||||
|
|
||||||
LIBLEVELDB_INT = leveldb/libleveldb.a
|
LIBLEVELDB_INT = leveldb/libleveldb.a
|
||||||
LIBMEMENV_INT = leveldb/libmemenv.a
|
LIBMEMENV_INT = leveldb/libmemenv.a
|
||||||
LIBLEVELDB_SSE42_INT = leveldb/libleveldb_sse42.a
|
|
||||||
|
|
||||||
EXTRA_LIBRARIES += $(LIBLEVELDB_INT)
|
EXTRA_LIBRARIES += $(LIBLEVELDB_INT)
|
||||||
EXTRA_LIBRARIES += $(LIBMEMENV_INT)
|
EXTRA_LIBRARIES += $(LIBMEMENV_INT)
|
||||||
EXTRA_LIBRARIES += $(LIBLEVELDB_SSE42_INT)
|
|
||||||
|
|
||||||
LIBLEVELDB += $(LIBLEVELDB_INT)
|
LIBLEVELDB += $(LIBLEVELDB_INT)
|
||||||
LIBMEMENV += $(LIBMEMENV_INT)
|
LIBMEMENV += $(LIBMEMENV_INT)
|
||||||
LIBLEVELDB_SSE42 = $(LIBLEVELDB_SSE42_INT)
|
|
||||||
|
|
||||||
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/include
|
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/include
|
||||||
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/helpers/memenv
|
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/helpers/memenv
|
||||||
|
LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb
|
||||||
|
|
||||||
LEVELDB_CPPFLAGS_INT =
|
leveldb/libleveldb.a:
|
||||||
LEVELDB_CPPFLAGS_INT += -I$(srcdir)/leveldb
|
$(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb
|
||||||
LEVELDB_CPPFLAGS_INT += $(LEVELDB_TARGET_FLAGS)
|
|
||||||
LEVELDB_CPPFLAGS_INT += -DLEVELDB_ATOMIC_PRESENT
|
|
||||||
LEVELDB_CPPFLAGS_INT += -D__STDC_LIMIT_MACROS
|
|
||||||
|
|
||||||
if TARGET_WINDOWS
|
leveldb/libmemenv.a: leveldb/libleveldb.a
|
||||||
LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1
|
$(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C leveldb memenv_test
|
||||||
else
|
|
||||||
LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_POSIX
|
|
||||||
endif
|
|
||||||
|
|
||||||
leveldb_libleveldb_a_CPPFLAGS = $(AM_CPPFLAGS) $(LEVELDB_CPPFLAGS_INT) $(LEVELDB_CPPFLAGS)
|
|
||||||
leveldb_libleveldb_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
|
|
||||||
|
|
||||||
leveldb_libleveldb_a_SOURCES=
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/port/atomic_pointer.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port_example.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/port/win/stdint.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/port/thread_annotations.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/db.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/options.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/comparator.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/filter_policy.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/slice.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table_builder.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/env.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/c.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/iterator.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/cache.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/dumpfile.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/write_batch.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/status.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/log_format.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch_internal.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/filename.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/builder.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/skiplist.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/snapshot.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/block.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/merger.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/format.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/iterator_wrapper.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix_test_helper.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/arena.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/random.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/posix_logger.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/hash.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/coding.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/testutil.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/mutexlock.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/logging.h
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/testharness.h
|
|
||||||
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/builder.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/c.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/dumpfile.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/filename.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/repair.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/block.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/format.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/iterator.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/merger.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/table_builder.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/table.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/arena.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/bloom.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/cache.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/coding.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/comparator.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/env.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/filter_policy.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/hash.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/logging.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/options.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/status.cc
|
|
||||||
|
|
||||||
if TARGET_WINDOWS
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/util/env_win.cc
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.cc
|
|
||||||
else
|
|
||||||
leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.cc
|
|
||||||
endif
|
|
||||||
|
|
||||||
leveldb_libmemenv_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS)
|
|
||||||
leveldb_libmemenv_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS)
|
|
||||||
leveldb_libmemenv_a_SOURCES = leveldb/helpers/memenv/memenv.cc
|
|
||||||
leveldb_libmemenv_a_SOURCES += leveldb/helpers/memenv/memenv.h
|
|
||||||
|
|
||||||
leveldb_libleveldb_sse42_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS)
|
|
||||||
leveldb_libleveldb_sse42_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS)
|
|
||||||
if ENABLE_HWCRC32
|
|
||||||
leveldb_libleveldb_sse42_a_CPPFLAGS += -DLEVELDB_PLATFORM_POSIX_SSE
|
|
||||||
leveldb_libleveldb_sse42_a_CXXFLAGS += $(SSE42_CXXFLAGS)
|
|
||||||
endif
|
|
||||||
leveldb_libleveldb_sse42_a_SOURCES = leveldb/port/port_posix_sse.cc
|
|
||||||
|
|
|
@ -408,7 +408,7 @@ endif
|
||||||
if ENABLE_ZMQ
|
if ENABLE_ZMQ
|
||||||
qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
|
qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
|
||||||
endif
|
endif
|
||||||
qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \
|
qt_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBMEMENV) \
|
||||||
$(BOOST_LIBS) $(QT_LIBS) $(QT_DBUS_LIBS) $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
|
$(BOOST_LIBS) $(QT_LIBS) $(QT_DBUS_LIBS) $(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
|
||||||
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
|
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
|
||||||
qt_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)
|
qt_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)
|
||||||
|
|
|
@ -63,7 +63,7 @@ if ENABLE_ZMQ
|
||||||
qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
|
qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
|
||||||
endif
|
endif
|
||||||
qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) \
|
qt_test_test_lbrycrd_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) \
|
||||||
$(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
|
$(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
|
||||||
$(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
|
$(QR_LIBS) $(PROTOBUF_LIBS) $(ICU_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
|
||||||
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
|
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
|
||||||
qt_test_test_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)
|
qt_test_test_lbrycrd_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)
|
||||||
|
|
|
@ -122,7 +122,7 @@ test_test_lbrycrd_LDADD += $(LIBBITCOIN_WALLET)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
test_test_lbrycrd_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \
|
test_test_lbrycrd_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \
|
||||||
$(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
|
$(LIBLEVELDB) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
|
||||||
test_test_lbrycrd_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
|
test_test_lbrycrd_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
|
||||||
|
|
||||||
test_test_lbrycrd_LDADD += $(LIBBITCOIN_CONSENSUS) $(BDB_LIBS) $(CRYPTO_LIBS) $(ICU_LIBS) $(MINIUPNPC_LIBS)
|
test_test_lbrycrd_LDADD += $(LIBBITCOIN_CONSENSUS) $(BDB_LIBS) $(CRYPTO_LIBS) $(ICU_LIBS) $(MINIUPNPC_LIBS)
|
||||||
|
|
|
@ -8,8 +8,6 @@
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#include <boost/scoped_ptr.hpp>
|
|
||||||
|
|
||||||
static const uint256 one = uint256S("0000000000000000000000000000000000000000000000000000000000000001");
|
static const uint256 one = uint256S("0000000000000000000000000000000000000000000000000000000000000001");
|
||||||
|
|
||||||
std::vector<unsigned char> heightToVch(int n)
|
std::vector<unsigned char> heightToVch(int n)
|
||||||
|
@ -123,13 +121,13 @@ void CClaimTrieData::reorderClaims(const supportEntryType& supports)
|
||||||
claim.nEffectiveAmount += support.nAmount;
|
claim.nEffectiveAmount += support.nAmount;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::make_heap(claims.begin(), claims.end());
|
std::sort(claims.rbegin(), claims.rend());
|
||||||
}
|
}
|
||||||
|
|
||||||
CClaimTrie::CClaimTrie(bool fMemory, bool fWipe, int proportionalDelayFactor)
|
CClaimTrie::CClaimTrie(bool fMemory, bool fWipe, int proportionalDelayFactor)
|
||||||
{
|
{
|
||||||
nProportionalDelayFactor = proportionalDelayFactor;
|
nProportionalDelayFactor = proportionalDelayFactor;
|
||||||
db.reset(new CDBWrapper(GetDataDir() / "claimtrie", 100 * 1024 * 1024, fMemory, fWipe, false));
|
db.reset(new CDBWrapper(GetDataDir() / "claimtrie", 200 * 1024 * 1024, fMemory, fWipe, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CClaimTrie::SyncToDisk()
|
bool CClaimTrie::SyncToDisk()
|
||||||
|
@ -200,7 +198,7 @@ typename queueNameType::value_type* CClaimTrieCacheBase::getQueueCacheNameRow(co
|
||||||
template <>
|
template <>
|
||||||
typename expirationQueueType::value_type* CClaimTrieCacheBase::getExpirationQueueCacheRow<CClaimValue>(int nHeight, bool createIfNotExists)
|
typename expirationQueueType::value_type* CClaimTrieCacheBase::getExpirationQueueCacheRow<CClaimValue>(int nHeight, bool createIfNotExists)
|
||||||
{
|
{
|
||||||
return getQueue(*(base->db), EXP_QUEUE_ROW, nHeight, expirationQueueCache, createIfNotExists);
|
return getQueue(*(base->db), CLAIM_EXP_QUEUE_ROW, nHeight, expirationQueueCache, createIfNotExists);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
|
@ -218,8 +216,14 @@ typename expirationQueueType::value_type* CClaimTrieCacheBase::getExpirationQueu
|
||||||
|
|
||||||
bool CClaimTrieCacheBase::haveClaim(const std::string& name, const COutPoint& outPoint) const
|
bool CClaimTrieCacheBase::haveClaim(const std::string& name, const COutPoint& outPoint) const
|
||||||
{
|
{
|
||||||
auto it = find(name);
|
auto it = nodesToAddOrUpdate.find(name);
|
||||||
return it && it->haveClaim(outPoint);
|
if (it && it->haveClaim(outPoint))
|
||||||
|
return true;
|
||||||
|
if (it || nodesToDelete.count(name))
|
||||||
|
return false;
|
||||||
|
CClaimTrieDataNode node;
|
||||||
|
node.childrenSerialization = false;
|
||||||
|
return base->find(name, node) && node.data.haveClaim(outPoint);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CClaimTrieCacheBase::haveSupport(const std::string& name, const COutPoint& outPoint) const
|
bool CClaimTrieCacheBase::haveSupport(const std::string& name, const COutPoint& outPoint) const
|
||||||
|
@ -272,39 +276,63 @@ bool CClaimTrieCacheBase::haveSupportInQueue(const std::string& name, const COut
|
||||||
return haveInQueue<CSupportValue>(name, outPoint, nValidAtHeight);
|
return haveInQueue<CSupportValue>(name, outPoint, nValidAtHeight);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::size_t CClaimTrieCacheBase::getTotalNamesInTrie() const
|
void CClaimTrie::recurseAllHashedNodes(const std::string& name, const CClaimTrieDataNode& current, std::function<void(const std::string&, const CClaimTrieDataNode&)> function) const {
|
||||||
|
function(name, current);
|
||||||
|
for (auto& child: current.children) {
|
||||||
|
CClaimTrieDataNode node;
|
||||||
|
if (find(child.second, node))
|
||||||
|
recurseAllHashedNodes(name + child.first, node, function);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::size_t CClaimTrie::getTotalNamesInTrie() const
|
||||||
{
|
{
|
||||||
std::size_t count = 0;
|
std::size_t count = 0;
|
||||||
for (auto it = base->cbegin(); it != base->cend(); ++it)
|
CClaimTrieDataNode node;
|
||||||
if (!it->empty()) ++count;
|
if (find("", node))
|
||||||
|
recurseAllHashedNodes("", node, [&count](const std::string&, const CClaimTrieDataNode& node) {
|
||||||
|
count += !node.data.empty();
|
||||||
|
});
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::size_t CClaimTrieCacheBase::getTotalClaimsInTrie() const
|
std::size_t CClaimTrie::getTotalClaimsInTrie() const
|
||||||
{
|
{
|
||||||
std::size_t count = 0;
|
std::size_t count = 0;
|
||||||
for (auto it = base->cbegin(); it != base->cend(); ++it)
|
CClaimTrieDataNode node;
|
||||||
count += it->claims.size();
|
if (find("", node))
|
||||||
|
recurseAllHashedNodes("", node, [&count](const std::string&, const CClaimTrieDataNode& node) {
|
||||||
|
count += node.data.claims.size();
|
||||||
|
});
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
CAmount CClaimTrieCacheBase::getTotalValueOfClaimsInTrie(bool fControllingOnly) const
|
CAmount CClaimTrie::getTotalValueOfClaimsInTrie(bool fControllingOnly) const
|
||||||
{
|
{
|
||||||
CAmount value_in_subtrie = 0;
|
CAmount value_in_subtrie = 0;
|
||||||
for (auto it = base->cbegin(); it != base->cend(); ++it) {
|
std::size_t count = 0;
|
||||||
for (const auto& claim : it->claims) {
|
CClaimTrieDataNode node;
|
||||||
|
if (find("", node))
|
||||||
|
recurseAllHashedNodes("", node, [&value_in_subtrie, fControllingOnly](const std::string&, const CClaimTrieDataNode& node) {
|
||||||
|
for (const auto& claim : node.data.claims) {
|
||||||
value_in_subtrie += claim.nAmount;
|
value_in_subtrie += claim.nAmount;
|
||||||
if (fControllingOnly)
|
if (fControllingOnly)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
return value_in_subtrie;
|
return value_in_subtrie;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CClaimTrieCacheBase::getInfoForName(const std::string& name, CClaimValue& claim) const
|
bool CClaimTrieCacheBase::getInfoForName(const std::string& name, CClaimValue& claim) const
|
||||||
{
|
{
|
||||||
auto it = find(name);
|
auto it = nodesToAddOrUpdate.find(name);
|
||||||
return it && it->getBestClaim(claim);
|
if (it && it->getBestClaim(claim))
|
||||||
|
return true;
|
||||||
|
if (it || nodesToDelete.count(name))
|
||||||
|
return false;
|
||||||
|
CClaimTrieDataNode node;
|
||||||
|
node.childrenSerialization = false;
|
||||||
|
return base->find(name, node) && node.data.getBestClaim(claim);
|
||||||
}
|
}
|
||||||
|
|
||||||
CClaimsForNameType CClaimTrieCacheBase::getClaimsForName(const std::string& name) const
|
CClaimsForNameType CClaimTrieCacheBase::getClaimsForName(const std::string& name) const
|
||||||
|
@ -313,10 +341,16 @@ CClaimsForNameType CClaimTrieCacheBase::getClaimsForName(const std::string& name
|
||||||
int nLastTakeoverHeight = 0;
|
int nLastTakeoverHeight = 0;
|
||||||
auto supports = getSupportsForName(name);
|
auto supports = getSupportsForName(name);
|
||||||
|
|
||||||
if (auto it = find(name)) {
|
CClaimTrieDataNode node;
|
||||||
|
node.childrenSerialization = false;
|
||||||
|
if (auto it = nodesToAddOrUpdate.find(name)) {
|
||||||
claims = it->claims;
|
claims = it->claims;
|
||||||
nLastTakeoverHeight = it->nHeightOfLastTakeover;
|
nLastTakeoverHeight = it->nHeightOfLastTakeover;
|
||||||
}
|
}
|
||||||
|
else if (!nodesToDelete.count(name) && base->find(name, node)) {
|
||||||
|
claims = node.data.claims;
|
||||||
|
nLastTakeoverHeight = node.data.nHeightOfLastTakeover;
|
||||||
|
}
|
||||||
return {std::move(claims), std::move(supports), nLastTakeoverHeight, name};
|
return {std::move(claims), std::move(supports), nLastTakeoverHeight, name};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -381,60 +415,97 @@ uint256 recursiveMerkleHash(TIterator& it, const iCbType<TIterator>& process, co
|
||||||
return Hash(vchToHash.begin(), vchToHash.end());
|
return Hash(vchToHash.begin(), vchToHash.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool recursiveCheckConsistency(CClaimTrie::const_iterator& it, std::string& failed)
|
bool CClaimTrie::checkConsistency(const uint256& rootHash) const
|
||||||
{
|
{
|
||||||
struct CRecursiveBreak : public std::exception {};
|
CClaimTrieDataNode node;
|
||||||
|
if (!find("", node) || node.data.hash != rootHash) {
|
||||||
using iterator = CClaimTrie::const_iterator;
|
if (rootHash == one)
|
||||||
iCbType<iterator> verify = [&failed](iterator& it) {
|
|
||||||
if (!it.hasChildren()) {
|
|
||||||
// we don't allow a situation of no children and no claims; no empty leaf nodes allowed
|
|
||||||
failed = it.key();
|
|
||||||
throw CRecursiveBreak();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
iCbType<iterator> process = [&failed, &process, &verify](iterator& it) {
|
|
||||||
if (it->hash != recursiveMerkleHash(it, process, verify)) {
|
|
||||||
failed = it.key();
|
|
||||||
throw CRecursiveBreak();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
try {
|
|
||||||
process(it);
|
|
||||||
} catch (const CRecursiveBreak&) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
return error("Mismatched root claim trie hashes. This may happen when there is not a clean process shutdown. Please run with -reindex.");
|
||||||
|
}
|
||||||
|
|
||||||
|
bool success = true;
|
||||||
|
recurseAllHashedNodes("", node, [&success, this](const std::string& name, const CClaimTrieDataNode& node) {
|
||||||
|
if (!success) return;
|
||||||
|
|
||||||
|
success &= contains(name);
|
||||||
|
|
||||||
|
std::vector<uint8_t> vchToHash;
|
||||||
|
const auto pos = name.size();
|
||||||
|
for (auto& child : node.children) {
|
||||||
|
auto key = name + child.first;
|
||||||
|
auto hash = child.second;
|
||||||
|
completeHash(hash, key, pos);
|
||||||
|
vchToHash.push_back(key[pos]);
|
||||||
|
vchToHash.insert(vchToHash.end(), hash.begin(), hash.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
CClaimValue claim;
|
||||||
|
if (node.data.getBestClaim(claim)) {
|
||||||
|
uint256 valueHash = getValueHash(claim.outPoint, node.data.nHeightOfLastTakeover);
|
||||||
|
vchToHash.insert(vchToHash.end(), valueHash.begin(), valueHash.end());
|
||||||
|
} else {
|
||||||
|
success &= !node.children.empty(); // we disallow leaf nodes without claims
|
||||||
|
}
|
||||||
|
|
||||||
|
success &= node.data.hash == Hash(vchToHash.begin(), vchToHash.end());
|
||||||
|
});
|
||||||
|
|
||||||
|
return success;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CClaimTrieCacheBase::checkConsistency() const
|
std::vector<std::pair<std::string, CClaimTrieDataNode>> CClaimTrie::nodes(const std::string &key) const {
|
||||||
{
|
std::vector<std::pair<std::string, CClaimTrieDataNode>> ret;
|
||||||
if (base->empty())
|
CClaimTrieDataNode node;
|
||||||
return true;
|
|
||||||
|
|
||||||
auto it = base->cbegin();
|
if (!find("", node))
|
||||||
std::string failed;
|
return ret;
|
||||||
auto consistent = recursiveCheckConsistency(it, failed);
|
ret.emplace_back("", node);
|
||||||
if (!consistent) {
|
|
||||||
LogPrintf("\nPrinting base tree from its parent:\n");
|
std::string partialKey = key;
|
||||||
auto basePath = base->nodes(failed);
|
|
||||||
if (basePath.size() > 1) basePath.pop_back();
|
while (!node.children.empty()) {
|
||||||
dumpToLog(basePath.back(), false);
|
// auto it = node.children.lower_bound(partialKey); // for using a std::map
|
||||||
auto cachePath = nodesToAddOrUpdate.nodes(failed);
|
auto it = std::lower_bound(node.children.begin(), node.children.end(), std::make_pair(partialKey, uint256()));
|
||||||
if (!cachePath.empty()) {
|
if (it != node.children.end() && it->first == partialKey) {
|
||||||
LogPrintf("\nPrinting %s's parent from cache:\n", failed);
|
// we're completely done
|
||||||
if (cachePath.size() > 1) cachePath.pop_back();
|
if (find(it->second, node))
|
||||||
dumpToLog(cachePath.back(), false);
|
ret.emplace_back(key, node);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
if (!nodesToDelete.empty()) {
|
if (it != node.children.begin()) --it;
|
||||||
std::string joined;
|
const auto count = match(partialKey, it->first);
|
||||||
for (const auto &piece : nodesToDelete) joined += ", " + piece;
|
|
||||||
LogPrintf("Nodes to be deleted: %s\n", joined.substr(2));
|
if (count != it->first.size()) break;
|
||||||
|
if (count == partialKey.size()) break;
|
||||||
|
partialKey = partialKey.substr(count);
|
||||||
|
if (find(it->second, node))
|
||||||
|
ret.emplace_back(key.substr(0, key.size() - partialKey.size()), node);
|
||||||
|
else break;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return consistent;
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool CClaimTrie::contains(const std::string &key) const {
|
||||||
|
return db->Exists(std::make_pair(TRIE_NODE_BY_NAME, key));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool CClaimTrie::empty() const {
|
||||||
|
return !contains("");
|
||||||
|
}
|
||||||
|
|
||||||
|
bool CClaimTrie::find(const std::string &key, CClaimTrieDataNode &node) const {
|
||||||
|
uint256 hash;
|
||||||
|
if (!db->Read(std::make_pair(TRIE_NODE_BY_NAME, key), hash))
|
||||||
|
return false;
|
||||||
|
auto found = find(hash, node);
|
||||||
|
return found;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool CClaimTrie::find(const uint256 &key, CClaimTrieDataNode &node) const {
|
||||||
|
return db->Read(std::make_pair(TRIE_NODE_BY_HASH, key), node);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CClaimTrieCacheBase::getClaimById(const uint160& claimId, std::string& name, CClaimValue& claim) const
|
bool CClaimTrieCacheBase::getClaimById(const uint160& claimId, std::string& name, CClaimValue& claim) const
|
||||||
|
@ -486,99 +557,79 @@ bool CClaimTrieCacheBase::flush()
|
||||||
|
|
||||||
getMerkleHash();
|
getMerkleHash();
|
||||||
|
|
||||||
|
std::set<std::string> forDeletion;
|
||||||
for (const auto& nodeName : nodesToDelete) {
|
for (const auto& nodeName : nodesToDelete) {
|
||||||
if (nodesToAddOrUpdate.contains(nodeName))
|
// TODO: we don't need to deserialize all the nodes right here
|
||||||
continue;
|
// we could be smarter about this and fill in the whole list in removeClaimFromTrie
|
||||||
auto nodes = base->nodes(nodeName);
|
auto nodes = base->nodes(nodeName);
|
||||||
base->erase(nodeName);
|
|
||||||
for (auto& node : nodes)
|
for (auto& node : nodes)
|
||||||
if (!node)
|
forDeletion.insert(node.first);
|
||||||
batch.Erase(std::make_pair(TRIE_NODE, node.key()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto it = nodesToAddOrUpdate.begin(); it != nodesToAddOrUpdate.end(); ++it) {
|
for (auto it = nodesToAddOrUpdate.begin(); it != nodesToAddOrUpdate.end(); ++it) {
|
||||||
auto old = base->find(it.key());
|
forDeletion.erase(it.key());
|
||||||
if (!old || old.data() != it.data()) {
|
if (!dirtyNodes.count(it.key()))
|
||||||
base->copy(it);
|
continue;
|
||||||
batch.Write(std::make_pair(TRIE_NODE, it.key()), it.data());
|
|
||||||
|
CClaimTrieDataNode node;
|
||||||
|
node.data = it.data();
|
||||||
|
for (auto &child: it.children()) // ordering here is important
|
||||||
|
node.children.emplace_back(child.key().substr(it.key().size()), child->hash);
|
||||||
|
|
||||||
|
batch.Write(std::make_pair(TRIE_NODE_BY_HASH, it->hash), node);
|
||||||
|
batch.Write(std::make_pair(TRIE_NODE_BY_NAME, it.key()), it->hash);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (auto& name: forDeletion) {
|
||||||
|
batch.Erase(std::make_pair(TRIE_NODE_BY_NAME, name));
|
||||||
}
|
}
|
||||||
|
|
||||||
BatchWriteQueue(batch, SUPPORT, supportCache);
|
BatchWriteQueue(batch, SUPPORT, supportCache);
|
||||||
|
|
||||||
BatchWriteQueue(batch, CLAIM_QUEUE_ROW, claimQueueCache);
|
BatchWriteQueue(batch, CLAIM_QUEUE_ROW, claimQueueCache);
|
||||||
BatchWriteQueue(batch, CLAIM_QUEUE_NAME_ROW, claimQueueNameCache);
|
BatchWriteQueue(batch, CLAIM_QUEUE_NAME_ROW, claimQueueNameCache);
|
||||||
BatchWriteQueue(batch, EXP_QUEUE_ROW, expirationQueueCache);
|
BatchWriteQueue(batch, CLAIM_EXP_QUEUE_ROW, expirationQueueCache);
|
||||||
|
|
||||||
BatchWriteQueue(batch, SUPPORT_QUEUE_ROW, supportQueueCache);
|
BatchWriteQueue(batch, SUPPORT_QUEUE_ROW, supportQueueCache);
|
||||||
BatchWriteQueue(batch, SUPPORT_QUEUE_NAME_ROW, supportQueueNameCache);
|
BatchWriteQueue(batch, SUPPORT_QUEUE_NAME_ROW, supportQueueNameCache);
|
||||||
BatchWriteQueue(batch, SUPPORT_EXP_QUEUE_ROW, supportExpirationQueueCache);
|
BatchWriteQueue(batch, SUPPORT_EXP_QUEUE_ROW, supportExpirationQueueCache);
|
||||||
|
|
||||||
base->nNextHeight = nNextHeight;
|
base->nNextHeight = nNextHeight;
|
||||||
if (!nodesToAddOrUpdate.empty())
|
if (!nodesToAddOrUpdate.empty() && (LogAcceptCategory(BCLog::CLAIMS) || LogAcceptCategory(BCLog::BENCH))) {
|
||||||
LogPrint(BCLog::CLAIMS, "Cache size: %zu from base size: %zu on block %d\n", nodesToAddOrUpdate.height(), base->height(), nNextHeight);
|
LogPrintf("TrieCache size: %zu nodes on block %d, batch writes %zu bytes.\n",
|
||||||
|
nodesToAddOrUpdate.height(), nNextHeight, batch.SizeEstimate(), base->db->DynamicMemoryUsage());
|
||||||
|
}
|
||||||
auto ret = base->db->WriteBatch(batch);
|
auto ret = base->db->WriteBatch(batch);
|
||||||
clear();
|
clear();
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CClaimTrieCacheBase::ReadFromDisk(const CBlockIndex* tip)
|
bool CClaimTrieCacheBase::validateTrieConsistency(const CBlockIndex* tip)
|
||||||
{
|
{
|
||||||
LogPrintf("Loading the claim trie from disk...\n");
|
if (!tip || tip->nHeight < 1)
|
||||||
|
return true;
|
||||||
base->nNextHeight = nNextHeight = tip ? tip->nHeight + 1 : 0;
|
|
||||||
|
|
||||||
clear();
|
|
||||||
base->clear();
|
|
||||||
boost::scoped_ptr<CDBIterator> pcursor(base->db->NewIterator());
|
|
||||||
|
|
||||||
std::vector<std::pair<std::string, uint256>> hashesOnEmptyNodes;
|
|
||||||
|
|
||||||
for (pcursor->SeekToFirst(); pcursor->Valid(); pcursor->Next()) {
|
|
||||||
std::pair<uint8_t, std::string> key;
|
|
||||||
if (!pcursor->GetKey(key) || key.first != TRIE_NODE)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
CClaimTrieData data;
|
|
||||||
if (pcursor->GetValue(data)) {
|
|
||||||
if (data.empty()) {
|
|
||||||
// we have a situation where our old trie had many empty nodes
|
|
||||||
// we don't want to automatically throw those all into our prefix trie
|
|
||||||
hashesOnEmptyNodes.emplace_back(key.second, data.hash);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// nEffectiveAmount isn't serialized but it needs to be initialized (as done in reorderClaims):
|
|
||||||
auto supports = getSupportsForName(key.second);
|
|
||||||
data.reorderClaims(supports);
|
|
||||||
base->insert(key.second, std::move(data));
|
|
||||||
} else {
|
|
||||||
return error("%s(): error reading claim trie from disk", __func__);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
CDBBatch batch(*(base->db));
|
|
||||||
for (auto& kvp: hashesOnEmptyNodes) {
|
|
||||||
auto hit = base->find(kvp.first);
|
|
||||||
if (hit != base->end())
|
|
||||||
hit->hash = kvp.second;
|
|
||||||
else {
|
|
||||||
// the first time the prefix trie is ran there will be many unused nodes
|
|
||||||
// we need to clean those out so that we can go faster next time
|
|
||||||
batch.Erase(std::make_pair(TRIE_NODE, kvp.first));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
LogPrintf("Checking claim trie consistency... ");
|
LogPrintf("Checking claim trie consistency... ");
|
||||||
if (checkConsistency()) {
|
if (base->checkConsistency(tip->hashClaimTrie)) {
|
||||||
LogPrintf("consistent\n");
|
LogPrintf("consistent\n");
|
||||||
if (tip && tip->hashClaimTrie != getMerkleHash())
|
|
||||||
return error("%s(): hashes don't match when reading claimtrie from disk", __func__);
|
|
||||||
base->db->WriteBatch(batch);
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
LogPrintf("inconsistent!\n");
|
LogPrintf("inconsistent!\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool CClaimTrieCacheBase::ReadFromDisk(const CBlockIndex* tip)
|
||||||
|
{
|
||||||
|
base->nNextHeight = nNextHeight = tip ? tip->nHeight + 1 : 0;
|
||||||
|
clear();
|
||||||
|
|
||||||
|
if (tip && (base->db->Exists(std::make_pair(TRIE_NODE, std::string())) || !base->db->Exists(std::make_pair(TRIE_NODE_BY_HASH, tip->hashClaimTrie)))) {
|
||||||
|
LogPrintf("The claim trie database contains deprecated data and will need to be rebuilt\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return validateTrieConsistency(tip);
|
||||||
|
}
|
||||||
|
|
||||||
CClaimTrieCacheBase::CClaimTrieCacheBase(CClaimTrie* base) : base(base)
|
CClaimTrieCacheBase::CClaimTrieCacheBase(CClaimTrie* base) : base(base)
|
||||||
{
|
{
|
||||||
assert(base);
|
assert(base);
|
||||||
|
@ -590,9 +641,9 @@ int CClaimTrieCacheBase::expirationTime() const
|
||||||
return Params().GetConsensus().nOriginalClaimExpirationTime;
|
return Params().GetConsensus().nOriginalClaimExpirationTime;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint256 CClaimTrieCacheBase::recursiveComputeMerkleHash(CClaimTrie::iterator& it)
|
uint256 CClaimTrieCacheBase::recursiveComputeMerkleHash(CClaimPrefixTrie::iterator& it)
|
||||||
{
|
{
|
||||||
using iterator = CClaimTrie::iterator;
|
using iterator = CClaimPrefixTrie::iterator;
|
||||||
iCbType<iterator> process = [&process](iterator& it) {
|
iCbType<iterator> process = [&process](iterator& it) {
|
||||||
if (it->hash.IsNull())
|
if (it->hash.IsNull())
|
||||||
it->hash = recursiveMerkleHash(it, process);
|
it->hash = recursiveMerkleHash(it, process);
|
||||||
|
@ -604,54 +655,52 @@ uint256 CClaimTrieCacheBase::recursiveComputeMerkleHash(CClaimTrie::iterator& it
|
||||||
uint256 CClaimTrieCacheBase::getMerkleHash()
|
uint256 CClaimTrieCacheBase::getMerkleHash()
|
||||||
{
|
{
|
||||||
auto it = nodesToAddOrUpdate.begin();
|
auto it = nodesToAddOrUpdate.begin();
|
||||||
if (nodesToAddOrUpdate.empty() && nodesToDelete.empty())
|
if (it)
|
||||||
it = base->begin();
|
return recursiveComputeMerkleHash(it);
|
||||||
return !it ? one : recursiveComputeMerkleHash(it);
|
if (nodesToDelete.empty() && nodesAlreadyCached.empty()) {
|
||||||
|
CClaimTrieDataNode node;
|
||||||
|
node.childrenSerialization = false;
|
||||||
|
if (base->find("", node))
|
||||||
|
return node.data.hash; // it may be valuable to have base cache its current root hash
|
||||||
|
}
|
||||||
|
return one; // we have no data or we deleted everything
|
||||||
}
|
}
|
||||||
|
|
||||||
CClaimTrie::const_iterator CClaimTrieCacheBase::begin() const
|
CClaimPrefixTrie::const_iterator CClaimTrieCacheBase::begin() const
|
||||||
{
|
{
|
||||||
return nodesToAddOrUpdate.empty() && nodesToDelete.empty() ? base->cbegin() : nodesToAddOrUpdate.begin();
|
return nodesToAddOrUpdate.begin();
|
||||||
}
|
}
|
||||||
|
|
||||||
CClaimTrie::const_iterator CClaimTrieCacheBase::end() const
|
CClaimPrefixTrie::const_iterator CClaimTrieCacheBase::end() const
|
||||||
{
|
{
|
||||||
return nodesToAddOrUpdate.empty() && nodesToDelete.empty() ? base->cend() : nodesToAddOrUpdate.end();
|
return nodesToAddOrUpdate.end();
|
||||||
}
|
|
||||||
|
|
||||||
CClaimTrie::const_iterator CClaimTrieCacheBase::find(const std::string& name) const
|
|
||||||
{
|
|
||||||
if (auto it = nodesToAddOrUpdate.find(name))
|
|
||||||
return it;
|
|
||||||
return base->find(name);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CClaimTrieCacheBase::empty() const
|
bool CClaimTrieCacheBase::empty() const
|
||||||
{
|
{
|
||||||
return base->empty() && nodesToAddOrUpdate.empty();
|
return nodesToAddOrUpdate.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
CClaimTrie::iterator CClaimTrieCacheBase::cacheData(const std::string& name, bool create)
|
CClaimPrefixTrie::iterator CClaimTrieCacheBase::cacheData(const std::string& name, bool create)
|
||||||
{
|
{
|
||||||
// get data from the cache. if no data, create empty one
|
|
||||||
const auto insert = [this](CClaimTrie::iterator& it) {
|
|
||||||
auto& key = it.key();
|
|
||||||
// we only ever cache nodes once per cache instance
|
|
||||||
if (!nodesAlreadyCached.count(key)) {
|
|
||||||
// do not insert nodes that are already present
|
|
||||||
nodesAlreadyCached.insert(key);
|
|
||||||
nodesToAddOrUpdate.insert(key, it.data());
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// we need all parent nodes and their one level deep children
|
// we need all parent nodes and their one level deep children
|
||||||
// to calculate merkle hash
|
// to calculate merkle hash
|
||||||
auto nodes = base->nodes(name);
|
auto nodes = base->nodes(name);
|
||||||
for (auto& node: nodes) {
|
for (auto& node: nodes) {
|
||||||
for (auto& child : node.children())
|
if (nodesAlreadyCached.insert(node.first).second) {
|
||||||
if (!nodesAlreadyCached.count(child.key()))
|
// do not insert nodes that are already present
|
||||||
nodesToAddOrUpdate.copy(child);
|
nodesToAddOrUpdate.insert(node.first, node.second.data);
|
||||||
insert(node);
|
}
|
||||||
|
for (auto& child : node.second.children) {
|
||||||
|
auto childKey = node.first + child.first;
|
||||||
|
if (nodesAlreadyCached.insert(childKey).second) {
|
||||||
|
CClaimTrieDataNode childNode;
|
||||||
|
childNode.childrenSerialization = false;
|
||||||
|
if (base->find(child.second, childNode)) {
|
||||||
|
nodesToAddOrUpdate.insert(childKey, childNode.data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto it = nodesToAddOrUpdate.find(name);
|
auto it = nodesToAddOrUpdate.find(name);
|
||||||
|
@ -677,10 +726,12 @@ bool CClaimTrieCacheBase::getLastTakeoverForName(const std::string& name, uint16
|
||||||
std::tie(claimId, takeoverHeight) = cit->second;
|
std::tie(claimId, takeoverHeight) = cit->second;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (auto it = base->find(name)) {
|
CClaimTrieDataNode data;
|
||||||
takeoverHeight = it->nHeightOfLastTakeover;
|
data.childrenSerialization = false;
|
||||||
|
if (base->find(name, data)) {
|
||||||
|
takeoverHeight = data.data.nHeightOfLastTakeover;
|
||||||
CClaimValue claim;
|
CClaimValue claim;
|
||||||
if (it->getBestClaim(claim)) {
|
if (data.data.getBestClaim(claim)) {
|
||||||
claimId = claim.claimId;
|
claimId = claim.claimId;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -690,8 +741,10 @@ bool CClaimTrieCacheBase::getLastTakeoverForName(const std::string& name, uint16
|
||||||
|
|
||||||
void CClaimTrieCacheBase::markAsDirty(const std::string& name, bool fCheckTakeover)
|
void CClaimTrieCacheBase::markAsDirty(const std::string& name, bool fCheckTakeover)
|
||||||
{
|
{
|
||||||
for (auto& node : nodesToAddOrUpdate.nodes(name))
|
for (auto& node : nodesToAddOrUpdate.nodes(name)) {
|
||||||
|
dirtyNodes.insert(node.key());
|
||||||
node->hash.SetNull();
|
node->hash.SetNull();
|
||||||
|
}
|
||||||
|
|
||||||
if (fCheckTakeover)
|
if (fCheckTakeover)
|
||||||
namesToCheckForTakeover.insert(name);
|
namesToCheckForTakeover.insert(name);
|
||||||
|
@ -712,7 +765,7 @@ bool CClaimTrieCacheBase::removeClaimFromTrie(const std::string& name, const COu
|
||||||
auto it = cacheData(name, false);
|
auto it = cacheData(name, false);
|
||||||
|
|
||||||
if (!it || !it->removeClaim(outPoint, claim)) {
|
if (!it || !it->removeClaim(outPoint, claim)) {
|
||||||
LogPrint(BCLog::CLAIMS, "%s: Removing a claim was unsuccessful. name = %s, txhash = %s, nOut = %d", __func__, name, outPoint.hash.GetHex(), outPoint.n);
|
LogPrint(BCLog::CLAIMS, "%s: Removing a claim was unsuccessful. name = %s, txhash = %s, nOut = %d\n", __func__, name, outPoint.hash.GetHex(), outPoint.n);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -963,11 +1016,14 @@ bool CClaimTrieCacheBase::removeSupportFromMap(const std::string& name, const CO
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void CClaimTrieCacheBase::dumpToLog(CClaimTrie::const_iterator it, bool diffFromBase) const
|
void CClaimTrieCacheBase::dumpToLog(CClaimPrefixTrie::const_iterator it, bool diffFromBase) const
|
||||||
{
|
{
|
||||||
|
if (!it) return;
|
||||||
|
|
||||||
if (diffFromBase) {
|
if (diffFromBase) {
|
||||||
auto hit = base->find(it.key());
|
CClaimTrieDataNode node;
|
||||||
if (hit && hit->hash == it->hash)
|
node.childrenSerialization = false;
|
||||||
|
if (base->find(it.key(), node) && node.data.hash == it->hash)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1281,8 +1337,16 @@ int CClaimTrieCacheBase::getNumBlocksOfContinuousOwnership(const std::string& na
|
||||||
that->removalWorkaround.erase(hit);
|
that->removalWorkaround.erase(hit);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
auto it = find(name);
|
auto it = nodesToAddOrUpdate.find(name);
|
||||||
return it && !it->empty() ? nNextHeight - it->nHeightOfLastTakeover : 0;
|
if (it && !it->empty())
|
||||||
|
return nNextHeight - it->nHeightOfLastTakeover;
|
||||||
|
if (it) // we specifically ignore deleted nodes here to allow this to fall into the base lookup in that scenario
|
||||||
|
return 0;
|
||||||
|
CClaimTrieDataNode node;
|
||||||
|
node.childrenSerialization = false;
|
||||||
|
if (base->find(name, node) && !node.data.empty())
|
||||||
|
return nNextHeight - node.data.nHeightOfLastTakeover;
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CClaimTrieCacheBase::getDelayForName(const std::string& name) const
|
int CClaimTrieCacheBase::getDelayForName(const std::string& name) const
|
||||||
|
@ -1311,6 +1375,7 @@ bool CClaimTrieCacheBase::clear()
|
||||||
{
|
{
|
||||||
nodesToAddOrUpdate.clear();
|
nodesToAddOrUpdate.clear();
|
||||||
claimsToAddToByIdIndex.clear();
|
claimsToAddToByIdIndex.clear();
|
||||||
|
dirtyNodes.clear();
|
||||||
supportCache.clear();
|
supportCache.clear();
|
||||||
nodesToDelete.clear();
|
nodesToDelete.clear();
|
||||||
claimsToDeleteFromByIdIndex.clear();
|
claimsToDeleteFromByIdIndex.clear();
|
||||||
|
|
|
@ -18,11 +18,13 @@
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
|
|
||||||
// leveldb keys
|
// leveldb keys
|
||||||
#define TRIE_NODE 'n'
|
#define TRIE_NODE 'n' // deprecated
|
||||||
|
#define TRIE_NODE_BY_HASH 'h'
|
||||||
|
#define TRIE_NODE_BY_NAME 'g'
|
||||||
#define CLAIM_BY_ID 'i'
|
#define CLAIM_BY_ID 'i'
|
||||||
#define CLAIM_QUEUE_ROW 'r'
|
#define CLAIM_QUEUE_ROW 'r'
|
||||||
#define CLAIM_QUEUE_NAME_ROW 'm'
|
#define CLAIM_QUEUE_NAME_ROW 'm'
|
||||||
#define EXP_QUEUE_ROW 'e'
|
#define CLAIM_EXP_QUEUE_ROW 'e'
|
||||||
#define SUPPORT 's'
|
#define SUPPORT 's'
|
||||||
#define SUPPORT_QUEUE_ROW 'u'
|
#define SUPPORT_QUEUE_ROW 'u'
|
||||||
#define SUPPORT_QUEUE_NAME_ROW 'p'
|
#define SUPPORT_QUEUE_NAME_ROW 'p'
|
||||||
|
@ -61,6 +63,7 @@ struct CClaimValue
|
||||||
READWRITE(nAmount);
|
READWRITE(nAmount);
|
||||||
READWRITE(nHeight);
|
READWRITE(nHeight);
|
||||||
READWRITE(nValidAtHeight);
|
READWRITE(nValidAtHeight);
|
||||||
|
READWRITE(nEffectiveAmount);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool operator<(const CClaimValue& other) const
|
bool operator<(const CClaimValue& other) const
|
||||||
|
@ -157,17 +160,6 @@ struct CClaimTrieData
|
||||||
inline void SerializationOp(Stream& s, Operation ser_action)
|
inline void SerializationOp(Stream& s, Operation ser_action)
|
||||||
{
|
{
|
||||||
READWRITE(hash);
|
READWRITE(hash);
|
||||||
|
|
||||||
if (ser_action.ForRead()) {
|
|
||||||
if (s.eof()) {
|
|
||||||
claims.clear();
|
|
||||||
nHeightOfLastTakeover = 0;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (claims.empty())
|
|
||||||
return;
|
|
||||||
|
|
||||||
READWRITE(claims);
|
READWRITE(claims);
|
||||||
READWRITE(nHeightOfLastTakeover);
|
READWRITE(nHeightOfLastTakeover);
|
||||||
}
|
}
|
||||||
|
@ -188,6 +180,30 @@ struct CClaimTrieData
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct CClaimTrieDataNode {
|
||||||
|
CClaimTrieData data;
|
||||||
|
// we're using a vector to avoid RAM thrashing and for faster serialization ops.
|
||||||
|
// We're assuming its data is inserted in order and never modified.
|
||||||
|
std::vector<std::pair<std::string, uint256>> children;
|
||||||
|
bool childrenSerialization = true;
|
||||||
|
|
||||||
|
CClaimTrieDataNode() = default;
|
||||||
|
CClaimTrieDataNode(CClaimTrieDataNode&&) = default;
|
||||||
|
CClaimTrieDataNode(const CClaimTrieDataNode&) = default;
|
||||||
|
CClaimTrieDataNode& operator=(CClaimTrieDataNode&&) = default;
|
||||||
|
CClaimTrieDataNode& operator=(const CClaimTrieDataNode& d) = default;
|
||||||
|
|
||||||
|
ADD_SERIALIZE_METHODS;
|
||||||
|
|
||||||
|
template <typename Stream, typename Operation>
|
||||||
|
inline void SerializationOp(Stream& s, Operation ser_action)
|
||||||
|
{
|
||||||
|
READWRITE(data);
|
||||||
|
if (childrenSerialization) // wanting constexpr but hoping the compiler is smart enough anyway
|
||||||
|
READWRITE(children);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
struct COutPointHeightType
|
struct COutPointHeightType
|
||||||
{
|
{
|
||||||
COutPoint outPoint;
|
COutPoint outPoint;
|
||||||
|
@ -301,7 +317,7 @@ struct CClaimsForNameType
|
||||||
CClaimsForNameType& operator=(const CClaimsForNameType&) = default;
|
CClaimsForNameType& operator=(const CClaimsForNameType&) = default;
|
||||||
};
|
};
|
||||||
|
|
||||||
class CClaimTrie : public CPrefixTrie<std::string, CClaimTrieData>
|
class CClaimTrie
|
||||||
{
|
{
|
||||||
int nNextHeight = 0;
|
int nNextHeight = 0;
|
||||||
int nProportionalDelayFactor = 0;
|
int nProportionalDelayFactor = 0;
|
||||||
|
@ -322,6 +338,19 @@ public:
|
||||||
friend struct ClaimTrieChainFixture;
|
friend struct ClaimTrieChainFixture;
|
||||||
friend class CClaimTrieCacheExpirationFork;
|
friend class CClaimTrieCacheExpirationFork;
|
||||||
friend class CClaimTrieCacheNormalizationFork;
|
friend class CClaimTrieCacheNormalizationFork;
|
||||||
|
|
||||||
|
std::size_t getTotalNamesInTrie() const;
|
||||||
|
std::size_t getTotalClaimsInTrie() const;
|
||||||
|
CAmount getTotalValueOfClaimsInTrie(bool fControllingOnly) const;
|
||||||
|
bool checkConsistency(const uint256& rootHash) const;
|
||||||
|
|
||||||
|
bool contains(const std::string& key) const;
|
||||||
|
bool empty() const;
|
||||||
|
bool find(const uint256& key, CClaimTrieDataNode& node) const;
|
||||||
|
bool find(const std::string& key, CClaimTrieDataNode& node) const;
|
||||||
|
|
||||||
|
std::vector<std::pair<std::string, CClaimTrieDataNode>> nodes(const std::string& key) const;
|
||||||
|
void recurseAllHashedNodes(const std::string& name, const CClaimTrieDataNode& current, std::function<void(const std::string&, const CClaimTrieDataNode&)> function) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct CClaimTrieProofNode
|
struct CClaimTrieProofNode
|
||||||
|
@ -381,6 +410,8 @@ typedef std::map<int, expirationQueueRowType> expirationQueueType;
|
||||||
typedef std::set<CClaimValue> claimIndexClaimListType;
|
typedef std::set<CClaimValue> claimIndexClaimListType;
|
||||||
typedef std::vector<CClaimIndexElement> claimIndexElementListType;
|
typedef std::vector<CClaimIndexElement> claimIndexElementListType;
|
||||||
|
|
||||||
|
typedef CPrefixTrie<std::string, CClaimTrieData> CClaimPrefixTrie;
|
||||||
|
|
||||||
class CClaimTrieCacheBase
|
class CClaimTrieCacheBase
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
@ -388,7 +419,6 @@ public:
|
||||||
virtual ~CClaimTrieCacheBase() = default;
|
virtual ~CClaimTrieCacheBase() = default;
|
||||||
|
|
||||||
uint256 getMerkleHash();
|
uint256 getMerkleHash();
|
||||||
bool checkConsistency() const;
|
|
||||||
|
|
||||||
bool getClaimById(const uint160& claimId, std::string& name, CClaimValue& claim) const;
|
bool getClaimById(const uint160& claimId, std::string& name, CClaimValue& claim) const;
|
||||||
|
|
||||||
|
@ -402,10 +432,6 @@ public:
|
||||||
bool haveSupport(const std::string& name, const COutPoint& outPoint) const;
|
bool haveSupport(const std::string& name, const COutPoint& outPoint) const;
|
||||||
bool haveSupportInQueue(const std::string& name, const COutPoint& outPoint, int& nValidAtHeight);
|
bool haveSupportInQueue(const std::string& name, const COutPoint& outPoint, int& nValidAtHeight);
|
||||||
|
|
||||||
std::size_t getTotalNamesInTrie() const;
|
|
||||||
std::size_t getTotalClaimsInTrie() const;
|
|
||||||
CAmount getTotalValueOfClaimsInTrie(bool fControllingOnly) const;
|
|
||||||
|
|
||||||
bool addClaim(const std::string& name, const COutPoint& outPoint, const uint160& claimId, CAmount nAmount, int nHeight);
|
bool addClaim(const std::string& name, const COutPoint& outPoint, const uint160& claimId, CAmount nAmount, int nHeight);
|
||||||
bool undoAddClaim(const std::string& name, const COutPoint& outPoint, int nHeight);
|
bool undoAddClaim(const std::string& name, const COutPoint& outPoint, int nHeight);
|
||||||
|
|
||||||
|
@ -441,18 +467,18 @@ public:
|
||||||
CAmount getEffectiveAmountForClaim(const std::string& name, const uint160& claimId, std::vector<CSupportValue>* supports = nullptr) const;
|
CAmount getEffectiveAmountForClaim(const std::string& name, const uint160& claimId, std::vector<CSupportValue>* supports = nullptr) const;
|
||||||
CAmount getEffectiveAmountForClaim(const CClaimsForNameType& claims, const uint160& claimId, std::vector<CSupportValue>* supports = nullptr) const;
|
CAmount getEffectiveAmountForClaim(const CClaimsForNameType& claims, const uint160& claimId, std::vector<CSupportValue>* supports = nullptr) const;
|
||||||
|
|
||||||
CClaimTrie::const_iterator begin() const;
|
CClaimPrefixTrie::const_iterator begin() const;
|
||||||
CClaimTrie::const_iterator end() const;
|
CClaimPrefixTrie::const_iterator end() const;
|
||||||
CClaimTrie::const_iterator find(const std::string& name) const;
|
|
||||||
|
|
||||||
void dumpToLog(CClaimTrie::const_iterator it, bool diffFromBase = true) const;
|
void dumpToLog(CClaimPrefixTrie::const_iterator it, bool diffFromBase = true) const;
|
||||||
|
virtual std::string adjustNameForValidHeight(const std::string& name, int validHeight) const;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
CClaimTrie* base;
|
CClaimTrie* base;
|
||||||
CClaimTrie nodesToAddOrUpdate; // nodes pulled in from base (and possibly modified thereafter), written to base on flush
|
CClaimPrefixTrie nodesToAddOrUpdate; // nodes pulled in from base (and possibly modified thereafter), written to base on flush
|
||||||
std::unordered_set<std::string> namesToCheckForTakeover; // takeover numbers are updated on increment
|
std::unordered_set<std::string> namesToCheckForTakeover; // takeover numbers are updated on increment
|
||||||
|
|
||||||
uint256 recursiveComputeMerkleHash(CClaimTrie::iterator& it);
|
uint256 recursiveComputeMerkleHash(CClaimPrefixTrie::iterator& it);
|
||||||
|
|
||||||
virtual bool insertClaimIntoTrie(const std::string& name, const CClaimValue& claim, bool fCheckTakeover);
|
virtual bool insertClaimIntoTrie(const std::string& name, const CClaimValue& claim, bool fCheckTakeover);
|
||||||
virtual bool removeClaimFromTrie(const std::string& name, const COutPoint& outPoint, CClaimValue& claim, bool fCheckTakeover);
|
virtual bool removeClaimFromTrie(const std::string& name, const COutPoint& outPoint, CClaimValue& claim, bool fCheckTakeover);
|
||||||
|
@ -460,14 +486,12 @@ protected:
|
||||||
virtual bool insertSupportIntoMap(const std::string& name, const CSupportValue& support, bool fCheckTakeover);
|
virtual bool insertSupportIntoMap(const std::string& name, const CSupportValue& support, bool fCheckTakeover);
|
||||||
virtual bool removeSupportFromMap(const std::string& name, const COutPoint& outPoint, CSupportValue& support, bool fCheckTakeover);
|
virtual bool removeSupportFromMap(const std::string& name, const COutPoint& outPoint, CSupportValue& support, bool fCheckTakeover);
|
||||||
|
|
||||||
virtual std::string adjustNameForValidHeight(const std::string& name, int validHeight) const;
|
|
||||||
|
|
||||||
supportEntryType getSupportsForName(const std::string& name) const;
|
supportEntryType getSupportsForName(const std::string& name) const;
|
||||||
|
|
||||||
int getDelayForName(const std::string& name) const;
|
int getDelayForName(const std::string& name) const;
|
||||||
virtual int getDelayForName(const std::string& name, const uint160& claimId) const;
|
virtual int getDelayForName(const std::string& name, const uint160& claimId) const;
|
||||||
|
|
||||||
CClaimTrie::iterator cacheData(const std::string& name, bool create = true);
|
CClaimPrefixTrie::iterator cacheData(const std::string& name, bool create = true);
|
||||||
|
|
||||||
bool getLastTakeoverForName(const std::string& name, uint160& claimId, int& takeoverHeight) const;
|
bool getLastTakeoverForName(const std::string& name, uint160& claimId, int& takeoverHeight) const;
|
||||||
|
|
||||||
|
@ -499,6 +523,7 @@ private:
|
||||||
std::unordered_set<std::string> nodesAlreadyCached; // set of nodes already pulled into cache from base
|
std::unordered_set<std::string> nodesAlreadyCached; // set of nodes already pulled into cache from base
|
||||||
std::unordered_map<std::string, bool> takeoverWorkaround;
|
std::unordered_map<std::string, bool> takeoverWorkaround;
|
||||||
std::unordered_set<std::string> removalWorkaround;
|
std::unordered_set<std::string> removalWorkaround;
|
||||||
|
std::unordered_set<std::string> dirtyNodes;
|
||||||
|
|
||||||
bool shouldUseTakeoverWorkaround(const std::string& key) const;
|
bool shouldUseTakeoverWorkaround(const std::string& key) const;
|
||||||
void addTakeoverWorkaroundPotential(const std::string& key);
|
void addTakeoverWorkaroundPotential(const std::string& key);
|
||||||
|
@ -510,6 +535,8 @@ private:
|
||||||
bool removeSupport(const std::string& name, const COutPoint& outPoint, int nHeight, int& nValidAtHeight, bool fCheckTakeover);
|
bool removeSupport(const std::string& name, const COutPoint& outPoint, int nHeight, int& nValidAtHeight, bool fCheckTakeover);
|
||||||
bool removeClaim(const std::string& name, const COutPoint& outPoint, int nHeight, int& nValidAtHeight, bool fCheckTakeover);
|
bool removeClaim(const std::string& name, const COutPoint& outPoint, int nHeight, int& nValidAtHeight, bool fCheckTakeover);
|
||||||
|
|
||||||
|
bool validateTrieConsistency(const CBlockIndex* tip);
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
std::pair<const int, std::vector<queueEntryType<T>>>* getQueueCacheRow(int nHeight, bool createIfNotExists = false);
|
std::pair<const int, std::vector<queueEntryType<T>>>* getQueueCacheRow(int nHeight, bool createIfNotExists = false);
|
||||||
|
|
||||||
|
@ -614,6 +641,7 @@ public:
|
||||||
bool getProofForName(const std::string& name, CClaimTrieProof& proof) override;
|
bool getProofForName(const std::string& name, CClaimTrieProof& proof) override;
|
||||||
bool getInfoForName(const std::string& name, CClaimValue& claim) const override;
|
bool getInfoForName(const std::string& name, CClaimValue& claim) const override;
|
||||||
CClaimsForNameType getClaimsForName(const std::string& name) const override;
|
CClaimsForNameType getClaimsForName(const std::string& name) const override;
|
||||||
|
std::string adjustNameForValidHeight(const std::string& name, int validHeight) const override;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
bool insertClaimIntoTrie(const std::string& name, const CClaimValue& claim, bool fCheckTakeover) override;
|
bool insertClaimIntoTrie(const std::string& name, const CClaimValue& claim, bool fCheckTakeover) override;
|
||||||
|
@ -624,8 +652,6 @@ protected:
|
||||||
|
|
||||||
int getDelayForName(const std::string& name, const uint160& claimId) const override;
|
int getDelayForName(const std::string& name, const uint160& claimId) const override;
|
||||||
|
|
||||||
std::string adjustNameForValidHeight(const std::string& name, int validHeight) const override;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool overrideInsertNormalization;
|
bool overrideInsertNormalization;
|
||||||
bool overrideRemoveNormalization;
|
bool overrideRemoveNormalization;
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
#include <boost/locale/conversion.hpp>
|
#include <boost/locale/conversion.hpp>
|
||||||
#include <boost/locale/localization_backend.hpp>
|
#include <boost/locale/localization_backend.hpp>
|
||||||
#include <boost/scope_exit.hpp>
|
#include <boost/scope_exit.hpp>
|
||||||
|
#include <boost/scoped_ptr.hpp>
|
||||||
|
|
||||||
CClaimTrieCacheExpirationFork::CClaimTrieCacheExpirationFork(CClaimTrie* base)
|
CClaimTrieCacheExpirationFork::CClaimTrieCacheExpirationFork(CClaimTrie* base)
|
||||||
: CClaimTrieCacheBase(base)
|
: CClaimTrieCacheBase(base)
|
||||||
|
@ -66,7 +67,7 @@ bool CClaimTrieCacheExpirationFork::forkForExpirationChange(bool increment)
|
||||||
if (!pcursor->GetKey(key))
|
if (!pcursor->GetKey(key))
|
||||||
continue;
|
continue;
|
||||||
int height = key.second;
|
int height = key.second;
|
||||||
if (key.first == EXP_QUEUE_ROW) {
|
if (key.first == CLAIM_EXP_QUEUE_ROW) {
|
||||||
expirationQueueRowType row;
|
expirationQueueRowType row;
|
||||||
if (pcursor->GetValue(row)) {
|
if (pcursor->GetValue(row)) {
|
||||||
reactivateClaim(row, height, increment);
|
reactivateClaim(row, height, increment);
|
||||||
|
@ -160,40 +161,48 @@ bool CClaimTrieCacheNormalizationFork::normalizeAllNamesInTrieIfNecessary(insert
|
||||||
// run the one-time upgrade of all names that need to change
|
// run the one-time upgrade of all names that need to change
|
||||||
// it modifies the (cache) trie as it goes, so we need to grab everything to be modified first
|
// it modifies the (cache) trie as it goes, so we need to grab everything to be modified first
|
||||||
|
|
||||||
for (auto it = base->begin(); it != base->end(); ++it) {
|
boost::scoped_ptr<CDBIterator> pcursor(base->db->NewIterator());
|
||||||
const std::string normalized = normalizeClaimName(it.key(), true);
|
for (pcursor->SeekToFirst(); pcursor->Valid(); pcursor->Next()) {
|
||||||
if (normalized == it.key())
|
std::pair<uint8_t, std::string> key;
|
||||||
|
if (!pcursor->GetKey(key) || key.first != TRIE_NODE_BY_NAME)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
auto supports = getSupportsForName(it.key());
|
const auto& name = key.second;
|
||||||
|
const std::string normalized = normalizeClaimName(name, true);
|
||||||
|
if (normalized == key.second)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
auto supports = getSupportsForName(name);
|
||||||
for (auto support : supports) {
|
for (auto support : supports) {
|
||||||
// if it's already going to expire just skip it
|
// if it's already going to expire just skip it
|
||||||
if (support.nHeight + expirationTime() <= nNextHeight)
|
if (support.nHeight + expirationTime() <= nNextHeight)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
assert(removeSupportFromMap(it.key(), support.outPoint, support, false));
|
assert(removeSupportFromMap(name, support.outPoint, support, false));
|
||||||
expireSupportUndo.emplace_back(it.key(), support);
|
expireSupportUndo.emplace_back(name, support);
|
||||||
assert(insertSupportIntoMap(normalized, support, false));
|
assert(insertSupportIntoMap(normalized, support, false));
|
||||||
insertSupportUndo.emplace_back(it.key(), support.outPoint, -1);
|
insertSupportUndo.emplace_back(name, support.outPoint, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
namesToCheckForTakeover.insert(normalized);
|
namesToCheckForTakeover.insert(normalized);
|
||||||
|
|
||||||
auto cached = cacheData(it.key(), false);
|
auto cached = cacheData(name, false);
|
||||||
if (!cached || cached->empty())
|
if (!cached || cached->empty())
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
for (auto claim : it->claims) {
|
auto claimsCopy = cached->claims;
|
||||||
|
auto takeoverHeightCopy = cached->nHeightOfLastTakeover;
|
||||||
|
for (auto claim : claimsCopy) {
|
||||||
if (claim.nHeight + expirationTime() <= nNextHeight)
|
if (claim.nHeight + expirationTime() <= nNextHeight)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
assert(removeClaimFromTrie(it.key(), claim.outPoint, claim, false));
|
assert(removeClaimFromTrie(name, claim.outPoint, claim, false));
|
||||||
removeUndo.emplace_back(it.key(), claim);
|
removeUndo.emplace_back(name, claim);
|
||||||
assert(insertClaimIntoTrie(normalized, claim, true));
|
assert(insertClaimIntoTrie(normalized, claim, true));
|
||||||
insertUndo.emplace_back(it.key(), claim.outPoint, -1);
|
insertUndo.emplace_back(name, claim.outPoint, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
takeoverHeightUndo.emplace_back(it.key(), it->nHeightOfLastTakeover);
|
takeoverHeightUndo.emplace_back(name, takeoverHeightCopy);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -97,11 +97,45 @@ static void SetMaxOpenFiles(leveldb::Options *options) {
|
||||||
options->max_open_files, default_open_files);
|
options->max_open_files, default_open_files);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// leveldb::Cache wrapper that forwards to an inner cache created with
// leveldb::NewLRUCache(capacity), but only admits entries whose key is at
// most maxKeyLen bytes long. All other operations are plain pass-throughs.
class CappedLenCache: public leveldb::Cache {
|
||||||
|
// Owned underlying cache; released in the destructor.
leveldb::Cache* inner;
|
||||||
|
// Largest key size (in bytes, per Slice::size()) that Insert will forward.
std::size_t maxKeyLen;
|
||||||
|
public:
|
||||||
|
// capacity is handed to leveldb::NewLRUCache; maxKeyLen is the admission cap.
CappedLenCache(std::size_t capacity, std::size_t maxKeyLen)
|
||||||
|
: inner(leveldb::NewLRUCache(capacity)), maxKeyLen(maxKeyLen) {}
|
||||||
|
|
||||||
|
~CappedLenCache() override { delete inner; }
|
||||||
|
|
||||||
|
// Inserts into the inner cache when key.size() <= maxKeyLen. Otherwise the
// entry is rejected: deleter(key, value) is invoked immediately and nullptr
// is returned instead of a handle.
// NOTE(review): Release() and Value() below forward the handle unchecked, so
// callers must tolerate a null Handle from Insert — confirm against the
// leveldb call sites before enabling this cache.
Handle* Insert(const leveldb::Slice& key, void* value, size_t charge,
|
||||||
|
void (*deleter)(const leveldb::Slice& key, void* value)) override {
|
||||||
|
if (key.size() <= maxKeyLen)
|
||||||
|
return inner->Insert(key, value, charge, deleter);
|
||||||
|
// Key too long: dispose of the value now rather than caching it.
deleter(key, value);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The remaining members delegate directly to the inner cache.
Handle* Lookup(const leveldb::Slice& key) override { return inner->Lookup(key); }
|
||||||
|
void Release(Handle* handle) override { return inner->Release(handle); }
|
||||||
|
void* Value(Handle* handle) override { return inner->Value(handle); }
|
||||||
|
void Erase(const leveldb::Slice& key) override {return inner->Erase(key); }
|
||||||
|
uint64_t NewId() override { return inner->NewId(); }
|
||||||
|
};
|
||||||
|
|
||||||
static leveldb::Options GetOptions(size_t nCacheSize)
|
static leveldb::Options GetOptions(size_t nCacheSize)
|
||||||
{
|
{
|
||||||
leveldb::Options options;
|
leveldb::Options options;
|
||||||
auto write_cache = std::min(nCacheSize / 4, size_t(16) << 20U); // cap write_cache at 16MB (4x default)
|
|
||||||
|
options.filter_policy=leveldb::NewBloomFilterPolicy2(16);
|
||||||
|
options.write_buffer_size=60 * 1024 * 1024;
|
||||||
|
options.total_leveldb_mem=2500ULL * 1024ULL * 1024ULL;
|
||||||
|
options.env=leveldb::Env::Default();
|
||||||
|
options.compression = leveldb::kNoCompression;
|
||||||
|
options.info_log = new CBitcoinLevelDBLogger();
|
||||||
|
return options;
|
||||||
|
|
||||||
|
auto write_cache = std::min(nCacheSize / 4, size_t(4 * 1024 * 1024)); // cap write_cache at 4MB (default)
|
||||||
options.block_cache = leveldb::NewLRUCache(nCacheSize - write_cache * 2);
|
options.block_cache = leveldb::NewLRUCache(nCacheSize - write_cache * 2);
|
||||||
|
// options.block_cache = new CappedLenCache(nCacheSize - write_cache * 2, 6);
|
||||||
options.write_buffer_size = write_cache; // up to two write buffers may be held in memory simultaneously
|
options.write_buffer_size = write_cache; // up to two write buffers may be held in memory simultaneously
|
||||||
options.filter_policy = leveldb::NewBloomFilterPolicy(10);
|
options.filter_policy = leveldb::NewBloomFilterPolicy(10);
|
||||||
options.compression = leveldb::kNoCompression;
|
options.compression = leveldb::kNoCompression;
|
||||||
|
@ -112,6 +146,7 @@ static leveldb::Options GetOptions(size_t nCacheSize)
|
||||||
options.paranoid_checks = true;
|
options.paranoid_checks = true;
|
||||||
}
|
}
|
||||||
SetMaxOpenFiles(&options);
|
SetMaxOpenFiles(&options);
|
||||||
|
options.max_open_files = 30000;
|
||||||
return options;
|
return options;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -81,7 +81,7 @@ public:
|
||||||
ssValue.Xor(dbwrapper_private::GetObfuscateKey(parent));
|
ssValue.Xor(dbwrapper_private::GetObfuscateKey(parent));
|
||||||
leveldb::Slice slValue(ssValue.data(), ssValue.size());
|
leveldb::Slice slValue(ssValue.data(), ssValue.size());
|
||||||
|
|
||||||
batch.Put(slKey, slValue);
|
batch.Put(slKey, slValue, nullptr);
|
||||||
// LevelDB serializes writes as:
|
// LevelDB serializes writes as:
|
||||||
// - byte: header
|
// - byte: header
|
||||||
// - varint: key length (1 byte up to 127B, 2 bytes up to 16383B, ...)
|
// - varint: key length (1 byte up to 127B, 2 bytes up to 16383B, ...)
|
||||||
|
|
|
@ -1461,7 +1461,7 @@ bool AppInitMain()
|
||||||
pblocktree.reset();
|
pblocktree.reset();
|
||||||
pblocktree.reset(new CBlockTreeDB(nBlockTreeDBCache, false, fReset));
|
pblocktree.reset(new CBlockTreeDB(nBlockTreeDBCache, false, fReset));
|
||||||
delete pclaimTrie;
|
delete pclaimTrie;
|
||||||
pclaimTrie = new CClaimTrie(false, fReindex);
|
pclaimTrie = new CClaimTrie(false, fReindex || fReindexChainState);
|
||||||
|
|
||||||
if (fReset) {
|
if (fReset) {
|
||||||
pblocktree->WriteReindexing(true);
|
pblocktree->WriteReindexing(true);
|
||||||
|
|
13
src/leveldb/.gitignore
vendored
13
src/leveldb/.gitignore
vendored
|
@ -1,13 +0,0 @@
|
||||||
build_config.mk
|
|
||||||
*.a
|
|
||||||
*.o
|
|
||||||
*.dylib*
|
|
||||||
*.so
|
|
||||||
*.so.*
|
|
||||||
*_test
|
|
||||||
db_bench
|
|
||||||
leveldbutil
|
|
||||||
Release
|
|
||||||
Debug
|
|
||||||
Benchmark
|
|
||||||
vs2010.*
|
|
|
@ -6,7 +6,3 @@ Google Inc.
|
||||||
# Initial version authors:
|
# Initial version authors:
|
||||||
Jeffrey Dean <jeff@google.com>
|
Jeffrey Dean <jeff@google.com>
|
||||||
Sanjay Ghemawat <sanjay@google.com>
|
Sanjay Ghemawat <sanjay@google.com>
|
||||||
|
|
||||||
# Partial list of contributors:
|
|
||||||
Kevin Regan <kevin.d.regan@gmail.com>
|
|
||||||
Johan Bilien <jobi@litl.com>
|
|
||||||
|
|
72
src/leveldb/BASHO_RELEASES
Normal file
72
src/leveldb/BASHO_RELEASES
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
github.com tag 2.0.34 - February 15, 2017
|
||||||
|
-----------------------------------------
|
||||||
|
mv-hot-backup2: - correct MakeTieredDbname() within db/filename.cc
|
||||||
|
for case where dbname input is blank and fast/slow
|
||||||
|
already populated in options. Corrects issue
|
||||||
|
with hot backup in non-tiered storage situations
|
||||||
|
|
||||||
|
github.com tag 2.0.33 - November 21, 2016
|
||||||
|
-----------------------------------------
|
||||||
|
mv-bucket-expiry: - partial branch to enable X-Riak-Meta-Expiry-Base-Seconds
|
||||||
|
property within enterprise edition
|
||||||
|
|
||||||
|
--- no 2.0.32 tag on leveldb ---
|
||||||
|
|
||||||
|
github.com tag 2.0.31 - November 1, 2016
|
||||||
|
----------------------------------------
|
||||||
|
- version shipped with Riak 2.2
|
||||||
|
mv-no-md-expiry: - Riak specific
|
||||||
|
- never convert a key prefix of sext:encoded "{md" to expiry
|
||||||
|
- update sst_scan for dumping Riak formatted keys
|
||||||
|
mv-tuning8: - rework penalty rules in version_set.cc UpdatePenalty()
|
||||||
|
- add unit test framework for UpdatePenalty()
|
||||||
|
|
||||||
|
github.com tag 2.0.30 - October 11, 2016
|
||||||
|
----------------------------------------
|
||||||
|
mv-delayed-bloom: - when opening an .sst table file, only load
|
||||||
|
bloom filter on second Get() operation. Saves time.
|
||||||
|
- correct VersionSet::Finalize() logic for level 1 when
|
||||||
|
level 2 is above desired size
|
||||||
|
- move hot backup to Riak ee build
|
||||||
|
|
||||||
|
github.com tag 2.0.29 - September 13, 2016
|
||||||
|
------------------------------------------
|
||||||
|
mv-expiry-manifest: only switch to expiry enabled manifest format
|
||||||
|
if expiry function enabled. Eases downgrade
|
||||||
|
during early Riak releases containing expiry
|
||||||
|
|
||||||
|
github.com tag 2.0.28 - September 6, 2016
|
||||||
|
-----------------------------------------
|
||||||
|
mv-hot-backup: add externally triggered hot backup feature
|
||||||
|
|
||||||
|
github.com tag 2.0.27 - August 22, 2016
|
||||||
|
---------------------------------------
|
||||||
|
mv-mem-fences: fix iterator double delete bug in eleveldb and
|
||||||
|
build better memory fenced operations for referenced count objects.
|
||||||
|
|
||||||
|
github.com tag 2.0.26 - August 21, 2016
|
||||||
|
---------------------------------------
|
||||||
|
mv-expiry-iter-bug: DBImpl::NewIterator() was not setting the new expiry parameter.
|
||||||
|
|
||||||
|
github.com tag 2.0.25 - August 10, 2016
|
||||||
|
---------------------------------------
|
||||||
|
Make LZ4 the default compression instead of Snappy.
|
||||||
|
|
||||||
|
github.com tag 2.0.24 - August 2, 2016
|
||||||
|
--------------------------------------
|
||||||
|
mv-expiry: open source expiry. Supports one expiry policy for all databases.
|
||||||
|
|
||||||
|
github.com tag 2.0.23 - July 20, 2016
|
||||||
|
-------------------------------------
|
||||||
|
mv-no-semaphore: remove semaphore controlled thread in hot_threads.cc. Instead use
|
||||||
|
mutex of thread 0 (only one thread's mutex) to address known race condition.
|
||||||
|
|
||||||
|
github.com tag 2.0.22 - June 22, 2016
|
||||||
|
-------------------------------------
|
||||||
|
no change: iterator fix in eleveldb
|
||||||
|
|
||||||
|
github.com tag 2.0.21 - June 16, 2016
|
||||||
|
-------------------------------------
|
||||||
|
branch mv-iterator-hot-threads: correct condition where eleveldb MoveTask
|
||||||
|
could hang an iterator. (https://github.com/basho/leveldb/wiki/mv-iterator-hot-threads)
|
||||||
|
|
|
@ -1,36 +0,0 @@
|
||||||
# Contributing
|
|
||||||
|
|
||||||
We'd love to accept your code patches! However, before we can take them, we
|
|
||||||
have to jump a couple of legal hurdles.
|
|
||||||
|
|
||||||
## Contributor License Agreements
|
|
||||||
|
|
||||||
Please fill out either the individual or corporate Contributor License
|
|
||||||
Agreement as appropriate.
|
|
||||||
|
|
||||||
* If you are an individual writing original source code and you're sure you
|
|
||||||
own the intellectual property, then sign an [individual CLA](https://developers.google.com/open-source/cla/individual).
|
|
||||||
* If you work for a company that wants to allow you to contribute your work,
|
|
||||||
then sign a [corporate CLA](https://developers.google.com/open-source/cla/corporate).
|
|
||||||
|
|
||||||
Follow either of the two links above to access the appropriate CLA and
|
|
||||||
instructions for how to sign and return it.
|
|
||||||
|
|
||||||
## Submitting a Patch
|
|
||||||
|
|
||||||
1. Sign the contributors license agreement above.
|
|
||||||
2. Decide which code you want to submit. A submission should be a set of changes
|
|
||||||
that addresses one issue in the [issue tracker](https://github.com/google/leveldb/issues).
|
|
||||||
Please don't mix more than one logical change per submission, because it makes
|
|
||||||
the history hard to follow. If you want to make a change
|
|
||||||
(e.g. add a sample or feature) that doesn't have a corresponding issue in the
|
|
||||||
issue tracker, please create one.
|
|
||||||
3. **Submitting**: When you are ready to submit, send us a Pull Request. Be
|
|
||||||
sure to include the issue number you fixed and the name you used to sign
|
|
||||||
the CLA.
|
|
||||||
|
|
||||||
## Writing Code ##
|
|
||||||
|
|
||||||
If your contribution contains code, please make sure that it follows
|
|
||||||
[the style guide](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml).
|
|
||||||
Otherwise we will have to ask you to make changes, and that's no fun for anyone.
|
|
|
@ -2,423 +2,219 @@
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
# found in the LICENSE file. See the AUTHORS file for names of contributors.
|
# found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
|
# Inherit some settings from environment variables, if available
|
||||||
|
INSTALL_PATH ?= $(CURDIR)
|
||||||
|
|
||||||
#-----------------------------------------------
|
#-----------------------------------------------
|
||||||
# Uncomment exactly one of the lines labelled (A), (B), and (C) below
|
# Uncomment exactly one of the lines labelled (A), (B), and (C) below
|
||||||
# to switch between compilation modes.
|
# to switch between compilation modes.
|
||||||
|
# NOTE: targets "debug" and "prof" provide same functionality
|
||||||
|
# NOTE 2: -DNDEBUG disables assert() statements within C code,
|
||||||
|
# i.e. no assert()s in production code
|
||||||
|
|
||||||
# (A) Production use (optimized mode)
|
OPT ?= -O2 -g -DNDEBUG # (A) Production use (optimized mode)
|
||||||
OPT ?= -O2 -DNDEBUG
|
# OPT ?= -g2 # (B) Debug mode, w/ full line-level debugging symbols
|
||||||
# (B) Debug mode, w/ full line-level debugging symbols
|
# OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
|
||||||
# OPT ?= -g2
|
|
||||||
# (C) Profiling mode: opt, but w/debugging symbols
|
|
||||||
# OPT ?= -O2 -g2 -DNDEBUG
|
|
||||||
#-----------------------------------------------
|
#-----------------------------------------------
|
||||||
|
|
||||||
# detect what platform we're building on
|
# detect what platform we're building on
|
||||||
$(shell CC="$(CC)" CXX="$(CXX)" TARGET_OS="$(TARGET_OS)" \
|
ifeq ($(wildcard build_config.mk),)
|
||||||
./build_detect_platform build_config.mk ./)
|
$(shell ./build_detect_platform build_config.mk)
|
||||||
|
endif
|
||||||
# this file is generated by the previous line to set build flags and sources
|
# this file is generated by the previous line to set build flags and sources
|
||||||
include build_config.mk
|
include build_config.mk
|
||||||
|
|
||||||
TESTS = \
|
|
||||||
db/autocompact_test \
|
|
||||||
db/c_test \
|
|
||||||
db/corruption_test \
|
|
||||||
db/db_test \
|
|
||||||
db/dbformat_test \
|
|
||||||
db/fault_injection_test \
|
|
||||||
db/filename_test \
|
|
||||||
db/log_test \
|
|
||||||
db/recovery_test \
|
|
||||||
db/skiplist_test \
|
|
||||||
db/version_edit_test \
|
|
||||||
db/version_set_test \
|
|
||||||
db/write_batch_test \
|
|
||||||
helpers/memenv/memenv_test \
|
|
||||||
issues/issue178_test \
|
|
||||||
issues/issue200_test \
|
|
||||||
table/filter_block_test \
|
|
||||||
table/table_test \
|
|
||||||
util/arena_test \
|
|
||||||
util/bloom_test \
|
|
||||||
util/cache_test \
|
|
||||||
util/coding_test \
|
|
||||||
util/crc32c_test \
|
|
||||||
util/env_posix_test \
|
|
||||||
util/env_test \
|
|
||||||
util/hash_test
|
|
||||||
|
|
||||||
UTILS = \
|
|
||||||
db/db_bench \
|
|
||||||
db/leveldbutil
|
|
||||||
|
|
||||||
# Put the object files in a subdirectory, but the application at the top of the object dir.
|
|
||||||
PROGNAMES := $(notdir $(TESTS) $(UTILS))
|
|
||||||
|
|
||||||
# On Linux may need libkyotocabinet-dev for dependency.
|
|
||||||
BENCHMARKS = \
|
|
||||||
doc/bench/db_bench_sqlite3 \
|
|
||||||
doc/bench/db_bench_tree_db
|
|
||||||
|
|
||||||
CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
|
CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
|
||||||
CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT)
|
CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT)
|
||||||
|
|
||||||
LDFLAGS += $(PLATFORM_LDFLAGS)
|
LDFLAGS += $(PLATFORM_LDFLAGS)
|
||||||
LIBS += $(PLATFORM_LIBS)
|
|
||||||
|
|
||||||
SIMULATOR_OUTDIR=out-ios-x86
|
LIBOBJECTS := $(SOURCES:.cc=.o)
|
||||||
DEVICE_OUTDIR=out-ios-arm
|
LIBOBJECTS += util/lz4.o
|
||||||
|
MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
|
||||||
|
DEPEND := $(SOURCES:.cc=.d)
|
||||||
|
|
||||||
ifeq ($(PLATFORM), IOS)
|
TESTUTIL = ./util/testutil.o
|
||||||
# Note: iOS should probably be using libtool, not ar.
|
TESTHARNESS = ./util/testharness.o $(TESTUTIL)
|
||||||
AR=xcrun ar
|
|
||||||
SIMULATORSDK=$(shell xcrun -sdk iphonesimulator --show-sdk-path)
|
TESTS := $(sort $(notdir $(basename $(TEST_SOURCES))))
|
||||||
DEVICESDK=$(shell xcrun -sdk iphoneos --show-sdk-path)
|
|
||||||
DEVICE_CFLAGS = -isysroot "$(DEVICESDK)" -arch armv6 -arch armv7 -arch armv7s -arch arm64
|
TOOLS = \
|
||||||
SIMULATOR_CFLAGS = -isysroot "$(SIMULATORSDK)" -arch i686 -arch x86_64
|
leveldb_repair \
|
||||||
STATIC_OUTDIR=out-ios-universal
|
perf_dump \
|
||||||
|
sst_rewrite \
|
||||||
|
sst_scan
|
||||||
|
|
||||||
|
PROGRAMS = db_bench $(TESTS) $(TOOLS)
|
||||||
|
BENCHMARKS = db_bench_sqlite3 db_bench_tree_db
|
||||||
|
|
||||||
|
LIBRARY = libleveldb.a
|
||||||
|
MEMENVLIBRARY = libmemenv.a
|
||||||
|
|
||||||
|
#
|
||||||
|
# static link leveldb to tools to simplify platform usage (if Linux)
|
||||||
|
#
|
||||||
|
ifeq ($(PLATFORM),OS_LINUX)
|
||||||
|
LEVEL_LDFLAGS := -L . -Wl,-non_shared -lleveldb -Wl,-call_shared
|
||||||
else
|
else
|
||||||
STATIC_OUTDIR=out-static
|
LEVEL_LDFLAGS := -L . -lleveldb
|
||||||
SHARED_OUTDIR=out-shared
|
|
||||||
STATIC_PROGRAMS := $(addprefix $(STATIC_OUTDIR)/, $(PROGNAMES))
|
|
||||||
SHARED_PROGRAMS := $(addprefix $(SHARED_OUTDIR)/, db_bench)
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
STATIC_LIBOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(SOURCES:.cc=.o))
|
|
||||||
STATIC_MEMENVOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
|
|
||||||
|
|
||||||
DEVICE_LIBOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(SOURCES:.cc=.o))
|
|
||||||
DEVICE_MEMENVOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
|
|
||||||
|
|
||||||
SIMULATOR_LIBOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(SOURCES:.cc=.o))
|
|
||||||
SIMULATOR_MEMENVOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
|
|
||||||
|
|
||||||
SHARED_LIBOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(SOURCES:.cc=.o))
|
|
||||||
SHARED_MEMENVOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
|
|
||||||
|
|
||||||
TESTUTIL := $(STATIC_OUTDIR)/util/testutil.o
|
|
||||||
TESTHARNESS := $(STATIC_OUTDIR)/util/testharness.o $(TESTUTIL)
|
|
||||||
|
|
||||||
STATIC_TESTOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(TESTS)))
|
|
||||||
STATIC_UTILOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(UTILS)))
|
|
||||||
STATIC_ALLOBJS := $(STATIC_LIBOBJECTS) $(STATIC_MEMENVOBJECTS) $(STATIC_TESTOBJS) $(STATIC_UTILOBJS) $(TESTHARNESS)
|
|
||||||
DEVICE_ALLOBJS := $(DEVICE_LIBOBJECTS) $(DEVICE_MEMENVOBJECTS)
|
|
||||||
SIMULATOR_ALLOBJS := $(SIMULATOR_LIBOBJECTS) $(SIMULATOR_MEMENVOBJECTS)
|
|
||||||
|
|
||||||
default: all
|
default: all
|
||||||
|
|
||||||
# Should we build shared libraries?
|
# Should we build shared libraries?
|
||||||
ifneq ($(PLATFORM_SHARED_EXT),)
|
ifneq ($(PLATFORM_SHARED_EXT),)
|
||||||
|
|
||||||
# Many leveldb test apps use non-exported API's. Only build a subset for testing.
|
|
||||||
SHARED_ALLOBJS := $(SHARED_LIBOBJECTS) $(SHARED_MEMENVOBJECTS) $(TESTHARNESS)
|
|
||||||
|
|
||||||
ifneq ($(PLATFORM_SHARED_VERSIONED),true)
|
ifneq ($(PLATFORM_SHARED_VERSIONED),true)
|
||||||
SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT)
|
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
|
||||||
SHARED_LIB2 = $(SHARED_LIB1)
|
SHARED2 = $(SHARED1)
|
||||||
SHARED_LIB3 = $(SHARED_LIB1)
|
SHARED3 = $(SHARED1)
|
||||||
SHARED_LIBS = $(SHARED_LIB1)
|
SHARED = $(SHARED1)
|
||||||
SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a
|
|
||||||
else
|
else
|
||||||
# Update db.h if you change these.
|
# Update db.h if you change these.
|
||||||
SHARED_VERSION_MAJOR = 1
|
SHARED_MAJOR = 1
|
||||||
SHARED_VERSION_MINOR = 20
|
SHARED_MINOR = 9
|
||||||
SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT)
|
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
|
||||||
SHARED_LIB2 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR)
|
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
|
||||||
SHARED_LIB3 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR).$(SHARED_VERSION_MINOR)
|
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
|
||||||
SHARED_LIBS = $(SHARED_OUTDIR)/$(SHARED_LIB1) $(SHARED_OUTDIR)/$(SHARED_LIB2) $(SHARED_OUTDIR)/$(SHARED_LIB3)
|
SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
|
||||||
$(SHARED_OUTDIR)/$(SHARED_LIB1): $(SHARED_OUTDIR)/$(SHARED_LIB3)
|
$(SHARED1): $(SHARED3)
|
||||||
ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB1)
|
ln -fs $(SHARED3) $(SHARED1)
|
||||||
$(SHARED_OUTDIR)/$(SHARED_LIB2): $(SHARED_OUTDIR)/$(SHARED_LIB3)
|
$(SHARED2): $(SHARED3)
|
||||||
ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB2)
|
ln -fs $(SHARED3) $(SHARED2)
|
||||||
SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
$(SHARED_OUTDIR)/$(SHARED_LIB3): $(SHARED_LIBOBJECTS)
|
$(SHARED3): $(LIBOBJECTS)
|
||||||
$(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED_LIB2) $(SHARED_LIBOBJECTS) -o $(SHARED_OUTDIR)/$(SHARED_LIB3) $(LIBS)
|
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LIBOBJECTS) -o $(SHARED3) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2)
|
||||||
|
|
||||||
endif # PLATFORM_SHARED_EXT
|
endif # PLATFORM_SHARED_EXT
|
||||||
|
|
||||||
all: $(SHARED_LIBS) $(SHARED_PROGRAMS) $(STATIC_OUTDIR)/libleveldb.a $(STATIC_OUTDIR)/libmemenv.a $(STATIC_PROGRAMS)
|
all: $(SHARED) $(LIBRARY)
|
||||||
|
|
||||||
check: $(STATIC_PROGRAMS)
|
test check: all $(PROGRAMS) $(TESTS)
|
||||||
for t in $(notdir $(TESTS)); do echo "***** Running $$t"; $(STATIC_OUTDIR)/$$t || exit 1; done
|
for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
|
||||||
|
|
||||||
clean:
|
tools: all $(TOOLS)
|
||||||
-rm -rf out-static out-shared out-ios-x86 out-ios-arm out-ios-universal
|
|
||||||
-rm -f build_config.mk
|
|
||||||
-rm -rf ios-x86 ios-arm
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR):
|
#
|
||||||
mkdir $@
|
# command line targets: debug and prof
|
||||||
|
# just like
|
||||||
$(STATIC_OUTDIR)/db: | $(STATIC_OUTDIR)
|
ifneq ($(filter debug,$(MAKECMDGOALS)),)
|
||||||
mkdir $@
|
OPT := -g2 # (B) Debug mode, w/ full line-level debugging symbols
|
||||||
|
debug: all
|
||||||
$(STATIC_OUTDIR)/helpers/memenv: | $(STATIC_OUTDIR)
|
|
||||||
mkdir -p $@
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/port: | $(STATIC_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/table: | $(STATIC_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/util: | $(STATIC_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
.PHONY: STATIC_OBJDIRS
|
|
||||||
STATIC_OBJDIRS: \
|
|
||||||
$(STATIC_OUTDIR)/db \
|
|
||||||
$(STATIC_OUTDIR)/port \
|
|
||||||
$(STATIC_OUTDIR)/table \
|
|
||||||
$(STATIC_OUTDIR)/util \
|
|
||||||
$(STATIC_OUTDIR)/helpers/memenv
|
|
||||||
|
|
||||||
$(SHARED_OUTDIR):
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(SHARED_OUTDIR)/db: | $(SHARED_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(SHARED_OUTDIR)/helpers/memenv: | $(SHARED_OUTDIR)
|
|
||||||
mkdir -p $@
|
|
||||||
|
|
||||||
$(SHARED_OUTDIR)/port: | $(SHARED_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(SHARED_OUTDIR)/table: | $(SHARED_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(SHARED_OUTDIR)/util: | $(SHARED_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
.PHONY: SHARED_OBJDIRS
|
|
||||||
SHARED_OBJDIRS: \
|
|
||||||
$(SHARED_OUTDIR)/db \
|
|
||||||
$(SHARED_OUTDIR)/port \
|
|
||||||
$(SHARED_OUTDIR)/table \
|
|
||||||
$(SHARED_OUTDIR)/util \
|
|
||||||
$(SHARED_OUTDIR)/helpers/memenv
|
|
||||||
|
|
||||||
$(DEVICE_OUTDIR):
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(DEVICE_OUTDIR)/db: | $(DEVICE_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(DEVICE_OUTDIR)/helpers/memenv: | $(DEVICE_OUTDIR)
|
|
||||||
mkdir -p $@
|
|
||||||
|
|
||||||
$(DEVICE_OUTDIR)/port: | $(DEVICE_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(DEVICE_OUTDIR)/table: | $(DEVICE_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(DEVICE_OUTDIR)/util: | $(DEVICE_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
.PHONY: DEVICE_OBJDIRS
|
|
||||||
DEVICE_OBJDIRS: \
|
|
||||||
$(DEVICE_OUTDIR)/db \
|
|
||||||
$(DEVICE_OUTDIR)/port \
|
|
||||||
$(DEVICE_OUTDIR)/table \
|
|
||||||
$(DEVICE_OUTDIR)/util \
|
|
||||||
$(DEVICE_OUTDIR)/helpers/memenv
|
|
||||||
|
|
||||||
$(SIMULATOR_OUTDIR):
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(SIMULATOR_OUTDIR)/db: | $(SIMULATOR_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(SIMULATOR_OUTDIR)/helpers/memenv: | $(SIMULATOR_OUTDIR)
|
|
||||||
mkdir -p $@
|
|
||||||
|
|
||||||
$(SIMULATOR_OUTDIR)/port: | $(SIMULATOR_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(SIMULATOR_OUTDIR)/table: | $(SIMULATOR_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
$(SIMULATOR_OUTDIR)/util: | $(SIMULATOR_OUTDIR)
|
|
||||||
mkdir $@
|
|
||||||
|
|
||||||
.PHONY: SIMULATOR_OBJDIRS
|
|
||||||
SIMULATOR_OBJDIRS: \
|
|
||||||
$(SIMULATOR_OUTDIR)/db \
|
|
||||||
$(SIMULATOR_OUTDIR)/port \
|
|
||||||
$(SIMULATOR_OUTDIR)/table \
|
|
||||||
$(SIMULATOR_OUTDIR)/util \
|
|
||||||
$(SIMULATOR_OUTDIR)/helpers/memenv
|
|
||||||
|
|
||||||
$(STATIC_ALLOBJS): | STATIC_OBJDIRS
|
|
||||||
$(DEVICE_ALLOBJS): | DEVICE_OBJDIRS
|
|
||||||
$(SIMULATOR_ALLOBJS): | SIMULATOR_OBJDIRS
|
|
||||||
$(SHARED_ALLOBJS): | SHARED_OBJDIRS
|
|
||||||
|
|
||||||
ifeq ($(PLATFORM), IOS)
|
|
||||||
$(DEVICE_OUTDIR)/libleveldb.a: $(DEVICE_LIBOBJECTS)
|
|
||||||
rm -f $@
|
|
||||||
$(AR) -rs $@ $(DEVICE_LIBOBJECTS)
|
|
||||||
|
|
||||||
$(SIMULATOR_OUTDIR)/libleveldb.a: $(SIMULATOR_LIBOBJECTS)
|
|
||||||
rm -f $@
|
|
||||||
$(AR) -rs $@ $(SIMULATOR_LIBOBJECTS)
|
|
||||||
|
|
||||||
$(DEVICE_OUTDIR)/libmemenv.a: $(DEVICE_MEMENVOBJECTS)
|
|
||||||
rm -f $@
|
|
||||||
$(AR) -rs $@ $(DEVICE_MEMENVOBJECTS)
|
|
||||||
|
|
||||||
$(SIMULATOR_OUTDIR)/libmemenv.a: $(SIMULATOR_MEMENVOBJECTS)
|
|
||||||
rm -f $@
|
|
||||||
$(AR) -rs $@ $(SIMULATOR_MEMENVOBJECTS)
|
|
||||||
|
|
||||||
# For iOS, create universal object libraries to be used on both the simulator and
|
|
||||||
# a device.
|
|
||||||
$(STATIC_OUTDIR)/libleveldb.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a
|
|
||||||
lipo -create $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a -output $@
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/libmemenv.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a
|
|
||||||
lipo -create $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a -output $@
|
|
||||||
else
|
|
||||||
$(STATIC_OUTDIR)/libleveldb.a:$(STATIC_LIBOBJECTS)
|
|
||||||
rm -f $@
|
|
||||||
$(AR) -rs $@ $(STATIC_LIBOBJECTS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/libmemenv.a:$(STATIC_MEMENVOBJECTS)
|
|
||||||
rm -f $@
|
|
||||||
$(AR) -rs $@ $(STATIC_MEMENVOBJECTS)
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
$(SHARED_MEMENVLIB):$(SHARED_MEMENVOBJECTS)
|
ifneq ($(filter prof,$(MAKECMDGOALS)),)
|
||||||
|
OPT := -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
|
||||||
|
prof: all
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
clean:
|
||||||
|
-rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) */*.o */*/*.o */*.d */*/*.d ios-x86/*/*.o ios-arm/*/*.o build_config.mk include/leveldb/ldb_config.h
|
||||||
|
-rm -rf ios-x86/* ios-arm/* *.dSYM
|
||||||
|
|
||||||
|
|
||||||
|
$(LIBRARY): $(LIBOBJECTS)
|
||||||
rm -f $@
|
rm -f $@
|
||||||
$(AR) -rs $@ $(SHARED_MEMENVOBJECTS)
|
$(AR) -rs $@ $(LIBOBJECTS)
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/db_bench:db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
|
#
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ $(LIBS)
|
# all tools, programs, and tests depend upon the static library
|
||||||
|
$(TESTS) $(PROGRAMS) $(TOOLS) : $(LIBRARY)
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/db_bench_sqlite3:doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
|
#
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lsqlite3 $(LIBS)
|
# all tests depend upon the test harness
|
||||||
|
$(TESTS) : $(TESTHARNESS)
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/db_bench_tree_db:doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL)
|
#
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lkyotocabinet $(LIBS)
|
# tools, programs, and tests will compile to the root directory
|
||||||
|
# but their .cc source file will be in one of the following subdirectories
|
||||||
|
vpath %.cc db:table:util:leveldb_ee:leveldb_os
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/leveldbutil:db/leveldbutil.cc $(STATIC_LIBOBJECTS)
|
# special case for c_test
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/leveldbutil.cc $(STATIC_LIBOBJECTS) -o $@ $(LIBS)
|
vpath %.c db
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/arena_test:util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
db_bench: db/db_bench.o $(LIBRARY) $(TESTUTIL)
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTUTIL) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/autocompact_test:db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBRARY) $(TESTUTIL)
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/bloom_test:util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBRARY) $(TESTUTIL)
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/c_test:$(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/cache_test:util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
#
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
# build line taken from lz4 makefile
|
||||||
|
#
|
||||||
|
util/lz4.o: util/lz4.c util/lz4.h
|
||||||
|
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -O3 -std=c99 -Wall -Wextra -Wundef -Wshadow -Wcast-qual -Wcast-align -Wstrict-prototypes -pedantic -DLZ4_VERSION=\"r130\" -c util/lz4.c -o util/lz4.o
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/coding_test:util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
#
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
# memory env
|
||||||
|
#
|
||||||
|
$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
|
||||||
|
rm -f $@
|
||||||
|
$(AR) -rs $@ $(MEMENVOBJECTS)
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/corruption_test:db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
$(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/crc32c_test:util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
#
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
# IOS build
|
||||||
|
#
|
||||||
|
ifeq ($(PLATFORM), IOS)
|
||||||
|
# For iOS, create universal object files to be used on both the simulator and
|
||||||
|
# a device.
|
||||||
|
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
|
||||||
|
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
|
||||||
|
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
|
||||||
|
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/db_test:db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
.cc.o:
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
mkdir -p ios-x86/$(dir $@)
|
||||||
|
$(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
|
||||||
|
mkdir -p ios-arm/$(dir $@)
|
||||||
|
$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
|
||||||
|
lipo ios-x86/$@ ios-arm/$@ -create -output $@
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/dbformat_test:db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
.c.o:
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
mkdir -p ios-x86/$(dir $@)
|
||||||
|
$(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
|
||||||
|
mkdir -p ios-arm/$(dir $@)
|
||||||
|
$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
|
||||||
|
lipo ios-x86/$@ ios-arm/$@ -create -output $@
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/env_posix_test:util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
else
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
#
|
||||||
|
# build for everything NOT IOS
|
||||||
$(STATIC_OUTDIR)/env_test:util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
#
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
.cc.o:
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/fault_injection_test:db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/filename_test:db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/filter_block_test:table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/hash_test:util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/issue178_test:issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/issue200_test:issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/log_test:db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/recovery_test:db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/table_test:table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/skiplist_test:db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/version_edit_test:db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/version_set_test:db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/write_batch_test:db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
|
|
||||||
$(CXX) $(LDFLAGS) $(CXXFLAGS) db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/memenv_test:$(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS)
|
|
||||||
$(XCRUN) $(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
$(SHARED_OUTDIR)/db_bench:$(SHARED_OUTDIR)/db/db_bench.o $(SHARED_LIBS) $(TESTUTIL)
|
|
||||||
$(XCRUN) $(CXX) $(LDFLAGS) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SHARED_OUTDIR)/db/db_bench.o $(TESTUTIL) $(SHARED_OUTDIR)/$(SHARED_LIB3) -o $@ $(LIBS)
|
|
||||||
|
|
||||||
.PHONY: run-shared
|
|
||||||
run-shared: $(SHARED_OUTDIR)/db_bench
|
|
||||||
LD_LIBRARY_PATH=$(SHARED_OUTDIR) $(SHARED_OUTDIR)/db_bench
|
|
||||||
|
|
||||||
$(SIMULATOR_OUTDIR)/%.o: %.cc
|
|
||||||
xcrun -sdk iphonesimulator $(CXX) $(CXXFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@
|
|
||||||
|
|
||||||
$(DEVICE_OUTDIR)/%.o: %.cc
|
|
||||||
xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) $(DEVICE_CFLAGS) -c $< -o $@
|
|
||||||
|
|
||||||
$(SIMULATOR_OUTDIR)/%.o: %.c
|
|
||||||
xcrun -sdk iphonesimulator $(CC) $(CFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@
|
|
||||||
|
|
||||||
$(DEVICE_OUTDIR)/%.o: %.c
|
|
||||||
xcrun -sdk iphoneos $(CC) $(CFLAGS) $(DEVICE_CFLAGS) -c $< -o $@
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/%.o: %.cc
|
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/%.o: %.c
|
|
||||||
$(CC) $(CFLAGS) -c $< -o $@
|
|
||||||
|
|
||||||
$(SHARED_OUTDIR)/%.o: %.cc
|
|
||||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
|
||||||
|
|
||||||
$(SHARED_OUTDIR)/%.o: %.c
|
.c.o:
|
||||||
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
|
||||||
|
|
||||||
$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
|
## @echo -- Creating dependency file for $<
|
||||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
|
%.d: %.cc
|
||||||
|
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM -E -MT $(basename $@).d -MT $(basename $@).o -MF $@ $<
|
||||||
|
@echo $(basename $@).o: $(basename $@).d >>$@
|
||||||
|
|
||||||
$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
|
# generic build for command line tests
|
||||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
|
%: %.cc
|
||||||
|
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
|
||||||
|
|
||||||
|
%: db/%.c
|
||||||
|
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< $(TESTHARNESS) -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
|
||||||
|
|
||||||
|
# for tools, omits test harness
|
||||||
|
%: tools/%.cc
|
||||||
|
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $< -o $@ $(LEVEL_LDFLAGS) $(LDFLAGS)
|
||||||
|
|
||||||
|
endif
|
||||||
|
|
||||||
|
#
|
||||||
|
# load dependency files
|
||||||
|
#
|
||||||
|
ifeq ($(filter tar clean allclean distclean,$(MAKECMDGOALS)),)
|
||||||
|
-include $(DEPEND)
|
||||||
|
endif
|
||||||
|
|
83
src/leveldb/README
Normal file
83
src/leveldb/README
Normal file
|
@ -0,0 +1,83 @@
|
||||||
|
leveldb: A key-value store
|
||||||
|
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
|
||||||
|
|
||||||
|
The original Google README is now README.GOOGLE.
|
||||||
|
|
||||||
|
** Introduction
|
||||||
|
|
||||||
|
This repository contains the Google source code as modified to benefit
|
||||||
|
the Riak environment. The typical Riak environment has two attributes
|
||||||
|
that necessitate leveldb adjustments, both in options and code:
|
||||||
|
|
||||||
|
- production servers: Riak often runs in heavy Internet environments:
|
||||||
|
servers with many CPU cores, lots of memory, and 24x7 disk activity.
|
||||||
|
Basho's leveldb takes advantage of the environment by adding
|
||||||
|
hardware CRC calculation, increasing Bloom filter accuracy, and
|
||||||
|
defaulting to integrity checking enabled.
|
||||||
|
|
||||||
|
- multiple databases open: Riak opens 8 to 128 databases
|
||||||
|
simultaneously. Google's leveldb supports this, but its background
|
||||||
|
compaction thread can fall behind. leveldb will "stall" new user
|
||||||
|
writes whenever the compaction thread gets too far behind. Basho's
|
||||||
|
leveldb modifications include multiple thread blocks that each
|
||||||
|
contain prioritized threads for specific compaction activities.
|
||||||
|
|
||||||
|
Details for Basho's customizations exist in the leveldb wiki:
|
||||||
|
|
||||||
|
http://github.com/basho/leveldb/wiki
|
||||||
|
|
||||||
|
|
||||||
|
** Branch pattern
|
||||||
|
|
||||||
|
This repository follows the Basho standard for branch management
|
||||||
|
as of November 28, 2013. The standard is found here:
|
||||||
|
|
||||||
|
https://github.com/basho/riak/wiki/Basho-repository-management
|
||||||
|
|
||||||
|
In summary, the "develop" branch contains the most recently reviewed
|
||||||
|
engineering work. The "master" branch contains the most recently
|
||||||
|
released work, i.e. distributed as part of a Riak release.
|
||||||
|
|
||||||
|
|
||||||
|
** Basic options needed
|
||||||
|
|
||||||
|
Those wishing to truly savor the benefits of Basho's modifications
|
||||||
|
need to initialize a new leveldb::Options structure similar to the
|
||||||
|
following before each call to leveldb::DB::Open:
|
||||||
|
|
||||||
|
leveldb::Options * options;
|
||||||
|
|
||||||
|
options=new leveldb::Options;
|
||||||
|
|
||||||
|
options->filter_policy=leveldb::NewBloomFilterPolicy2(16);
|
||||||
|
options->write_buffer_size=62914560; // 60Mbytes
|
||||||
|
options->total_leveldb_mem=2684354560; // 2.5Gbytes (details below)
|
||||||
|
options->env=leveldb::Env::Default();
|
||||||
|
|
||||||
|
|
||||||
|
** Memory plan
|
||||||
|
|
||||||
|
Basho's leveldb dramatically departed from Google's original internal
|
||||||
|
memory allotment plan with Riak 2.0. Basho's leveldb uses a methodology
|
||||||
|
called flexcache. The technical details are here:
|
||||||
|
|
||||||
|
https://github.com/basho/leveldb/wiki/mv-flexcache
|
||||||
|
|
||||||
|
The key points are:
|
||||||
|
|
||||||
|
- options.total_leveldb_mem is an allocation for the entire process,
|
||||||
|
not a single database
|
||||||
|
|
||||||
|
- giving different values to options.total_leveldb_mem on subsequent Open
|
||||||
|
calls causes memory to rearrange to current value across all databases
|
||||||
|
|
||||||
|
- recommended minimum for Basho's leveldb is 340Mbytes per database.
|
||||||
|
|
||||||
|
- performance improves rapidly from 340Mbytes to 2.5Gbytes per database (3.0Gbytes
|
||||||
|
if using Riak's active anti-entropy). Even more is nice, but not as helpful.
|
||||||
|
|
||||||
|
- never assign more than 75% of available RAM to total_leveldb_mem. There is
|
||||||
|
too much unaccounted memory overhead (worse if you use tcmalloc library).
|
||||||
|
|
||||||
|
- options.max_open_files and options.block_cache should not be used.
|
||||||
|
|
51
src/leveldb/README.GOOGLE
Normal file
51
src/leveldb/README.GOOGLE
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
leveldb: A key-value store
|
||||||
|
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
|
||||||
|
|
||||||
|
The code under this directory implements a system for maintaining a
|
||||||
|
persistent key/value store.
|
||||||
|
|
||||||
|
See doc/index.html for more explanation.
|
||||||
|
See doc/impl.html for a brief overview of the implementation.
|
||||||
|
|
||||||
|
The public interface is in include/*.h. Callers should not include or
|
||||||
|
rely on the details of any other header files in this package. Those
|
||||||
|
internal APIs may be changed without warning.
|
||||||
|
|
||||||
|
Guide to header files:
|
||||||
|
|
||||||
|
include/db.h
|
||||||
|
Main interface to the DB: Start here
|
||||||
|
|
||||||
|
include/options.h
|
||||||
|
Control over the behavior of an entire database, and also
|
||||||
|
control over the behavior of individual reads and writes.
|
||||||
|
|
||||||
|
include/comparator.h
|
||||||
|
Abstraction for user-specified comparison function. If you want
|
||||||
|
just bytewise comparison of keys, you can use the default comparator,
|
||||||
|
but clients can write their own comparator implementations if they
|
||||||
|
want custom ordering (e.g. to handle different character
|
||||||
|
encodings, etc.)
|
||||||
|
|
||||||
|
include/iterator.h
|
||||||
|
Interface for iterating over data. You can get an iterator
|
||||||
|
from a DB object.
|
||||||
|
|
||||||
|
include/write_batch.h
|
||||||
|
Interface for atomically applying multiple updates to a database.
|
||||||
|
|
||||||
|
include/slice.h
|
||||||
|
A simple module for maintaining a pointer and a length into some
|
||||||
|
other byte array.
|
||||||
|
|
||||||
|
include/status.h
|
||||||
|
Status is returned from many of the public interfaces and is used
|
||||||
|
to report success and various kinds of errors.
|
||||||
|
|
||||||
|
include/env.h
|
||||||
|
Abstraction of the OS environment. A posix implementation of
|
||||||
|
this interface is in util/env_posix.cc
|
||||||
|
|
||||||
|
include/table.h
|
||||||
|
include/table_builder.h
|
||||||
|
Lower-level modules that most clients probably won't use directly
|
|
@ -1,174 +0,0 @@
|
||||||
**LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.**
|
|
||||||
|
|
||||||
[![Build Status](https://travis-ci.org/google/leveldb.svg?branch=master)](https://travis-ci.org/google/leveldb)
|
|
||||||
|
|
||||||
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
|
|
||||||
|
|
||||||
# Features
|
|
||||||
* Keys and values are arbitrary byte arrays.
|
|
||||||
* Data is stored sorted by key.
|
|
||||||
* Callers can provide a custom comparison function to override the sort order.
|
|
||||||
* The basic operations are `Put(key,value)`, `Get(key)`, `Delete(key)`.
|
|
||||||
* Multiple changes can be made in one atomic batch.
|
|
||||||
* Users can create a transient snapshot to get a consistent view of data.
|
|
||||||
* Forward and backward iteration is supported over the data.
|
|
||||||
* Data is automatically compressed using the [Snappy compression library](http://google.github.io/snappy/).
|
|
||||||
* External activity (file system operations etc.) is relayed through a virtual interface so users can customize the operating system interactions.
|
|
||||||
|
|
||||||
# Documentation
|
|
||||||
[LevelDB library documentation](https://github.com/google/leveldb/blob/master/doc/index.md) is online and bundled with the source code.
|
|
||||||
|
|
||||||
|
|
||||||
# Limitations
|
|
||||||
* This is not a SQL database. It does not have a relational data model, it does not support SQL queries, and it has no support for indexes.
|
|
||||||
* Only a single process (possibly multi-threaded) can access a particular database at a time.
|
|
||||||
* There is no client-server support built in to the library. An application that needs such support will have to wrap its own server around the library.
|
|
||||||
|
|
||||||
# Contributing to the leveldb Project
|
|
||||||
The leveldb project welcomes contributions. leveldb's primary goal is to be
|
|
||||||
a reliable and fast key/value store. Changes that are in line with the
|
|
||||||
features/limitations outlined above, and meet the requirements below,
|
|
||||||
will be considered.
|
|
||||||
|
|
||||||
Contribution requirements:
|
|
||||||
|
|
||||||
1. **POSIX only**. We _generally_ will only accept changes that are both
|
|
||||||
compiled, and tested on a POSIX platform - usually Linux. Very small
|
|
||||||
changes will sometimes be accepted, but consider that more of an
|
|
||||||
exception than the rule.
|
|
||||||
|
|
||||||
2. **Stable API**. We strive very hard to maintain a stable API. Changes that
|
|
||||||
require changes for projects using leveldb _might_ be rejected without
|
|
||||||
sufficient benefit to the project.
|
|
||||||
|
|
||||||
3. **Tests**: All changes must be accompanied by a new (or changed) test, or
|
|
||||||
a sufficient explanation as to why a new (or changed) test is not required.
|
|
||||||
|
|
||||||
## Submitting a Pull Request
|
|
||||||
Before any pull request will be accepted the author must first sign a
|
|
||||||
Contributor License Agreement (CLA) at https://cla.developers.google.com/.
|
|
||||||
|
|
||||||
In order to keep the commit timeline linear
|
|
||||||
[squash](https://git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Squashing-Commits)
|
|
||||||
your changes down to a single commit and [rebase](https://git-scm.com/docs/git-rebase)
|
|
||||||
on google/leveldb/master. This keeps the commit timeline linear and more easily sync'ed
|
|
||||||
with the internal repository at Google. More information at GitHub's
|
|
||||||
[About Git rebase](https://help.github.com/articles/about-git-rebase/) page.
|
|
||||||
|
|
||||||
# Performance
|
|
||||||
|
|
||||||
Here is a performance report (with explanations) from the run of the
|
|
||||||
included db_bench program. The results are somewhat noisy, but should
|
|
||||||
be enough to get a ballpark performance estimate.
|
|
||||||
|
|
||||||
## Setup
|
|
||||||
|
|
||||||
We use a database with a million entries. Each entry has a 16 byte
|
|
||||||
key, and a 100 byte value. Values used by the benchmark compress to
|
|
||||||
about half their original size.
|
|
||||||
|
|
||||||
LevelDB: version 1.1
|
|
||||||
Date: Sun May 1 12:11:26 2011
|
|
||||||
CPU: 4 x Intel(R) Core(TM)2 Quad CPU Q6600 @ 2.40GHz
|
|
||||||
CPUCache: 4096 KB
|
|
||||||
Keys: 16 bytes each
|
|
||||||
Values: 100 bytes each (50 bytes after compression)
|
|
||||||
Entries: 1000000
|
|
||||||
Raw Size: 110.6 MB (estimated)
|
|
||||||
File Size: 62.9 MB (estimated)
|
|
||||||
|
|
||||||
## Write performance
|
|
||||||
|
|
||||||
The "fill" benchmarks create a brand new database, in either
|
|
||||||
sequential, or random order. The "fillsync" benchmark flushes data
|
|
||||||
from the operating system to the disk after every operation; the other
|
|
||||||
write operations leave the data sitting in the operating system buffer
|
|
||||||
cache for a while. The "overwrite" benchmark does random writes that
|
|
||||||
update existing keys in the database.
|
|
||||||
|
|
||||||
fillseq : 1.765 micros/op; 62.7 MB/s
|
|
||||||
fillsync : 268.409 micros/op; 0.4 MB/s (10000 ops)
|
|
||||||
fillrandom : 2.460 micros/op; 45.0 MB/s
|
|
||||||
overwrite : 2.380 micros/op; 46.5 MB/s
|
|
||||||
|
|
||||||
Each "op" above corresponds to a write of a single key/value pair.
|
|
||||||
I.e., a random write benchmark goes at approximately 400,000 writes per second.
|
|
||||||
|
|
||||||
Each "fillsync" operation costs much less (0.3 milliseconds)
|
|
||||||
than a disk seek (typically 10 milliseconds). We suspect that this is
|
|
||||||
because the hard disk itself is buffering the update in its memory and
|
|
||||||
responding before the data has been written to the platter. This may
|
|
||||||
or may not be safe based on whether or not the hard disk has enough
|
|
||||||
power to save its memory in the event of a power failure.
|
|
||||||
|
|
||||||
## Read performance
|
|
||||||
|
|
||||||
We list the performance of reading sequentially in both the forward
|
|
||||||
and reverse direction, and also the performance of a random lookup.
|
|
||||||
Note that the database created by the benchmark is quite small.
|
|
||||||
Therefore the report characterizes the performance of leveldb when the
|
|
||||||
working set fits in memory. The cost of reading a piece of data that
|
|
||||||
is not present in the operating system buffer cache will be dominated
|
|
||||||
by the one or two disk seeks needed to fetch the data from disk.
|
|
||||||
Write performance will be mostly unaffected by whether or not the
|
|
||||||
working set fits in memory.
|
|
||||||
|
|
||||||
readrandom : 16.677 micros/op; (approximately 60,000 reads per second)
|
|
||||||
readseq : 0.476 micros/op; 232.3 MB/s
|
|
||||||
readreverse : 0.724 micros/op; 152.9 MB/s
|
|
||||||
|
|
||||||
LevelDB compacts its underlying storage data in the background to
|
|
||||||
improve read performance. The results listed above were done
|
|
||||||
immediately after a lot of random writes. The results after
|
|
||||||
compactions (which are usually triggered automatically) are better.
|
|
||||||
|
|
||||||
readrandom : 11.602 micros/op; (approximately 85,000 reads per second)
|
|
||||||
readseq : 0.423 micros/op; 261.8 MB/s
|
|
||||||
readreverse : 0.663 micros/op; 166.9 MB/s
|
|
||||||
|
|
||||||
Some of the high cost of reads comes from repeated decompression of blocks
|
|
||||||
read from disk. If we supply enough cache to the leveldb so it can hold the
|
|
||||||
uncompressed blocks in memory, the read performance improves again:
|
|
||||||
|
|
||||||
readrandom : 9.775 micros/op; (approximately 100,000 reads per second before compaction)
|
|
||||||
readrandom : 5.215 micros/op; (approximately 190,000 reads per second after compaction)
|
|
||||||
|
|
||||||
## Repository contents
|
|
||||||
|
|
||||||
See [doc/index.md](doc/index.md) for more explanation. See
|
|
||||||
[doc/impl.md](doc/impl.md) for a brief overview of the implementation.
|
|
||||||
|
|
||||||
The public interface is in include/*.h. Callers should not include or
|
|
||||||
rely on the details of any other header files in this package. Those
|
|
||||||
internal APIs may be changed without warning.
|
|
||||||
|
|
||||||
Guide to header files:
|
|
||||||
|
|
||||||
* **include/db.h**: Main interface to the DB: Start here
|
|
||||||
|
|
||||||
* **include/options.h**: Control over the behavior of an entire database,
|
|
||||||
and also control over the behavior of individual reads and writes.
|
|
||||||
|
|
||||||
* **include/comparator.h**: Abstraction for user-specified comparison function.
|
|
||||||
If you want just bytewise comparison of keys, you can use the default
|
|
||||||
comparator, but clients can write their own comparator implementations if they
|
|
||||||
want custom ordering (e.g. to handle different character encodings, etc.)
|
|
||||||
|
|
||||||
* **include/iterator.h**: Interface for iterating over data. You can get
|
|
||||||
an iterator from a DB object.
|
|
||||||
|
|
||||||
* **include/write_batch.h**: Interface for atomically applying multiple
|
|
||||||
updates to a database.
|
|
||||||
|
|
||||||
* **include/slice.h**: A simple module for maintaining a pointer and a
|
|
||||||
length into some other byte array.
|
|
||||||
|
|
||||||
* **include/status.h**: Status is returned from many of the public interfaces
|
|
||||||
and is used to report success and various kinds of errors.
|
|
||||||
|
|
||||||
* **include/env.h**:
|
|
||||||
Abstraction of the OS environment. A posix implementation of this interface is
|
|
||||||
in util/env_posix.cc
|
|
||||||
|
|
||||||
* **include/table.h, include/table_builder.h**: Lower-level modules that most
|
|
||||||
clients probably won't use directly
|
|
|
@ -7,7 +7,6 @@ db
|
||||||
within [start_key..end_key]? For Chrome, deletion of obsolete
|
within [start_key..end_key]? For Chrome, deletion of obsolete
|
||||||
object stores, etc. can be done in the background anyway, so
|
object stores, etc. can be done in the background anyway, so
|
||||||
probably not that important.
|
probably not that important.
|
||||||
- There have been requests for MultiGet.
|
|
||||||
|
|
||||||
After a range is completely deleted, what gets rid of the
|
After a range is completely deleted, what gets rid of the
|
||||||
corresponding files if we do no future changes to that range. Make
|
corresponding files if we do no future changes to that range. Make
|
||||||
|
|
|
@ -1,39 +0,0 @@
|
||||||
# Building LevelDB On Windows
|
|
||||||
|
|
||||||
## Prereqs
|
|
||||||
|
|
||||||
Install the [Windows Software Development Kit version 7.1](http://www.microsoft.com/downloads/dlx/en-us/listdetailsview.aspx?FamilyID=6b6c21d2-2006-4afa-9702-529fa782d63b).
|
|
||||||
|
|
||||||
Download and extract the [Snappy source distribution](http://snappy.googlecode.com/files/snappy-1.0.5.tar.gz)
|
|
||||||
|
|
||||||
1. Open the "Windows SDK 7.1 Command Prompt" :
|
|
||||||
Start Menu -> "Microsoft Windows SDK v7.1" > "Windows SDK 7.1 Command Prompt"
|
|
||||||
2. Change the directory to the leveldb project
|
|
||||||
|
|
||||||
## Building the Static lib
|
|
||||||
|
|
||||||
* 32 bit Version
|
|
||||||
|
|
||||||
setenv /x86
|
|
||||||
msbuild.exe /p:Configuration=Release /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5
|
|
||||||
|
|
||||||
* 64 bit Version
|
|
||||||
|
|
||||||
setenv /x64
|
|
||||||
msbuild.exe /p:Configuration=Release /p:Platform=x64 /p:Snappy=..\snappy-1.0.5
|
|
||||||
|
|
||||||
|
|
||||||
## Building and Running the Benchmark app
|
|
||||||
|
|
||||||
* 32 bit Version
|
|
||||||
|
|
||||||
setenv /x86
|
|
||||||
msbuild.exe /p:Configuration=Benchmark /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5
|
|
||||||
Benchmark\leveldb.exe
|
|
||||||
|
|
||||||
* 64 bit Version
|
|
||||||
|
|
||||||
setenv /x64
|
|
||||||
msbuild.exe /p:Configuration=Benchmark /p:Platform=x64 /p:Snappy=..\snappy-1.0.5
|
|
||||||
x64\Benchmark\leveldb.exe
|
|
||||||
|
|
|
@ -7,11 +7,8 @@
|
||||||
# CC C Compiler path
|
# CC C Compiler path
|
||||||
# CXX C++ Compiler path
|
# CXX C++ Compiler path
|
||||||
# PLATFORM_LDFLAGS Linker flags
|
# PLATFORM_LDFLAGS Linker flags
|
||||||
# PLATFORM_LIBS Libraries flags
|
|
||||||
# PLATFORM_SHARED_EXT Extension for shared libraries
|
# PLATFORM_SHARED_EXT Extension for shared libraries
|
||||||
# PLATFORM_SHARED_LDFLAGS Flags for building shared library
|
# PLATFORM_SHARED_LDFLAGS Flags for building shared library
|
||||||
# This flag is embedded just before the name
|
|
||||||
# of the shared library without intervening spaces
|
|
||||||
# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library
|
# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library
|
||||||
# PLATFORM_CCFLAGS C compiler flags
|
# PLATFORM_CCFLAGS C compiler flags
|
||||||
# PLATFORM_CXXFLAGS C++ compiler flags. Will contain:
|
# PLATFORM_CXXFLAGS C++ compiler flags. Will contain:
|
||||||
|
@ -20,15 +17,14 @@
|
||||||
#
|
#
|
||||||
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
|
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
|
||||||
#
|
#
|
||||||
# -DLEVELDB_ATOMIC_PRESENT if <atomic> is present
|
# -DLEVELDB_CSTDATOMIC_PRESENT if <cstdatomic> is present
|
||||||
# -DLEVELDB_PLATFORM_POSIX for Posix-based platforms
|
# -DLEVELDB_PLATFORM_POSIX for Posix-based platforms
|
||||||
# -DSNAPPY if the Snappy library is present
|
# -DSNAPPY if the Snappy library is present
|
||||||
#
|
#
|
||||||
|
|
||||||
OUTPUT=$1
|
OUTPUT=$1
|
||||||
PREFIX=$2
|
if test -z "$OUTPUT"; then
|
||||||
if test -z "$OUTPUT" || test -z "$PREFIX"; then
|
echo "usage: $0 <output-filename>" >&2
|
||||||
echo "usage: $0 <output-filename> <directory_prefix>" >&2
|
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -44,10 +40,6 @@ if test -z "$CXX"; then
|
||||||
CXX=g++
|
CXX=g++
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if test -z "$TMPDIR"; then
|
|
||||||
TMPDIR=/tmp
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Detect OS
|
# Detect OS
|
||||||
if test -z "$TARGET_OS"; then
|
if test -z "$TARGET_OS"; then
|
||||||
TARGET_OS=`uname -s`
|
TARGET_OS=`uname -s`
|
||||||
|
@ -58,119 +50,77 @@ CROSS_COMPILE=
|
||||||
PLATFORM_CCFLAGS=
|
PLATFORM_CCFLAGS=
|
||||||
PLATFORM_CXXFLAGS=
|
PLATFORM_CXXFLAGS=
|
||||||
PLATFORM_LDFLAGS=
|
PLATFORM_LDFLAGS=
|
||||||
PLATFORM_LIBS=
|
PLATFORM_SHARED_EXT=
|
||||||
PLATFORM_SHARED_EXT="so"
|
|
||||||
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
|
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
|
||||||
PLATFORM_SHARED_CFLAGS="-fPIC"
|
PLATFORM_SHARED_CFLAGS="-fPIC"
|
||||||
PLATFORM_SHARED_VERSIONED=true
|
PLATFORM_SHARED_VERSIONED=true
|
||||||
PLATFORM_SSEFLAGS=
|
|
||||||
|
|
||||||
MEMCMP_FLAG=
|
if test -n "$LEVELDB_VSN"; then
|
||||||
if [ "$CXX" = "g++" ]; then
|
VERSION_FLAGS="$VERSION_FLAGS -DLEVELDB_VSN=\"$LEVELDB_VSN\""
|
||||||
# Use libc's memcmp instead of GCC's memcmp. This results in ~40%
|
|
||||||
# performance improvement on readrandom under gcc 4.4.3 on Linux/x86.
|
|
||||||
MEMCMP_FLAG="-fno-builtin-memcmp"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
|
||||||
case "$TARGET_OS" in
|
case "$TARGET_OS" in
|
||||||
CYGWIN_*)
|
|
||||||
PLATFORM=OS_LINUX
|
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN"
|
|
||||||
PLATFORM_LDFLAGS="-lpthread"
|
|
||||||
PORT_FILE=port/port_posix.cc
|
|
||||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
|
||||||
;;
|
|
||||||
Darwin)
|
Darwin)
|
||||||
PLATFORM=OS_MACOSX
|
PLATFORM=OS_MACOSX
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
|
oIFS="$IFS"; IFS=.
|
||||||
PLATFORM_SHARED_EXT=dylib
|
set `uname -r`
|
||||||
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
|
IFS="$oIFS"
|
||||||
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/"
|
if [ "$1" -ge 13 ]; then
|
||||||
|
# assume clang compiler
|
||||||
|
COMMON_FLAGS="-mmacosx-version-min=10.8 -DOS_MACOSX -stdlib=libc++"
|
||||||
|
PLATFORM_LDFLAGS="-mmacosx-version-min=10.8"
|
||||||
|
else
|
||||||
|
COMMON_FLAGS="-fno-builtin-memcmp -DOS_MACOSX"
|
||||||
|
fi
|
||||||
|
PLATFORM_SHARED_EXT=
|
||||||
|
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
|
||||||
PORT_FILE=port/port_posix.cc
|
PORT_FILE=port/port_posix.cc
|
||||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
|
||||||
;;
|
;;
|
||||||
Linux)
|
Linux)
|
||||||
PLATFORM=OS_LINUX
|
PLATFORM=OS_LINUX
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX"
|
COMMON_FLAGS="-fno-builtin-memcmp -pthread -DOS_LINUX"
|
||||||
PLATFORM_LDFLAGS="-pthread"
|
PLATFORM_LDFLAGS="-pthread -lrt"
|
||||||
PORT_FILE=port/port_posix.cc
|
PORT_FILE=port/port_posix.cc
|
||||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
|
||||||
;;
|
;;
|
||||||
SunOS)
|
SunOS)
|
||||||
PLATFORM=OS_SOLARIS
|
PLATFORM=OS_SOLARIS
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS"
|
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS -m64"
|
||||||
PLATFORM_LIBS="-lpthread -lrt"
|
PLATFORM_LDFLAGS="-lpthread -lrt"
|
||||||
|
PLATFORM_SHARED_EXT=
|
||||||
PORT_FILE=port/port_posix.cc
|
PORT_FILE=port/port_posix.cc
|
||||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
|
||||||
;;
|
;;
|
||||||
FreeBSD)
|
FreeBSD)
|
||||||
|
CC=cc
|
||||||
|
CXX=c++
|
||||||
PLATFORM=OS_FREEBSD
|
PLATFORM=OS_FREEBSD
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD"
|
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
|
||||||
PLATFORM_LIBS="-lpthread"
|
PLATFORM_LDFLAGS="-lpthread"
|
||||||
PORT_FILE=port/port_posix.cc
|
|
||||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
|
||||||
;;
|
|
||||||
GNU/kFreeBSD)
|
|
||||||
PLATFORM=OS_KFREEBSD
|
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_KFREEBSD"
|
|
||||||
PLATFORM_LIBS="-lpthread"
|
|
||||||
PORT_FILE=port/port_posix.cc
|
PORT_FILE=port/port_posix.cc
|
||||||
;;
|
;;
|
||||||
NetBSD)
|
NetBSD)
|
||||||
PLATFORM=OS_NETBSD
|
PLATFORM=OS_NETBSD
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"
|
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
|
||||||
PLATFORM_LIBS="-lpthread -lgcc_s"
|
PLATFORM_LDFLAGS="-lpthread -lgcc_s"
|
||||||
PORT_FILE=port/port_posix.cc
|
PORT_FILE=port/port_posix.cc
|
||||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
|
||||||
;;
|
;;
|
||||||
OpenBSD)
|
OpenBSD)
|
||||||
PLATFORM=OS_OPENBSD
|
PLATFORM=OS_OPENBSD
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD"
|
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
|
||||||
PLATFORM_LDFLAGS="-pthread"
|
PLATFORM_LDFLAGS="-pthread"
|
||||||
PORT_FILE=port/port_posix.cc
|
PORT_FILE=port/port_posix.cc
|
||||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
|
||||||
;;
|
;;
|
||||||
DragonFly)
|
DragonFly)
|
||||||
PLATFORM=OS_DRAGONFLYBSD
|
PLATFORM=OS_DRAGONFLYBSD
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD"
|
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
|
||||||
PLATFORM_LIBS="-lpthread"
|
PLATFORM_LDFLAGS="-lpthread"
|
||||||
PORT_FILE=port/port_posix.cc
|
PORT_FILE=port/port_posix.cc
|
||||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
|
||||||
;;
|
;;
|
||||||
OS_ANDROID_CROSSCOMPILE)
|
OS_ANDROID_CROSSCOMPILE)
|
||||||
PLATFORM=OS_ANDROID
|
PLATFORM=OS_ANDROID
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
|
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
|
||||||
PLATFORM_LDFLAGS="" # All pthread features are in the Android C library
|
PLATFORM_LDFLAGS="" # All pthread features are in the Android C library
|
||||||
PORT_FILE=port/port_posix.cc
|
PORT_FILE=port/port_posix.cc
|
||||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
|
||||||
CROSS_COMPILE=true
|
|
||||||
;;
|
|
||||||
HP-UX)
|
|
||||||
PLATFORM=OS_HPUX
|
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
|
|
||||||
PLATFORM_LDFLAGS="-pthread"
|
|
||||||
PORT_FILE=port/port_posix.cc
|
|
||||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
|
||||||
# man ld: +h internal_name
|
|
||||||
PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
|
|
||||||
;;
|
|
||||||
IOS)
|
|
||||||
PLATFORM=IOS
|
|
||||||
COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
|
|
||||||
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
|
|
||||||
PORT_FILE=port/port_posix.cc
|
|
||||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
|
||||||
PLATFORM_SHARED_EXT=
|
|
||||||
PLATFORM_SHARED_LDFLAGS=
|
|
||||||
PLATFORM_SHARED_CFLAGS=
|
|
||||||
PLATFORM_SHARED_VERSIONED=
|
|
||||||
;;
|
|
||||||
OS_WINDOWS_CROSSCOMPILE | NATIVE_WINDOWS)
|
|
||||||
PLATFORM=OS_WINDOWS
|
|
||||||
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_WINDOWS -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1"
|
|
||||||
PLATFORM_SOURCES="util/env_win.cc"
|
|
||||||
PLATFORM_LIBS="-lshlwapi"
|
|
||||||
PORT_FILE=port/port_win.cc
|
|
||||||
CROSS_COMPILE=true
|
CROSS_COMPILE=true
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
|
@ -182,78 +132,106 @@ esac
|
||||||
# except for the test and benchmark files. By default, find will output a list
|
# except for the test and benchmark files. By default, find will output a list
|
||||||
# of all files matching either rule, so we need to append -print to make the
|
# of all files matching either rule, so we need to append -print to make the
|
||||||
# prune take effect.
|
# prune take effect.
|
||||||
DIRS="$PREFIX/db $PREFIX/util $PREFIX/table"
|
if [ -f leveldb_ee/README.md ]; then
|
||||||
|
DIRS="util db table leveldb_ee"
|
||||||
|
else
|
||||||
|
DIRS="util db table leveldb_os"
|
||||||
|
fi
|
||||||
set -f # temporarily disable globbing so that our patterns aren't expanded
|
set -f # temporarily disable globbing so that our patterns aren't expanded
|
||||||
PRUNE_TEST="-name *test*.cc -prune"
|
PRUNE_TEST="-name *test*.cc -prune"
|
||||||
PRUNE_BENCH="-name *_bench.cc -prune"
|
PRUNE_BENCH="-name *_bench.cc -prune"
|
||||||
PRUNE_TOOL="-name leveldbutil.cc -prune"
|
PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
|
||||||
PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_TOOL -o -name '*.cc' -print | sort | sed "s,^$PREFIX/,," | tr "\n" " "`
|
TESTS=`find $DIRS -name '*_test.c*' -print | sort | tr "\n" " "`
|
||||||
|
|
||||||
set +f # re-enable globbing
|
set +f # re-enable globbing
|
||||||
|
|
||||||
# The sources consist of the portable files, plus the platform-specific port
|
# The sources consist of the portable files, plus the platform-specific port
|
||||||
# file.
|
# file.
|
||||||
echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT
|
echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
|
||||||
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
|
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
|
||||||
|
echo "TEST_SOURCES=$TESTS" >>$OUTPUT
|
||||||
|
|
||||||
if [ "$CROSS_COMPILE" = "true" ]; then
|
if [ "$CROSS_COMPILE" = "true" ]; then
|
||||||
# Cross-compiling; do not try any compilation tests.
|
# Cross-compiling; do not try any compilation tests.
|
||||||
true
|
true
|
||||||
else
|
else
|
||||||
CXXOUTPUT="${TMPDIR}/leveldb_build_detect_platform-cxx.$$"
|
# If -std=c++0x works, use <cstdatomic>. Otherwise use port_posix.h.
|
||||||
|
$CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||||
# If -std=c++0x works, use <atomic> as fallback for when memory barriers
|
#include <cstdatomic>
|
||||||
# are not available.
|
|
||||||
$CXX $CXXFLAGS -std=c++0x -x c++ - -o $CXXOUTPUT 2>/dev/null <<EOF
|
|
||||||
#include <atomic>
|
|
||||||
int main() {}
|
int main() {}
|
||||||
EOF
|
EOF
|
||||||
if [ "$?" = 0 ]; then
|
if [ "$?" = 0 ]; then
|
||||||
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_ATOMIC_PRESENT"
|
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT"
|
||||||
PLATFORM_CXXFLAGS="-std=c++0x"
|
PLATFORM_CXXFLAGS="-std=c++0x"
|
||||||
else
|
else
|
||||||
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
|
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Test whether Snappy library is installed
|
||||||
|
# http://code.google.com/p/snappy/
|
||||||
|
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||||
|
#include <snappy.h>
|
||||||
|
int main() {}
|
||||||
|
EOF
|
||||||
|
if [ "$?" = 0 ]; then
|
||||||
|
COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
|
||||||
|
if [ "$PLATFORM" = "OS_LINUX" ]; then
|
||||||
|
# Basho: switching to static snappy library to make tools more portable
|
||||||
|
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -Wl,-non_shared -lsnappy -Wl,-call_shared"
|
||||||
|
else
|
||||||
|
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# Test whether tcmalloc is available
|
# Test whether tcmalloc is available
|
||||||
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -ltcmalloc 2>/dev/null <<EOF
|
$CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
|
||||||
int main() {}
|
int main() {}
|
||||||
EOF
|
EOF
|
||||||
if [ "$?" = 0 ]; then
|
if [ "$?" = 0 ]; then
|
||||||
PLATFORM_LIBS="$PLATFORM_LIBS -ltcmalloc"
|
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f $CXXOUTPUT 2>/dev/null
|
|
||||||
|
|
||||||
# Test if gcc SSE 4.2 is supported
|
|
||||||
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null <<EOF
|
|
||||||
int main() {}
|
|
||||||
EOF
|
|
||||||
if [ "$?" = 0 ]; then
|
|
||||||
PLATFORM_SSEFLAGS="-msse4.2"
|
|
||||||
fi
|
|
||||||
|
|
||||||
rm -f $CXXOUTPUT 2>/dev/null
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them.
|
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS $VERSION_FLAGS"
|
||||||
if [ -n "$PLATFORM_SSEFLAGS" ]; then
|
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS $VERSION_FLAGS"
|
||||||
PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE"
|
|
||||||
fi
|
|
||||||
|
|
||||||
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
|
|
||||||
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
|
|
||||||
|
|
||||||
echo "CC=$CC" >> $OUTPUT
|
echo "CC=$CC" >> $OUTPUT
|
||||||
echo "CXX=$CXX" >> $OUTPUT
|
echo "CXX=$CXX" >> $OUTPUT
|
||||||
echo "PLATFORM=$PLATFORM" >> $OUTPUT
|
echo "PLATFORM=$PLATFORM" >> $OUTPUT
|
||||||
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
|
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
|
||||||
echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
|
|
||||||
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
|
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
|
||||||
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
|
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
|
||||||
echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT
|
|
||||||
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
|
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
|
||||||
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
|
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
|
||||||
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
|
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
|
||||||
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
|
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
|
||||||
|
|
||||||
|
#
|
||||||
|
# Basho extension to place -D variable in include/leveldb/ldb_config.h
|
||||||
|
#
|
||||||
|
|
||||||
|
LDB_CONFIG="include/leveldb/ldb_config.h"
|
||||||
|
|
||||||
|
# Delete existing output, if it exists
|
||||||
|
rm -f $LDB_CONFIG
|
||||||
|
|
||||||
|
# write_config_h FLAG...
#
# Appends one guarded macro definition to $LDB_CONFIG for every argument
# that is a -D compiler flag; all other arguments are ignored.
#
#   -DNAME        ->  #ifndef NAME / #define NAME / #endif
#   -DNAME=VALUE  ->  #ifndef NAME / #define NAME VALUE / #endif
#
# The NAME=VALUE form must be split: copying it verbatim after #ifndef or
# #define does not produce a valid preprocessor directive.  A bare "-D"
# with no macro name is skipped.
write_config_h()
{
    for param in "$@"
    do
        case "$param" in
            -D?*)
                macro=${param#-D}      # text after the leading -D
                name=${macro%%=*}      # macro name, without any =VALUE part

                echo "" >>"$LDB_CONFIG"
                echo "#ifndef $name" >>"$LDB_CONFIG"
                case "$macro" in
                    *=*) echo " #define $name ${macro#*=}" >>"$LDB_CONFIG" ;;
                    *)   echo " #define $name" >>"$LDB_CONFIG" ;;
                esac
                echo "#endif" >>"$LDB_CONFIG"
                ;;
        esac
    done
}
|
||||||
|
|
||||||
|
# Start $LDB_CONFIG afresh with its explanatory banner (the redirection on
# the group truncates the file once), then append one guarded #define for
# every -D flag found in $COMMON_FLAGS.
{
    echo "/** This file is generated by build_detect_platform."
    echo " * It saves the state of compile flags. This benefits the reuse"
    echo " * of internal include files outside of a leveldb build."
    echo " */"
} >$LDB_CONFIG

write_config_h $COMMON_FLAGS
|
||||||
|
|
|
@ -1,118 +0,0 @@
|
||||||
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
||||||
|
|
||||||
#include "leveldb/db.h"
|
|
||||||
#include "db/db_impl.h"
|
|
||||||
#include "leveldb/cache.h"
|
|
||||||
#include "util/testharness.h"
|
|
||||||
#include "util/testutil.h"
|
|
||||||
|
|
||||||
namespace leveldb {
|
|
||||||
|
|
||||||
class AutoCompactTest {
|
|
||||||
public:
|
|
||||||
std::string dbname_;
|
|
||||||
Cache* tiny_cache_;
|
|
||||||
Options options_;
|
|
||||||
DB* db_;
|
|
||||||
|
|
||||||
AutoCompactTest() {
|
|
||||||
dbname_ = test::TmpDir() + "/autocompact_test";
|
|
||||||
tiny_cache_ = NewLRUCache(100);
|
|
||||||
options_.block_cache = tiny_cache_;
|
|
||||||
DestroyDB(dbname_, options_);
|
|
||||||
options_.create_if_missing = true;
|
|
||||||
options_.compression = kNoCompression;
|
|
||||||
ASSERT_OK(DB::Open(options_, dbname_, &db_));
|
|
||||||
}
|
|
||||||
|
|
||||||
~AutoCompactTest() {
|
|
||||||
delete db_;
|
|
||||||
DestroyDB(dbname_, Options());
|
|
||||||
delete tiny_cache_;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string Key(int i) {
|
|
||||||
char buf[100];
|
|
||||||
snprintf(buf, sizeof(buf), "key%06d", i);
|
|
||||||
return std::string(buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t Size(const Slice& start, const Slice& limit) {
|
|
||||||
Range r(start, limit);
|
|
||||||
uint64_t size;
|
|
||||||
db_->GetApproximateSizes(&r, 1, &size);
|
|
||||||
return size;
|
|
||||||
}
|
|
||||||
|
|
||||||
void DoReads(int n);
|
|
||||||
};
|
|
||||||
|
|
||||||
static const int kValueSize = 200 * 1024;
|
|
||||||
static const int kTotalSize = 100 * 1024 * 1024;
|
|
||||||
static const int kCount = kTotalSize / kValueSize;
|
|
||||||
|
|
||||||
// Read through the first n keys repeatedly and check that they get
|
|
||||||
// compacted (verified by checking the size of the key space).
|
|
||||||
void AutoCompactTest::DoReads(int n) {
|
|
||||||
std::string value(kValueSize, 'x');
|
|
||||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
|
||||||
|
|
||||||
// Fill database
|
|
||||||
for (int i = 0; i < kCount; i++) {
|
|
||||||
ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
|
|
||||||
}
|
|
||||||
ASSERT_OK(dbi->TEST_CompactMemTable());
|
|
||||||
|
|
||||||
// Delete everything
|
|
||||||
for (int i = 0; i < kCount; i++) {
|
|
||||||
ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
|
|
||||||
}
|
|
||||||
ASSERT_OK(dbi->TEST_CompactMemTable());
|
|
||||||
|
|
||||||
// Get initial measurement of the space we will be reading.
|
|
||||||
const int64_t initial_size = Size(Key(0), Key(n));
|
|
||||||
const int64_t initial_other_size = Size(Key(n), Key(kCount));
|
|
||||||
|
|
||||||
// Read until size drops significantly.
|
|
||||||
std::string limit_key = Key(n);
|
|
||||||
for (int read = 0; true; read++) {
|
|
||||||
ASSERT_LT(read, 100) << "Taking too long to compact";
|
|
||||||
Iterator* iter = db_->NewIterator(ReadOptions());
|
|
||||||
for (iter->SeekToFirst();
|
|
||||||
iter->Valid() && iter->key().ToString() < limit_key;
|
|
||||||
iter->Next()) {
|
|
||||||
// Drop data
|
|
||||||
}
|
|
||||||
delete iter;
|
|
||||||
// Wait a little bit to allow any triggered compactions to complete.
|
|
||||||
Env::Default()->SleepForMicroseconds(1000000);
|
|
||||||
uint64_t size = Size(Key(0), Key(n));
|
|
||||||
fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n",
|
|
||||||
read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0);
|
|
||||||
if (size <= initial_size/10) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Verify that the size of the key space not touched by the reads
|
|
||||||
// is pretty much unchanged.
|
|
||||||
const int64_t final_other_size = Size(Key(n), Key(kCount));
|
|
||||||
ASSERT_LE(final_other_size, initial_other_size + 1048576);
|
|
||||||
ASSERT_GE(final_other_size, initial_other_size/5 - 1048576);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reads across the whole key space: the read range is [Key(0), Key(kCount)).
TEST(AutoCompactTest, ReadAll) {
  const int keys_to_read = kCount;
  DoReads(keys_to_read);
}
|
|
||||||
|
|
||||||
// Reads only the lower half of the key space; the upper half is the
// untouched range whose size DoReads() checks afterwards.
TEST(AutoCompactTest, ReadHalf) {
  const int keys_to_read = kCount/2;
  DoReads(keys_to_read);
}
|
|
||||||
|
|
||||||
} // namespace leveldb
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
return leveldb::test::RunAllTests();
|
|
||||||
}
|
|
|
@ -2,12 +2,16 @@
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
|
#define __STDC_FORMAT_MACROS
|
||||||
|
#include <inttypes.h>
|
||||||
|
|
||||||
#include "db/builder.h"
|
#include "db/builder.h"
|
||||||
|
|
||||||
#include "db/filename.h"
|
#include "db/filename.h"
|
||||||
#include "db/dbformat.h"
|
#include "db/dbformat.h"
|
||||||
#include "db/table_cache.h"
|
#include "db/table_cache.h"
|
||||||
#include "db/version_edit.h"
|
#include "db/version_edit.h"
|
||||||
|
#include "db/version_set.h"
|
||||||
#include "leveldb/db.h"
|
#include "leveldb/db.h"
|
||||||
#include "leveldb/env.h"
|
#include "leveldb/env.h"
|
||||||
#include "leveldb/iterator.h"
|
#include "leveldb/iterator.h"
|
||||||
|
@ -17,27 +21,51 @@ namespace leveldb {
|
||||||
Status BuildTable(const std::string& dbname,
|
Status BuildTable(const std::string& dbname,
|
||||||
Env* env,
|
Env* env,
|
||||||
const Options& options,
|
const Options& options,
|
||||||
|
const Comparator * user_comparator,
|
||||||
TableCache* table_cache,
|
TableCache* table_cache,
|
||||||
Iterator* iter,
|
Iterator* iter,
|
||||||
FileMetaData* meta) {
|
FileMetaData* meta,
|
||||||
|
SequenceNumber smallest_snapshot) {
|
||||||
Status s;
|
Status s;
|
||||||
|
size_t keys_seen, keys_retired;
|
||||||
|
|
||||||
|
keys_seen=0;
|
||||||
|
keys_retired=0;
|
||||||
|
|
||||||
meta->file_size = 0;
|
meta->file_size = 0;
|
||||||
iter->SeekToFirst();
|
iter->SeekToFirst();
|
||||||
|
|
||||||
std::string fname = TableFileName(dbname, meta->number);
|
KeyRetirement retire(user_comparator, smallest_snapshot, &options);
|
||||||
|
|
||||||
|
std::string fname = TableFileName(options, meta->number, meta->level);
|
||||||
if (iter->Valid()) {
|
if (iter->Valid()) {
|
||||||
WritableFile* file;
|
WritableFile* file;
|
||||||
s = env->NewWritableFile(fname, &file);
|
|
||||||
|
s = env->NewWritableFile(fname, &file,
|
||||||
|
env->RecoveryMmapSize(&options));
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// tune fadvise to keep all of this lower level file in page cache
|
||||||
|
// (compaction of unsorted files causes severe cache misses)
|
||||||
|
file->SetMetadataOffset(1);
|
||||||
|
|
||||||
TableBuilder* builder = new TableBuilder(options, file);
|
TableBuilder* builder = new TableBuilder(options, file);
|
||||||
meta->smallest.DecodeFrom(iter->key());
|
meta->smallest.DecodeFrom(iter->key());
|
||||||
for (; iter->Valid(); iter->Next()) {
|
for (; iter->Valid(); iter->Next()) {
|
||||||
|
++keys_seen;
|
||||||
Slice key = iter->key();
|
Slice key = iter->key();
|
||||||
|
if (!retire(key))
|
||||||
|
{
|
||||||
meta->largest.DecodeFrom(key);
|
meta->largest.DecodeFrom(key);
|
||||||
builder->Add(key, iter->value());
|
builder->Add(key, iter->value());
|
||||||
|
++meta->num_entries;
|
||||||
|
} // if
|
||||||
|
else
|
||||||
|
{
|
||||||
|
++keys_retired;
|
||||||
|
} // else
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finish and check for builder errors
|
// Finish and check for builder errors
|
||||||
|
@ -45,6 +73,9 @@ Status BuildTable(const std::string& dbname,
|
||||||
s = builder->Finish();
|
s = builder->Finish();
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
meta->file_size = builder->FileSize();
|
meta->file_size = builder->FileSize();
|
||||||
|
meta->exp_write_low = builder->GetExpiryWriteLow();
|
||||||
|
meta->exp_write_high = builder->GetExpiryWriteHigh();
|
||||||
|
meta->exp_explicit_high = builder->GetExpiryExplicitHigh();
|
||||||
assert(meta->file_size > 0);
|
assert(meta->file_size > 0);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -64,10 +95,20 @@ Status BuildTable(const std::string& dbname,
|
||||||
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
// Verify that the table is usable
|
// Verify that the table is usable
|
||||||
|
Table * table_ptr;
|
||||||
Iterator* it = table_cache->NewIterator(ReadOptions(),
|
Iterator* it = table_cache->NewIterator(ReadOptions(),
|
||||||
meta->number,
|
meta->number,
|
||||||
meta->file_size);
|
meta->file_size,
|
||||||
|
meta->level,
|
||||||
|
&table_ptr);
|
||||||
s = it->status();
|
s = it->status();
|
||||||
|
|
||||||
|
// Riak specific: bloom filter is no longer read by default,
|
||||||
|
// force read on highly used overlapped table files
|
||||||
|
if (s.ok() && VersionSet::IsLevelOverlapped(meta->level))
|
||||||
|
table_ptr->ReadFilter();
|
||||||
|
|
||||||
|
// table_ptr is owned by it and therefore invalidated by this delete
|
||||||
delete it;
|
delete it;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -79,6 +120,11 @@ Status BuildTable(const std::string& dbname,
|
||||||
|
|
||||||
if (s.ok() && meta->file_size > 0) {
|
if (s.ok() && meta->file_size > 0) {
|
||||||
// Keep it
|
// Keep it
|
||||||
|
if (0!=keys_retired)
|
||||||
|
{
|
||||||
|
Log(options.info_log, "Level-0 table #%" PRIu64 ": %zd keys seen, %zd keys retired, %zd keys expired",
|
||||||
|
meta->number, keys_seen, retire.GetDroppedCount(), retire.GetExpiredCount());
|
||||||
|
} // if
|
||||||
} else {
|
} else {
|
||||||
env->DeleteFile(fname);
|
env->DeleteFile(fname);
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
#define STORAGE_LEVELDB_DB_BUILDER_H_
|
#define STORAGE_LEVELDB_DB_BUILDER_H_
|
||||||
|
|
||||||
#include "leveldb/status.h"
|
#include "leveldb/status.h"
|
||||||
|
#include "db/dbformat.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
@ -25,9 +26,11 @@ class VersionEdit;
|
||||||
extern Status BuildTable(const std::string& dbname,
|
extern Status BuildTable(const std::string& dbname,
|
||||||
Env* env,
|
Env* env,
|
||||||
const Options& options,
|
const Options& options,
|
||||||
|
const Comparator * user_comparator,
|
||||||
TableCache* table_cache,
|
TableCache* table_cache,
|
||||||
Iterator* iter,
|
Iterator* iter,
|
||||||
FileMetaData* meta);
|
FileMetaData* meta,
|
||||||
|
SequenceNumber smallest_snapshot);
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#include <stdint.h>
|
||||||
#include "leveldb/cache.h"
|
#include "leveldb/cache.h"
|
||||||
#include "leveldb/comparator.h"
|
#include "leveldb/comparator.h"
|
||||||
#include "leveldb/db.h"
|
#include "leveldb/db.h"
|
||||||
|
@ -40,6 +41,8 @@ using leveldb::Status;
|
||||||
using leveldb::WritableFile;
|
using leveldb::WritableFile;
|
||||||
using leveldb::WriteBatch;
|
using leveldb::WriteBatch;
|
||||||
using leveldb::WriteOptions;
|
using leveldb::WriteOptions;
|
||||||
|
using leveldb::KeyMetaData;
|
||||||
|
using leveldb::ValueType;
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
|
@ -49,6 +52,7 @@ struct leveldb_writebatch_t { WriteBatch rep; };
|
||||||
struct leveldb_snapshot_t { const Snapshot* rep; };
|
struct leveldb_snapshot_t { const Snapshot* rep; };
|
||||||
struct leveldb_readoptions_t { ReadOptions rep; };
|
struct leveldb_readoptions_t { ReadOptions rep; };
|
||||||
struct leveldb_writeoptions_t { WriteOptions rep; };
|
struct leveldb_writeoptions_t { WriteOptions rep; };
|
||||||
|
struct leveldb_keymetadata_t { KeyMetaData rep; };
|
||||||
struct leveldb_options_t { Options rep; };
|
struct leveldb_options_t { Options rep; };
|
||||||
struct leveldb_cache_t { Cache* rep; };
|
struct leveldb_cache_t { Cache* rep; };
|
||||||
struct leveldb_seqfile_t { SequentialFile* rep; };
|
struct leveldb_seqfile_t { SequentialFile* rep; };
|
||||||
|
@ -173,8 +177,19 @@ void leveldb_put(
|
||||||
const char* key, size_t keylen,
|
const char* key, size_t keylen,
|
||||||
const char* val, size_t vallen,
|
const char* val, size_t vallen,
|
||||||
char** errptr) {
|
char** errptr) {
|
||||||
|
return(leveldb_put2(db, options, key, keylen, val, vallen, errptr, NULL));
|
||||||
|
}
|
||||||
|
|
||||||
|
void leveldb_put2(
|
||||||
|
leveldb_t* db,
|
||||||
|
const leveldb_writeoptions_t* options,
|
||||||
|
const char* key, size_t keylen,
|
||||||
|
const char* val, size_t vallen,
|
||||||
|
char** errptr,
|
||||||
|
const leveldb_keymetadata_t * metadata) {
|
||||||
SaveError(errptr,
|
SaveError(errptr,
|
||||||
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
|
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen),
|
||||||
|
(NULL==metadata ? NULL : &metadata->rep)));
|
||||||
}
|
}
|
||||||
|
|
||||||
void leveldb_delete(
|
void leveldb_delete(
|
||||||
|
@ -200,9 +215,21 @@ char* leveldb_get(
|
||||||
const char* key, size_t keylen,
|
const char* key, size_t keylen,
|
||||||
size_t* vallen,
|
size_t* vallen,
|
||||||
char** errptr) {
|
char** errptr) {
|
||||||
|
|
||||||
|
return(leveldb_get2(db, options, key, keylen, vallen, errptr, NULL));
|
||||||
|
}
|
||||||
|
|
||||||
|
char* leveldb_get2(
|
||||||
|
leveldb_t* db,
|
||||||
|
const leveldb_readoptions_t* options,
|
||||||
|
const char* key, size_t keylen,
|
||||||
|
size_t* vallen,
|
||||||
|
char** errptr,
|
||||||
|
leveldb_keymetadata_t * metadata) {
|
||||||
char* result = NULL;
|
char* result = NULL;
|
||||||
std::string tmp;
|
std::string tmp;
|
||||||
Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
|
Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp,
|
||||||
|
(NULL==metadata ? NULL : &metadata->rep));
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
*vallen = tmp.size();
|
*vallen = tmp.size();
|
||||||
result = CopyString(tmp);
|
result = CopyString(tmp);
|
||||||
|
@ -330,6 +357,15 @@ const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
|
||||||
return s.data();
|
return s.data();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const void leveldb_iter_keymetadata(const leveldb_iterator_t* iter,
|
||||||
|
leveldb_keymetadata_t * meta)
|
||||||
|
{
|
||||||
|
if (NULL!=iter && NULL!=meta)
|
||||||
|
{
|
||||||
|
meta->rep=iter->rep->keymetadata();
|
||||||
|
} // if
|
||||||
|
}
|
||||||
|
|
||||||
void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
|
void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
|
||||||
SaveError(errptr, iter->rep->status());
|
SaveError(errptr, iter->rep->status());
|
||||||
}
|
}
|
||||||
|
@ -350,7 +386,16 @@ void leveldb_writebatch_put(
|
||||||
leveldb_writebatch_t* b,
|
leveldb_writebatch_t* b,
|
||||||
const char* key, size_t klen,
|
const char* key, size_t klen,
|
||||||
const char* val, size_t vlen) {
|
const char* val, size_t vlen) {
|
||||||
b->rep.Put(Slice(key, klen), Slice(val, vlen));
|
leveldb_writebatch_put2(b, key, klen, val, vlen,NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
void leveldb_writebatch_put2(
|
||||||
|
leveldb_writebatch_t* b,
|
||||||
|
const char* key, size_t klen,
|
||||||
|
const char* val, size_t vlen,
|
||||||
|
const leveldb_keymetadata_t * metadata) {
|
||||||
|
b->rep.Put(Slice(key, klen), Slice(val, vlen),
|
||||||
|
(NULL==metadata ? NULL : &metadata->rep));
|
||||||
}
|
}
|
||||||
|
|
||||||
void leveldb_writebatch_delete(
|
void leveldb_writebatch_delete(
|
||||||
|
@ -362,15 +407,20 @@ void leveldb_writebatch_delete(
|
||||||
void leveldb_writebatch_iterate(
|
void leveldb_writebatch_iterate(
|
||||||
leveldb_writebatch_t* b,
|
leveldb_writebatch_t* b,
|
||||||
void* state,
|
void* state,
|
||||||
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
|
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen,
|
||||||
|
const int & type, const uint64_t & expiry),
|
||||||
void (*deleted)(void*, const char* k, size_t klen)) {
|
void (*deleted)(void*, const char* k, size_t klen)) {
|
||||||
class H : public WriteBatch::Handler {
|
class H : public WriteBatch::Handler {
|
||||||
public:
|
public:
|
||||||
void* state_;
|
void* state_;
|
||||||
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
|
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen,
|
||||||
|
const int & type, const uint64_t & expiry);
|
||||||
void (*deleted_)(void*, const char* k, size_t klen);
|
void (*deleted_)(void*, const char* k, size_t klen);
|
||||||
virtual void Put(const Slice& key, const Slice& value) {
|
virtual void Put(const Slice& key, const Slice& value,
|
||||||
(*put_)(state_, key.data(), key.size(), value.data(), value.size());
|
const leveldb::ValueType & type,
|
||||||
|
const leveldb::ExpiryTimeMicros & expiry)
|
||||||
|
{
|
||||||
|
(*put_)(state_, key.data(), key.size(), value.data(), value.size(), (int)type, (uint64_t)expiry);
|
||||||
}
|
}
|
||||||
virtual void Delete(const Slice& key) {
|
virtual void Delete(const Slice& key) {
|
||||||
(*deleted_)(state_, key.data(), key.size());
|
(*deleted_)(state_, key.data(), key.size());
|
||||||
|
@ -418,6 +468,11 @@ void leveldb_options_set_paranoid_checks(
|
||||||
opt->rep.paranoid_checks = v;
|
opt->rep.paranoid_checks = v;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// C binding: stores the flag byte v in the verify_compactions field of the
// wrapped Options object.
void leveldb_options_set_verify_compactions(leveldb_options_t* opt,
                                            unsigned char v) {
  Options& wrapped = opt->rep;
  wrapped.verify_compactions = v;
}
|
||||||
|
|
||||||
void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
|
void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
|
||||||
opt->rep.env = (env ? env->rep : NULL);
|
opt->rep.env = (env ? env->rep : NULL);
|
||||||
}
|
}
|
||||||
|
@ -450,6 +505,10 @@ void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
|
||||||
opt->rep.compression = static_cast<CompressionType>(t);
|
opt->rep.compression = static_cast<CompressionType>(t);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// C binding: stores s in the total_leveldb_mem field of the wrapped Options
// object.
void leveldb_options_set_total_leveldb_mem(leveldb_options_t* opt,
                                           size_t s) {
  Options& wrapped = opt->rep;
  wrapped.total_leveldb_mem = s;
}
|
||||||
|
|
||||||
leveldb_comparator_t* leveldb_comparator_create(
|
leveldb_comparator_t* leveldb_comparator_create(
|
||||||
void* state,
|
void* state,
|
||||||
void (*destructor)(void*),
|
void (*destructor)(void*),
|
||||||
|
@ -580,7 +639,17 @@ void leveldb_env_destroy(leveldb_env_t* env) {
|
||||||
delete env;
|
delete env;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// C binding that forwards to the static Env::Shutdown().
void leveldb_env_shutdown()
{
  Env::Shutdown();
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* CAUTION: this call is only for char * objects returned by
|
||||||
|
* functions like leveldb_get and leveldb_property_value.
|
||||||
|
* Also used to release errptr strings.
|
||||||
|
*/
|
||||||
void leveldb_free(void* ptr) {
|
void leveldb_free(void* ptr) {
|
||||||
|
if (NULL!=ptr)
|
||||||
free(ptr);
|
free(ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,8 @@
|
||||||
found in the LICENSE file. See the AUTHORS file for names of contributors. */
|
found in the LICENSE file. See the AUTHORS file for names of contributors. */
|
||||||
|
|
||||||
#include "leveldb/c.h"
|
#include "leveldb/c.h"
|
||||||
|
#include "leveldb/options.h"
|
||||||
|
#include "port/port.h"
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
@ -11,8 +13,13 @@
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
|
using leveldb::ValueType;
|
||||||
|
|
||||||
|
struct leveldb_keymetadata_t { leveldb::KeyMetaData rep; };
|
||||||
|
|
||||||
const char* phase = "";
|
const char* phase = "";
|
||||||
static char dbname[200];
|
static char dbname[200];
|
||||||
|
static leveldb::ExpiryTimeMicros gStartTime;
|
||||||
|
|
||||||
static void StartPhase(const char* name) {
|
static void StartPhase(const char* name) {
|
||||||
fprintf(stderr, "=== Test %s\n", name);
|
fprintf(stderr, "=== Test %s\n", name);
|
||||||
|
@ -49,7 +56,7 @@ static void CheckEqual(const char* expected, const char* v, size_t n) {
|
||||||
fprintf(stderr, "%s: expected '%s', got '%s'\n",
|
fprintf(stderr, "%s: expected '%s', got '%s'\n",
|
||||||
phase,
|
phase,
|
||||||
(expected ? expected : "(null)"),
|
(expected ? expected : "(null)"),
|
||||||
(v ? v : "(null"));
|
(v ? v : "(null)"));
|
||||||
abort();
|
abort();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -112,6 +119,117 @@ static void CheckDel(void* ptr, const char* k, size_t klen) {
|
||||||
(*state)++;
|
(*state)++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// (expiry enabled)
|
||||||
|
static void CheckGet2(
|
||||||
|
leveldb_t* db,
|
||||||
|
const leveldb_readoptions_t* options,
|
||||||
|
const char* key,
|
||||||
|
const char* expected,
|
||||||
|
ValueType type,
|
||||||
|
uint64_t expiry) {
|
||||||
|
char* err = NULL;
|
||||||
|
size_t val_len;
|
||||||
|
char* val;
|
||||||
|
leveldb_keymetadata_t meta;
|
||||||
|
|
||||||
|
val = leveldb_get2(db, options, key, strlen(key), &val_len, &err, &meta);
|
||||||
|
CheckNoError(err);
|
||||||
|
CheckEqual(expected, val, val_len);
|
||||||
|
CheckCondition(type==meta.rep.m_Type);
|
||||||
|
if (0==expiry && leveldb::kTypeValueWriteTime==type)
|
||||||
|
{
|
||||||
|
leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros();
|
||||||
|
CheckCondition(gStartTime<=meta.rep.m_Expiry && meta.rep.m_Expiry<=now);
|
||||||
|
} // if
|
||||||
|
else
|
||||||
|
{CheckCondition(expiry==meta.rep.m_Expiry);}
|
||||||
|
|
||||||
|
Free(&val);
|
||||||
|
}
|
||||||
|
|
||||||
|
// (expiry enabled)
|
||||||
|
static void CheckIter2(leveldb_iterator_t* iter,
|
||||||
|
const char* key, const char* val,
|
||||||
|
const leveldb::KeyMetaData & meta) {
|
||||||
|
size_t len;
|
||||||
|
const char* str;
|
||||||
|
leveldb_keymetadata_t it_meta;
|
||||||
|
|
||||||
|
str = leveldb_iter_key(iter, &len);
|
||||||
|
CheckEqual(key, str, len);
|
||||||
|
str = leveldb_iter_value(iter, &len);
|
||||||
|
CheckEqual(val, str, len);
|
||||||
|
|
||||||
|
leveldb_iter_keymetadata(iter, &it_meta);
|
||||||
|
CheckCondition(meta.m_Type==it_meta.rep.m_Type);
|
||||||
|
if (0==meta.m_Expiry && leveldb::kTypeValueWriteTime==meta.m_Type)
|
||||||
|
{
|
||||||
|
leveldb::ExpiryTimeMicros now=leveldb::port::TimeMicros();
|
||||||
|
CheckCondition(gStartTime<=it_meta.rep.m_Expiry && it_meta.rep.m_Expiry<=now);
|
||||||
|
} // if
|
||||||
|
else
|
||||||
|
{CheckCondition(meta.m_Expiry==it_meta.rep.m_Expiry);}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Callback from leveldb_writebatch_iterate()
|
||||||
|
// (expiry enabled)
|
||||||
|
struct CheckPut2Data
|
||||||
|
{
|
||||||
|
const char * m_Key;
|
||||||
|
const char * m_Value;
|
||||||
|
ValueType m_Type;
|
||||||
|
uint64_t m_Expiry;
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct CheckPut2Data gCheckPut2Data[]=
|
||||||
|
{
|
||||||
|
{"foo","hello_put2",leveldb::kTypeValue,0},
|
||||||
|
{"box","c_put2",leveldb::kTypeValue,0},
|
||||||
|
{"disney","cartoon_put2",leveldb::kTypeValueWriteTime, 0},
|
||||||
|
{"money","lotsof_put2",leveldb::kTypeValueWriteTime, 9988776655},
|
||||||
|
{"time","ismoney_put2",leveldb::kTypeValueExplicitExpiry, 221199887766}
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct CheckPut2Data gCheckPut2ItrData[]=
|
||||||
|
{
|
||||||
|
{"bar","b",leveldb::kTypeValue,0},
|
||||||
|
{"box","c",leveldb::kTypeValue,0},
|
||||||
|
{"bar","",leveldb::kTypeDeletion,0},
|
||||||
|
{"mom","texas",leveldb::kTypeValueWriteTime,0},
|
||||||
|
{"dad","poland",leveldb::kTypeValueExplicitExpiry,22446688}
|
||||||
|
};
|
||||||
|
|
||||||
|
static void CheckPut2(void* ptr,
|
||||||
|
const char* k, size_t klen,
|
||||||
|
const char* v, size_t vlen,
|
||||||
|
const int & type_int,
|
||||||
|
const uint64_t & expiry) {
|
||||||
|
int* state = (int*) ptr;
|
||||||
|
CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0])));
|
||||||
|
struct CheckPut2Data * test;
|
||||||
|
|
||||||
|
test=&gCheckPut2ItrData[*state];
|
||||||
|
CheckEqual(test->m_Key, k, klen);
|
||||||
|
CheckEqual(test->m_Value, v, vlen);
|
||||||
|
CheckCondition((int)test->m_Type==type_int);
|
||||||
|
if (leveldb::kTypeValueWriteTime!=test->m_Type)
|
||||||
|
{CheckCondition((uint64_t)test->m_Expiry==expiry);}
|
||||||
|
(*state)++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Callback from leveldb_writebatch_iterate()
|
||||||
|
// (expiry enabled)
|
||||||
|
static void CheckDel2(void* ptr, const char* k, size_t klen) {
|
||||||
|
int* state = (int*) ptr;
|
||||||
|
CheckCondition(*state < (sizeof(gCheckPut2ItrData)/sizeof(gCheckPut2ItrData[0])));
|
||||||
|
struct CheckPut2Data * test;
|
||||||
|
|
||||||
|
test=&gCheckPut2ItrData[*state];
|
||||||
|
CheckEqual(test->m_Key, k, klen);
|
||||||
|
(*state)++;
|
||||||
|
}
|
||||||
|
|
||||||
static void CmpDestroy(void* arg) { }
|
static void CmpDestroy(void* arg) { }
|
||||||
|
|
||||||
static int CmpCompare(void* arg, const char* a, size_t alen,
|
static int CmpCompare(void* arg, const char* a, size_t alen,
|
||||||
|
@ -141,7 +259,7 @@ static char* FilterCreate(
|
||||||
int num_keys,
|
int num_keys,
|
||||||
size_t* filter_length) {
|
size_t* filter_length) {
|
||||||
*filter_length = 4;
|
*filter_length = 4;
|
||||||
char* result = malloc(4);
|
char* result = (char*)malloc(4);
|
||||||
memcpy(result, "fake", 4);
|
memcpy(result, "fake", 4);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -167,6 +285,7 @@ int main(int argc, char** argv) {
|
||||||
|
|
||||||
CheckCondition(leveldb_major_version() >= 1);
|
CheckCondition(leveldb_major_version() >= 1);
|
||||||
CheckCondition(leveldb_minor_version() >= 1);
|
CheckCondition(leveldb_minor_version() >= 1);
|
||||||
|
gStartTime=leveldb::port::TimeMicros();
|
||||||
|
|
||||||
snprintf(dbname, sizeof(dbname),
|
snprintf(dbname, sizeof(dbname),
|
||||||
"%s/leveldb_c_test-%d",
|
"%s/leveldb_c_test-%d",
|
||||||
|
@ -207,12 +326,6 @@ int main(int argc, char** argv) {
|
||||||
CheckCondition(err != NULL);
|
CheckCondition(err != NULL);
|
||||||
Free(&err);
|
Free(&err);
|
||||||
|
|
||||||
StartPhase("leveldb_free");
|
|
||||||
db = leveldb_open(options, dbname, &err);
|
|
||||||
CheckCondition(err != NULL);
|
|
||||||
leveldb_free(err);
|
|
||||||
err = NULL;
|
|
||||||
|
|
||||||
StartPhase("open");
|
StartPhase("open");
|
||||||
leveldb_options_set_create_if_missing(options, 1);
|
leveldb_options_set_create_if_missing(options, 1);
|
||||||
db = leveldb_open(options, dbname, &err);
|
db = leveldb_open(options, dbname, &err);
|
||||||
|
@ -234,42 +347,74 @@ int main(int argc, char** argv) {
|
||||||
|
|
||||||
StartPhase("writebatch");
|
StartPhase("writebatch");
|
||||||
{
|
{
|
||||||
|
leveldb_keymetadata_t meta;
|
||||||
leveldb_writebatch_t* wb = leveldb_writebatch_create();
|
leveldb_writebatch_t* wb = leveldb_writebatch_create();
|
||||||
leveldb_writebatch_put(wb, "foo", 3, "a", 1);
|
leveldb_writebatch_put(wb, "foo", 3, "a", 1);
|
||||||
leveldb_writebatch_clear(wb);
|
leveldb_writebatch_clear(wb);
|
||||||
leveldb_writebatch_put(wb, "bar", 3, "b", 1);
|
leveldb_writebatch_put(wb, "bar", 3, "b", 1);
|
||||||
leveldb_writebatch_put(wb, "box", 3, "c", 1);
|
leveldb_writebatch_put(wb, "box", 3, "c", 1);
|
||||||
leveldb_writebatch_delete(wb, "bar", 3);
|
leveldb_writebatch_delete(wb, "bar", 3);
|
||||||
|
meta.rep.m_Type=leveldb::kTypeValueWriteTime;
|
||||||
|
meta.rep.m_Expiry=0;
|
||||||
|
leveldb_writebatch_put2(wb, "mom", 3, "texas", 5, &meta);
|
||||||
|
meta.rep.m_Type=leveldb::kTypeValueExplicitExpiry;
|
||||||
|
meta.rep.m_Expiry=22446688;
|
||||||
|
leveldb_writebatch_put2(wb, "dad", 3, "poland", 6, &meta);
|
||||||
leveldb_write(db, woptions, wb, &err);
|
leveldb_write(db, woptions, wb, &err);
|
||||||
CheckNoError(err);
|
CheckNoError(err);
|
||||||
CheckGet(db, roptions, "foo", "hello");
|
CheckGet(db, roptions, "foo", "hello");
|
||||||
CheckGet(db, roptions, "bar", NULL);
|
CheckGet(db, roptions, "bar", NULL);
|
||||||
CheckGet(db, roptions, "box", "c");
|
CheckGet(db, roptions, "box", "c");
|
||||||
|
CheckGet2(db, roptions, "dad", "poland", leveldb::kTypeValueExplicitExpiry, 22446688);
|
||||||
|
CheckGet2(db, roptions, "mom", "texas", leveldb::kTypeValueWriteTime, 0);
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
|
leveldb_writebatch_iterate(wb, &pos, CheckPut2, CheckDel2);
|
||||||
CheckCondition(pos == 3);
|
CheckCondition(pos == 5);
|
||||||
leveldb_writebatch_destroy(wb);
|
leveldb_writebatch_destroy(wb);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// reminder: keymetadata not supported on backward iteration
|
||||||
StartPhase("iter");
|
StartPhase("iter");
|
||||||
{
|
{
|
||||||
|
leveldb::KeyMetaData meta;
|
||||||
leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
|
leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
|
||||||
CheckCondition(!leveldb_iter_valid(iter));
|
CheckCondition(!leveldb_iter_valid(iter));
|
||||||
leveldb_iter_seek_to_first(iter);
|
leveldb_iter_seek_to_first(iter);
|
||||||
CheckCondition(leveldb_iter_valid(iter));
|
CheckCondition(leveldb_iter_valid(iter));
|
||||||
CheckIter(iter, "box", "c");
|
CheckIter(iter, "box", "c");
|
||||||
|
meta.m_Type=leveldb::kTypeValue;
|
||||||
|
meta.m_Expiry=0;
|
||||||
|
CheckIter2(iter, "box", "c", meta);
|
||||||
|
|
||||||
|
meta.m_Type=leveldb::kTypeValueExplicitExpiry;
|
||||||
|
meta.m_Expiry=22446688;
|
||||||
|
leveldb_iter_next(iter);
|
||||||
|
CheckIter2(iter, "dad", "poland", meta);
|
||||||
leveldb_iter_next(iter);
|
leveldb_iter_next(iter);
|
||||||
CheckIter(iter, "foo", "hello");
|
CheckIter(iter, "foo", "hello");
|
||||||
leveldb_iter_prev(iter);
|
leveldb_iter_prev(iter);
|
||||||
|
CheckIter(iter, "dad", "poland");
|
||||||
|
leveldb_iter_prev(iter);
|
||||||
CheckIter(iter, "box", "c");
|
CheckIter(iter, "box", "c");
|
||||||
leveldb_iter_prev(iter);
|
leveldb_iter_prev(iter);
|
||||||
CheckCondition(!leveldb_iter_valid(iter));
|
CheckCondition(!leveldb_iter_valid(iter));
|
||||||
leveldb_iter_seek_to_last(iter);
|
leveldb_iter_seek_to_last(iter);
|
||||||
CheckIter(iter, "foo", "hello");
|
CheckIter(iter, "mom", "texas");
|
||||||
leveldb_iter_seek(iter, "b", 1);
|
leveldb_iter_seek(iter, "b", 1);
|
||||||
CheckIter(iter, "box", "c");
|
CheckIter(iter, "box", "c");
|
||||||
leveldb_iter_get_error(iter, &err);
|
leveldb_iter_get_error(iter, &err);
|
||||||
CheckNoError(err);
|
CheckNoError(err);
|
||||||
|
|
||||||
|
meta.m_Type=leveldb::kTypeValue;
|
||||||
|
meta.m_Expiry=0;
|
||||||
|
CheckIter2(iter, "box", "c", meta);
|
||||||
|
leveldb_iter_seek(iter, "m", 1);
|
||||||
|
meta.m_Type=leveldb::kTypeValueWriteTime;
|
||||||
|
meta.m_Expiry=0;
|
||||||
|
CheckIter2(iter, "mom", "texas", meta);
|
||||||
|
leveldb_iter_get_error(iter, &err);
|
||||||
|
CheckNoError(err);
|
||||||
|
|
||||||
leveldb_iter_destroy(iter);
|
leveldb_iter_destroy(iter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -335,6 +480,70 @@ int main(int argc, char** argv) {
|
||||||
leveldb_options_set_error_if_exists(options, 1);
|
leveldb_options_set_error_if_exists(options, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
StartPhase("put expiry");
|
||||||
|
{
|
||||||
|
leveldb_keymetadata_t meta;
|
||||||
|
int loop, count;
|
||||||
|
|
||||||
|
count = sizeof(gCheckPut2Data) / sizeof(gCheckPut2Data[0]);
|
||||||
|
|
||||||
|
for (loop=0; loop<count; ++loop)
|
||||||
|
{
|
||||||
|
size_t klen, vlen;
|
||||||
|
leveldb_keymetadata_t meta;
|
||||||
|
struct CheckPut2Data * test;
|
||||||
|
|
||||||
|
test=&gCheckPut2Data[loop];
|
||||||
|
klen=strlen(test->m_Key);
|
||||||
|
vlen=strlen(test->m_Value);
|
||||||
|
meta.rep.m_Type=test->m_Type;
|
||||||
|
meta.rep.m_Expiry=test->m_Expiry;
|
||||||
|
|
||||||
|
leveldb_put2(db, woptions, test->m_Key, klen,
|
||||||
|
test->m_Value, vlen, &err,
|
||||||
|
&meta);
|
||||||
|
CheckNoError(err);
|
||||||
|
} // for
|
||||||
|
|
||||||
|
// testing memtable right now
|
||||||
|
for (loop=0; loop<count; ++loop)
|
||||||
|
{
|
||||||
|
size_t klen, vlen;
|
||||||
|
leveldb_keymetadata_t meta;
|
||||||
|
struct CheckPut2Data * test;
|
||||||
|
|
||||||
|
test=&gCheckPut2Data[loop];
|
||||||
|
klen=strlen(test->m_Key);
|
||||||
|
vlen=strlen(test->m_Value);
|
||||||
|
|
||||||
|
CheckGet2(db, roptions, test->m_Key, test->m_Value,
|
||||||
|
test->m_Type, test->m_Expiry);
|
||||||
|
} // for
|
||||||
|
|
||||||
|
// close and open to force memory table into .sst upon open
|
||||||
|
leveldb_close(db);
|
||||||
|
leveldb_options_set_error_if_exists(options, 0);
|
||||||
|
db = leveldb_open(options, dbname, &err);
|
||||||
|
CheckNoError(err);
|
||||||
|
|
||||||
|
// now testing get from a level-0 .sst file
|
||||||
|
for (loop=0; loop<count; ++loop)
|
||||||
|
{
|
||||||
|
size_t klen, vlen;
|
||||||
|
leveldb_keymetadata_t meta;
|
||||||
|
struct CheckPut2Data * test;
|
||||||
|
|
||||||
|
test=&gCheckPut2Data[loop];
|
||||||
|
klen=strlen(test->m_Key);
|
||||||
|
vlen=strlen(test->m_Value);
|
||||||
|
|
||||||
|
CheckGet2(db, roptions, test->m_Key, test->m_Value,
|
||||||
|
test->m_Type, test->m_Expiry);
|
||||||
|
} // for
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// This screws up "options" for real database work. execute last.
|
||||||
StartPhase("filter");
|
StartPhase("filter");
|
||||||
for (run = 0; run < 2; run++) {
|
for (run = 0; run < 2; run++) {
|
||||||
// First run uses custom filter, second run uses bloom filter
|
// First run uses custom filter, second run uses bloom filter
|
||||||
|
@ -376,6 +585,8 @@ int main(int argc, char** argv) {
|
||||||
leveldb_filterpolicy_destroy(policy);
|
leveldb_filterpolicy_destroy(policy);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
StartPhase("cleanup");
|
StartPhase("cleanup");
|
||||||
leveldb_close(db);
|
leveldb_close(db);
|
||||||
leveldb_options_destroy(options);
|
leveldb_options_destroy(options);
|
||||||
|
@ -386,5 +597,7 @@ int main(int argc, char** argv) {
|
||||||
leveldb_env_destroy(env);
|
leveldb_env_destroy(env);
|
||||||
|
|
||||||
fprintf(stderr, "PASS\n");
|
fprintf(stderr, "PASS\n");
|
||||||
|
|
||||||
|
leveldb_env_shutdown();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,8 +35,8 @@ class CorruptionTest {
|
||||||
CorruptionTest() {
|
CorruptionTest() {
|
||||||
tiny_cache_ = NewLRUCache(100);
|
tiny_cache_ = NewLRUCache(100);
|
||||||
options_.env = &env_;
|
options_.env = &env_;
|
||||||
options_.block_cache = tiny_cache_;
|
dbname_ = test::TmpDir() + "/db_test";
|
||||||
dbname_ = test::TmpDir() + "/corruption_test";
|
dbname_ = MakeTieredDbname(dbname_, options_);
|
||||||
DestroyDB(dbname_, options_);
|
DestroyDB(dbname_, options_);
|
||||||
|
|
||||||
db_ = NULL;
|
db_ = NULL;
|
||||||
|
@ -51,14 +51,17 @@ class CorruptionTest {
|
||||||
delete tiny_cache_;
|
delete tiny_cache_;
|
||||||
}
|
}
|
||||||
|
|
||||||
Status TryReopen() {
|
Status TryReopen(Options* options = NULL) {
|
||||||
delete db_;
|
delete db_;
|
||||||
db_ = NULL;
|
db_ = NULL;
|
||||||
return DB::Open(options_, dbname_, &db_);
|
Options opt = (options ? *options : options_);
|
||||||
|
opt.env = &env_;
|
||||||
|
opt.block_cache = tiny_cache_;
|
||||||
|
return DB::Open(opt, dbname_, &db_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Reopen() {
|
void Reopen(Options* options = NULL) {
|
||||||
ASSERT_OK(TryReopen());
|
ASSERT_OK(TryReopen(options));
|
||||||
}
|
}
|
||||||
|
|
||||||
void RepairDB() {
|
void RepairDB() {
|
||||||
|
@ -75,13 +78,7 @@ class CorruptionTest {
|
||||||
Slice key = Key(i, &key_space);
|
Slice key = Key(i, &key_space);
|
||||||
batch.Clear();
|
batch.Clear();
|
||||||
batch.Put(key, Value(i, &value_space));
|
batch.Put(key, Value(i, &value_space));
|
||||||
WriteOptions options;
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
||||||
// Corrupt() doesn't work without this sync on windows; stat reports 0 for
|
|
||||||
// the file size.
|
|
||||||
if (i == n - 1) {
|
|
||||||
options.sync = true;
|
|
||||||
}
|
|
||||||
ASSERT_OK(db_->Write(options, &batch));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -96,10 +93,6 @@ class CorruptionTest {
|
||||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||||
uint64_t key;
|
uint64_t key;
|
||||||
Slice in(iter->key());
|
Slice in(iter->key());
|
||||||
if (in == "" || in == "~") {
|
|
||||||
// Ignore boundary keys.
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!ConsumeDecimalNumber(&in, &key) ||
|
if (!ConsumeDecimalNumber(&in, &key) ||
|
||||||
!in.empty() ||
|
!in.empty() ||
|
||||||
key < next_expected) {
|
key < next_expected) {
|
||||||
|
@ -123,19 +116,26 @@ class CorruptionTest {
|
||||||
ASSERT_GE(max_expected, correct);
|
ASSERT_GE(max_expected, correct);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
|
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt, int level=0) {
|
||||||
// Pick file to corrupt
|
// Pick file to corrupt
|
||||||
std::vector<std::string> filenames;
|
std::vector<std::string> filenames;
|
||||||
ASSERT_OK(env_.GetChildren(dbname_, &filenames));
|
std::string dirname;
|
||||||
|
if (leveldb::kTableFile!=filetype)
|
||||||
|
dirname=dbname_;
|
||||||
|
else
|
||||||
|
dirname=MakeDirName2(options_, level, "sst");
|
||||||
|
|
||||||
|
ASSERT_OK(env_.GetChildren(dirname, &filenames));
|
||||||
|
|
||||||
uint64_t number;
|
uint64_t number;
|
||||||
FileType type;
|
FileType type;
|
||||||
std::string fname;
|
std::string fname;
|
||||||
int picked_number = -1;
|
int picked_number = -1;
|
||||||
for (size_t i = 0; i < filenames.size(); i++) {
|
for (int i = 0; i < filenames.size(); i++) {
|
||||||
if (ParseFileName(filenames[i], &number, &type) &&
|
if (ParseFileName(filenames[i], &number, &type) &&
|
||||||
type == filetype &&
|
type == filetype &&
|
||||||
int(number) > picked_number) { // Pick latest file
|
int(number) > picked_number) { // Pick latest file
|
||||||
fname = dbname_ + "/" + filenames[i];
|
fname = dirname + "/" + filenames[i];
|
||||||
picked_number = number;
|
picked_number = number;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -222,12 +222,14 @@ TEST(CorruptionTest, NewFileErrorDuringWrite) {
|
||||||
const int num = 3 + (Options().write_buffer_size / kValueSize);
|
const int num = 3 + (Options().write_buffer_size / kValueSize);
|
||||||
std::string value_storage;
|
std::string value_storage;
|
||||||
Status s;
|
Status s;
|
||||||
for (int i = 0; s.ok() && i < num; i++) {
|
for (int i = 0;
|
||||||
|
s.ok() && i < num && 0==env_.num_writable_file_errors_;
|
||||||
|
i++) {
|
||||||
WriteBatch batch;
|
WriteBatch batch;
|
||||||
batch.Put("a", Value(100, &value_storage));
|
batch.Put("a", Value(100, &value_storage));
|
||||||
s = db_->Write(WriteOptions(), &batch);
|
s = db_->Write(WriteOptions(), &batch);
|
||||||
}
|
}
|
||||||
ASSERT_TRUE(!s.ok());
|
// ASSERT_TRUE(!s.ok()); Background write thread will never report this
|
||||||
ASSERT_GE(env_.num_writable_file_errors_, 1);
|
ASSERT_GE(env_.num_writable_file_errors_, 1);
|
||||||
env_.writable_file_error_ = false;
|
env_.writable_file_error_ = false;
|
||||||
Reopen();
|
Reopen();
|
||||||
|
@ -240,34 +242,18 @@ TEST(CorruptionTest, TableFile) {
|
||||||
dbi->TEST_CompactRange(0, NULL, NULL);
|
dbi->TEST_CompactRange(0, NULL, NULL);
|
||||||
dbi->TEST_CompactRange(1, NULL, NULL);
|
dbi->TEST_CompactRange(1, NULL, NULL);
|
||||||
|
|
||||||
Corrupt(kTableFile, 100, 1);
|
Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);
|
||||||
Check(90, 99);
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(CorruptionTest, TableFileRepair) {
|
|
||||||
options_.block_size = 2 * kValueSize; // Limit scope of corruption
|
|
||||||
options_.paranoid_checks = true;
|
|
||||||
Reopen();
|
|
||||||
Build(100);
|
|
||||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
|
||||||
dbi->TEST_CompactMemTable();
|
|
||||||
dbi->TEST_CompactRange(0, NULL, NULL);
|
|
||||||
dbi->TEST_CompactRange(1, NULL, NULL);
|
|
||||||
|
|
||||||
Corrupt(kTableFile, 100, 1);
|
|
||||||
RepairDB();
|
|
||||||
Reopen();
|
|
||||||
Check(95, 99);
|
Check(95, 99);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(CorruptionTest, TableFileIndexData) {
|
TEST(CorruptionTest, TableFileIndexData) {
|
||||||
Build(10000); // Enough to build multiple Tables
|
Build(100000); // Enough to build multiple Tables
|
||||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||||
dbi->TEST_CompactMemTable();
|
dbi->TEST_CompactMemTable();
|
||||||
|
|
||||||
Corrupt(kTableFile, -2000, 500);
|
Corrupt(kTableFile, -2000, 500, config::kMaxMemCompactLevel);
|
||||||
Reopen();
|
Reopen();
|
||||||
Check(5000, 9999);
|
Check(50000, 99999);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(CorruptionTest, MissingDescriptor) {
|
TEST(CorruptionTest, MissingDescriptor) {
|
||||||
|
@ -319,10 +305,10 @@ TEST(CorruptionTest, CompactionInputError) {
|
||||||
Build(10);
|
Build(10);
|
||||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||||
dbi->TEST_CompactMemTable();
|
dbi->TEST_CompactMemTable();
|
||||||
const int last = config::kMaxMemCompactLevel;
|
const int last = config::kMaxMemCompactLevel; // Riak does not "move" files
|
||||||
ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
|
ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
|
||||||
|
|
||||||
Corrupt(kTableFile, 100, 1);
|
Corrupt(kTableFile, 100, 1, last);
|
||||||
Check(5, 9);
|
Check(5, 9);
|
||||||
|
|
||||||
// Force compactions by writing lots of values
|
// Force compactions by writing lots of values
|
||||||
|
@ -331,23 +317,42 @@ TEST(CorruptionTest, CompactionInputError) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(CorruptionTest, CompactionInputErrorParanoid) {
|
TEST(CorruptionTest, CompactionInputErrorParanoid) {
|
||||||
options_.paranoid_checks = true;
|
Options options;
|
||||||
options_.write_buffer_size = 512 << 10;
|
options.paranoid_checks = true;
|
||||||
Reopen();
|
options.write_buffer_size = 1048576;
|
||||||
|
Reopen(&options);
|
||||||
|
|
||||||
|
int current_corruption=Property("leveldb.ReadBlockError");
|
||||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||||
|
|
||||||
// Make multiple inputs so we need to compact.
|
// Fill levels >= 1 so memtable compaction outputs to level 1
|
||||||
for (int i = 0; i < 2; i++) {
|
// matthewv 1/10/14 - what does "levels" have to do with this,
|
||||||
|
// switching to compaction trigger.
|
||||||
|
// 7/10/14 - compaction starts between 4 and 6 files ... assume 4 and 1 move
|
||||||
|
// (will make a new, descriptive constant for 4)
|
||||||
|
for (int level = Property("leveldb.num-files-at-level0")+1;
|
||||||
|
level < config::kL0_GroomingTrigger; level++) {
|
||||||
|
dbi->Put(WriteOptions(), "", "begin");
|
||||||
|
dbi->Put(WriteOptions(), "~", "end");
|
||||||
|
dbi->TEST_CompactMemTable();
|
||||||
|
}
|
||||||
|
|
||||||
Build(10);
|
Build(10);
|
||||||
dbi->TEST_CompactMemTable();
|
dbi->TEST_CompactMemTable();
|
||||||
Corrupt(kTableFile, 100, 1);
|
ASSERT_TRUE(1 < Property("leveldb.num-files-at-level0"));
|
||||||
env_.SleepForMicroseconds(100000);
|
|
||||||
}
|
|
||||||
dbi->CompactRange(NULL, NULL);
|
|
||||||
|
|
||||||
// Write must fail because of corrupted table
|
Corrupt(kTableFile, 100, 1, 0);
|
||||||
|
Check(5, 9);
|
||||||
|
|
||||||
|
// Write must eventually fail because of corrupted table
|
||||||
|
Status s;
|
||||||
std::string tmp1, tmp2;
|
std::string tmp1, tmp2;
|
||||||
Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2));
|
for (int i = 0; i < 10000 && s.ok(); i++) {
|
||||||
|
s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
|
||||||
|
}
|
||||||
|
if (s.ok())
|
||||||
|
ASSERT_NE(current_corruption, Property("leveldb.ReadBlockError")) << "no ReadBlockError seen";
|
||||||
|
else
|
||||||
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
|
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -355,7 +360,7 @@ TEST(CorruptionTest, UnrelatedKeys) {
|
||||||
Build(10);
|
Build(10);
|
||||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||||
dbi->TEST_CompactMemTable();
|
dbi->TEST_CompactMemTable();
|
||||||
Corrupt(kTableFile, 100, 1);
|
Corrupt(kTableFile, 100, 1, config::kMaxMemCompactLevel);
|
||||||
|
|
||||||
std::string tmp1, tmp2;
|
std::string tmp1, tmp2;
|
||||||
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
|
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
|
||||||
|
|
|
@ -33,7 +33,6 @@
|
||||||
// readmissing -- read N missing keys in random order
|
// readmissing -- read N missing keys in random order
|
||||||
// readhot -- read N times in random order from 1% section of DB
|
// readhot -- read N times in random order from 1% section of DB
|
||||||
// seekrandom -- N random seeks
|
// seekrandom -- N random seeks
|
||||||
// open -- cost of opening a DB
|
|
||||||
// crc32c -- repeated crc32c of 4K of data
|
// crc32c -- repeated crc32c of 4K of data
|
||||||
// acquireload -- load N*1000 times
|
// acquireload -- load N*1000 times
|
||||||
// Meta operations:
|
// Meta operations:
|
||||||
|
@ -84,14 +83,6 @@ static bool FLAGS_histogram = false;
|
||||||
// (initialized to default value by "main")
|
// (initialized to default value by "main")
|
||||||
static int FLAGS_write_buffer_size = 0;
|
static int FLAGS_write_buffer_size = 0;
|
||||||
|
|
||||||
// Number of bytes written to each file.
|
|
||||||
// (initialized to default value by "main")
|
|
||||||
static int FLAGS_max_file_size = 0;
|
|
||||||
|
|
||||||
// Approximate size of user data packed per block (before compression.
|
|
||||||
// (initialized to default value by "main")
|
|
||||||
static int FLAGS_block_size = 0;
|
|
||||||
|
|
||||||
// Number of bytes to use as a cache of uncompressed data.
|
// Number of bytes to use as a cache of uncompressed data.
|
||||||
// Negative means use default settings.
|
// Negative means use default settings.
|
||||||
static int FLAGS_cache_size = -1;
|
static int FLAGS_cache_size = -1;
|
||||||
|
@ -103,21 +94,26 @@ static int FLAGS_open_files = 0;
|
||||||
// Negative means use default settings.
|
// Negative means use default settings.
|
||||||
static int FLAGS_bloom_bits = -1;
|
static int FLAGS_bloom_bits = -1;
|
||||||
|
|
||||||
|
// Riak bloom adaptation
|
||||||
|
static int FLAGS_bloom2_bits = -1;
|
||||||
|
|
||||||
|
// Riak param for total memory allocation (flex_cache)
|
||||||
|
static uint64_t FLAGS_leveldb_memory = -1;
|
||||||
|
|
||||||
|
// Riak param for compression setting
|
||||||
|
static int FLAGS_compression = 2;
|
||||||
|
|
||||||
// If true, do not destroy the existing database. If you set this
|
// If true, do not destroy the existing database. If you set this
|
||||||
// flag and also specify a benchmark that wants a fresh database, that
|
// flag and also specify a benchmark that wants a fresh database, that
|
||||||
// benchmark will fail.
|
// benchmark will fail.
|
||||||
static bool FLAGS_use_existing_db = false;
|
static bool FLAGS_use_existing_db = false;
|
||||||
|
|
||||||
// If true, reuse existing log/MANIFEST files when re-opening a database.
|
|
||||||
static bool FLAGS_reuse_logs = false;
|
|
||||||
|
|
||||||
// Use the db with the following name.
|
// Use the db with the following name.
|
||||||
static const char* FLAGS_db = NULL;
|
static const char* FLAGS_db = NULL;
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
leveldb::Env* g_env = NULL;
|
|
||||||
|
|
||||||
// Helper for quickly generating random data.
|
// Helper for quickly generating random data.
|
||||||
class RandomGenerator {
|
class RandomGenerator {
|
||||||
|
@ -141,7 +137,7 @@ class RandomGenerator {
|
||||||
pos_ = 0;
|
pos_ = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
Slice Generate(size_t len) {
|
Slice Generate(int len) {
|
||||||
if (pos_ + len > data_.size()) {
|
if (pos_ + len > data_.size()) {
|
||||||
pos_ = 0;
|
pos_ = 0;
|
||||||
assert(len < data_.size());
|
assert(len < data_.size());
|
||||||
|
@ -151,19 +147,17 @@ class RandomGenerator {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#if defined(__linux)
|
|
||||||
static Slice TrimSpace(Slice s) {
|
static Slice TrimSpace(Slice s) {
|
||||||
size_t start = 0;
|
int start = 0;
|
||||||
while (start < s.size() && isspace(s[start])) {
|
while (start < s.size() && isspace(s[start])) {
|
||||||
start++;
|
start++;
|
||||||
}
|
}
|
||||||
size_t limit = s.size();
|
int limit = s.size();
|
||||||
while (limit > start && isspace(s[limit-1])) {
|
while (limit > start && isspace(s[limit-1])) {
|
||||||
limit--;
|
limit--;
|
||||||
}
|
}
|
||||||
return Slice(s.data() + start, limit - start);
|
return Slice(s.data() + start, limit - start);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
static void AppendWithSpace(std::string* str, Slice msg) {
|
static void AppendWithSpace(std::string* str, Slice msg) {
|
||||||
if (msg.empty()) return;
|
if (msg.empty()) return;
|
||||||
|
@ -195,7 +189,7 @@ class Stats {
|
||||||
done_ = 0;
|
done_ = 0;
|
||||||
bytes_ = 0;
|
bytes_ = 0;
|
||||||
seconds_ = 0;
|
seconds_ = 0;
|
||||||
start_ = g_env->NowMicros();
|
start_ = Env::Default()->NowMicros();
|
||||||
finish_ = start_;
|
finish_ = start_;
|
||||||
message_.clear();
|
message_.clear();
|
||||||
}
|
}
|
||||||
|
@ -213,7 +207,7 @@ class Stats {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Stop() {
|
void Stop() {
|
||||||
finish_ = g_env->NowMicros();
|
finish_ = Env::Default()->NowMicros();
|
||||||
seconds_ = (finish_ - start_) * 1e-6;
|
seconds_ = (finish_ - start_) * 1e-6;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -223,7 +217,7 @@ class Stats {
|
||||||
|
|
||||||
void FinishedSingleOp() {
|
void FinishedSingleOp() {
|
||||||
if (FLAGS_histogram) {
|
if (FLAGS_histogram) {
|
||||||
double now = g_env->NowMicros();
|
double now = Env::Default()->NowMicros();
|
||||||
double micros = now - last_op_finish_;
|
double micros = now - last_op_finish_;
|
||||||
hist_.Add(micros);
|
hist_.Add(micros);
|
||||||
if (micros > 20000) {
|
if (micros > 20000) {
|
||||||
|
@ -405,7 +399,7 @@ class Benchmark {
|
||||||
: cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
|
: cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
|
||||||
filter_policy_(FLAGS_bloom_bits >= 0
|
filter_policy_(FLAGS_bloom_bits >= 0
|
||||||
? NewBloomFilterPolicy(FLAGS_bloom_bits)
|
? NewBloomFilterPolicy(FLAGS_bloom_bits)
|
||||||
: NULL),
|
: (FLAGS_bloom2_bits >=0 ? NewBloomFilterPolicy2(FLAGS_bloom2_bits) : NULL)),
|
||||||
db_(NULL),
|
db_(NULL),
|
||||||
num_(FLAGS_num),
|
num_(FLAGS_num),
|
||||||
value_size_(FLAGS_value_size),
|
value_size_(FLAGS_value_size),
|
||||||
|
@ -413,10 +407,10 @@ class Benchmark {
|
||||||
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
|
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
|
||||||
heap_counter_(0) {
|
heap_counter_(0) {
|
||||||
std::vector<std::string> files;
|
std::vector<std::string> files;
|
||||||
g_env->GetChildren(FLAGS_db, &files);
|
Env::Default()->GetChildren(FLAGS_db, &files);
|
||||||
for (size_t i = 0; i < files.size(); i++) {
|
for (int i = 0; i < files.size(); i++) {
|
||||||
if (Slice(files[i]).starts_with("heap-")) {
|
if (Slice(files[i]).starts_with("heap-")) {
|
||||||
g_env->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
|
Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!FLAGS_use_existing_db) {
|
if (!FLAGS_use_existing_db) {
|
||||||
|
@ -446,7 +440,7 @@ class Benchmark {
|
||||||
benchmarks = sep + 1;
|
benchmarks = sep + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reset parameters that may be overridden below
|
// Reset parameters that may be overriddden bwlow
|
||||||
num_ = FLAGS_num;
|
num_ = FLAGS_num;
|
||||||
reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
|
reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
|
||||||
value_size_ = FLAGS_value_size;
|
value_size_ = FLAGS_value_size;
|
||||||
|
@ -457,11 +451,7 @@ class Benchmark {
|
||||||
bool fresh_db = false;
|
bool fresh_db = false;
|
||||||
int num_threads = FLAGS_threads;
|
int num_threads = FLAGS_threads;
|
||||||
|
|
||||||
if (name == Slice("open")) {
|
if (name == Slice("fillseq")) {
|
||||||
method = &Benchmark::OpenBench;
|
|
||||||
num_ /= 10000;
|
|
||||||
if (num_ < 1) num_ = 1;
|
|
||||||
} else if (name == Slice("fillseq")) {
|
|
||||||
fresh_db = true;
|
fresh_db = true;
|
||||||
method = &Benchmark::WriteSeq;
|
method = &Benchmark::WriteSeq;
|
||||||
} else if (name == Slice("fillbatch")) {
|
} else if (name == Slice("fillbatch")) {
|
||||||
|
@ -553,6 +543,7 @@ class Benchmark {
|
||||||
SharedState* shared;
|
SharedState* shared;
|
||||||
ThreadState* thread;
|
ThreadState* thread;
|
||||||
void (Benchmark::*method)(ThreadState*);
|
void (Benchmark::*method)(ThreadState*);
|
||||||
|
pthread_t thread_id;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void ThreadBody(void* v) {
|
static void ThreadBody(void* v) {
|
||||||
|
@ -598,7 +589,8 @@ class Benchmark {
|
||||||
arg[i].shared = &shared;
|
arg[i].shared = &shared;
|
||||||
arg[i].thread = new ThreadState(i);
|
arg[i].thread = new ThreadState(i);
|
||||||
arg[i].thread->shared = &shared;
|
arg[i].thread->shared = &shared;
|
||||||
g_env->StartThread(ThreadBody, &arg[i]);
|
arg[i].thread_id=Env::Default()->StartThread(ThreadBody, &arg[i]);
|
||||||
|
pthread_detach(arg[i].thread_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
shared.mu.Lock();
|
shared.mu.Lock();
|
||||||
|
@ -709,15 +701,12 @@ class Benchmark {
|
||||||
void Open() {
|
void Open() {
|
||||||
assert(db_ == NULL);
|
assert(db_ == NULL);
|
||||||
Options options;
|
Options options;
|
||||||
options.env = g_env;
|
|
||||||
options.create_if_missing = !FLAGS_use_existing_db;
|
options.create_if_missing = !FLAGS_use_existing_db;
|
||||||
options.block_cache = cache_;
|
options.block_cache = cache_;
|
||||||
options.write_buffer_size = FLAGS_write_buffer_size;
|
options.write_buffer_size = FLAGS_write_buffer_size;
|
||||||
options.max_file_size = FLAGS_max_file_size;
|
|
||||||
options.block_size = FLAGS_block_size;
|
|
||||||
options.max_open_files = FLAGS_open_files;
|
|
||||||
options.filter_policy = filter_policy_;
|
options.filter_policy = filter_policy_;
|
||||||
options.reuse_logs = FLAGS_reuse_logs;
|
options.compression = (leveldb::CompressionType)FLAGS_compression;
|
||||||
|
options.total_leveldb_mem = FLAGS_leveldb_memory;
|
||||||
Status s = DB::Open(options, FLAGS_db, &db_);
|
Status s = DB::Open(options, FLAGS_db, &db_);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
|
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
|
||||||
|
@ -725,14 +714,6 @@ class Benchmark {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void OpenBench(ThreadState* thread) {
|
|
||||||
for (int i = 0; i < num_; i++) {
|
|
||||||
delete db_;
|
|
||||||
Open();
|
|
||||||
thread->stats.FinishedSingleOp();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void WriteSeq(ThreadState* thread) {
|
void WriteSeq(ThreadState* thread) {
|
||||||
DoWrite(thread, true);
|
DoWrite(thread, true);
|
||||||
}
|
}
|
||||||
|
@ -842,6 +823,7 @@ class Benchmark {
|
||||||
|
|
||||||
void SeekRandom(ThreadState* thread) {
|
void SeekRandom(ThreadState* thread) {
|
||||||
ReadOptions options;
|
ReadOptions options;
|
||||||
|
std::string value;
|
||||||
int found = 0;
|
int found = 0;
|
||||||
for (int i = 0; i < reads_; i++) {
|
for (int i = 0; i < reads_; i++) {
|
||||||
Iterator* iter = db_->NewIterator(options);
|
Iterator* iter = db_->NewIterator(options);
|
||||||
|
@ -937,7 +919,7 @@ class Benchmark {
|
||||||
char fname[100];
|
char fname[100];
|
||||||
snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_);
|
snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_);
|
||||||
WritableFile* file;
|
WritableFile* file;
|
||||||
Status s = g_env->NewWritableFile(fname, &file);
|
Status s = Env::Default()->NewWritableFile(fname, &file, 2<<20);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
fprintf(stderr, "%s\n", s.ToString().c_str());
|
fprintf(stderr, "%s\n", s.ToString().c_str());
|
||||||
return;
|
return;
|
||||||
|
@ -946,7 +928,7 @@ class Benchmark {
|
||||||
delete file;
|
delete file;
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
fprintf(stderr, "heap profiling not supported\n");
|
fprintf(stderr, "heap profiling not supported\n");
|
||||||
g_env->DeleteFile(fname);
|
Env::Default()->DeleteFile(fname);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -955,14 +937,14 @@ class Benchmark {
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
|
FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
|
||||||
FLAGS_max_file_size = leveldb::Options().max_file_size;
|
|
||||||
FLAGS_block_size = leveldb::Options().block_size;
|
|
||||||
FLAGS_open_files = leveldb::Options().max_open_files;
|
FLAGS_open_files = leveldb::Options().max_open_files;
|
||||||
|
FLAGS_leveldb_memory = 25000000000LL;
|
||||||
std::string default_db_path;
|
std::string default_db_path;
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
double d;
|
double d;
|
||||||
int n;
|
int n;
|
||||||
|
uint64_t u;
|
||||||
char junk;
|
char junk;
|
||||||
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
|
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
|
||||||
FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
|
FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
|
||||||
|
@ -974,9 +956,6 @@ int main(int argc, char** argv) {
|
||||||
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
|
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
|
||||||
(n == 0 || n == 1)) {
|
(n == 0 || n == 1)) {
|
||||||
FLAGS_use_existing_db = n;
|
FLAGS_use_existing_db = n;
|
||||||
} else if (sscanf(argv[i], "--reuse_logs=%d%c", &n, &junk) == 1 &&
|
|
||||||
(n == 0 || n == 1)) {
|
|
||||||
FLAGS_reuse_logs = n;
|
|
||||||
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
|
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
|
||||||
FLAGS_num = n;
|
FLAGS_num = n;
|
||||||
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
|
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
|
||||||
|
@ -987,16 +966,18 @@ int main(int argc, char** argv) {
|
||||||
FLAGS_value_size = n;
|
FLAGS_value_size = n;
|
||||||
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
|
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
|
||||||
FLAGS_write_buffer_size = n;
|
FLAGS_write_buffer_size = n;
|
||||||
} else if (sscanf(argv[i], "--max_file_size=%d%c", &n, &junk) == 1) {
|
|
||||||
FLAGS_max_file_size = n;
|
|
||||||
} else if (sscanf(argv[i], "--block_size=%d%c", &n, &junk) == 1) {
|
|
||||||
FLAGS_block_size = n;
|
|
||||||
} else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
|
} else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
|
||||||
FLAGS_cache_size = n;
|
FLAGS_cache_size = n;
|
||||||
} else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
|
} else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
|
||||||
FLAGS_bloom_bits = n;
|
FLAGS_bloom_bits = n;
|
||||||
|
} else if (sscanf(argv[i], "--bloom_bits2=%d%c", &n, &junk) == 1) {
|
||||||
|
FLAGS_bloom2_bits = n;
|
||||||
|
} else if (sscanf(argv[i], "--leveldb_memory=%d%c", &n, &junk) == 1) {
|
||||||
|
FLAGS_leveldb_memory = n * 1024 * 1024LL;
|
||||||
} else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
|
} else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
|
||||||
FLAGS_open_files = n;
|
FLAGS_open_files = n;
|
||||||
|
} else if (sscanf(argv[i], "--compression=%d%c", &n, &junk) == 1) {
|
||||||
|
FLAGS_compression = n;
|
||||||
} else if (strncmp(argv[i], "--db=", 5) == 0) {
|
} else if (strncmp(argv[i], "--db=", 5) == 0) {
|
||||||
FLAGS_db = argv[i] + 5;
|
FLAGS_db = argv[i] + 5;
|
||||||
} else {
|
} else {
|
||||||
|
@ -1005,16 +986,20 @@ int main(int argc, char** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
leveldb::g_env = leveldb::Env::Default();
|
|
||||||
|
|
||||||
// Choose a location for the test database if none given with --db=<path>
|
// Choose a location for the test database if none given with --db=<path>
|
||||||
if (FLAGS_db == NULL) {
|
if (FLAGS_db == NULL) {
|
||||||
leveldb::g_env->GetTestDirectory(&default_db_path);
|
leveldb::Env::Default()->GetTestDirectory(&default_db_path);
|
||||||
default_db_path += "/dbbench";
|
default_db_path += "/dbbench";
|
||||||
FLAGS_db = default_db_path.c_str();
|
FLAGS_db = default_db_path.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// benchmark class needs to destruct before Shutdown call
|
||||||
|
{
|
||||||
leveldb::Benchmark benchmark;
|
leveldb::Benchmark benchmark;
|
||||||
benchmark.Run();
|
benchmark.Run();
|
||||||
|
}
|
||||||
|
|
||||||
|
leveldb::Env::Shutdown();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -13,7 +13,7 @@
|
||||||
#include "leveldb/db.h"
|
#include "leveldb/db.h"
|
||||||
#include "leveldb/env.h"
|
#include "leveldb/env.h"
|
||||||
#include "port/port.h"
|
#include "port/port.h"
|
||||||
#include "port/thread_annotations.h"
|
#include "util/cache2.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
@ -29,26 +29,37 @@ class DBImpl : public DB {
|
||||||
virtual ~DBImpl();
|
virtual ~DBImpl();
|
||||||
|
|
||||||
// Implementations of the DB interface
|
// Implementations of the DB interface
|
||||||
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
|
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value, const KeyMetaData * meta=NULL);
|
||||||
virtual Status Delete(const WriteOptions&, const Slice& key);
|
virtual Status Delete(const WriteOptions&, const Slice& key);
|
||||||
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
|
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
|
||||||
virtual Status Get(const ReadOptions& options,
|
virtual Status Get(const ReadOptions& options,
|
||||||
const Slice& key,
|
const Slice& key,
|
||||||
std::string* value);
|
std::string* value,
|
||||||
|
KeyMetaData * meta=NULL);
|
||||||
|
virtual Status Get(const ReadOptions& options,
|
||||||
|
const Slice& key,
|
||||||
|
Value* value,
|
||||||
|
KeyMetaData * meta=NULL);
|
||||||
virtual Iterator* NewIterator(const ReadOptions&);
|
virtual Iterator* NewIterator(const ReadOptions&);
|
||||||
virtual const Snapshot* GetSnapshot();
|
virtual const Snapshot* GetSnapshot();
|
||||||
virtual void ReleaseSnapshot(const Snapshot* snapshot);
|
virtual void ReleaseSnapshot(const Snapshot* snapshot);
|
||||||
virtual bool GetProperty(const Slice& property, std::string* value);
|
virtual bool GetProperty(const Slice& property, std::string* value);
|
||||||
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
|
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
|
||||||
virtual void CompactRange(const Slice* begin, const Slice* end);
|
virtual void CompactRange(const Slice* begin, const Slice* end);
|
||||||
|
virtual Status VerifyLevels();
|
||||||
|
virtual void CheckAvailableCompactions();
|
||||||
|
virtual Logger* GetLogger() const { return options_.info_log; }
|
||||||
|
|
||||||
// Extra methods (for testing) that are not in the public DB interface
|
// Extra methods (for testing) that are not in the public DB interface
|
||||||
|
|
||||||
|
const Options & GetOptions() const { return options_; };
|
||||||
|
|
||||||
// Compact any files in the named level that overlap [*begin,*end]
|
// Compact any files in the named level that overlap [*begin,*end]
|
||||||
void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
|
void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
|
||||||
|
|
||||||
// Force current memtable contents to be compacted.
|
// Force current memtable contents to be compacted, waits for completion
|
||||||
Status TEST_CompactMemTable();
|
Status CompactMemTableSynchronous();
|
||||||
|
Status TEST_CompactMemTable(); // wraps CompactMemTableSynchronous (historical)
|
||||||
|
|
||||||
// Return an internal iterator over the current state of the database.
|
// Return an internal iterator over the current state of the database.
|
||||||
// The keys of this iterator are internal keys (see format.h).
|
// The keys of this iterator are internal keys (see format.h).
|
||||||
|
@ -59,64 +70,82 @@ class DBImpl : public DB {
|
||||||
// file at a level >= 1.
|
// file at a level >= 1.
|
||||||
int64_t TEST_MaxNextLevelOverlappingBytes();
|
int64_t TEST_MaxNextLevelOverlappingBytes();
|
||||||
|
|
||||||
// Record a sample of bytes read at the specified internal key.
|
// These are routines that DBListImpl calls across all open databases
|
||||||
// Samples are taken approximately once every config::kReadBytesPeriod
|
void ResizeCaches() {double_cache.ResizeCaches();};
|
||||||
// bytes.
|
size_t GetCacheCapacity() {return(double_cache.GetCapacity(false));}
|
||||||
void RecordReadSample(Slice key);
|
void PurgeExpiredFileCache() {double_cache.PurgeExpiredFiles();};
|
||||||
|
|
||||||
private:
|
// in util/hot_backup.cc
|
||||||
|
void HotBackup();
|
||||||
|
bool PurgeWriteBuffer();
|
||||||
|
bool WriteBackupManifest();
|
||||||
|
bool CreateBackupLinks(Version * Version, Options & BackupOptions);
|
||||||
|
bool CopyLOGSegment(long FileEnd);
|
||||||
|
void HotBackupComplete();
|
||||||
|
|
||||||
|
void BackgroundCall2(Compaction * Compact);
|
||||||
|
void BackgroundImmCompactCall();
|
||||||
|
bool IsCompactionScheduled();
|
||||||
|
uint32_t RunningCompactionCount() {mutex_.AssertHeld(); return(running_compactions_);};
|
||||||
|
|
||||||
|
protected:
|
||||||
friend class DB;
|
friend class DB;
|
||||||
struct CompactionState;
|
struct CompactionState;
|
||||||
struct Writer;
|
struct Writer;
|
||||||
|
|
||||||
Iterator* NewInternalIterator(const ReadOptions&,
|
Iterator* NewInternalIterator(const ReadOptions&,
|
||||||
SequenceNumber* latest_snapshot,
|
SequenceNumber* latest_snapshot);
|
||||||
uint32_t* seed);
|
|
||||||
|
|
||||||
Status NewDB();
|
Status NewDB();
|
||||||
|
|
||||||
// Recover the descriptor from persistent storage. May do a significant
|
// Recover the descriptor from persistent storage. May do a significant
|
||||||
// amount of work to recover recently logged updates. Any changes to
|
// amount of work to recover recently logged updates. Any changes to
|
||||||
// be made to the descriptor are added to *edit.
|
// be made to the descriptor are added to *edit.
|
||||||
Status Recover(VersionEdit* edit, bool* save_manifest)
|
Status Recover(VersionEdit* edit);
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
|
||||||
|
// Riak routine: pause DB::Open if too many compactions
|
||||||
|
// stacked up immediately. Happens in some repairs and
|
||||||
|
// some Riak upgrades
|
||||||
|
void CheckCompactionState();
|
||||||
|
|
||||||
void MaybeIgnoreError(Status* s) const;
|
void MaybeIgnoreError(Status* s) const;
|
||||||
|
|
||||||
// Delete any unneeded files and stale in-memory entries.
|
// Delete any unneeded files and stale in-memory entries.
|
||||||
void DeleteObsoleteFiles();
|
void DeleteObsoleteFiles();
|
||||||
|
void KeepOrDelete(const std::string & Filename, int level, const std::set<uint64_t> & Live);
|
||||||
|
|
||||||
// Compact the in-memory write buffer to disk. Switches to a new
|
// Compact the in-memory write buffer to disk. Switches to a new
|
||||||
// log-file/memtable and writes a new descriptor iff successful.
|
// log-file/memtable and writes a new descriptor iff successful.
|
||||||
// Errors are recorded in bg_error_.
|
Status CompactMemTable();
|
||||||
void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
|
||||||
|
|
||||||
Status RecoverLogFile(uint64_t log_number, bool last_log, bool* save_manifest,
|
Status RecoverLogFile(uint64_t log_number,
|
||||||
VersionEdit* edit, SequenceNumber* max_sequence)
|
VersionEdit* edit,
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
SequenceNumber* max_sequence);
|
||||||
|
|
||||||
Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base)
|
Status WriteLevel0Table(volatile MemTable* mem, VersionEdit* edit, Version* base);
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
|
||||||
|
Status MakeRoomForWrite(bool force /* TRUE forces memtable rotation to disk (for testing) */);
|
||||||
|
Status NewRecoveryLog(uint64_t NewLogNumber);
|
||||||
|
|
||||||
Status MakeRoomForWrite(bool force /* compact even if there is room? */)
|
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
|
||||||
WriteBatch* BuildBatchGroup(Writer** last_writer);
|
WriteBatch* BuildBatchGroup(Writer** last_writer);
|
||||||
|
|
||||||
void RecordBackgroundError(const Status& s);
|
void MaybeScheduleCompaction();
|
||||||
|
|
||||||
void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
Status BackgroundCompaction(Compaction * Compact=NULL);
|
||||||
static void BGWork(void* db);
|
Status BackgroundExpiry(Compaction * Compact=NULL);
|
||||||
void BackgroundCall();
|
|
||||||
void BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
|
||||||
void CleanupCompaction(CompactionState* compact)
|
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
|
||||||
Status DoCompactionWork(CompactionState* compact)
|
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
|
||||||
|
|
||||||
Status OpenCompactionOutputFile(CompactionState* compact);
|
void CleanupCompaction(CompactionState* compact);
|
||||||
|
Status DoCompactionWork(CompactionState* compact);
|
||||||
|
int64_t PrioritizeWork(bool IsLevel0);
|
||||||
|
|
||||||
|
Status OpenCompactionOutputFile(CompactionState* compact, size_t sample_value_size);
|
||||||
|
bool Send2PageCache(CompactionState * compact);
|
||||||
|
size_t MaybeRaiseBlockSize(Compaction & CompactionStuff, size_t SampleValueSize);
|
||||||
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
|
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
|
||||||
Status InstallCompactionResults(CompactionState* compact)
|
Status InstallCompactionResults(CompactionState* compact);
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
|
|
||||||
|
// initialized before options so its block_cache is available
|
||||||
|
class DoubleCache double_cache;
|
||||||
|
|
||||||
// Constant after construction
|
// Constant after construction
|
||||||
Env* const env_;
|
Env* const env_;
|
||||||
|
@ -130,20 +159,22 @@ class DBImpl : public DB {
|
||||||
// table_cache_ provides its own synchronization
|
// table_cache_ provides its own synchronization
|
||||||
TableCache* table_cache_;
|
TableCache* table_cache_;
|
||||||
|
|
||||||
|
|
||||||
// Lock over the persistent DB state. Non-NULL iff successfully acquired.
|
// Lock over the persistent DB state. Non-NULL iff successfully acquired.
|
||||||
FileLock* db_lock_;
|
FileLock* db_lock_;
|
||||||
|
|
||||||
// State below is protected by mutex_
|
// State below is protected by mutex_
|
||||||
port::Mutex mutex_;
|
port::Mutex mutex_;
|
||||||
|
port::Mutex throttle_mutex_; // used by write throttle to force sequential waits on callers
|
||||||
port::AtomicPointer shutting_down_;
|
port::AtomicPointer shutting_down_;
|
||||||
|
|
||||||
port::CondVar bg_cv_; // Signalled when background work finishes
|
port::CondVar bg_cv_; // Signalled when background work finishes
|
||||||
MemTable* mem_;
|
MemTable* mem_;
|
||||||
MemTable* imm_; // Memtable being compacted
|
volatile MemTable* imm_; // Memtable being compacted
|
||||||
port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_
|
port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_
|
||||||
WritableFile* logfile_;
|
WritableFile* logfile_;
|
||||||
uint64_t logfile_number_;
|
uint64_t logfile_number_;
|
||||||
log::Writer* log_;
|
log::Writer* log_;
|
||||||
uint32_t seed_; // For sampling.
|
|
||||||
|
|
||||||
// Queue of writers.
|
// Queue of writers.
|
||||||
std::deque<Writer*> writers_;
|
std::deque<Writer*> writers_;
|
||||||
|
@ -155,9 +186,6 @@ class DBImpl : public DB {
|
||||||
// part of ongoing compactions.
|
// part of ongoing compactions.
|
||||||
std::set<uint64_t> pending_outputs_;
|
std::set<uint64_t> pending_outputs_;
|
||||||
|
|
||||||
// Has a background compaction been scheduled or is running?
|
|
||||||
bool bg_compaction_scheduled_;
|
|
||||||
|
|
||||||
// Information for a manual compaction
|
// Information for a manual compaction
|
||||||
struct ManualCompaction {
|
struct ManualCompaction {
|
||||||
int level;
|
int level;
|
||||||
|
@ -166,7 +194,7 @@ class DBImpl : public DB {
|
||||||
const InternalKey* end; // NULL means end of key range
|
const InternalKey* end; // NULL means end of key range
|
||||||
InternalKey tmp_storage; // Used to keep track of compaction progress
|
InternalKey tmp_storage; // Used to keep track of compaction progress
|
||||||
};
|
};
|
||||||
ManualCompaction* manual_compaction_;
|
volatile ManualCompaction* manual_compaction_;
|
||||||
|
|
||||||
VersionSet* versions_;
|
VersionSet* versions_;
|
||||||
|
|
||||||
|
@ -190,6 +218,18 @@ class DBImpl : public DB {
|
||||||
};
|
};
|
||||||
CompactionStats stats_[config::kNumLevels];
|
CompactionStats stats_[config::kNumLevels];
|
||||||
|
|
||||||
|
volatile uint64_t throttle_end;
|
||||||
|
volatile uint32_t running_compactions_;
|
||||||
|
volatile size_t current_block_size_; // last dynamic block size computed
|
||||||
|
volatile uint64_t block_size_changed_; // NowMicros() when block size computed
|
||||||
|
volatile uint64_t last_low_mem_; // NowMicros() when low memory last seen
|
||||||
|
|
||||||
|
// accessor to new, dynamic block_cache
|
||||||
|
Cache * block_cache() {return(double_cache.GetBlockCache());};
|
||||||
|
Cache * file_cache() {return(double_cache.GetFileCache());};
|
||||||
|
|
||||||
|
volatile bool hotbackup_pending_;
|
||||||
|
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
DBImpl(const DBImpl&);
|
DBImpl(const DBImpl&);
|
||||||
void operator=(const DBImpl&);
|
void operator=(const DBImpl&);
|
||||||
|
@ -204,7 +244,8 @@ class DBImpl : public DB {
|
||||||
extern Options SanitizeOptions(const std::string& db,
|
extern Options SanitizeOptions(const std::string& db,
|
||||||
const InternalKeyComparator* icmp,
|
const InternalKeyComparator* icmp,
|
||||||
const InternalFilterPolicy* ipolicy,
|
const InternalFilterPolicy* ipolicy,
|
||||||
const Options& src);
|
const Options& src,
|
||||||
|
Cache * block_cache);
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
|
|
|
@ -5,14 +5,14 @@
|
||||||
#include "db/db_iter.h"
|
#include "db/db_iter.h"
|
||||||
|
|
||||||
#include "db/filename.h"
|
#include "db/filename.h"
|
||||||
#include "db/db_impl.h"
|
|
||||||
#include "db/dbformat.h"
|
#include "db/dbformat.h"
|
||||||
#include "leveldb/env.h"
|
#include "leveldb/env.h"
|
||||||
|
#include "leveldb/expiry.h"
|
||||||
#include "leveldb/iterator.h"
|
#include "leveldb/iterator.h"
|
||||||
|
#include "leveldb/perf_count.h"
|
||||||
#include "port/port.h"
|
#include "port/port.h"
|
||||||
#include "util/logging.h"
|
#include "util/logging.h"
|
||||||
#include "util/mutexlock.h"
|
#include "util/mutexlock.h"
|
||||||
#include "util/random.h"
|
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
@ -48,18 +48,20 @@ class DBIter: public Iterator {
|
||||||
kReverse
|
kReverse
|
||||||
};
|
};
|
||||||
|
|
||||||
DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s,
|
DBIter(const std::string* dbname, Env* env,
|
||||||
uint32_t seed)
|
const Comparator* cmp, Iterator* iter, SequenceNumber s,
|
||||||
: db_(db),
|
const ExpiryModule * expiry)
|
||||||
|
: dbname_(dbname),
|
||||||
|
env_(env),
|
||||||
user_comparator_(cmp),
|
user_comparator_(cmp),
|
||||||
iter_(iter),
|
iter_(iter),
|
||||||
sequence_(s),
|
sequence_(s),
|
||||||
direction_(kForward),
|
direction_(kForward),
|
||||||
valid_(false),
|
valid_(false),
|
||||||
rnd_(seed),
|
expiry_(expiry) {
|
||||||
bytes_counter_(RandomPeriod()) {
|
|
||||||
}
|
}
|
||||||
virtual ~DBIter() {
|
virtual ~DBIter() {
|
||||||
|
gPerfCounters->Inc(ePerfIterDelete);
|
||||||
delete iter_;
|
delete iter_;
|
||||||
}
|
}
|
||||||
virtual bool Valid() const { return valid_; }
|
virtual bool Valid() const { return valid_; }
|
||||||
|
@ -71,6 +73,26 @@ class DBIter: public Iterator {
|
||||||
assert(valid_);
|
assert(valid_);
|
||||||
return (direction_ == kForward) ? iter_->value() : saved_value_;
|
return (direction_ == kForward) ? iter_->value() : saved_value_;
|
||||||
}
|
}
|
||||||
|
// Riak specific: if a database iterator, returns key meta data
|
||||||
|
// REQUIRES: Valid() and forward iteration
|
||||||
|
// (reverse iteration is possible, just needs code)
|
||||||
|
virtual KeyMetaData & keymetadata() const
|
||||||
|
{
|
||||||
|
assert(valid_ && kForward==direction_);
|
||||||
|
if (kForward==direction_)
|
||||||
|
{
|
||||||
|
ParsedInternalKey parsed;
|
||||||
|
// this initialization clears a warning. ParsedInternalKey says
|
||||||
|
// it is not initializing for performance reasons ... oh well
|
||||||
|
parsed.type=kTypeValue; parsed.sequence=0; parsed.expiry=0;
|
||||||
|
ParseInternalKey(iter_->key(), &parsed);
|
||||||
|
keymetadata_.m_Type=parsed.type;
|
||||||
|
keymetadata_.m_Sequence=parsed.sequence;
|
||||||
|
keymetadata_.m_Expiry=parsed.expiry;
|
||||||
|
}
|
||||||
|
return(keymetadata_);
|
||||||
|
}
|
||||||
|
|
||||||
virtual Status status() const {
|
virtual Status status() const {
|
||||||
if (status_.ok()) {
|
if (status_.ok()) {
|
||||||
return iter_->status();
|
return iter_->status();
|
||||||
|
@ -103,12 +125,8 @@ class DBIter: public Iterator {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pick next gap with average value of config::kReadBytesPeriod.
|
const std::string* const dbname_;
|
||||||
ssize_t RandomPeriod() {
|
Env* const env_;
|
||||||
return rnd_.Uniform(2*config::kReadBytesPeriod);
|
|
||||||
}
|
|
||||||
|
|
||||||
DBImpl* db_;
|
|
||||||
const Comparator* const user_comparator_;
|
const Comparator* const user_comparator_;
|
||||||
Iterator* const iter_;
|
Iterator* const iter_;
|
||||||
SequenceNumber const sequence_;
|
SequenceNumber const sequence_;
|
||||||
|
@ -118,9 +136,7 @@ class DBIter: public Iterator {
|
||||||
std::string saved_value_; // == current raw value when direction_==kReverse
|
std::string saved_value_; // == current raw value when direction_==kReverse
|
||||||
Direction direction_;
|
Direction direction_;
|
||||||
bool valid_;
|
bool valid_;
|
||||||
|
const ExpiryModule * expiry_;
|
||||||
Random rnd_;
|
|
||||||
ssize_t bytes_counter_;
|
|
||||||
|
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
DBIter(const DBIter&);
|
DBIter(const DBIter&);
|
||||||
|
@ -128,14 +144,7 @@ class DBIter: public Iterator {
|
||||||
};
|
};
|
||||||
|
|
||||||
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
|
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
|
||||||
Slice k = iter_->key();
|
if (!ParseInternalKey(iter_->key(), ikey)) {
|
||||||
ssize_t n = k.size() + iter_->value().size();
|
|
||||||
bytes_counter_ -= n;
|
|
||||||
while (bytes_counter_ < 0) {
|
|
||||||
bytes_counter_ += RandomPeriod();
|
|
||||||
db_->RecordReadSample(k);
|
|
||||||
}
|
|
||||||
if (!ParseInternalKey(k, ikey)) {
|
|
||||||
status_ = Status::Corruption("corrupted internal key in DBIter");
|
status_ = Status::Corruption("corrupted internal key in DBIter");
|
||||||
return false;
|
return false;
|
||||||
} else {
|
} else {
|
||||||
|
@ -146,6 +155,7 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
|
||||||
void DBIter::Next() {
|
void DBIter::Next() {
|
||||||
assert(valid_);
|
assert(valid_);
|
||||||
|
|
||||||
|
gPerfCounters->Inc(ePerfIterNext);
|
||||||
if (direction_ == kReverse) { // Switch directions?
|
if (direction_ == kReverse) { // Switch directions?
|
||||||
direction_ = kForward;
|
direction_ = kForward;
|
||||||
// iter_ is pointing just before the entries for this->key(),
|
// iter_ is pointing just before the entries for this->key(),
|
||||||
|
@ -161,13 +171,12 @@ void DBIter::Next() {
|
||||||
saved_key_.clear();
|
saved_key_.clear();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// saved_key_ already contains the key to skip past.
|
|
||||||
} else {
|
|
||||||
// Store in saved_key_ the current key so we skip it below.
|
|
||||||
SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
FindNextUserEntry(true, &saved_key_);
|
// Temporarily use saved_key_ as storage for key to skip.
|
||||||
|
std::string* skip = &saved_key_;
|
||||||
|
SaveKey(ExtractUserKey(iter_->key()), skip);
|
||||||
|
FindNextUserEntry(true, skip);
|
||||||
}
|
}
|
||||||
|
|
||||||
void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
|
void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
|
||||||
|
@ -177,6 +186,9 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
|
||||||
do {
|
do {
|
||||||
ParsedInternalKey ikey;
|
ParsedInternalKey ikey;
|
||||||
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
|
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
|
||||||
|
if (IsExpiryKey(ikey.type) && NULL!=expiry_
|
||||||
|
&& expiry_->KeyRetirementCallback(ikey))
|
||||||
|
ikey.type=kTypeDeletion;
|
||||||
switch (ikey.type) {
|
switch (ikey.type) {
|
||||||
case kTypeDeletion:
|
case kTypeDeletion:
|
||||||
// Arrange to skip all upcoming entries for this key since
|
// Arrange to skip all upcoming entries for this key since
|
||||||
|
@ -184,6 +196,9 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
|
||||||
SaveKey(ikey.user_key, skip);
|
SaveKey(ikey.user_key, skip);
|
||||||
skipping = true;
|
skipping = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case kTypeValueWriteTime:
|
||||||
|
case kTypeValueExplicitExpiry:
|
||||||
case kTypeValue:
|
case kTypeValue:
|
||||||
if (skipping &&
|
if (skipping &&
|
||||||
user_comparator_->Compare(ikey.user_key, *skip) <= 0) {
|
user_comparator_->Compare(ikey.user_key, *skip) <= 0) {
|
||||||
|
@ -205,6 +220,7 @@ void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
|
||||||
void DBIter::Prev() {
|
void DBIter::Prev() {
|
||||||
assert(valid_);
|
assert(valid_);
|
||||||
|
|
||||||
|
gPerfCounters->Inc(ePerfIterPrev);
|
||||||
if (direction_ == kForward) { // Switch directions?
|
if (direction_ == kForward) { // Switch directions?
|
||||||
// iter_ is pointing at the current entry. Scan backwards until
|
// iter_ is pointing at the current entry. Scan backwards until
|
||||||
// the key changes so we can use the normal reverse scanning code.
|
// the key changes so we can use the normal reverse scanning code.
|
||||||
|
@ -242,6 +258,10 @@ void DBIter::FindPrevUserEntry() {
|
||||||
// We encountered a non-deleted value in entries for previous keys,
|
// We encountered a non-deleted value in entries for previous keys,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if (IsExpiryKey(ikey.type) && NULL!=expiry_
|
||||||
|
&& expiry_->KeyRetirementCallback(ikey))
|
||||||
|
ikey.type=kTypeDeletion;
|
||||||
|
|
||||||
value_type = ikey.type;
|
value_type = ikey.type;
|
||||||
if (value_type == kTypeDeletion) {
|
if (value_type == kTypeDeletion) {
|
||||||
saved_key_.clear();
|
saved_key_.clear();
|
||||||
|
@ -272,11 +292,12 @@ void DBIter::FindPrevUserEntry() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void DBIter::Seek(const Slice& target) {
|
void DBIter::Seek(const Slice& target) {
|
||||||
|
gPerfCounters->Inc(ePerfIterSeek);
|
||||||
direction_ = kForward;
|
direction_ = kForward;
|
||||||
ClearSavedValue();
|
ClearSavedValue();
|
||||||
saved_key_.clear();
|
saved_key_.clear();
|
||||||
AppendInternalKey(
|
AppendInternalKey(
|
||||||
&saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
|
&saved_key_, ParsedInternalKey(target, 0, sequence_, kValueTypeForSeek));
|
||||||
iter_->Seek(saved_key_);
|
iter_->Seek(saved_key_);
|
||||||
if (iter_->Valid()) {
|
if (iter_->Valid()) {
|
||||||
FindNextUserEntry(false, &saved_key_ /* temporary storage */);
|
FindNextUserEntry(false, &saved_key_ /* temporary storage */);
|
||||||
|
@ -286,6 +307,7 @@ void DBIter::Seek(const Slice& target) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void DBIter::SeekToFirst() {
|
void DBIter::SeekToFirst() {
|
||||||
|
gPerfCounters->Inc(ePerfIterSeekFirst);
|
||||||
direction_ = kForward;
|
direction_ = kForward;
|
||||||
ClearSavedValue();
|
ClearSavedValue();
|
||||||
iter_->SeekToFirst();
|
iter_->SeekToFirst();
|
||||||
|
@ -297,6 +319,7 @@ void DBIter::SeekToFirst() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void DBIter::SeekToLast() {
|
void DBIter::SeekToLast() {
|
||||||
|
gPerfCounters->Inc(ePerfIterSeekLast);
|
||||||
direction_ = kReverse;
|
direction_ = kReverse;
|
||||||
ClearSavedValue();
|
ClearSavedValue();
|
||||||
iter_->SeekToLast();
|
iter_->SeekToLast();
|
||||||
|
@ -306,12 +329,13 @@ void DBIter::SeekToLast() {
|
||||||
} // anonymous namespace
|
} // anonymous namespace
|
||||||
|
|
||||||
Iterator* NewDBIterator(
|
Iterator* NewDBIterator(
|
||||||
DBImpl* db,
|
const std::string* dbname,
|
||||||
|
Env* env,
|
||||||
const Comparator* user_key_comparator,
|
const Comparator* user_key_comparator,
|
||||||
Iterator* internal_iter,
|
Iterator* internal_iter,
|
||||||
SequenceNumber sequence,
|
const SequenceNumber& sequence,
|
||||||
uint32_t seed) {
|
const ExpiryModule * expiry) {
|
||||||
return new DBIter(db, user_key_comparator, internal_iter, sequence, seed);
|
return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence, expiry);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
|
@ -7,21 +7,21 @@
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "leveldb/db.h"
|
#include "leveldb/db.h"
|
||||||
|
#include "leveldb/expiry.h"
|
||||||
#include "db/dbformat.h"
|
#include "db/dbformat.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
class DBImpl;
|
|
||||||
|
|
||||||
// Return a new iterator that converts internal keys (yielded by
|
// Return a new iterator that converts internal keys (yielded by
|
||||||
// "*internal_iter") that were live at the specified "sequence" number
|
// "*internal_iter") that were live at the specified "sequence" number
|
||||||
// into appropriate user keys.
|
// into appropriate user keys.
|
||||||
extern Iterator* NewDBIterator(
|
extern Iterator* NewDBIterator(
|
||||||
DBImpl* db,
|
const std::string* dbname,
|
||||||
|
Env* env,
|
||||||
const Comparator* user_key_comparator,
|
const Comparator* user_key_comparator,
|
||||||
Iterator* internal_iter,
|
Iterator* internal_iter,
|
||||||
SequenceNumber sequence,
|
const SequenceNumber& sequence,
|
||||||
uint32_t seed);
|
const ExpiryModule * expiry=NULL);
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
|
|
|
@ -33,11 +33,8 @@ class AtomicCounter {
|
||||||
public:
|
public:
|
||||||
AtomicCounter() : count_(0) { }
|
AtomicCounter() : count_(0) { }
|
||||||
void Increment() {
|
void Increment() {
|
||||||
IncrementBy(1);
|
|
||||||
}
|
|
||||||
void IncrementBy(int count) {
|
|
||||||
MutexLock l(&mu_);
|
MutexLock l(&mu_);
|
||||||
count_ += count;
|
count_++;
|
||||||
}
|
}
|
||||||
int Read() {
|
int Read() {
|
||||||
MutexLock l(&mu_);
|
MutexLock l(&mu_);
|
||||||
|
@ -48,20 +45,13 @@ class AtomicCounter {
|
||||||
count_ = 0;
|
count_ = 0;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
void DelayMilliseconds(int millis) {
|
|
||||||
Env::Default()->SleepForMicroseconds(millis * 1000);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Special Env used to delay background operations
|
// Special Env used to delay background operations
|
||||||
class SpecialEnv : public EnvWrapper {
|
class SpecialEnv : public EnvWrapper {
|
||||||
public:
|
public:
|
||||||
// sstable/log Sync() calls are blocked while this pointer is non-NULL.
|
// sstable Sync() calls are blocked while this pointer is non-NULL.
|
||||||
port::AtomicPointer delay_data_sync_;
|
port::AtomicPointer delay_sstable_sync_;
|
||||||
|
|
||||||
// sstable/log Sync() calls return an error.
|
|
||||||
port::AtomicPointer data_sync_error_;
|
|
||||||
|
|
||||||
// Simulate no-space errors while this pointer is non-NULL.
|
// Simulate no-space errors while this pointer is non-NULL.
|
||||||
port::AtomicPointer no_space_;
|
port::AtomicPointer no_space_;
|
||||||
|
@ -69,37 +59,30 @@ class SpecialEnv : public EnvWrapper {
|
||||||
// Simulate non-writable file system while this pointer is non-NULL
|
// Simulate non-writable file system while this pointer is non-NULL
|
||||||
port::AtomicPointer non_writable_;
|
port::AtomicPointer non_writable_;
|
||||||
|
|
||||||
// Force sync of manifest files to fail while this pointer is non-NULL
|
|
||||||
port::AtomicPointer manifest_sync_error_;
|
|
||||||
|
|
||||||
// Force write to manifest files to fail while this pointer is non-NULL
|
|
||||||
port::AtomicPointer manifest_write_error_;
|
|
||||||
|
|
||||||
bool count_random_reads_;
|
bool count_random_reads_;
|
||||||
AtomicCounter random_read_counter_;
|
AtomicCounter random_read_counter_;
|
||||||
|
|
||||||
|
AtomicCounter sleep_counter_;
|
||||||
|
|
||||||
explicit SpecialEnv(Env* base) : EnvWrapper(base) {
|
explicit SpecialEnv(Env* base) : EnvWrapper(base) {
|
||||||
delay_data_sync_.Release_Store(NULL);
|
delay_sstable_sync_.Release_Store(NULL);
|
||||||
data_sync_error_.Release_Store(NULL);
|
|
||||||
no_space_.Release_Store(NULL);
|
no_space_.Release_Store(NULL);
|
||||||
non_writable_.Release_Store(NULL);
|
non_writable_.Release_Store(NULL);
|
||||||
count_random_reads_ = false;
|
count_random_reads_ = false;
|
||||||
manifest_sync_error_.Release_Store(NULL);
|
|
||||||
manifest_write_error_.Release_Store(NULL);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Status NewWritableFile(const std::string& f, WritableFile** r) {
|
Status NewWritableFile(const std::string& f, WritableFile** r, size_t map_size) {
|
||||||
class DataFile : public WritableFile {
|
class SSTableFile : public WritableFile {
|
||||||
private:
|
private:
|
||||||
SpecialEnv* env_;
|
SpecialEnv* env_;
|
||||||
WritableFile* base_;
|
WritableFile* base_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
DataFile(SpecialEnv* env, WritableFile* base)
|
SSTableFile(SpecialEnv* env, WritableFile* base)
|
||||||
: env_(env),
|
: env_(env),
|
||||||
base_(base) {
|
base_(base) {
|
||||||
}
|
}
|
||||||
~DataFile() { delete base_; }
|
~SSTableFile() { delete base_; }
|
||||||
Status Append(const Slice& data) {
|
Status Append(const Slice& data) {
|
||||||
if (env_->no_space_.Acquire_Load() != NULL) {
|
if (env_->no_space_.Acquire_Load() != NULL) {
|
||||||
// Drop writes on the floor
|
// Drop writes on the floor
|
||||||
|
@ -111,51 +94,21 @@ class SpecialEnv : public EnvWrapper {
|
||||||
Status Close() { return base_->Close(); }
|
Status Close() { return base_->Close(); }
|
||||||
Status Flush() { return base_->Flush(); }
|
Status Flush() { return base_->Flush(); }
|
||||||
Status Sync() {
|
Status Sync() {
|
||||||
if (env_->data_sync_error_.Acquire_Load() != NULL) {
|
while (env_->delay_sstable_sync_.Acquire_Load() != NULL) {
|
||||||
return Status::IOError("simulated data sync error");
|
env_->SleepForMicroseconds(100000);
|
||||||
}
|
|
||||||
while (env_->delay_data_sync_.Acquire_Load() != NULL) {
|
|
||||||
DelayMilliseconds(100);
|
|
||||||
}
|
}
|
||||||
return base_->Sync();
|
return base_->Sync();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
class ManifestFile : public WritableFile {
|
|
||||||
private:
|
|
||||||
SpecialEnv* env_;
|
|
||||||
WritableFile* base_;
|
|
||||||
public:
|
|
||||||
ManifestFile(SpecialEnv* env, WritableFile* b) : env_(env), base_(b) { }
|
|
||||||
~ManifestFile() { delete base_; }
|
|
||||||
Status Append(const Slice& data) {
|
|
||||||
if (env_->manifest_write_error_.Acquire_Load() != NULL) {
|
|
||||||
return Status::IOError("simulated writer error");
|
|
||||||
} else {
|
|
||||||
return base_->Append(data);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Status Close() { return base_->Close(); }
|
|
||||||
Status Flush() { return base_->Flush(); }
|
|
||||||
Status Sync() {
|
|
||||||
if (env_->manifest_sync_error_.Acquire_Load() != NULL) {
|
|
||||||
return Status::IOError("simulated sync error");
|
|
||||||
} else {
|
|
||||||
return base_->Sync();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if (non_writable_.Acquire_Load() != NULL) {
|
if (non_writable_.Acquire_Load() != NULL) {
|
||||||
return Status::IOError("simulated write error");
|
return Status::IOError("simulated write error");
|
||||||
}
|
}
|
||||||
|
|
||||||
Status s = target()->NewWritableFile(f, r);
|
Status s = target()->NewWritableFile(f, r, 2<<20);
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
if (strstr(f.c_str(), ".ldb") != NULL ||
|
if (strstr(f.c_str(), ".sst") != NULL) {
|
||||||
strstr(f.c_str(), ".log") != NULL) {
|
*r = new SSTableFile(this, *r);
|
||||||
*r = new DataFile(this, *r);
|
|
||||||
} else if (strstr(f.c_str(), "MANIFEST") != NULL) {
|
|
||||||
*r = new ManifestFile(this, *r);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
|
@ -184,6 +137,11 @@ class SpecialEnv : public EnvWrapper {
|
||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual void SleepForMicroseconds(int micros) {
|
||||||
|
sleep_counter_.Increment();
|
||||||
|
target()->SleepForMicroseconds(micros);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class DBTest {
|
class DBTest {
|
||||||
|
@ -193,7 +151,6 @@ class DBTest {
|
||||||
// Sequence of option configurations to try
|
// Sequence of option configurations to try
|
||||||
enum OptionConfig {
|
enum OptionConfig {
|
||||||
kDefault,
|
kDefault,
|
||||||
kReuse,
|
|
||||||
kFilter,
|
kFilter,
|
||||||
kUncompressed,
|
kUncompressed,
|
||||||
kEnd
|
kEnd
|
||||||
|
@ -209,7 +166,7 @@ class DBTest {
|
||||||
|
|
||||||
DBTest() : option_config_(kDefault),
|
DBTest() : option_config_(kDefault),
|
||||||
env_(new SpecialEnv(Env::Default())) {
|
env_(new SpecialEnv(Env::Default())) {
|
||||||
filter_policy_ = NewBloomFilterPolicy(10);
|
filter_policy_ = NewBloomFilterPolicy2(16);
|
||||||
dbname_ = test::TmpDir() + "/db_test";
|
dbname_ = test::TmpDir() + "/db_test";
|
||||||
DestroyDB(dbname_, Options());
|
DestroyDB(dbname_, Options());
|
||||||
db_ = NULL;
|
db_ = NULL;
|
||||||
|
@ -238,11 +195,7 @@ class DBTest {
|
||||||
// Return the current option configuration.
|
// Return the current option configuration.
|
||||||
Options CurrentOptions() {
|
Options CurrentOptions() {
|
||||||
Options options;
|
Options options;
|
||||||
options.reuse_logs = false;
|
|
||||||
switch (option_config_) {
|
switch (option_config_) {
|
||||||
case kReuse:
|
|
||||||
options.reuse_logs = true;
|
|
||||||
break;
|
|
||||||
case kFilter:
|
case kFilter:
|
||||||
options.filter_policy = filter_policy_;
|
options.filter_policy = filter_policy_;
|
||||||
break;
|
break;
|
||||||
|
@ -290,6 +243,23 @@ class DBTest {
|
||||||
return DB::Open(opts, dbname_, &db_);
|
return DB::Open(opts, dbname_, &db_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status DoubleOpen(Options* options = NULL) {
|
||||||
|
DB * db_fail;
|
||||||
|
delete db_;
|
||||||
|
db_ = NULL;
|
||||||
|
Options opts, opts2;
|
||||||
|
if (options != NULL) {
|
||||||
|
opts = *options;
|
||||||
|
} else {
|
||||||
|
opts = CurrentOptions();
|
||||||
|
opts.create_if_missing = true;
|
||||||
|
}
|
||||||
|
last_options_ = opts;
|
||||||
|
|
||||||
|
DB::Open(opts, dbname_, &db_);
|
||||||
|
return DB::Open(opts2, dbname_, &db_fail);
|
||||||
|
}
|
||||||
|
|
||||||
Status Put(const std::string& k, const std::string& v) {
|
Status Put(const std::string& k, const std::string& v) {
|
||||||
return db_->Put(WriteOptions(), k, v);
|
return db_->Put(WriteOptions(), k, v);
|
||||||
}
|
}
|
||||||
|
@ -311,6 +281,20 @@ class DBTest {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string GetNoCache(const std::string& k, const Snapshot* snapshot = NULL) {
|
||||||
|
ReadOptions options;
|
||||||
|
options.snapshot = snapshot;
|
||||||
|
options.fill_cache=false;
|
||||||
|
std::string result;
|
||||||
|
Status s = db_->Get(options, k, &result);
|
||||||
|
if (s.IsNotFound()) {
|
||||||
|
result = "NOT_FOUND";
|
||||||
|
} else if (!s.ok()) {
|
||||||
|
result = s.ToString();
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
// Return a string that contains all key,value pairs in order,
|
// Return a string that contains all key,value pairs in order,
|
||||||
// formatted like "(k1->v1)(k2->v2)".
|
// formatted like "(k1->v1)(k2->v2)".
|
||||||
std::string Contents() {
|
std::string Contents() {
|
||||||
|
@ -326,7 +310,7 @@ class DBTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check reverse iteration results are the reverse of forward results
|
// Check reverse iteration results are the reverse of forward results
|
||||||
size_t matched = 0;
|
int matched = 0;
|
||||||
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
|
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
|
||||||
ASSERT_LT(matched, forward.size());
|
ASSERT_LT(matched, forward.size());
|
||||||
ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
|
ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
|
||||||
|
@ -340,7 +324,7 @@ class DBTest {
|
||||||
|
|
||||||
std::string AllEntriesFor(const Slice& user_key) {
|
std::string AllEntriesFor(const Slice& user_key) {
|
||||||
Iterator* iter = dbfull()->TEST_NewInternalIterator();
|
Iterator* iter = dbfull()->TEST_NewInternalIterator();
|
||||||
InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
|
InternalKey target(user_key, 0, kMaxSequenceNumber, kTypeValue);
|
||||||
iter->Seek(target.Encode());
|
iter->Seek(target.Encode());
|
||||||
std::string result;
|
std::string result;
|
||||||
if (!iter->status().ok()) {
|
if (!iter->status().ok()) {
|
||||||
|
@ -361,6 +345,8 @@ class DBTest {
|
||||||
}
|
}
|
||||||
first = false;
|
first = false;
|
||||||
switch (ikey.type) {
|
switch (ikey.type) {
|
||||||
|
case kTypeValueWriteTime:
|
||||||
|
case kTypeValueExplicitExpiry:
|
||||||
case kTypeValue:
|
case kTypeValue:
|
||||||
result += iter->value().ToString();
|
result += iter->value().ToString();
|
||||||
break;
|
break;
|
||||||
|
@ -474,38 +460,6 @@ class DBTest {
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DeleteAnSSTFile() {
|
|
||||||
std::vector<std::string> filenames;
|
|
||||||
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
|
|
||||||
uint64_t number;
|
|
||||||
FileType type;
|
|
||||||
for (size_t i = 0; i < filenames.size(); i++) {
|
|
||||||
if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
|
|
||||||
ASSERT_OK(env_->DeleteFile(TableFileName(dbname_, number)));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns number of files renamed.
|
|
||||||
int RenameLDBToSST() {
|
|
||||||
std::vector<std::string> filenames;
|
|
||||||
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
|
|
||||||
uint64_t number;
|
|
||||||
FileType type;
|
|
||||||
int files_renamed = 0;
|
|
||||||
for (size_t i = 0; i < filenames.size(); i++) {
|
|
||||||
if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
|
|
||||||
const std::string from = TableFileName(dbname_, number);
|
|
||||||
const std::string to = SSTTableFileName(dbname_, number);
|
|
||||||
ASSERT_OK(env_->RenameFile(from, to));
|
|
||||||
files_renamed++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return files_renamed;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
TEST(DBTest, Empty) {
|
TEST(DBTest, Empty) {
|
||||||
|
@ -515,6 +469,11 @@ TEST(DBTest, Empty) {
|
||||||
} while (ChangeOptions());
|
} while (ChangeOptions());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(DBTest, DoubleOpen)
|
||||||
|
{
|
||||||
|
ASSERT_NOTOK(DoubleOpen());
|
||||||
|
}
|
||||||
|
|
||||||
TEST(DBTest, ReadWrite) {
|
TEST(DBTest, ReadWrite) {
|
||||||
do {
|
do {
|
||||||
ASSERT_OK(Put("foo", "v1"));
|
ASSERT_OK(Put("foo", "v1"));
|
||||||
|
@ -547,11 +506,11 @@ TEST(DBTest, GetFromImmutableLayer) {
|
||||||
ASSERT_OK(Put("foo", "v1"));
|
ASSERT_OK(Put("foo", "v1"));
|
||||||
ASSERT_EQ("v1", Get("foo"));
|
ASSERT_EQ("v1", Get("foo"));
|
||||||
|
|
||||||
env_->delay_data_sync_.Release_Store(env_); // Block sync calls
|
env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls
|
||||||
Put("k1", std::string(100000, 'x')); // Fill memtable
|
Put("k1", std::string(100000, 'x')); // Fill memtable
|
||||||
Put("k2", std::string(100000, 'y')); // Trigger compaction
|
Put("k2", std::string(100000, 'y')); // Trigger compaction
|
||||||
ASSERT_EQ("v1", Get("foo"));
|
ASSERT_EQ("v1", Get("foo"));
|
||||||
env_->delay_data_sync_.Release_Store(NULL); // Release sync calls
|
env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls
|
||||||
} while (ChangeOptions());
|
} while (ChangeOptions());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -563,17 +522,6 @@ TEST(DBTest, GetFromVersions) {
|
||||||
} while (ChangeOptions());
|
} while (ChangeOptions());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(DBTest, GetMemUsage) {
|
|
||||||
do {
|
|
||||||
ASSERT_OK(Put("foo", "v1"));
|
|
||||||
std::string val;
|
|
||||||
ASSERT_TRUE(db_->GetProperty("leveldb.approximate-memory-usage", &val));
|
|
||||||
int mem_usage = atoi(val.c_str());
|
|
||||||
ASSERT_GT(mem_usage, 0);
|
|
||||||
ASSERT_LT(mem_usage, 5*1024*1024);
|
|
||||||
} while (ChangeOptions());
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(DBTest, GetSnapshot) {
|
TEST(DBTest, GetSnapshot) {
|
||||||
do {
|
do {
|
||||||
// Try with both a short key and a long key
|
// Try with both a short key and a long key
|
||||||
|
@ -634,6 +582,9 @@ TEST(DBTest, GetPicksCorrectFile) {
|
||||||
} while (ChangeOptions());
|
} while (ChangeOptions());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// riak does not execute compaction due to reads
|
||||||
|
|
||||||
TEST(DBTest, GetEncountersEmptyLevel) {
|
TEST(DBTest, GetEncountersEmptyLevel) {
|
||||||
do {
|
do {
|
||||||
// Arrange for the following to happen:
|
// Arrange for the following to happen:
|
||||||
|
@ -642,7 +593,7 @@ TEST(DBTest, GetEncountersEmptyLevel) {
|
||||||
// * sstable B in level 2
|
// * sstable B in level 2
|
||||||
// Then do enough Get() calls to arrange for an automatic compaction
|
// Then do enough Get() calls to arrange for an automatic compaction
|
||||||
// of sstable A. A bug would cause the compaction to be marked as
|
// of sstable A. A bug would cause the compaction to be marked as
|
||||||
// occurring at level 1 (instead of the correct level 0).
|
// occuring at level 1 (instead of the correct level 0).
|
||||||
|
|
||||||
// Step 1: First place sstables in levels 0 and 2
|
// Step 1: First place sstables in levels 0 and 2
|
||||||
int compaction_count = 0;
|
int compaction_count = 0;
|
||||||
|
@ -667,11 +618,12 @@ TEST(DBTest, GetEncountersEmptyLevel) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Step 4: Wait for compaction to finish
|
// Step 4: Wait for compaction to finish
|
||||||
DelayMilliseconds(1000);
|
env_->SleepForMicroseconds(1000000);
|
||||||
|
|
||||||
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
|
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
|
||||||
} while (ChangeOptions());
|
} while (ChangeOptions());
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
TEST(DBTest, IterEmpty) {
|
TEST(DBTest, IterEmpty) {
|
||||||
Iterator* iter = db_->NewIterator(ReadOptions());
|
Iterator* iter = db_->NewIterator(ReadOptions());
|
||||||
|
@ -996,7 +948,8 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
|
||||||
dbfull()->TEST_CompactRange(0, NULL, NULL);
|
dbfull()->TEST_CompactRange(0, NULL, NULL);
|
||||||
|
|
||||||
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
|
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
|
||||||
ASSERT_GT(NumTableFilesAtLevel(1), 1);
|
// not riak ASSERT_GT(NumTableFilesAtLevel(1), 1);
|
||||||
|
ASSERT_EQ(NumTableFilesAtLevel(1), 1); // yes riak
|
||||||
for (int i = 0; i < 80; i++) {
|
for (int i = 0; i < 80; i++) {
|
||||||
ASSERT_EQ(Get(Key(i)), values[i]);
|
ASSERT_EQ(Get(Key(i)), values[i]);
|
||||||
}
|
}
|
||||||
|
@ -1010,7 +963,8 @@ TEST(DBTest, RepeatedWritesToSameKey) {
|
||||||
|
|
||||||
// We must have at most one file per level except for level-0,
|
// We must have at most one file per level except for level-0,
|
||||||
// which may have up to kL0_StopWritesTrigger files.
|
// which may have up to kL0_StopWritesTrigger files.
|
||||||
const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger;
|
// ... basho adds *2 since level-1 is now overlapped too
|
||||||
|
const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger*2;
|
||||||
|
|
||||||
Random rnd(301);
|
Random rnd(301);
|
||||||
std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
|
std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
|
||||||
|
@ -1054,11 +1008,13 @@ TEST(DBTest, SparseMerge) {
|
||||||
|
|
||||||
// Compactions should not cause us to create a situation where
|
// Compactions should not cause us to create a situation where
|
||||||
// a file overlaps too much data at the next level.
|
// a file overlaps too much data at the next level.
|
||||||
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
|
// 07/10/14 matthewv - we overlap first two levels. sparse test not appropriate there,
|
||||||
|
// and we set overlaps into 100s of megabytes as "normal"
|
||||||
|
// ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
|
||||||
dbfull()->TEST_CompactRange(0, NULL, NULL);
|
dbfull()->TEST_CompactRange(0, NULL, NULL);
|
||||||
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
|
// ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
|
||||||
dbfull()->TEST_CompactRange(1, NULL, NULL);
|
dbfull()->TEST_CompactRange(1, NULL, NULL);
|
||||||
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
|
// ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
|
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
|
||||||
|
@ -1096,14 +1052,6 @@ TEST(DBTest, ApproximateSizes) {
|
||||||
// 0 because GetApproximateSizes() does not account for memtable space
|
// 0 because GetApproximateSizes() does not account for memtable space
|
||||||
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
|
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
|
||||||
|
|
||||||
if (options.reuse_logs) {
|
|
||||||
// Recovery will reuse memtable, and GetApproximateSizes() does not
|
|
||||||
// account for memtable usage;
|
|
||||||
Reopen(&options);
|
|
||||||
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check sizes across recovery by reopening a few times
|
// Check sizes across recovery by reopening a few times
|
||||||
for (int run = 0; run < 3; run++) {
|
for (int run = 0; run < 3; run++) {
|
||||||
Reopen(&options);
|
Reopen(&options);
|
||||||
|
@ -1147,11 +1095,6 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
|
||||||
ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
|
ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
|
||||||
ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
|
ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
|
||||||
|
|
||||||
if (options.reuse_logs) {
|
|
||||||
// Need to force a memtable compaction since recovery does not do so.
|
|
||||||
ASSERT_OK(dbfull()->TEST_CompactMemTable());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check sizes across recovery by reopening a few times
|
// Check sizes across recovery by reopening a few times
|
||||||
for (int run = 0; run < 3; run++) {
|
for (int run = 0; run < 3; run++) {
|
||||||
Reopen(&options);
|
Reopen(&options);
|
||||||
|
@ -1223,7 +1166,7 @@ TEST(DBTest, Snapshot) {
|
||||||
ASSERT_EQ("v4", Get("foo"));
|
ASSERT_EQ("v4", Get("foo"));
|
||||||
} while (ChangeOptions());
|
} while (ChangeOptions());
|
||||||
}
|
}
|
||||||
|
#if 0 // trouble under Riak due to assumed file sizes
|
||||||
TEST(DBTest, HiddenValuesAreRemoved) {
|
TEST(DBTest, HiddenValuesAreRemoved) {
|
||||||
do {
|
do {
|
||||||
Random rnd(301);
|
Random rnd(301);
|
||||||
|
@ -1254,7 +1197,7 @@ TEST(DBTest, HiddenValuesAreRemoved) {
|
||||||
ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
|
ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
|
||||||
} while (ChangeOptions());
|
} while (ChangeOptions());
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
TEST(DBTest, DeletionMarkers1) {
|
TEST(DBTest, DeletionMarkers1) {
|
||||||
Put("foo", "v1");
|
Put("foo", "v1");
|
||||||
ASSERT_OK(dbfull()->TEST_CompactMemTable());
|
ASSERT_OK(dbfull()->TEST_CompactMemTable());
|
||||||
|
@ -1271,13 +1214,14 @@ TEST(DBTest, DeletionMarkers1) {
|
||||||
Delete("foo");
|
Delete("foo");
|
||||||
Put("foo", "v2");
|
Put("foo", "v2");
|
||||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
|
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
|
||||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
|
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // stays at level 0
|
||||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
|
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // riak 1.3, DEL merged out by BuildTable
|
||||||
Slice z("z");
|
Slice z("z");
|
||||||
dbfull()->TEST_CompactRange(last-2, NULL, &z);
|
dbfull()->TEST_CompactRange(0, NULL, &z);
|
||||||
|
dbfull()->TEST_CompactRange(1, NULL, &z);
|
||||||
// DEL eliminated, but v1 remains because we aren't compacting that level
|
// DEL eliminated, but v1 remains because we aren't compacting that level
|
||||||
// (DEL can be eliminated because v2 hides v1).
|
// (DEL can be eliminated because v2 hides v1).
|
||||||
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
|
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); // Riak 1.4 has merged to level 1
|
||||||
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
|
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
|
||||||
// Merging last-1 w/ last, so we are the base level for "foo", so
|
// Merging last-1 w/ last, so we are the base level for "foo", so
|
||||||
// DEL is removed. (as is v1).
|
// DEL is removed. (as is v1).
|
||||||
|
@ -1289,39 +1233,47 @@ TEST(DBTest, DeletionMarkers2) {
|
||||||
ASSERT_OK(dbfull()->TEST_CompactMemTable());
|
ASSERT_OK(dbfull()->TEST_CompactMemTable());
|
||||||
const int last = config::kMaxMemCompactLevel;
|
const int last = config::kMaxMemCompactLevel;
|
||||||
ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
|
ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
|
||||||
|
dbfull()->TEST_CompactRange(0, NULL, NULL);
|
||||||
|
ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
|
||||||
|
ASSERT_EQ(NumTableFilesAtLevel(last-1), 0);
|
||||||
|
|
||||||
// Place a table at level last-1 to prevent merging with preceding mutation
|
// Place a table at level last-1 to prevent merging with preceding mutation
|
||||||
Put("a", "begin");
|
Put("a", "begin");
|
||||||
Put("z", "end");
|
Put("z", "end");
|
||||||
dbfull()->TEST_CompactMemTable();
|
dbfull()->TEST_CompactMemTable(); // goes to last-1
|
||||||
ASSERT_EQ(NumTableFilesAtLevel(last), 1);
|
|
||||||
ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
|
ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
|
||||||
|
|
||||||
Delete("foo");
|
Delete("foo");
|
||||||
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
|
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
|
||||||
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
|
ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level 0
|
||||||
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
|
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
|
||||||
dbfull()->TEST_CompactRange(last-2, NULL, NULL);
|
dbfull()->TEST_CompactRange(0, NULL, NULL); // Riak overlaps level 1
|
||||||
// DEL kept: "last" file overlaps
|
// DEL kept: "last" file overlaps
|
||||||
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
|
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
|
||||||
dbfull()->TEST_CompactRange(last-1, NULL, NULL);
|
|
||||||
// Merging last-1 w/ last, so we are the base level for "foo", so
|
// Merging last-1 w/ last, so we are the base level for "foo", so
|
||||||
// DEL is removed. (as is v1).
|
// DEL is removed. (as is v1).
|
||||||
|
dbfull()->TEST_CompactRange(1, NULL, NULL);
|
||||||
|
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
|
||||||
|
|
||||||
|
dbfull()->TEST_CompactRange(2, NULL, NULL);
|
||||||
ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
|
ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(DBTest, OverlapInLevel0) {
|
TEST(DBTest, OverlapInLevel0) {
|
||||||
do {
|
do {
|
||||||
ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
|
ASSERT_EQ(config::kMaxMemCompactLevel, 3) << "Fix test to match config";
|
||||||
|
|
||||||
// Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
|
// Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
|
||||||
ASSERT_OK(Put("100", "v100"));
|
ASSERT_OK(Put("100", "v100"));
|
||||||
ASSERT_OK(Put("999", "v999"));
|
ASSERT_OK(Put("999", "v999"));
|
||||||
dbfull()->TEST_CompactMemTable();
|
dbfull()->TEST_CompactMemTable();
|
||||||
|
dbfull()->TEST_CompactRange(0, NULL, NULL);
|
||||||
|
dbfull()->TEST_CompactRange(1, NULL, NULL);
|
||||||
ASSERT_OK(Delete("100"));
|
ASSERT_OK(Delete("100"));
|
||||||
ASSERT_OK(Delete("999"));
|
ASSERT_OK(Delete("999"));
|
||||||
dbfull()->TEST_CompactMemTable();
|
dbfull()->TEST_CompactMemTable();
|
||||||
ASSERT_EQ("0,1,1", FilesPerLevel());
|
dbfull()->TEST_CompactRange(0, NULL, NULL);
|
||||||
|
ASSERT_EQ("0,0,1,1", FilesPerLevel());
|
||||||
|
|
||||||
// Make files spanning the following ranges in level-0:
|
// Make files spanning the following ranges in level-0:
|
||||||
// files[0] 200 .. 900
|
// files[0] 200 .. 900
|
||||||
|
@ -1334,7 +1286,7 @@ TEST(DBTest, OverlapInLevel0) {
|
||||||
ASSERT_OK(Put("600", "v600"));
|
ASSERT_OK(Put("600", "v600"));
|
||||||
ASSERT_OK(Put("900", "v900"));
|
ASSERT_OK(Put("900", "v900"));
|
||||||
dbfull()->TEST_CompactMemTable();
|
dbfull()->TEST_CompactMemTable();
|
||||||
ASSERT_EQ("2,1,1", FilesPerLevel());
|
ASSERT_EQ("2,0,1,1", FilesPerLevel());
|
||||||
|
|
||||||
// Compact away the placeholder files we created initially
|
// Compact away the placeholder files we created initially
|
||||||
dbfull()->TEST_CompactRange(1, NULL, NULL);
|
dbfull()->TEST_CompactRange(1, NULL, NULL);
|
||||||
|
@ -1364,7 +1316,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_a) {
|
||||||
Reopen();
|
Reopen();
|
||||||
Reopen();
|
Reopen();
|
||||||
ASSERT_EQ("(a->v)", Contents());
|
ASSERT_EQ("(a->v)", Contents());
|
||||||
DelayMilliseconds(1000); // Wait for compaction to finish
|
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
|
||||||
ASSERT_EQ("(a->v)", Contents());
|
ASSERT_EQ("(a->v)", Contents());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1380,7 +1332,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
|
||||||
Put("","");
|
Put("","");
|
||||||
Reopen();
|
Reopen();
|
||||||
Put("","");
|
Put("","");
|
||||||
DelayMilliseconds(1000); // Wait for compaction to finish
|
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
|
||||||
Reopen();
|
Reopen();
|
||||||
Put("d","dv");
|
Put("d","dv");
|
||||||
Reopen();
|
Reopen();
|
||||||
|
@ -1390,7 +1342,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
|
||||||
Delete("b");
|
Delete("b");
|
||||||
Reopen();
|
Reopen();
|
||||||
ASSERT_EQ("(->)(c->cv)", Contents());
|
ASSERT_EQ("(->)(c->cv)", Contents());
|
||||||
DelayMilliseconds(1000); // Wait for compaction to finish
|
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
|
||||||
ASSERT_EQ("(->)(c->cv)", Contents());
|
ASSERT_EQ("(->)(c->cv)", Contents());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1473,37 +1425,37 @@ TEST(DBTest, CustomComparator) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(DBTest, ManualCompaction) {
|
TEST(DBTest, ManualCompaction) {
|
||||||
ASSERT_EQ(config::kMaxMemCompactLevel, 2)
|
ASSERT_EQ(config::kMaxMemCompactLevel, 3)
|
||||||
<< "Need to update this test to match kMaxMemCompactLevel";
|
<< "Need to update this test to match kMaxMemCompactLevel";
|
||||||
|
|
||||||
MakeTables(3, "p", "q");
|
MakeTables(3, "p", "q");
|
||||||
ASSERT_EQ("1,1,1", FilesPerLevel());
|
ASSERT_EQ("1,0,1,1", FilesPerLevel());
|
||||||
|
|
||||||
// Compaction range falls before files
|
// Compaction range falls before files
|
||||||
Compact("", "c");
|
Compact("", "c");
|
||||||
ASSERT_EQ("1,1,1", FilesPerLevel());
|
ASSERT_EQ("0,1,1,1", FilesPerLevel());
|
||||||
|
|
||||||
// Compaction range falls after files
|
// Compaction range falls after files
|
||||||
Compact("r", "z");
|
Compact("r", "z");
|
||||||
ASSERT_EQ("1,1,1", FilesPerLevel());
|
ASSERT_EQ("0,1,1,1", FilesPerLevel());
|
||||||
|
|
||||||
// Compaction range overlaps files
|
// Compaction range overlaps files
|
||||||
Compact("p1", "p9");
|
Compact("p1", "p9");
|
||||||
ASSERT_EQ("0,0,1", FilesPerLevel());
|
ASSERT_EQ("0,0,0,1", FilesPerLevel());
|
||||||
|
|
||||||
// Populate a different range
|
// Populate a different range
|
||||||
MakeTables(3, "c", "e");
|
MakeTables(3, "c", "e");
|
||||||
ASSERT_EQ("1,1,2", FilesPerLevel());
|
ASSERT_EQ("1,0,1,2", FilesPerLevel());
|
||||||
|
|
||||||
// Compact just the new range
|
// Compact just the new range
|
||||||
Compact("b", "f");
|
Compact("b", "f");
|
||||||
ASSERT_EQ("0,0,2", FilesPerLevel());
|
ASSERT_EQ("0,0,0,2", FilesPerLevel());
|
||||||
|
|
||||||
// Compact all
|
// Compact all
|
||||||
MakeTables(1, "a", "z");
|
MakeTables(1, "a", "z");
|
||||||
ASSERT_EQ("0,1,2", FilesPerLevel());
|
ASSERT_EQ("0,0,1,2", FilesPerLevel());
|
||||||
db_->CompactRange(NULL, NULL);
|
db_->CompactRange(NULL, NULL);
|
||||||
ASSERT_EQ("0,0,1", FilesPerLevel());
|
ASSERT_EQ("0,0,0,1", FilesPerLevel());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(DBTest, DBOpen_Options) {
|
TEST(DBTest, DBOpen_Options) {
|
||||||
|
@ -1545,12 +1497,6 @@ TEST(DBTest, DBOpen_Options) {
|
||||||
db = NULL;
|
db = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(DBTest, Locking) {
|
|
||||||
DB* db2 = NULL;
|
|
||||||
Status s = DB::Open(CurrentOptions(), dbname_, &db2);
|
|
||||||
ASSERT_TRUE(!s.ok()) << "Locking did not prevent re-opening db";
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check that number of files does not grow when we are out of space
|
// Check that number of files does not grow when we are out of space
|
||||||
TEST(DBTest, NoSpace) {
|
TEST(DBTest, NoSpace) {
|
||||||
Options options = CurrentOptions();
|
Options options = CurrentOptions();
|
||||||
|
@ -1562,15 +1508,19 @@ TEST(DBTest, NoSpace) {
|
||||||
Compact("a", "z");
|
Compact("a", "z");
|
||||||
const int num_files = CountFiles();
|
const int num_files = CountFiles();
|
||||||
env_->no_space_.Release_Store(env_); // Force out-of-space errors
|
env_->no_space_.Release_Store(env_); // Force out-of-space errors
|
||||||
for (int i = 0; i < 10; i++) {
|
env_->sleep_counter_.Reset();
|
||||||
|
for (int i = 0; i < 5; i++) {
|
||||||
for (int level = 0; level < config::kNumLevels-1; level++) {
|
for (int level = 0; level < config::kNumLevels-1; level++) {
|
||||||
dbfull()->TEST_CompactRange(level, NULL, NULL);
|
dbfull()->TEST_CompactRange(level, NULL, NULL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
env_->no_space_.Release_Store(NULL);
|
env_->no_space_.Release_Store(NULL);
|
||||||
ASSERT_LT(CountFiles(), num_files + 3);
|
ASSERT_LT(CountFiles(), num_files + 3);
|
||||||
}
|
|
||||||
|
|
||||||
|
// Check that compaction attempts slept after errors
|
||||||
|
ASSERT_GE(env_->sleep_counter_.Read(), 5);
|
||||||
|
}
|
||||||
|
#if 0
|
||||||
TEST(DBTest, NonWritableFileSystem) {
|
TEST(DBTest, NonWritableFileSystem) {
|
||||||
Options options = CurrentOptions();
|
Options options = CurrentOptions();
|
||||||
options.write_buffer_size = 1000;
|
options.write_buffer_size = 1000;
|
||||||
|
@ -1584,119 +1534,13 @@ TEST(DBTest, NonWritableFileSystem) {
|
||||||
fprintf(stderr, "iter %d; errors %d\n", i, errors);
|
fprintf(stderr, "iter %d; errors %d\n", i, errors);
|
||||||
if (!Put("foo", big).ok()) {
|
if (!Put("foo", big).ok()) {
|
||||||
errors++;
|
errors++;
|
||||||
DelayMilliseconds(100);
|
env_->SleepForMicroseconds(100000);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ASSERT_GT(errors, 0);
|
ASSERT_GT(errors, 0);
|
||||||
env_->non_writable_.Release_Store(NULL);
|
env_->non_writable_.Release_Store(NULL);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
// Once a log sync has failed, the DB is expected to refuse all later writes,
// including non-sync writes issued after syncing works again.
TEST(DBTest, WriteSyncError) {
  // Step 1: reopen on the test env and make log sync calls fail.
  Options opts = CurrentOptions();
  opts.env = env_;
  Reopen(&opts);
  env_->data_sync_error_.Release_Store(env_);

  // Step 2: a normal (non-sync) write should still succeed.
  WriteOptions write_opts;
  ASSERT_OK(db_->Put(write_opts, "k1", "v1"));
  ASSERT_EQ("v1", Get("k1"));

  // Step 3: a sync write should fail and must not make k2 visible.
  write_opts.sync = true;
  const Status synced_put = db_->Put(write_opts, "k2", "v2");
  ASSERT_TRUE(!synced_put.ok());
  ASSERT_EQ("v1", Get("k1"));
  ASSERT_EQ("NOT_FOUND", Get("k2"));

  // Step 4: let sync behave normally again.
  env_->data_sync_error_.Release_Store(NULL);

  // Step 5: a non-sync write should now fail as well.
  write_opts.sync = false;
  const Status later_put = db_->Put(write_opts, "k3", "v3");
  ASSERT_TRUE(!later_put.ok());
  ASSERT_EQ("v1", Get("k1"));
  ASSERT_EQ("NOT_FOUND", Get("k2"));
  ASSERT_EQ("NOT_FOUND", Get("k3"));
}
|
|
||||||
|
|
||||||
// Guards against this failure sequence:
//   (a) Compaction produces file F
//   (b) Log record containing F is written to MANIFEST file, but Sync() fails
//   (c) GC deletes F
//   (d) After reopening DB, reads fail since deleted F is named in log record
TEST(DBTest, ManifestWriteError) {
  // Two passes over the same scenario.  The first injects a MANIFEST sync
  // error; the second injects a MANIFEST write error, so the log record
  // never makes it to the MANIFEST file at all.
  port::AtomicPointer* const fault_flags[2] = {
    &env_->manifest_sync_error_,
    &env_->manifest_write_error_
  };

  for (int pass = 0; pass < 2; pass++) {
    port::AtomicPointer* const fault = fault_flags[pass];

    // Start from a fresh database holding foo=>bar.
    Options opts = CurrentOptions();
    opts.env = env_;
    opts.create_if_missing = true;
    opts.error_if_exists = false;
    DestroyAndReopen(&opts);
    ASSERT_OK(Put("foo", "bar"));
    ASSERT_EQ("bar", Get("foo"));

    // Memtable compaction (expected to succeed).
    dbfull()->TEST_CompactMemTable();
    ASSERT_EQ("bar", Get("foo"));
    const int last_level = config::kMaxMemCompactLevel;
    // foo=>bar should now sit in last_level.
    ASSERT_EQ(NumTableFilesAtLevel(last_level), 1);

    // Merging compaction with the fault armed (expected to fail).
    fault->Release_Store(env_);
    dbfull()->TEST_CompactRange(last_level, NULL, NULL);
    ASSERT_EQ("bar", Get("foo"));

    // Recovery with the fault cleared: no data may be lost.
    fault->Release_Store(NULL);
    Reopen(&opts);
    ASSERT_EQ("bar", Get("foo"));
  }
}
|
|
||||||
|
|
||||||
// With paranoid_checks enabled, reopening after an sstable file has been
// deleted must fail with a status whose text contains "issing".
TEST(DBTest, MissingSSTFile) {
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_EQ("bar", Get("foo"));

  // Dump the memtable to disk; the value must remain readable afterwards.
  dbfull()->TEST_CompactMemTable();
  ASSERT_EQ("bar", Get("foo"));

  Close();
  ASSERT_TRUE(DeleteAnSSTFile());

  Options paranoid_opts = CurrentOptions();
  paranoid_opts.paranoid_checks = true;
  Status reopen_status = TryReopen(&paranoid_opts);
  ASSERT_TRUE(!reopen_status.ok());

  // "issing" matches either capitalisation of the word "missing".
  const std::string message = reopen_status.ToString();
  ASSERT_TRUE(message.find("issing") != std::string::npos)
      << message;
}
|
|
||||||
|
|
||||||
// After table files are renamed by RenameLDBToSST(), a paranoid reopen must
// still succeed and the stored value must still be readable.
TEST(DBTest, StillReadSST) {
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_EQ("bar", Get("foo"));

  // Dump the memtable to disk.
  dbfull()->TEST_CompactMemTable();
  ASSERT_EQ("bar", Get("foo"));
  Close();

  // At least one file must have been renamed for the test to be meaningful.
  ASSERT_GT(RenameLDBToSST(), 0);

  Options paranoid_opts = CurrentOptions();
  paranoid_opts.paranoid_checks = true;
  Status reopen_status = TryReopen(&paranoid_opts);
  ASSERT_TRUE(reopen_status.ok());
  ASSERT_EQ("bar", Get("foo"));
}
|
|
||||||
|
|
||||||
TEST(DBTest, FilesDeletedAfterCompaction) {
|
TEST(DBTest, FilesDeletedAfterCompaction) {
|
||||||
ASSERT_OK(Put("foo", "v2"));
|
ASSERT_OK(Put("foo", "v2"));
|
||||||
Compact("a", "z");
|
Compact("a", "z");
|
||||||
|
@ -1713,7 +1557,7 @@ TEST(DBTest, BloomFilter) {
|
||||||
Options options = CurrentOptions();
|
Options options = CurrentOptions();
|
||||||
options.env = env_;
|
options.env = env_;
|
||||||
options.block_cache = NewLRUCache(0); // Prevent cache hits
|
options.block_cache = NewLRUCache(0); // Prevent cache hits
|
||||||
options.filter_policy = NewBloomFilterPolicy(10);
|
options.filter_policy = NewBloomFilterPolicy2(16);
|
||||||
Reopen(&options);
|
Reopen(&options);
|
||||||
|
|
||||||
// Populate multiple layers
|
// Populate multiple layers
|
||||||
|
@ -1728,12 +1572,12 @@ TEST(DBTest, BloomFilter) {
|
||||||
dbfull()->TEST_CompactMemTable();
|
dbfull()->TEST_CompactMemTable();
|
||||||
|
|
||||||
// Prevent auto compactions triggered by seeks
|
// Prevent auto compactions triggered by seeks
|
||||||
env_->delay_data_sync_.Release_Store(env_);
|
env_->delay_sstable_sync_.Release_Store(env_);
|
||||||
|
|
||||||
// Lookup present keys. Should rarely read from small sstable.
|
// Lookup present keys. Should rarely read from small sstable.
|
||||||
env_->random_read_counter_.Reset();
|
env_->random_read_counter_.Reset();
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
ASSERT_EQ(Key(i), Get(Key(i)));
|
ASSERT_EQ(Key(i), GetNoCache(Key(i)));
|
||||||
}
|
}
|
||||||
int reads = env_->random_read_counter_.Read();
|
int reads = env_->random_read_counter_.Read();
|
||||||
fprintf(stderr, "%d present => %d reads\n", N, reads);
|
fprintf(stderr, "%d present => %d reads\n", N, reads);
|
||||||
|
@ -1743,13 +1587,13 @@ TEST(DBTest, BloomFilter) {
|
||||||
// Lookup present keys. Should rarely read from either sstable.
|
// Lookup present keys. Should rarely read from either sstable.
|
||||||
env_->random_read_counter_.Reset();
|
env_->random_read_counter_.Reset();
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing"));
|
ASSERT_EQ("NOT_FOUND", GetNoCache(Key(i) + ".missing"));
|
||||||
}
|
}
|
||||||
reads = env_->random_read_counter_.Read();
|
reads = env_->random_read_counter_.Read();
|
||||||
fprintf(stderr, "%d missing => %d reads\n", N, reads);
|
fprintf(stderr, "%d missing => %d reads\n", N, reads);
|
||||||
ASSERT_LE(reads, 3*N/100);
|
ASSERT_LE(reads, 3*N/100);
|
||||||
|
|
||||||
env_->delay_data_sync_.Release_Store(NULL);
|
env_->delay_sstable_sync_.Release_Store(NULL);
|
||||||
Close();
|
Close();
|
||||||
delete options.block_cache;
|
delete options.block_cache;
|
||||||
delete options.filter_policy;
|
delete options.filter_policy;
|
||||||
|
@ -1809,7 +1653,7 @@ static void MTThreadBody(void* arg) {
|
||||||
ASSERT_EQ(k, key);
|
ASSERT_EQ(k, key);
|
||||||
ASSERT_GE(w, 0);
|
ASSERT_GE(w, 0);
|
||||||
ASSERT_LT(w, kNumThreads);
|
ASSERT_LT(w, kNumThreads);
|
||||||
ASSERT_LE(static_cast<uintptr_t>(c), reinterpret_cast<uintptr_t>(
|
ASSERT_LE(c, reinterpret_cast<uintptr_t>(
|
||||||
t->state->counter[w].Acquire_Load()));
|
t->state->counter[w].Acquire_Load()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1834,27 +1678,35 @@ TEST(DBTest, MultiThreaded) {
|
||||||
|
|
||||||
// Start threads
|
// Start threads
|
||||||
MTThread thread[kNumThreads];
|
MTThread thread[kNumThreads];
|
||||||
|
pthread_t tid;
|
||||||
for (int id = 0; id < kNumThreads; id++) {
|
for (int id = 0; id < kNumThreads; id++) {
|
||||||
thread[id].state = &mt;
|
thread[id].state = &mt;
|
||||||
thread[id].id = id;
|
thread[id].id = id;
|
||||||
env_->StartThread(MTThreadBody, &thread[id]);
|
tid=env_->StartThread(MTThreadBody, &thread[id]);
|
||||||
|
pthread_detach(tid);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Let them run for a while
|
// Let them run for a while
|
||||||
DelayMilliseconds(kTestSeconds * 1000);
|
env_->SleepForMicroseconds(kTestSeconds * 1000000);
|
||||||
|
|
||||||
// Stop the threads and wait for them to finish
|
// Stop the threads and wait for them to finish
|
||||||
mt.stop.Release_Store(&mt);
|
mt.stop.Release_Store(&mt);
|
||||||
for (int id = 0; id < kNumThreads; id++) {
|
for (int id = 0; id < kNumThreads; id++) {
|
||||||
while (mt.thread_done[id].Acquire_Load() == NULL) {
|
while (mt.thread_done[id].Acquire_Load() == NULL) {
|
||||||
DelayMilliseconds(100);
|
env_->SleepForMicroseconds(100000);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (ChangeOptions());
|
} while (ChangeOptions());
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
typedef std::map<std::string, std::string> KVMap;
|
struct KVEntry
|
||||||
|
{
|
||||||
|
std::string m_Value;
|
||||||
|
KeyMetaData m_Meta;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef std::map<std::string, KVEntry> KVMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
class ModelDB: public DB {
|
class ModelDB: public DB {
|
||||||
|
@ -1866,14 +1718,21 @@ class ModelDB: public DB {
|
||||||
|
|
||||||
explicit ModelDB(const Options& options): options_(options) { }
|
explicit ModelDB(const Options& options): options_(options) { }
|
||||||
~ModelDB() { }
|
~ModelDB() { }
|
||||||
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
|
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v, const KeyMetaData * meta=NULL) {
|
||||||
return DB::Put(o, k, v);
|
return DB::Put(o, k, v, meta);
|
||||||
}
|
}
|
||||||
virtual Status Delete(const WriteOptions& o, const Slice& key) {
|
virtual Status Delete(const WriteOptions& o, const Slice& key) {
|
||||||
return DB::Delete(o, key);
|
return DB::Delete(o, key);
|
||||||
}
|
}
|
||||||
virtual Status Get(const ReadOptions& options,
|
virtual Status Get(const ReadOptions& options,
|
||||||
const Slice& key, std::string* value) {
|
const Slice& key, std::string* value,
|
||||||
|
KeyMetaData * meta = NULL) {
|
||||||
|
assert(false); // Not implemented
|
||||||
|
return Status::NotFound(key);
|
||||||
|
}
|
||||||
|
virtual Status Get(const ReadOptions& options,
|
||||||
|
const Slice& key, Value* value,
|
||||||
|
KeyMetaData * meta = NULL) {
|
||||||
assert(false); // Not implemented
|
assert(false); // Not implemented
|
||||||
return Status::NotFound(key);
|
return Status::NotFound(key);
|
||||||
}
|
}
|
||||||
|
@ -1901,8 +1760,13 @@ class ModelDB: public DB {
|
||||||
class Handler : public WriteBatch::Handler {
|
class Handler : public WriteBatch::Handler {
|
||||||
public:
|
public:
|
||||||
KVMap* map_;
|
KVMap* map_;
|
||||||
virtual void Put(const Slice& key, const Slice& value) {
|
virtual void Put(const Slice& key, const Slice& value,
|
||||||
(*map_)[key.ToString()] = value.ToString();
|
const ValueType & type, const ExpiryTimeMicros & expiry) {
|
||||||
|
KVEntry ent;
|
||||||
|
ent.m_Value=value.ToString();
|
||||||
|
ent.m_Meta.m_Type=type;
|
||||||
|
ent.m_Meta.m_Expiry=expiry;
|
||||||
|
(*map_)[key.ToString()] = ent;
|
||||||
}
|
}
|
||||||
virtual void Delete(const Slice& key) {
|
virtual void Delete(const Slice& key) {
|
||||||
map_->erase(key.ToString());
|
map_->erase(key.ToString());
|
||||||
|
@ -1948,7 +1812,7 @@ class ModelDB: public DB {
|
||||||
virtual void Next() { ++iter_; }
|
virtual void Next() { ++iter_; }
|
||||||
virtual void Prev() { --iter_; }
|
virtual void Prev() { --iter_; }
|
||||||
virtual Slice key() const { return iter_->first; }
|
virtual Slice key() const { return iter_->first; }
|
||||||
virtual Slice value() const { return iter_->second; }
|
virtual Slice value() const { return iter_->second.m_Value; }
|
||||||
virtual Status status() const { return Status::OK(); }
|
virtual Status status() const { return Status::OK(); }
|
||||||
private:
|
private:
|
||||||
const KVMap* const map_;
|
const KVMap* const map_;
|
||||||
|
@ -2085,6 +1949,44 @@ TEST(DBTest, Randomized) {
|
||||||
} while (ChangeOptions());
|
} while (ChangeOptions());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Empty fixture type: the test harness needs a class to attach the
// SimpleBugs test cases to.
class SimpleBugs {
};
|
||||||
|
|
||||||
|
|
||||||
|
// Regression test: DB::Open created the first recovery log directly,
// which led to it NOT being in the tiered storage location.  The test opens
// a database with tiered fast/slow prefixes configured and requires that the
// open succeeds and yields a non-NULL handle.
TEST(SimpleBugs, TieredRecoveryLog)
{
    // The original author rejected test::TmpDir() + "/leveldb_nontiered" as
    // the name ("nope") and used a bare relative name instead.
    // NOTE(review): presumably because the tiered prefixes are combined with
    // dbname -- confirm.
    std::string dbname = "leveldb";
    std::string fastname = test::TmpDir() + "/leveldb_fast";
    std::string slowname = test::TmpDir() + "/leveldb_slow";

    DB* db = NULL;
    Options opts;

    opts.tiered_slow_level = 4;
    opts.tiered_fast_prefix = fastname;
    opts.tiered_slow_prefix = slowname;
    opts.create_if_missing = true;

    // Best effort: the results are intentionally not checked, so directories
    // that already exist do not fail the test.
    Env::Default()->CreateDir(fastname);
    Env::Default()->CreateDir(slowname);

    Status s = DB::Open(opts, dbname, &db);
    ASSERT_OK(s);
    ASSERT_TRUE(db != NULL);

    // Close the database before removing its files.
    delete db;
    DestroyDB(dbname, opts);

}   // TieredRecoveryLog
|
||||||
|
|
||||||
|
|
||||||
std::string MakeKey(unsigned int num) {
|
std::string MakeKey(unsigned int num) {
|
||||||
char buf[30];
|
char buf[30];
|
||||||
snprintf(buf, sizeof(buf), "%016u", num);
|
snprintf(buf, sizeof(buf), "%016u", num);
|
||||||
|
@ -2113,14 +2015,13 @@ void BM_LogAndApply(int iters, int num_base_files) {
|
||||||
InternalKeyComparator cmp(BytewiseComparator());
|
InternalKeyComparator cmp(BytewiseComparator());
|
||||||
Options options;
|
Options options;
|
||||||
VersionSet vset(dbname, &options, NULL, &cmp);
|
VersionSet vset(dbname, &options, NULL, &cmp);
|
||||||
bool save_manifest;
|
ASSERT_OK(vset.Recover());
|
||||||
ASSERT_OK(vset.Recover(&save_manifest));
|
|
||||||
VersionEdit vbase;
|
VersionEdit vbase;
|
||||||
uint64_t fnum = 1;
|
uint64_t fnum = 1;
|
||||||
for (int i = 0; i < num_base_files; i++) {
|
for (int i = 0; i < num_base_files; i++) {
|
||||||
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
|
InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue);
|
||||||
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
|
InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion);
|
||||||
vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
|
vbase.AddFile2(2, fnum++, 1 /* file size */, start, limit, 0,0,0);
|
||||||
}
|
}
|
||||||
ASSERT_OK(vset.LogAndApply(&vbase, &mu));
|
ASSERT_OK(vset.LogAndApply(&vbase, &mu));
|
||||||
|
|
||||||
|
@ -2129,9 +2030,9 @@ void BM_LogAndApply(int iters, int num_base_files) {
|
||||||
for (int i = 0; i < iters; i++) {
|
for (int i = 0; i < iters; i++) {
|
||||||
VersionEdit vedit;
|
VersionEdit vedit;
|
||||||
vedit.DeleteFile(2, fnum);
|
vedit.DeleteFile(2, fnum);
|
||||||
InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
|
InternalKey start(MakeKey(2*fnum), 0, 1, kTypeValue);
|
||||||
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
|
InternalKey limit(MakeKey(2*fnum+1), 0, 1, kTypeDeletion);
|
||||||
vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
|
vedit.AddFile2(2, fnum++, 1 /* file size */, start, limit, 0,0,0);
|
||||||
vset.LogAndApply(&vedit, &mu);
|
vset.LogAndApply(&vedit, &mu);
|
||||||
}
|
}
|
||||||
uint64_t stop_micros = env->NowMicros();
|
uint64_t stop_micros = env->NowMicros();
|
||||||
|
|
|
@ -3,7 +3,9 @@
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
//#include "leveldb/expiry.h"
|
||||||
#include "db/dbformat.h"
|
#include "db/dbformat.h"
|
||||||
|
#include "db/version_set.h"
|
||||||
#include "port/port.h"
|
#include "port/port.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
|
|
||||||
|
@ -11,26 +13,66 @@ namespace leveldb {
|
||||||
|
|
||||||
static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
|
static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
|
||||||
assert(seq <= kMaxSequenceNumber);
|
assert(seq <= kMaxSequenceNumber);
|
||||||
assert(t <= kValueTypeForSeek);
|
// assert(t <= kValueTypeForSeek); requires revisit once expiry live
|
||||||
|
assert(t <= kTypeValueExplicitExpiry); // temp replacement for above
|
||||||
return (seq << 8) | t;
|
return (seq << 8) | t;
|
||||||
}
|
}
|
||||||
|
|
||||||
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
|
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
|
||||||
result->append(key.user_key.data(), key.user_key.size());
|
result->append(key.user_key.data(), key.user_key.size());
|
||||||
|
if (IsExpiryKey(key.type))
|
||||||
|
PutFixed64(result, key.expiry);
|
||||||
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
|
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string ParsedInternalKey::DebugString() const {
|
std::string ParsedInternalKey::DebugString() const {
|
||||||
char buf[50];
|
char buf[50];
|
||||||
|
if (IsExpiryKey(type))
|
||||||
|
snprintf(buf, sizeof(buf), "' @ %llu %llu : %d",
|
||||||
|
(unsigned long long) expiry,
|
||||||
|
(unsigned long long) sequence,
|
||||||
|
int(type));
|
||||||
|
else
|
||||||
snprintf(buf, sizeof(buf), "' @ %llu : %d",
|
snprintf(buf, sizeof(buf), "' @ %llu : %d",
|
||||||
(unsigned long long) sequence,
|
(unsigned long long) sequence,
|
||||||
int(type));
|
int(type));
|
||||||
std::string result = "'";
|
std::string result = "'";
|
||||||
result += EscapeString(user_key.ToString());
|
result += HexString(user_key.ToString());
|
||||||
result += buf;
|
result += buf;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string ParsedInternalKey::DebugStringHex() const {
|
||||||
|
char buf[50];
|
||||||
|
if (IsExpiryKey(type))
|
||||||
|
snprintf(buf, sizeof(buf), "' @ %llu %llu : %d",
|
||||||
|
(unsigned long long) expiry,
|
||||||
|
(unsigned long long) sequence,
|
||||||
|
int(type));
|
||||||
|
else
|
||||||
|
snprintf(buf, sizeof(buf), "' @ %llu : %d",
|
||||||
|
(unsigned long long) sequence,
|
||||||
|
int(type));
|
||||||
|
std::string result = "'";
|
||||||
|
result += HexString(user_key);
|
||||||
|
result += buf;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Maps a ValueType to a static, human-readable name.  Values without a
// dedicated case yield "(unknown ValueType)".  The returned pointer refers
// to a string literal and must not be freed.
const char * KeyTypeString(ValueType val_type) {
  switch (val_type) {
    case kTypeDeletion:
      return "kTypeDelete";
    case kTypeValue:
      return "kTypeValue";
    case kTypeValueWriteTime:
      return "kTypeValueWriteTime";
    case kTypeValueExplicitExpiry:
      return "kTypeValueExplicitExpiry";
    default:
      return "(unknown ValueType)";
  }   // switch
}
|
||||||
|
|
||||||
std::string InternalKey::DebugString() const {
|
std::string InternalKey::DebugString() const {
|
||||||
std::string result;
|
std::string result;
|
||||||
ParsedInternalKey parsed;
|
ParsedInternalKey parsed;
|
||||||
|
@ -54,8 +96,10 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
|
||||||
// decreasing type (though sequence# should be enough to disambiguate)
|
// decreasing type (though sequence# should be enough to disambiguate)
|
||||||
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
|
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
|
||||||
if (r == 0) {
|
if (r == 0) {
|
||||||
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
|
uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
|
||||||
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
|
uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
|
||||||
|
if (IsExpiryKey((ValueType)*(unsigned char *)&anum)) *(unsigned char*)&anum=(unsigned char)kTypeValue;
|
||||||
|
if (IsExpiryKey((ValueType)*(unsigned char *)&bnum)) *(unsigned char*)&bnum=(unsigned char)kTypeValue;
|
||||||
if (anum > bnum) {
|
if (anum > bnum) {
|
||||||
r = -1;
|
r = -1;
|
||||||
} else if (anum < bnum) {
|
} else if (anum < bnum) {
|
||||||
|
@ -118,7 +162,8 @@ bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
|
||||||
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
|
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
|
||||||
}
|
}
|
||||||
|
|
||||||
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
|
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s, KeyMetaData * meta) {
|
||||||
|
meta_=meta;
|
||||||
size_t usize = user_key.size();
|
size_t usize = user_key.size();
|
||||||
size_t needed = usize + 13; // A conservative estimate
|
size_t needed = usize + 13; // A conservative estimate
|
||||||
char* dst;
|
char* dst;
|
||||||
|
@ -137,4 +182,109 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
|
||||||
end_ = dst;
|
end_ = dst;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Sets up the per-compaction retirement state.
//   Comparator       - compares user keys; when NULL the object is marked
//                      invalid
//   SmallestSnapshot - stored as the sequence-number threshold used by
//                      operator()
//   Opts             - options, may be NULL (operator() checks for this)
//   Compaction       - the active compaction; NULL is acceptable
KeyRetirement::KeyRetirement(
    const Comparator * Comparator,
    SequenceNumber SmallestSnapshot,
    const Options * Opts,
    Compaction * const Compaction)
    : has_current_user_key(false),
      last_sequence_for_key(kMaxSequenceNumber),
      user_comparator(Comparator),
      smallest_snapshot(SmallestSnapshot),
      options(Opts),
      compaction(Compaction),
      // usable only when a comparator was supplied
      valid(NULL != Comparator),
      dropped(0),
      expired(0)
{
}   // KeyRetirement::KeyRetirement
|
||||||
|
|
||||||
|
|
||||||
|
// On destruction, adds the number of expired keys counted by this object to
// the ePerfExpiredKeys performance counter.  Nothing is reported when the
// count is zero.
KeyRetirement::~KeyRetirement()
{
    if (0 == expired)
        return;

    gPerfCounters->Add(ePerfExpiredKeys, expired);
}   // KeyRetirement::~KeyRetirement
|
||||||
|
|
||||||
|
|
||||||
|
bool
|
||||||
|
KeyRetirement::operator()(
|
||||||
|
Slice & key)
|
||||||
|
{
|
||||||
|
ParsedInternalKey ikey;
|
||||||
|
bool drop = false, expire_flag;
|
||||||
|
|
||||||
|
if (valid)
|
||||||
|
{
|
||||||
|
if (!ParseInternalKey(key, &ikey))
|
||||||
|
{
|
||||||
|
// Do not hide error keys
|
||||||
|
current_user_key.clear();
|
||||||
|
has_current_user_key = false;
|
||||||
|
last_sequence_for_key = kMaxSequenceNumber;
|
||||||
|
} // else
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (!has_current_user_key ||
|
||||||
|
user_comparator->Compare(ikey.user_key,
|
||||||
|
Slice(current_user_key)) != 0)
|
||||||
|
{
|
||||||
|
// First occurrence of this user key
|
||||||
|
current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
|
||||||
|
has_current_user_key = true;
|
||||||
|
last_sequence_for_key = kMaxSequenceNumber;
|
||||||
|
} // if
|
||||||
|
|
||||||
|
if (last_sequence_for_key <= smallest_snapshot)
|
||||||
|
{
|
||||||
|
// Hidden by an newer entry for same user key
|
||||||
|
drop = true; // (A)
|
||||||
|
} // if
|
||||||
|
|
||||||
|
else
|
||||||
|
{
|
||||||
|
expire_flag=false;
|
||||||
|
if (NULL!=options && options->ExpiryActivated())
|
||||||
|
expire_flag=options->expiry_module->KeyRetirementCallback(ikey);
|
||||||
|
|
||||||
|
if ((ikey.type == kTypeDeletion || expire_flag)
|
||||||
|
&& ikey.sequence <= smallest_snapshot
|
||||||
|
&& NULL!=compaction // mem to level0 ignores this test
|
||||||
|
&& compaction->IsBaseLevelForKey(ikey.user_key))
|
||||||
|
{
|
||||||
|
// For this user key:
|
||||||
|
// (1) there is no data in higher levels
|
||||||
|
// (2) data in lower levels will have larger sequence numbers
|
||||||
|
// (3) data in layers that are being compacted here and have
|
||||||
|
// smaller sequence numbers will be dropped in the next
|
||||||
|
// few iterations of this loop (by rule (A) above).
|
||||||
|
// Therefore this deletion marker is obsolete and can be dropped.
|
||||||
|
drop = true;
|
||||||
|
|
||||||
|
if (expire_flag)
|
||||||
|
++expired;
|
||||||
|
else
|
||||||
|
++dropped;
|
||||||
|
} // if
|
||||||
|
} // else
|
||||||
|
|
||||||
|
last_sequence_for_key = ikey.sequence;
|
||||||
|
} // else
|
||||||
|
} // if
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// needs clean up to be used again
|
||||||
|
Log(options_.info_log,
|
||||||
|
" Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
|
||||||
|
"%d smallest_snapshot: %d",
|
||||||
|
ikey.user_key.ToString().c_str(),
|
||||||
|
(int)ikey.sequence, ikey.type, kTypeValue, drop,
|
||||||
|
compact->compaction->IsBaseLevelForKey(ikey.user_key),
|
||||||
|
(int)last_sequence_for_key, (int)compact->smallest_snapshot);
|
||||||
|
#endif
|
||||||
|
return(drop);
|
||||||
|
|
||||||
|
} // KeyRetirement::operator(Slice & )
|
||||||
|
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
|
@ -2,13 +2,14 @@
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
#ifndef STORAGE_LEVELDB_DB_DBFORMAT_H_
|
#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
|
||||||
#define STORAGE_LEVELDB_DB_DBFORMAT_H_
|
#define STORAGE_LEVELDB_DB_FORMAT_H_
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "leveldb/comparator.h"
|
#include "leveldb/comparator.h"
|
||||||
#include "leveldb/db.h"
|
#include "leveldb/db.h"
|
||||||
#include "leveldb/filter_policy.h"
|
#include "leveldb/filter_policy.h"
|
||||||
|
#include "leveldb/options.h"
|
||||||
#include "leveldb/slice.h"
|
#include "leveldb/slice.h"
|
||||||
#include "leveldb/table_builder.h"
|
#include "leveldb/table_builder.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
|
@ -16,19 +17,33 @@
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
class Compaction;
|
||||||
|
|
||||||
// Grouping of constants. We may want to make some of these
|
// Grouping of constants. We may want to make some of these
|
||||||
// parameters set via options.
|
// parameters set via options.
|
||||||
namespace config {
|
namespace config {
|
||||||
static const int kNumLevels = 7;
|
static const int kNumLevels = 7;
|
||||||
|
static const int kNumOverlapLevels = 2;
|
||||||
|
|
||||||
// Level-0 compaction is started when we hit this many files.
|
// Level-0 compaction is started when we hit this many files.
|
||||||
static const int kL0_CompactionTrigger = 4;
|
// Google: static const size_t kL0_CompactionTrigger = 4;
|
||||||
|
static const size_t kL0_CompactionTrigger = 6;
|
||||||
|
|
||||||
|
// Level-0 (any overlapped level) number of files where a grooming
|
||||||
|
// compaction could start
|
||||||
|
static const size_t kL0_GroomingTrigger = 4;
|
||||||
|
static const size_t kL0_GroomingTrigger10min = 2;
|
||||||
|
static const size_t kL0_GroomingTrigger20min = 1;
|
||||||
|
|
||||||
|
// ... time limits in microseconds
|
||||||
|
static const size_t kL0_Grooming10minMicros = 10 * 60 * 1000000;
|
||||||
|
static const size_t kL0_Grooming20minMicros = 20 * 60 * 1000000;
|
||||||
|
|
||||||
// Soft limit on number of level-0 files. We slow down writes at this point.
|
// Soft limit on number of level-0 files. We slow down writes at this point.
|
||||||
static const int kL0_SlowdownWritesTrigger = 8;
|
static const size_t kL0_SlowdownWritesTrigger = 8;
|
||||||
|
|
||||||
// Maximum number of level-0 files. We stop writes at this point.
|
// Maximum number of level-0 files. We stop writes at this point.
|
||||||
static const int kL0_StopWritesTrigger = 12;
|
static const size_t kL0_StopWritesTrigger = 12;
|
||||||
|
|
||||||
// Maximum level to which a new compacted memtable is pushed if it
|
// Maximum level to which a new compacted memtable is pushed if it
|
||||||
// does not create overlap. We try to push to level 2 to avoid the
|
// does not create overlap. We try to push to level 2 to avoid the
|
||||||
|
@ -36,31 +51,28 @@ static const int kL0_StopWritesTrigger = 12;
|
||||||
// expensive manifest file operations. We do not push all the way to
|
// expensive manifest file operations. We do not push all the way to
|
||||||
// the largest level since that can generate a lot of wasted disk
|
// the largest level since that can generate a lot of wasted disk
|
||||||
// space if the same key space is being repeatedly overwritten.
|
// space if the same key space is being repeatedly overwritten.
|
||||||
static const int kMaxMemCompactLevel = 2;
|
// Basho: push to kNumOverlapLevels +1 ... beyond "landing level"
|
||||||
|
static const unsigned kMaxMemCompactLevel = kNumOverlapLevels+1;
|
||||||
// Approximate gap in bytes between samples of data read during iteration.
|
|
||||||
static const int kReadBytesPeriod = 1048576;
|
|
||||||
|
|
||||||
} // namespace config
|
} // namespace config
|
||||||
|
|
||||||
class InternalKey;
|
class InternalKey;
|
||||||
|
|
||||||
// Value types encoded as the last component of internal keys.
|
|
||||||
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
|
|
||||||
// data structures.
|
|
||||||
enum ValueType {
|
|
||||||
kTypeDeletion = 0x0,
|
|
||||||
kTypeValue = 0x1
|
|
||||||
};
|
|
||||||
// kValueTypeForSeek defines the ValueType that should be passed when
|
// kValueTypeForSeek defines the ValueType that should be passed when
|
||||||
// constructing a ParsedInternalKey object for seeking to a particular
|
// constructing a ParsedInternalKey object for seeking to a particular
|
||||||
// sequence number (since we sort sequence numbers in decreasing order
|
// sequence number (since we sort sequence numbers in decreasing order
|
||||||
// and the value type is embedded as the low 8 bits in the sequence
|
// and the value type is embedded as the low 8 bits in the sequence
|
||||||
// number in internal keys, we need to use the highest-numbered
|
// number in internal keys, we need to use the highest-numbered
|
||||||
// ValueType, not the lowest).
|
// ValueType, not the lowest).
|
||||||
|
// Riak note: kValueTypeForSeek is placed within temporary keys
|
||||||
|
// for comparisons. Using kTypeValueExplicitExpiry would
|
||||||
|
// force more code changes to increase internal key size.
|
||||||
|
// But ValueTypeForSeek is redundant to sequence number for
|
||||||
|
// disambiguaty. Therefore going for easiest path and NOT changing.
|
||||||
static const ValueType kValueTypeForSeek = kTypeValue;
|
static const ValueType kValueTypeForSeek = kTypeValue;
|
||||||
|
|
||||||
typedef uint64_t SequenceNumber;
|
typedef uint64_t SequenceNumber;
|
||||||
|
typedef uint64_t ExpiryTimeMicros;
|
||||||
|
|
||||||
// We leave eight bits empty at the bottom so a type and sequence#
|
// We leave eight bits empty at the bottom so a type and sequence#
|
||||||
// can be packed together into 64-bits.
|
// can be packed together into 64-bits.
|
||||||
|
@ -69,20 +81,17 @@ static const SequenceNumber kMaxSequenceNumber =
|
||||||
|
|
||||||
struct ParsedInternalKey {
|
struct ParsedInternalKey {
|
||||||
Slice user_key;
|
Slice user_key;
|
||||||
|
ExpiryTimeMicros expiry;
|
||||||
SequenceNumber sequence;
|
SequenceNumber sequence;
|
||||||
ValueType type;
|
ValueType type;
|
||||||
|
|
||||||
ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
|
ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
|
||||||
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
|
ParsedInternalKey(const Slice& u, const ExpiryTimeMicros & exp, const SequenceNumber& seq, ValueType t)
|
||||||
: user_key(u), sequence(seq), type(t) { }
|
: user_key(u), expiry(exp), sequence(seq), type(t) { }
|
||||||
std::string DebugString() const;
|
std::string DebugString() const;
|
||||||
|
std::string DebugStringHex() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Return the length of the encoding of "key".
|
|
||||||
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
|
|
||||||
return key.user_key.size() + 8;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Append the serialization of "key" to *result.
|
// Append the serialization of "key" to *result.
|
||||||
extern void AppendInternalKey(std::string* result,
|
extern void AppendInternalKey(std::string* result,
|
||||||
const ParsedInternalKey& key);
|
const ParsedInternalKey& key);
|
||||||
|
@ -94,20 +103,76 @@ extern void AppendInternalKey(std::string* result,
|
||||||
extern bool ParseInternalKey(const Slice& internal_key,
|
extern bool ParseInternalKey(const Slice& internal_key,
|
||||||
ParsedInternalKey* result);
|
ParsedInternalKey* result);
|
||||||
|
|
||||||
// Returns the user key portion of an internal key.
|
|
||||||
inline Slice ExtractUserKey(const Slice& internal_key) {
|
|
||||||
assert(internal_key.size() >= 8);
|
|
||||||
return Slice(internal_key.data(), internal_key.size() - 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline ValueType ExtractValueType(const Slice& internal_key) {
|
inline ValueType ExtractValueType(const Slice& internal_key) {
|
||||||
assert(internal_key.size() >= 8);
|
assert(internal_key.size() >= 8);
|
||||||
const size_t n = internal_key.size();
|
const size_t n = internal_key.size();
|
||||||
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
|
unsigned char c = DecodeLeastFixed64(internal_key.data() + n - sizeof(SequenceNumber));
|
||||||
unsigned char c = num & 0xff;
|
|
||||||
return static_cast<ValueType>(c);
|
return static_cast<ValueType>(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline size_t KeySuffixSize(ValueType val_type) {
|
||||||
|
size_t ret_val;
|
||||||
|
switch(val_type)
|
||||||
|
{
|
||||||
|
case kTypeDeletion:
|
||||||
|
case kTypeValue:
|
||||||
|
ret_val=sizeof(SequenceNumber);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kTypeValueWriteTime:
|
||||||
|
case kTypeValueExplicitExpiry:
|
||||||
|
ret_val=sizeof(SequenceNumber) + sizeof(ExpiryTimeMicros);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
// assert(0); cannot use because bloom filter block's name is passed as internal key
|
||||||
|
ret_val=sizeof(SequenceNumber);
|
||||||
|
break;
|
||||||
|
} // switch
|
||||||
|
return(ret_val);
|
||||||
|
}
|
||||||
|
|
||||||
|
const char * KeyTypeString(ValueType val_type);
|
||||||
|
|
||||||
|
inline size_t KeySuffixSize(const Slice & internal_key) {
|
||||||
|
return(KeySuffixSize(ExtractValueType(internal_key)));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the user key portion of an internal key.
|
||||||
|
inline Slice ExtractUserKey(const Slice& internal_key) {
|
||||||
|
assert(internal_key.size() >= 8);
|
||||||
|
return Slice(internal_key.data(), internal_key.size() - KeySuffixSize(internal_key));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the sequence number with ValueType removed
|
||||||
|
inline SequenceNumber ExtractSequenceNumber(const Slice& internal_key) {
|
||||||
|
assert(internal_key.size() >= 8);
|
||||||
|
return(DecodeFixed64(internal_key.data() + internal_key.size() - 8)>>8);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the length of the encoding of "key".
|
||||||
|
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
|
||||||
|
return key.user_key.size() + KeySuffixSize(key.type);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Riak: is this an expiry key and therefore contain extra ExpiryTime field
|
||||||
|
inline bool IsExpiryKey(ValueType val_type) {
|
||||||
|
return(kTypeValueWriteTime==val_type || kTypeValueExplicitExpiry==val_type);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Riak: is this an expiry key and therefore contain extra ExpiryTime field
|
||||||
|
inline bool IsExpiryKey(const Slice & internal_key) {
|
||||||
|
return(internal_key.size()>=KeySuffixSize(kTypeValueWriteTime)
|
||||||
|
&& IsExpiryKey(ExtractValueType(internal_key)));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Riak: extracts expiry value
|
||||||
|
inline ExpiryTimeMicros ExtractExpiry(const Slice& internal_key) {
|
||||||
|
assert(internal_key.size() >= KeySuffixSize(kTypeValueWriteTime));
|
||||||
|
assert(IsExpiryKey(internal_key));
|
||||||
|
return(DecodeFixed64(internal_key.data() + internal_key.size() - KeySuffixSize(kTypeValueWriteTime)));
|
||||||
|
}
|
||||||
|
|
||||||
// A comparator for internal keys that uses a specified comparator for
|
// A comparator for internal keys that uses a specified comparator for
|
||||||
// the user key portion and breaks ties by decreasing sequence number.
|
// the user key portion and breaks ties by decreasing sequence number.
|
||||||
class InternalKeyComparator : public Comparator {
|
class InternalKeyComparator : public Comparator {
|
||||||
|
@ -129,7 +194,7 @@ class InternalKeyComparator : public Comparator {
|
||||||
|
|
||||||
// Filter policy wrapper that converts from internal keys to user keys
|
// Filter policy wrapper that converts from internal keys to user keys
|
||||||
class InternalFilterPolicy : public FilterPolicy {
|
class InternalFilterPolicy : public FilterPolicy {
|
||||||
private:
|
protected:
|
||||||
const FilterPolicy* const user_policy_;
|
const FilterPolicy* const user_policy_;
|
||||||
public:
|
public:
|
||||||
explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
|
explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
|
||||||
|
@ -138,6 +203,12 @@ class InternalFilterPolicy : public FilterPolicy {
|
||||||
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
|
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class InternalFilterPolicy2 : public InternalFilterPolicy {
|
||||||
|
public:
|
||||||
|
explicit InternalFilterPolicy2(const FilterPolicy* p) : InternalFilterPolicy(p) { }
|
||||||
|
virtual ~InternalFilterPolicy2() {delete user_policy_;};
|
||||||
|
};
|
||||||
|
|
||||||
// Modules in this directory should keep internal keys wrapped inside
|
// Modules in this directory should keep internal keys wrapped inside
|
||||||
// the following class instead of plain strings so that we do not
|
// the following class instead of plain strings so that we do not
|
||||||
// incorrectly use string comparisons instead of an InternalKeyComparator.
|
// incorrectly use string comparisons instead of an InternalKeyComparator.
|
||||||
|
@ -146,8 +217,8 @@ class InternalKey {
|
||||||
std::string rep_;
|
std::string rep_;
|
||||||
public:
|
public:
|
||||||
InternalKey() { } // Leave rep_ as empty to indicate it is invalid
|
InternalKey() { } // Leave rep_ as empty to indicate it is invalid
|
||||||
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
|
InternalKey(const Slice& user_key, ExpiryTimeMicros exp, SequenceNumber s, ValueType t) {
|
||||||
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
|
AppendInternalKey(&rep_, ParsedInternalKey(user_key, exp, s, t));
|
||||||
}
|
}
|
||||||
|
|
||||||
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
|
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
|
||||||
|
@ -157,6 +228,7 @@ class InternalKey {
|
||||||
}
|
}
|
||||||
|
|
||||||
Slice user_key() const { return ExtractUserKey(rep_); }
|
Slice user_key() const { return ExtractUserKey(rep_); }
|
||||||
|
Slice internal_key() const { return Slice(rep_); }
|
||||||
|
|
||||||
void SetFrom(const ParsedInternalKey& p) {
|
void SetFrom(const ParsedInternalKey& p) {
|
||||||
rep_.clear();
|
rep_.clear();
|
||||||
|
@ -181,8 +253,12 @@ inline bool ParseInternalKey(const Slice& internal_key,
|
||||||
unsigned char c = num & 0xff;
|
unsigned char c = num & 0xff;
|
||||||
result->sequence = num >> 8;
|
result->sequence = num >> 8;
|
||||||
result->type = static_cast<ValueType>(c);
|
result->type = static_cast<ValueType>(c);
|
||||||
result->user_key = Slice(internal_key.data(), n - 8);
|
if (IsExpiryKey((ValueType)c))
|
||||||
return (c <= static_cast<unsigned char>(kTypeValue));
|
result->expiry=DecodeFixed64(internal_key.data() + n - KeySuffixSize((ValueType)c));
|
||||||
|
else
|
||||||
|
result->expiry=0;
|
||||||
|
result->user_key = Slice(internal_key.data(), n - KeySuffixSize((ValueType)c));
|
||||||
|
return (c <= static_cast<unsigned char>(kTypeValueExplicitExpiry));
|
||||||
}
|
}
|
||||||
|
|
||||||
// A helper class useful for DBImpl::Get()
|
// A helper class useful for DBImpl::Get()
|
||||||
|
@ -190,7 +266,7 @@ class LookupKey {
|
||||||
public:
|
public:
|
||||||
// Initialize *this for looking up user_key at a snapshot with
|
// Initialize *this for looking up user_key at a snapshot with
|
||||||
// the specified sequence number.
|
// the specified sequence number.
|
||||||
LookupKey(const Slice& user_key, SequenceNumber sequence);
|
LookupKey(const Slice& user_key, SequenceNumber sequence, KeyMetaData * meta=NULL);
|
||||||
|
|
||||||
~LookupKey();
|
~LookupKey();
|
||||||
|
|
||||||
|
@ -201,12 +277,38 @@ class LookupKey {
|
||||||
Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
|
Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
|
||||||
|
|
||||||
// Return the user key
|
// Return the user key
|
||||||
Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
|
Slice user_key() const
|
||||||
|
{ return Slice(kstart_, end_ - kstart_ - KeySuffixSize(internal_key())); }
|
||||||
|
|
||||||
|
// did requestor have metadata object?
|
||||||
|
bool WantsKeyMetaData() const {return(NULL!=meta_);};
|
||||||
|
|
||||||
|
void SetKeyMetaData(ValueType type, SequenceNumber seq, ExpiryTimeMicros expiry) const
|
||||||
|
{if (NULL!=meta_)
|
||||||
|
{
|
||||||
|
meta_->m_Type=type;
|
||||||
|
meta_->m_Sequence=seq;
|
||||||
|
meta_->m_Expiry=expiry;
|
||||||
|
} // if
|
||||||
|
};
|
||||||
|
|
||||||
|
void SetKeyMetaData(const ParsedInternalKey & pi_key) const
|
||||||
|
{if (NULL!=meta_)
|
||||||
|
{
|
||||||
|
meta_->m_Type=pi_key.type;
|
||||||
|
meta_->m_Sequence=pi_key.sequence;
|
||||||
|
meta_->m_Expiry=pi_key.expiry;
|
||||||
|
} // if
|
||||||
|
};
|
||||||
|
|
||||||
|
void SetKeyMetaData(const KeyMetaData & meta) const
|
||||||
|
{if (NULL!=meta_) *meta_=meta;};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// We construct a char array of the form:
|
// We construct a char array of the form:
|
||||||
// klength varint32 <-- start_
|
// klength varint32 <-- start_
|
||||||
// userkey char[klength] <-- kstart_
|
// userkey char[klength] <-- kstart_
|
||||||
|
// optional uint64
|
||||||
// tag uint64
|
// tag uint64
|
||||||
// <-- end_
|
// <-- end_
|
||||||
// The array is a suitable MemTable key.
|
// The array is a suitable MemTable key.
|
||||||
|
@ -216,6 +318,9 @@ class LookupKey {
|
||||||
const char* end_;
|
const char* end_;
|
||||||
char space_[200]; // Avoid allocation for short keys
|
char space_[200]; // Avoid allocation for short keys
|
||||||
|
|
||||||
|
// allow code that finds the key to place metadata here, even if 'const'
|
||||||
|
mutable KeyMetaData * meta_;
|
||||||
|
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
LookupKey(const LookupKey&);
|
LookupKey(const LookupKey&);
|
||||||
void operator=(const LookupKey&);
|
void operator=(const LookupKey&);
|
||||||
|
@ -223,8 +328,47 @@ class LookupKey {
|
||||||
|
|
||||||
inline LookupKey::~LookupKey() {
|
inline LookupKey::~LookupKey() {
|
||||||
if (start_ != space_) delete[] start_;
|
if (start_ != space_) delete[] start_;
|
||||||
}
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// this class was constructed from code with DBImpl::DoCompactionWork (db_impl.cc)
|
||||||
|
// so it could be shared within BuildTable (and thus reduce Level 0 bloating)
|
||||||
|
class KeyRetirement
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
// "state" from previous key reviewed
|
||||||
|
std::string current_user_key;
|
||||||
|
bool has_current_user_key;
|
||||||
|
SequenceNumber last_sequence_for_key;
|
||||||
|
|
||||||
|
// database values needed for processing
|
||||||
|
const Comparator * user_comparator;
|
||||||
|
SequenceNumber smallest_snapshot;
|
||||||
|
const Options * options;
|
||||||
|
Compaction * const compaction;
|
||||||
|
|
||||||
|
bool valid;
|
||||||
|
size_t dropped; // tombstone or old version dropped
|
||||||
|
size_t expired; // expired dropped
|
||||||
|
|
||||||
|
public:
|
||||||
|
KeyRetirement(const Comparator * UserComparator, SequenceNumber SmallestSnapshot,
|
||||||
|
const Options * Opts, Compaction * const Compaction=NULL);
|
||||||
|
|
||||||
|
virtual ~KeyRetirement();
|
||||||
|
|
||||||
|
bool operator()(Slice & key);
|
||||||
|
|
||||||
|
size_t GetDroppedCount() const {return(dropped);};
|
||||||
|
size_t GetExpiredCount() const {return(expired);};
|
||||||
|
|
||||||
|
private:
|
||||||
|
KeyRetirement();
|
||||||
|
KeyRetirement(const KeyRetirement &);
|
||||||
|
const KeyRetirement & operator=(const KeyRetirement &);
|
||||||
|
|
||||||
|
}; // class KeyRetirement
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
#endif // STORAGE_LEVELDB_DB_DBFORMAT_H_
|
#endif // STORAGE_LEVELDB_DB_FORMAT_H_
|
||||||
|
|
|
@ -9,10 +9,11 @@
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
static std::string IKey(const std::string& user_key,
|
static std::string IKey(const std::string& user_key,
|
||||||
|
ExpiryTimeMicros exp,
|
||||||
uint64_t seq,
|
uint64_t seq,
|
||||||
ValueType vt) {
|
ValueType vt) {
|
||||||
std::string encoded;
|
std::string encoded;
|
||||||
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
|
AppendInternalKey(&encoded, ParsedInternalKey(user_key, exp, seq, vt));
|
||||||
return encoded;
|
return encoded;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -29,12 +30,13 @@ static std::string ShortSuccessor(const std::string& s) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static void TestKey(const std::string& key,
|
static void TestKey(const std::string& key,
|
||||||
|
ExpiryTimeMicros exp,
|
||||||
uint64_t seq,
|
uint64_t seq,
|
||||||
ValueType vt) {
|
ValueType vt) {
|
||||||
std::string encoded = IKey(key, seq, vt);
|
std::string encoded = IKey(key, exp, seq, vt);
|
||||||
|
|
||||||
Slice in(encoded);
|
Slice in(encoded);
|
||||||
ParsedInternalKey decoded("", 0, kTypeValue);
|
ParsedInternalKey decoded("", 0, 0, kTypeValue);
|
||||||
|
|
||||||
ASSERT_TRUE(ParseInternalKey(in, &decoded));
|
ASSERT_TRUE(ParseInternalKey(in, &decoded));
|
||||||
ASSERT_EQ(key, decoded.user_key.ToString());
|
ASSERT_EQ(key, decoded.user_key.ToString());
|
||||||
|
@ -56,53 +58,53 @@ TEST(FormatTest, InternalKey_EncodeDecode) {
|
||||||
};
|
};
|
||||||
for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
|
for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
|
||||||
for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
|
for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
|
||||||
TestKey(keys[k], seq[s], kTypeValue);
|
TestKey(keys[k], 0, seq[s], kTypeValue);
|
||||||
TestKey("hello", 1, kTypeDeletion);
|
TestKey("hello", 0, 1, kTypeDeletion);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(FormatTest, InternalKeyShortSeparator) {
|
TEST(FormatTest, InternalKeyShortSeparator) {
|
||||||
// When user keys are same
|
// When user keys are same
|
||||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||||
Shorten(IKey("foo", 100, kTypeValue),
|
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||||
IKey("foo", 99, kTypeValue)));
|
IKey("foo", 0, 99, kTypeValue)));
|
||||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||||
Shorten(IKey("foo", 100, kTypeValue),
|
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||||
IKey("foo", 101, kTypeValue)));
|
IKey("foo", 0, 101, kTypeValue)));
|
||||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||||
Shorten(IKey("foo", 100, kTypeValue),
|
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||||
IKey("foo", 100, kTypeValue)));
|
IKey("foo", 0, 100, kTypeValue)));
|
||||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||||
Shorten(IKey("foo", 100, kTypeValue),
|
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||||
IKey("foo", 100, kTypeDeletion)));
|
IKey("foo", 0, 100, kTypeDeletion)));
|
||||||
|
|
||||||
// When user keys are misordered
|
// When user keys are misordered
|
||||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||||
Shorten(IKey("foo", 100, kTypeValue),
|
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||||
IKey("bar", 99, kTypeValue)));
|
IKey("bar", 0, 99, kTypeValue)));
|
||||||
|
|
||||||
// When user keys are different, but correctly ordered
|
// When user keys are different, but correctly ordered
|
||||||
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
|
ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek),
|
||||||
Shorten(IKey("foo", 100, kTypeValue),
|
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||||
IKey("hello", 200, kTypeValue)));
|
IKey("hello", 0, 200, kTypeValue)));
|
||||||
|
|
||||||
// When start user key is prefix of limit user key
|
// When start user key is prefix of limit user key
|
||||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
ASSERT_EQ(IKey("foo", 0, 100, kTypeValue),
|
||||||
Shorten(IKey("foo", 100, kTypeValue),
|
Shorten(IKey("foo", 0, 100, kTypeValue),
|
||||||
IKey("foobar", 200, kTypeValue)));
|
IKey("foobar", 0, 200, kTypeValue)));
|
||||||
|
|
||||||
// When limit user key is prefix of start user key
|
// When limit user key is prefix of start user key
|
||||||
ASSERT_EQ(IKey("foobar", 100, kTypeValue),
|
ASSERT_EQ(IKey("foobar", 0, 100, kTypeValue),
|
||||||
Shorten(IKey("foobar", 100, kTypeValue),
|
Shorten(IKey("foobar", 0, 100, kTypeValue),
|
||||||
IKey("foo", 200, kTypeValue)));
|
IKey("foo", 0, 200, kTypeValue)));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(FormatTest, InternalKeyShortestSuccessor) {
|
TEST(FormatTest, InternalKeyShortestSuccessor) {
|
||||||
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
|
ASSERT_EQ(IKey("g", 0, kMaxSequenceNumber, kValueTypeForSeek),
|
||||||
ShortSuccessor(IKey("foo", 100, kTypeValue)));
|
ShortSuccessor(IKey("foo", 0, 100, kTypeValue)));
|
||||||
ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
|
ASSERT_EQ(IKey("\xff\xff", 0, 100, kTypeValue),
|
||||||
ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
|
ShortSuccessor(IKey("\xff\xff", 0, 100, kTypeValue)));
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
|
@ -1,554 +0,0 @@
|
||||||
// Copyright 2014 The LevelDB Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
||||||
|
|
||||||
// This test uses a custom Env to keep track of the state of a filesystem as of
|
|
||||||
// the last "sync". It then checks for data loss errors by purposely dropping
|
|
||||||
// file data (or entire files) not protected by a "sync".
|
|
||||||
|
|
||||||
#include "leveldb/db.h"
|
|
||||||
|
|
||||||
#include <map>
|
|
||||||
#include <set>
|
|
||||||
#include "db/db_impl.h"
|
|
||||||
#include "db/filename.h"
|
|
||||||
#include "db/log_format.h"
|
|
||||||
#include "db/version_set.h"
|
|
||||||
#include "leveldb/cache.h"
|
|
||||||
#include "leveldb/env.h"
|
|
||||||
#include "leveldb/table.h"
|
|
||||||
#include "leveldb/write_batch.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/mutexlock.h"
|
|
||||||
#include "util/testharness.h"
|
|
||||||
#include "util/testutil.h"
|
|
||||||
|
|
||||||
namespace leveldb {
|
|
||||||
|
|
||||||
static const int kValueSize = 1000;
|
|
||||||
static const int kMaxNumValues = 2000;
|
|
||||||
static const size_t kNumIterations = 3;
|
|
||||||
|
|
||||||
class FaultInjectionTestEnv;
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
// Assume a filename, and not a directory name like "/foo/bar/"
|
|
||||||
static std::string GetDirName(const std::string filename) {
|
|
||||||
size_t found = filename.find_last_of("/\\");
|
|
||||||
if (found == std::string::npos) {
|
|
||||||
return "";
|
|
||||||
} else {
|
|
||||||
return filename.substr(0, found);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Status SyncDir(const std::string& dir) {
|
|
||||||
// As this is a test it isn't required to *actually* sync this directory.
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
|
|
||||||
// A basic file truncation function suitable for this test.
|
|
||||||
Status Truncate(const std::string& filename, uint64_t length) {
|
|
||||||
leveldb::Env* env = leveldb::Env::Default();
|
|
||||||
|
|
||||||
SequentialFile* orig_file;
|
|
||||||
Status s = env->NewSequentialFile(filename, &orig_file);
|
|
||||||
if (!s.ok())
|
|
||||||
return s;
|
|
||||||
|
|
||||||
char* scratch = new char[length];
|
|
||||||
leveldb::Slice result;
|
|
||||||
s = orig_file->Read(length, &result, scratch);
|
|
||||||
delete orig_file;
|
|
||||||
if (s.ok()) {
|
|
||||||
std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
|
|
||||||
WritableFile* tmp_file;
|
|
||||||
s = env->NewWritableFile(tmp_name, &tmp_file);
|
|
||||||
if (s.ok()) {
|
|
||||||
s = tmp_file->Append(result);
|
|
||||||
delete tmp_file;
|
|
||||||
if (s.ok()) {
|
|
||||||
s = env->RenameFile(tmp_name, filename);
|
|
||||||
} else {
|
|
||||||
env->DeleteFile(tmp_name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
delete[] scratch;
|
|
||||||
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct FileState {
|
|
||||||
std::string filename_;
|
|
||||||
ssize_t pos_;
|
|
||||||
ssize_t pos_at_last_sync_;
|
|
||||||
ssize_t pos_at_last_flush_;
|
|
||||||
|
|
||||||
FileState(const std::string& filename)
|
|
||||||
: filename_(filename),
|
|
||||||
pos_(-1),
|
|
||||||
pos_at_last_sync_(-1),
|
|
||||||
pos_at_last_flush_(-1) { }
|
|
||||||
|
|
||||||
FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
|
|
||||||
|
|
||||||
bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
|
|
||||||
|
|
||||||
Status DropUnsyncedData() const;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // anonymous namespace
|
|
||||||
|
|
||||||
// A wrapper around WritableFile which informs another Env whenever this file
|
|
||||||
// is written to or sync'ed.
|
|
||||||
class TestWritableFile : public WritableFile {
|
|
||||||
public:
|
|
||||||
TestWritableFile(const FileState& state,
|
|
||||||
WritableFile* f,
|
|
||||||
FaultInjectionTestEnv* env);
|
|
||||||
virtual ~TestWritableFile();
|
|
||||||
virtual Status Append(const Slice& data);
|
|
||||||
virtual Status Close();
|
|
||||||
virtual Status Flush();
|
|
||||||
virtual Status Sync();
|
|
||||||
|
|
||||||
private:
|
|
||||||
FileState state_;
|
|
||||||
WritableFile* target_;
|
|
||||||
bool writable_file_opened_;
|
|
||||||
FaultInjectionTestEnv* env_;
|
|
||||||
|
|
||||||
Status SyncParent();
|
|
||||||
};
|
|
||||||
|
|
||||||
class FaultInjectionTestEnv : public EnvWrapper {
|
|
||||||
public:
|
|
||||||
FaultInjectionTestEnv() : EnvWrapper(Env::Default()), filesystem_active_(true) {}
|
|
||||||
virtual ~FaultInjectionTestEnv() { }
|
|
||||||
virtual Status NewWritableFile(const std::string& fname,
|
|
||||||
WritableFile** result);
|
|
||||||
virtual Status NewAppendableFile(const std::string& fname,
|
|
||||||
WritableFile** result);
|
|
||||||
virtual Status DeleteFile(const std::string& f);
|
|
||||||
virtual Status RenameFile(const std::string& s, const std::string& t);
|
|
||||||
|
|
||||||
void WritableFileClosed(const FileState& state);
|
|
||||||
Status DropUnsyncedFileData();
|
|
||||||
Status DeleteFilesCreatedAfterLastDirSync();
|
|
||||||
void DirWasSynced();
|
|
||||||
bool IsFileCreatedSinceLastDirSync(const std::string& filename);
|
|
||||||
void ResetState();
|
|
||||||
void UntrackFile(const std::string& f);
|
|
||||||
// Setting the filesystem to inactive is the test equivalent to simulating a
|
|
||||||
// system reset. Setting to inactive will freeze our saved filesystem state so
|
|
||||||
// that it will stop being recorded. It can then be reset back to the state at
|
|
||||||
// the time of the reset.
|
|
||||||
bool IsFilesystemActive() const { return filesystem_active_; }
|
|
||||||
void SetFilesystemActive(bool active) { filesystem_active_ = active; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
port::Mutex mutex_;
|
|
||||||
std::map<std::string, FileState> db_file_state_;
|
|
||||||
std::set<std::string> new_files_since_last_dir_sync_;
|
|
||||||
bool filesystem_active_; // Record flushes, syncs, writes
|
|
||||||
};
|
|
||||||
|
|
||||||
TestWritableFile::TestWritableFile(const FileState& state,
|
|
||||||
WritableFile* f,
|
|
||||||
FaultInjectionTestEnv* env)
|
|
||||||
: state_(state),
|
|
||||||
target_(f),
|
|
||||||
writable_file_opened_(true),
|
|
||||||
env_(env) {
|
|
||||||
assert(f != NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
TestWritableFile::~TestWritableFile() {
|
|
||||||
if (writable_file_opened_) {
|
|
||||||
Close();
|
|
||||||
}
|
|
||||||
delete target_;
|
|
||||||
}
|
|
||||||
|
|
||||||
Status TestWritableFile::Append(const Slice& data) {
|
|
||||||
Status s = target_->Append(data);
|
|
||||||
if (s.ok() && env_->IsFilesystemActive()) {
|
|
||||||
state_.pos_ += data.size();
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
Status TestWritableFile::Close() {
|
|
||||||
writable_file_opened_ = false;
|
|
||||||
Status s = target_->Close();
|
|
||||||
if (s.ok()) {
|
|
||||||
env_->WritableFileClosed(state_);
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
Status TestWritableFile::Flush() {
|
|
||||||
Status s = target_->Flush();
|
|
||||||
if (s.ok() && env_->IsFilesystemActive()) {
|
|
||||||
state_.pos_at_last_flush_ = state_.pos_;
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
Status TestWritableFile::SyncParent() {
|
|
||||||
Status s = SyncDir(GetDirName(state_.filename_));
|
|
||||||
if (s.ok()) {
|
|
||||||
env_->DirWasSynced();
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
Status TestWritableFile::Sync() {
|
|
||||||
if (!env_->IsFilesystemActive()) {
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
// Ensure new files referred to by the manifest are in the filesystem.
|
|
||||||
Status s = target_->Sync();
|
|
||||||
if (s.ok()) {
|
|
||||||
state_.pos_at_last_sync_ = state_.pos_;
|
|
||||||
}
|
|
||||||
if (env_->IsFileCreatedSinceLastDirSync(state_.filename_)) {
|
|
||||||
Status ps = SyncParent();
|
|
||||||
if (s.ok() && !ps.ok()) {
|
|
||||||
s = ps;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
Status FaultInjectionTestEnv::NewWritableFile(const std::string& fname,
|
|
||||||
WritableFile** result) {
|
|
||||||
WritableFile* actual_writable_file;
|
|
||||||
Status s = target()->NewWritableFile(fname, &actual_writable_file);
|
|
||||||
if (s.ok()) {
|
|
||||||
FileState state(fname);
|
|
||||||
state.pos_ = 0;
|
|
||||||
*result = new TestWritableFile(state, actual_writable_file, this);
|
|
||||||
// NewWritableFile doesn't append to files, so if the same file is
|
|
||||||
// opened again then it will be truncated - so forget our saved
|
|
||||||
// state.
|
|
||||||
UntrackFile(fname);
|
|
||||||
MutexLock l(&mutex_);
|
|
||||||
new_files_since_last_dir_sync_.insert(fname);
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
Status FaultInjectionTestEnv::NewAppendableFile(const std::string& fname,
|
|
||||||
WritableFile** result) {
|
|
||||||
WritableFile* actual_writable_file;
|
|
||||||
Status s = target()->NewAppendableFile(fname, &actual_writable_file);
|
|
||||||
if (s.ok()) {
|
|
||||||
FileState state(fname);
|
|
||||||
state.pos_ = 0;
|
|
||||||
{
|
|
||||||
MutexLock l(&mutex_);
|
|
||||||
if (db_file_state_.count(fname) == 0) {
|
|
||||||
new_files_since_last_dir_sync_.insert(fname);
|
|
||||||
} else {
|
|
||||||
state = db_file_state_[fname];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*result = new TestWritableFile(state, actual_writable_file, this);
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
Status FaultInjectionTestEnv::DropUnsyncedFileData() {
|
|
||||||
Status s;
|
|
||||||
MutexLock l(&mutex_);
|
|
||||||
for (std::map<std::string, FileState>::const_iterator it =
|
|
||||||
db_file_state_.begin();
|
|
||||||
s.ok() && it != db_file_state_.end(); ++it) {
|
|
||||||
const FileState& state = it->second;
|
|
||||||
if (!state.IsFullySynced()) {
|
|
||||||
s = state.DropUnsyncedData();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
void FaultInjectionTestEnv::DirWasSynced() {
|
|
||||||
MutexLock l(&mutex_);
|
|
||||||
new_files_since_last_dir_sync_.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool FaultInjectionTestEnv::IsFileCreatedSinceLastDirSync(
|
|
||||||
const std::string& filename) {
|
|
||||||
MutexLock l(&mutex_);
|
|
||||||
return new_files_since_last_dir_sync_.find(filename) !=
|
|
||||||
new_files_since_last_dir_sync_.end();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Drops all bookkeeping for file `f`: its saved FileState and its
// "created since last directory sync" marker.
void FaultInjectionTestEnv::UntrackFile(const std::string& f) {
  MutexLock guard(&mutex_);
  db_file_state_.erase(f);
  new_files_since_last_dir_sync_.erase(f);
}
|
|
||||||
|
|
||||||
// Deletes `f` through EnvWrapper and, if that succeeded, stops tracking it.
// The deletion status is also checked with ASSERT_OK.
Status FaultInjectionTestEnv::DeleteFile(const std::string& f) {
  Status status = EnvWrapper::DeleteFile(f);
  ASSERT_OK(status);
  if (!status.ok()) {
    return status;
  }
  UntrackFile(f);
  return status;
}
|
|
||||||
|
|
||||||
// Renames `s` to `t` through EnvWrapper. When the rename succeeds, the
// bookkeeping kept under the old name is moved over to the new name.
Status FaultInjectionTestEnv::RenameFile(const std::string& s,
                                         const std::string& t) {
  Status rename_status = EnvWrapper::RenameFile(s, t);
  if (!rename_status.ok()) {
    return rename_status;
  }

  MutexLock guard(&mutex_);

  // Move the saved file state, if the source was tracked.
  const bool source_tracked = db_file_state_.find(s) != db_file_state_.end();
  if (source_tracked) {
    db_file_state_[t] = db_file_state_[s];
    db_file_state_.erase(s);
  }

  // Move the "new since last dir sync" marker, if the source carried one.
  const bool source_was_new = new_files_since_last_dir_sync_.erase(s) != 0;
  if (source_was_new) {
    assert(new_files_since_last_dir_sync_.find(t) ==
           new_files_since_last_dir_sync_.end());
    new_files_since_last_dir_sync_.insert(t);
  }

  return rename_status;
}
|
|
||||||
|
|
||||||
// Re-activates the simulated filesystem.
void FaultInjectionTestEnv::ResetState() {
  // The database is not being destroyed here, so the files that already
  // exist must keep their recorded synced/flushed state. That is why
  // db_file_state_ and new_files_since_last_dir_sync_ are left alone.
  MutexLock guard(&mutex_);
  SetFilesystemActive(true);
}
|
|
||||||
|
|
||||||
// Deletes every file recorded as created since the last directory sync.
// Stops at, and returns, the first non-OK status from DeleteFile().
Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() {
  // DeleteFile() accesses new_files_since_last_dir_sync_ as well, so work on
  // a snapshot taken under the lock to avoid deadlock.
  mutex_.Lock();
  std::set<std::string> snapshot(new_files_since_last_dir_sync_.begin(),
                                 new_files_since_last_dir_sync_.end());
  mutex_.Unlock();

  Status status;
  std::set<std::string>::const_iterator name = snapshot.begin();
  while (status.ok() && name != snapshot.end()) {
    status = DeleteFile(*name);
    ++name;
  }
  return status;
}
|
|
||||||
|
|
||||||
// Stores `state` in db_file_state_ under the file's own name, replacing any
// state previously saved for that name.
void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) {
  MutexLock guard(&mutex_);
  db_file_state_[state.filename_] = state;
}
|
|
||||||
|
|
||||||
// Truncates the file to the position recorded at its last sync. A recorded
// position of -1 is treated as 0, so the whole file is truncated away.
Status FileState::DropUnsyncedData() const {
  ssize_t keep_bytes = 0;
  if (pos_at_last_sync_ != -1) {
    keep_bytes = pos_at_last_sync_;
  }
  return Truncate(filename_, keep_bytes);
}
|
|
||||||
|
|
||||||
class FaultInjectionTest {
|
|
||||||
public:
|
|
||||||
enum ExpectedVerifResult { VAL_EXPECT_NO_ERROR, VAL_EXPECT_ERROR };
|
|
||||||
enum ResetMethod { RESET_DROP_UNSYNCED_DATA, RESET_DELETE_UNSYNCED_FILES };
|
|
||||||
|
|
||||||
FaultInjectionTestEnv* env_;
|
|
||||||
std::string dbname_;
|
|
||||||
Cache* tiny_cache_;
|
|
||||||
Options options_;
|
|
||||||
DB* db_;
|
|
||||||
|
|
||||||
FaultInjectionTest()
|
|
||||||
: env_(new FaultInjectionTestEnv),
|
|
||||||
tiny_cache_(NewLRUCache(100)),
|
|
||||||
db_(NULL) {
|
|
||||||
dbname_ = test::TmpDir() + "/fault_test";
|
|
||||||
DestroyDB(dbname_, Options()); // Destroy any db from earlier run
|
|
||||||
options_.reuse_logs = true;
|
|
||||||
options_.env = env_;
|
|
||||||
options_.paranoid_checks = true;
|
|
||||||
options_.block_cache = tiny_cache_;
|
|
||||||
options_.create_if_missing = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
~FaultInjectionTest() {
|
|
||||||
CloseDB();
|
|
||||||
DestroyDB(dbname_, Options());
|
|
||||||
delete tiny_cache_;
|
|
||||||
delete env_;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ReuseLogs(bool reuse) {
|
|
||||||
options_.reuse_logs = reuse;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Build(int start_idx, int num_vals) {
|
|
||||||
std::string key_space, value_space;
|
|
||||||
WriteBatch batch;
|
|
||||||
for (int i = start_idx; i < start_idx + num_vals; i++) {
|
|
||||||
Slice key = Key(i, &key_space);
|
|
||||||
batch.Clear();
|
|
||||||
batch.Put(key, Value(i, &value_space));
|
|
||||||
WriteOptions options;
|
|
||||||
ASSERT_OK(db_->Write(options, &batch));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Status ReadValue(int i, std::string* val) const {
|
|
||||||
std::string key_space, value_space;
|
|
||||||
Slice key = Key(i, &key_space);
|
|
||||||
Value(i, &value_space);
|
|
||||||
ReadOptions options;
|
|
||||||
return db_->Get(options, key, val);
|
|
||||||
}
|
|
||||||
|
|
||||||
Status Verify(int start_idx, int num_vals,
|
|
||||||
ExpectedVerifResult expected) const {
|
|
||||||
std::string val;
|
|
||||||
std::string value_space;
|
|
||||||
Status s;
|
|
||||||
for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
|
|
||||||
Value(i, &value_space);
|
|
||||||
s = ReadValue(i, &val);
|
|
||||||
if (expected == VAL_EXPECT_NO_ERROR) {
|
|
||||||
if (s.ok()) {
|
|
||||||
ASSERT_EQ(value_space, val);
|
|
||||||
}
|
|
||||||
} else if (s.ok()) {
|
|
||||||
fprintf(stderr, "Expected an error at %d, but was OK\n", i);
|
|
||||||
s = Status::IOError(dbname_, "Expected value error:");
|
|
||||||
} else {
|
|
||||||
s = Status::OK(); // An expected error
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return the ith key
|
|
||||||
Slice Key(int i, std::string* storage) const {
|
|
||||||
char buf[100];
|
|
||||||
snprintf(buf, sizeof(buf), "%016d", i);
|
|
||||||
storage->assign(buf, strlen(buf));
|
|
||||||
return Slice(*storage);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return the value to associate with the specified key
|
|
||||||
Slice Value(int k, std::string* storage) const {
|
|
||||||
Random r(k);
|
|
||||||
return test::RandomString(&r, kValueSize, storage);
|
|
||||||
}
|
|
||||||
|
|
||||||
Status OpenDB() {
|
|
||||||
delete db_;
|
|
||||||
db_ = NULL;
|
|
||||||
env_->ResetState();
|
|
||||||
return DB::Open(options_, dbname_, &db_);
|
|
||||||
}
|
|
||||||
|
|
||||||
void CloseDB() {
|
|
||||||
delete db_;
|
|
||||||
db_ = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void DeleteAllData() {
|
|
||||||
Iterator* iter = db_->NewIterator(ReadOptions());
|
|
||||||
WriteOptions options;
|
|
||||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
||||||
ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
|
|
||||||
}
|
|
||||||
|
|
||||||
delete iter;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ResetDBState(ResetMethod reset_method) {
|
|
||||||
switch (reset_method) {
|
|
||||||
case RESET_DROP_UNSYNCED_DATA:
|
|
||||||
ASSERT_OK(env_->DropUnsyncedFileData());
|
|
||||||
break;
|
|
||||||
case RESET_DELETE_UNSYNCED_FILES:
|
|
||||||
ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
assert(false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
|
|
||||||
DeleteAllData();
|
|
||||||
Build(0, num_pre_sync);
|
|
||||||
db_->CompactRange(NULL, NULL);
|
|
||||||
Build(num_pre_sync, num_post_sync);
|
|
||||||
}
|
|
||||||
|
|
||||||
void PartialCompactTestReopenWithFault(ResetMethod reset_method,
|
|
||||||
int num_pre_sync,
|
|
||||||
int num_post_sync) {
|
|
||||||
env_->SetFilesystemActive(false);
|
|
||||||
CloseDB();
|
|
||||||
ResetDBState(reset_method);
|
|
||||||
ASSERT_OK(OpenDB());
|
|
||||||
ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::VAL_EXPECT_NO_ERROR));
|
|
||||||
ASSERT_OK(Verify(num_pre_sync, num_post_sync, FaultInjectionTest::VAL_EXPECT_ERROR));
|
|
||||||
}
|
|
||||||
|
|
||||||
void NoWriteTestPreFault() {
|
|
||||||
}
|
|
||||||
|
|
||||||
void NoWriteTestReopenWithFault(ResetMethod reset_method) {
|
|
||||||
CloseDB();
|
|
||||||
ResetDBState(reset_method);
|
|
||||||
ASSERT_OK(OpenDB());
|
|
||||||
}
|
|
||||||
|
|
||||||
void DoTest() {
|
|
||||||
Random rnd(0);
|
|
||||||
ASSERT_OK(OpenDB());
|
|
||||||
for (size_t idx = 0; idx < kNumIterations; idx++) {
|
|
||||||
int num_pre_sync = rnd.Uniform(kMaxNumValues);
|
|
||||||
int num_post_sync = rnd.Uniform(kMaxNumValues);
|
|
||||||
|
|
||||||
PartialCompactTestPreFault(num_pre_sync, num_post_sync);
|
|
||||||
PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA,
|
|
||||||
num_pre_sync,
|
|
||||||
num_post_sync);
|
|
||||||
|
|
||||||
NoWriteTestPreFault();
|
|
||||||
NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA);
|
|
||||||
|
|
||||||
PartialCompactTestPreFault(num_pre_sync, num_post_sync);
|
|
||||||
// No new files created so we expect all values since no files will be
|
|
||||||
// dropped.
|
|
||||||
PartialCompactTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES,
|
|
||||||
num_pre_sync + num_post_sync,
|
|
||||||
0);
|
|
||||||
|
|
||||||
NoWriteTestPreFault();
|
|
||||||
NoWriteTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Runs the fault-injection scenario with log reuse switched off.
TEST(FaultInjectionTest, FaultTestNoLogReuse) {
  ReuseLogs(false);
  DoTest();
}
|
|
||||||
|
|
||||||
// Runs the fault-injection scenario with log reuse switched on.
TEST(FaultInjectionTest, FaultTestWithLogReuse) {
  ReuseLogs(true);
  DoTest();
}
|
|
||||||
|
|
||||||
} // namespace leveldb
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
return leveldb::test::RunAllTests();
|
|
||||||
}
|
|
|
@ -4,9 +4,14 @@
|
||||||
|
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
#include "db/filename.h"
|
#include "db/filename.h"
|
||||||
#include "db/dbformat.h"
|
#include "db/dbformat.h"
|
||||||
|
#include "db/version_set.h"
|
||||||
#include "leveldb/env.h"
|
#include "leveldb/env.h"
|
||||||
|
#include "leveldb/status.h"
|
||||||
#include "util/logging.h"
|
#include "util/logging.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
@ -24,19 +29,50 @@ static std::string MakeFileName(const std::string& name, uint64_t number,
|
||||||
return name + buf;
|
return name + buf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::string MakeFileName2(const Options & options, uint64_t number,
|
||||||
|
int level, const char* suffix) {
|
||||||
|
char buf[100];
|
||||||
|
if (0<=level)
|
||||||
|
snprintf(buf, sizeof(buf), "/%s_%-d/%06llu.%s",
|
||||||
|
suffix, level,
|
||||||
|
static_cast<unsigned long long>(number),
|
||||||
|
suffix);
|
||||||
|
else if (-1==level)
|
||||||
|
snprintf(buf, sizeof(buf), "/%s/%06llu.%s",
|
||||||
|
suffix,
|
||||||
|
static_cast<unsigned long long>(number),
|
||||||
|
suffix);
|
||||||
|
else if (-2==level)
|
||||||
|
snprintf(buf, sizeof(buf), "/%06llu.%s",
|
||||||
|
static_cast<unsigned long long>(number),
|
||||||
|
suffix);
|
||||||
|
|
||||||
|
return((level<(int)options.tiered_slow_level ?
|
||||||
|
options.tiered_fast_prefix : options.tiered_slow_prefix) + buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string MakeDirName2(const Options & options,
|
||||||
|
int level, const char* suffix) {
|
||||||
|
char buf[100];
|
||||||
|
if (-1!=level)
|
||||||
|
snprintf(buf, sizeof(buf), "/%s_%-d",
|
||||||
|
suffix, level);
|
||||||
|
else
|
||||||
|
snprintf(buf, sizeof(buf), "/%s",
|
||||||
|
suffix);
|
||||||
|
|
||||||
|
return((level<(int)options.tiered_slow_level ?
|
||||||
|
options.tiered_fast_prefix : options.tiered_slow_prefix) + buf);
|
||||||
|
}
|
||||||
|
|
||||||
std::string LogFileName(const std::string& name, uint64_t number) {
|
std::string LogFileName(const std::string& name, uint64_t number) {
|
||||||
assert(number > 0);
|
assert(number > 0);
|
||||||
return MakeFileName(name, number, "log");
|
return MakeFileName(name, number, "log");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string TableFileName(const std::string& name, uint64_t number) {
|
std::string TableFileName(const Options & options, uint64_t number, int level) {
|
||||||
assert(number > 0);
|
assert(number > 0);
|
||||||
return MakeFileName(name, number, "ldb");
|
return MakeFileName2(options, number, level, "sst");
|
||||||
}
|
|
||||||
|
|
||||||
std::string SSTTableFileName(const std::string& name, uint64_t number) {
|
|
||||||
assert(number > 0);
|
|
||||||
return MakeFileName(name, number, "sst");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
|
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
|
||||||
|
@ -69,6 +105,36 @@ std::string OldInfoLogFileName(const std::string& dbname) {
|
||||||
return dbname + "/LOG.old";
|
return dbname + "/LOG.old";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Returns the path of the "COW" file inside `dbname`, i.e. dbname + "/COW".
std::string CowFileName(const std::string& dbname) {
  std::string path(dbname);
  path.append("/COW");
  return path;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Append appropriate "backup" string to input path:
//   backup_num == 0  ->  dbname + "/backup"
//   otherwise        ->  dbname + "/backup.<backup_num>"
std::string BackupPath(const std::string& dbname, int backup_num) {
  char buf[100];
  if (0 != backup_num)
    snprintf(buf, sizeof(buf), "/backup.%-d", backup_num);
  else
    snprintf(buf, sizeof(buf), "/backup");

  return(dbname + buf);
}
|
||||||
|
|
||||||
|
|
||||||
|
// update tiered_fast_prefix and tiered_slow_prefix members of
|
||||||
|
// given Options object to point to desired backup path
|
||||||
|
bool SetBackupPaths(Options & options, int backup_num) {
|
||||||
|
|
||||||
|
options.tiered_fast_prefix = BackupPath(options.tiered_fast_prefix, backup_num);
|
||||||
|
options.tiered_slow_prefix = BackupPath(options.tiered_slow_prefix, backup_num);
|
||||||
|
|
||||||
|
return(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Owned filenames have the form:
|
// Owned filenames have the form:
|
||||||
// dbname/CURRENT
|
// dbname/CURRENT
|
||||||
|
@ -76,7 +142,8 @@ std::string OldInfoLogFileName(const std::string& dbname) {
|
||||||
// dbname/LOG
|
// dbname/LOG
|
||||||
// dbname/LOG.old
|
// dbname/LOG.old
|
||||||
// dbname/MANIFEST-[0-9]+
|
// dbname/MANIFEST-[0-9]+
|
||||||
// dbname/[0-9]+.(log|sst|ldb)
|
// dbname/[0-9]+.(log|sst)
|
||||||
|
// dbname/COW
|
||||||
bool ParseFileName(const std::string& fname,
|
bool ParseFileName(const std::string& fname,
|
||||||
uint64_t* number,
|
uint64_t* number,
|
||||||
FileType* type) {
|
FileType* type) {
|
||||||
|
@ -84,6 +151,9 @@ bool ParseFileName(const std::string& fname,
|
||||||
if (rest == "CURRENT") {
|
if (rest == "CURRENT") {
|
||||||
*number = 0;
|
*number = 0;
|
||||||
*type = kCurrentFile;
|
*type = kCurrentFile;
|
||||||
|
} else if (rest == "COW") {
|
||||||
|
*number = 0;
|
||||||
|
*type = kCacheWarming;
|
||||||
} else if (rest == "LOCK") {
|
} else if (rest == "LOCK") {
|
||||||
*number = 0;
|
*number = 0;
|
||||||
*type = kDBLockFile;
|
*type = kDBLockFile;
|
||||||
|
@ -111,7 +181,7 @@ bool ParseFileName(const std::string& fname,
|
||||||
Slice suffix = rest;
|
Slice suffix = rest;
|
||||||
if (suffix == Slice(".log")) {
|
if (suffix == Slice(".log")) {
|
||||||
*type = kLogFile;
|
*type = kLogFile;
|
||||||
} else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) {
|
} else if (suffix == Slice(".sst")) {
|
||||||
*type = kTableFile;
|
*type = kTableFile;
|
||||||
} else if (suffix == Slice(".dbtmp")) {
|
} else if (suffix == Slice(".dbtmp")) {
|
||||||
*type = kTempFile;
|
*type = kTempFile;
|
||||||
|
@ -141,4 +211,99 @@ Status SetCurrentFile(Env* env, const std::string& dbname,
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Asks `env` to create the "sst_<level>" directory for every level in
// [0, config::kNumLevels). The returned status is the default-constructed
// one: results of CreateDir are deliberately not inspected.
Status
MakeLevelDirectories(Env * env, const Options & options)
{
  Status ret_stat;

  for (int level = 0; level < config::kNumLevels && ret_stat.ok(); ++level)
  {
    const std::string level_dir = MakeDirName2(options, level, "sst");

    // ignoring error since no way to tell if "bad" error, or "already exists" error
    env->CreateDir(level_dir.c_str());
  }   // for

  return ret_stat;

}   // MakeLevelDirectories
|
||||||
|
|
||||||
|
|
||||||
|
bool
|
||||||
|
TestForLevelDirectories(
|
||||||
|
Env * env,
|
||||||
|
const Options & options,
|
||||||
|
Version * version)
|
||||||
|
{
|
||||||
|
bool ret_flag, again;
|
||||||
|
int level;
|
||||||
|
std::string dirname;
|
||||||
|
|
||||||
|
ret_flag=true;
|
||||||
|
again=true;
|
||||||
|
|
||||||
|
// walk backwards, fault will be in higher levels if partial conversion
|
||||||
|
for (level=config::kNumLevels-1; 0<=level && again; --level)
|
||||||
|
{
|
||||||
|
again=false;
|
||||||
|
|
||||||
|
// does directory exist
|
||||||
|
dirname=MakeDirName2(options, level, "sst");
|
||||||
|
ret_flag=env->FileExists(dirname.c_str());
|
||||||
|
|
||||||
|
// do all files exist in level
|
||||||
|
if (ret_flag)
|
||||||
|
{
|
||||||
|
const std::vector<FileMetaData*> & level_files(version->GetFileList(level));
|
||||||
|
std::vector<FileMetaData*>::const_iterator it;
|
||||||
|
std::string table_name;
|
||||||
|
Status s;
|
||||||
|
|
||||||
|
for (it=level_files.begin(); level_files.end()!=it && ret_flag; ++it)
|
||||||
|
{
|
||||||
|
table_name=TableFileName(options, (*it)->number, level);
|
||||||
|
ret_flag=env->FileExists(table_name.c_str());
|
||||||
|
} // for
|
||||||
|
|
||||||
|
again=ret_flag && 0==level_files.size();
|
||||||
|
} // if
|
||||||
|
} // for
|
||||||
|
|
||||||
|
return(ret_flag);
|
||||||
|
|
||||||
|
} // TestForLevelDirectories
|
||||||
|
|
||||||
|
std::string // replacement dbname ... potentially tiered
|
||||||
|
MakeTieredDbname(
|
||||||
|
const std::string & dbname, // input ... original dbname from DBImpl constructor
|
||||||
|
Options & options) // input/output ... writable Options, tiered values changed
|
||||||
|
{
|
||||||
|
// case for "", used with internal calls to DestroyDB
|
||||||
|
if (0==dbname.size() && 0!=options.tiered_fast_prefix.size())
|
||||||
|
{
|
||||||
|
// do NOTHING ... options already initialized
|
||||||
|
} // if
|
||||||
|
else if (0<(int)options.tiered_slow_level && (int)options.tiered_slow_level<config::kNumLevels
|
||||||
|
&& 0!=options.tiered_fast_prefix.size() && 0!=options.tiered_slow_prefix.size())
|
||||||
|
{
|
||||||
|
options.tiered_fast_prefix.append("/");
|
||||||
|
options.tiered_fast_prefix.append(dbname);
|
||||||
|
|
||||||
|
options.tiered_slow_prefix.append("/");
|
||||||
|
options.tiered_slow_prefix.append(dbname);
|
||||||
|
} // else if
|
||||||
|
else
|
||||||
|
{
|
||||||
|
options.tiered_slow_level=0;
|
||||||
|
options.tiered_fast_prefix=dbname; // duplicate as is
|
||||||
|
options.tiered_slow_prefix=dbname;
|
||||||
|
} // else
|
||||||
|
|
||||||
|
return(options.tiered_fast_prefix);
|
||||||
|
|
||||||
|
} // MakeTieredDbname
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include "leveldb/options.h"
|
||||||
#include "leveldb/slice.h"
|
#include "leveldb/slice.h"
|
||||||
#include "leveldb/status.h"
|
#include "leveldb/status.h"
|
||||||
#include "port/port.h"
|
#include "port/port.h"
|
||||||
|
@ -16,6 +17,7 @@
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
class Env;
|
class Env;
|
||||||
|
class Version;
|
||||||
|
|
||||||
enum FileType {
|
enum FileType {
|
||||||
kLogFile,
|
kLogFile,
|
||||||
|
@ -24,9 +26,24 @@ enum FileType {
|
||||||
kDescriptorFile,
|
kDescriptorFile,
|
||||||
kCurrentFile,
|
kCurrentFile,
|
||||||
kTempFile,
|
kTempFile,
|
||||||
kInfoLogFile // Either the current one, or an old one
|
kInfoLogFile, // Either the current one, or an old one
|
||||||
|
kCacheWarming
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Riak specific routine to help create sst_? subdirectory names
|
||||||
|
std::string MakeDirName2(const Options & options,
|
||||||
|
int level, const char* suffix);
|
||||||
|
|
||||||
|
// Riak specific routine to help create sst_? subdirectories
|
||||||
|
Status MakeLevelDirectories(Env * env, const Options & options);
|
||||||
|
|
||||||
|
// Riak specific routine to test if sst_? subdirectories exist
|
||||||
|
bool TestForLevelDirectories(Env * env, const Options & options, class Version *);
|
||||||
|
|
||||||
|
// Riak specific routine to standardize conversion of dbname and
|
||||||
|
// Options' tiered directories (options parameter is MODIFIED)
|
||||||
|
std::string MakeTieredDbname(const std::string &dbname, Options & options_rw);
|
||||||
|
|
||||||
// Return the name of the log file with the specified number
|
// Return the name of the log file with the specified number
|
||||||
// in the db named by "dbname". The result will be prefixed with
|
// in the db named by "dbname". The result will be prefixed with
|
||||||
// "dbname".
|
// "dbname".
|
||||||
|
@ -35,12 +52,8 @@ extern std::string LogFileName(const std::string& dbname, uint64_t number);
|
||||||
// Return the name of the sstable with the specified number
|
// Return the name of the sstable with the specified number
|
||||||
// in the db named by "dbname". The result will be prefixed with
|
// in the db named by "dbname". The result will be prefixed with
|
||||||
// "dbname".
|
// "dbname".
|
||||||
extern std::string TableFileName(const std::string& dbname, uint64_t number);
|
extern std::string TableFileName(const Options & options, uint64_t number,
|
||||||
|
int level);
|
||||||
// Return the legacy file name for an sstable with the specified number
|
|
||||||
// in the db named by "dbname". The result will be prefixed with
|
|
||||||
// "dbname".
|
|
||||||
extern std::string SSTTableFileName(const std::string& dbname, uint64_t number);
|
|
||||||
|
|
||||||
// Return the name of the descriptor file for the db named by
|
// Return the name of the descriptor file for the db named by
|
||||||
// "dbname" and the specified incarnation number. The result will be
|
// "dbname" and the specified incarnation number. The result will be
|
||||||
|
@ -67,10 +80,21 @@ extern std::string InfoLogFileName(const std::string& dbname);
|
||||||
// Return the name of the old info log file for "dbname".
|
// Return the name of the old info log file for "dbname".
|
||||||
extern std::string OldInfoLogFileName(const std::string& dbname);
|
extern std::string OldInfoLogFileName(const std::string& dbname);
|
||||||
|
|
||||||
|
// Return the name of the cache object file for the db named by
|
||||||
|
// "dbname". The result will be prefixed with "dbname".
|
||||||
|
extern std::string CowFileName(const std::string& dbname);
|
||||||
|
|
||||||
|
// Append appropriate "backup" string to input path
|
||||||
|
extern std::string BackupPath(const std::string& dbname, int backup_num);
|
||||||
|
|
||||||
|
// update tiered_fast_prefix and tiered_slow_prefix members of
|
||||||
|
// given Options object to point to backup path
|
||||||
|
extern bool SetBackupPaths(Options & options, int backup_num);
|
||||||
|
|
||||||
// If filename is a leveldb file, store the type of the file in *type.
|
// If filename is a leveldb file, store the type of the file in *type.
|
||||||
// The number encoded in the filename is stored in *number. If the
|
// The number encoded in the filename is stored in *number. If the
|
||||||
// filename was successfully parsed, returns true. Else return false.
|
// filename was successfully parsed, returns true. Else return false.
|
||||||
extern bool ParseFileName(const std::string& filename,
|
extern bool ParseFileName(const std::string& tiered_filename,
|
||||||
uint64_t* number,
|
uint64_t* number,
|
||||||
FileType* type);
|
FileType* type);
|
||||||
|
|
||||||
|
|
|
@ -27,7 +27,6 @@ TEST(FileNameTest, Parse) {
|
||||||
{ "100.log", 100, kLogFile },
|
{ "100.log", 100, kLogFile },
|
||||||
{ "0.log", 0, kLogFile },
|
{ "0.log", 0, kLogFile },
|
||||||
{ "0.sst", 0, kTableFile },
|
{ "0.sst", 0, kTableFile },
|
||||||
{ "0.ldb", 0, kTableFile },
|
|
||||||
{ "CURRENT", 0, kCurrentFile },
|
{ "CURRENT", 0, kCurrentFile },
|
||||||
{ "LOCK", 0, kDBLockFile },
|
{ "LOCK", 0, kDBLockFile },
|
||||||
{ "MANIFEST-2", 2, kDescriptorFile },
|
{ "MANIFEST-2", 2, kDescriptorFile },
|
||||||
|
@ -71,13 +70,14 @@ TEST(FileNameTest, Parse) {
|
||||||
for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
|
for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
|
||||||
std::string f = errors[i];
|
std::string f = errors[i];
|
||||||
ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
|
ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(FileNameTest, Construction) {
|
TEST(FileNameTest, Construction) {
|
||||||
uint64_t number;
|
uint64_t number;
|
||||||
FileType type;
|
FileType type;
|
||||||
std::string fname;
|
std::string fname;
|
||||||
|
Options options;
|
||||||
|
|
||||||
fname = CurrentFileName("foo");
|
fname = CurrentFileName("foo");
|
||||||
ASSERT_EQ("foo/", std::string(fname.data(), 4));
|
ASSERT_EQ("foo/", std::string(fname.data(), 4));
|
||||||
|
@ -97,12 +97,40 @@ TEST(FileNameTest, Construction) {
|
||||||
ASSERT_EQ(192, number);
|
ASSERT_EQ(192, number);
|
||||||
ASSERT_EQ(kLogFile, type);
|
ASSERT_EQ(kLogFile, type);
|
||||||
|
|
||||||
fname = TableFileName("bar", 200);
|
options.tiered_fast_prefix="bar";
|
||||||
|
options.tiered_slow_prefix="bar";
|
||||||
|
fname = TableFileName(options, 200, 1);
|
||||||
ASSERT_EQ("bar/", std::string(fname.data(), 4));
|
ASSERT_EQ("bar/", std::string(fname.data(), 4));
|
||||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
ASSERT_EQ("sst_1/", std::string(fname.substr(4,6)));
|
||||||
|
ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type));
|
||||||
ASSERT_EQ(200, number);
|
ASSERT_EQ(200, number);
|
||||||
ASSERT_EQ(kTableFile, type);
|
ASSERT_EQ(kTableFile, type);
|
||||||
|
|
||||||
|
fname = TableFileName(options, 400, 4);
|
||||||
|
ASSERT_EQ("bar/", std::string(fname.data(), 4));
|
||||||
|
ASSERT_EQ("sst_4/", std::string(fname.substr(4,6)));
|
||||||
|
ASSERT_TRUE(ParseFileName(fname.c_str() + 10, &number, &type));
|
||||||
|
ASSERT_EQ(400, number);
|
||||||
|
ASSERT_EQ(kTableFile, type);
|
||||||
|
|
||||||
|
options.tiered_slow_level=4;
|
||||||
|
options.tiered_fast_prefix="fast";
|
||||||
|
options.tiered_slow_prefix="slow";
|
||||||
|
fname = TableFileName(options, 500, 3);
|
||||||
|
ASSERT_EQ("fast/", std::string(fname.data(), 5));
|
||||||
|
ASSERT_EQ("sst_3/", std::string(fname.substr(5,6)));
|
||||||
|
ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type));
|
||||||
|
ASSERT_EQ(500, number);
|
||||||
|
ASSERT_EQ(kTableFile, type);
|
||||||
|
|
||||||
|
fname = TableFileName(options, 600, 4);
|
||||||
|
ASSERT_EQ("slow/", std::string(fname.data(), 5));
|
||||||
|
ASSERT_EQ("sst_4/", std::string(fname.substr(5,6)));
|
||||||
|
ASSERT_TRUE(ParseFileName(fname.c_str() + 11, &number, &type));
|
||||||
|
ASSERT_EQ(600, number);
|
||||||
|
ASSERT_EQ(kTableFile, type);
|
||||||
|
|
||||||
|
|
||||||
fname = DescriptorFileName("bar", 100);
|
fname = DescriptorFileName("bar", 100);
|
||||||
ASSERT_EQ("bar/", std::string(fname.data(), 4));
|
ASSERT_EQ("bar/", std::string(fname.data(), 4));
|
||||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||||
|
@ -114,6 +142,48 @@ TEST(FileNameTest, Construction) {
|
||||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||||
ASSERT_EQ(999, number);
|
ASSERT_EQ(999, number);
|
||||||
ASSERT_EQ(kTempFile, type);
|
ASSERT_EQ(kTempFile, type);
|
||||||
|
|
||||||
|
fname = CowFileName("/what/goes/moo");
|
||||||
|
ASSERT_EQ("/what/goes/moo/COW", fname);
|
||||||
|
|
||||||
|
fname = BackupPath("/var/db/riak/data/leveldb/0",0);
|
||||||
|
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup", fname);
|
||||||
|
|
||||||
|
fname = BackupPath("/var/db/riak/data/leveldb/0",1);
|
||||||
|
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.1", fname);
|
||||||
|
|
||||||
|
fname = BackupPath("/var/db/riak/data/leveldb/0",5);
|
||||||
|
ASSERT_EQ("/var/db/riak/data/leveldb/0/backup.5", fname);
|
||||||
|
|
||||||
|
options.tiered_slow_level=4;
|
||||||
|
options.tiered_fast_prefix="fast";
|
||||||
|
options.tiered_slow_prefix="slow";
|
||||||
|
fname = SetBackupPaths(options,0);
|
||||||
|
ASSERT_EQ("fast/backup", options.tiered_fast_prefix);
|
||||||
|
ASSERT_EQ("slow/backup", options.tiered_slow_prefix);
|
||||||
|
|
||||||
|
options.tiered_slow_level=4;
|
||||||
|
options.tiered_fast_prefix="fast";
|
||||||
|
options.tiered_slow_prefix="slow";
|
||||||
|
fname = SetBackupPaths(options,3);
|
||||||
|
ASSERT_EQ("fast/backup.3", options.tiered_fast_prefix);
|
||||||
|
ASSERT_EQ("slow/backup.3", options.tiered_slow_prefix);
|
||||||
|
|
||||||
|
|
||||||
|
options.tiered_slow_level=4;
|
||||||
|
options.tiered_fast_prefix="//mnt/fast";
|
||||||
|
options.tiered_slow_prefix="//mnt/slow";
|
||||||
|
fname=MakeTieredDbname("riak/data/leveldb", options);
|
||||||
|
ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname);
|
||||||
|
ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix);
|
||||||
|
ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix);
|
||||||
|
|
||||||
|
// special case with no dbname given, should have no changes
|
||||||
|
fname=MakeTieredDbname("", options);
|
||||||
|
ASSERT_EQ("//mnt/fast/riak/data/leveldb", fname);
|
||||||
|
ASSERT_EQ("//mnt/fast/riak/data/leveldb", options.tiered_fast_prefix);
|
||||||
|
ASSERT_EQ("//mnt/slow/riak/data/leveldb", options.tiered_slow_prefix);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
|
@ -1,65 +0,0 @@
|
||||||
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include "leveldb/dumpfile.h"
|
|
||||||
#include "leveldb/env.h"
|
|
||||||
#include "leveldb/status.h"
|
|
||||||
|
|
||||||
namespace leveldb {
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
class StdoutPrinter : public WritableFile {
|
|
||||||
public:
|
|
||||||
virtual Status Append(const Slice& data) {
|
|
||||||
fwrite(data.data(), 1, data.size(), stdout);
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
virtual Status Close() { return Status::OK(); }
|
|
||||||
virtual Status Flush() { return Status::OK(); }
|
|
||||||
virtual Status Sync() { return Status::OK(); }
|
|
||||||
virtual std::string GetName() const { return "[stdout]"; }
|
|
||||||
};
|
|
||||||
|
|
||||||
// Dumps each of the `num` files named in `files` to stdout via DumpFile().
// A failing file has its status printed to stderr and processing continues
// with the next one. Returns true only if every file was dumped without
// error.
bool HandleDumpCommand(Env* env, char** files, int num) {
  StdoutPrinter printer;
  bool all_succeeded = true;
  for (int index = 0; index < num; ++index) {
    Status status = DumpFile(env, files[index], &printer);
    if (status.ok()) {
      continue;
    }
    fprintf(stderr, "%s\n", status.ToString().c_str());
    all_succeeded = false;
  }
  return all_succeeded;
}
|
|
||||||
|
|
||||||
} // namespace
|
|
||||||
} // namespace leveldb
|
|
||||||
|
|
||||||
// Prints the command-line help text to stderr.
static void Usage() {
  fputs("Usage: leveldbutil command...\n"
        " dump files... -- dump contents of specified files\n",
        stderr);
}
|
|
||||||
|
|
||||||
// leveldbutil entry point. The only supported command is "dump", which is
// handed every remaining argument as a file name. Exit code is 0 on success,
// 1 on a missing or unknown command or on a failed dump.
int main(int argc, char** argv) {
  leveldb::Env* env = leveldb::Env::Default();

  if (argc < 2) {
    Usage();
    return 1;
  }

  const std::string command = argv[1];
  if (command != "dump") {
    Usage();
    return 1;
  }

  const bool dumped = leveldb::HandleDumpCommand(env, argv+2, argc-2);
  return (dumped ? 0 : 1);
}
|
|
|
@ -3,7 +3,7 @@
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
//
|
//
|
||||||
// Log format information shared by reader and writer.
|
// Log format information shared by reader and writer.
|
||||||
// See ../doc/log_format.md for more detail.
|
// See ../doc/log_format.txt for more detail.
|
||||||
|
|
||||||
#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
|
#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
|
||||||
#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
|
#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
|
||||||
|
@ -26,8 +26,8 @@ static const int kMaxRecordType = kLastType;
|
||||||
|
|
||||||
static const int kBlockSize = 32768;
|
static const int kBlockSize = 32768;
|
||||||
|
|
||||||
// Header is checksum (4 bytes), length (2 bytes), type (1 byte).
|
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
|
||||||
static const int kHeaderSize = 4 + 2 + 1;
|
static const int kHeaderSize = 4 + 1 + 2;
|
||||||
|
|
||||||
} // namespace log
|
} // namespace log
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
|
@ -25,8 +25,7 @@ Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
|
||||||
eof_(false),
|
eof_(false),
|
||||||
last_record_offset_(0),
|
last_record_offset_(0),
|
||||||
end_of_buffer_offset_(0),
|
end_of_buffer_offset_(0),
|
||||||
initial_offset_(initial_offset),
|
initial_offset_(initial_offset) {
|
||||||
resyncing_(initial_offset > 0) {
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Reader::~Reader() {
|
Reader::~Reader() {
|
||||||
|
@ -73,25 +72,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
|
||||||
|
|
||||||
Slice fragment;
|
Slice fragment;
|
||||||
while (true) {
|
while (true) {
|
||||||
|
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
|
||||||
const unsigned int record_type = ReadPhysicalRecord(&fragment);
|
const unsigned int record_type = ReadPhysicalRecord(&fragment);
|
||||||
|
|
||||||
// ReadPhysicalRecord may have only had an empty trailer remaining in its
|
|
||||||
// internal buffer. Calculate the offset of the next physical record now
|
|
||||||
// that it has returned, properly accounting for its header size.
|
|
||||||
uint64_t physical_record_offset =
|
|
||||||
end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size();
|
|
||||||
|
|
||||||
if (resyncing_) {
|
|
||||||
if (record_type == kMiddleType) {
|
|
||||||
continue;
|
|
||||||
} else if (record_type == kLastType) {
|
|
||||||
resyncing_ = false;
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
resyncing_ = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (record_type) {
|
switch (record_type) {
|
||||||
case kFullType:
|
case kFullType:
|
||||||
if (in_fragmented_record) {
|
if (in_fragmented_record) {
|
||||||
|
@ -151,9 +133,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
|
||||||
|
|
||||||
case kEof:
|
case kEof:
|
||||||
if (in_fragmented_record) {
|
if (in_fragmented_record) {
|
||||||
// This can be caused by the writer dying immediately after
|
ReportCorruption(scratch->size(), "partial record without end(3)");
|
||||||
// writing a physical record but before completing the next; don't
|
|
||||||
// treat it as a corruption, just ignore the entire logical record.
|
|
||||||
scratch->clear();
|
scratch->clear();
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
@ -185,20 +165,20 @@ uint64_t Reader::LastRecordOffset() {
|
||||||
return last_record_offset_;
|
return last_record_offset_;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Reader::ReportCorruption(uint64_t bytes, const char* reason) {
|
void Reader::ReportCorruption(size_t bytes, const char* reason) {
|
||||||
ReportDrop(bytes, Status::Corruption(reason, file_->GetName()));
|
ReportDrop(bytes, Status::Corruption(reason));
|
||||||
}
|
}
|
||||||
|
|
||||||
void Reader::ReportDrop(uint64_t bytes, const Status& reason) {
|
void Reader::ReportDrop(size_t bytes, const Status& reason) {
|
||||||
if (reporter_ != NULL &&
|
if (reporter_ != NULL &&
|
||||||
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
|
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
|
||||||
reporter_->Corruption(static_cast<size_t>(bytes), reason);
|
reporter_->Corruption(bytes, reason);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
||||||
while (true) {
|
while (true) {
|
||||||
if (buffer_.size() < kHeaderSize) {
|
if (buffer_.size() < (size_t)kHeaderSize) {
|
||||||
if (!eof_) {
|
if (!eof_) {
|
||||||
// Last read was a full read, so this is a trailer to skip
|
// Last read was a full read, so this is a trailer to skip
|
||||||
buffer_.clear();
|
buffer_.clear();
|
||||||
|
@ -209,16 +189,17 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
||||||
ReportDrop(kBlockSize, status);
|
ReportDrop(kBlockSize, status);
|
||||||
eof_ = true;
|
eof_ = true;
|
||||||
return kEof;
|
return kEof;
|
||||||
} else if (buffer_.size() < kBlockSize) {
|
} else if (buffer_.size() < (size_t)kBlockSize) {
|
||||||
eof_ = true;
|
eof_ = true;
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
|
} else if (buffer_.size() == 0) {
|
||||||
|
// End of file
|
||||||
|
return kEof;
|
||||||
} else {
|
} else {
|
||||||
// Note that if buffer_ is non-empty, we have a truncated header at the
|
size_t drop_size = buffer_.size();
|
||||||
// end of the file, which can be caused by the writer crashing in the
|
|
||||||
// middle of writing the header. Instead of considering this an error,
|
|
||||||
// just report EOF.
|
|
||||||
buffer_.clear();
|
buffer_.clear();
|
||||||
|
ReportCorruption(drop_size, "truncated record at end of file");
|
||||||
return kEof;
|
return kEof;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -232,15 +213,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
||||||
if (kHeaderSize + length > buffer_.size()) {
|
if (kHeaderSize + length > buffer_.size()) {
|
||||||
size_t drop_size = buffer_.size();
|
size_t drop_size = buffer_.size();
|
||||||
buffer_.clear();
|
buffer_.clear();
|
||||||
if (!eof_) {
|
|
||||||
ReportCorruption(drop_size, "bad record length");
|
ReportCorruption(drop_size, "bad record length");
|
||||||
return kBadRecord;
|
return kBadRecord;
|
||||||
}
|
}
|
||||||
// If the end of the file has been reached without reading |length| bytes
|
|
||||||
// of payload, assume the writer died in the middle of writing the record.
|
|
||||||
// Don't report a corruption.
|
|
||||||
return kEof;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (type == kZeroType && length == 0) {
|
if (type == kZeroType && length == 0) {
|
||||||
// Skip zero length record without reporting any drops since
|
// Skip zero length record without reporting any drops since
|
||||||
|
|
|
@ -73,11 +73,6 @@ class Reader {
|
||||||
// Offset at which to start looking for the first record to return
|
// Offset at which to start looking for the first record to return
|
||||||
uint64_t const initial_offset_;
|
uint64_t const initial_offset_;
|
||||||
|
|
||||||
// True if we are resynchronizing after a seek (initial_offset_ > 0). In
|
|
||||||
// particular, a run of kMiddleType and kLastType records can be silently
|
|
||||||
// skipped in this mode
|
|
||||||
bool resyncing_;
|
|
||||||
|
|
||||||
// Extend record types with the following special values
|
// Extend record types with the following special values
|
||||||
enum {
|
enum {
|
||||||
kEof = kMaxRecordType + 1,
|
kEof = kMaxRecordType + 1,
|
||||||
|
@ -99,8 +94,8 @@ class Reader {
|
||||||
|
|
||||||
// Reports dropped bytes to the reporter.
|
// Reports dropped bytes to the reporter.
|
||||||
// buffer_ must be updated to remove the dropped bytes prior to invocation.
|
// buffer_ must be updated to remove the dropped bytes prior to invocation.
|
||||||
void ReportCorruption(uint64_t bytes, const char* reason);
|
void ReportCorruption(size_t bytes, const char* reason);
|
||||||
void ReportDrop(uint64_t bytes, const Status& reason);
|
void ReportDrop(size_t bytes, const Status& reason);
|
||||||
|
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
Reader(const Reader&);
|
Reader(const Reader&);
|
||||||
|
|
|
@ -79,7 +79,7 @@ class LogTest {
|
||||||
virtual Status Skip(uint64_t n) {
|
virtual Status Skip(uint64_t n) {
|
||||||
if (n > contents_.size()) {
|
if (n > contents_.size()) {
|
||||||
contents_.clear();
|
contents_.clear();
|
||||||
return Status::NotFound("in-memory file skipped past end");
|
return Status::NotFound("in-memory file skipepd past end");
|
||||||
}
|
}
|
||||||
|
|
||||||
contents_.remove_prefix(n);
|
contents_.remove_prefix(n);
|
||||||
|
@ -104,34 +104,23 @@ class LogTest {
|
||||||
StringSource source_;
|
StringSource source_;
|
||||||
ReportCollector report_;
|
ReportCollector report_;
|
||||||
bool reading_;
|
bool reading_;
|
||||||
Writer* writer_;
|
Writer writer_;
|
||||||
Reader* reader_;
|
Reader reader_;
|
||||||
|
|
||||||
// Record metadata for testing initial offset functionality
|
// Record metadata for testing initial offset functionality
|
||||||
static size_t initial_offset_record_sizes_[];
|
static size_t initial_offset_record_sizes_[];
|
||||||
static uint64_t initial_offset_last_record_offsets_[];
|
static uint64_t initial_offset_last_record_offsets_[];
|
||||||
static int num_initial_offset_records_;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
LogTest() : reading_(false),
|
LogTest() : reading_(false),
|
||||||
writer_(new Writer(&dest_)),
|
writer_(&dest_),
|
||||||
reader_(new Reader(&source_, &report_, true/*checksum*/,
|
reader_(&source_, &report_, true/*checksum*/,
|
||||||
0/*initial_offset*/)) {
|
0/*initial_offset*/) {
|
||||||
}
|
|
||||||
|
|
||||||
~LogTest() {
|
|
||||||
delete writer_;
|
|
||||||
delete reader_;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ReopenForAppend() {
|
|
||||||
delete writer_;
|
|
||||||
writer_ = new Writer(&dest_, dest_.contents_.size());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Write(const std::string& msg) {
|
void Write(const std::string& msg) {
|
||||||
ASSERT_TRUE(!reading_) << "Write() after starting to read";
|
ASSERT_TRUE(!reading_) << "Write() after starting to read";
|
||||||
writer_->AddRecord(Slice(msg));
|
writer_.AddRecord(Slice(msg));
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t WrittenBytes() const {
|
size_t WrittenBytes() const {
|
||||||
|
@ -145,7 +134,7 @@ class LogTest {
|
||||||
}
|
}
|
||||||
std::string scratch;
|
std::string scratch;
|
||||||
Slice record;
|
Slice record;
|
||||||
if (reader_->ReadRecord(&record, &scratch)) {
|
if (reader_.ReadRecord(&record, &scratch)) {
|
||||||
return record.ToString();
|
return record.ToString();
|
||||||
} else {
|
} else {
|
||||||
return "EOF";
|
return "EOF";
|
||||||
|
@ -193,18 +182,13 @@ class LogTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
void WriteInitialOffsetLog() {
|
void WriteInitialOffsetLog() {
|
||||||
for (int i = 0; i < num_initial_offset_records_; i++) {
|
for (int i = 0; i < 4; i++) {
|
||||||
std::string record(initial_offset_record_sizes_[i],
|
std::string record(initial_offset_record_sizes_[i],
|
||||||
static_cast<char>('a' + i));
|
static_cast<char>('a' + i));
|
||||||
Write(record);
|
Write(record);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void StartReadingAt(uint64_t initial_offset) {
|
|
||||||
delete reader_;
|
|
||||||
reader_ = new Reader(&source_, &report_, true/*checksum*/, initial_offset);
|
|
||||||
}
|
|
||||||
|
|
||||||
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
|
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
|
||||||
WriteInitialOffsetLog();
|
WriteInitialOffsetLog();
|
||||||
reading_ = true;
|
reading_ = true;
|
||||||
|
@ -224,11 +208,6 @@ class LogTest {
|
||||||
source_.contents_ = Slice(dest_.contents_);
|
source_.contents_ = Slice(dest_.contents_);
|
||||||
Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
|
Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
|
||||||
initial_offset);
|
initial_offset);
|
||||||
|
|
||||||
// Read all records from expected_record_offset through the last one.
|
|
||||||
ASSERT_LT(expected_record_offset, num_initial_offset_records_);
|
|
||||||
for (; expected_record_offset < num_initial_offset_records_;
|
|
||||||
++expected_record_offset) {
|
|
||||||
Slice record;
|
Slice record;
|
||||||
std::string scratch;
|
std::string scratch;
|
||||||
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
|
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
|
||||||
|
@ -237,35 +216,24 @@ class LogTest {
|
||||||
ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
|
ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
|
||||||
offset_reader->LastRecordOffset());
|
offset_reader->LastRecordOffset());
|
||||||
ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
|
ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
|
||||||
}
|
|
||||||
delete offset_reader;
|
delete offset_reader;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
size_t LogTest::initial_offset_record_sizes_[] =
|
size_t LogTest::initial_offset_record_sizes_[] =
|
||||||
{10000, // Two sizable records in first block
|
{10000, // Two sizable records in first block
|
||||||
10000,
|
10000,
|
||||||
2 * log::kBlockSize - 1000, // Span three blocks
|
2 * log::kBlockSize - 1000, // Span three blocks
|
||||||
1,
|
1};
|
||||||
13716, // Consume all but two bytes of block 3.
|
|
||||||
log::kBlockSize - kHeaderSize, // Consume the entirety of block 4.
|
|
||||||
};
|
|
||||||
|
|
||||||
uint64_t LogTest::initial_offset_last_record_offsets_[] =
|
uint64_t LogTest::initial_offset_last_record_offsets_[] =
|
||||||
{0,
|
{0,
|
||||||
kHeaderSize + 10000,
|
kHeaderSize + 10000,
|
||||||
2 * (kHeaderSize + 10000),
|
2 * (kHeaderSize + 10000),
|
||||||
2 * (kHeaderSize + 10000) +
|
2 * (kHeaderSize + 10000) +
|
||||||
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
|
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
|
||||||
2 * (kHeaderSize + 10000) +
|
|
||||||
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize
|
|
||||||
+ kHeaderSize + 1,
|
|
||||||
3 * log::kBlockSize,
|
|
||||||
};
|
|
||||||
|
|
||||||
// LogTest::initial_offset_last_record_offsets_ must be defined before this.
|
|
||||||
int LogTest::num_initial_offset_records_ =
|
|
||||||
sizeof(LogTest::initial_offset_last_record_offsets_)/sizeof(uint64_t);
|
|
||||||
|
|
||||||
TEST(LogTest, Empty) {
|
TEST(LogTest, Empty) {
|
||||||
ASSERT_EQ("EOF", Read());
|
ASSERT_EQ("EOF", Read());
|
||||||
|
@ -350,15 +318,6 @@ TEST(LogTest, AlignedEof) {
|
||||||
ASSERT_EQ("EOF", Read());
|
ASSERT_EQ("EOF", Read());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(LogTest, OpenForAppend) {
|
|
||||||
Write("hello");
|
|
||||||
ReopenForAppend();
|
|
||||||
Write("world");
|
|
||||||
ASSERT_EQ("hello", Read());
|
|
||||||
ASSERT_EQ("world", Read());
|
|
||||||
ASSERT_EQ("EOF", Read());
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(LogTest, RandomRead) {
|
TEST(LogTest, RandomRead) {
|
||||||
const int N = 500;
|
const int N = 500;
|
||||||
Random write_rnd(301);
|
Random write_rnd(301);
|
||||||
|
@ -392,32 +351,20 @@ TEST(LogTest, BadRecordType) {
|
||||||
ASSERT_EQ("OK", MatchError("unknown record type"));
|
ASSERT_EQ("OK", MatchError("unknown record type"));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
|
TEST(LogTest, TruncatedTrailingRecord) {
|
||||||
Write("foo");
|
Write("foo");
|
||||||
ShrinkSize(4); // Drop all payload as well as a header byte
|
ShrinkSize(4); // Drop all payload as well as a header byte
|
||||||
ASSERT_EQ("EOF", Read());
|
ASSERT_EQ("EOF", Read());
|
||||||
// Truncated last record is ignored, not treated as an error.
|
ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
|
||||||
ASSERT_EQ(0, DroppedBytes());
|
ASSERT_EQ("OK", MatchError("truncated record at end of file"));
|
||||||
ASSERT_EQ("", ReportMessage());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(LogTest, BadLength) {
|
TEST(LogTest, BadLength) {
|
||||||
const int kPayloadSize = kBlockSize - kHeaderSize;
|
|
||||||
Write(BigString("bar", kPayloadSize));
|
|
||||||
Write("foo");
|
|
||||||
// Least significant size byte is stored in header[4].
|
|
||||||
IncrementByte(4, 1);
|
|
||||||
ASSERT_EQ("foo", Read());
|
|
||||||
ASSERT_EQ(kBlockSize, DroppedBytes());
|
|
||||||
ASSERT_EQ("OK", MatchError("bad record length"));
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(LogTest, BadLengthAtEndIsIgnored) {
|
|
||||||
Write("foo");
|
Write("foo");
|
||||||
ShrinkSize(1);
|
ShrinkSize(1);
|
||||||
ASSERT_EQ("EOF", Read());
|
ASSERT_EQ("EOF", Read());
|
||||||
ASSERT_EQ(0, DroppedBytes());
|
ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
|
||||||
ASSERT_EQ("", ReportMessage());
|
ASSERT_EQ("OK", MatchError("bad record length"));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(LogTest, ChecksumMismatch) {
|
TEST(LogTest, ChecksumMismatch) {
|
||||||
|
@ -468,40 +415,6 @@ TEST(LogTest, UnexpectedFirstType) {
|
||||||
ASSERT_EQ("OK", MatchError("partial record without end"));
|
ASSERT_EQ("OK", MatchError("partial record without end"));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(LogTest, MissingLastIsIgnored) {
|
|
||||||
Write(BigString("bar", kBlockSize));
|
|
||||||
// Remove the LAST block, including header.
|
|
||||||
ShrinkSize(14);
|
|
||||||
ASSERT_EQ("EOF", Read());
|
|
||||||
ASSERT_EQ("", ReportMessage());
|
|
||||||
ASSERT_EQ(0, DroppedBytes());
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(LogTest, PartialLastIsIgnored) {
|
|
||||||
Write(BigString("bar", kBlockSize));
|
|
||||||
// Cause a bad record length in the LAST block.
|
|
||||||
ShrinkSize(1);
|
|
||||||
ASSERT_EQ("EOF", Read());
|
|
||||||
ASSERT_EQ("", ReportMessage());
|
|
||||||
ASSERT_EQ(0, DroppedBytes());
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(LogTest, SkipIntoMultiRecord) {
|
|
||||||
// Consider a fragmented record:
|
|
||||||
// first(R1), middle(R1), last(R1), first(R2)
|
|
||||||
// If initial_offset points to a record after first(R1) but before first(R2)
|
|
||||||
// incomplete fragment errors are not actual errors, and must be suppressed
|
|
||||||
// until a new first or full record is encountered.
|
|
||||||
Write(BigString("foo", 3*kBlockSize));
|
|
||||||
Write("correct");
|
|
||||||
StartReadingAt(kBlockSize);
|
|
||||||
|
|
||||||
ASSERT_EQ("correct", Read());
|
|
||||||
ASSERT_EQ("", ReportMessage());
|
|
||||||
ASSERT_EQ(0, DroppedBytes());
|
|
||||||
ASSERT_EQ("EOF", Read());
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(LogTest, ErrorJoinsRecords) {
|
TEST(LogTest, ErrorJoinsRecords) {
|
||||||
// Consider two fragmented records:
|
// Consider two fragmented records:
|
||||||
// first(R1) last(R1) first(R2) last(R2)
|
// first(R1) last(R1) first(R2) last(R2)
|
||||||
|
@ -520,7 +433,7 @@ TEST(LogTest, ErrorJoinsRecords) {
|
||||||
|
|
||||||
ASSERT_EQ("correct", Read());
|
ASSERT_EQ("correct", Read());
|
||||||
ASSERT_EQ("EOF", Read());
|
ASSERT_EQ("EOF", Read());
|
||||||
const size_t dropped = DroppedBytes();
|
const int dropped = DroppedBytes();
|
||||||
ASSERT_LE(dropped, 2*kBlockSize + 100);
|
ASSERT_LE(dropped, 2*kBlockSize + 100);
|
||||||
ASSERT_GE(dropped, 2*kBlockSize);
|
ASSERT_GE(dropped, 2*kBlockSize);
|
||||||
}
|
}
|
||||||
|
@ -571,10 +484,6 @@ TEST(LogTest, ReadFourthStart) {
|
||||||
3);
|
3);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(LogTest, ReadInitialOffsetIntoBlockPadding) {
|
|
||||||
CheckInitialOffsetRecord(3 * log::kBlockSize - 3, 5);
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(LogTest, ReadEnd) {
|
TEST(LogTest, ReadEnd) {
|
||||||
CheckOffsetPastEndReturnsNoRecords(0);
|
CheckOffsetPastEndReturnsNoRecords(0);
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,22 +12,13 @@
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
namespace log {
|
namespace log {
|
||||||
|
|
||||||
static void InitTypeCrc(uint32_t* type_crc) {
|
|
||||||
for (int i = 0; i <= kMaxRecordType; i++) {
|
|
||||||
char t = static_cast<char>(i);
|
|
||||||
type_crc[i] = crc32c::Value(&t, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Writer::Writer(WritableFile* dest)
|
Writer::Writer(WritableFile* dest)
|
||||||
: dest_(dest),
|
: dest_(dest),
|
||||||
block_offset_(0) {
|
block_offset_(0) {
|
||||||
InitTypeCrc(type_crc_);
|
for (int i = 0; i <= kMaxRecordType; i++) {
|
||||||
}
|
char t = static_cast<char>(i);
|
||||||
|
type_crc_[i] = crc32c::Value(&t, 1);
|
||||||
Writer::Writer(WritableFile* dest, uint64_t dest_length)
|
}
|
||||||
: dest_(dest), block_offset_(dest_length % kBlockSize) {
|
|
||||||
InitTypeCrc(type_crc_);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Writer::~Writer() {
|
Writer::~Writer() {
|
||||||
|
@ -83,7 +74,7 @@ Status Writer::AddRecord(const Slice& slice) {
|
||||||
|
|
||||||
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
|
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
|
||||||
assert(n <= 0xffff); // Must fit in two bytes
|
assert(n <= 0xffff); // Must fit in two bytes
|
||||||
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
|
assert(block_offset_ + kHeaderSize + (int)n <= kBlockSize);
|
||||||
|
|
||||||
// Format the header
|
// Format the header
|
||||||
char buf[kHeaderSize];
|
char buf[kHeaderSize];
|
||||||
|
|
|
@ -9,11 +9,10 @@
|
||||||
#include "db/log_format.h"
|
#include "db/log_format.h"
|
||||||
#include "leveldb/slice.h"
|
#include "leveldb/slice.h"
|
||||||
#include "leveldb/status.h"
|
#include "leveldb/status.h"
|
||||||
|
#include "leveldb/env.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
class WritableFile;
|
|
||||||
|
|
||||||
namespace log {
|
namespace log {
|
||||||
|
|
||||||
class Writer {
|
class Writer {
|
||||||
|
@ -22,16 +21,12 @@ class Writer {
|
||||||
// "*dest" must be initially empty.
|
// "*dest" must be initially empty.
|
||||||
// "*dest" must remain live while this Writer is in use.
|
// "*dest" must remain live while this Writer is in use.
|
||||||
explicit Writer(WritableFile* dest);
|
explicit Writer(WritableFile* dest);
|
||||||
|
|
||||||
// Create a writer that will append data to "*dest".
|
|
||||||
// "*dest" must have initial length "dest_length".
|
|
||||||
// "*dest" must remain live while this Writer is in use.
|
|
||||||
Writer(WritableFile* dest, uint64_t dest_length);
|
|
||||||
|
|
||||||
~Writer();
|
~Writer();
|
||||||
|
|
||||||
Status AddRecord(const Slice& slice);
|
Status AddRecord(const Slice& slice);
|
||||||
|
|
||||||
|
void Close() {delete dest_; dest_=NULL;};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
WritableFile* dest_;
|
WritableFile* dest_;
|
||||||
int block_offset_; // Current offset in block
|
int block_offset_; // Current offset in block
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
#include "db/dbformat.h"
|
#include "db/dbformat.h"
|
||||||
#include "leveldb/comparator.h"
|
#include "leveldb/comparator.h"
|
||||||
#include "leveldb/env.h"
|
#include "leveldb/env.h"
|
||||||
|
#include "leveldb/expiry.h"
|
||||||
#include "leveldb/iterator.h"
|
#include "leveldb/iterator.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
|
|
||||||
|
@ -63,6 +64,8 @@ class MemTableIterator: public Iterator {
|
||||||
Slice key_slice = GetLengthPrefixedSlice(iter_.key());
|
Slice key_slice = GetLengthPrefixedSlice(iter_.key());
|
||||||
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
|
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
|
||||||
}
|
}
|
||||||
|
// Decodes the metadata of the entry the iterator currently points at into
// keymetadata_ and returns a reference to that member.
virtual KeyMetaData & keymetadata() const {
  const char* current_entry = iter_.key();
  MemTable::DecodeKeyMetaData(current_entry, keymetadata_);
  return keymetadata_;
}
|
||||||
|
|
||||||
virtual Status status() const { return Status::OK(); }
|
virtual Status status() const { return Status::OK(); }
|
||||||
|
|
||||||
|
@ -81,7 +84,8 @@ Iterator* MemTable::NewIterator() {
|
||||||
|
|
||||||
void MemTable::Add(SequenceNumber s, ValueType type,
|
void MemTable::Add(SequenceNumber s, ValueType type,
|
||||||
const Slice& key,
|
const Slice& key,
|
||||||
const Slice& value) {
|
const Slice& value,
|
||||||
|
const ExpiryTimeMicros & expiry) {
|
||||||
// Format of an entry is concatenation of:
|
// Format of an entry is concatenation of:
|
||||||
// key_size : varint32 of internal_key.size()
|
// key_size : varint32 of internal_key.size()
|
||||||
// key bytes : char[internal_key.size()]
|
// key bytes : char[internal_key.size()]
|
||||||
|
@ -89,7 +93,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
|
||||||
// value bytes : char[value.size()]
|
// value bytes : char[value.size()]
|
||||||
size_t key_size = key.size();
|
size_t key_size = key.size();
|
||||||
size_t val_size = value.size();
|
size_t val_size = value.size();
|
||||||
size_t internal_key_size = key_size + 8;
|
size_t internal_key_size = key_size + KeySuffixSize(type);
|
||||||
const size_t encoded_len =
|
const size_t encoded_len =
|
||||||
VarintLength(internal_key_size) + internal_key_size +
|
VarintLength(internal_key_size) + internal_key_size +
|
||||||
VarintLength(val_size) + val_size;
|
VarintLength(val_size) + val_size;
|
||||||
|
@ -97,15 +101,22 @@ void MemTable::Add(SequenceNumber s, ValueType type,
|
||||||
char* p = EncodeVarint32(buf, internal_key_size);
|
char* p = EncodeVarint32(buf, internal_key_size);
|
||||||
memcpy(p, key.data(), key_size);
|
memcpy(p, key.data(), key_size);
|
||||||
p += key_size;
|
p += key_size;
|
||||||
|
if (IsExpiryKey(type))
|
||||||
|
{
|
||||||
|
EncodeFixed64(p, expiry);
|
||||||
|
p+=8;
|
||||||
|
}
|
||||||
EncodeFixed64(p, (s << 8) | type);
|
EncodeFixed64(p, (s << 8) | type);
|
||||||
p += 8;
|
p += 8;
|
||||||
p = EncodeVarint32(p, val_size);
|
p = EncodeVarint32(p, val_size);
|
||||||
memcpy(p, value.data(), val_size);
|
memcpy(p, value.data(), val_size);
|
||||||
assert(p + val_size == buf + encoded_len);
|
assert((size_t)((p + val_size) - buf) == encoded_len);
|
||||||
table_.Insert(buf);
|
table_.Insert(buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
|
bool MemTable::Get(const LookupKey& key, Value* value, Status* s,
|
||||||
|
const Options * options) {
|
||||||
|
bool ret_flag(false);
|
||||||
Slice memkey = key.memtable_key();
|
Slice memkey = key.memtable_key();
|
||||||
Table::Iterator iter(&table_);
|
Table::Iterator iter(&table_);
|
||||||
iter.Seek(memkey.data());
|
iter.Seek(memkey.data());
|
||||||
|
@ -113,6 +124,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
|
||||||
// entry format is:
|
// entry format is:
|
||||||
// klength varint32
|
// klength varint32
|
||||||
// userkey char[klength]
|
// userkey char[klength]
|
||||||
|
// optional uint64
|
||||||
// tag uint64
|
// tag uint64
|
||||||
// vlength varint32
|
// vlength varint32
|
||||||
// value char[vlength]
|
// value char[vlength]
|
||||||
|
@ -122,24 +134,66 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
|
||||||
const char* entry = iter.key();
|
const char* entry = iter.key();
|
||||||
uint32_t key_length;
|
uint32_t key_length;
|
||||||
const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length);
|
const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length);
|
||||||
|
Slice internal_key(key_ptr, key_length);
|
||||||
if (comparator_.comparator.user_comparator()->Compare(
|
if (comparator_.comparator.user_comparator()->Compare(
|
||||||
Slice(key_ptr, key_length - 8),
|
ExtractUserKey(internal_key),
|
||||||
key.user_key()) == 0) {
|
key.user_key()) == 0) {
|
||||||
// Correct user key
|
// Correct user key
|
||||||
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
KeyMetaData meta;
|
||||||
switch (static_cast<ValueType>(tag & 0xff)) {
|
DecodeKeyMetaData(entry, meta);
|
||||||
case kTypeValue: {
|
|
||||||
|
switch (meta.m_Type) {
|
||||||
|
case kTypeValueWriteTime:
|
||||||
|
case kTypeValueExplicitExpiry:
|
||||||
|
{
|
||||||
|
bool expired=false;
|
||||||
|
if (NULL!=options && options->ExpiryActivated())
|
||||||
|
expired=options->expiry_module->MemTableCallback(internal_key);
|
||||||
|
if (expired)
|
||||||
|
{
|
||||||
|
// like kTypeDeletion
|
||||||
|
*s = Status::NotFound(Slice());
|
||||||
|
ret_flag=true;
|
||||||
|
break;
|
||||||
|
} // if
|
||||||
|
//otherwise fall into kTypeValue code
|
||||||
|
} // case
|
||||||
|
|
||||||
|
case kTypeValue:
|
||||||
|
{
|
||||||
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
||||||
value->assign(v.data(), v.size());
|
value->assign(v.data(), v.size());
|
||||||
return true;
|
ret_flag=true;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
case kTypeDeletion:
|
case kTypeDeletion:
|
||||||
*s = Status::NotFound(Slice());
|
*s = Status::NotFound(Slice());
|
||||||
return true;
|
ret_flag=true;
|
||||||
|
break;
|
||||||
|
} // switch
|
||||||
|
|
||||||
|
// only unpack metadata if requested
|
||||||
|
if (key.WantsKeyMetaData())
|
||||||
|
key.SetKeyMetaData(meta);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
return ret_flag;
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// this is a static function
|
||||||
|
void MemTable::DecodeKeyMetaData(
|
||||||
|
const char * key,
|
||||||
|
KeyMetaData & meta)
|
||||||
|
{
|
||||||
|
Slice key_slice = GetLengthPrefixedSlice(key);
|
||||||
|
|
||||||
|
meta.m_Type=ExtractValueType(key_slice);
|
||||||
|
meta.m_Sequence=ExtractSequenceNumber(key_slice);
|
||||||
|
if (IsExpiryKey(meta.m_Type))
|
||||||
|
meta.m_Expiry=ExtractExpiry(key_slice);
|
||||||
|
else
|
||||||
|
meta.m_Expiry=0;
|
||||||
|
|
||||||
|
} // DecodeKeyMetaData
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
|
@ -24,10 +24,10 @@ class MemTable {
|
||||||
explicit MemTable(const InternalKeyComparator& comparator);
|
explicit MemTable(const InternalKeyComparator& comparator);
|
||||||
|
|
||||||
// Increase reference count.
|
// Increase reference count.
|
||||||
void Ref() { ++refs_; }
|
void Ref() volatile { ++refs_; }
|
||||||
|
|
||||||
// Drop reference count. Delete if no more references exist.
|
// Drop reference count. Delete if no more references exist.
|
||||||
void Unref() {
|
void Unref() volatile {
|
||||||
--refs_;
|
--refs_;
|
||||||
assert(refs_ >= 0);
|
assert(refs_ >= 0);
|
||||||
if (refs_ <= 0) {
|
if (refs_ <= 0) {
|
||||||
|
@ -36,7 +36,10 @@ class MemTable {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns an estimate of the number of bytes of data in use by this
|
// Returns an estimate of the number of bytes of data in use by this
|
||||||
// data structure. It is safe to call when MemTable is being modified.
|
// data structure.
|
||||||
|
//
|
||||||
|
// REQUIRES: external synchronization to prevent simultaneous
|
||||||
|
// operations on the same MemTable.
|
||||||
size_t ApproximateMemoryUsage();
|
size_t ApproximateMemoryUsage();
|
||||||
|
|
||||||
// Return an iterator that yields the contents of the memtable.
|
// Return an iterator that yields the contents of the memtable.
|
||||||
|
@ -52,13 +55,17 @@ class MemTable {
|
||||||
// Typically value will be empty if type==kTypeDeletion.
|
// Typically value will be empty if type==kTypeDeletion.
|
||||||
void Add(SequenceNumber seq, ValueType type,
|
void Add(SequenceNumber seq, ValueType type,
|
||||||
const Slice& key,
|
const Slice& key,
|
||||||
const Slice& value);
|
const Slice& value,
|
||||||
|
const ExpiryTimeMicros& expiry=0);
|
||||||
|
|
||||||
// If memtable contains a value for key, store it in *value and return true.
|
// If memtable contains a value for key, store it in *value and return true.
|
||||||
// If memtable contains a deletion for key, store a NotFound() error
|
// If memtable contains a deletion for key, store a NotFound() error
|
||||||
// in *status and return true.
|
// in *status and return true.
|
||||||
// Else, return false.
|
// Else, return false.
|
||||||
bool Get(const LookupKey& key, std::string* value, Status* s);
|
bool Get(const LookupKey& key, Value* value, Status* s, const Options * options);
|
||||||
|
|
||||||
|
// parse keymetadata from skiplist key string
|
||||||
|
static void DecodeKeyMetaData(const char * key, KeyMetaData & meta);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
~MemTable(); // Private since only Unref() should be used to delete it
|
~MemTable(); // Private since only Unref() should be used to delete it
|
||||||
|
@ -69,7 +76,7 @@ class MemTable {
|
||||||
int operator()(const char* a, const char* b) const;
|
int operator()(const char* a, const char* b) const;
|
||||||
};
|
};
|
||||||
friend class MemTableIterator;
|
friend class MemTableIterator;
|
||||||
friend class MemTableBackwardIterator;
|
friend class MemTableBackwardIterator; // does not exist
|
||||||
|
|
||||||
typedef SkipList<const char*, KeyComparator> Table;
|
typedef SkipList<const char*, KeyComparator> Table;
|
||||||
|
|
||||||
|
|
248
src/leveldb/db/penalty_test.cc
Normal file
248
src/leveldb/db/penalty_test.cc
Normal file
|
@ -0,0 +1,248 @@
|
||||||
|
// -------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// penalty_test.cc
|
||||||
|
//
|
||||||
|
// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// This file is provided to you under the Apache License,
|
||||||
|
// Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain
|
||||||
|
// a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing,
|
||||||
|
// software distributed under the License is distributed on an
|
||||||
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
// KIND, either express or implied. See the License for the
|
||||||
|
// specific language governing permissions and limitations
|
||||||
|
// under the License.
|
||||||
|
//
|
||||||
|
// -------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
#include "util/testharness.h"
|
||||||
|
#include "util/testutil.h"
|
||||||
|
|
||||||
|
#include "leveldb/comparator.h"
|
||||||
|
|
||||||
|
#include "db/version_set.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Execution routine
|
||||||
|
*/
|
||||||
|
int main(int argc, char** argv)
|
||||||
|
{
|
||||||
|
return leveldb::test::RunAllTests();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
namespace leveldb {
|
||||||
|
|
||||||
|
class TestVersion : public Version
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
TestVersion()
|
||||||
|
: Version(NULL)
|
||||||
|
{
|
||||||
|
int loop;
|
||||||
|
|
||||||
|
for (loop=0; loop<config::kNumLevels; ++loop)
|
||||||
|
{
|
||||||
|
m_FalseFile[loop].file_size=0;
|
||||||
|
m_LevelFileCount[loop]=0;
|
||||||
|
} // for
|
||||||
|
};
|
||||||
|
|
||||||
|
virtual size_t NumFiles(int level) const {return(m_LevelFileCount[level]);};
|
||||||
|
|
||||||
|
virtual const std::vector<FileMetaData*> & GetFileList(int level) const
|
||||||
|
{
|
||||||
|
m_FalseVector.clear();
|
||||||
|
m_FalseVector.push_back(&m_FalseFile[level]);
|
||||||
|
return(m_FalseVector);
|
||||||
|
};
|
||||||
|
|
||||||
|
mutable std::vector<FileMetaData*> m_FalseVector;
|
||||||
|
mutable FileMetaData m_FalseFile[config::kNumLevels];
|
||||||
|
|
||||||
|
size_t m_LevelFileCount[config::kNumLevels];
|
||||||
|
|
||||||
|
}; // class TestVersion
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wrapper class for tests. Holds working variables
|
||||||
|
* and helper functions.
|
||||||
|
*/
|
||||||
|
class PenaltyTester : public VersionSet
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
PenaltyTester()
|
||||||
|
: m_IntCompare(m_Options.comparator), VersionSet("", &m_Options, NULL, &m_IntCompare)
|
||||||
|
{
|
||||||
|
};
|
||||||
|
|
||||||
|
~PenaltyTester()
|
||||||
|
{
|
||||||
|
};
|
||||||
|
|
||||||
|
Options m_Options;
|
||||||
|
InternalKeyComparator m_IntCompare;
|
||||||
|
|
||||||
|
}; // class PenaltyTester
|
||||||
|
|
||||||
|
|
||||||
|
/*******************
|
||||||
|
* Form note:
|
||||||
|
* using ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
* instead of ASSERT_EQ / ASSERT_NE because WritePenalty
|
||||||
|
* returns a volatile int, which older compilers believe is
|
||||||
|
* not an equivalent type to a constant. RedHat 5, Solaris,
|
||||||
|
* and SmartOS were giving grief.
|
||||||
|
*******************/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Debug 1
|
||||||
|
*/
|
||||||
|
#if 0
|
||||||
|
TEST(PenaltyTester, Debug1)
|
||||||
|
{
|
||||||
|
TestVersion version;
|
||||||
|
int penalty;
|
||||||
|
|
||||||
|
m_Options.write_buffer_size=46416847;
|
||||||
|
|
||||||
|
version.m_FalseFile[2].file_size=1075676398;
|
||||||
|
version.m_LevelFileCount[1]=1;
|
||||||
|
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
} // test Debug1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* No penalty scenarios
|
||||||
|
*/
|
||||||
|
TEST(PenaltyTester, NoPenalty)
|
||||||
|
{
|
||||||
|
TestVersion version;
|
||||||
|
int level;
|
||||||
|
|
||||||
|
m_Options.write_buffer_size=46416847;
|
||||||
|
|
||||||
|
// nothing
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Level 0
|
||||||
|
* (overlapped level, penalty is count based)
|
||||||
|
*/
|
||||||
|
// no penalty
|
||||||
|
version.m_LevelFileCount[0]=config::kL0_CompactionTrigger;
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger;
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
#if 0 // needs rewrite to be time based
|
||||||
|
// threshold reached ... some penalty
|
||||||
|
version.m_LevelFileCount[0]=config::kL0_SlowdownWritesTrigger+1;
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0!=version.WritePenalty());
|
||||||
|
|
||||||
|
// clean up
|
||||||
|
version.m_LevelFileCount[0]=0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Level 1
|
||||||
|
* (overlapped level, penalty is count based)
|
||||||
|
*/
|
||||||
|
// no penalty
|
||||||
|
version.m_LevelFileCount[1]=config::kL0_CompactionTrigger;
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger;
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
// threshold reached ... some penalty
|
||||||
|
version.m_LevelFileCount[1]=config::kL0_SlowdownWritesTrigger+1;
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0!=version.WritePenalty());
|
||||||
|
|
||||||
|
// clean up
|
||||||
|
version.m_LevelFileCount[1]=0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Level 2
|
||||||
|
* (landing level, penalty size based)
|
||||||
|
*/
|
||||||
|
// no penalty
|
||||||
|
version.m_FalseFile[2].file_size=0;
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
version.m_FalseFile[2].file_size=VersionSet::DesiredBytesForLevel(2);
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1;
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2);
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0!=version.WritePenalty());
|
||||||
|
|
||||||
|
// interaction rule with level 1
|
||||||
|
version.m_FalseFile[2].file_size=VersionSet::MaxBytesForLevel(2)-1;
|
||||||
|
version.m_LevelFileCount[1]=config::kL0_CompactionTrigger/2;
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0!=version.WritePenalty());
|
||||||
|
|
||||||
|
// clean up
|
||||||
|
version.m_LevelFileCount[1]=0;
|
||||||
|
version.m_FalseFile[2].file_size=0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Level 3+
|
||||||
|
* (landing level, penalty size based)
|
||||||
|
*/
|
||||||
|
for (level=3; level<config::kNumLevels; ++level)
|
||||||
|
{
|
||||||
|
// no penalty
|
||||||
|
version.m_FalseFile[level].file_size=0;
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
version.m_FalseFile[level].file_size=VersionSet::DesiredBytesForLevel(level);
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
version.m_FalseFile[level].file_size=VersionSet::MaxBytesForLevel(level)-1;
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
version.m_FalseFile[level].file_size=VersionSet::MaxBytesForLevel(level);
|
||||||
|
UpdatePenalty(&version);
|
||||||
|
if ((config::kNumLevels-1)!=level)
|
||||||
|
ASSERT_TRUE(0!=version.WritePenalty());
|
||||||
|
else
|
||||||
|
ASSERT_TRUE(0==version.WritePenalty());
|
||||||
|
|
||||||
|
// clean up
|
||||||
|
version.m_FalseFile[level].file_size=0;
|
||||||
|
} // for
|
||||||
|
#endif
|
||||||
|
} // test NoPenalty
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
} // namespace leveldb
|
|
@ -1,324 +0,0 @@
|
||||||
// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
||||||
|
|
||||||
#include "db/db_impl.h"
|
|
||||||
#include "db/filename.h"
|
|
||||||
#include "db/version_set.h"
|
|
||||||
#include "db/write_batch_internal.h"
|
|
||||||
#include "leveldb/db.h"
|
|
||||||
#include "leveldb/env.h"
|
|
||||||
#include "leveldb/write_batch.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/testharness.h"
|
|
||||||
#include "util/testutil.h"
|
|
||||||
|
|
||||||
namespace leveldb {
|
|
||||||
|
|
||||||
class RecoveryTest {
|
|
||||||
public:
|
|
||||||
RecoveryTest() : env_(Env::Default()), db_(NULL) {
|
|
||||||
dbname_ = test::TmpDir() + "/recovery_test";
|
|
||||||
DestroyDB(dbname_, Options());
|
|
||||||
Open();
|
|
||||||
}
|
|
||||||
|
|
||||||
~RecoveryTest() {
|
|
||||||
Close();
|
|
||||||
DestroyDB(dbname_, Options());
|
|
||||||
}
|
|
||||||
|
|
||||||
DBImpl* dbfull() const { return reinterpret_cast<DBImpl*>(db_); }
|
|
||||||
Env* env() const { return env_; }
|
|
||||||
|
|
||||||
bool CanAppend() {
|
|
||||||
WritableFile* tmp;
|
|
||||||
Status s = env_->NewAppendableFile(CurrentFileName(dbname_), &tmp);
|
|
||||||
delete tmp;
|
|
||||||
if (s.IsNotSupportedError()) {
|
|
||||||
return false;
|
|
||||||
} else {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void Close() {
|
|
||||||
delete db_;
|
|
||||||
db_ = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Open(Options* options = NULL) {
|
|
||||||
Close();
|
|
||||||
Options opts;
|
|
||||||
if (options != NULL) {
|
|
||||||
opts = *options;
|
|
||||||
} else {
|
|
||||||
opts.reuse_logs = true; // TODO(sanjay): test both ways
|
|
||||||
opts.create_if_missing = true;
|
|
||||||
}
|
|
||||||
if (opts.env == NULL) {
|
|
||||||
opts.env = env_;
|
|
||||||
}
|
|
||||||
ASSERT_OK(DB::Open(opts, dbname_, &db_));
|
|
||||||
ASSERT_EQ(1, NumLogs());
|
|
||||||
}
|
|
||||||
|
|
||||||
Status Put(const std::string& k, const std::string& v) {
|
|
||||||
return db_->Put(WriteOptions(), k, v);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
|
|
||||||
std::string result;
|
|
||||||
Status s = db_->Get(ReadOptions(), k, &result);
|
|
||||||
if (s.IsNotFound()) {
|
|
||||||
result = "NOT_FOUND";
|
|
||||||
} else if (!s.ok()) {
|
|
||||||
result = s.ToString();
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string ManifestFileName() {
|
|
||||||
std::string current;
|
|
||||||
ASSERT_OK(ReadFileToString(env_, CurrentFileName(dbname_), &current));
|
|
||||||
size_t len = current.size();
|
|
||||||
if (len > 0 && current[len-1] == '\n') {
|
|
||||||
current.resize(len - 1);
|
|
||||||
}
|
|
||||||
return dbname_ + "/" + current;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string LogName(uint64_t number) {
|
|
||||||
return LogFileName(dbname_, number);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t DeleteLogFiles() {
|
|
||||||
std::vector<uint64_t> logs = GetFiles(kLogFile);
|
|
||||||
for (size_t i = 0; i < logs.size(); i++) {
|
|
||||||
ASSERT_OK(env_->DeleteFile(LogName(logs[i]))) << LogName(logs[i]);
|
|
||||||
}
|
|
||||||
return logs.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t FirstLogFile() {
|
|
||||||
return GetFiles(kLogFile)[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<uint64_t> GetFiles(FileType t) {
|
|
||||||
std::vector<std::string> filenames;
|
|
||||||
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
|
|
||||||
std::vector<uint64_t> result;
|
|
||||||
for (size_t i = 0; i < filenames.size(); i++) {
|
|
||||||
uint64_t number;
|
|
||||||
FileType type;
|
|
||||||
if (ParseFileName(filenames[i], &number, &type) && type == t) {
|
|
||||||
result.push_back(number);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
int NumLogs() {
|
|
||||||
return GetFiles(kLogFile).size();
|
|
||||||
}
|
|
||||||
|
|
||||||
int NumTables() {
|
|
||||||
return GetFiles(kTableFile).size();
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t FileSize(const std::string& fname) {
|
|
||||||
uint64_t result;
|
|
||||||
ASSERT_OK(env_->GetFileSize(fname, &result)) << fname;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
void CompactMemTable() {
|
|
||||||
dbfull()->TEST_CompactMemTable();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Directly construct a log file that sets key to val.
|
|
||||||
void MakeLogFile(uint64_t lognum, SequenceNumber seq, Slice key, Slice val) {
|
|
||||||
std::string fname = LogFileName(dbname_, lognum);
|
|
||||||
WritableFile* file;
|
|
||||||
ASSERT_OK(env_->NewWritableFile(fname, &file));
|
|
||||||
log::Writer writer(file);
|
|
||||||
WriteBatch batch;
|
|
||||||
batch.Put(key, val);
|
|
||||||
WriteBatchInternal::SetSequence(&batch, seq);
|
|
||||||
ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch)));
|
|
||||||
ASSERT_OK(file->Flush());
|
|
||||||
delete file;
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::string dbname_;
|
|
||||||
Env* env_;
|
|
||||||
DB* db_;
|
|
||||||
};
|
|
||||||
|
|
||||||
TEST(RecoveryTest, ManifestReused) {
|
|
||||||
if (!CanAppend()) {
|
|
||||||
fprintf(stderr, "skipping test because env does not support appending\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
ASSERT_OK(Put("foo", "bar"));
|
|
||||||
Close();
|
|
||||||
std::string old_manifest = ManifestFileName();
|
|
||||||
Open();
|
|
||||||
ASSERT_EQ(old_manifest, ManifestFileName());
|
|
||||||
ASSERT_EQ("bar", Get("foo"));
|
|
||||||
Open();
|
|
||||||
ASSERT_EQ(old_manifest, ManifestFileName());
|
|
||||||
ASSERT_EQ("bar", Get("foo"));
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(RecoveryTest, LargeManifestCompacted) {
|
|
||||||
if (!CanAppend()) {
|
|
||||||
fprintf(stderr, "skipping test because env does not support appending\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
ASSERT_OK(Put("foo", "bar"));
|
|
||||||
Close();
|
|
||||||
std::string old_manifest = ManifestFileName();
|
|
||||||
|
|
||||||
// Pad with zeroes to make manifest file very big.
|
|
||||||
{
|
|
||||||
uint64_t len = FileSize(old_manifest);
|
|
||||||
WritableFile* file;
|
|
||||||
ASSERT_OK(env()->NewAppendableFile(old_manifest, &file));
|
|
||||||
std::string zeroes(3*1048576 - static_cast<size_t>(len), 0);
|
|
||||||
ASSERT_OK(file->Append(zeroes));
|
|
||||||
ASSERT_OK(file->Flush());
|
|
||||||
delete file;
|
|
||||||
}
|
|
||||||
|
|
||||||
Open();
|
|
||||||
std::string new_manifest = ManifestFileName();
|
|
||||||
ASSERT_NE(old_manifest, new_manifest);
|
|
||||||
ASSERT_GT(10000, FileSize(new_manifest));
|
|
||||||
ASSERT_EQ("bar", Get("foo"));
|
|
||||||
|
|
||||||
Open();
|
|
||||||
ASSERT_EQ(new_manifest, ManifestFileName());
|
|
||||||
ASSERT_EQ("bar", Get("foo"));
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(RecoveryTest, NoLogFiles) {
|
|
||||||
ASSERT_OK(Put("foo", "bar"));
|
|
||||||
ASSERT_EQ(1, DeleteLogFiles());
|
|
||||||
Open();
|
|
||||||
ASSERT_EQ("NOT_FOUND", Get("foo"));
|
|
||||||
Open();
|
|
||||||
ASSERT_EQ("NOT_FOUND", Get("foo"));
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(RecoveryTest, LogFileReuse) {
|
|
||||||
if (!CanAppend()) {
|
|
||||||
fprintf(stderr, "skipping test because env does not support appending\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (int i = 0; i < 2; i++) {
|
|
||||||
ASSERT_OK(Put("foo", "bar"));
|
|
||||||
if (i == 0) {
|
|
||||||
// Compact to ensure current log is empty
|
|
||||||
CompactMemTable();
|
|
||||||
}
|
|
||||||
Close();
|
|
||||||
ASSERT_EQ(1, NumLogs());
|
|
||||||
uint64_t number = FirstLogFile();
|
|
||||||
if (i == 0) {
|
|
||||||
ASSERT_EQ(0, FileSize(LogName(number)));
|
|
||||||
} else {
|
|
||||||
ASSERT_LT(0, FileSize(LogName(number)));
|
|
||||||
}
|
|
||||||
Open();
|
|
||||||
ASSERT_EQ(1, NumLogs());
|
|
||||||
ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
|
|
||||||
ASSERT_EQ("bar", Get("foo"));
|
|
||||||
Open();
|
|
||||||
ASSERT_EQ(1, NumLogs());
|
|
||||||
ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
|
|
||||||
ASSERT_EQ("bar", Get("foo"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(RecoveryTest, MultipleMemTables) {
|
|
||||||
// Make a large log.
|
|
||||||
const int kNum = 1000;
|
|
||||||
for (int i = 0; i < kNum; i++) {
|
|
||||||
char buf[100];
|
|
||||||
snprintf(buf, sizeof(buf), "%050d", i);
|
|
||||||
ASSERT_OK(Put(buf, buf));
|
|
||||||
}
|
|
||||||
ASSERT_EQ(0, NumTables());
|
|
||||||
Close();
|
|
||||||
ASSERT_EQ(0, NumTables());
|
|
||||||
ASSERT_EQ(1, NumLogs());
|
|
||||||
uint64_t old_log_file = FirstLogFile();
|
|
||||||
|
|
||||||
// Force creation of multiple memtables by reducing the write buffer size.
|
|
||||||
Options opt;
|
|
||||||
opt.reuse_logs = true;
|
|
||||||
opt.write_buffer_size = (kNum*100) / 2;
|
|
||||||
Open(&opt);
|
|
||||||
ASSERT_LE(2, NumTables());
|
|
||||||
ASSERT_EQ(1, NumLogs());
|
|
||||||
ASSERT_NE(old_log_file, FirstLogFile()) << "must not reuse log";
|
|
||||||
for (int i = 0; i < kNum; i++) {
|
|
||||||
char buf[100];
|
|
||||||
snprintf(buf, sizeof(buf), "%050d", i);
|
|
||||||
ASSERT_EQ(buf, Get(buf));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(RecoveryTest, MultipleLogFiles) {
|
|
||||||
ASSERT_OK(Put("foo", "bar"));
|
|
||||||
Close();
|
|
||||||
ASSERT_EQ(1, NumLogs());
|
|
||||||
|
|
||||||
// Make a bunch of uncompacted log files.
|
|
||||||
uint64_t old_log = FirstLogFile();
|
|
||||||
MakeLogFile(old_log+1, 1000, "hello", "world");
|
|
||||||
MakeLogFile(old_log+2, 1001, "hi", "there");
|
|
||||||
MakeLogFile(old_log+3, 1002, "foo", "bar2");
|
|
||||||
|
|
||||||
// Recover and check that all log files were processed.
|
|
||||||
Open();
|
|
||||||
ASSERT_LE(1, NumTables());
|
|
||||||
ASSERT_EQ(1, NumLogs());
|
|
||||||
uint64_t new_log = FirstLogFile();
|
|
||||||
ASSERT_LE(old_log+3, new_log);
|
|
||||||
ASSERT_EQ("bar2", Get("foo"));
|
|
||||||
ASSERT_EQ("world", Get("hello"));
|
|
||||||
ASSERT_EQ("there", Get("hi"));
|
|
||||||
|
|
||||||
// Test that previous recovery produced recoverable state.
|
|
||||||
Open();
|
|
||||||
ASSERT_LE(1, NumTables());
|
|
||||||
ASSERT_EQ(1, NumLogs());
|
|
||||||
if (CanAppend()) {
|
|
||||||
ASSERT_EQ(new_log, FirstLogFile());
|
|
||||||
}
|
|
||||||
ASSERT_EQ("bar2", Get("foo"));
|
|
||||||
ASSERT_EQ("world", Get("hello"));
|
|
||||||
ASSERT_EQ("there", Get("hi"));
|
|
||||||
|
|
||||||
// Check that introducing an older log file does not cause it to be re-read.
|
|
||||||
Close();
|
|
||||||
MakeLogFile(old_log+1, 2000, "hello", "stale write");
|
|
||||||
Open();
|
|
||||||
ASSERT_LE(1, NumTables());
|
|
||||||
ASSERT_EQ(1, NumLogs());
|
|
||||||
if (CanAppend()) {
|
|
||||||
ASSERT_EQ(new_log, FirstLogFile());
|
|
||||||
}
|
|
||||||
ASSERT_EQ("bar2", Get("foo"));
|
|
||||||
ASSERT_EQ("world", Get("hello"));
|
|
||||||
ASSERT_EQ("there", Get("hi"));
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace leveldb
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
return leveldb::test::RunAllTests();
|
|
||||||
}
|
|
|
@ -45,30 +45,56 @@ namespace {
|
||||||
class Repairer {
|
class Repairer {
|
||||||
public:
|
public:
|
||||||
Repairer(const std::string& dbname, const Options& options)
|
Repairer(const std::string& dbname, const Options& options)
|
||||||
: dbname_(dbname),
|
: double_cache_(options),
|
||||||
|
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options, double_cache_.GetBlockCache())),
|
||||||
|
org_options_(options),
|
||||||
|
dbname_(options_.tiered_fast_prefix),
|
||||||
|
org_dbname_(dbname),
|
||||||
env_(options.env),
|
env_(options.env),
|
||||||
icmp_(options.comparator),
|
icmp_(options.comparator),
|
||||||
ipolicy_(options.filter_policy),
|
ipolicy_(options.filter_policy),
|
||||||
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
|
|
||||||
owns_info_log_(options_.info_log != options.info_log),
|
owns_info_log_(options_.info_log != options.info_log),
|
||||||
owns_cache_(options_.block_cache != options.block_cache),
|
db_lock_(NULL),
|
||||||
next_file_number_(1) {
|
next_file_number_(1)
|
||||||
|
{
|
||||||
// TableCache can be small since we expect each table to be opened once.
|
// TableCache can be small since we expect each table to be opened once.
|
||||||
table_cache_ = new TableCache(dbname_, &options_, 10);
|
table_cache_ = new TableCache(dbname_, &options_, double_cache_.GetFileCache(), double_cache_);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
~Repairer() {
|
~Repairer() {
|
||||||
delete table_cache_;
|
|
||||||
if (owns_info_log_) {
|
if (owns_info_log_) {
|
||||||
delete options_.info_log;
|
delete options_.info_log;
|
||||||
}
|
}
|
||||||
if (owns_cache_) {
|
// if (owns_cache_) {
|
||||||
delete options_.block_cache;
|
// delete options_.block_cache;
|
||||||
}
|
// }
|
||||||
|
|
||||||
|
// must remove second ref counter that keeps overlapped files locked
|
||||||
|
// table cache
|
||||||
|
bool is_overlap;
|
||||||
|
for (int level = 0; level < config::kNumLevels; level++) {
|
||||||
|
{
|
||||||
|
is_overlap=(level < leveldb::config::kNumOverlapLevels);
|
||||||
|
for (size_t i = 0; i < table_numbers_[level].size(); i++) {
|
||||||
|
table_cache_->Evict(table_numbers_[level][i], is_overlap);
|
||||||
|
} // for
|
||||||
|
} // if
|
||||||
|
} // for
|
||||||
|
|
||||||
|
delete table_cache_;
|
||||||
}
|
}
|
||||||
|
|
||||||
Status Run() {
|
Status Run() {
|
||||||
Status status = FindFiles();
|
Status status;
|
||||||
|
|
||||||
|
status = env_->LockFile(LockFileName(dbname_), &db_lock_);
|
||||||
|
|
||||||
|
if (status.ok())
|
||||||
|
status = MakeLevelDirectories(env_, options_);
|
||||||
|
|
||||||
|
if (status.ok()) {
|
||||||
|
status = FindFiles();
|
||||||
if (status.ok()) {
|
if (status.ok()) {
|
||||||
ConvertLogFilesToTables();
|
ConvertLogFilesToTables();
|
||||||
ExtractMetaData();
|
ExtractMetaData();
|
||||||
|
@ -76,18 +102,56 @@ class Repairer {
|
||||||
}
|
}
|
||||||
if (status.ok()) {
|
if (status.ok()) {
|
||||||
unsigned long long bytes = 0;
|
unsigned long long bytes = 0;
|
||||||
for (size_t i = 0; i < tables_.size(); i++) {
|
unsigned long long files = 0;
|
||||||
bytes += tables_[i].meta.file_size;
|
|
||||||
|
// calculate size for log information
|
||||||
|
for (int level=0; level<config::kNumLevels;++level)
|
||||||
|
{
|
||||||
|
std::vector<TableInfo> * table_ptr;
|
||||||
|
std::vector<TableInfo>::const_iterator i;
|
||||||
|
|
||||||
|
table_ptr=&tables_[level];
|
||||||
|
files+=table_ptr->size();
|
||||||
|
|
||||||
|
for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
|
||||||
|
bytes += i->meta.file_size;
|
||||||
}
|
}
|
||||||
|
} // for
|
||||||
|
|
||||||
Log(options_.info_log,
|
Log(options_.info_log,
|
||||||
"**** Repaired leveldb %s; "
|
"**** Repaired leveldb %s; "
|
||||||
"recovered %d files; %llu bytes. "
|
"recovered %d files; %llu bytes. "
|
||||||
"Some data may have been lost. "
|
"Some data may have been lost. "
|
||||||
"****",
|
"****",
|
||||||
dbname_.c_str(),
|
dbname_.c_str(),
|
||||||
static_cast<int>(tables_.size()),
|
static_cast<int>(files),
|
||||||
bytes);
|
bytes);
|
||||||
}
|
}
|
||||||
|
if (db_lock_ != NULL) {
|
||||||
|
env_->UnlockFile(db_lock_);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// perform Riak specific scan for overlapping .sst files
|
||||||
|
// within a level
|
||||||
|
if (status.ok())
|
||||||
|
{
|
||||||
|
leveldb::DB * db_ptr;
|
||||||
|
Options options;
|
||||||
|
|
||||||
|
db_ptr=NULL;
|
||||||
|
options=org_options_;
|
||||||
|
// options.block_cache=NULL; // not reusing for fear of edge cases
|
||||||
|
options.is_repair=true;
|
||||||
|
options.error_if_exists=false;
|
||||||
|
status=leveldb::DB::Open(options, org_dbname_, &db_ptr);
|
||||||
|
|
||||||
|
if (status.ok())
|
||||||
|
status=db_ptr->VerifyLevels();
|
||||||
|
|
||||||
|
delete db_ptr;
|
||||||
|
|
||||||
|
} // if
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,34 +161,36 @@ class Repairer {
|
||||||
SequenceNumber max_sequence;
|
SequenceNumber max_sequence;
|
||||||
};
|
};
|
||||||
|
|
||||||
std::string const dbname_;
|
DoubleCache double_cache_;
|
||||||
|
Options const options_, org_options_;
|
||||||
|
std::string const dbname_, org_dbname_;
|
||||||
Env* const env_;
|
Env* const env_;
|
||||||
InternalKeyComparator const icmp_;
|
InternalKeyComparator const icmp_;
|
||||||
InternalFilterPolicy const ipolicy_;
|
InternalFilterPolicy const ipolicy_;
|
||||||
Options const options_;
|
|
||||||
bool owns_info_log_;
|
bool owns_info_log_;
|
||||||
bool owns_cache_;
|
FileLock* db_lock_;
|
||||||
TableCache* table_cache_;
|
TableCache* table_cache_;
|
||||||
VersionEdit edit_;
|
VersionEdit edit_;
|
||||||
|
|
||||||
std::vector<std::string> manifests_;
|
std::vector<std::string> manifests_;
|
||||||
std::vector<uint64_t> table_numbers_;
|
std::vector<uint64_t> table_numbers_[config::kNumLevels];
|
||||||
std::vector<uint64_t> logs_;
|
std::vector<uint64_t> logs_;
|
||||||
std::vector<TableInfo> tables_;
|
std::vector<TableInfo> tables_[config::kNumLevels];
|
||||||
uint64_t next_file_number_;
|
uint64_t next_file_number_;
|
||||||
|
|
||||||
Status FindFiles() {
|
Status FindFiles()
|
||||||
|
{
|
||||||
std::vector<std::string> filenames;
|
std::vector<std::string> filenames;
|
||||||
|
uint64_t number;
|
||||||
|
FileType type;
|
||||||
|
int level;
|
||||||
|
|
||||||
|
// base directory
|
||||||
Status status = env_->GetChildren(dbname_, &filenames);
|
Status status = env_->GetChildren(dbname_, &filenames);
|
||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
if (filenames.empty()) {
|
|
||||||
return Status::IOError(dbname_, "repair found no files");
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t number;
|
|
||||||
FileType type;
|
|
||||||
for (size_t i = 0; i < filenames.size(); i++) {
|
for (size_t i = 0; i < filenames.size(); i++) {
|
||||||
if (ParseFileName(filenames[i], &number, &type)) {
|
if (ParseFileName(filenames[i], &number, &type)) {
|
||||||
if (type == kDescriptorFile) {
|
if (type == kDescriptorFile) {
|
||||||
|
@ -136,13 +202,38 @@ class Repairer {
|
||||||
if (type == kLogFile) {
|
if (type == kLogFile) {
|
||||||
logs_.push_back(number);
|
logs_.push_back(number);
|
||||||
} else if (type == kTableFile) {
|
} else if (type == kTableFile) {
|
||||||
table_numbers_.push_back(number);
|
table_numbers_[0].push_back(number);
|
||||||
} else {
|
} else {
|
||||||
// Ignore other files
|
// Ignore other files
|
||||||
|
} // else
|
||||||
|
} // else
|
||||||
|
} // if
|
||||||
|
} // for
|
||||||
|
|
||||||
|
for (level=0; level < config::kNumLevels; ++level)
|
||||||
|
{
|
||||||
|
std::string dirname;
|
||||||
|
|
||||||
|
filenames.clear();
|
||||||
|
dirname=MakeDirName2(options_, level, "sst");
|
||||||
|
Status status = env_->GetChildren(dirname, &filenames);
|
||||||
|
if (!status.ok()) {
|
||||||
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < filenames.size(); i++) {
|
||||||
|
if (ParseFileName(filenames[i], &number, &type)) {
|
||||||
|
if (number + 1 > next_file_number_) {
|
||||||
|
next_file_number_ = number + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (type == kTableFile) {
|
||||||
|
table_numbers_[level].push_back(number);
|
||||||
}
|
}
|
||||||
}
|
} // if
|
||||||
|
} // for
|
||||||
|
} // for
|
||||||
|
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -186,7 +277,7 @@ class Repairer {
|
||||||
reporter.env = env_;
|
reporter.env = env_;
|
||||||
reporter.info_log = options_.info_log;
|
reporter.info_log = options_.info_log;
|
||||||
reporter.lognum = log;
|
reporter.lognum = log;
|
||||||
// We intentionally make log::Reader do checksumming so that
|
// We intentially make log::Reader do checksumming so that
|
||||||
// corruptions cause entire commits to be skipped instead of
|
// corruptions cause entire commits to be skipped instead of
|
||||||
// propagating bad information (like overly large sequence
|
// propagating bad information (like overly large sequence
|
||||||
// numbers).
|
// numbers).
|
||||||
|
@ -203,11 +294,11 @@ class Repairer {
|
||||||
while (reader.ReadRecord(&record, &scratch)) {
|
while (reader.ReadRecord(&record, &scratch)) {
|
||||||
if (record.size() < 12) {
|
if (record.size() < 12) {
|
||||||
reporter.Corruption(
|
reporter.Corruption(
|
||||||
record.size(), Status::Corruption("log record too small", logname));
|
record.size(), Status::Corruption("log record too small"));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
WriteBatchInternal::SetContents(&batch, record);
|
WriteBatchInternal::SetContents(&batch, record);
|
||||||
status = WriteBatchInternal::InsertInto(&batch, mem);
|
status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
|
||||||
if (status.ok()) {
|
if (status.ok()) {
|
||||||
counter += WriteBatchInternal::Count(&batch);
|
counter += WriteBatchInternal::Count(&batch);
|
||||||
} else {
|
} else {
|
||||||
|
@ -223,14 +314,15 @@ class Repairer {
|
||||||
// since ExtractMetaData() will also generate edits.
|
// since ExtractMetaData() will also generate edits.
|
||||||
FileMetaData meta;
|
FileMetaData meta;
|
||||||
meta.number = next_file_number_++;
|
meta.number = next_file_number_++;
|
||||||
|
meta.level = 0;
|
||||||
Iterator* iter = mem->NewIterator();
|
Iterator* iter = mem->NewIterator();
|
||||||
status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
|
status = BuildTable(dbname_, env_, options_, icmp_.user_comparator(), table_cache_, iter, &meta, 0);
|
||||||
delete iter;
|
delete iter;
|
||||||
mem->Unref();
|
mem->Unref();
|
||||||
mem = NULL;
|
mem = NULL;
|
||||||
if (status.ok()) {
|
if (status.ok()) {
|
||||||
if (meta.file_size > 0) {
|
if (meta.file_size > 0) {
|
||||||
table_numbers_.push_back(meta.number);
|
table_numbers_[0].push_back(meta.number);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
|
Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
|
||||||
|
@ -242,52 +334,48 @@ class Repairer {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ExtractMetaData() {
|
void ExtractMetaData() {
|
||||||
for (size_t i = 0; i < table_numbers_.size(); i++) {
|
for (int level=0; level < config::kNumLevels; ++level)
|
||||||
ScanTable(table_numbers_[i]);
|
{
|
||||||
}
|
std::vector<uint64_t> * number_ptr;
|
||||||
}
|
std::vector<uint64_t>::const_iterator i;
|
||||||
|
|
||||||
Iterator* NewTableIterator(const FileMetaData& meta) {
|
number_ptr=&table_numbers_[level];
|
||||||
// Same as compaction iterators: if paranoid_checks are on, turn
|
for (i = number_ptr->begin(); number_ptr->end()!= i; ++i) {
|
||||||
// on checksum verification.
|
|
||||||
ReadOptions r;
|
|
||||||
r.verify_checksums = options_.paranoid_checks;
|
|
||||||
return table_cache_->NewIterator(r, meta.number, meta.file_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ScanTable(uint64_t number) {
|
|
||||||
TableInfo t;
|
TableInfo t;
|
||||||
t.meta.number = number;
|
t.meta.number = *i;
|
||||||
std::string fname = TableFileName(dbname_, number);
|
t.meta.level = level;
|
||||||
Status status = env_->GetFileSize(fname, &t.meta.file_size);
|
Status status = ScanTable(&t);
|
||||||
if (!status.ok()) {
|
if (!status.ok())
|
||||||
// Try alternate file name.
|
{
|
||||||
fname = SSTTableFileName(dbname_, number);
|
std::string fname = TableFileName(options_, t.meta.number, t.meta.level);
|
||||||
Status s2 = env_->GetFileSize(fname, &t.meta.file_size);
|
Log(options_.info_log, "Table #%llu: ignoring %s",
|
||||||
if (s2.ok()) {
|
|
||||||
status = Status::OK();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!status.ok()) {
|
|
||||||
ArchiveFile(TableFileName(dbname_, number));
|
|
||||||
ArchiveFile(SSTTableFileName(dbname_, number));
|
|
||||||
Log(options_.info_log, "Table #%llu: dropped: %s",
|
|
||||||
(unsigned long long) t.meta.number,
|
(unsigned long long) t.meta.number,
|
||||||
status.ToString().c_str());
|
status.ToString().c_str());
|
||||||
return;
|
ArchiveFile(fname, true);
|
||||||
|
} else {
|
||||||
|
tables_[level].push_back(t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract metadata by scanning through table.
|
Status ScanTable(TableInfo* t) {
|
||||||
|
Table * table_ptr;
|
||||||
|
SstCounters counters;
|
||||||
|
std::string fname = TableFileName(options_, t->meta.number, t->meta.level);
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
Iterator* iter = NewTableIterator(t.meta);
|
Status status = env_->GetFileSize(fname, &t->meta.file_size);
|
||||||
|
if (status.ok()) {
|
||||||
|
Iterator* iter = table_cache_->NewIterator(
|
||||||
|
ReadOptions(), t->meta.number, t->meta.file_size, t->meta.level, &table_ptr);
|
||||||
bool empty = true;
|
bool empty = true;
|
||||||
ParsedInternalKey parsed;
|
ParsedInternalKey parsed;
|
||||||
t.max_sequence = 0;
|
t->max_sequence = 0;
|
||||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||||
Slice key = iter->key();
|
Slice key = iter->key();
|
||||||
if (!ParseInternalKey(key, &parsed)) {
|
if (!ParseInternalKey(key, &parsed)) {
|
||||||
Log(options_.info_log, "Table #%llu: unparsable key %s",
|
Log(options_.info_log, "Table #%llu: unparsable key %s",
|
||||||
(unsigned long long) t.meta.number,
|
(unsigned long long) t->meta.number,
|
||||||
EscapeString(key).c_str());
|
EscapeString(key).c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -295,115 +383,79 @@ class Repairer {
|
||||||
counter++;
|
counter++;
|
||||||
if (empty) {
|
if (empty) {
|
||||||
empty = false;
|
empty = false;
|
||||||
t.meta.smallest.DecodeFrom(key);
|
t->meta.smallest.DecodeFrom(key);
|
||||||
}
|
}
|
||||||
t.meta.largest.DecodeFrom(key);
|
t->meta.largest.DecodeFrom(key);
|
||||||
if (parsed.sequence > t.max_sequence) {
|
if (parsed.sequence > t->max_sequence) {
|
||||||
t.max_sequence = parsed.sequence;
|
t->max_sequence = parsed.sequence;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!iter->status().ok()) {
|
if (!iter->status().ok()) {
|
||||||
status = iter->status();
|
status = iter->status();
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
counters=table_ptr->GetSstCounters();
|
||||||
|
t->meta.exp_write_low=counters.Value(eSstCountExpiry1);
|
||||||
|
t->meta.exp_write_high=counters.Value(eSstCountExpiry2);
|
||||||
|
t->meta.exp_explicit_high=counters.Value(eSstCountExpiry3);
|
||||||
|
}
|
||||||
delete iter;
|
delete iter;
|
||||||
|
}
|
||||||
Log(options_.info_log, "Table #%llu: %d entries %s",
|
Log(options_.info_log, "Table #%llu: %d entries %s",
|
||||||
(unsigned long long) t.meta.number,
|
(unsigned long long) t->meta.number,
|
||||||
counter,
|
counter,
|
||||||
status.ToString().c_str());
|
status.ToString().c_str());
|
||||||
|
return status;
|
||||||
if (status.ok()) {
|
|
||||||
tables_.push_back(t);
|
|
||||||
} else {
|
|
||||||
RepairTable(fname, t); // RepairTable archives input file.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void RepairTable(const std::string& src, TableInfo t) {
|
|
||||||
// We will copy src contents to a new table and then rename the
|
|
||||||
// new table over the source.
|
|
||||||
|
|
||||||
// Create builder.
|
|
||||||
std::string copy = TableFileName(dbname_, next_file_number_++);
|
|
||||||
WritableFile* file;
|
|
||||||
Status s = env_->NewWritableFile(copy, &file);
|
|
||||||
if (!s.ok()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
TableBuilder* builder = new TableBuilder(options_, file);
|
|
||||||
|
|
||||||
// Copy data.
|
|
||||||
Iterator* iter = NewTableIterator(t.meta);
|
|
||||||
int counter = 0;
|
|
||||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
||||||
builder->Add(iter->key(), iter->value());
|
|
||||||
counter++;
|
|
||||||
}
|
|
||||||
delete iter;
|
|
||||||
|
|
||||||
ArchiveFile(src);
|
|
||||||
if (counter == 0) {
|
|
||||||
builder->Abandon(); // Nothing to save
|
|
||||||
} else {
|
|
||||||
s = builder->Finish();
|
|
||||||
if (s.ok()) {
|
|
||||||
t.meta.file_size = builder->FileSize();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
delete builder;
|
|
||||||
builder = NULL;
|
|
||||||
|
|
||||||
if (s.ok()) {
|
|
||||||
s = file->Close();
|
|
||||||
}
|
|
||||||
delete file;
|
|
||||||
file = NULL;
|
|
||||||
|
|
||||||
if (counter > 0 && s.ok()) {
|
|
||||||
std::string orig = TableFileName(dbname_, t.meta.number);
|
|
||||||
s = env_->RenameFile(copy, orig);
|
|
||||||
if (s.ok()) {
|
|
||||||
Log(options_.info_log, "Table #%llu: %d entries repaired",
|
|
||||||
(unsigned long long) t.meta.number, counter);
|
|
||||||
tables_.push_back(t);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!s.ok()) {
|
|
||||||
env_->DeleteFile(copy);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Status WriteDescriptor() {
|
Status WriteDescriptor() {
|
||||||
std::string tmp = TempFileName(dbname_, 1);
|
std::string tmp = TempFileName(dbname_, 1);
|
||||||
WritableFile* file;
|
WritableFile* file;
|
||||||
Status status = env_->NewWritableFile(tmp, &file);
|
Status status = env_->NewWritableFile(tmp, &file, 4096);
|
||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
SequenceNumber max_sequence = 0;
|
SequenceNumber max_sequence = 0;
|
||||||
for (size_t i = 0; i < tables_.size(); i++) {
|
for (int level=0; level<config::kNumLevels;++level)
|
||||||
if (max_sequence < tables_[i].max_sequence) {
|
{
|
||||||
max_sequence = tables_[i].max_sequence;
|
std::vector<TableInfo> * table_ptr;
|
||||||
}
|
std::vector<TableInfo>::const_iterator i;
|
||||||
|
|
||||||
|
table_ptr=&tables_[level];
|
||||||
|
|
||||||
|
for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
|
||||||
|
if (max_sequence < i->max_sequence) {
|
||||||
|
max_sequence = i->max_sequence;
|
||||||
}
|
}
|
||||||
|
} // for
|
||||||
|
} // for
|
||||||
|
|
||||||
edit_.SetComparatorName(icmp_.user_comparator()->Name());
|
edit_.SetComparatorName(icmp_.user_comparator()->Name());
|
||||||
edit_.SetLogNumber(0);
|
edit_.SetLogNumber(0);
|
||||||
edit_.SetNextFile(next_file_number_);
|
edit_.SetNextFile(next_file_number_);
|
||||||
edit_.SetLastSequence(max_sequence);
|
edit_.SetLastSequence(max_sequence);
|
||||||
|
|
||||||
for (size_t i = 0; i < tables_.size(); i++) {
|
for (int level=0; level<config::kNumLevels;++level)
|
||||||
// TODO(opt): separate out into multiple levels
|
{
|
||||||
const TableInfo& t = tables_[i];
|
std::vector<TableInfo> * table_ptr;
|
||||||
edit_.AddFile(0, t.meta.number, t.meta.file_size,
|
std::vector<TableInfo>::const_iterator i;
|
||||||
t.meta.smallest, t.meta.largest);
|
|
||||||
}
|
table_ptr=&tables_[level];
|
||||||
|
|
||||||
|
for ( i = table_ptr->begin(); table_ptr->end()!= i; i++) {
|
||||||
|
edit_.AddFile2(level, i->meta.number, i->meta.file_size,
|
||||||
|
i->meta.smallest, i->meta.largest,
|
||||||
|
i->meta.exp_write_low, i->meta.exp_write_high, i->meta.exp_explicit_high);
|
||||||
|
|
||||||
|
} // for
|
||||||
|
} // for
|
||||||
|
|
||||||
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
|
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
|
||||||
{
|
{
|
||||||
log::Writer log(file);
|
log::Writer log(file);
|
||||||
std::string record;
|
std::string record;
|
||||||
edit_.EncodeTo(&record);
|
edit_.EncodeTo(&record); // manifest format is default for release, options_ often incomplete
|
||||||
status = log.AddRecord(record);
|
status = log.AddRecord(record);
|
||||||
}
|
}
|
||||||
if (status.ok()) {
|
if (status.ok()) {
|
||||||
|
@ -431,21 +483,33 @@ class Repairer {
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ArchiveFile(const std::string& fname) {
|
void ArchiveFile(const std::string& fname, bool two_levels=false) {
|
||||||
// Move into another directory. E.g., for
|
// Move into another directory. E.g., for
|
||||||
// dir/foo
|
// dir/foo
|
||||||
// rename to
|
// rename to
|
||||||
// dir/lost/foo
|
// dir/lost/foo
|
||||||
const char* slash = strrchr(fname.c_str(), '/');
|
std::string::size_type slash, slash2;
|
||||||
|
|
||||||
|
slash=fname.rfind('/');
|
||||||
|
if (two_levels && std::string::npos!=slash && 0<slash)
|
||||||
|
{
|
||||||
|
slash2=fname.rfind('/',slash-1);
|
||||||
|
if (std::string::npos==slash2)
|
||||||
|
slash2=slash;
|
||||||
|
} // if
|
||||||
|
else
|
||||||
|
slash2=slash;
|
||||||
|
|
||||||
std::string new_dir;
|
std::string new_dir;
|
||||||
if (slash != NULL) {
|
|
||||||
new_dir.assign(fname.data(), slash - fname.data());
|
if (std::string::npos != slash2 && 0<slash2)
|
||||||
}
|
new_dir.append(fname,0,slash2);
|
||||||
|
|
||||||
new_dir.append("/lost");
|
new_dir.append("/lost");
|
||||||
env_->CreateDir(new_dir); // Ignore error
|
env_->CreateDir(new_dir); // Ignore error
|
||||||
std::string new_file = new_dir;
|
std::string new_file = new_dir;
|
||||||
new_file.append("/");
|
new_file.append("/");
|
||||||
new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
|
new_file.append((std::string::npos!=slash) ? fname.substr(slash+1) : fname);
|
||||||
Status s = env_->RenameFile(fname, new_file);
|
Status s = env_->RenameFile(fname, new_file);
|
||||||
Log(options_.info_log, "Archiving %s: %s\n",
|
Log(options_.info_log, "Archiving %s: %s\n",
|
||||||
fname.c_str(), s.ToString().c_str());
|
fname.c_str(), s.ToString().c_str());
|
||||||
|
|
|
@ -1,10 +1,7 @@
|
||||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
//
|
||||||
#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_
|
|
||||||
#define STORAGE_LEVELDB_DB_SKIPLIST_H_
|
|
||||||
|
|
||||||
// Thread safety
|
// Thread safety
|
||||||
// -------------
|
// -------------
|
||||||
//
|
//
|
||||||
|
@ -55,6 +52,12 @@ class SkipList {
|
||||||
// Returns true iff an entry that compares equal to key is in the list.
|
// Returns true iff an entry that compares equal to key is in the list.
|
||||||
bool Contains(const Key& key) const;
|
bool Contains(const Key& key) const;
|
||||||
|
|
||||||
|
// Returns true if all inserts have been sequentially increasing;
|
||||||
|
// else this SkipList has had keys inserted in non-sequential order
|
||||||
|
bool InSequentialInsertMode() const {
|
||||||
|
return sequentialInsertMode_;
|
||||||
|
}
|
||||||
|
|
||||||
// Iteration over the contents of a skip list
|
// Iteration over the contents of a skip list
|
||||||
class Iterator {
|
class Iterator {
|
||||||
public:
|
public:
|
||||||
|
@ -94,8 +97,22 @@ class SkipList {
|
||||||
// Intentionally copyable
|
// Intentionally copyable
|
||||||
};
|
};
|
||||||
|
|
||||||
|
protected:
|
||||||
|
// Checks the structure of this SkipList object, ensuring the keys are
|
||||||
|
// properly ordered
|
||||||
|
//
|
||||||
|
// This is protected since it is intended for use by unit tests; if a lock
|
||||||
|
// is used to protect Insert(), then it should be used to protect this
|
||||||
|
// method as well
|
||||||
|
bool Valid() const;
|
||||||
|
|
||||||
|
// Disables the sequential insert optimizations (used in performance testing)
|
||||||
|
void DisableSequentialInsertMode() {
|
||||||
|
sequentialInsertMode_ = false;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum { kMaxHeight = 12 };
|
enum { kMaxHeight = 17 };
|
||||||
|
|
||||||
// Immutable after construction
|
// Immutable after construction
|
||||||
Comparator const compare_;
|
Comparator const compare_;
|
||||||
|
@ -115,6 +132,18 @@ class SkipList {
|
||||||
// Read/written only by Insert().
|
// Read/written only by Insert().
|
||||||
Random rnd_;
|
Random rnd_;
|
||||||
|
|
||||||
|
// Points to the last node in the list; modified only by Insert()
|
||||||
|
Node* tail_;
|
||||||
|
|
||||||
|
// Pointers to the nodes previous to the tail node; have max_height_ entries
|
||||||
|
Node* tailPrev_[kMaxHeight];
|
||||||
|
|
||||||
|
// The height of the tail_ node
|
||||||
|
int tailHeight_;
|
||||||
|
|
||||||
|
// We track the tail node until we have a non-sequential insert
|
||||||
|
bool sequentialInsertMode_;
|
||||||
|
|
||||||
Node* NewNode(const Key& key, int height);
|
Node* NewNode(const Key& key, int height);
|
||||||
int RandomHeight();
|
int RandomHeight();
|
||||||
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
|
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
|
||||||
|
@ -129,6 +158,11 @@ class SkipList {
|
||||||
// node at "level" for every level in [0..max_height_-1].
|
// node at "level" for every level in [0..max_height_-1].
|
||||||
Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
|
Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
|
||||||
|
|
||||||
|
// Similar to FindGreaterOrEqual() except it uses the barrier-free
|
||||||
|
// variant of Next(); this is used only by Insert() and it
|
||||||
|
// checks the tail_ pointer in case we're doing a sequential insert
|
||||||
|
Node* NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const;
|
||||||
|
|
||||||
// Return the latest node with a key < key.
|
// Return the latest node with a key < key.
|
||||||
// Return head_ if there is no such node.
|
// Return head_ if there is no such node.
|
||||||
Node* FindLessThan(const Key& key) const;
|
Node* FindLessThan(const Key& key) const;
|
||||||
|
@ -280,6 +314,54 @@ typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOr
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Key, class Comparator>
|
||||||
|
typename SkipList<Key,Comparator>::Node*
|
||||||
|
SkipList<Key,Comparator>::NoBarrier_FindGreaterOrEqual(const Key& key, Node** prev) const {
|
||||||
|
int level = GetMaxHeight() - 1;
|
||||||
|
|
||||||
|
// If we have only seen sequential inserts up to this point, we can use
|
||||||
|
// the tail_ node
|
||||||
|
if ( sequentialInsertMode_ ) {
|
||||||
|
if (tail_ == NULL) {
|
||||||
|
// The list is currently empty, so the node being inserted
|
||||||
|
// will be the new tail_
|
||||||
|
assert(level == 0);
|
||||||
|
if (prev != NULL) prev[0] = head_;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
else if (KeyIsAfterNode(key, tail_)) {
|
||||||
|
// The new key must be inserted after the current tail_ node
|
||||||
|
if (prev != NULL) {
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < tailHeight_; ++i) {
|
||||||
|
prev[i] = tail_;
|
||||||
|
}
|
||||||
|
for (/*continue with i*/; i <= level; ++i) {
|
||||||
|
prev[i] = tailPrev_[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Node* x = head_;
|
||||||
|
while (true) {
|
||||||
|
Node* next = x->NoBarrier_Next(level);
|
||||||
|
if (KeyIsAfterNode(key, next)) {
|
||||||
|
// Keep searching in this list
|
||||||
|
x = next;
|
||||||
|
} else {
|
||||||
|
if (prev != NULL) prev[level] = x;
|
||||||
|
if (level == 0) {
|
||||||
|
return next;
|
||||||
|
} else {
|
||||||
|
// Switch to next list
|
||||||
|
level--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template<typename Key, class Comparator>
|
template<typename Key, class Comparator>
|
||||||
typename SkipList<Key,Comparator>::Node*
|
typename SkipList<Key,Comparator>::Node*
|
||||||
SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
|
SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
|
||||||
|
@ -327,25 +409,41 @@ SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
|
||||||
arena_(arena),
|
arena_(arena),
|
||||||
head_(NewNode(0 /* any key will do */, kMaxHeight)),
|
head_(NewNode(0 /* any key will do */, kMaxHeight)),
|
||||||
max_height_(reinterpret_cast<void*>(1)),
|
max_height_(reinterpret_cast<void*>(1)),
|
||||||
rnd_(0xdeadbeef) {
|
rnd_(0xdeadbeef),
|
||||||
|
tail_(NULL),
|
||||||
|
tailHeight_(0),
|
||||||
|
sequentialInsertMode_(true) {
|
||||||
for (int i = 0; i < kMaxHeight; i++) {
|
for (int i = 0; i < kMaxHeight; i++) {
|
||||||
head_->SetNext(i, NULL);
|
head_->SetNext(i, NULL);
|
||||||
|
tailPrev_[i] = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Key, class Comparator>
|
template<typename Key, class Comparator>
|
||||||
void SkipList<Key,Comparator>::Insert(const Key& key) {
|
void SkipList<Key,Comparator>::Insert(const Key& key) {
|
||||||
// TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
|
// We use a barrier-free variant of FindGreaterOrEqual()
|
||||||
// here since Insert() is externally synchronized.
|
// here since Insert() is externally synchronized.
|
||||||
Node* prev[kMaxHeight];
|
Node* prev[kMaxHeight];
|
||||||
Node* x = FindGreaterOrEqual(key, prev);
|
Node* x = NoBarrier_FindGreaterOrEqual(key, prev);
|
||||||
|
|
||||||
|
// If we're still in sequential-insert mode, check if the new node is being
|
||||||
|
// inserted at the end of the list, which is indicated by x being NULL
|
||||||
|
if (sequentialInsertMode_) {
|
||||||
|
if (x != NULL) {
|
||||||
|
// we have a non-sequential (AKA random) insert, so stop maintaining
|
||||||
|
// the tail bookkeeping overhead
|
||||||
|
sequentialInsertMode_ = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Our data structure does not allow duplicate insertion
|
// Our data structure does not allow duplicate insertion
|
||||||
assert(x == NULL || !Equal(key, x->key));
|
assert(x == NULL || !Equal(key, x->key));
|
||||||
|
|
||||||
int height = RandomHeight();
|
int i, height = RandomHeight();
|
||||||
if (height > GetMaxHeight()) {
|
if (height > GetMaxHeight()) {
|
||||||
for (int i = GetMaxHeight(); i < height; i++) {
|
// We are extending max_height_ which means we need to fill in the blanks
|
||||||
|
// in prev[] that were not filled in by NoBarrier_FindGreaterOrEqual()
|
||||||
|
for (i = GetMaxHeight(); i < height; ++i) {
|
||||||
prev[i] = head_;
|
prev[i] = head_;
|
||||||
}
|
}
|
||||||
//fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
|
//fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
|
||||||
|
@ -361,12 +459,37 @@ void SkipList<Key,Comparator>::Insert(const Key& key) {
|
||||||
}
|
}
|
||||||
|
|
||||||
x = NewNode(key, height);
|
x = NewNode(key, height);
|
||||||
for (int i = 0; i < height; i++) {
|
for (i = 0; i < height; ++i) {
|
||||||
// NoBarrier_SetNext() suffices since we will add a barrier when
|
// NoBarrier_SetNext() suffices since we will add a barrier when
|
||||||
// we publish a pointer to "x" in prev[i].
|
// we publish a pointer to "x" in prev[i].
|
||||||
x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
|
x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
|
||||||
prev[i]->SetNext(i, x);
|
prev[i]->SetNext(i, x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Do we need to update our tail_ pointer?
|
||||||
|
if (sequentialInsertMode_) {
|
||||||
|
Node* prevTail = tail_;
|
||||||
|
int prevTailHeight = tailHeight_;
|
||||||
|
|
||||||
|
tail_ = x;
|
||||||
|
tailHeight_ = height;
|
||||||
|
|
||||||
|
// We also need to update our tailPrev_ pointers; first we capture
|
||||||
|
// the nodes already pointing to the new tail_
|
||||||
|
for (i = 0; i < height; ++i) {
|
||||||
|
tailPrev_[i] = prev[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the previous tail node was taller than the new tail node, then
|
||||||
|
// the prev pointers above the current tail node's height (up to the
|
||||||
|
// height of the previous tail node) are simply the previous tail node
|
||||||
|
for (/*continue with i*/; i < prevTailHeight; ++i) {
|
||||||
|
tailPrev_[i] = prevTail;
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE: any prev pointers above prevTailHeight (up to max_height_) were
|
||||||
|
// already set in tailPrev_ by previous calls to this method
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Key, class Comparator>
|
template<typename Key, class Comparator>
|
||||||
|
@ -379,6 +502,115 @@ bool SkipList<Key,Comparator>::Contains(const Key& key) const {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace leveldb
|
template<typename Key, class Comparator>
|
||||||
|
bool SkipList<Key,Comparator>::Valid() const
|
||||||
|
{
|
||||||
|
// Note that we can use barrier-free overloads in this method since it is
|
||||||
|
// protected by the same lock as Insert().
|
||||||
|
|
||||||
#endif // STORAGE_LEVELDB_DB_SKIPLIST_H_
|
// Ensure that the list is properly sorted; use an iterator for this check
|
||||||
|
const Key* pPrevKey = NULL;
|
||||||
|
typename SkipList<Key, Comparator>::Iterator iter(this);
|
||||||
|
for ( iter.SeekToFirst(); iter.Valid(); iter.Next() ) {
|
||||||
|
if ( pPrevKey != NULL ) {
|
||||||
|
if ( compare_( *pPrevKey, iter.key() ) >= 0 ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pPrevKey = &iter.key();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now walk the linked list at each level and ensure it's sorted. Also track
|
||||||
|
// how many nodes we see at each level; the number of nodes in the linked
|
||||||
|
// list at level n must not be larger than the number of nodes at level n-1.
|
||||||
|
std::vector<int> nodeCounts( GetMaxHeight() );
|
||||||
|
int level;
|
||||||
|
for ( level = GetMaxHeight() - 1; level >= 0; --level ) {
|
||||||
|
int nodeCount = 0;
|
||||||
|
pPrevKey = NULL;
|
||||||
|
for ( Node* pNode = head_->NoBarrier_Next( level );
|
||||||
|
pNode != NULL;
|
||||||
|
pNode = pNode->NoBarrier_Next( level ) ) {
|
||||||
|
++nodeCount;
|
||||||
|
if ( pPrevKey != NULL ) {
|
||||||
|
if ( compare_( *pPrevKey, pNode->key ) >= 0 ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pPrevKey = &pNode->key;
|
||||||
|
}
|
||||||
|
nodeCounts[ level ] = nodeCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the node counts do not increase as we move up the levels
|
||||||
|
int prevNodeCount = nodeCounts[0];
|
||||||
|
for ( level = 1; level < GetMaxHeight(); ++level ) {
|
||||||
|
int currentNodeCount = nodeCounts[ level ];
|
||||||
|
if ( currentNodeCount > prevNodeCount ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
prevNodeCount = currentNodeCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure that tail_ points to the last node
|
||||||
|
if ( sequentialInsertMode_ ) {
|
||||||
|
if ( tail_ == NULL ) {
|
||||||
|
// tail_ is not set, so the list must be empty
|
||||||
|
if ( tailPrev_[0] != NULL || head_->NoBarrier_Next(0) != NULL ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// we have a tail_ node; first ensure that its prev pointer actually
|
||||||
|
// points to it
|
||||||
|
if ( tailPrev_[0] == NULL || tailPrev_[0]->NoBarrier_Next(0) != tail_ ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if ( compare_( tailPrev_[0]->key, tail_->key ) >= 0 ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// now check the rest of the pointers in tailPrev_; up to tailHeight_,
|
||||||
|
// the next pointer of the node in tailPrev_ should point to tail_; after
|
||||||
|
// that, the next pointer should be NULL
|
||||||
|
for ( level = 1; level < GetMaxHeight(); ++level ) {
|
||||||
|
Node* tailPrev = tailPrev_[ level ];
|
||||||
|
if ( tailPrev == NULL ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if ( level < tailHeight_ ) {
|
||||||
|
if ( tailPrev->NoBarrier_Next( level ) != tail_ ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if ( compare_( tailPrev->key, tail_->key ) >= 0 ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if ( tailPrev->NoBarrier_Next( level ) != NULL ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// the remainder of the tailPrev_ pointers (above max_height_)
|
||||||
|
// should be NULL
|
||||||
|
for ( /*continue with level*/; level < kMaxHeight; ++level ) {
|
||||||
|
if ( tailPrev_[ level ] != NULL ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// now ensure that FindLast() returns tail_
|
||||||
|
Node* lastNode = FindLast();
|
||||||
|
if ( lastNode != tail_ ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we get here, all is good
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace leveldb
|
||||||
|
|
|
@ -2,11 +2,15 @@
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
|
#define __STDC_FORMAT_MACROS
|
||||||
|
#include <inttypes.h>
|
||||||
|
|
||||||
#include "db/skiplist.h"
|
#include "db/skiplist.h"
|
||||||
#include <set>
|
#include <set>
|
||||||
#include "leveldb/env.h"
|
#include "leveldb/env.h"
|
||||||
#include "util/arena.h"
|
#include "util/arena.h"
|
||||||
#include "util/hash.h"
|
#include "util/hash.h"
|
||||||
|
#include "util/mutexlock.h"
|
||||||
#include "util/random.h"
|
#include "util/random.h"
|
||||||
#include "util/testharness.h"
|
#include "util/testharness.h"
|
||||||
|
|
||||||
|
@ -26,15 +30,29 @@ struct Comparator {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<typename Key, class Comparator>
|
||||||
|
class SkipListTest : public SkipList<Key, Comparator>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
SkipListTest(Comparator cmp, Arena* arena) : SkipList<Key, Comparator>(cmp, arena) {}
|
||||||
|
|
||||||
|
// check the validity of this SkipList object by calling the Valid() method
|
||||||
|
// in the base class
|
||||||
|
bool Valid() const { return SkipList<Key, Comparator>::Valid(); }
|
||||||
|
|
||||||
|
void DisableSequentialInsertMode() { SkipList<Key, Comparator>::DisableSequentialInsertMode(); }
|
||||||
|
};
|
||||||
|
|
||||||
class SkipTest { };
|
class SkipTest { };
|
||||||
|
|
||||||
TEST(SkipTest, Empty) {
|
TEST(SkipTest, Empty) {
|
||||||
Arena arena;
|
Arena arena;
|
||||||
Comparator cmp;
|
Comparator cmp;
|
||||||
SkipList<Key, Comparator> list(cmp, &arena);
|
SkipListTest<Key, Comparator> list(cmp, &arena);
|
||||||
ASSERT_TRUE(!list.Contains(10));
|
ASSERT_TRUE(!list.Contains(10));
|
||||||
|
ASSERT_TRUE(list.Valid());
|
||||||
|
|
||||||
SkipList<Key, Comparator>::Iterator iter(&list);
|
SkipListTest<Key, Comparator>::Iterator iter(&list);
|
||||||
ASSERT_TRUE(!iter.Valid());
|
ASSERT_TRUE(!iter.Valid());
|
||||||
iter.SeekToFirst();
|
iter.SeekToFirst();
|
||||||
ASSERT_TRUE(!iter.Valid());
|
ASSERT_TRUE(!iter.Valid());
|
||||||
|
@ -51,13 +69,14 @@ TEST(SkipTest, InsertAndLookup) {
|
||||||
std::set<Key> keys;
|
std::set<Key> keys;
|
||||||
Arena arena;
|
Arena arena;
|
||||||
Comparator cmp;
|
Comparator cmp;
|
||||||
SkipList<Key, Comparator> list(cmp, &arena);
|
SkipListTest<Key, Comparator> list(cmp, &arena);
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
Key key = rnd.Next() % R;
|
Key key = rnd.Next() % R;
|
||||||
if (keys.insert(key).second) {
|
if (keys.insert(key).second) {
|
||||||
list.Insert(key);
|
list.Insert(key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
ASSERT_TRUE(list.Valid());
|
||||||
|
|
||||||
for (int i = 0; i < R; i++) {
|
for (int i = 0; i < R; i++) {
|
||||||
if (list.Contains(i)) {
|
if (list.Contains(i)) {
|
||||||
|
@ -69,7 +88,7 @@ TEST(SkipTest, InsertAndLookup) {
|
||||||
|
|
||||||
// Simple iterator tests
|
// Simple iterator tests
|
||||||
{
|
{
|
||||||
SkipList<Key, Comparator>::Iterator iter(&list);
|
SkipListTest<Key, Comparator>::Iterator iter(&list);
|
||||||
ASSERT_TRUE(!iter.Valid());
|
ASSERT_TRUE(!iter.Valid());
|
||||||
|
|
||||||
iter.Seek(0);
|
iter.Seek(0);
|
||||||
|
@ -87,7 +106,7 @@ TEST(SkipTest, InsertAndLookup) {
|
||||||
|
|
||||||
// Forward iteration test
|
// Forward iteration test
|
||||||
for (int i = 0; i < R; i++) {
|
for (int i = 0; i < R; i++) {
|
||||||
SkipList<Key, Comparator>::Iterator iter(&list);
|
SkipListTest<Key, Comparator>::Iterator iter(&list);
|
||||||
iter.Seek(i);
|
iter.Seek(i);
|
||||||
|
|
||||||
// Compare against model iterator
|
// Compare against model iterator
|
||||||
|
@ -107,7 +126,7 @@ TEST(SkipTest, InsertAndLookup) {
|
||||||
|
|
||||||
// Backward iteration test
|
// Backward iteration test
|
||||||
{
|
{
|
||||||
SkipList<Key, Comparator>::Iterator iter(&list);
|
SkipListTest<Key, Comparator>::Iterator iter(&list);
|
||||||
iter.SeekToLast();
|
iter.SeekToLast();
|
||||||
|
|
||||||
// Compare against model iterator
|
// Compare against model iterator
|
||||||
|
@ -250,7 +269,7 @@ class ConcurrentTest {
|
||||||
// Note that generation 0 is never inserted, so it is ok if
|
// Note that generation 0 is never inserted, so it is ok if
|
||||||
// <*,0,*> is missing.
|
// <*,0,*> is missing.
|
||||||
ASSERT_TRUE((gen(pos) == 0) ||
|
ASSERT_TRUE((gen(pos) == 0) ||
|
||||||
(gen(pos) > static_cast<Key>(initial_state.Get(key(pos))))
|
(gen(pos) > initial_state.Get(key(pos)))
|
||||||
) << "key: " << key(pos)
|
) << "key: " << key(pos)
|
||||||
<< "; gen: " << gen(pos)
|
<< "; gen: " << gen(pos)
|
||||||
<< "; initgen: "
|
<< "; initgen: "
|
||||||
|
@ -313,18 +332,16 @@ class TestState {
|
||||||
state_cv_(&mu_) {}
|
state_cv_(&mu_) {}
|
||||||
|
|
||||||
void Wait(ReaderState s) {
|
void Wait(ReaderState s) {
|
||||||
mu_.Lock();
|
MutexLock lock(&mu_);
|
||||||
while (state_ != s) {
|
while (state_ != s) {
|
||||||
state_cv_.Wait();
|
state_cv_.Wait();
|
||||||
}
|
}
|
||||||
mu_.Unlock();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Change(ReaderState s) {
|
void Change(ReaderState s) {
|
||||||
mu_.Lock();
|
MutexLock lock(&mu_);
|
||||||
state_ = s;
|
state_ = s;
|
||||||
state_cv_.Signal();
|
state_cv_.Signal();
|
||||||
mu_.Unlock();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -371,6 +388,211 @@ TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
|
||||||
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
|
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
|
||||||
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
|
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
|
||||||
|
|
||||||
|
static void
|
||||||
|
RunSequentialInsert(
|
||||||
|
const int NumKeys,
|
||||||
|
bool AcquireLock,
|
||||||
|
bool ReverseInsert,
|
||||||
|
bool SequentialInsertModeEnabled )
|
||||||
|
{
|
||||||
|
const int loopCount = 5; // repeat the whole process this many times and average the time spent
|
||||||
|
std::vector<uint64_t> timeSpent;
|
||||||
|
|
||||||
|
port::Mutex mutex;
|
||||||
|
Env* env = Env::Default();
|
||||||
|
|
||||||
|
fprintf( stderr,
|
||||||
|
"Sequentially inserting %d keys in %s order,\n"
|
||||||
|
" seqential insert mode is initially %sabled,\n"
|
||||||
|
" %sacquiring a lock for each insert (averaging over %d runs)\n",
|
||||||
|
NumKeys, ReverseInsert ? "reverse" : "forward",
|
||||||
|
SequentialInsertModeEnabled ? "en" : "dis",
|
||||||
|
AcquireLock ? "" : "not ", loopCount );
|
||||||
|
|
||||||
|
int k;
|
||||||
|
for ( k = 0; k < loopCount; ++k ) {
|
||||||
|
int j;
|
||||||
|
Arena arena;
|
||||||
|
Comparator cmp;
|
||||||
|
SkipListTest<Key, Comparator> list( cmp, &arena );
|
||||||
|
|
||||||
|
// initially the SkipList should be in sequential mode
|
||||||
|
ASSERT_TRUE( list.InSequentialInsertMode() );
|
||||||
|
|
||||||
|
// were we instructed to disable sequential insert mode?
|
||||||
|
if ( !SequentialInsertModeEnabled ) {
|
||||||
|
list.DisableSequentialInsertMode();
|
||||||
|
ASSERT_TRUE( !list.InSequentialInsertMode() );
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t start = env->NowMicros();
|
||||||
|
for ( j = 0; j < NumKeys; ++j ) {
|
||||||
|
Key key = ReverseInsert ? NumKeys - 1 - j : j;
|
||||||
|
|
||||||
|
if ( AcquireLock ) mutex.Lock();
|
||||||
|
list.Insert( key );
|
||||||
|
if ( AcquireLock ) mutex.Unlock();
|
||||||
|
}
|
||||||
|
uint64_t stop = env->NowMicros();
|
||||||
|
timeSpent.push_back( stop - start );
|
||||||
|
//fprintf( stderr, " Time for run %d: %llu\n", k, timeSpent[k] );
|
||||||
|
|
||||||
|
// if SequentialInsertModeEnabled is true, the SkipList should still be
|
||||||
|
// in sequential mode iff ReverseInsert is false
|
||||||
|
if ( SequentialInsertModeEnabled ) {
|
||||||
|
ASSERT_TRUE( list.InSequentialInsertMode() != ReverseInsert );
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
ASSERT_TRUE( !list.InSequentialInsertMode() );
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensure the SkipLlist is properly sorted
|
||||||
|
if ( AcquireLock ) mutex.Lock();
|
||||||
|
ASSERT_TRUE( list.Valid() );
|
||||||
|
if ( AcquireLock ) mutex.Unlock();
|
||||||
|
|
||||||
|
// ensure the SkipList contains all the keys we inserted
|
||||||
|
for ( j = 0; j < NumKeys; ++j ) {
|
||||||
|
ASSERT_TRUE( list.Contains( j ) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// throw out the low and high times and average the rest
|
||||||
|
uint64_t totalTime, lowTime, highTime;
|
||||||
|
totalTime = lowTime = highTime = timeSpent[0];
|
||||||
|
for ( k = 1; k < loopCount; ++k ) {
|
||||||
|
uint64_t currentTime = timeSpent[k];
|
||||||
|
totalTime += currentTime;
|
||||||
|
if ( lowTime > currentTime ) lowTime = currentTime;
|
||||||
|
if ( highTime < currentTime ) highTime = currentTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
totalTime -= (lowTime + highTime);
|
||||||
|
|
||||||
|
uint64_t averageTime = (totalTime / (loopCount - 2));
|
||||||
|
double timePerKey = (double)averageTime / (double)NumKeys;
|
||||||
|
fprintf( stderr, " Average insertion time: %" PRIu64 " (%f/key)\n", averageTime, timePerKey );
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(SkipTest, SequentialInsert_NoLock_ForwardInsert)
|
||||||
|
{
|
||||||
|
int numKeys = 100000;
|
||||||
|
bool acquireLock = false;
|
||||||
|
bool reverseInsert = false;
|
||||||
|
bool sequentialInsertModeEnabled = true;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
|
||||||
|
sequentialInsertModeEnabled = false;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(SkipTest, SequentialInsert_Lock_ForwardInsert)
|
||||||
|
{
|
||||||
|
int numKeys = 100000;
|
||||||
|
bool acquireLock = true;
|
||||||
|
bool reverseInsert = false;
|
||||||
|
bool sequentialInsertModeEnabled = true;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
|
||||||
|
sequentialInsertModeEnabled = false;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(SkipTest, SequentialInsert_NoLock_ReverseInsert)
|
||||||
|
{
|
||||||
|
int numKeys = 100000;
|
||||||
|
bool acquireLock = false;
|
||||||
|
bool reverseInsert = true;
|
||||||
|
bool sequentialInsertModeEnabled = true;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(SkipTest, SequentialInsert_Lock_ReverseInsert)
|
||||||
|
{
|
||||||
|
int numKeys = 100000;
|
||||||
|
bool acquireLock = true;
|
||||||
|
bool reverseInsert = true;
|
||||||
|
bool sequentialInsertModeEnabled = true;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(SkipTest, SequentialInsert_IncreasingNumberOfInserts)
|
||||||
|
{
|
||||||
|
// test with increasing numbers of keys, with sequential-insert mode both
|
||||||
|
// enabled and disabled; we're looking to see if per-key insertion times
|
||||||
|
// trend upward as the number of keys increases
|
||||||
|
int numKeys = 10000;
|
||||||
|
bool acquireLock = false;
|
||||||
|
bool reverseInsert = false;
|
||||||
|
bool sequentialInsertModeEnabled = true;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
|
||||||
|
sequentialInsertModeEnabled = false;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
|
||||||
|
numKeys = 100000;
|
||||||
|
sequentialInsertModeEnabled = true;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
|
||||||
|
sequentialInsertModeEnabled = false;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
|
||||||
|
numKeys = 1000000;
|
||||||
|
sequentialInsertModeEnabled = true;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
|
||||||
|
sequentialInsertModeEnabled = false;
|
||||||
|
RunSequentialInsert( numKeys, acquireLock, reverseInsert, sequentialInsertModeEnabled );
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(SkipTest, SequentialInsert_MixedInsertionModes)
|
||||||
|
{
|
||||||
|
// start inserting sequentially, then switch to non-sequential inserts,
|
||||||
|
// ensuring all works as intended
|
||||||
|
int j, numSequentialKeys = 100000, numNonSequentialKeys = 100000;
|
||||||
|
int totalNumKeys = numSequentialKeys + numNonSequentialKeys;
|
||||||
|
Arena arena;
|
||||||
|
Comparator cmp;
|
||||||
|
SkipListTest<Key, Comparator> list( cmp, &arena );
|
||||||
|
|
||||||
|
// initially the SkipList should be in sequential mode
|
||||||
|
ASSERT_TRUE( list.InSequentialInsertMode() );
|
||||||
|
|
||||||
|
// start inserting at key=1; when we insert 0 below, the list should switch
|
||||||
|
// out of sequential insert mode
|
||||||
|
for ( j = 1; j < numSequentialKeys; ++j ) {
|
||||||
|
list.Insert( j );
|
||||||
|
}
|
||||||
|
|
||||||
|
// the SkipList should still be in sequential mode
|
||||||
|
ASSERT_TRUE( list.InSequentialInsertMode() );
|
||||||
|
ASSERT_TRUE( list.Valid() );
|
||||||
|
|
||||||
|
list.Insert( 0 );
|
||||||
|
ASSERT_TRUE( !list.InSequentialInsertMode() );
|
||||||
|
ASSERT_TRUE( list.Valid() );
|
||||||
|
|
||||||
|
// now insert the remaining keys in non-sequential order (they're not
|
||||||
|
// random, but that doesn't matter here; just ensure we switch to
|
||||||
|
// non-sequential mode and that all continues to work)
|
||||||
|
for ( j = 0; j < numNonSequentialKeys; j += 2 ) {
|
||||||
|
int key = totalNumKeys - j - 1;
|
||||||
|
list.Insert( key );
|
||||||
|
}
|
||||||
|
for ( j = 0; j < numNonSequentialKeys; j += 2 ) {
|
||||||
|
int key = numSequentialKeys + j;
|
||||||
|
list.Insert( key );
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT_TRUE( !list.InSequentialInsertMode() );
|
||||||
|
ASSERT_TRUE( list.Valid() );
|
||||||
|
|
||||||
|
// ensure the SkipList contains all the keys we inserted
|
||||||
|
for ( j = 0; j < totalNumKeys; ++j ) {
|
||||||
|
ASSERT_TRUE( list.Contains( j ) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
|
|
@ -5,7 +5,6 @@
|
||||||
#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
|
#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
|
||||||
#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
|
#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
|
||||||
|
|
||||||
#include "db/dbformat.h"
|
|
||||||
#include "leveldb/db.h"
|
#include "leveldb/db.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
|
@ -5,22 +5,26 @@
|
||||||
#include "db/table_cache.h"
|
#include "db/table_cache.h"
|
||||||
|
|
||||||
#include "db/filename.h"
|
#include "db/filename.h"
|
||||||
|
#include "db/log_reader.h"
|
||||||
|
#include "db/log_writer.h"
|
||||||
|
#include "db/version_edit.h"
|
||||||
#include "leveldb/env.h"
|
#include "leveldb/env.h"
|
||||||
#include "leveldb/table.h"
|
#include "leveldb/table.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
|
#include "leveldb/perf_count.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
struct TableAndFile {
|
|
||||||
RandomAccessFile* file;
|
|
||||||
Table* table;
|
|
||||||
};
|
|
||||||
|
|
||||||
static void DeleteEntry(const Slice& key, void* value) {
|
static void DeleteEntry(const Slice& key, void* value) {
|
||||||
TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
|
TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
|
||||||
|
if (0==dec_and_fetch(&tf->user_count))
|
||||||
|
{
|
||||||
|
if (NULL!=tf->doublecache)
|
||||||
|
tf->doublecache->SubFileSize(tf->table->GetFileSize());
|
||||||
delete tf->table;
|
delete tf->table;
|
||||||
delete tf->file;
|
delete tf->file;
|
||||||
delete tf;
|
delete tf;
|
||||||
|
} // if
|
||||||
}
|
}
|
||||||
|
|
||||||
static void UnrefEntry(void* arg1, void* arg2) {
|
static void UnrefEntry(void* arg1, void* arg2) {
|
||||||
|
@ -31,37 +35,38 @@ static void UnrefEntry(void* arg1, void* arg2) {
|
||||||
|
|
||||||
TableCache::TableCache(const std::string& dbname,
|
TableCache::TableCache(const std::string& dbname,
|
||||||
const Options* options,
|
const Options* options,
|
||||||
int entries)
|
Cache * file_cache,
|
||||||
|
DoubleCache & doublecache)
|
||||||
: env_(options->env),
|
: env_(options->env),
|
||||||
dbname_(dbname),
|
dbname_(dbname),
|
||||||
options_(options),
|
options_(options),
|
||||||
cache_(NewLRUCache(entries)) {
|
cache_(file_cache),
|
||||||
|
doublecache_(doublecache)
|
||||||
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
TableCache::~TableCache() {
|
TableCache::~TableCache() {
|
||||||
delete cache_;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
|
Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, int level,
|
||||||
Cache::Handle** handle) {
|
Cache::Handle** handle, bool is_compaction,
|
||||||
|
bool for_iterator) {
|
||||||
Status s;
|
Status s;
|
||||||
char buf[sizeof(file_number)];
|
char buf[sizeof(file_number)];
|
||||||
EncodeFixed64(buf, file_number);
|
EncodeFixed64(buf, file_number);
|
||||||
Slice key(buf, sizeof(buf));
|
Slice key(buf, sizeof(buf));
|
||||||
*handle = cache_->Lookup(key);
|
*handle = cache_->Lookup(key);
|
||||||
if (*handle == NULL) {
|
if (*handle == NULL) {
|
||||||
std::string fname = TableFileName(dbname_, file_number);
|
std::string fname = TableFileName(*options_, file_number, level);
|
||||||
RandomAccessFile* file = NULL;
|
RandomAccessFile* file = NULL;
|
||||||
Table* table = NULL;
|
Table* table = NULL;
|
||||||
s = env_->NewRandomAccessFile(fname, &file);
|
s = env_->NewRandomAccessFile(fname, &file);
|
||||||
if (!s.ok()) {
|
|
||||||
std::string old_fname = SSTTableFileName(dbname_, file_number);
|
|
||||||
if (env_->NewRandomAccessFile(old_fname, &file).ok()) {
|
|
||||||
s = Status::OK();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
s = Table::Open(*options_, file, file_size, &table);
|
s = Table::Open(*options_, file, file_size, &table);
|
||||||
|
|
||||||
|
// Riak: support opportunity to manage Linux page cache
|
||||||
|
if (is_compaction)
|
||||||
|
file->SetForCompaction(file_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
|
@ -73,22 +78,74 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
|
||||||
TableAndFile* tf = new TableAndFile;
|
TableAndFile* tf = new TableAndFile;
|
||||||
tf->file = file;
|
tf->file = file;
|
||||||
tf->table = table;
|
tf->table = table;
|
||||||
*handle = cache_->Insert(key, tf, 1, &DeleteEntry);
|
tf->doublecache = &doublecache_;
|
||||||
|
tf->file_number = file_number;
|
||||||
|
tf->level = level;
|
||||||
|
|
||||||
|
*handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry);
|
||||||
|
gPerfCounters->Inc(ePerfTableOpened);
|
||||||
|
doublecache_.AddFileSize(table->GetFileSize());
|
||||||
|
|
||||||
|
// temporary hardcoding to match number of levels defined as
|
||||||
|
// overlapped in version_set.cc
|
||||||
|
if (level<config::kNumOverlapLevels)
|
||||||
|
cache_->Addref(*handle);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
Table *table = reinterpret_cast<TableAndFile*>(cache_->Value(*handle))->table;
|
||||||
|
|
||||||
|
// this is NOT first access, see if bloom filter can load now
|
||||||
|
if (!for_iterator && table->ReadFilter())
|
||||||
|
{
|
||||||
|
// TableAndFile now going to be present in two cache entries
|
||||||
|
// 1. retrieve old entry within file cache
|
||||||
|
TableAndFile* tf = reinterpret_cast<TableAndFile*>(cache_->Value(*handle));
|
||||||
|
inc_and_fetch(&tf->user_count);
|
||||||
|
|
||||||
|
// 2. must clean file size, do not want double count
|
||||||
|
if (NULL!=tf->doublecache)
|
||||||
|
tf->doublecache->SubFileSize(tf->table->GetFileSize());
|
||||||
|
|
||||||
|
// 3. release current reference (and possible special overlap reference)
|
||||||
|
cache_->Release(*handle);
|
||||||
|
if (tf->level<config::kNumOverlapLevels)
|
||||||
|
cache_->Release(*handle);
|
||||||
|
|
||||||
|
// 4. create second table cache entry using TableObjectSize that now includes
|
||||||
|
// bloom filter size
|
||||||
|
*handle = cache_->Insert(key, tf, table->TableObjectSize(), &DeleteEntry);
|
||||||
|
|
||||||
|
// 5. set double reference if an overlapped file (prevents from being flushed)
|
||||||
|
if (level<config::kNumOverlapLevels)
|
||||||
|
cache_->Addref(*handle);
|
||||||
|
} // if
|
||||||
|
|
||||||
|
// for Linux, let fadvise start precaching
|
||||||
|
if (is_compaction)
|
||||||
|
{
|
||||||
|
RandomAccessFile *file = reinterpret_cast<TableAndFile*>(cache_->Value(*handle))->file;
|
||||||
|
file->SetForCompaction(file_size);
|
||||||
|
} // if
|
||||||
|
|
||||||
|
gPerfCounters->Inc(ePerfTableCached);
|
||||||
|
} // else
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
Iterator* TableCache::NewIterator(const ReadOptions& options,
|
Iterator* TableCache::NewIterator(const ReadOptions& options,
|
||||||
uint64_t file_number,
|
uint64_t file_number,
|
||||||
uint64_t file_size,
|
uint64_t file_size,
|
||||||
|
int level,
|
||||||
Table** tableptr) {
|
Table** tableptr) {
|
||||||
if (tableptr != NULL) {
|
if (tableptr != NULL) {
|
||||||
*tableptr = NULL;
|
*tableptr = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
Cache::Handle* handle = NULL;
|
Cache::Handle* handle = NULL;
|
||||||
Status s = FindTable(file_number, file_size, &handle);
|
Status s = FindTable(file_number, file_size, level, &handle, options.IsCompaction(), true);
|
||||||
|
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return NewErrorIterator(s);
|
return NewErrorIterator(s);
|
||||||
}
|
}
|
||||||
|
@ -105,11 +162,13 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
|
||||||
Status TableCache::Get(const ReadOptions& options,
|
Status TableCache::Get(const ReadOptions& options,
|
||||||
uint64_t file_number,
|
uint64_t file_number,
|
||||||
uint64_t file_size,
|
uint64_t file_size,
|
||||||
|
int level,
|
||||||
const Slice& k,
|
const Slice& k,
|
||||||
void* arg,
|
void* arg,
|
||||||
void (*saver)(void*, const Slice&, const Slice&)) {
|
bool (*saver)(void*, const Slice&, const Slice&)) {
|
||||||
Cache::Handle* handle = NULL;
|
Cache::Handle* handle = NULL;
|
||||||
Status s = FindTable(file_number, file_size, &handle);
|
Status s = FindTable(file_number, file_size, level, &handle);
|
||||||
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
|
Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
|
||||||
s = t->InternalGet(options, k, arg, saver);
|
s = t->InternalGet(options, k, arg, saver);
|
||||||
|
@ -118,10 +177,60 @@ Status TableCache::Get(const ReadOptions& options,
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
void TableCache::Evict(uint64_t file_number) {
|
void TableCache::Evict(uint64_t file_number, bool is_overlapped) {
|
||||||
char buf[sizeof(file_number)];
|
char buf[sizeof(file_number)];
|
||||||
EncodeFixed64(buf, file_number);
|
EncodeFixed64(buf, file_number);
|
||||||
|
|
||||||
|
// overlapped files have extra reference to prevent their purge,
|
||||||
|
// release that reference now
|
||||||
|
if (is_overlapped)
|
||||||
|
{
|
||||||
|
Cache::Handle *handle;
|
||||||
|
|
||||||
|
// the Lookup call adds a reference too, back out both
|
||||||
|
handle=cache_->Lookup(Slice(buf, sizeof(buf)));
|
||||||
|
|
||||||
|
// with multiple background threads, file might already be
|
||||||
|
// evicted
|
||||||
|
if (NULL!=handle)
|
||||||
|
{
|
||||||
|
cache_->Release(handle); // release for Lookup() call just made
|
||||||
|
cache_->Release(handle); // release for extra reference
|
||||||
|
} // if
|
||||||
|
} // if
|
||||||
|
|
||||||
cache_->Erase(Slice(buf, sizeof(buf)));
|
cache_->Erase(Slice(buf, sizeof(buf)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Riak specific routine to return table statistic ONLY if table metadata
|
||||||
|
* already within cache ... otherwise return 0.
|
||||||
|
*/
|
||||||
|
uint64_t
|
||||||
|
TableCache::GetStatisticValue(
|
||||||
|
uint64_t file_number,
|
||||||
|
unsigned Index)
|
||||||
|
{
|
||||||
|
uint64_t ret_val;
|
||||||
|
char buf[sizeof(file_number)];
|
||||||
|
Cache::Handle *handle;
|
||||||
|
|
||||||
|
ret_val=0;
|
||||||
|
EncodeFixed64(buf, file_number);
|
||||||
|
Slice key(buf, sizeof(buf));
|
||||||
|
handle = cache_->Lookup(key);
|
||||||
|
|
||||||
|
if (NULL != handle)
|
||||||
|
{
|
||||||
|
TableAndFile * tf;
|
||||||
|
|
||||||
|
tf=reinterpret_cast<TableAndFile*>(cache_->Value(handle));
|
||||||
|
ret_val=tf->table->GetSstCounters().Value(Index);
|
||||||
|
cache_->Release(handle);
|
||||||
|
} // if
|
||||||
|
|
||||||
|
return(ret_val);
|
||||||
|
|
||||||
|
} // TableCache::GetStatisticValue
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
|
@ -13,6 +13,7 @@
|
||||||
#include "leveldb/cache.h"
|
#include "leveldb/cache.h"
|
||||||
#include "leveldb/table.h"
|
#include "leveldb/table.h"
|
||||||
#include "port/port.h"
|
#include "port/port.h"
|
||||||
|
#include "util/cache2.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
@ -20,8 +21,10 @@ class Env;
|
||||||
|
|
||||||
class TableCache {
|
class TableCache {
|
||||||
public:
|
public:
|
||||||
TableCache(const std::string& dbname, const Options* options, int entries);
|
// clean up note: file_cache is redundant to GetFileCache available from doublecache
|
||||||
~TableCache();
|
TableCache(const std::string& dbname, const Options* options, Cache * file_cache,
|
||||||
|
DoubleCache & doublecache);
|
||||||
|
virtual ~TableCache();
|
||||||
|
|
||||||
// Return an iterator for the specified file number (the corresponding
|
// Return an iterator for the specified file number (the corresponding
|
||||||
// file length must be exactly "file_size" bytes). If "tableptr" is
|
// file length must be exactly "file_size" bytes). If "tableptr" is
|
||||||
|
@ -33,6 +36,7 @@ class TableCache {
|
||||||
Iterator* NewIterator(const ReadOptions& options,
|
Iterator* NewIterator(const ReadOptions& options,
|
||||||
uint64_t file_number,
|
uint64_t file_number,
|
||||||
uint64_t file_size,
|
uint64_t file_size,
|
||||||
|
int level,
|
||||||
Table** tableptr = NULL);
|
Table** tableptr = NULL);
|
||||||
|
|
||||||
// If a seek to internal key "k" in specified file finds an entry,
|
// If a seek to internal key "k" in specified file finds an entry,
|
||||||
|
@ -40,22 +44,65 @@ class TableCache {
|
||||||
Status Get(const ReadOptions& options,
|
Status Get(const ReadOptions& options,
|
||||||
uint64_t file_number,
|
uint64_t file_number,
|
||||||
uint64_t file_size,
|
uint64_t file_size,
|
||||||
|
int level,
|
||||||
const Slice& k,
|
const Slice& k,
|
||||||
void* arg,
|
void* arg,
|
||||||
void (*handle_result)(void*, const Slice&, const Slice&));
|
bool (*handle_result)(void*, const Slice&, const Slice&));
|
||||||
|
|
||||||
// Evict any entry for the specified file number
|
// Evict any entry for the specified file number
|
||||||
void Evict(uint64_t file_number);
|
void Evict(uint64_t file_number, bool is_overlapped);
|
||||||
|
|
||||||
private:
|
// Riak specific: return table statistic ONLY if table in cache, otherwise zero
|
||||||
|
uint64_t GetStatisticValue(uint64_t file_number, unsigned Index);
|
||||||
|
|
||||||
|
|
||||||
|
// access for testing tools, not for public access
|
||||||
|
Status TEST_FindTable(uint64_t file_number, uint64_t file_size, int level, Cache::Handle** handle)
|
||||||
|
{return( FindTable(file_number, file_size, level, handle));};
|
||||||
|
|
||||||
|
Cache* TEST_GetInternalCache() {return(cache_);};
|
||||||
|
|
||||||
|
void Release(Cache::Handle * handle) {cache_->Release(handle);};
|
||||||
|
|
||||||
|
// routine called if Options::cache_object_warming is true.
|
||||||
|
// Writes list of all file names currently in file cache to disk.
|
||||||
|
Status SaveOpenFileList();
|
||||||
|
|
||||||
|
// routine called if Options::cache_object_warming is true.
|
||||||
|
// Reads file created by SaveOpenFileList() and attempts to open
|
||||||
|
// every file.
|
||||||
|
Status PreloadTableCache();
|
||||||
|
|
||||||
|
// was private, now protected to allow easy unit test overrides
|
||||||
|
protected:
|
||||||
Env* const env_;
|
Env* const env_;
|
||||||
const std::string dbname_;
|
const std::string dbname_;
|
||||||
const Options* options_;
|
const Options* options_;
|
||||||
Cache* cache_;
|
Cache * cache_;
|
||||||
|
DoubleCache & doublecache_;
|
||||||
|
|
||||||
Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
|
// virtual to enable unit test overrides
|
||||||
|
virtual Status FindTable(uint64_t file_number, uint64_t file_size, int level,
|
||||||
|
Cache::Handle**, bool is_compaction=false,
|
||||||
|
bool for_iterator=false);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct TableAndFile {
|
||||||
|
RandomAccessFile* file;
|
||||||
|
Table* table;
|
||||||
|
DoubleCache * doublecache;
|
||||||
|
uint64_t file_number; // saved for cache object warming
|
||||||
|
int level; // saved for cache object warming
|
||||||
|
volatile uint32_t user_count;
|
||||||
|
|
||||||
|
TableAndFile()
|
||||||
|
: file(NULL), table(NULL), doublecache(NULL),
|
||||||
|
file_number(0), level(0), user_count(1)
|
||||||
|
{};
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
|
#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
|
||||||
|
|
|
@ -9,20 +9,6 @@
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
// Tag numbers for serialized VersionEdit. These numbers are written to
|
|
||||||
// disk and should not be changed.
|
|
||||||
enum Tag {
|
|
||||||
kComparator = 1,
|
|
||||||
kLogNumber = 2,
|
|
||||||
kNextFileNumber = 3,
|
|
||||||
kLastSequence = 4,
|
|
||||||
kCompactPointer = 5,
|
|
||||||
kDeletedFile = 6,
|
|
||||||
kNewFile = 7,
|
|
||||||
// 8 was used for large value refs
|
|
||||||
kPrevLogNumber = 9
|
|
||||||
};
|
|
||||||
|
|
||||||
void VersionEdit::Clear() {
|
void VersionEdit::Clear() {
|
||||||
comparator_.clear();
|
comparator_.clear();
|
||||||
log_number_ = 0;
|
log_number_ = 0;
|
||||||
|
@ -34,11 +20,21 @@ void VersionEdit::Clear() {
|
||||||
has_prev_log_number_ = false;
|
has_prev_log_number_ = false;
|
||||||
has_next_file_number_ = false;
|
has_next_file_number_ = false;
|
||||||
has_last_sequence_ = false;
|
has_last_sequence_ = false;
|
||||||
|
has_f1_files_ = false;
|
||||||
|
has_f2_files_ = false;
|
||||||
|
|
||||||
deleted_files_.clear();
|
deleted_files_.clear();
|
||||||
new_files_.clear();
|
new_files_.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
void VersionEdit::EncodeTo(std::string* dst) const {
|
/**
|
||||||
|
* EncodeTo serializes the VersionEdit object
|
||||||
|
* to the "dst" string parameter. "format2" flag
|
||||||
|
* indicates whether serialization should use original
|
||||||
|
* Google format for file objects (false) or Basho's updated
|
||||||
|
* file2 format for expiry enabled file objects (true)
|
||||||
|
*/
|
||||||
|
void VersionEdit::EncodeTo(std::string* dst, bool format2) const {
|
||||||
if (has_comparator_) {
|
if (has_comparator_) {
|
||||||
PutVarint32(dst, kComparator);
|
PutVarint32(dst, kComparator);
|
||||||
PutLengthPrefixedSlice(dst, comparator_);
|
PutLengthPrefixedSlice(dst, comparator_);
|
||||||
|
@ -76,12 +72,21 @@ void VersionEdit::EncodeTo(std::string* dst) const {
|
||||||
|
|
||||||
for (size_t i = 0; i < new_files_.size(); i++) {
|
for (size_t i = 0; i < new_files_.size(); i++) {
|
||||||
const FileMetaData& f = new_files_[i].second;
|
const FileMetaData& f = new_files_[i].second;
|
||||||
|
if (format2)
|
||||||
|
PutVarint32(dst, kNewFile2);
|
||||||
|
else
|
||||||
PutVarint32(dst, kNewFile);
|
PutVarint32(dst, kNewFile);
|
||||||
PutVarint32(dst, new_files_[i].first); // level
|
PutVarint32(dst, new_files_[i].first); // level
|
||||||
PutVarint64(dst, f.number);
|
PutVarint64(dst, f.number);
|
||||||
PutVarint64(dst, f.file_size);
|
PutVarint64(dst, f.file_size);
|
||||||
PutLengthPrefixedSlice(dst, f.smallest.Encode());
|
PutLengthPrefixedSlice(dst, f.smallest.Encode());
|
||||||
PutLengthPrefixedSlice(dst, f.largest.Encode());
|
PutLengthPrefixedSlice(dst, f.largest.Encode());
|
||||||
|
if (format2)
|
||||||
|
{
|
||||||
|
PutVarint64(dst, f.exp_write_low);
|
||||||
|
PutVarint64(dst, f.exp_write_high);
|
||||||
|
PutVarint64(dst, f.exp_explicit_high);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,7 +103,7 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) {
|
||||||
static bool GetLevel(Slice* input, int* level) {
|
static bool GetLevel(Slice* input, int* level) {
|
||||||
uint32_t v;
|
uint32_t v;
|
||||||
if (GetVarint32(input, &v) &&
|
if (GetVarint32(input, &v) &&
|
||||||
v < config::kNumLevels) {
|
v < (unsigned)config::kNumLevels) {
|
||||||
*level = v;
|
*level = v;
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
|
@ -185,13 +190,34 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
|
||||||
GetVarint64(&input, &f.number) &&
|
GetVarint64(&input, &f.number) &&
|
||||||
GetVarint64(&input, &f.file_size) &&
|
GetVarint64(&input, &f.file_size) &&
|
||||||
GetInternalKey(&input, &f.smallest) &&
|
GetInternalKey(&input, &f.smallest) &&
|
||||||
GetInternalKey(&input, &f.largest)) {
|
GetInternalKey(&input, &f.largest))
|
||||||
|
{
|
||||||
|
has_f1_files_ = true;
|
||||||
|
f.level=level;
|
||||||
new_files_.push_back(std::make_pair(level, f));
|
new_files_.push_back(std::make_pair(level, f));
|
||||||
} else {
|
} else {
|
||||||
msg = "new-file entry";
|
msg = "new-file entry";
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case kNewFile2:
|
||||||
|
if (GetLevel(&input, &level) &&
|
||||||
|
GetVarint64(&input, &f.number) &&
|
||||||
|
GetVarint64(&input, &f.file_size) &&
|
||||||
|
GetInternalKey(&input, &f.smallest) &&
|
||||||
|
GetInternalKey(&input, &f.largest) &&
|
||||||
|
GetVarint64(&input, &f.exp_write_low) &&
|
||||||
|
GetVarint64(&input, &f.exp_write_high) &&
|
||||||
|
GetVarint64(&input, &f.exp_explicit_high))
|
||||||
|
{
|
||||||
|
has_f2_files_ = true;
|
||||||
|
f.level=level;
|
||||||
|
new_files_.push_back(std::make_pair(level, f));
|
||||||
|
} else {
|
||||||
|
msg = "new-file2 entry";
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
msg = "unknown tag";
|
msg = "unknown tag";
|
||||||
break;
|
break;
|
||||||
|
@ -258,6 +284,12 @@ std::string VersionEdit::DebugString() const {
|
||||||
r.append(f.smallest.DebugString());
|
r.append(f.smallest.DebugString());
|
||||||
r.append(" .. ");
|
r.append(" .. ");
|
||||||
r.append(f.largest.DebugString());
|
r.append(f.largest.DebugString());
|
||||||
|
r.append(" ");
|
||||||
|
AppendNumberTo(&r, f.exp_write_low);
|
||||||
|
r.append(" ");
|
||||||
|
AppendNumberTo(&r, f.exp_write_high);
|
||||||
|
r.append(" ");
|
||||||
|
AppendNumberTo(&r, f.exp_explicit_high);
|
||||||
}
|
}
|
||||||
r.append("\n}\n");
|
r.append("\n}\n");
|
||||||
return r;
|
return r;
|
||||||
|
|
|
@ -16,15 +16,41 @@ class VersionSet;
|
||||||
|
|
||||||
struct FileMetaData {
|
struct FileMetaData {
|
||||||
int refs;
|
int refs;
|
||||||
int allowed_seeks; // Seeks allowed until compaction
|
// int allowed_seeks; // Seeks allowed until compaction
|
||||||
uint64_t number;
|
uint64_t number;
|
||||||
uint64_t file_size; // File size in bytes
|
uint64_t file_size; // File size in bytes
|
||||||
|
uint64_t num_entries; // count of values in .sst file, only valid during table build
|
||||||
InternalKey smallest; // Smallest internal key served by table
|
InternalKey smallest; // Smallest internal key served by table
|
||||||
InternalKey largest; // Largest internal key served by table
|
InternalKey largest; // Largest internal key served by table
|
||||||
|
int level;
|
||||||
|
ExpiryTimeMicros exp_write_low; // oldest write time in file:
|
||||||
|
// 0 - non-expiry keys exist too
|
||||||
|
// ULLONG_MAX - no write time expiry & no plain keys
|
||||||
|
ExpiryTimeMicros exp_write_high; // most recent write time in file
|
||||||
|
ExpiryTimeMicros exp_explicit_high; // most recent/furthest into future explicit expiry
|
||||||
|
|
||||||
FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { }
|
FileMetaData()
|
||||||
|
: refs(0), /*allowed_seeks(1 << 30),*/ file_size(0),
|
||||||
|
num_entries(0), level(-1), exp_write_low(0), exp_write_high(0), exp_explicit_high(0)
|
||||||
|
{ }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
class FileMetaDataPtrCompare
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
const Comparator * comparator_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit FileMetaDataPtrCompare(const Comparator * Comparer)
|
||||||
|
: comparator_(Comparer) {};
|
||||||
|
|
||||||
|
bool operator() (const FileMetaData * file1, const FileMetaData * file2) const
|
||||||
|
{
|
||||||
|
return(comparator_->Compare(file1->smallest.user_key(), file2->smallest.user_key()) < 0);
|
||||||
|
}
|
||||||
|
}; // class FileMetaDataPtrCompare
|
||||||
|
|
||||||
class VersionEdit {
|
class VersionEdit {
|
||||||
public:
|
public:
|
||||||
VersionEdit() { Clear(); }
|
VersionEdit() { Clear(); }
|
||||||
|
@ -59,6 +85,7 @@ class VersionEdit {
|
||||||
// Add the specified file at the specified number.
|
// Add the specified file at the specified number.
|
||||||
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
|
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
|
||||||
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
|
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
|
||||||
|
#if 0
|
||||||
void AddFile(int level, uint64_t file,
|
void AddFile(int level, uint64_t file,
|
||||||
uint64_t file_size,
|
uint64_t file_size,
|
||||||
const InternalKey& smallest,
|
const InternalKey& smallest,
|
||||||
|
@ -68,6 +95,27 @@ class VersionEdit {
|
||||||
f.file_size = file_size;
|
f.file_size = file_size;
|
||||||
f.smallest = smallest;
|
f.smallest = smallest;
|
||||||
f.largest = largest;
|
f.largest = largest;
|
||||||
|
f.level = level;
|
||||||
|
new_files_.push_back(std::make_pair(level, f));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void AddFile2(int level, uint64_t file,
|
||||||
|
uint64_t file_size,
|
||||||
|
const InternalKey& smallest,
|
||||||
|
const InternalKey& largest,
|
||||||
|
uint64_t exp_write_low,
|
||||||
|
uint64_t exp_write_high,
|
||||||
|
uint64_t exp_explicit_high) {
|
||||||
|
FileMetaData f;
|
||||||
|
f.number = file;
|
||||||
|
f.file_size = file_size;
|
||||||
|
f.smallest = smallest;
|
||||||
|
f.largest = largest;
|
||||||
|
f.level = level;
|
||||||
|
f.exp_write_low = exp_write_low;
|
||||||
|
f.exp_write_high = exp_write_high;
|
||||||
|
f.exp_explicit_high = exp_explicit_high;
|
||||||
new_files_.push_back(std::make_pair(level, f));
|
new_files_.push_back(std::make_pair(level, f));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -75,16 +123,37 @@ class VersionEdit {
|
||||||
void DeleteFile(int level, uint64_t file) {
|
void DeleteFile(int level, uint64_t file) {
|
||||||
deleted_files_.insert(std::make_pair(level, file));
|
deleted_files_.insert(std::make_pair(level, file));
|
||||||
}
|
}
|
||||||
|
size_t DeletedFileCount() const {return(deleted_files_.size());};
|
||||||
|
|
||||||
void EncodeTo(std::string* dst) const;
|
void EncodeTo(std::string* dst, bool format2=true) const;
|
||||||
Status DecodeFrom(const Slice& src);
|
Status DecodeFrom(const Slice& src);
|
||||||
|
|
||||||
|
// unit test access to validate file entries' format types
|
||||||
|
bool HasF1Files() const {return(has_f1_files_);};
|
||||||
|
bool HasF2Files() const {return(has_f2_files_);};
|
||||||
|
|
||||||
std::string DebugString() const;
|
std::string DebugString() const;
|
||||||
|
|
||||||
|
// Tag numbers for serialized VersionEdit. These numbers are written to
|
||||||
|
// disk and should not be changed.
|
||||||
|
enum Tag {
|
||||||
|
kComparator = 1,
|
||||||
|
kLogNumber = 2,
|
||||||
|
kNextFileNumber = 3,
|
||||||
|
kLastSequence = 4,
|
||||||
|
kCompactPointer = 5,
|
||||||
|
kDeletedFile = 6,
|
||||||
|
kNewFile = 7,
|
||||||
|
// 8 was used for large value refs
|
||||||
|
kPrevLogNumber = 9,
|
||||||
|
kFileCacheObject = 10,
|
||||||
|
kNewFile2 = 11 // expiry capable file
|
||||||
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class VersionSet;
|
friend class VersionSet;
|
||||||
|
|
||||||
typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
|
USED_BY_NESTED_FRIEND2(typedef std::set< std::pair<int, uint64_t> > DeletedFileSet)
|
||||||
|
|
||||||
std::string comparator_;
|
std::string comparator_;
|
||||||
uint64_t log_number_;
|
uint64_t log_number_;
|
||||||
|
@ -96,10 +165,13 @@ class VersionEdit {
|
||||||
bool has_prev_log_number_;
|
bool has_prev_log_number_;
|
||||||
bool has_next_file_number_;
|
bool has_next_file_number_;
|
||||||
bool has_last_sequence_;
|
bool has_last_sequence_;
|
||||||
|
// following should be mutually exclusive, but tested independently to be sure
|
||||||
|
bool has_f1_files_; // manifest uses format 1 (for unit tests)
|
||||||
|
bool has_f2_files_; // manifest uses format 2 (for unit tests)
|
||||||
|
|
||||||
std::vector< std::pair<int, InternalKey> > compact_pointers_;
|
USED_BY_NESTED_FRIEND2(std::vector< std::pair<int, InternalKey> > compact_pointers_)
|
||||||
DeletedFileSet deleted_files_;
|
USED_BY_NESTED_FRIEND(DeletedFileSet deleted_files_)
|
||||||
std::vector< std::pair<int, FileMetaData> > new_files_;
|
USED_BY_NESTED_FRIEND2(std::vector< std::pair<int, FileMetaData> > new_files_)
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
|
@ -7,14 +7,22 @@
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
static void TestEncodeDecode(const VersionEdit& edit) {
|
static void TestEncodeDecode(
|
||||||
|
const VersionEdit& edit,
|
||||||
|
bool format2=false) {
|
||||||
std::string encoded, encoded2;
|
std::string encoded, encoded2;
|
||||||
edit.EncodeTo(&encoded);
|
edit.EncodeTo(&encoded,format2);
|
||||||
VersionEdit parsed;
|
VersionEdit parsed;
|
||||||
Status s = parsed.DecodeFrom(encoded);
|
Status s = parsed.DecodeFrom(encoded);
|
||||||
ASSERT_TRUE(s.ok()) << s.ToString();
|
ASSERT_TRUE(s.ok()) << s.ToString();
|
||||||
parsed.EncodeTo(&encoded2);
|
parsed.EncodeTo(&encoded2,format2);
|
||||||
ASSERT_EQ(encoded, encoded2);
|
ASSERT_EQ(encoded, encoded2);
|
||||||
|
|
||||||
|
if (parsed.HasF1Files() || parsed.HasF2Files())
|
||||||
|
{
|
||||||
|
ASSERT_EQ(parsed.HasF1Files(), !format2);
|
||||||
|
ASSERT_EQ(parsed.HasF2Files(), format2);
|
||||||
|
} // if
|
||||||
}
|
}
|
||||||
|
|
||||||
class VersionEditTest { };
|
class VersionEditTest { };
|
||||||
|
@ -25,11 +33,12 @@ TEST(VersionEditTest, EncodeDecode) {
|
||||||
VersionEdit edit;
|
VersionEdit edit;
|
||||||
for (int i = 0; i < 4; i++) {
|
for (int i = 0; i < 4; i++) {
|
||||||
TestEncodeDecode(edit);
|
TestEncodeDecode(edit);
|
||||||
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
|
edit.AddFile2(3, kBig + 300 + i, kBig + 400 + i,
|
||||||
InternalKey("foo", kBig + 500 + i, kTypeValue),
|
InternalKey("foo", 0, kBig + 500 + i, kTypeValue),
|
||||||
InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
|
InternalKey("zoo", 0, kBig + 600 + i, kTypeDeletion),
|
||||||
|
0,0,0);
|
||||||
edit.DeleteFile(4, kBig + 700 + i);
|
edit.DeleteFile(4, kBig + 700 + i);
|
||||||
edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
|
edit.SetCompactPointer(i, InternalKey("x", 0, kBig + 900 + i, kTypeValue));
|
||||||
}
|
}
|
||||||
|
|
||||||
edit.SetComparatorName("foo");
|
edit.SetComparatorName("foo");
|
||||||
|
@ -39,6 +48,29 @@ TEST(VersionEditTest, EncodeDecode) {
|
||||||
TestEncodeDecode(edit);
|
TestEncodeDecode(edit);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(VersionEditTest, EncodeDecodeExpiry) {
|
||||||
|
static const uint64_t kBig = 1ull << 25;
|
||||||
|
|
||||||
|
VersionEdit edit;
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
TestEncodeDecode(edit, false); // only testing for s.ok()
|
||||||
|
edit.AddFile2(3, kBig + 300 + i, kBig + 400 + i,
|
||||||
|
InternalKey("foo", 700+i, kBig + 500 + i, kTypeValueExplicitExpiry),
|
||||||
|
InternalKey("zoo", 800+i, kBig + 600 + i, kTypeDeletion),
|
||||||
|
10203040,
|
||||||
|
123456789,
|
||||||
|
987654321);
|
||||||
|
edit.DeleteFile(4, kBig + 700 + i);
|
||||||
|
edit.SetCompactPointer(i, InternalKey("x", 0, kBig + 900 + i, kTypeValue));
|
||||||
|
}
|
||||||
|
|
||||||
|
edit.SetComparatorName("foo");
|
||||||
|
edit.SetLogNumber(kBig + 100);
|
||||||
|
edit.SetNextFile(kBig + 200);
|
||||||
|
edit.SetLastSequence(kBig + 1000);
|
||||||
|
TestEncodeDecode(edit, true);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -21,7 +21,9 @@
|
||||||
#include "db/dbformat.h"
|
#include "db/dbformat.h"
|
||||||
#include "db/version_edit.h"
|
#include "db/version_edit.h"
|
||||||
#include "port/port.h"
|
#include "port/port.h"
|
||||||
#include "port/thread_annotations.h"
|
#include "leveldb/atomics.h"
|
||||||
|
#include "leveldb/env.h"
|
||||||
|
#include "util/throttle.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
@ -70,7 +72,7 @@ class Version {
|
||||||
FileMetaData* seek_file;
|
FileMetaData* seek_file;
|
||||||
int seek_file_level;
|
int seek_file_level;
|
||||||
};
|
};
|
||||||
Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
|
Status Get(const ReadOptions&, const LookupKey& key, Value* val,
|
||||||
GetStats* stats);
|
GetStats* stats);
|
||||||
|
|
||||||
// Adds "stats" into the current state. Returns true if a new
|
// Adds "stats" into the current state. Returns true if a new
|
||||||
|
@ -78,12 +80,6 @@ class Version {
|
||||||
// REQUIRES: lock is held
|
// REQUIRES: lock is held
|
||||||
bool UpdateStats(const GetStats& stats);
|
bool UpdateStats(const GetStats& stats);
|
||||||
|
|
||||||
// Record a sample of bytes read at the specified internal key.
|
|
||||||
// Samples are taken approximately once every config::kReadBytesPeriod
|
|
||||||
// bytes. Returns true if a new compaction may need to be triggered.
|
|
||||||
// REQUIRES: lock is held
|
|
||||||
bool RecordReadSample(Slice key);
|
|
||||||
|
|
||||||
// Reference count management (so Versions do not disappear out from
|
// Reference count management (so Versions do not disappear out from
|
||||||
// under live iterators)
|
// under live iterators)
|
||||||
void Ref();
|
void Ref();
|
||||||
|
@ -101,43 +97,47 @@ class Version {
|
||||||
// largest_user_key==NULL represents a key larger than all keys in the DB.
|
// largest_user_key==NULL represents a key larger than all keys in the DB.
|
||||||
bool OverlapInLevel(int level,
|
bool OverlapInLevel(int level,
|
||||||
const Slice* smallest_user_key,
|
const Slice* smallest_user_key,
|
||||||
const Slice* largest_user_key);
|
const Slice* largest_user_key) const;
|
||||||
|
|
||||||
// Return the level at which we should place a new memtable compaction
|
// Return the level at which we should place a new memtable compaction
|
||||||
// result that covers the range [smallest_user_key,largest_user_key].
|
// result that covers the range [smallest_user_key,largest_user_key].
|
||||||
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
|
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
|
||||||
const Slice& largest_user_key);
|
const Slice& largest_user_key,
|
||||||
|
const int level_limit);
|
||||||
|
|
||||||
int NumFiles(int level) const { return files_[level].size(); }
|
virtual size_t NumFiles(int level) const { return files_[level].size(); }
|
||||||
|
|
||||||
|
const VersionSet * GetVersionSet() const { return vset_; }
|
||||||
|
|
||||||
|
typedef std::vector<FileMetaData*> FileMetaDataVector_t;
|
||||||
|
|
||||||
|
virtual const std::vector<FileMetaData*> & GetFileList(int level) const {return files_[level];};
|
||||||
|
|
||||||
|
volatile int WritePenalty() const {return write_penalty_; }
|
||||||
|
|
||||||
|
// Riak specific repair routine
|
||||||
|
bool VerifyLevels(int & level, InternalKey & begin, InternalKey & end);
|
||||||
|
|
||||||
// Return a human readable string that describes this version's contents.
|
// Return a human readable string that describes this version's contents.
|
||||||
std::string DebugString() const;
|
std::string DebugString() const;
|
||||||
|
|
||||||
private:
|
protected:
|
||||||
friend class Compaction;
|
friend class Compaction;
|
||||||
friend class VersionSet;
|
friend class VersionSet;
|
||||||
|
|
||||||
class LevelFileNumIterator;
|
class LevelFileNumIterator;
|
||||||
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
|
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
|
||||||
|
|
||||||
// Call func(arg, level, f) for every file that overlaps user_key in
|
|
||||||
// order from newest to oldest. If an invocation of func returns
|
|
||||||
// false, makes no more calls.
|
|
||||||
//
|
|
||||||
// REQUIRES: user portion of internal_key == user_key.
|
|
||||||
void ForEachOverlapping(Slice user_key, Slice internal_key,
|
|
||||||
void* arg,
|
|
||||||
bool (*func)(void*, int, FileMetaData*));
|
|
||||||
|
|
||||||
VersionSet* vset_; // VersionSet to which this Version belongs
|
VersionSet* vset_; // VersionSet to which this Version belongs
|
||||||
Version* next_; // Next version in linked list
|
Version* next_; // Next version in linked list
|
||||||
Version* prev_; // Previous version in linked list
|
Version* prev_; // Previous version in linked list
|
||||||
int refs_; // Number of live refs to this version
|
int refs_; // Number of live refs to this version
|
||||||
|
|
||||||
// List of files per level
|
// List of files per level
|
||||||
std::vector<FileMetaData*> files_[config::kNumLevels];
|
USED_BY_NESTED_FRIEND(std::vector<FileMetaData*> files_[config::kNumLevels];)
|
||||||
|
|
||||||
// Next file to compact based on seek stats.
|
protected:
|
||||||
|
// Next file to compact based on seek stats (or Riak delete test)
|
||||||
FileMetaData* file_to_compact_;
|
FileMetaData* file_to_compact_;
|
||||||
int file_to_compact_level_;
|
int file_to_compact_level_;
|
||||||
|
|
||||||
|
@ -146,17 +146,29 @@ class Version {
|
||||||
// are initialized by Finalize().
|
// are initialized by Finalize().
|
||||||
double compaction_score_;
|
double compaction_score_;
|
||||||
int compaction_level_;
|
int compaction_level_;
|
||||||
|
bool compaction_grooming_;
|
||||||
|
bool compaction_no_move_;
|
||||||
|
bool compaction_expirefile_;
|
||||||
|
volatile int write_penalty_;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
// make the ctor/dtor protected, so that a unit test can subclass
|
||||||
explicit Version(VersionSet* vset)
|
explicit Version(VersionSet* vset)
|
||||||
: vset_(vset), next_(this), prev_(this), refs_(0),
|
: vset_(vset), next_(this), prev_(this), refs_(0),
|
||||||
file_to_compact_(NULL),
|
file_to_compact_(NULL),
|
||||||
file_to_compact_level_(-1),
|
file_to_compact_level_(-1),
|
||||||
compaction_score_(-1),
|
compaction_score_(-1),
|
||||||
compaction_level_(-1) {
|
compaction_level_(-1),
|
||||||
|
compaction_grooming_(false),
|
||||||
|
compaction_no_move_(false),
|
||||||
|
compaction_expirefile_(false),
|
||||||
|
write_penalty_(0)
|
||||||
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
~Version();
|
virtual ~Version();
|
||||||
|
|
||||||
|
private:
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
Version(const Version&);
|
Version(const Version&);
|
||||||
void operator=(const Version&);
|
void operator=(const Version&);
|
||||||
|
@ -175,11 +187,10 @@ class VersionSet {
|
||||||
// current version. Will release *mu while actually writing to the file.
|
// current version. Will release *mu while actually writing to the file.
|
||||||
// REQUIRES: *mu is held on entry.
|
// REQUIRES: *mu is held on entry.
|
||||||
// REQUIRES: no other thread concurrently calls LogAndApply()
|
// REQUIRES: no other thread concurrently calls LogAndApply()
|
||||||
Status LogAndApply(VersionEdit* edit, port::Mutex* mu)
|
Status LogAndApply(VersionEdit* edit, port::Mutex* mu);
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(mu);
|
|
||||||
|
|
||||||
// Recover the last saved descriptor from persistent storage.
|
// Recover the last saved descriptor from persistent storage.
|
||||||
Status Recover(bool *save_manifest);
|
Status Recover();
|
||||||
|
|
||||||
// Return the current version.
|
// Return the current version.
|
||||||
Version* current() const { return current_; }
|
Version* current() const { return current_; }
|
||||||
|
@ -188,19 +199,29 @@ class VersionSet {
|
||||||
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
|
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
|
||||||
|
|
||||||
// Allocate and return a new file number
|
// Allocate and return a new file number
|
||||||
uint64_t NewFileNumber() { return next_file_number_++; }
|
// (-1 is to "duplicate" old post-increment logic while maintaining
|
||||||
|
// some threading integrity ... next_file_number_ used naked a bunch)
|
||||||
|
uint64_t NewFileNumber() { return(inc_and_fetch(&next_file_number_) -1); }
|
||||||
|
|
||||||
// Arrange to reuse "file_number" unless a newer file number has
|
// Arrange to reuse "file_number" unless a newer file number has
|
||||||
// already been allocated.
|
// already been allocated.
|
||||||
// REQUIRES: "file_number" was returned by a call to NewFileNumber().
|
// REQUIRES: "file_number" was returned by a call to NewFileNumber().
|
||||||
|
// (disabled due to threading concerns ... and desire NOT to use mutex, matthewv)
|
||||||
void ReuseFileNumber(uint64_t file_number) {
|
void ReuseFileNumber(uint64_t file_number) {
|
||||||
if (next_file_number_ == file_number + 1) {
|
// if (next_file_number_ == file_number + 1) {
|
||||||
next_file_number_ = file_number;
|
// next_file_number_ = file_number;
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the number of Table files at the specified level.
|
// Return the number of Table files at the specified level.
|
||||||
int NumLevelFiles(int level) const;
|
size_t NumLevelFiles(int level) const;
|
||||||
|
|
||||||
|
// is the specified level overlapped (or if false->sorted)
|
||||||
|
static bool IsLevelOverlapped(int level);
|
||||||
|
|
||||||
|
static uint64_t DesiredBytesForLevel(int level);
|
||||||
|
static uint64_t MaxBytesForLevel(int level);
|
||||||
|
static uint64_t MaxFileSizeForLevel(int level);
|
||||||
|
|
||||||
// Return the combined file size of all files at the specified level.
|
// Return the combined file size of all files at the specified level.
|
||||||
int64_t NumLevelBytes(int level) const;
|
int64_t NumLevelBytes(int level) const;
|
||||||
|
@ -224,11 +245,36 @@ class VersionSet {
|
||||||
// being compacted, or zero if there is no such log file.
|
// being compacted, or zero if there is no such log file.
|
||||||
uint64_t PrevLogNumber() const { return prev_log_number_; }
|
uint64_t PrevLogNumber() const { return prev_log_number_; }
|
||||||
|
|
||||||
|
int WriteThrottleUsec(bool active_compaction)
|
||||||
|
{
|
||||||
|
uint64_t penalty, throttle;
|
||||||
|
int ret_val;
|
||||||
|
|
||||||
|
penalty=current_->write_penalty_;
|
||||||
|
throttle=GetThrottleWriteRate();
|
||||||
|
|
||||||
|
ret_val=0;
|
||||||
|
if (0==penalty && 1!=throttle)
|
||||||
|
ret_val=(int)throttle;
|
||||||
|
else if (0!=penalty)
|
||||||
|
{
|
||||||
|
if (1==throttle)
|
||||||
|
throttle=GetUnadjustedThrottleWriteRate();
|
||||||
|
ret_val=(int)penalty * throttle;
|
||||||
|
} // else if
|
||||||
|
|
||||||
|
return(ret_val);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Pick level and inputs for a new compaction.
|
// Pick level and inputs for a new compaction.
|
||||||
// Returns NULL if there is no compaction to be done.
|
// Returns NULL if there is no compaction to be done.
|
||||||
// Otherwise returns a pointer to a heap-allocated object that
|
// Otherwise returns a pointer to a heap-allocated object that
|
||||||
// describes the compaction. Caller should delete the result.
|
// describes the compaction. Caller should delete the result.
|
||||||
Compaction* PickCompaction();
|
//
|
||||||
|
// Riak October 2013: Pick Compaction now posts work directly
|
||||||
|
// to hot_thread pools
|
||||||
|
void PickCompaction(class DBImpl * db_impl);
|
||||||
|
|
||||||
// Return a compaction object for compacting the range [begin,end] in
|
// Return a compaction object for compacting the range [begin,end] in
|
||||||
// the specified level. Returns NULL if there is nothing in that
|
// the specified level. Returns NULL if there is nothing in that
|
||||||
|
@ -267,16 +313,42 @@ class VersionSet {
|
||||||
char buffer[100];
|
char buffer[100];
|
||||||
};
|
};
|
||||||
const char* LevelSummary(LevelSummaryStorage* scratch) const;
|
const char* LevelSummary(LevelSummaryStorage* scratch) const;
|
||||||
|
const char* CompactionSummary(LevelSummaryStorage* scratch) const;
|
||||||
|
|
||||||
private:
|
TableCache* GetTableCache() {return(table_cache_);};
|
||||||
|
|
||||||
|
const Options * GetOptions() const {return(options_);};
|
||||||
|
|
||||||
|
bool IsCompactionSubmitted(int level)
|
||||||
|
{return(m_CompactionStatus[level].m_Submitted);}
|
||||||
|
|
||||||
|
void SetCompactionSubmitted(int level)
|
||||||
|
{m_CompactionStatus[level].m_Submitted=true;}
|
||||||
|
|
||||||
|
void SetCompactionRunning(int level)
|
||||||
|
{m_CompactionStatus[level].m_Running=true;}
|
||||||
|
|
||||||
|
void SetCompactionDone(int level, uint64_t Now)
|
||||||
|
{ m_CompactionStatus[level].m_Running=false;
|
||||||
|
m_CompactionStatus[level].m_Submitted=false;
|
||||||
|
// must set both source and destination. otherwise
|
||||||
|
// destination might immediately decide it needs a
|
||||||
|
// timed grooming too ... defeating idea to spread out the groomings
|
||||||
|
m_CompactionStatus[level].m_LastCompaction=Now;
|
||||||
|
if ((level+1)<config::kNumLevels)
|
||||||
|
m_CompactionStatus[level+1].m_LastCompaction=Now;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool NeighborCompactionsQuiet(int level);
|
||||||
|
|
||||||
|
protected:
|
||||||
class Builder;
|
class Builder;
|
||||||
|
|
||||||
friend class Compaction;
|
friend class Compaction;
|
||||||
friend class Version;
|
friend class Version;
|
||||||
|
|
||||||
bool ReuseManifest(const std::string& dscname, const std::string& dscbase);
|
bool Finalize(Version* v);
|
||||||
|
void UpdatePenalty(Version *v);
|
||||||
void Finalize(Version* v);
|
|
||||||
|
|
||||||
void GetRange(const std::vector<FileMetaData*>& inputs,
|
void GetRange(const std::vector<FileMetaData*>& inputs,
|
||||||
InternalKey* smallest,
|
InternalKey* smallest,
|
||||||
|
@ -299,7 +371,7 @@ class VersionSet {
|
||||||
const Options* const options_;
|
const Options* const options_;
|
||||||
TableCache* const table_cache_;
|
TableCache* const table_cache_;
|
||||||
const InternalKeyComparator icmp_;
|
const InternalKeyComparator icmp_;
|
||||||
uint64_t next_file_number_;
|
volatile uint64_t next_file_number_;
|
||||||
uint64_t manifest_file_number_;
|
uint64_t manifest_file_number_;
|
||||||
uint64_t last_sequence_;
|
uint64_t last_sequence_;
|
||||||
uint64_t log_number_;
|
uint64_t log_number_;
|
||||||
|
@ -315,11 +387,44 @@ class VersionSet {
|
||||||
// Either an empty string, or a valid InternalKey.
|
// Either an empty string, or a valid InternalKey.
|
||||||
std::string compact_pointer_[config::kNumLevels];
|
std::string compact_pointer_[config::kNumLevels];
|
||||||
|
|
||||||
|
// Riak allows multiple compaction threads, this mutex allows
|
||||||
|
// only one to write to manifest at a time. Only used in LogAndApply
|
||||||
|
port::Mutex manifest_mutex_;
|
||||||
|
|
||||||
|
volatile uint64_t last_penalty_minutes_;
|
||||||
|
volatile int prev_write_penalty_;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
struct CompactionStatus_s
|
||||||
|
{
|
||||||
|
bool m_Submitted; //!< level submitted to hot thread pool
|
||||||
|
bool m_Running; //!< thread actually running compaction
|
||||||
|
uint64_t m_LastCompaction; //!<NowMicros() when last compaction completed
|
||||||
|
|
||||||
|
CompactionStatus_s()
|
||||||
|
: m_Submitted(false), m_Running(false), m_LastCompaction(0)
|
||||||
|
{};
|
||||||
|
} m_CompactionStatus[config::kNumLevels];
|
||||||
|
|
||||||
|
private:
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
VersionSet(const VersionSet&);
|
VersionSet(const VersionSet&);
|
||||||
void operator=(const VersionSet&);
|
void operator=(const VersionSet&);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
//
|
||||||
|
// allows routing of compaction request to
|
||||||
|
// diverse processing routines via common
|
||||||
|
// BackgroundCall2 thread entry
|
||||||
|
//
|
||||||
|
enum CompactionType
|
||||||
|
{
|
||||||
|
kNormalCompaction = 0x0,
|
||||||
|
kExpiryFileCompaction = 0x1
|
||||||
|
}; // CompactionType
|
||||||
|
|
||||||
|
|
||||||
// A Compaction encapsulates information about a compaction.
|
// A Compaction encapsulates information about a compaction.
|
||||||
class Compaction {
|
class Compaction {
|
||||||
public:
|
public:
|
||||||
|
@ -329,6 +434,9 @@ class Compaction {
|
||||||
// and "level+1" will be merged to produce a set of "level+1" files.
|
// and "level+1" will be merged to produce a set of "level+1" files.
|
||||||
int level() const { return level_; }
|
int level() const { return level_; }
|
||||||
|
|
||||||
|
// Return parent Version object
|
||||||
|
const Version * version() const { return input_version_; }
|
||||||
|
|
||||||
// Return the object that holds the edits to the descriptor done
|
// Return the object that holds the edits to the descriptor done
|
||||||
// by this compaction.
|
// by this compaction.
|
||||||
VersionEdit* edit() { return &edit_; }
|
VersionEdit* edit() { return &edit_; }
|
||||||
|
@ -356,32 +464,47 @@ class Compaction {
|
||||||
|
|
||||||
// Returns true iff we should stop building the current output
|
// Returns true iff we should stop building the current output
|
||||||
// before processing "internal_key".
|
// before processing "internal_key".
|
||||||
bool ShouldStopBefore(const Slice& internal_key);
|
bool ShouldStopBefore(const Slice& internal_key, size_t key_count);
|
||||||
|
|
||||||
// Release the input version for the compaction, once the compaction
|
// Release the input version for the compaction, once the compaction
|
||||||
// is successful.
|
// is successful.
|
||||||
void ReleaseInputs();
|
void ReleaseInputs();
|
||||||
|
|
||||||
|
// Riak specific: get summary statistics from compaction inputs
|
||||||
|
void CalcInputStats(TableCache & tables);
|
||||||
|
size_t TotalUserDataSize() const {return(tot_user_data_);};
|
||||||
|
size_t TotalIndexKeys() const {return(tot_index_keys_);};
|
||||||
|
size_t AverageValueSize() const {return(avg_value_size_);};
|
||||||
|
size_t AverageKeySize() const {return(avg_key_size_);};
|
||||||
|
size_t AverageBlockSize() const {return(avg_block_size_);};
|
||||||
|
bool IsCompressible() const {return(compressible_);};
|
||||||
|
|
||||||
|
// Riak specific: is move operation ok for compaction?
|
||||||
|
bool IsMoveOk() const {return(!no_move_);};
|
||||||
|
|
||||||
|
enum CompactionType GetCompactionType() const {return(compaction_type_);};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class Version;
|
friend class Version;
|
||||||
friend class VersionSet;
|
friend class VersionSet;
|
||||||
|
|
||||||
Compaction(const Options* options, int level);
|
explicit Compaction(int level);
|
||||||
|
|
||||||
int level_;
|
int level_;
|
||||||
uint64_t max_output_file_size_;
|
uint64_t max_output_file_size_;
|
||||||
Version* input_version_;
|
Version* input_version_;
|
||||||
VersionEdit edit_;
|
VersionEdit edit_;
|
||||||
|
CompactionType compaction_type_;
|
||||||
|
|
||||||
// Each compaction reads inputs from "level_" and "level_+1"
|
// Each compaction reads inputs from "level_" and "level_+1"
|
||||||
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
|
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
|
||||||
|
|
||||||
// State used to check for number of overlapping grandparent files
|
// State used to check for number of of overlapping grandparent files
|
||||||
// (parent == level_ + 1, grandparent == level_ + 2)
|
// (parent == level_ + 1, grandparent == level_ + 2)
|
||||||
std::vector<FileMetaData*> grandparents_;
|
std::vector<FileMetaData*> grandparents_;
|
||||||
size_t grandparent_index_; // Index in grandparent_starts_
|
size_t grandparent_index_; // Index in grandparent_starts_
|
||||||
bool seen_key_; // Some output key has been seen
|
bool seen_key_; // Some output key has been seen
|
||||||
int64_t overlapped_bytes_; // Bytes of overlap between current output
|
uint64_t overlapped_bytes_; // Bytes of overlap between current output
|
||||||
// and grandparent files
|
// and grandparent files
|
||||||
|
|
||||||
// State for implementing IsBaseLevelForKey
|
// State for implementing IsBaseLevelForKey
|
||||||
|
@ -391,6 +514,16 @@ class Compaction {
|
||||||
// higher level than the ones involved in this compaction (i.e. for
|
// higher level than the ones involved in this compaction (i.e. for
|
||||||
// all L >= level_ + 2).
|
// all L >= level_ + 2).
|
||||||
size_t level_ptrs_[config::kNumLevels];
|
size_t level_ptrs_[config::kNumLevels];
|
||||||
|
|
||||||
|
// Riak specific: output statistics from CalcInputStats
|
||||||
|
size_t tot_user_data_;
|
||||||
|
size_t tot_index_keys_;
|
||||||
|
size_t avg_value_size_;
|
||||||
|
size_t avg_key_size_;
|
||||||
|
size_t avg_block_size_;
|
||||||
|
bool compressible_;
|
||||||
|
bool stats_done_;
|
||||||
|
bool no_move_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
|
@ -27,13 +27,13 @@ class FindFileTest {
|
||||||
SequenceNumber largest_seq = 100) {
|
SequenceNumber largest_seq = 100) {
|
||||||
FileMetaData* f = new FileMetaData;
|
FileMetaData* f = new FileMetaData;
|
||||||
f->number = files_.size() + 1;
|
f->number = files_.size() + 1;
|
||||||
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
|
f->smallest = InternalKey(smallest, 0, smallest_seq, kTypeValue);
|
||||||
f->largest = InternalKey(largest, largest_seq, kTypeValue);
|
f->largest = InternalKey(largest, 0, largest_seq, kTypeValue);
|
||||||
files_.push_back(f);
|
files_.push_back(f);
|
||||||
}
|
}
|
||||||
|
|
||||||
int Find(const char* key) {
|
int Find(const char* key) {
|
||||||
InternalKey target(key, 100, kTypeValue);
|
InternalKey target(key, 0, 100, kTypeValue);
|
||||||
InternalKeyComparator cmp(BytewiseComparator());
|
InternalKeyComparator cmp(BytewiseComparator());
|
||||||
return FindFile(cmp, files_, target.Encode());
|
return FindFile(cmp, files_, target.Encode());
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,13 +13,17 @@
|
||||||
// len: varint32
|
// len: varint32
|
||||||
// data: uint8[len]
|
// data: uint8[len]
|
||||||
|
|
||||||
#include "leveldb/write_batch.h"
|
#include <stdint.h>
|
||||||
|
|
||||||
#include "leveldb/db.h"
|
#include "leveldb/db.h"
|
||||||
|
#include "leveldb/env.h"
|
||||||
|
#include "leveldb/expiry.h"
|
||||||
|
#include "leveldb/write_batch.h"
|
||||||
#include "db/dbformat.h"
|
#include "db/dbformat.h"
|
||||||
#include "db/memtable.h"
|
#include "db/memtable.h"
|
||||||
#include "db/write_batch_internal.h"
|
#include "db/write_batch_internal.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
|
#include "util/throttle.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
@ -47,16 +51,17 @@ Status WriteBatch::Iterate(Handler* handler) const {
|
||||||
|
|
||||||
input.remove_prefix(kHeader);
|
input.remove_prefix(kHeader);
|
||||||
Slice key, value;
|
Slice key, value;
|
||||||
|
ExpiryTimeMicros expiry;
|
||||||
int found = 0;
|
int found = 0;
|
||||||
while (!input.empty()) {
|
while (!input.empty()) {
|
||||||
found++;
|
found++;
|
||||||
char tag = input[0];
|
ValueType tag = (ValueType)input[0];
|
||||||
input.remove_prefix(1);
|
input.remove_prefix(1);
|
||||||
switch (tag) {
|
switch (tag) {
|
||||||
case kTypeValue:
|
case kTypeValue:
|
||||||
if (GetLengthPrefixedSlice(&input, &key) &&
|
if (GetLengthPrefixedSlice(&input, &key) &&
|
||||||
GetLengthPrefixedSlice(&input, &value)) {
|
GetLengthPrefixedSlice(&input, &value)) {
|
||||||
handler->Put(key, value);
|
handler->Put(key, value, kTypeValue, 0);
|
||||||
} else {
|
} else {
|
||||||
return Status::Corruption("bad WriteBatch Put");
|
return Status::Corruption("bad WriteBatch Put");
|
||||||
}
|
}
|
||||||
|
@ -68,6 +73,16 @@ Status WriteBatch::Iterate(Handler* handler) const {
|
||||||
return Status::Corruption("bad WriteBatch Delete");
|
return Status::Corruption("bad WriteBatch Delete");
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case kTypeValueWriteTime:
|
||||||
|
case kTypeValueExplicitExpiry:
|
||||||
|
if (GetLengthPrefixedSlice(&input, &key) &&
|
||||||
|
GetVarint64(&input, &expiry) &&
|
||||||
|
GetLengthPrefixedSlice(&input, &value)) {
|
||||||
|
handler->Put(key, value, tag, expiry);
|
||||||
|
} else {
|
||||||
|
return Status::Corruption("bad WriteBatch Expiry");
|
||||||
|
}
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
return Status::Corruption("unknown WriteBatch tag");
|
return Status::Corruption("unknown WriteBatch tag");
|
||||||
}
|
}
|
||||||
|
@ -95,10 +110,20 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
|
||||||
EncodeFixed64(&b->rep_[0], seq);
|
EncodeFixed64(&b->rep_[0], seq);
|
||||||
}
|
}
|
||||||
|
|
||||||
void WriteBatch::Put(const Slice& key, const Slice& value) {
|
void WriteBatch::Put(const Slice& key, const Slice& value, const KeyMetaData * meta) {
|
||||||
|
KeyMetaData local_meta;
|
||||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
|
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
|
||||||
rep_.push_back(static_cast<char>(kTypeValue));
|
if (NULL!=meta)
|
||||||
|
local_meta=*meta;
|
||||||
|
rep_.push_back(static_cast<char>(local_meta.m_Type));
|
||||||
PutLengthPrefixedSlice(&rep_, key);
|
PutLengthPrefixedSlice(&rep_, key);
|
||||||
|
if (kTypeValueExplicitExpiry==local_meta.m_Type
|
||||||
|
|| kTypeValueWriteTime==local_meta.m_Type)
|
||||||
|
{
|
||||||
|
if (kTypeValueWriteTime==local_meta.m_Type && 0==local_meta.m_Expiry)
|
||||||
|
local_meta.m_Expiry=GetCachedTimeMicros();
|
||||||
|
PutVarint64(&rep_, local_meta.m_Expiry);
|
||||||
|
} // if
|
||||||
PutLengthPrefixedSlice(&rep_, value);
|
PutLengthPrefixedSlice(&rep_, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -113,23 +138,33 @@ class MemTableInserter : public WriteBatch::Handler {
|
||||||
public:
|
public:
|
||||||
SequenceNumber sequence_;
|
SequenceNumber sequence_;
|
||||||
MemTable* mem_;
|
MemTable* mem_;
|
||||||
|
const Options * options_;
|
||||||
|
|
||||||
virtual void Put(const Slice& key, const Slice& value) {
|
MemTableInserter() : mem_(NULL), options_(NULL) {};
|
||||||
mem_->Add(sequence_, kTypeValue, key, value);
|
|
||||||
|
virtual void Put(const Slice& key, const Slice& value, const ValueType &type, const ExpiryTimeMicros &expiry) {
|
||||||
|
ValueType type_use(type);
|
||||||
|
ExpiryTimeMicros expiry_use(expiry);
|
||||||
|
|
||||||
|
if (NULL!=options_ && options_->ExpiryActivated())
|
||||||
|
options_->expiry_module->MemTableInserterCallback(key, value, type_use, expiry_use);
|
||||||
|
mem_->Add(sequence_, (ValueType)type_use, key, value, expiry_use);
|
||||||
sequence_++;
|
sequence_++;
|
||||||
}
|
}
|
||||||
virtual void Delete(const Slice& key) {
|
virtual void Delete(const Slice& key) {
|
||||||
mem_->Add(sequence_, kTypeDeletion, key, Slice());
|
mem_->Add(sequence_, kTypeDeletion, key, Slice(), 0);
|
||||||
sequence_++;
|
sequence_++;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
|
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
|
||||||
MemTable* memtable) {
|
MemTable* memtable,
|
||||||
|
const Options * options) {
|
||||||
MemTableInserter inserter;
|
MemTableInserter inserter;
|
||||||
inserter.sequence_ = WriteBatchInternal::Sequence(b);
|
inserter.sequence_ = WriteBatchInternal::Sequence(b);
|
||||||
inserter.mem_ = memtable;
|
inserter.mem_ = memtable;
|
||||||
|
inserter.options_ = options;
|
||||||
return b->Iterate(&inserter);
|
return b->Iterate(&inserter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,6 @@
|
||||||
#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
|
#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
|
||||||
#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
|
#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
|
||||||
|
|
||||||
#include "db/dbformat.h"
|
|
||||||
#include "leveldb/write_batch.h"
|
#include "leveldb/write_batch.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
@ -22,10 +21,10 @@ class WriteBatchInternal {
|
||||||
// Set the count for the number of entries in the batch.
|
// Set the count for the number of entries in the batch.
|
||||||
static void SetCount(WriteBatch* batch, int n);
|
static void SetCount(WriteBatch* batch, int n);
|
||||||
|
|
||||||
// Return the sequence number for the start of this batch.
|
// Return the sequence number for the start of this batch.
|
||||||
static SequenceNumber Sequence(const WriteBatch* batch);
|
static SequenceNumber Sequence(const WriteBatch* batch);
|
||||||
|
|
||||||
// Store the specified number as the sequence number for the start of
|
// Store the specified number as the sequence number for the start of
|
||||||
// this batch.
|
// this batch.
|
||||||
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
|
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
|
||||||
|
|
||||||
|
@ -39,7 +38,7 @@ class WriteBatchInternal {
|
||||||
|
|
||||||
static void SetContents(WriteBatch* batch, const Slice& contents);
|
static void SetContents(WriteBatch* batch, const Slice& contents);
|
||||||
|
|
||||||
static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
|
static Status InsertInto(const WriteBatch* batch, MemTable* memtable, const Options * options);
|
||||||
|
|
||||||
static void Append(WriteBatch* dst, const WriteBatch* src);
|
static void Append(WriteBatch* dst, const WriteBatch* src);
|
||||||
};
|
};
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
#include "leveldb/db.h"
|
#include "leveldb/db.h"
|
||||||
|
|
||||||
#include "db/memtable.h"
|
#include "db/memtable.h"
|
||||||
|
@ -17,11 +18,12 @@ static std::string PrintContents(WriteBatch* b) {
|
||||||
MemTable* mem = new MemTable(cmp);
|
MemTable* mem = new MemTable(cmp);
|
||||||
mem->Ref();
|
mem->Ref();
|
||||||
std::string state;
|
std::string state;
|
||||||
Status s = WriteBatchInternal::InsertInto(b, mem);
|
Status s = WriteBatchInternal::InsertInto(b, mem, NULL);
|
||||||
int count = 0;
|
int count = 0;
|
||||||
Iterator* iter = mem->NewIterator();
|
Iterator* iter = mem->NewIterator();
|
||||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||||
ParsedInternalKey ikey;
|
ParsedInternalKey ikey;
|
||||||
|
std::stringstream sstr;
|
||||||
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
|
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
|
||||||
switch (ikey.type) {
|
switch (ikey.type) {
|
||||||
case kTypeValue:
|
case kTypeValue:
|
||||||
|
@ -32,6 +34,28 @@ static std::string PrintContents(WriteBatch* b) {
|
||||||
state.append(")");
|
state.append(")");
|
||||||
count++;
|
count++;
|
||||||
break;
|
break;
|
||||||
|
case kTypeValueWriteTime:
|
||||||
|
state.append("PutWT(");
|
||||||
|
state.append(ikey.user_key.ToString());
|
||||||
|
state.append(", ");
|
||||||
|
sstr << ikey.expiry;
|
||||||
|
state.append(sstr.str());
|
||||||
|
state.append(", ");
|
||||||
|
state.append(iter->value().ToString());
|
||||||
|
state.append(")");
|
||||||
|
count++;
|
||||||
|
break;
|
||||||
|
case kTypeValueExplicitExpiry:
|
||||||
|
state.append("PutEE(");
|
||||||
|
state.append(ikey.user_key.ToString());
|
||||||
|
state.append(", ");
|
||||||
|
sstr << ikey.expiry;
|
||||||
|
state.append(sstr.str());
|
||||||
|
state.append(", ");
|
||||||
|
state.append(iter->value().ToString());
|
||||||
|
state.append(")");
|
||||||
|
count++;
|
||||||
|
break;
|
||||||
case kTypeDeletion:
|
case kTypeDeletion:
|
||||||
state.append("Delete(");
|
state.append("Delete(");
|
||||||
state.append(ikey.user_key.ToString());
|
state.append(ikey.user_key.ToString());
|
||||||
|
@ -74,6 +98,32 @@ TEST(WriteBatchTest, Multiple) {
|
||||||
PrintContents(&batch));
|
PrintContents(&batch));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(WriteBatchTest, MultipleExpiry) {
|
||||||
|
WriteBatch batch;
|
||||||
|
KeyMetaData meta;
|
||||||
|
batch.Put(Slice("Mary"), Slice("Lamb"));
|
||||||
|
meta.m_Type=kTypeValueExplicitExpiry;
|
||||||
|
meta.m_Expiry=2347;
|
||||||
|
batch.Put(Slice("Adam"), Slice("Ant"), &meta);
|
||||||
|
//batch.PutExplicitExpiry(Slice("Adam"), Slice("Ant"), 2347);
|
||||||
|
batch.Put(Slice("Frosty"), Slice("Snowman"));
|
||||||
|
batch.Put(Slice("Tip"), Slice("ONeal"));
|
||||||
|
batch.Delete(Slice("Frosty"));
|
||||||
|
meta.m_Type=kTypeValueExplicitExpiry;
|
||||||
|
meta.m_Expiry=987654321;
|
||||||
|
batch.Put(Slice("The"), Slice("Fonz"), &meta);
|
||||||
|
WriteBatchInternal::SetSequence(&batch, 200);
|
||||||
|
ASSERT_EQ(200, WriteBatchInternal::Sequence(&batch));
|
||||||
|
ASSERT_EQ(6, WriteBatchInternal::Count(&batch));
|
||||||
|
ASSERT_EQ("PutEE(Adam, 2347, Ant)@201"
|
||||||
|
"Delete(Frosty)@204"
|
||||||
|
"Put(Frosty, Snowman)@202"
|
||||||
|
"Put(Mary, Lamb)@200"
|
||||||
|
"PutEE(The, 987654321, Fonz)@205"
|
||||||
|
"Put(Tip, ONeal)@203",
|
||||||
|
PrintContents(&batch));
|
||||||
|
}
|
||||||
|
|
||||||
TEST(WriteBatchTest, Corruption) {
|
TEST(WriteBatchTest, Corruption) {
|
||||||
WriteBatch batch;
|
WriteBatch batch;
|
||||||
batch.Put(Slice("foo"), Slice("bar"));
|
batch.Put(Slice("foo"), Slice("bar"));
|
||||||
|
|
|
@ -618,7 +618,7 @@ class Benchmark {
|
||||||
ErrorCheck(status);
|
ErrorCheck(status);
|
||||||
|
|
||||||
// Execute read statement
|
// Execute read statement
|
||||||
while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW) {}
|
while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW);
|
||||||
StepErrorCheck(status);
|
StepErrorCheck(status);
|
||||||
|
|
||||||
// Reset SQLite statement for another use
|
// Reset SQLite statement for another use
|
||||||
|
|
|
@ -338,7 +338,7 @@ class Benchmark {
|
||||||
bool write_sync = false;
|
bool write_sync = false;
|
||||||
if (name == Slice("fillseq")) {
|
if (name == Slice("fillseq")) {
|
||||||
Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
|
Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
|
||||||
DBSynchronize(db_);
|
|
||||||
} else if (name == Slice("fillrandom")) {
|
} else if (name == Slice("fillrandom")) {
|
||||||
Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1);
|
Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1);
|
||||||
DBSynchronize(db_);
|
DBSynchronize(db_);
|
||||||
|
|
89
src/leveldb/doc/doc.css
Normal file
89
src/leveldb/doc/doc.css
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
body {
|
||||||
|
margin-left: 0.5in;
|
||||||
|
margin-right: 0.5in;
|
||||||
|
background: white;
|
||||||
|
color: black;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 {
|
||||||
|
margin-left: -0.2in;
|
||||||
|
font-size: 14pt;
|
||||||
|
}
|
||||||
|
h2 {
|
||||||
|
margin-left: -0in;
|
||||||
|
font-size: 12pt;
|
||||||
|
}
|
||||||
|
h3 {
|
||||||
|
margin-left: -0in;
|
||||||
|
}
|
||||||
|
h4 {
|
||||||
|
margin-left: -0in;
|
||||||
|
}
|
||||||
|
hr {
|
||||||
|
margin-left: -0in;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Definition lists: definition term bold */
|
||||||
|
dt {
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
|
||||||
|
address {
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
code,samp,var {
|
||||||
|
color: blue;
|
||||||
|
}
|
||||||
|
kbd {
|
||||||
|
color: #600000;
|
||||||
|
}
|
||||||
|
div.note p {
|
||||||
|
float: right;
|
||||||
|
width: 3in;
|
||||||
|
margin-right: 0%;
|
||||||
|
padding: 1px;
|
||||||
|
border: 2px solid #6060a0;
|
||||||
|
background-color: #fffff0;
|
||||||
|
}
|
||||||
|
|
||||||
|
ul {
|
||||||
|
margin-top: -0em;
|
||||||
|
margin-bottom: -0em;
|
||||||
|
}
|
||||||
|
|
||||||
|
ol {
|
||||||
|
margin-top: -0em;
|
||||||
|
margin-bottom: -0em;
|
||||||
|
}
|
||||||
|
|
||||||
|
UL.nobullets {
|
||||||
|
list-style-type: none;
|
||||||
|
list-style-image: none;
|
||||||
|
margin-left: -1em;
|
||||||
|
}
|
||||||
|
|
||||||
|
p {
|
||||||
|
margin: 1em 0 1em 0;
|
||||||
|
padding: 0 0 0 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
pre {
|
||||||
|
line-height: 1.3em;
|
||||||
|
padding: 0.4em 0 0.8em 0;
|
||||||
|
margin: 0 0 0 0;
|
||||||
|
border: 0 0 0 0;
|
||||||
|
color: blue;
|
||||||
|
}
|
||||||
|
|
||||||
|
.datatable {
|
||||||
|
margin-left: auto;
|
||||||
|
margin-right: auto;
|
||||||
|
margin-top: 2em;
|
||||||
|
margin-bottom: 2em;
|
||||||
|
border: 1px solid;
|
||||||
|
}
|
||||||
|
|
||||||
|
.datatable td,th {
|
||||||
|
padding: 0 0.5em 0 0.5em;
|
||||||
|
text-align: right;
|
||||||
|
}
|
213
src/leveldb/doc/impl.html
Normal file
213
src/leveldb/doc/impl.html
Normal file
|
@ -0,0 +1,213 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<link rel="stylesheet" type="text/css" href="doc.css" />
|
||||||
|
<title>Leveldb file layout and compactions</title>
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h1>Files</h1>
|
||||||
|
|
||||||
|
The implementation of leveldb is similar in spirit to the
|
||||||
|
representation of a single
|
||||||
|
<a href="http://labs.google.com/papers/bigtable.html">
|
||||||
|
Bigtable tablet (section 5.3)</a>.
|
||||||
|
However the organization of the files that make up the representation
|
||||||
|
is somewhat different and is explained below.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Each database is represented by a set of files stored in a directory.
|
||||||
|
There are several different types of files as documented below:
|
||||||
|
<p>
|
||||||
|
<h2>Log files</h2>
|
||||||
|
<p>
|
||||||
|
A log file (*.log) stores a sequence of recent updates. Each update
|
||||||
|
is appended to the current log file. When the log file reaches a
|
||||||
|
pre-determined size (approximately 4MB by default), it is converted
|
||||||
|
to a sorted table (see below) and a new log file is created for future
|
||||||
|
updates.
|
||||||
|
<p>
|
||||||
|
A copy of the current log file is kept in an in-memory structure (the
|
||||||
|
<code>memtable</code>). This copy is consulted on every read so that read
|
||||||
|
operations reflect all logged updates.
|
||||||
|
<p>
|
||||||
|
<h2>Sorted tables</h2>
|
||||||
|
<p>
|
||||||
|
A sorted table (*.sst) stores a sequence of entries sorted by key.
|
||||||
|
Each entry is either a value for the key, or a deletion marker for the
|
||||||
|
key. (Deletion markers are kept around to hide obsolete values
|
||||||
|
present in older sorted tables).
|
||||||
|
<p>
|
||||||
|
The set of sorted tables is organized into a sequence of levels. The
|
||||||
|
sorted table generated from a log file is placed in a special <code>young</code>
|
||||||
|
level (also called level-0). When the number of young files exceeds a
|
||||||
|
certain threshold (currently four), all of the young files are merged
|
||||||
|
together with all of the overlapping level-1 files to produce a
|
||||||
|
sequence of new level-1 files (we create a new level-1 file for every
|
||||||
|
2MB of data.)
|
||||||
|
<p>
|
||||||
|
Files in the young level may contain overlapping keys. However files
|
||||||
|
in other levels have distinct non-overlapping key ranges. Consider
|
||||||
|
level number L where L >= 1. When the combined size of files in
|
||||||
|
level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2,
|
||||||
|
...), one file in level-L, and all of the overlapping files in
|
||||||
|
level-(L+1) are merged to form a set of new files for level-(L+1).
|
||||||
|
These merges have the effect of gradually migrating new updates from
|
||||||
|
the young level to the largest level using only bulk reads and writes
|
||||||
|
(i.e., minimizing expensive seeks).
|
||||||
|
|
||||||
|
<h2>Manifest</h2>
|
||||||
|
<p>
|
||||||
|
A MANIFEST file lists the set of sorted tables that make up each
|
||||||
|
level, the corresponding key ranges, and other important metadata.
|
||||||
|
A new MANIFEST file (with a new number embedded in the file name)
|
||||||
|
is created whenever the database is reopened. The MANIFEST file is
|
||||||
|
formatted as a log, and changes made to the serving state (as files
|
||||||
|
are added or removed) are appended to this log.
|
||||||
|
<p>
|
||||||
|
<h2>Current</h2>
|
||||||
|
<p>
|
||||||
|
CURRENT is a simple text file that contains the name of the latest
|
||||||
|
MANIFEST file.
|
||||||
|
<p>
|
||||||
|
<h2>Info logs</h2>
|
||||||
|
<p>
|
||||||
|
Informational messages are printed to files named LOG and LOG.old.
|
||||||
|
<p>
|
||||||
|
<h2>Others</h2>
|
||||||
|
<p>
|
||||||
|
Other files used for miscellaneous purposes may also be present
|
||||||
|
(LOCK, *.dbtmp).
|
||||||
|
|
||||||
|
<h1>Level 0</h1>
|
||||||
|
When the log file grows above a certain size (1MB by default):
|
||||||
|
<ul>
|
||||||
|
<li>Create a brand new memtable and log file and direct future updates here
|
||||||
|
<li>In the background:
|
||||||
|
<ul>
|
||||||
|
<li>Write the contents of the previous memtable to an sstable
|
||||||
|
<li>Discard the memtable
|
||||||
|
<li>Delete the old log file and the old memtable
|
||||||
|
<li>Add the new sstable to the young (level-0) level.
|
||||||
|
</ul>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<h1>Compactions</h1>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
When the size of level L exceeds its limit, we compact it in a
|
||||||
|
background thread. The compaction picks a file from level L and all
|
||||||
|
overlapping files from the next level L+1. Note that if a level-L
|
||||||
|
file overlaps only part of a level-(L+1) file, the entire file at
|
||||||
|
level-(L+1) is used as an input to the compaction and will be
|
||||||
|
discarded after the compaction. Aside: because level-0 is special
|
||||||
|
(files in it may overlap each other), we treat compactions from
|
||||||
|
level-0 to level-1 specially: a level-0 compaction may pick more than
|
||||||
|
one level-0 file in case some of these files overlap each other.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
A compaction merges the contents of the picked files to produce a
|
||||||
|
sequence of level-(L+1) files. We switch to producing a new
|
||||||
|
level-(L+1) file after the current output file has reached the target
|
||||||
|
file size (2MB). We also switch to a new output file when the key
|
||||||
|
range of the current output file has grown enough to overlap more than
|
||||||
|
ten level-(L+2) files. This last rule ensures that a later compaction
|
||||||
|
of a level-(L+1) file will not pick up too much data from level-(L+2).
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The old files are discarded and the new files are added to the serving
|
||||||
|
state.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Compactions for a particular level rotate through the key space. In
|
||||||
|
more detail, for each level L, we remember the ending key of the last
|
||||||
|
compaction at level L. The next compaction for level L will pick the
|
||||||
|
first file that starts after this key (wrapping around to the
|
||||||
|
beginning of the key space if there is no such file).
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Compactions drop overwritten values. They also drop deletion markers
|
||||||
|
if there are no higher numbered levels that contain a file whose range
|
||||||
|
overlaps the current key.
|
||||||
|
|
||||||
|
<h2>Timing</h2>
|
||||||
|
|
||||||
|
Level-0 compactions will read up to four 1MB files from level-0, and
|
||||||
|
at worst all the level-1 files (10MB). I.e., we will read 14MB and
|
||||||
|
write 14MB.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Other than the special level-0 compactions, we will pick one 2MB file
|
||||||
|
from level L. In the worst case, this will overlap ~ 12 files from
|
||||||
|
level L+1 (10 because level-(L+1) is ten times the size of level-L,
|
||||||
|
and another two at the boundaries since the file ranges at level-L
|
||||||
|
will usually not be aligned with the file ranges at level-L+1). The
|
||||||
|
compaction will therefore read 26MB and write 26MB. Assuming a disk
|
||||||
|
IO rate of 100MB/s (ballpark range for modern drives), the worst
|
||||||
|
compaction cost will be approximately 0.5 second.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
If we throttle the background writing to something small, say 10% of
|
||||||
|
the full 100MB/s speed, a compaction may take up to 5 seconds. If the
|
||||||
|
user is writing at 10MB/s, we might build up lots of level-0 files
|
||||||
|
(~50 to hold the 5*10MB). This may significantly increase the cost of
|
||||||
|
reads due to the overhead of merging more files together on every
|
||||||
|
read.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Solution 1: To reduce this problem, we might want to increase the log
|
||||||
|
switching threshold when the number of level-0 files is large. Though
|
||||||
|
the downside is that the larger this threshold, the more memory we will
|
||||||
|
need to hold the corresponding memtable.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Solution 2: We might want to decrease write rate artificially when the
|
||||||
|
number of level-0 files goes up.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Solution 3: We work on reducing the cost of very wide merges.
|
||||||
|
Perhaps most of the level-0 files will have their blocks sitting
|
||||||
|
uncompressed in the cache and we will only need to worry about the
|
||||||
|
O(N) complexity in the merging iterator.
|
||||||
|
|
||||||
|
<h2>Number of files</h2>
|
||||||
|
|
||||||
|
Instead of always making 2MB files, we could make larger files for
|
||||||
|
larger levels to reduce the total file count, though at the expense of
|
||||||
|
more bursty compactions. Alternatively, we could shard the set of
|
||||||
|
files into multiple directories.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
An experiment on an <code>ext3</code> filesystem on Feb 04, 2011 shows
|
||||||
|
the following timings to do 100K file opens in directories with
|
||||||
|
varying number of files:
|
||||||
|
<table class="datatable">
|
||||||
|
<tr><th>Files in directory</th><th>Microseconds to open a file</th></tr>
|
||||||
|
<tr><td>1000</td><td>9</td>
|
||||||
|
<tr><td>10000</td><td>10</td>
|
||||||
|
<tr><td>100000</td><td>16</td>
|
||||||
|
</table>
|
||||||
|
So maybe even the sharding is not necessary on modern filesystems?
|
||||||
|
|
||||||
|
<h1>Recovery</h1>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li> Read CURRENT to find name of the latest committed MANIFEST
|
||||||
|
<li> Read the named MANIFEST file
|
||||||
|
<li> Clean up stale files
|
||||||
|
<li> We could open all sstables here, but it is probably better to be lazy...
|
||||||
|
<li> Convert log chunk to a new level-0 sstable
|
||||||
|
<li> Start directing new writes to a new log file with recovered sequence#
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<h1>Garbage collection of files</h1>
|
||||||
|
|
||||||
|
<code>DeleteObsoleteFiles()</code> is called at the end of every
|
||||||
|
compaction and at the end of recovery. It finds the names of all
|
||||||
|
files in the database. It deletes all log files that are not the
|
||||||
|
current log file. It deletes all table files that are not referenced
|
||||||
|
from some level and are not the output of an active compaction.
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -1,170 +0,0 @@
|
||||||
## Files
|
|
||||||
|
|
||||||
The implementation of leveldb is similar in spirit to the representation of a
|
|
||||||
single [Bigtable tablet (section 5.3)](http://research.google.com/archive/bigtable.html).
|
|
||||||
However the organization of the files that make up the representation is
|
|
||||||
somewhat different and is explained below.
|
|
||||||
|
|
||||||
Each database is represented by a set of files stored in a directory. There are
|
|
||||||
several different types of files as documented below:
|
|
||||||
|
|
||||||
### Log files
|
|
||||||
|
|
||||||
A log file (*.log) stores a sequence of recent updates. Each update is appended
|
|
||||||
to the current log file. When the log file reaches a pre-determined size
|
|
||||||
(approximately 4MB by default), it is converted to a sorted table (see below)
|
|
||||||
and a new log file is created for future updates.
|
|
||||||
|
|
||||||
A copy of the current log file is kept in an in-memory structure (the
|
|
||||||
`memtable`). This copy is consulted on every read so that read operations
|
|
||||||
reflect all logged updates.
|
|
||||||
|
|
||||||
## Sorted tables
|
|
||||||
|
|
||||||
A sorted table (*.ldb) stores a sequence of entries sorted by key. Each entry is
|
|
||||||
either a value for the key, or a deletion marker for the key. (Deletion markers
|
|
||||||
are kept around to hide obsolete values present in older sorted tables).
|
|
||||||
|
|
||||||
The set of sorted tables are organized into a sequence of levels. The sorted
|
|
||||||
table generated from a log file is placed in a special **young** level (also
|
|
||||||
called level-0). When the number of young files exceeds a certain threshold
|
|
||||||
(currently four), all of the young files are merged together with all of the
|
|
||||||
overlapping level-1 files to produce a sequence of new level-1 files (we create
|
|
||||||
a new level-1 file for every 2MB of data.)
|
|
||||||
|
|
||||||
Files in the young level may contain overlapping keys. However files in other
|
|
||||||
levels have distinct non-overlapping key ranges. Consider level number L where
|
|
||||||
L >= 1. When the combined size of files in level-L exceeds (10^L) MB (i.e., 10MB
|
|
||||||
for level-1, 100MB for level-2, ...), one file in level-L, and all of the
|
|
||||||
overlapping files in level-(L+1) are merged to form a set of new files for
|
|
||||||
level-(L+1). These merges have the effect of gradually migrating new updates
|
|
||||||
from the young level to the largest level using only bulk reads and writes
|
|
||||||
(i.e., minimizing expensive seeks).
|
|
||||||
|
|
||||||
### Manifest
|
|
||||||
|
|
||||||
A MANIFEST file lists the set of sorted tables that make up each level, the
|
|
||||||
corresponding key ranges, and other important metadata. A new MANIFEST file
|
|
||||||
(with a new number embedded in the file name) is created whenever the database
|
|
||||||
is reopened. The MANIFEST file is formatted as a log, and changes made to the
|
|
||||||
serving state (as files are added or removed) are appended to this log.
|
|
||||||
|
|
||||||
### Current
|
|
||||||
|
|
||||||
CURRENT is a simple text file that contains the name of the latest MANIFEST
|
|
||||||
file.
|
|
||||||
|
|
||||||
### Info logs
|
|
||||||
|
|
||||||
Informational messages are printed to files named LOG and LOG.old.
|
|
||||||
|
|
||||||
### Others
|
|
||||||
|
|
||||||
Other files used for miscellaneous purposes may also be present (LOCK, *.dbtmp).
|
|
||||||
|
|
||||||
## Level 0
|
|
||||||
|
|
||||||
When the log file grows above a certain size (1MB by default):
|
|
||||||
Create a brand new memtable and log file and direct future updates here
|
|
||||||
In the background:
|
|
||||||
Write the contents of the previous memtable to an sstable
|
|
||||||
Discard the memtable
|
|
||||||
Delete the old log file and the old memtable
|
|
||||||
Add the new sstable to the young (level-0) level.
|
|
||||||
|
|
||||||
## Compactions
|
|
||||||
|
|
||||||
When the size of level L exceeds its limit, we compact it in a background
|
|
||||||
thread. The compaction picks a file from level L and all overlapping files from
|
|
||||||
the next level L+1. Note that if a level-L file overlaps only part of a
|
|
||||||
level-(L+1) file, the entire file at level-(L+1) is used as an input to the
|
|
||||||
compaction and will be discarded after the compaction. Aside: because level-0
|
|
||||||
is special (files in it may overlap each other), we treat compactions from
|
|
||||||
level-0 to level-1 specially: a level-0 compaction may pick more than one
|
|
||||||
level-0 file in case some of these files overlap each other.
|
|
||||||
|
|
||||||
A compaction merges the contents of the picked files to produce a sequence of
|
|
||||||
level-(L+1) files. We switch to producing a new level-(L+1) file after the
|
|
||||||
current output file has reached the target file size (2MB). We also switch to a
|
|
||||||
new output file when the key range of the current output file has grown enough
|
|
||||||
to overlap more than ten level-(L+2) files. This last rule ensures that a later
|
|
||||||
compaction of a level-(L+1) file will not pick up too much data from
|
|
||||||
level-(L+2).
|
|
||||||
|
|
||||||
The old files are discarded and the new files are added to the serving state.
|
|
||||||
|
|
||||||
Compactions for a particular level rotate through the key space. In more detail,
|
|
||||||
for each level L, we remember the ending key of the last compaction at level L.
|
|
||||||
The next compaction for level L will pick the first file that starts after this
|
|
||||||
key (wrapping around to the beginning of the key space if there is no such
|
|
||||||
file).
|
|
||||||
|
|
||||||
Compactions drop overwritten values. They also drop deletion markers if there
|
|
||||||
are no higher numbered levels that contain a file whose range overlaps the
|
|
||||||
current key.
|
|
||||||
|
|
||||||
### Timing
|
|
||||||
|
|
||||||
Level-0 compactions will read up to four 1MB files from level-0, and at worst
|
|
||||||
all the level-1 files (10MB). I.e., we will read 14MB and write 14MB.
|
|
||||||
|
|
||||||
Other than the special level-0 compactions, we will pick one 2MB file from level
|
|
||||||
L. In the worst case, this will overlap ~ 12 files from level L+1 (10 because
|
|
||||||
level-(L+1) is ten times the size of level-L, and another two at the boundaries
|
|
||||||
since the file ranges at level-L will usually not be aligned with the file
|
|
||||||
ranges at level-L+1). The compaction will therefore read 26MB and write 26MB.
|
|
||||||
Assuming a disk IO rate of 100MB/s (ballpark range for modern drives), the worst
|
|
||||||
compaction cost will be approximately 0.5 second.
|
|
||||||
|
|
||||||
If we throttle the background writing to something small, say 10% of the full
|
|
||||||
100MB/s speed, a compaction may take up to 5 seconds. If the user is writing at
|
|
||||||
10MB/s, we might build up lots of level-0 files (~50 to hold the 5*10MB). This
|
|
||||||
may significantly increase the cost of reads due to the overhead of merging more
|
|
||||||
files together on every read.
|
|
||||||
|
|
||||||
Solution 1: To reduce this problem, we might want to increase the log switching
|
|
||||||
threshold when the number of level-0 files is large. Though the downside is that
|
|
||||||
the larger this threshold, the more memory we will need to hold the
|
|
||||||
corresponding memtable.
|
|
||||||
|
|
||||||
Solution 2: We might want to decrease write rate artificially when the number of
|
|
||||||
level-0 files goes up.
|
|
||||||
|
|
||||||
Solution 3: We work on reducing the cost of very wide merges. Perhaps most of
|
|
||||||
the level-0 files will have their blocks sitting uncompressed in the cache and
|
|
||||||
we will only need to worry about the O(N) complexity in the merging iterator.
|
|
||||||
|
|
||||||
### Number of files
|
|
||||||
|
|
||||||
Instead of always making 2MB files, we could make larger files for larger levels
|
|
||||||
to reduce the total file count, though at the expense of more bursty
|
|
||||||
compactions. Alternatively, we could shard the set of files into multiple
|
|
||||||
directories.
|
|
||||||
|
|
||||||
An experiment on an ext3 filesystem on Feb 04, 2011 shows the following timings
|
|
||||||
to do 100K file opens in directories with varying number of files:
|
|
||||||
|
|
||||||
|
|
||||||
| Files in directory | Microseconds to open a file |
|
|
||||||
|-------------------:|----------------------------:|
|
|
||||||
| 1000 | 9 |
|
|
||||||
| 10000 | 10 |
|
|
||||||
| 100000 | 16 |
|
|
||||||
|
|
||||||
So maybe even the sharding is not necessary on modern filesystems?
|
|
||||||
|
|
||||||
## Recovery
|
|
||||||
|
|
||||||
* Read CURRENT to find name of the latest committed MANIFEST
|
|
||||||
* Read the named MANIFEST file
|
|
||||||
* Clean up stale files
|
|
||||||
* We could open all sstables here, but it is probably better to be lazy...
|
|
||||||
* Convert log chunk to a new level-0 sstable
|
|
||||||
* Start directing new writes to a new log file with recovered sequence#
|
|
||||||
|
|
||||||
## Garbage collection of files
|
|
||||||
|
|
||||||
`DeleteObsoleteFiles()` is called at the end of every compaction and at the end
|
|
||||||
of recovery. It finds the names of all files in the database. It deletes all log
|
|
||||||
files that are not the current log file. It deletes all table files that are not
|
|
||||||
referenced from some level and are not the output of an active compaction.
|
|
549
src/leveldb/doc/index.html
Normal file
549
src/leveldb/doc/index.html
Normal file
|
@ -0,0 +1,549 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<link rel="stylesheet" type="text/css" href="doc.css" />
|
||||||
|
<title>Leveldb</title>
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
<h1>Leveldb</h1>
|
||||||
|
<address>Jeff Dean, Sanjay Ghemawat</address>
|
||||||
|
<p>
|
||||||
|
The <code>leveldb</code> library provides a persistent key value store. Keys and
|
||||||
|
values are arbitrary byte arrays. The keys are ordered within the key
|
||||||
|
value store according to a user-specified comparator function.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
<h1>Opening A Database</h1>
|
||||||
|
<p>
|
||||||
|
A <code>leveldb</code> database has a name which corresponds to a file system
|
||||||
|
directory.  All of the contents of the database are stored in this
|
||||||
|
directory. The following example shows how to open a database,
|
||||||
|
creating it if necessary:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
#include <cassert>
|
||||||
|
#include "leveldb/db.h"
|
||||||
|
|
||||||
|
leveldb::DB* db;
|
||||||
|
leveldb::Options options;
|
||||||
|
options.create_if_missing = true;
|
||||||
|
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
|
||||||
|
assert(status.ok());
|
||||||
|
...
|
||||||
|
</pre>
|
||||||
|
If you want to raise an error if the database already exists, add
|
||||||
|
the following line before the <code>leveldb::DB::Open</code> call:
|
||||||
|
<pre>
|
||||||
|
options.error_if_exists = true;
|
||||||
|
</pre>
|
||||||
|
<h1>Status</h1>
|
||||||
|
<p>
|
||||||
|
You may have noticed the <code>leveldb::Status</code> type above. Values of this
|
||||||
|
type are returned by most functions in <code>leveldb</code> that may encounter an
|
||||||
|
error. You can check if such a result is ok, and also print an
|
||||||
|
associated error message:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
leveldb::Status s = ...;
|
||||||
|
if (!s.ok()) cerr << s.ToString() << endl;
|
||||||
|
</pre>
|
||||||
|
<h1>Closing A Database</h1>
|
||||||
|
<p>
|
||||||
|
When you are done with a database, just delete the database object.
|
||||||
|
Example:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
... open the db as described above ...
|
||||||
|
... do something with db ...
|
||||||
|
delete db;
|
||||||
|
</pre>
|
||||||
|
<h1>Reads And Writes</h1>
|
||||||
|
<p>
|
||||||
|
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
|
||||||
|
modify/query the database. For example, the following code
|
||||||
|
moves the value stored under key1 to key2.
|
||||||
|
<pre>
|
||||||
|
std::string value;
|
||||||
|
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
|
||||||
|
if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
|
||||||
|
if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
|
||||||
|
</pre>
|
||||||
|
|
||||||
|
<h1>Atomic Updates</h1>
|
||||||
|
<p>
|
||||||
|
Note that if the process dies after the Put of key2 but before the
|
||||||
|
delete of key1, the same value may be left stored under multiple keys.
|
||||||
|
Such problems can be avoided by using the <code>WriteBatch</code> class to
|
||||||
|
atomically apply a set of updates:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
#include "leveldb/write_batch.h"
|
||||||
|
...
|
||||||
|
std::string value;
|
||||||
|
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
|
||||||
|
if (s.ok()) {
|
||||||
|
leveldb::WriteBatch batch;
|
||||||
|
batch.Delete(key1);
|
||||||
|
batch.Put(key2, value);
|
||||||
|
s = db->Write(leveldb::WriteOptions(), &batch);
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
|
||||||
|
and these edits within the batch are applied in order. Note that we
|
||||||
|
called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
|
||||||
|
we do not end up erroneously dropping the value entirely.
|
||||||
|
<p>
|
||||||
|
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
|
||||||
|
speed up bulk updates by placing lots of individual mutations into the
|
||||||
|
same batch.
|
||||||
|
|
||||||
|
<h1>Synchronous Writes</h1>
|
||||||
|
By default, each write to <code>leveldb</code> is asynchronous: it
|
||||||
|
returns after pushing the write from the process into the operating
|
||||||
|
system. The transfer from operating system memory to the underlying
|
||||||
|
persistent storage happens asynchronously. The <code>sync</code> flag
|
||||||
|
can be turned on for a particular write to make the write operation
|
||||||
|
not return until the data being written has been pushed all the way to
|
||||||
|
persistent storage. (On Posix systems, this is implemented by calling
|
||||||
|
either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
|
||||||
|
<code>msync(..., MS_SYNC)</code> before the write operation returns.)
|
||||||
|
<pre>
|
||||||
|
leveldb::WriteOptions write_options;
|
||||||
|
write_options.sync = true;
|
||||||
|
db->Put(write_options, ...);
|
||||||
|
</pre>
|
||||||
|
Asynchronous writes are often more than a thousand times as fast as
|
||||||
|
synchronous writes. The downside of asynchronous writes is that a
|
||||||
|
crash of the machine may cause the last few updates to be lost. Note
|
||||||
|
that a crash of just the writing process (i.e., not a reboot) will not
|
||||||
|
cause any loss since even when <code>sync</code> is false, an update
|
||||||
|
is pushed from the process memory into the operating system before it
|
||||||
|
is considered done.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Asynchronous writes can often be used safely. For example, when
|
||||||
|
loading a large amount of data into the database you can handle lost
|
||||||
|
updates by restarting the bulk load after a crash. A hybrid scheme is
|
||||||
|
also possible where every Nth write is synchronous, and in the event
|
||||||
|
of a crash, the bulk load is restarted just after the last synchronous
|
||||||
|
write finished by the previous run. (The synchronous write can update
|
||||||
|
a marker that describes where to restart on a crash.)
|
||||||
|
|
||||||
|
<p>
|
||||||
|
<code>WriteBatch</code> provides an alternative to asynchronous writes.
|
||||||
|
Multiple updates may be placed in the same <code>WriteBatch</code> and
|
||||||
|
applied together using a synchronous write (i.e.,
|
||||||
|
<code>write_options.sync</code> is set to true). The extra cost of
|
||||||
|
the synchronous write will be amortized across all of the writes in
|
||||||
|
the batch.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
<h1>Concurrency</h1>
|
||||||
|
<p>
|
||||||
|
A database may only be opened by one process at a time.
|
||||||
|
The <code>leveldb</code> implementation acquires a lock from the
|
||||||
|
operating system to prevent misuse. Within a single process, the
|
||||||
|
same <code>leveldb::DB</code> object may be safely shared by multiple
|
||||||
|
concurrent threads. I.e., different threads may write into or fetch
|
||||||
|
iterators or call <code>Get</code> on the same database without any
|
||||||
|
external synchronization (the leveldb implementation will
|
||||||
|
automatically do the required synchronization). However other objects
|
||||||
|
(like Iterator and WriteBatch) may require external synchronization.
|
||||||
|
If two threads share such an object, they must protect access to it
|
||||||
|
using their own locking protocol. More details are available in
|
||||||
|
the public header files.
|
||||||
|
<p>
|
||||||
|
<h1>Iteration</h1>
|
||||||
|
<p>
|
||||||
|
The following example demonstrates how to print all key,value pairs
|
||||||
|
in a database.
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
|
||||||
|
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
||||||
|
cout << it->key().ToString() << ": " << it->value().ToString() << endl;
|
||||||
|
}
|
||||||
|
assert(it->status().ok()); // Check for any errors found during the scan
|
||||||
|
delete it;
|
||||||
|
</pre>
|
||||||
|
The following variation shows how to process just the keys in the
|
||||||
|
range <code>[start,limit)</code>:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
for (it->Seek(start);
|
||||||
|
it->Valid() && it->key().ToString() < limit;
|
||||||
|
it->Next()) {
|
||||||
|
...
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
You can also process entries in reverse order. (Caveat: reverse
|
||||||
|
iteration may be somewhat slower than forward iteration.)
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
for (it->SeekToLast(); it->Valid(); it->Prev()) {
|
||||||
|
...
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
<h1>Snapshots</h1>
|
||||||
|
<p>
|
||||||
|
Snapshots provide consistent read-only views over the entire state of
|
||||||
|
the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate
|
||||||
|
that a read should operate on a particular version of the DB state.
|
||||||
|
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
|
||||||
|
implicit snapshot of the current state.
|
||||||
|
<p>
|
||||||
|
Snapshots are created by the DB::GetSnapshot() method:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
leveldb::ReadOptions options;
|
||||||
|
options.snapshot = db->GetSnapshot();
|
||||||
|
... apply some updates to db ...
|
||||||
|
leveldb::Iterator* iter = db->NewIterator(options);
|
||||||
|
... read using iter to view the state when the snapshot was created ...
|
||||||
|
delete iter;
|
||||||
|
db->ReleaseSnapshot(options.snapshot);
|
||||||
|
</pre>
|
||||||
|
Note that when a snapshot is no longer needed, it should be released
|
||||||
|
using the DB::ReleaseSnapshot interface. This allows the
|
||||||
|
implementation to get rid of state that was being maintained just to
|
||||||
|
support reading as of that snapshot.
|
||||||
|
<h1>Slice</h1>
|
||||||
|
<p>
|
||||||
|
The return values of the <code>it->key()</code> and <code>it->value()</code> calls above
|
||||||
|
are instances of the <code>leveldb::Slice</code> type. <code>Slice</code> is a simple
|
||||||
|
structure that contains a length and a pointer to an external byte
|
||||||
|
array. Returning a <code>Slice</code> is a cheaper alternative to returning a
|
||||||
|
<code>std::string</code> since we do not need to copy potentially large keys and
|
||||||
|
values. In addition, <code>leveldb</code> methods do not return null-terminated
|
||||||
|
C-style strings since <code>leveldb</code> keys and values are allowed to
|
||||||
|
contain '\0' bytes.
|
||||||
|
<p>
|
||||||
|
C++ strings and null-terminated C-style strings can be easily converted
|
||||||
|
to a Slice:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
leveldb::Slice s1 = "hello";
|
||||||
|
|
||||||
|
std::string str("world");
|
||||||
|
leveldb::Slice s2 = str;
|
||||||
|
</pre>
|
||||||
|
A Slice can be easily converted back to a C++ string:
|
||||||
|
<pre>
|
||||||
|
std::string str = s1.ToString();
|
||||||
|
assert(str == std::string("hello"));
|
||||||
|
</pre>
|
||||||
|
Be careful when using Slices since it is up to the caller to ensure that
|
||||||
|
the external byte array into which the Slice points remains live while
|
||||||
|
the Slice is in use. For example, the following is buggy:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
leveldb::Slice slice;
|
||||||
|
if (...) {
|
||||||
|
std::string str = ...;
|
||||||
|
slice = str;
|
||||||
|
}
|
||||||
|
Use(slice);
|
||||||
|
</pre>
|
||||||
|
When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
|
||||||
|
backing storage for <code>slice</code> will disappear.
|
||||||
|
<p>
|
||||||
|
<h1>Comparators</h1>
|
||||||
|
<p>
|
||||||
|
The preceding examples used the default ordering function for keys,
|
||||||
|
which orders bytes lexicographically. You can however supply a custom
|
||||||
|
comparator when opening a database. For example, suppose each
|
||||||
|
database key consists of two numbers and we should sort by the first
|
||||||
|
number, breaking ties by the second number. First, define a proper
|
||||||
|
subclass of <code>leveldb::Comparator</code> that expresses these rules:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
class TwoPartComparator : public leveldb::Comparator {
|
||||||
|
public:
|
||||||
|
// Three-way comparison function:
|
||||||
|
// if a < b: negative result
|
||||||
|
// if a > b: positive result
|
||||||
|
// else: zero result
|
||||||
|
int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
|
||||||
|
int a1, a2, b1, b2;
|
||||||
|
ParseKey(a, &a1, &a2);
|
||||||
|
ParseKey(b, &b1, &b2);
|
||||||
|
if (a1 < b1) return -1;
|
||||||
|
if (a1 > b1) return +1;
|
||||||
|
if (a2 < b2) return -1;
|
||||||
|
if (a2 > b2) return +1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ignore the following methods for now:
|
||||||
|
const char* Name() const { return "TwoPartComparator"; }
|
||||||
|
void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
|
||||||
|
void FindShortSuccessor(std::string*) const { }
|
||||||
|
};
|
||||||
|
</pre>
|
||||||
|
Now create a database using this custom comparator:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
TwoPartComparator cmp;
|
||||||
|
leveldb::DB* db;
|
||||||
|
leveldb::Options options;
|
||||||
|
options.create_if_missing = true;
|
||||||
|
options.comparator = &cmp;
|
||||||
|
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
|
||||||
|
...
|
||||||
|
</pre>
|
||||||
|
<h2>Backwards compatibility</h2>
|
||||||
|
<p>
|
||||||
|
The result of the comparator's <code>Name</code> method is attached to the
|
||||||
|
database when it is created, and is checked on every subsequent
|
||||||
|
database open. If the name changes, the <code>leveldb::DB::Open</code> call will
|
||||||
|
fail. Therefore, change the name if and only if the new key format
|
||||||
|
and comparison function are incompatible with existing databases, and
|
||||||
|
it is ok to discard the contents of all existing databases.
|
||||||
|
<p>
|
||||||
|
You can however still gradually evolve your key format over time with
|
||||||
|
a little bit of pre-planning. For example, you could store a version
|
||||||
|
number at the end of each key (one byte should suffice for most uses).
|
||||||
|
When you wish to switch to a new key format (e.g., adding an optional
|
||||||
|
third part to the keys processed by <code>TwoPartComparator</code>),
|
||||||
|
(a) keep the same comparator name (b) increment the version number
|
||||||
|
for new keys (c) change the comparator function so it uses the
|
||||||
|
version numbers found in the keys to decide how to interpret them.
|
||||||
|
<p>
|
||||||
|
<h1>Performance</h1>
|
||||||
|
<p>
|
||||||
|
Performance can be tuned by changing the default values of the
|
||||||
|
types defined in <code>include/leveldb/options.h</code>.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
<h2>Block size</h2>
|
||||||
|
<p>
|
||||||
|
<code>leveldb</code> groups adjacent keys together into the same block and such a
|
||||||
|
block is the unit of transfer to and from persistent storage. The
|
||||||
|
default block size is approximately 4096 uncompressed bytes.
|
||||||
|
Applications that mostly do bulk scans over the contents of the
|
||||||
|
database may wish to increase this size. Applications that do a lot
|
||||||
|
of point reads of small values may wish to switch to a smaller block
|
||||||
|
size if performance measurements indicate an improvement. There isn't
|
||||||
|
much benefit in using blocks smaller than one kilobyte, or larger than
|
||||||
|
a few megabytes. Also note that compression will be more effective
|
||||||
|
with larger block sizes.
|
||||||
|
<p>
|
||||||
|
<h2>Compression</h2>
|
||||||
|
<p>
|
||||||
|
Each block is individually compressed before being written to
|
||||||
|
persistent storage. Compression is on by default since the default
|
||||||
|
compression method is very fast, and is automatically disabled for
|
||||||
|
uncompressible data. In rare cases, applications may want to disable
|
||||||
|
compression entirely, but should only do so if benchmarks show a
|
||||||
|
performance improvement:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
leveldb::Options options;
|
||||||
|
options.compression = leveldb::kNoCompression;
|
||||||
|
... leveldb::DB::Open(options, name, ...) ....
|
||||||
|
</pre>
|
||||||
|
<h2>Cache</h2>
|
||||||
|
<p>
|
||||||
|
The contents of the database are stored in a set of files in the
|
||||||
|
filesystem and each file stores a sequence of compressed blocks. If
|
||||||
|
<code>options.cache</code> is non-NULL, it is used to cache frequently used
|
||||||
|
uncompressed block contents.
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
#include "leveldb/cache.h"
|
||||||
|
|
||||||
|
leveldb::Options options;
|
||||||
|
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache
|
||||||
|
leveldb::DB* db;
|
||||||
|
leveldb::DB::Open(options, name, &db);
|
||||||
|
... use the db ...
|
||||||
|
delete db;
|
||||||
|
delete options.cache;
|
||||||
|
</pre>
|
||||||
|
Note that the cache holds uncompressed data, and therefore it should
|
||||||
|
be sized according to application level data sizes, without any
|
||||||
|
reduction from compression. (Caching of compressed blocks is left to
|
||||||
|
the operating system buffer cache, or any custom <code>Env</code>
|
||||||
|
implementation provided by the client.)
|
||||||
|
<p>
|
||||||
|
When performing a bulk read, the application may wish to disable
|
||||||
|
caching so that the data processed by the bulk read does not end up
|
||||||
|
displacing most of the cached contents. A per-iterator option can be
|
||||||
|
used to achieve this:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
leveldb::ReadOptions options;
|
||||||
|
options.fill_cache = false;
|
||||||
|
leveldb::Iterator* it = db->NewIterator(options);
|
||||||
|
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
||||||
|
...
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
<h2>Key Layout</h2>
|
||||||
|
<p>
|
||||||
|
Note that the unit of disk transfer and caching is a block. Adjacent
|
||||||
|
keys (according to the database sort order) will usually be placed in
|
||||||
|
the same block. Therefore the application can improve its performance
|
||||||
|
by placing keys that are accessed together near each other and placing
|
||||||
|
infrequently used keys in a separate region of the key space.
|
||||||
|
<p>
|
||||||
|
For example, suppose we are implementing a simple file system on top
|
||||||
|
of <code>leveldb</code>. The types of entries we might wish to store are:
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
filename -> permission-bits, length, list of file_block_ids
|
||||||
|
file_block_id -> data
|
||||||
|
</pre>
|
||||||
|
We might want to prefix <code>filename</code> keys with one letter (say '/') and the
|
||||||
|
<code>file_block_id</code> keys with a different letter (say '0') so that scans
|
||||||
|
over just the metadata do not force us to fetch and cache bulky file
|
||||||
|
contents.
|
||||||
|
<p>
|
||||||
|
<h2>Filters</h2>
|
||||||
|
<p>
|
||||||
|
Because of the way <code>leveldb</code> data is organized on disk,
|
||||||
|
a single <code>Get()</code> call may involve multiple reads from disk.
|
||||||
|
The optional <code>FilterPolicy</code> mechanism can be used to reduce
|
||||||
|
the number of disk reads substantially.
|
||||||
|
<pre>
|
||||||
|
leveldb::Options options;
|
||||||
|
options.filter_policy = NewBloomFilter(10);
|
||||||
|
leveldb::DB* db;
|
||||||
|
leveldb::DB::Open(options, "/tmp/testdb", &db);
|
||||||
|
... use the database ...
|
||||||
|
delete db;
|
||||||
|
delete options.filter_policy;
|
||||||
|
</pre>
|
||||||
|
The preceding code associates a
|
||||||
|
<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
|
||||||
|
based filtering policy with the database. Bloom filter based
|
||||||
|
filtering relies on keeping some number of bits of data in memory per
|
||||||
|
key (in this case 10 bits per key since that is the argument we passed
|
||||||
|
to NewBloomFilter). This filter will reduce the number of unnecessary
|
||||||
|
disk reads needed for <code>Get()</code> calls by a factor of
|
||||||
|
approximately 100.  Increasing the bits per key will lead to a
|
||||||
|
larger reduction at the cost of more memory usage. We recommend that
|
||||||
|
applications whose working set does not fit in memory and that do a
|
||||||
|
lot of random reads set a filter policy.
|
||||||
|
<p>
|
||||||
|
If you are using a custom comparator, you should ensure that the filter
|
||||||
|
policy you are using is compatible with your comparator. For example,
|
||||||
|
consider a comparator that ignores trailing spaces when comparing keys.
|
||||||
|
<code>NewBloomFilter</code> must not be used with such a comparator.
|
||||||
|
Instead, the application should provide a custom filter policy that
|
||||||
|
also ignores trailing spaces. For example:
|
||||||
|
<pre>
|
||||||
|
class CustomFilterPolicy : public leveldb::FilterPolicy {
|
||||||
|
private:
|
||||||
|
FilterPolicy* builtin_policy_;
|
||||||
|
public:
|
||||||
|
CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
|
||||||
|
~CustomFilterPolicy() { delete builtin_policy_; }
|
||||||
|
|
||||||
|
const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
|
||||||
|
|
||||||
|
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
|
||||||
|
// Use builtin bloom filter code after removing trailing spaces
|
||||||
|
std::vector<Slice> trimmed(n);
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
trimmed[i] = RemoveTrailingSpaces(keys[i]);
|
||||||
|
}
|
||||||
|
builtin_policy_->CreateFilter(trimmed.data(), n, dst);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
|
||||||
|
// Use builtin bloom filter code after removing trailing spaces
|
||||||
|
return builtin_policy_->KeyMayMatch(RemoveTrailingSpaces(key), filter);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
</pre>
|
||||||
|
<p>
|
||||||
|
Advanced applications may provide a filter policy that does not use
|
||||||
|
a bloom filter but uses some other mechanism for summarizing a set
|
||||||
|
of keys.  See <code>leveldb/filter_policy.h</code> for details.
|
||||||
|
<p>
|
||||||
|
<h1>Checksums</h1>
|
||||||
|
<p>
|
||||||
|
<code>leveldb</code> associates checksums with all data it stores in the file system.
|
||||||
|
There are two separate controls provided over how aggressively these
|
||||||
|
checksums are verified:
|
||||||
|
<p>
|
||||||
|
<ul>
|
||||||
|
<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
|
||||||
|
checksum verification of all data that is read from the file system on
|
||||||
|
behalf of a particular read. By default, no such verification is
|
||||||
|
done.
|
||||||
|
<p>
|
||||||
|
<li> <code>Options::paranoid_checks</code> may be set to true before opening a
|
||||||
|
database to make the database implementation raise an error as soon as
|
||||||
|
it detects an internal corruption. Depending on which portion of the
|
||||||
|
database has been corrupted, the error may be raised when the database
|
||||||
|
is opened, or later by another database operation. By default,
|
||||||
|
paranoid checking is off so that the database can be used even if
|
||||||
|
parts of its persistent storage have been corrupted.
|
||||||
|
<p>
|
||||||
|
If a database is corrupted (perhaps it cannot be opened when
|
||||||
|
paranoid checking is turned on), the <code>leveldb::RepairDB</code> function
|
||||||
|
may be used to recover as much of the data as possible.
|
||||||
|
<p>
|
||||||
|
</ul>
|
||||||
|
<h1>Approximate Sizes</h1>
|
||||||
|
<p>
|
||||||
|
The <code>GetApproximateSizes</code> method can be used to get the approximate
|
||||||
|
number of bytes of file system space used by one or more key ranges.
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
leveldb::Range ranges[2];
|
||||||
|
ranges[0] = leveldb::Range("a", "c");
|
||||||
|
ranges[1] = leveldb::Range("x", "z");
|
||||||
|
uint64_t sizes[2];
|
||||||
|
leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
|
||||||
|
</pre>
|
||||||
|
The preceding call will set <code>sizes[0]</code> to the approximate number of
|
||||||
|
bytes of file system space used by the key range <code>[a..c)</code> and
|
||||||
|
<code>sizes[1]</code> to the approximate number of bytes used by the key range
|
||||||
|
<code>[x..z)</code>.
|
||||||
|
<p>
|
||||||
|
<h1>Environment</h1>
|
||||||
|
<p>
|
||||||
|
All file operations (and other operating system calls) issued by the
|
||||||
|
<code>leveldb</code> implementation are routed through a <code>leveldb::Env</code> object.
|
||||||
|
Sophisticated clients may wish to provide their own <code>Env</code>
|
||||||
|
implementation to get better control. For example, an application may
|
||||||
|
introduce artificial delays in the file IO paths to limit the impact
|
||||||
|
of <code>leveldb</code> on other activities in the system.
|
||||||
|
<p>
|
||||||
|
<pre>
|
||||||
|
class SlowEnv : public leveldb::Env {
|
||||||
|
.. implementation of the Env interface ...
|
||||||
|
};
|
||||||
|
|
||||||
|
SlowEnv env;
|
||||||
|
leveldb::Options options;
|
||||||
|
options.env = &env;
|
||||||
|
leveldb::Status s = leveldb::DB::Open(options, ...);
|
||||||
|
</pre>
|
||||||
|
<h1>Porting</h1>
|
||||||
|
<p>
|
||||||
|
<code>leveldb</code> may be ported to a new platform by providing platform
|
||||||
|
specific implementations of the types/methods/functions exported by
|
||||||
|
<code>leveldb/port/port.h</code>. See <code>leveldb/port/port_example.h</code> for more
|
||||||
|
details.
|
||||||
|
<p>
|
||||||
|
In addition, the new platform may need a new default <code>leveldb::Env</code>
|
||||||
|
implementation. See <code>leveldb/util/env_posix.h</code> for an example.
|
||||||
|
|
||||||
|
<h1>Other Information</h1>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Details about the <code>leveldb</code> implementation may be found in
|
||||||
|
the following documents:
|
||||||
|
<ul>
|
||||||
|
<li> <a href="impl.html">Implementation notes</a>
|
||||||
|
<li> <a href="table_format.txt">Format of an immutable Table file</a>
|
||||||
|
<li> <a href="log_format.txt">Format of a log file</a>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -1,523 +0,0 @@
|
||||||
leveldb
|
|
||||||
=======
|
|
||||||
|
|
||||||
_Jeff Dean, Sanjay Ghemawat_
|
|
||||||
|
|
||||||
The leveldb library provides a persistent key value store. Keys and values are
|
|
||||||
arbitrary byte arrays. The keys are ordered within the key value store
|
|
||||||
according to a user-specified comparator function.
|
|
||||||
|
|
||||||
## Opening A Database
|
|
||||||
|
|
||||||
A leveldb database has a name which corresponds to a file system directory. All
|
|
||||||
of the contents of the database are stored in this directory. The following example
|
|
||||||
shows how to open a database, creating it if necessary:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
#include <cassert>
|
|
||||||
#include "leveldb/db.h"
|
|
||||||
|
|
||||||
leveldb::DB* db;
|
|
||||||
leveldb::Options options;
|
|
||||||
options.create_if_missing = true;
|
|
||||||
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
|
|
||||||
assert(status.ok());
|
|
||||||
...
|
|
||||||
```
|
|
||||||
|
|
||||||
If you want to raise an error if the database already exists, add the following
|
|
||||||
line before the `leveldb::DB::Open` call:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
options.error_if_exists = true;
|
|
||||||
```
|
|
||||||
|
|
||||||
## Status
|
|
||||||
|
|
||||||
You may have noticed the `leveldb::Status` type above. Values of this type are
|
|
||||||
returned by most functions in leveldb that may encounter an error. You can check
|
|
||||||
if such a result is ok, and also print an associated error message:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
leveldb::Status s = ...;
|
|
||||||
if (!s.ok()) cerr << s.ToString() << endl;
|
|
||||||
```
|
|
||||||
|
|
||||||
## Closing A Database
|
|
||||||
|
|
||||||
When you are done with a database, just delete the database object. Example:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
... open the db as described above ...
|
|
||||||
... do something with db ...
|
|
||||||
delete db;
|
|
||||||
```
|
|
||||||
|
|
||||||
## Reads And Writes
|
|
||||||
|
|
||||||
The database provides Put, Delete, and Get methods to modify/query the database.
|
|
||||||
For example, the following code moves the value stored under key1 to key2.
|
|
||||||
|
|
||||||
```c++
|
|
||||||
std::string value;
|
|
||||||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
|
|
||||||
if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
|
|
||||||
if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
|
|
||||||
```
|
|
||||||
|
|
||||||
## Atomic Updates
|
|
||||||
|
|
||||||
Note that if the process dies after the Put of key2 but before the delete of
|
|
||||||
key1, the same value may be left stored under multiple keys. Such problems can
|
|
||||||
be avoided by using the `WriteBatch` class to atomically apply a set of updates:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
#include "leveldb/write_batch.h"
|
|
||||||
...
|
|
||||||
std::string value;
|
|
||||||
leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
|
|
||||||
if (s.ok()) {
|
|
||||||
leveldb::WriteBatch batch;
|
|
||||||
batch.Delete(key1);
|
|
||||||
batch.Put(key2, value);
|
|
||||||
s = db->Write(leveldb::WriteOptions(), &batch);
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
The `WriteBatch` holds a sequence of edits to be made to the database, and these
|
|
||||||
edits within the batch are applied in order. Note that we called Delete before
|
|
||||||
Put so that if key1 is identical to key2, we do not end up erroneously dropping
|
|
||||||
the value entirely.
|
|
||||||
|
|
||||||
Apart from its atomicity benefits, `WriteBatch` may also be used to speed up
|
|
||||||
bulk updates by placing lots of individual mutations into the same batch.
|
|
||||||
|
|
||||||
## Synchronous Writes
|
|
||||||
|
|
||||||
By default, each write to leveldb is asynchronous: it returns after pushing the
|
|
||||||
write from the process into the operating system. The transfer from operating
|
|
||||||
system memory to the underlying persistent storage happens asynchronously. The
|
|
||||||
sync flag can be turned on for a particular write to make the write operation
|
|
||||||
not return until the data being written has been pushed all the way to
|
|
||||||
persistent storage. (On Posix systems, this is implemented by calling either
|
|
||||||
`fsync(...)` or `fdatasync(...)` or `msync(..., MS_SYNC)` before the write
|
|
||||||
operation returns.)
|
|
||||||
|
|
||||||
```c++
|
|
||||||
leveldb::WriteOptions write_options;
|
|
||||||
write_options.sync = true;
|
|
||||||
db->Put(write_options, ...);
|
|
||||||
```
|
|
||||||
|
|
||||||
Asynchronous writes are often more than a thousand times as fast as synchronous
|
|
||||||
writes. The downside of asynchronous writes is that a crash of the machine may
|
|
||||||
cause the last few updates to be lost. Note that a crash of just the writing
|
|
||||||
process (i.e., not a reboot) will not cause any loss since even when sync is
|
|
||||||
false, an update is pushed from the process memory into the operating system
|
|
||||||
before it is considered done.
|
|
||||||
|
|
||||||
Asynchronous writes can often be used safely. For example, when loading a large
|
|
||||||
amount of data into the database you can handle lost updates by restarting the
|
|
||||||
bulk load after a crash. A hybrid scheme is also possible where every Nth write
|
|
||||||
is synchronous, and in the event of a crash, the bulk load is restarted just
|
|
||||||
after the last synchronous write finished by the previous run. (The synchronous
|
|
||||||
write can update a marker that describes where to restart on a crash.)
|
|
||||||
|
|
||||||
`WriteBatch` provides an alternative to asynchronous writes. Multiple updates
|
|
||||||
may be placed in the same WriteBatch and applied together using a synchronous
|
|
||||||
write (i.e., `write_options.sync` is set to true). The extra cost of the
|
|
||||||
synchronous write will be amortized across all of the writes in the batch.
|
|
||||||
|
|
||||||
## Concurrency
|
|
||||||
|
|
||||||
A database may only be opened by one process at a time. The leveldb
|
|
||||||
implementation acquires a lock from the operating system to prevent misuse.
|
|
||||||
Within a single process, the same `leveldb::DB` object may be safely shared by
|
|
||||||
multiple concurrent threads. I.e., different threads may write into or fetch
|
|
||||||
iterators or call Get on the same database without any external synchronization
|
|
||||||
(the leveldb implementation will automatically do the required synchronization).
|
|
||||||
However other objects (like Iterator and `WriteBatch`) may require external
|
|
||||||
synchronization. If two threads share such an object, they must protect access
|
|
||||||
to it using their own locking protocol. More details are available in the public
|
|
||||||
header files.
|
|
||||||
|
|
||||||
## Iteration
|
|
||||||
|
|
||||||
The following example demonstrates how to print all key,value pairs in a
|
|
||||||
database.
|
|
||||||
|
|
||||||
```c++
|
|
||||||
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
|
|
||||||
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
|
||||||
cout << it->key().ToString() << ": " << it->value().ToString() << endl;
|
|
||||||
}
|
|
||||||
assert(it->status().ok()); // Check for any errors found during the scan
|
|
||||||
delete it;
|
|
||||||
```
|
|
||||||
|
|
||||||
The following variation shows how to process just the keys in the range
|
|
||||||
[start,limit):
|
|
||||||
|
|
||||||
```c++
|
|
||||||
for (it->Seek(start);
|
|
||||||
it->Valid() && it->key().ToString() < limit;
|
|
||||||
it->Next()) {
|
|
||||||
...
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also process entries in reverse order. (Caveat: reverse iteration may be
|
|
||||||
somewhat slower than forward iteration.)
|
|
||||||
|
|
||||||
```c++
|
|
||||||
for (it->SeekToLast(); it->Valid(); it->Prev()) {
|
|
||||||
...
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Snapshots
|
|
||||||
|
|
||||||
Snapshots provide consistent read-only views over the entire state of the
|
|
||||||
key-value store. `ReadOptions::snapshot` may be non-NULL to indicate that a
|
|
||||||
read should operate on a particular version of the DB state. If
|
|
||||||
`ReadOptions::snapshot` is NULL, the read will operate on an implicit snapshot
|
|
||||||
of the current state.
|
|
||||||
|
|
||||||
Snapshots are created by the `DB::GetSnapshot()` method:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
leveldb::ReadOptions options;
|
|
||||||
options.snapshot = db->GetSnapshot();
|
|
||||||
... apply some updates to db ...
|
|
||||||
leveldb::Iterator* iter = db->NewIterator(options);
|
|
||||||
... read using iter to view the state when the snapshot was created ...
|
|
||||||
delete iter;
|
|
||||||
db->ReleaseSnapshot(options.snapshot);
|
|
||||||
```
|
|
||||||
|
|
||||||
Note that when a snapshot is no longer needed, it should be released using the
|
|
||||||
`DB::ReleaseSnapshot` interface. This allows the implementation to get rid of
|
|
||||||
state that was being maintained just to support reading as of that snapshot.
|
|
||||||
|
|
||||||
## Slice
|
|
||||||
|
|
||||||
The return values of the `it->key()` and `it->value()` calls above are instances
|
|
||||||
of the `leveldb::Slice` type. Slice is a simple structure that contains a length
|
|
||||||
and a pointer to an external byte array. Returning a Slice is a cheaper
|
|
||||||
alternative to returning a `std::string` since we do not need to copy
|
|
||||||
potentially large keys and values. In addition, leveldb methods do not return
|
|
||||||
null-terminated C-style strings since leveldb keys and values are allowed to
|
|
||||||
contain `'\0'` bytes.
|
|
||||||
|
|
||||||
C++ strings and null-terminated C-style strings can be easily converted to a
|
|
||||||
Slice:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
leveldb::Slice s1 = "hello";
|
|
||||||
|
|
||||||
std::string str("world");
|
|
||||||
leveldb::Slice s2 = str;
|
|
||||||
```
|
|
||||||
|
|
||||||
A Slice can be easily converted back to a C++ string:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
std::string str = s1.ToString();
|
|
||||||
assert(str == std::string("hello"));
|
|
||||||
```
|
|
||||||
|
|
||||||
Be careful when using Slices since it is up to the caller to ensure that the
|
|
||||||
external byte array into which the Slice points remains live while the Slice is
|
|
||||||
in use. For example, the following is buggy:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
leveldb::Slice slice;
|
|
||||||
if (...) {
|
|
||||||
std::string str = ...;
|
|
||||||
slice = str;
|
|
||||||
}
|
|
||||||
Use(slice);
|
|
||||||
```
|
|
||||||
|
|
||||||
When the if statement goes out of scope, str will be destroyed and the backing
|
|
||||||
storage for slice will disappear.
|
|
||||||
|
|
||||||
## Comparators
|
|
||||||
|
|
||||||
The preceding examples used the default ordering function for key, which orders
|
|
||||||
bytes lexicographically. You can however supply a custom comparator when opening
|
|
||||||
a database. For example, suppose each database key consists of two numbers and
|
|
||||||
we should sort by the first number, breaking ties by the second number. First,
|
|
||||||
define a proper subclass of `leveldb::Comparator` that expresses these rules:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
class TwoPartComparator : public leveldb::Comparator {
|
|
||||||
public:
|
|
||||||
// Three-way comparison function:
|
|
||||||
// if a < b: negative result
|
|
||||||
// if a > b: positive result
|
|
||||||
// else: zero result
|
|
||||||
int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const {
|
|
||||||
int a1, a2, b1, b2;
|
|
||||||
ParseKey(a, &a1, &a2);
|
|
||||||
ParseKey(b, &b1, &b2);
|
|
||||||
if (a1 < b1) return -1;
|
|
||||||
if (a1 > b1) return +1;
|
|
||||||
if (a2 < b2) return -1;
|
|
||||||
if (a2 > b2) return +1;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ignore the following methods for now:
|
|
||||||
const char* Name() const { return "TwoPartComparator"; }
|
|
||||||
void FindShortestSeparator(std::string*, const leveldb::Slice&) const {}
|
|
||||||
void FindShortSuccessor(std::string*) const {}
|
|
||||||
};
|
|
||||||
```
|
|
||||||
|
|
||||||
Now create a database using this custom comparator:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
TwoPartComparator cmp;
|
|
||||||
leveldb::DB* db;
|
|
||||||
leveldb::Options options;
|
|
||||||
options.create_if_missing = true;
|
|
||||||
options.comparator = &cmp;
|
|
||||||
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
|
|
||||||
...
|
|
||||||
```
|
|
||||||
|
|
||||||
### Backwards compatibility
|
|
||||||
|
|
||||||
The result of the comparator's Name method is attached to the database when it
|
|
||||||
is created, and is checked on every subsequent database open. If the name
|
|
||||||
changes, the `leveldb::DB::Open` call will fail. Therefore, change the name if
|
|
||||||
and only if the new key format and comparison function are incompatible with
|
|
||||||
existing databases, and it is ok to discard the contents of all existing
|
|
||||||
databases.
|
|
||||||
|
|
||||||
You can however still gradually evolve your key format over time with a little
|
|
||||||
bit of pre-planning. For example, you could store a version number at the end of
|
|
||||||
each key (one byte should suffice for most uses). When you wish to switch to a
|
|
||||||
new key format (e.g., adding an optional third part to the keys processed by
|
|
||||||
`TwoPartComparator`), (a) keep the same comparator name (b) increment the
|
|
||||||
version number for new keys (c) change the comparator function so it uses the
|
|
||||||
version numbers found in the keys to decide how to interpret them.
|
|
||||||
|
|
||||||
## Performance
|
|
||||||
|
|
||||||
Performance can be tuned by changing the default values of the types defined in
|
|
||||||
`include/leveldb/options.h`.
|
|
||||||
|
|
||||||
### Block size
|
|
||||||
|
|
||||||
leveldb groups adjacent keys together into the same block and such a block is
|
|
||||||
the unit of transfer to and from persistent storage. The default block size is
|
|
||||||
approximately 4096 uncompressed bytes. Applications that mostly do bulk scans
|
|
||||||
over the contents of the database may wish to increase this size. Applications
|
|
||||||
that do a lot of point reads of small values may wish to switch to a smaller
|
|
||||||
block size if performance measurements indicate an improvement. There isn't much
|
|
||||||
benefit in using blocks smaller than one kilobyte, or larger than a few
|
|
||||||
megabytes. Also note that compression will be more effective with larger block
|
|
||||||
sizes.
|
|
||||||
|
|
||||||
### Compression
|
|
||||||
|
|
||||||
Each block is individually compressed before being written to persistent
|
|
||||||
storage. Compression is on by default since the default compression method is
|
|
||||||
very fast, and is automatically disabled for uncompressible data. In rare cases,
|
|
||||||
applications may want to disable compression entirely, but should only do so if
|
|
||||||
benchmarks show a performance improvement:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
leveldb::Options options;
|
|
||||||
options.compression = leveldb::kNoCompression;
|
|
||||||
... leveldb::DB::Open(options, name, ...) ....
|
|
||||||
```
|
|
||||||
|
|
||||||
### Cache
|
|
||||||
|
|
||||||
The contents of the database are stored in a set of files in the filesystem and
|
|
||||||
each file stores a sequence of compressed blocks. If options.cache is non-NULL,
|
|
||||||
it is used to cache frequently used uncompressed block contents.
|
|
||||||
|
|
||||||
```c++
|
|
||||||
#include "leveldb/cache.h"
|
|
||||||
|
|
||||||
leveldb::Options options;
|
|
||||||
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache
|
|
||||||
leveldb::DB* db;
|
|
||||||
leveldb::DB::Open(options, name, &db);
|
|
||||||
... use the db ...
|
|
||||||
delete db;
|
|
||||||
delete options.cache;
|
|
||||||
```
|
|
||||||
|
|
||||||
Note that the cache holds uncompressed data, and therefore it should be sized
|
|
||||||
according to application level data sizes, without any reduction from
|
|
||||||
compression. (Caching of compressed blocks is left to the operating system
|
|
||||||
buffer cache, or any custom Env implementation provided by the client.)
|
|
||||||
|
|
||||||
When performing a bulk read, the application may wish to disable caching so that
|
|
||||||
the data processed by the bulk read does not end up displacing most of the
|
|
||||||
cached contents. A per-iterator option can be used to achieve this:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
leveldb::ReadOptions options;
|
|
||||||
options.fill_cache = false;
|
|
||||||
leveldb::Iterator* it = db->NewIterator(options);
|
|
||||||
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
|
||||||
...
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Key Layout
|
|
||||||
|
|
||||||
Note that the unit of disk transfer and caching is a block. Adjacent keys
|
|
||||||
(according to the database sort order) will usually be placed in the same block.
|
|
||||||
Therefore the application can improve its performance by placing keys that are
|
|
||||||
accessed together near each other and placing infrequently used keys in a
|
|
||||||
separate region of the key space.
|
|
||||||
|
|
||||||
For example, suppose we are implementing a simple file system on top of leveldb.
|
|
||||||
The types of entries we might wish to store are:
|
|
||||||
|
|
||||||
filename -> permission-bits, length, list of file_block_ids
|
|
||||||
file_block_id -> data
|
|
||||||
|
|
||||||
We might want to prefix filename keys with one letter (say '/') and the
|
|
||||||
`file_block_id` keys with a different letter (say '0') so that scans over just
|
|
||||||
the metadata do not force us to fetch and cache bulky file contents.
|
|
||||||
|
|
||||||
### Filters
|
|
||||||
|
|
||||||
Because of the way leveldb data is organized on disk, a single `Get()` call may
|
|
||||||
involve multiple reads from disk. The optional FilterPolicy mechanism can be
|
|
||||||
used to reduce the number of disk reads substantially.
|
|
||||||
|
|
||||||
```c++
|
|
||||||
leveldb::Options options;
|
|
||||||
options.filter_policy = NewBloomFilterPolicy(10);
|
|
||||||
leveldb::DB* db;
|
|
||||||
leveldb::DB::Open(options, "/tmp/testdb", &db);
|
|
||||||
... use the database ...
|
|
||||||
delete db;
|
|
||||||
delete options.filter_policy;
|
|
||||||
```
|
|
||||||
|
|
||||||
The preceding code associates a Bloom filter based filtering policy with the
|
|
||||||
database. Bloom filter based filtering relies on keeping some number of bits of
|
|
||||||
data in memory per key (in this case 10 bits per key since that is the argument
|
|
||||||
we passed to `NewBloomFilterPolicy`). This filter will reduce the number of
|
|
||||||
unnecessary disk reads needed for Get() calls by a factor of approximately
|
|
||||||
100. Increasing the bits per key will lead to a larger reduction at the cost
|
|
||||||
of more memory usage. We recommend that applications whose working set does not
|
|
||||||
fit in memory and that do a lot of random reads set a filter policy.
|
|
||||||
|
|
||||||
If you are using a custom comparator, you should ensure that the filter policy
|
|
||||||
you are using is compatible with your comparator. For example, consider a
|
|
||||||
comparator that ignores trailing spaces when comparing keys.
|
|
||||||
`NewBloomFilterPolicy` must not be used with such a comparator. Instead, the
|
|
||||||
application should provide a custom filter policy that also ignores trailing
|
|
||||||
spaces. For example:
|
|
||||||
|
|
||||||
```c++
|
|
||||||
class CustomFilterPolicy : public leveldb::FilterPolicy {
|
|
||||||
private:
|
|
||||||
FilterPolicy* builtin_policy_;
|
|
||||||
|
|
||||||
public:
|
|
||||||
CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) {}
|
|
||||||
~CustomFilterPolicy() { delete builtin_policy_; }
|
|
||||||
|
|
||||||
const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
|
|
||||||
|
|
||||||
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
|
|
||||||
// Use builtin bloom filter code after removing trailing spaces
|
|
||||||
std::vector<Slice> trimmed(n);
|
|
||||||
for (int i = 0; i < n; i++) {
|
|
||||||
trimmed[i] = RemoveTrailingSpaces(keys[i]);
|
|
||||||
}
|
|
||||||
return builtin_policy_->CreateFilter(trimmed.data(), n, dst);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
```
|
|
||||||
|
|
||||||
Advanced applications may provide a filter policy that does not use a bloom
|
|
||||||
filter but uses some other mechanism for summarizing a set of keys. See
|
|
||||||
`leveldb/filter_policy.h` for details.
|
|
||||||
|
|
||||||
## Checksums
|
|
||||||
|
|
||||||
leveldb associates checksums with all data it stores in the file system. There
|
|
||||||
are two separate controls provided over how aggressively these checksums are
|
|
||||||
verified:
|
|
||||||
|
|
||||||
`ReadOptions::verify_checksums` may be set to true to force checksum
|
|
||||||
verification of all data that is read from the file system on behalf of a
|
|
||||||
particular read. By default, no such verification is done.
|
|
||||||
|
|
||||||
`Options::paranoid_checks` may be set to true before opening a database to make
|
|
||||||
the database implementation raise an error as soon as it detects an internal
|
|
||||||
corruption. Depending on which portion of the database has been corrupted, the
|
|
||||||
error may be raised when the database is opened, or later by another database
|
|
||||||
operation. By default, paranoid checking is off so that the database can be used
|
|
||||||
even if parts of its persistent storage have been corrupted.
|
|
||||||
|
|
||||||
If a database is corrupted (perhaps it cannot be opened when paranoid checking
|
|
||||||
is turned on), the `leveldb::RepairDB` function may be used to recover as much
|
|
||||||
of the data as possible.
|
|
||||||
|
|
||||||
## Approximate Sizes
|
|
||||||
|
|
||||||
The `GetApproximateSizes` method can be used to get the approximate number of bytes
|
|
||||||
of file system space used by one or more key ranges.
|
|
||||||
|
|
||||||
```c++
|
|
||||||
leveldb::Range ranges[2];
|
|
||||||
ranges[0] = leveldb::Range("a", "c");
|
|
||||||
ranges[1] = leveldb::Range("x", "z");
|
|
||||||
uint64_t sizes[2];
|
|
||||||
db->GetApproximateSizes(ranges, 2, sizes);
|
|
||||||
```
|
|
||||||
|
|
||||||
The preceding call will set `sizes[0]` to the approximate number of bytes of
|
|
||||||
file system space used by the key range `[a..c)` and `sizes[1]` to the
|
|
||||||
approximate number of bytes used by the key range `[x..z)`.
|
|
||||||
|
|
||||||
## Environment
|
|
||||||
|
|
||||||
All file operations (and other operating system calls) issued by the leveldb
|
|
||||||
implementation are routed through a `leveldb::Env` object. Sophisticated clients
|
|
||||||
may wish to provide their own Env implementation to get better control.
|
|
||||||
For example, an application may introduce artificial delays in the file IO
|
|
||||||
paths to limit the impact of leveldb on other activities in the system.
|
|
||||||
|
|
||||||
```c++
|
|
||||||
class SlowEnv : public leveldb::Env {
|
|
||||||
... implementation of the Env interface ...
|
|
||||||
};
|
|
||||||
|
|
||||||
SlowEnv env;
|
|
||||||
leveldb::Options options;
|
|
||||||
options.env = &env;
|
|
||||||
leveldb::Status s = leveldb::DB::Open(options, ...);
|
|
||||||
```
|
|
||||||
|
|
||||||
## Porting
|
|
||||||
|
|
||||||
leveldb may be ported to a new platform by providing platform specific
|
|
||||||
implementations of the types/methods/functions exported by
|
|
||||||
`leveldb/port/port.h`. See `leveldb/port/port_example.h` for more details.
|
|
||||||
|
|
||||||
In addition, the new platform may need a new default `leveldb::Env`
|
|
||||||
implementation. See `leveldb/util/env_posix.h` for an example.
|
|
||||||
|
|
||||||
## Other Information
|
|
||||||
|
|
||||||
Details about the leveldb implementation may be found in the following
|
|
||||||
documents:
|
|
||||||
|
|
||||||
1. [Implementation notes](impl.md)
|
|
||||||
2. [Format of an immutable Table file](table_format.md)
|
|
||||||
3. [Format of a log file](log_format.md)
|
|
|
@ -1,75 +0,0 @@
|
||||||
leveldb Log format
|
|
||||||
==================
|
|
||||||
The log file contents are a sequence of 32KB blocks. The only exception is that
|
|
||||||
the tail of the file may contain a partial block.
|
|
||||||
|
|
||||||
Each block consists of a sequence of records:
|
|
||||||
|
|
||||||
block := record* trailer?
|
|
||||||
record :=
|
|
||||||
checksum: uint32 // crc32c of type and data[] ; little-endian
|
|
||||||
length: uint16 // little-endian
|
|
||||||
type: uint8 // One of FULL, FIRST, MIDDLE, LAST
|
|
||||||
data: uint8[length]
|
|
||||||
|
|
||||||
A record never starts within the last six bytes of a block (since it won't fit).
|
|
||||||
Any leftover bytes here form the trailer, which must consist entirely of zero
|
|
||||||
bytes and must be skipped by readers.
|
|
||||||
|
|
||||||
Aside: if exactly seven bytes are left in the current block, and a new non-zero
|
|
||||||
length record is added, the writer must emit a FIRST record (which contains zero
|
|
||||||
bytes of user data) to fill up the trailing seven bytes of the block and then
|
|
||||||
emit all of the user data in subsequent blocks.
|
|
||||||
|
|
||||||
More types may be added in the future. Some Readers may skip record types they
|
|
||||||
do not understand, others may report that some data was skipped.
|
|
||||||
|
|
||||||
FULL == 1
|
|
||||||
FIRST == 2
|
|
||||||
MIDDLE == 3
|
|
||||||
LAST == 4
|
|
||||||
|
|
||||||
The FULL record contains the contents of an entire user record.
|
|
||||||
|
|
||||||
FIRST, MIDDLE, LAST are types used for user records that have been split into
|
|
||||||
multiple fragments (typically because of block boundaries). FIRST is the type
|
|
||||||
of the first fragment of a user record, LAST is the type of the last fragment of
|
|
||||||
a user record, and MIDDLE is the type of all interior fragments of a user
|
|
||||||
record.
|
|
||||||
|
|
||||||
Example: consider a sequence of user records:
|
|
||||||
|
|
||||||
A: length 1000
|
|
||||||
B: length 97270
|
|
||||||
C: length 8000
|
|
||||||
|
|
||||||
**A** will be stored as a FULL record in the first block.
|
|
||||||
|
|
||||||
**B** will be split into three fragments: first fragment occupies the rest of
|
|
||||||
the first block, second fragment occupies the entirety of the second block, and
|
|
||||||
the third fragment occupies a prefix of the third block. This will leave six
|
|
||||||
bytes free in the third block, which will be left empty as the trailer.
|
|
||||||
|
|
||||||
**C** will be stored as a FULL record in the fourth block.
|
|
||||||
|
|
||||||
----
|
|
||||||
|
|
||||||
## Some benefits over the recordio format:
|
|
||||||
|
|
||||||
1. We do not need any heuristics for resyncing - just go to next block boundary
|
|
||||||
and scan. If there is a corruption, skip to the next block. As a
|
|
||||||
side-benefit, we do not get confused when part of the contents of one log
|
|
||||||
file are embedded as a record inside another log file.
|
|
||||||
|
|
||||||
2. Splitting at approximate boundaries (e.g., for mapreduce) is simple: find the
|
|
||||||
next block boundary and skip records until we hit a FULL or FIRST record.
|
|
||||||
|
|
||||||
3. We do not need extra buffering for large records.
|
|
||||||
|
|
||||||
## Some downsides compared to recordio format:
|
|
||||||
|
|
||||||
1. No packing of tiny records. This could be fixed by adding a new record type,
|
|
||||||
so it is a shortcoming of the current implementation, not necessarily the
|
|
||||||
format.
|
|
||||||
|
|
||||||
2. No compression. Again, this could be fixed by adding new record types.
|
|
75
src/leveldb/doc/log_format.txt
Normal file
75
src/leveldb/doc/log_format.txt
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
The log file contents are a sequence of 32KB blocks. The only
|
||||||
|
exception is that the tail of the file may contain a partial block.
|
||||||
|
|
||||||
|
Each block consists of a sequence of records:
|
||||||
|
block := record* trailer?
|
||||||
|
record :=
|
||||||
|
checksum: uint32 // crc32c of type and data[]
|
||||||
|
length: uint16
|
||||||
|
type: uint8 // One of FULL, FIRST, MIDDLE, LAST
|
||||||
|
data: uint8[length]
|
||||||
|
|
||||||
|
A record never starts within the last six bytes of a block (since it
|
||||||
|
won't fit). Any leftover bytes here form the trailer, which must
|
||||||
|
consist entirely of zero bytes and must be skipped by readers.
|
||||||
|
|
||||||
|
Aside: if exactly seven bytes are left in the current block, and a new
|
||||||
|
non-zero length record is added, the writer must emit a FIRST record
|
||||||
|
(which contains zero bytes of user data) to fill up the trailing seven
|
||||||
|
bytes of the block and then emit all of the user data in subsequent
|
||||||
|
blocks.
|
||||||
|
|
||||||
|
More types may be added in the future. Some Readers may skip record
|
||||||
|
types they do not understand, others may report that some data was
|
||||||
|
skipped.
|
||||||
|
|
||||||
|
FULL == 1
|
||||||
|
FIRST == 2
|
||||||
|
MIDDLE == 3
|
||||||
|
LAST == 4
|
||||||
|
|
||||||
|
The FULL record contains the contents of an entire user record.
|
||||||
|
|
||||||
|
FIRST, MIDDLE, LAST are types used for user records that have been
|
||||||
|
split into multiple fragments (typically because of block boundaries).
|
||||||
|
FIRST is the type of the first fragment of a user record, LAST is the
|
||||||
|
type of the last fragment of a user record, and MIDDLE is the type of all
|
||||||
|
interior fragments of a user record.
|
||||||
|
|
||||||
|
Example: consider a sequence of user records:
|
||||||
|
A: length 1000
|
||||||
|
B: length 97270
|
||||||
|
C: length 8000
|
||||||
|
A will be stored as a FULL record in the first block.
|
||||||
|
|
||||||
|
B will be split into three fragments: first fragment occupies the rest
|
||||||
|
of the first block, second fragment occupies the entirety of the
|
||||||
|
second block, and the third fragment occupies a prefix of the third
|
||||||
|
block. This will leave six bytes free in the third block, which will
|
||||||
|
be left empty as the trailer.
|
||||||
|
|
||||||
|
C will be stored as a FULL record in the fourth block.
|
||||||
|
|
||||||
|
===================
|
||||||
|
|
||||||
|
Some benefits over the recordio format:
|
||||||
|
|
||||||
|
(1) We do not need any heuristics for resyncing - just go to next
|
||||||
|
block boundary and scan. If there is a corruption, skip to the next
|
||||||
|
block. As a side-benefit, we do not get confused when part of the
|
||||||
|
contents of one log file are embedded as a record inside another log
|
||||||
|
file.
|
||||||
|
|
||||||
|
(2) Splitting at approximate boundaries (e.g., for mapreduce) is
|
||||||
|
simple: find the next block boundary and skip records until we
|
||||||
|
hit a FULL or FIRST record.
|
||||||
|
|
||||||
|
(3) We do not need extra buffering for large records.
|
||||||
|
|
||||||
|
Some downsides compared to recordio format:
|
||||||
|
|
||||||
|
(1) No packing of tiny records. This could be fixed by adding a new
|
||||||
|
record type, so it is a shortcoming of the current implementation,
|
||||||
|
not necessarily the format.
|
||||||
|
|
||||||
|
(2) No compression. Again, this could be fixed by adding new record types.
|
|
@ -1,107 +0,0 @@
|
||||||
leveldb File format
|
|
||||||
===================
|
|
||||||
|
|
||||||
<beginning_of_file>
|
|
||||||
[data block 1]
|
|
||||||
[data block 2]
|
|
||||||
...
|
|
||||||
[data block N]
|
|
||||||
[meta block 1]
|
|
||||||
...
|
|
||||||
[meta block K]
|
|
||||||
[metaindex block]
|
|
||||||
[index block]
|
|
||||||
[Footer] (fixed size; starts at file_size - sizeof(Footer))
|
|
||||||
<end_of_file>
|
|
||||||
|
|
||||||
The file contains internal pointers. Each such pointer is called
|
|
||||||
a BlockHandle and contains the following information:
|
|
||||||
|
|
||||||
offset: varint64
|
|
||||||
size: varint64
|
|
||||||
|
|
||||||
See [varints](https://developers.google.com/protocol-buffers/docs/encoding#varints)
|
|
||||||
for an explanation of varint64 format.
|
|
||||||
|
|
||||||
1. The sequence of key/value pairs in the file are stored in sorted
|
|
||||||
order and partitioned into a sequence of data blocks. These blocks
|
|
||||||
come one after another at the beginning of the file. Each data block
|
|
||||||
is formatted according to the code in `block_builder.cc`, and then
|
|
||||||
optionally compressed.
|
|
||||||
|
|
||||||
2. After the data blocks we store a bunch of meta blocks. The
|
|
||||||
supported meta block types are described below. More meta block types
|
|
||||||
may be added in the future. Each meta block is again formatted using
|
|
||||||
`block_builder.cc` and then optionally compressed.
|
|
||||||
|
|
||||||
3. A "metaindex" block. It contains one entry for every other meta
|
|
||||||
block where the key is the name of the meta block and the value is a
|
|
||||||
BlockHandle pointing to that meta block.
|
|
||||||
|
|
||||||
4. An "index" block. This block contains one entry per data block,
|
|
||||||
where the key is a string >= last key in that data block and before
|
|
||||||
the first key in the successive data block. The value is the
|
|
||||||
BlockHandle for the data block.
|
|
||||||
|
|
||||||
5. At the very end of the file is a fixed length footer that contains
|
|
||||||
the BlockHandle of the metaindex and index blocks as well as a magic number.
|
|
||||||
|
|
||||||
metaindex_handle: char[p]; // Block handle for metaindex
|
|
||||||
index_handle: char[q]; // Block handle for index
|
|
||||||
padding: char[40-p-q];// zeroed bytes to make fixed length
|
|
||||||
// (40==2*BlockHandle::kMaxEncodedLength)
|
|
||||||
magic: fixed64; // == 0xdb4775248b80fb57 (little-endian)
|
|
||||||
|
|
||||||
## "filter" Meta Block
|
|
||||||
|
|
||||||
If a `FilterPolicy` was specified when the database was opened, a
|
|
||||||
filter block is stored in each table. The "metaindex" block contains
|
|
||||||
an entry that maps from `filter.<N>` to the BlockHandle for the filter
|
|
||||||
block where `<N>` is the string returned by the filter policy's
|
|
||||||
`Name()` method.
|
|
||||||
|
|
||||||
The filter block stores a sequence of filters, where filter i contains
|
|
||||||
the output of `FilterPolicy::CreateFilter()` on all keys that are stored
|
|
||||||
in a block whose file offset falls within the range
|
|
||||||
|
|
||||||
[ i*base ... (i+1)*base-1 ]
|
|
||||||
|
|
||||||
Currently, "base" is 2KB. So for example, if blocks X and Y start in
|
|
||||||
the range `[ 0KB .. 2KB-1 ]`, all of the keys in X and Y will be
|
|
||||||
converted to a filter by calling `FilterPolicy::CreateFilter()`, and the
|
|
||||||
resulting filter will be stored as the first filter in the filter
|
|
||||||
block.
|
|
||||||
|
|
||||||
The filter block is formatted as follows:
|
|
||||||
|
|
||||||
[filter 0]
|
|
||||||
[filter 1]
|
|
||||||
[filter 2]
|
|
||||||
...
|
|
||||||
[filter N-1]
|
|
||||||
|
|
||||||
[offset of filter 0] : 4 bytes
|
|
||||||
[offset of filter 1] : 4 bytes
|
|
||||||
[offset of filter 2] : 4 bytes
|
|
||||||
...
|
|
||||||
[offset of filter N-1] : 4 bytes
|
|
||||||
|
|
||||||
[offset of beginning of offset array] : 4 bytes
|
|
||||||
lg(base) : 1 byte
|
|
||||||
|
|
||||||
The offset array at the end of the filter block allows efficient
|
|
||||||
mapping from a data block offset to the corresponding filter.
|
|
||||||
|
|
||||||
## "stats" Meta Block
|
|
||||||
|
|
||||||
This meta block contains a bunch of stats. The key is the name
|
|
||||||
of the statistic. The value contains the statistic.
|
|
||||||
|
|
||||||
TODO(postrelease): record the following stats.
|
|
||||||
|
|
||||||
data size
|
|
||||||
index size
|
|
||||||
key size (uncompressed)
|
|
||||||
value size (uncompressed)
|
|
||||||
number of entries
|
|
||||||
number of data blocks
|
|
102
src/leveldb/doc/table_format.txt
Normal file
102
src/leveldb/doc/table_format.txt
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
File format
|
||||||
|
===========
|
||||||
|
|
||||||
|
<beginning_of_file>
|
||||||
|
[data block 1]
|
||||||
|
[data block 2]
|
||||||
|
...
|
||||||
|
[data block N]
|
||||||
|
[meta block 1]
|
||||||
|
...
|
||||||
|
[meta block K]
|
||||||
|
[metaindex block]
|
||||||
|
[index block]
|
||||||
|
[Footer] (fixed size; starts at file_size - sizeof(Footer))
|
||||||
|
<end_of_file>
|
||||||
|
|
||||||
|
The file contains internal pointers. Each such pointer is called
|
||||||
|
a BlockHandle and contains the following information:
|
||||||
|
offset: varint64
|
||||||
|
size: varint64
|
||||||
|
|
||||||
|
(1) The sequence of key/value pairs in the file is stored in sorted
|
||||||
|
order and partitioned into a sequence of data blocks. These blocks
|
||||||
|
come one after another at the beginning of the file. Each data block
|
||||||
|
is formatted according to the code in block_builder.cc, and then
|
||||||
|
optionally compressed.
|
||||||
|
|
||||||
|
(2) After the data blocks we store a bunch of meta blocks. The
|
||||||
|
supported meta block types are described below. More meta block types
|
||||||
|
may be added in the future. Each meta block is again formatted using
|
||||||
|
block_builder.cc and then optionally compressed.
|
||||||
|
|
||||||
|
(3) A "metaindex" block. It contains one entry for every other meta
|
||||||
|
block where the key is the name of the meta block and the value is a
|
||||||
|
BlockHandle pointing to that meta block.
|
||||||
|
|
||||||
|
(4) An "index" block. This block contains one entry per data block,
|
||||||
|
where the key is a string >= last key in that data block and before
|
||||||
|
the first key in the successive data block. The value is the
|
||||||
|
BlockHandle for the data block.
|
||||||
|
|
||||||
|
(5) At the very end of the file is a fixed length footer that contains
|
||||||
|
the BlockHandle of the metaindex and index blocks as well as a magic number.
|
||||||
|
metaindex_handle: char[p]; // Block handle for metaindex
|
||||||
|
index_handle: char[q]; // Block handle for index
|
||||||
|
padding: char[40-p-q]; // zeroed bytes to make fixed length
|
||||||
|
// (40==2*BlockHandle::kMaxEncodedLength)
|
||||||
|
magic: fixed64; // == 0xdb4775248b80fb57 (little-endian)
|
||||||
|
|
||||||
|
"filter" Meta Block
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
If a "FilterPolicy" was specified when the database was opened, a
|
||||||
|
filter block is stored in each table. The "metaindex" block contains
|
||||||
|
an entry that maps from "filter.<N>" to the BlockHandle for the filter
|
||||||
|
block where "<N>" is the string returned by the filter policy's
|
||||||
|
"Name()" method.
|
||||||
|
|
||||||
|
The filter block stores a sequence of filters, where filter i contains
|
||||||
|
the output of FilterPolicy::CreateFilter() on all keys that are stored
|
||||||
|
in a block whose file offset falls within the range
|
||||||
|
|
||||||
|
[ i*base ... (i+1)*base-1 ]
|
||||||
|
|
||||||
|
Currently, "base" is 2KB. So for example, if blocks X and Y start in
|
||||||
|
the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be
|
||||||
|
converted to a filter by calling FilterPolicy::CreateFilter(), and the
|
||||||
|
resulting filter will be stored as the first filter in the filter
|
||||||
|
block.
|
||||||
|
|
||||||
|
The filter block is formatted as follows:
|
||||||
|
|
||||||
|
[filter 0]
|
||||||
|
[filter 1]
|
||||||
|
[filter 2]
|
||||||
|
...
|
||||||
|
[filter N-1]
|
||||||
|
|
||||||
|
[offset of filter 0] : 4 bytes
|
||||||
|
[offset of filter 1] : 4 bytes
|
||||||
|
[offset of filter 2] : 4 bytes
|
||||||
|
...
|
||||||
|
[offset of filter N-1] : 4 bytes
|
||||||
|
|
||||||
|
[offset of beginning of offset array] : 4 bytes
|
||||||
|
lg(base) : 1 byte
|
||||||
|
|
||||||
|
The offset array at the end of the filter block allows efficient
|
||||||
|
mapping from a data block offset to the corresponding filter.
|
||||||
|
|
||||||
|
"stats" Meta Block
|
||||||
|
------------------
|
||||||
|
|
||||||
|
This meta block contains a bunch of stats. The key is the name
|
||||||
|
of the statistic. The value contains the statistic.
|
||||||
|
TODO(postrelease): record the following stats.
|
||||||
|
data size
|
||||||
|
index size
|
||||||
|
key size (uncompressed)
|
||||||
|
value size (uncompressed)
|
||||||
|
number of entries
|
||||||
|
number of data blocks
|
|
@ -55,15 +55,14 @@ class FileState {
|
||||||
}
|
}
|
||||||
const uint64_t available = size_ - offset;
|
const uint64_t available = size_ - offset;
|
||||||
if (n > available) {
|
if (n > available) {
|
||||||
n = static_cast<size_t>(available);
|
n = available;
|
||||||
}
|
}
|
||||||
if (n == 0) {
|
if (n == 0) {
|
||||||
*result = Slice();
|
*result = Slice();
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(offset / kBlockSize <= SIZE_MAX);
|
size_t block = offset / kBlockSize;
|
||||||
size_t block = static_cast<size_t>(offset / kBlockSize);
|
|
||||||
size_t block_offset = offset % kBlockSize;
|
size_t block_offset = offset % kBlockSize;
|
||||||
|
|
||||||
if (n <= kBlockSize - block_offset) {
|
if (n <= kBlockSize - block_offset) {
|
||||||
|
@ -168,7 +167,7 @@ class SequentialFileImpl : public SequentialFile {
|
||||||
if (pos_ > file_->Size()) {
|
if (pos_ > file_->Size()) {
|
||||||
return Status::IOError("pos_ > file_->Size()");
|
return Status::IOError("pos_ > file_->Size()");
|
||||||
}
|
}
|
||||||
const uint64_t available = file_->Size() - pos_;
|
const size_t available = file_->Size() - pos_;
|
||||||
if (n > available) {
|
if (n > available) {
|
||||||
n = available;
|
n = available;
|
||||||
}
|
}
|
||||||
|
@ -176,10 +175,9 @@ class SequentialFileImpl : public SequentialFile {
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual std::string GetName() const { return "[memenv]"; }
|
|
||||||
private:
|
private:
|
||||||
FileState* file_;
|
FileState* file_;
|
||||||
uint64_t pos_;
|
size_t pos_;
|
||||||
};
|
};
|
||||||
|
|
||||||
class RandomAccessFileImpl : public RandomAccessFile {
|
class RandomAccessFileImpl : public RandomAccessFile {
|
||||||
|
@ -197,7 +195,6 @@ class RandomAccessFileImpl : public RandomAccessFile {
|
||||||
return file_->Read(offset, n, result, scratch);
|
return file_->Read(offset, n, result, scratch);
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual std::string GetName() const { return "[memenv]"; }
|
|
||||||
private:
|
private:
|
||||||
FileState* file_;
|
FileState* file_;
|
||||||
};
|
};
|
||||||
|
@ -220,16 +217,10 @@ class WritableFileImpl : public WritableFile {
|
||||||
virtual Status Flush() { return Status::OK(); }
|
virtual Status Flush() { return Status::OK(); }
|
||||||
virtual Status Sync() { return Status::OK(); }
|
virtual Status Sync() { return Status::OK(); }
|
||||||
|
|
||||||
virtual std::string GetName() const { return "[memenv]"; }
|
|
||||||
private:
|
private:
|
||||||
FileState* file_;
|
FileState* file_;
|
||||||
};
|
};
|
||||||
|
|
||||||
class NoOpLogger : public Logger {
|
|
||||||
public:
|
|
||||||
virtual void Logv(const char* format, va_list ap) { }
|
|
||||||
};
|
|
||||||
|
|
||||||
class InMemoryEnv : public EnvWrapper {
|
class InMemoryEnv : public EnvWrapper {
|
||||||
public:
|
public:
|
||||||
explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
|
explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
|
||||||
|
@ -266,7 +257,7 @@ class InMemoryEnv : public EnvWrapper {
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual Status NewWritableFile(const std::string& fname,
|
virtual Status NewWritableFile(const std::string& fname,
|
||||||
WritableFile** result) {
|
WritableFile** result, size_t) {
|
||||||
MutexLock lock(&mutex_);
|
MutexLock lock(&mutex_);
|
||||||
if (file_map_.find(fname) != file_map_.end()) {
|
if (file_map_.find(fname) != file_map_.end()) {
|
||||||
DeleteFileInternal(fname);
|
DeleteFileInternal(fname);
|
||||||
|
@ -280,19 +271,6 @@ class InMemoryEnv : public EnvWrapper {
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual Status NewAppendableFile(const std::string& fname,
|
|
||||||
WritableFile** result) {
|
|
||||||
MutexLock lock(&mutex_);
|
|
||||||
FileState** sptr = &file_map_[fname];
|
|
||||||
FileState* file = *sptr;
|
|
||||||
if (file == NULL) {
|
|
||||||
file = new FileState();
|
|
||||||
file->Ref();
|
|
||||||
}
|
|
||||||
*result = new WritableFileImpl(file);
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual bool FileExists(const std::string& fname) {
|
virtual bool FileExists(const std::string& fname) {
|
||||||
MutexLock lock(&mutex_);
|
MutexLock lock(&mutex_);
|
||||||
return file_map_.find(fname) != file_map_.end();
|
return file_map_.find(fname) != file_map_.end();
|
||||||
|
@ -380,11 +358,6 @@ class InMemoryEnv : public EnvWrapper {
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual Status NewLogger(const std::string& fname, Logger** result) {
|
|
||||||
*result = new NoOpLogger;
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Map from filenames to FileState objects, representing a simple file system.
|
// Map from filenames to FileState objects, representing a simple file system.
|
||||||
typedef std::map<std::string, FileState*> FileSystem;
|
typedef std::map<std::string, FileState*> FileSystem;
|
||||||
|
|
|
@ -29,68 +29,61 @@ TEST(MemEnvTest, Basics) {
|
||||||
uint64_t file_size;
|
uint64_t file_size;
|
||||||
WritableFile* writable_file;
|
WritableFile* writable_file;
|
||||||
std::vector<std::string> children;
|
std::vector<std::string> children;
|
||||||
|
std::string dbname;
|
||||||
|
|
||||||
ASSERT_OK(env_->CreateDir("/dir"));
|
dbname=test::TmpDir();
|
||||||
|
ASSERT_OK(env_->CreateDir(dbname.c_str()));
|
||||||
|
|
||||||
// Check that the directory is empty.
|
// Check that the directory is empty.
|
||||||
ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
|
ASSERT_TRUE(!env_->FileExists(dbname + "/non_existent"));
|
||||||
ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
|
ASSERT_TRUE(!env_->GetFileSize(dbname + "/non_existent", &file_size).ok());
|
||||||
ASSERT_OK(env_->GetChildren("/dir", &children));
|
ASSERT_OK(env_->GetChildren(dbname + "", &children));
|
||||||
ASSERT_EQ(0, children.size());
|
ASSERT_EQ(0, children.size());
|
||||||
|
|
||||||
// Create a file.
|
// Create a file.
|
||||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
|
ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
|
||||||
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
|
|
||||||
ASSERT_EQ(0, file_size);
|
|
||||||
delete writable_file;
|
delete writable_file;
|
||||||
|
|
||||||
// Check that the file exists.
|
// Check that the file exists.
|
||||||
ASSERT_TRUE(env_->FileExists("/dir/f"));
|
ASSERT_TRUE(env_->FileExists(dbname + "/f"));
|
||||||
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
|
ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size));
|
||||||
ASSERT_EQ(0, file_size);
|
ASSERT_EQ(0, file_size);
|
||||||
ASSERT_OK(env_->GetChildren("/dir", &children));
|
ASSERT_OK(env_->GetChildren(dbname + "", &children));
|
||||||
ASSERT_EQ(1, children.size());
|
ASSERT_EQ(1, children.size());
|
||||||
ASSERT_EQ("f", children[0]);
|
ASSERT_EQ("f", children[0]);
|
||||||
|
|
||||||
// Write to the file.
|
// Write to the file.
|
||||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
|
ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
|
||||||
ASSERT_OK(writable_file->Append("abc"));
|
ASSERT_OK(writable_file->Append("abc"));
|
||||||
delete writable_file;
|
delete writable_file;
|
||||||
|
|
||||||
// Check that append works.
|
|
||||||
ASSERT_OK(env_->NewAppendableFile("/dir/f", &writable_file));
|
|
||||||
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
|
|
||||||
ASSERT_EQ(3, file_size);
|
|
||||||
ASSERT_OK(writable_file->Append("hello"));
|
|
||||||
delete writable_file;
|
|
||||||
|
|
||||||
// Check for expected size.
|
// Check for expected size.
|
||||||
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
|
ASSERT_OK(env_->GetFileSize(dbname + "/f", &file_size));
|
||||||
ASSERT_EQ(8, file_size);
|
ASSERT_EQ(3, file_size);
|
||||||
|
|
||||||
// Check that renaming works.
|
// Check that renaming works.
|
||||||
ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
|
ASSERT_TRUE(!env_->RenameFile(dbname + "/non_existent", dbname + "/g").ok());
|
||||||
ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
|
ASSERT_OK(env_->RenameFile(dbname + "/f", dbname + "/g"));
|
||||||
ASSERT_TRUE(!env_->FileExists("/dir/f"));
|
ASSERT_TRUE(!env_->FileExists(dbname + "/f"));
|
||||||
ASSERT_TRUE(env_->FileExists("/dir/g"));
|
ASSERT_TRUE(env_->FileExists(dbname + "/g"));
|
||||||
ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
|
ASSERT_OK(env_->GetFileSize(dbname + "/g", &file_size));
|
||||||
ASSERT_EQ(8, file_size);
|
ASSERT_EQ(3, file_size);
|
||||||
|
|
||||||
// Check that opening non-existent file fails.
|
// Check that opening non-existent file fails.
|
||||||
SequentialFile* seq_file;
|
SequentialFile* seq_file;
|
||||||
RandomAccessFile* rand_file;
|
RandomAccessFile* rand_file;
|
||||||
ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file).ok());
|
ASSERT_TRUE(!env_->NewSequentialFile(dbname + "/non_existent", &seq_file).ok());
|
||||||
ASSERT_TRUE(!seq_file);
|
ASSERT_TRUE(!seq_file);
|
||||||
ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file).ok());
|
ASSERT_TRUE(!env_->NewRandomAccessFile(dbname + "/non_existent", &rand_file).ok());
|
||||||
ASSERT_TRUE(!rand_file);
|
ASSERT_TRUE(!rand_file);
|
||||||
|
|
||||||
// Check that deleting works.
|
// Check that deleting works.
|
||||||
ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
|
ASSERT_TRUE(!env_->DeleteFile(dbname + "/non_existent").ok());
|
||||||
ASSERT_OK(env_->DeleteFile("/dir/g"));
|
ASSERT_OK(env_->DeleteFile(dbname + "/g"));
|
||||||
ASSERT_TRUE(!env_->FileExists("/dir/g"));
|
ASSERT_TRUE(!env_->FileExists(dbname + "/g"));
|
||||||
ASSERT_OK(env_->GetChildren("/dir", &children));
|
ASSERT_OK(env_->GetChildren(dbname + "", &children));
|
||||||
ASSERT_EQ(0, children.size());
|
ASSERT_EQ(0, children.size());
|
||||||
ASSERT_OK(env_->DeleteDir("/dir"));
|
ASSERT_OK(env_->DeleteDir(dbname + ""));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(MemEnvTest, ReadWrite) {
|
TEST(MemEnvTest, ReadWrite) {
|
||||||
|
@ -99,16 +92,19 @@ TEST(MemEnvTest, ReadWrite) {
|
||||||
RandomAccessFile* rand_file;
|
RandomAccessFile* rand_file;
|
||||||
Slice result;
|
Slice result;
|
||||||
char scratch[100];
|
char scratch[100];
|
||||||
|
std::string dbname;
|
||||||
|
|
||||||
ASSERT_OK(env_->CreateDir("/dir"));
|
dbname=test::TmpDir();
|
||||||
|
|
||||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
|
ASSERT_OK(env_->CreateDir(dbname + ""));
|
||||||
|
|
||||||
|
ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
|
||||||
ASSERT_OK(writable_file->Append("hello "));
|
ASSERT_OK(writable_file->Append("hello "));
|
||||||
ASSERT_OK(writable_file->Append("world"));
|
ASSERT_OK(writable_file->Append("world"));
|
||||||
delete writable_file;
|
delete writable_file;
|
||||||
|
|
||||||
// Read sequentially.
|
// Read sequentially.
|
||||||
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
|
ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file));
|
||||||
ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
|
ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
|
||||||
ASSERT_EQ(0, result.compare("hello"));
|
ASSERT_EQ(0, result.compare("hello"));
|
||||||
ASSERT_OK(seq_file->Skip(1));
|
ASSERT_OK(seq_file->Skip(1));
|
||||||
|
@ -122,7 +118,7 @@ TEST(MemEnvTest, ReadWrite) {
|
||||||
delete seq_file;
|
delete seq_file;
|
||||||
|
|
||||||
// Random reads.
|
// Random reads.
|
||||||
ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file));
|
ASSERT_OK(env_->NewRandomAccessFile(dbname + "/f", &rand_file));
|
||||||
ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
|
ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
|
||||||
ASSERT_EQ(0, result.compare("world"));
|
ASSERT_EQ(0, result.compare("world"));
|
||||||
ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
|
ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
|
||||||
|
@ -149,7 +145,7 @@ TEST(MemEnvTest, Misc) {
|
||||||
ASSERT_TRUE(!test_dir.empty());
|
ASSERT_TRUE(!test_dir.empty());
|
||||||
|
|
||||||
WritableFile* writable_file;
|
WritableFile* writable_file;
|
||||||
ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file));
|
ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, 2<<20));
|
||||||
|
|
||||||
// These are no-ops, but we test they return success.
|
// These are no-ops, but we test they return success.
|
||||||
ASSERT_OK(writable_file->Sync());
|
ASSERT_OK(writable_file->Sync());
|
||||||
|
@ -161,6 +157,9 @@ TEST(MemEnvTest, Misc) {
|
||||||
TEST(MemEnvTest, LargeWrite) {
|
TEST(MemEnvTest, LargeWrite) {
|
||||||
const size_t kWriteSize = 300 * 1024;
|
const size_t kWriteSize = 300 * 1024;
|
||||||
char* scratch = new char[kWriteSize * 2];
|
char* scratch = new char[kWriteSize * 2];
|
||||||
|
std::string dbname;
|
||||||
|
|
||||||
|
dbname=test::TmpDir();
|
||||||
|
|
||||||
std::string write_data;
|
std::string write_data;
|
||||||
for (size_t i = 0; i < kWriteSize; ++i) {
|
for (size_t i = 0; i < kWriteSize; ++i) {
|
||||||
|
@ -168,14 +167,14 @@ TEST(MemEnvTest, LargeWrite) {
|
||||||
}
|
}
|
||||||
|
|
||||||
WritableFile* writable_file;
|
WritableFile* writable_file;
|
||||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
|
ASSERT_OK(env_->NewWritableFile(dbname + "/f", &writable_file, 2<<20));
|
||||||
ASSERT_OK(writable_file->Append("foo"));
|
ASSERT_OK(writable_file->Append("foo"));
|
||||||
ASSERT_OK(writable_file->Append(write_data));
|
ASSERT_OK(writable_file->Append(write_data));
|
||||||
delete writable_file;
|
delete writable_file;
|
||||||
|
|
||||||
SequentialFile* seq_file;
|
SequentialFile* seq_file;
|
||||||
Slice result;
|
Slice result;
|
||||||
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
|
ASSERT_OK(env_->NewSequentialFile(dbname + "/f", &seq_file));
|
||||||
ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
|
ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
|
||||||
ASSERT_EQ(0, result.compare("foo"));
|
ASSERT_EQ(0, result.compare("foo"));
|
||||||
|
|
||||||
|
@ -190,17 +189,21 @@ TEST(MemEnvTest, LargeWrite) {
|
||||||
delete seq_file;
|
delete seq_file;
|
||||||
delete [] scratch;
|
delete [] scratch;
|
||||||
}
|
}
|
||||||
|
#if 0
|
||||||
TEST(MemEnvTest, DBTest) {
|
TEST(MemEnvTest, DBTest) {
|
||||||
Options options;
|
Options options;
|
||||||
options.create_if_missing = true;
|
options.create_if_missing = true;
|
||||||
options.env = env_;
|
options.env = env_;
|
||||||
DB* db;
|
DB* db;
|
||||||
|
std::string dbname;
|
||||||
|
|
||||||
|
dbname=test::TmpDir();
|
||||||
|
ASSERT_OK(env_->CreateDir(dbname+ "/db"));
|
||||||
|
|
||||||
const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
|
const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
|
||||||
const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
|
const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
|
||||||
|
|
||||||
ASSERT_OK(DB::Open(options, "/dir/db", &db));
|
ASSERT_OK(DB::Open(options, dbname + "/db", &db));
|
||||||
for (size_t i = 0; i < 3; ++i) {
|
for (size_t i = 0; i < 3; ++i) {
|
||||||
ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
|
ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
|
||||||
}
|
}
|
||||||
|
@ -233,7 +236,7 @@ TEST(MemEnvTest, DBTest) {
|
||||||
|
|
||||||
delete db;
|
delete db;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
|
227
src/leveldb/include/leveldb/atomics.h
Normal file
227
src/leveldb/include/leveldb/atomics.h
Normal file
|
@ -0,0 +1,227 @@
|
||||||
|
// -------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// atomics.h: portable atomic operations for leveldb/eleveldb (http://code.google.com/p/leveldb/)
|
||||||
|
//
|
||||||
|
// Copyright (c) 2011-2013 Basho Technologies, Inc. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// This file is provided to you under the Apache License,
|
||||||
|
// Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain
|
||||||
|
// a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing,
|
||||||
|
// software distributed under the License is distributed on an
|
||||||
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
// KIND, either express or implied. See the License for the
|
||||||
|
// specific language governing permissions and limitations
|
||||||
|
// under the License.
|
||||||
|
//
|
||||||
|
// -------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Copied from basho/eleveldb/c_src/detail.hpp September 8, 2013
|
||||||
|
|
||||||
|
#ifndef LEVELDB_ATOMIC_H
|
||||||
|
#define LEVELDB_ATOMIC_H 1
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
/* Hopefully these can be replaced with constexpr or a compile-time assert later: */
|
||||||
|
#if defined(OS_SOLARIS) || defined(SOLARIS) || defined(sun)
|
||||||
|
#define LEVELDB_IS_SOLARIS 1
|
||||||
|
#else
|
||||||
|
#undef LEVELDB_IS_SOLARIS
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef LEVELDB_IS_SOLARIS
|
||||||
|
#include <atomic.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace leveldb {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compare and swap
|
||||||
|
*/
|
||||||
|
|
||||||
|
// primary template
|
||||||
|
template <typename PtrT, typename ValueT>
|
||||||
|
inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val);
|
||||||
|
|
||||||
|
|
||||||
|
// uint32_t specialization (needed for Solaris, where atomic_cas_32 is used)
|
||||||
|
template <>
|
||||||
|
inline bool compare_and_swap(volatile uint32_t *ptr, const int& comp_val, const int& exchange_val)
|
||||||
|
{
|
||||||
|
#if LEVELDB_IS_SOLARIS
|
||||||
|
return ((uint32_t) comp_val==atomic_cas_32(ptr, comp_val, exchange_val));
|
||||||
|
#else
|
||||||
|
return __sync_bool_compare_and_swap(ptr, comp_val, exchange_val);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// generic definition of the primary template ... for pointers
|
||||||
|
template <typename PtrT, typename ValueT>
|
||||||
|
inline bool compare_and_swap(volatile PtrT *ptr, const ValueT& comp_val, const ValueT& exchange_val)
|
||||||
|
{
|
||||||
|
#if LEVELDB_IS_SOLARIS
|
||||||
|
return (comp_val==atomic_cas_ptr(ptr, comp_val, exchange_val));
|
||||||
|
#else
|
||||||
|
return __sync_bool_compare_and_swap(ptr, comp_val, exchange_val);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomic increment
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <typename ValueT>
|
||||||
|
inline ValueT inc_and_fetch(volatile ValueT *ptr);
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline uint64_t inc_and_fetch(volatile uint64_t *ptr)
|
||||||
|
{
|
||||||
|
#if LEVELDB_IS_SOLARIS
|
||||||
|
return atomic_inc_64_nv(ptr);
|
||||||
|
#else
|
||||||
|
return __sync_add_and_fetch(ptr, 1);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline uint32_t inc_and_fetch(volatile uint32_t *ptr)
|
||||||
|
{
|
||||||
|
#if LEVELDB_IS_SOLARIS
|
||||||
|
return atomic_inc_32_nv(ptr);
|
||||||
|
#else
|
||||||
|
return __sync_add_and_fetch(ptr, 1);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
|
||||||
|
template <>
|
||||||
|
inline size_t inc_and_fetch(volatile size_t *ptr)
|
||||||
|
{
|
||||||
|
return __sync_add_and_fetch(ptr, 1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* atomic decrement
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <typename ValueT>
|
||||||
|
inline ValueT dec_and_fetch(volatile ValueT *ptr);
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline uint64_t dec_and_fetch(volatile uint64_t *ptr)
|
||||||
|
{
|
||||||
|
#if LEVELDB_IS_SOLARIS
|
||||||
|
return atomic_dec_64_nv(ptr);
|
||||||
|
#else
|
||||||
|
return __sync_sub_and_fetch(ptr, 1);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline uint32_t dec_and_fetch(volatile uint32_t *ptr)
|
||||||
|
{
|
||||||
|
#if LEVELDB_IS_SOLARIS
|
||||||
|
return atomic_dec_32_nv(ptr);
|
||||||
|
#else
|
||||||
|
return __sync_sub_and_fetch(ptr, 1);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
|
||||||
|
template <>
|
||||||
|
inline size_t dec_and_fetch(volatile size_t *ptr)
|
||||||
|
{
|
||||||
|
return __sync_sub_and_fetch(ptr, 1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomic add
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
template <typename ValueT>
|
||||||
|
inline ValueT add_and_fetch(volatile ValueT *ptr, ValueT val);
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline uint64_t add_and_fetch(volatile uint64_t *ptr, uint64_t val)
|
||||||
|
{
|
||||||
|
#if LEVELDB_IS_SOLARIS
|
||||||
|
return atomic_add_64_nv(ptr, val);
|
||||||
|
#else
|
||||||
|
return __sync_add_and_fetch(ptr, val);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline uint32_t add_and_fetch(volatile uint32_t *ptr, uint32_t val)
|
||||||
|
{
|
||||||
|
#if LEVELDB_IS_SOLARIS
|
||||||
|
return atomic_add_32_nv(ptr, val);
|
||||||
|
#else
|
||||||
|
return __sync_add_and_fetch(ptr, val);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
|
||||||
|
template <>
|
||||||
|
inline size_t add_and_fetch(volatile size_t *ptr, size_t val)
|
||||||
|
{
|
||||||
|
return __sync_add_and_fetch(ptr, val);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomic subtract
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <typename ValueT>
|
||||||
|
inline ValueT sub_and_fetch(volatile ValueT *ptr, ValueT val);
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline uint64_t sub_and_fetch(volatile uint64_t *ptr, uint64_t val)
|
||||||
|
{
|
||||||
|
#if LEVELDB_IS_SOLARIS
|
||||||
|
uint64_t temp=(~val)+1; // 2's complement, bypass sign warnings
|
||||||
|
return atomic_add_64_nv(ptr, temp);
|
||||||
|
#else
|
||||||
|
return __sync_sub_and_fetch(ptr, val);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline uint32_t sub_and_fetch(volatile uint32_t *ptr, uint32_t val)
|
||||||
|
{
|
||||||
|
#if LEVELDB_IS_SOLARIS
|
||||||
|
uint32_t temp=(~val)+1; // 2's complement, bypass sign warnings
|
||||||
|
return atomic_add_32_nv(ptr, temp);
|
||||||
|
#else
|
||||||
|
return __sync_sub_and_fetch(ptr, val);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(__APPLE__) || defined(__OpenBSD__) || (defined(__s390__) && !defined(__s390x__))
|
||||||
|
template <>
|
||||||
|
inline size_t sub_and_fetch(volatile size_t *ptr, size_t val)
|
||||||
|
{
|
||||||
|
return __sync_sub_and_fetch(ptr, val);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
} // namespace leveldb
|
||||||
|
|
||||||
|
#endif
|
|
@ -9,6 +9,7 @@
|
||||||
Does not support:
|
Does not support:
|
||||||
. getters for the option types
|
. getters for the option types
|
||||||
. custom comparators that implement key shortening
|
. custom comparators that implement key shortening
|
||||||
|
. capturing post-write-snapshot
|
||||||
. custom iter, db, env, cache implementations using just the C bindings
|
. custom iter, db, env, cache implementations using just the C bindings
|
||||||
|
|
||||||
Some conventions:
|
Some conventions:
|
||||||
|
@ -27,7 +28,6 @@
|
||||||
be true on entry:
|
be true on entry:
|
||||||
*errptr == NULL
|
*errptr == NULL
|
||||||
*errptr points to a malloc()ed null-terminated error message
|
*errptr points to a malloc()ed null-terminated error message
|
||||||
(On Windows, *errptr must have been malloc()-ed by this library.)
|
|
||||||
On success, a leveldb routine leaves *errptr unchanged.
|
On success, a leveldb routine leaves *errptr unchanged.
|
||||||
On failure, leveldb frees the old value of *errptr and
|
On failure, leveldb frees the old value of *errptr and
|
||||||
set *errptr to a malloc()ed error message.
|
set *errptr to a malloc()ed error message.
|
||||||
|
@ -66,7 +66,7 @@ typedef struct leveldb_snapshot_t leveldb_snapshot_t;
|
||||||
typedef struct leveldb_writablefile_t leveldb_writablefile_t;
|
typedef struct leveldb_writablefile_t leveldb_writablefile_t;
|
||||||
typedef struct leveldb_writebatch_t leveldb_writebatch_t;
|
typedef struct leveldb_writebatch_t leveldb_writebatch_t;
|
||||||
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;
|
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;
|
||||||
|
typedef struct leveldb_keymetadata_t leveldb_keymetadata_t;
|
||||||
/* DB operations */
|
/* DB operations */
|
||||||
|
|
||||||
extern leveldb_t* leveldb_open(
|
extern leveldb_t* leveldb_open(
|
||||||
|
@ -83,6 +83,14 @@ extern void leveldb_put(
|
||||||
const char* val, size_t vallen,
|
const char* val, size_t vallen,
|
||||||
char** errptr);
|
char** errptr);
|
||||||
|
|
||||||
|
extern void leveldb_put2(
|
||||||
|
leveldb_t* db,
|
||||||
|
const leveldb_writeoptions_t* options,
|
||||||
|
const char* key, size_t keylen,
|
||||||
|
const char* val, size_t vallen,
|
||||||
|
char** errptr,
|
||||||
|
const leveldb_keymetadata_t * metadata);
|
||||||
|
|
||||||
extern void leveldb_delete(
|
extern void leveldb_delete(
|
||||||
leveldb_t* db,
|
leveldb_t* db,
|
||||||
const leveldb_writeoptions_t* options,
|
const leveldb_writeoptions_t* options,
|
||||||
|
@ -104,6 +112,14 @@ extern char* leveldb_get(
|
||||||
size_t* vallen,
|
size_t* vallen,
|
||||||
char** errptr);
|
char** errptr);
|
||||||
|
|
||||||
|
extern char* leveldb_get2(
|
||||||
|
leveldb_t* db,
|
||||||
|
const leveldb_readoptions_t* options,
|
||||||
|
const char* key, size_t keylen,
|
||||||
|
size_t* vallen,
|
||||||
|
char** errptr,
|
||||||
|
leveldb_keymetadata_t * metadata);
|
||||||
|
|
||||||
extern leveldb_iterator_t* leveldb_create_iterator(
|
extern leveldb_iterator_t* leveldb_create_iterator(
|
||||||
leveldb_t* db,
|
leveldb_t* db,
|
||||||
const leveldb_readoptions_t* options);
|
const leveldb_readoptions_t* options);
|
||||||
|
@ -156,6 +172,7 @@ extern void leveldb_iter_next(leveldb_iterator_t*);
|
||||||
extern void leveldb_iter_prev(leveldb_iterator_t*);
|
extern void leveldb_iter_prev(leveldb_iterator_t*);
|
||||||
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
|
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
|
||||||
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
|
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
|
||||||
|
extern const void leveldb_iter_keymetadata(const leveldb_iterator_t *, leveldb_keymetadata_t *);
|
||||||
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
|
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
|
||||||
|
|
||||||
/* Write batch */
|
/* Write batch */
|
||||||
|
@ -167,13 +184,19 @@ extern void leveldb_writebatch_put(
|
||||||
leveldb_writebatch_t*,
|
leveldb_writebatch_t*,
|
||||||
const char* key, size_t klen,
|
const char* key, size_t klen,
|
||||||
const char* val, size_t vlen);
|
const char* val, size_t vlen);
|
||||||
|
extern void leveldb_writebatch_put2(
|
||||||
|
leveldb_writebatch_t*,
|
||||||
|
const char* key, size_t klen,
|
||||||
|
const char* val, size_t vlen,
|
||||||
|
const leveldb_keymetadata_t * meta);
|
||||||
extern void leveldb_writebatch_delete(
|
extern void leveldb_writebatch_delete(
|
||||||
leveldb_writebatch_t*,
|
leveldb_writebatch_t*,
|
||||||
const char* key, size_t klen);
|
const char* key, size_t klen);
|
||||||
extern void leveldb_writebatch_iterate(
|
extern void leveldb_writebatch_iterate(
|
||||||
leveldb_writebatch_t*,
|
leveldb_writebatch_t*,
|
||||||
void* state,
|
void* state,
|
||||||
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
|
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen,
|
||||||
|
const int & type, const uint64_t & expiry),
|
||||||
void (*deleted)(void*, const char* k, size_t klen));
|
void (*deleted)(void*, const char* k, size_t klen));
|
||||||
|
|
||||||
/* Options */
|
/* Options */
|
||||||
|
@ -192,6 +215,8 @@ extern void leveldb_options_set_error_if_exists(
|
||||||
leveldb_options_t*, unsigned char);
|
leveldb_options_t*, unsigned char);
|
||||||
extern void leveldb_options_set_paranoid_checks(
|
extern void leveldb_options_set_paranoid_checks(
|
||||||
leveldb_options_t*, unsigned char);
|
leveldb_options_t*, unsigned char);
|
||||||
|
extern void leveldb_options_set_verify_compactions(
|
||||||
|
leveldb_options_t*, unsigned char);
|
||||||
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
|
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
|
||||||
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
|
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
|
||||||
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
|
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
|
||||||
|
@ -199,6 +224,7 @@ extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
|
||||||
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
|
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
|
||||||
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
|
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
|
||||||
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
|
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
|
||||||
|
extern void leveldb_options_set_total_leveldb_mem(leveldb_options_t*, size_t);
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
leveldb_no_compression = 0,
|
leveldb_no_compression = 0,
|
||||||
|
@ -267,20 +293,20 @@ extern void leveldb_cache_destroy(leveldb_cache_t* cache);
|
||||||
|
|
||||||
extern leveldb_env_t* leveldb_create_default_env();
|
extern leveldb_env_t* leveldb_create_default_env();
|
||||||
extern void leveldb_env_destroy(leveldb_env_t*);
|
extern void leveldb_env_destroy(leveldb_env_t*);
|
||||||
|
extern void leveldb_env_shutdown();
|
||||||
|
|
||||||
/* Utility */
|
/* Util */
|
||||||
|
|
||||||
/* Calls free(ptr).
|
/**
|
||||||
REQUIRES: ptr was malloc()-ed and returned by one of the routines
|
* CAUTION: this call is only for char * objects returned by
|
||||||
in this file. Note that in certain cases (typically on Windows), you
|
* functions like leveldb_get and leveldb_property_value.
|
||||||
may need to call this routine instead of free(ptr) to dispose of
|
* Also used to release errptr strings.
|
||||||
malloc()-ed memory returned by this library. */
|
*/
|
||||||
extern void leveldb_free(void* ptr);
|
extern void leveldb_free(void* ptr);
|
||||||
|
|
||||||
/* Return the major version number for this release. */
|
/* Version */
|
||||||
extern int leveldb_major_version();
|
|
||||||
|
|
||||||
/* Return the minor version number for this release. */
|
extern int leveldb_major_version();
|
||||||
extern int leveldb_minor_version();
|
extern int leveldb_minor_version();
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
@ -29,6 +29,11 @@ class Cache;
|
||||||
// of Cache uses a least-recently-used eviction policy.
|
// of Cache uses a least-recently-used eviction policy.
|
||||||
extern Cache* NewLRUCache(size_t capacity);
|
extern Cache* NewLRUCache(size_t capacity);
|
||||||
|
|
||||||
|
// Riak customization - just like NewLRUCache except the underlying
|
||||||
|
// structure is NOT sharded. Better for file cache.
|
||||||
|
extern Cache* NewLRUCache2(size_t capacity);
|
||||||
|
|
||||||
|
|
||||||
class Cache {
|
class Cache {
|
||||||
public:
|
public:
|
||||||
Cache() { }
|
Cache() { }
|
||||||
|
@ -81,16 +86,17 @@ class Cache {
|
||||||
// its cache keys.
|
// its cache keys.
|
||||||
virtual uint64_t NewId() = 0;
|
virtual uint64_t NewId() = 0;
|
||||||
|
|
||||||
// Remove all cache entries that are not actively in use. Memory-constrained
|
// Return size, if any, of per entry overhead for item placed in cache.
|
||||||
// applications may wish to call this method to reduce memory usage.
|
// Allows more accurate tracking of "charge" against each cache item.
|
||||||
// Default implementation of Prune() does nothing. Subclasses are strongly
|
virtual size_t EntryOverheadSize() {return(0);};
|
||||||
// encouraged to override the default implementation. A future release of
|
|
||||||
// leveldb may change Prune() to a pure abstract method.
|
|
||||||
virtual void Prune() {}
|
|
||||||
|
|
||||||
// Return an estimate of the combined charges of all elements stored in the
|
// Riak specific: Add a reference to cache object to help hold it
|
||||||
// cache.
|
// in memory
|
||||||
virtual size_t TotalCharge() const = 0;
|
virtual void Addref(Handle* e) = 0;
|
||||||
|
|
||||||
|
// Riak specific: walk contents of entire cache, calling functor Acc
|
||||||
|
// with the "value" for each cache entry. Locks cache throughout call.
|
||||||
|
virtual bool WalkCache(class CacheAccumulator & Acc) {return(true);};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void LRU_Remove(Handle* e);
|
void LRU_Remove(Handle* e);
|
||||||
|
@ -107,4 +113,4 @@ class Cache {
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
#endif // STORAGE_LEVELDB_INCLUDE_CACHE_H_
|
#endif // STORAGE_LEVELDB_UTIL_CACHE_H_
|
||||||
|
|
|
@ -58,6 +58,10 @@ class Comparator {
|
||||||
// must not be deleted.
|
// must not be deleted.
|
||||||
extern const Comparator* BytewiseComparator();
|
extern const Comparator* BytewiseComparator();
|
||||||
|
|
||||||
|
// Riak specific: cleans up the default comparitor to make
|
||||||
|
// valgrind results clean
|
||||||
|
extern void ComparatorShutdown();
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
|
#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
|
||||||
|
|
|
@ -14,7 +14,7 @@ namespace leveldb {
|
||||||
|
|
||||||
// Update Makefile if you change these
|
// Update Makefile if you change these
|
||||||
static const int kMajorVersion = 1;
|
static const int kMajorVersion = 1;
|
||||||
static const int kMinorVersion = 20;
|
static const int kMinorVersion = 9;
|
||||||
|
|
||||||
struct Options;
|
struct Options;
|
||||||
struct ReadOptions;
|
struct ReadOptions;
|
||||||
|
@ -38,6 +38,17 @@ struct Range {
|
||||||
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
|
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Abstract holder for a DB value.
|
||||||
|
// This allows callers to manage their own value buffers and have
|
||||||
|
// DB values copied directly into those buffers.
|
||||||
|
class Value {
|
||||||
|
public:
|
||||||
|
virtual Value& assign(const char* data, size_t size) = 0;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
virtual ~Value();
|
||||||
|
};
|
||||||
|
|
||||||
// A DB is a persistent ordered map from keys to values.
|
// A DB is a persistent ordered map from keys to values.
|
||||||
// A DB is safe for concurrent access from multiple threads without
|
// A DB is safe for concurrent access from multiple threads without
|
||||||
// any external synchronization.
|
// any external synchronization.
|
||||||
|
@ -60,7 +71,8 @@ class DB {
|
||||||
// Note: consider setting options.sync = true.
|
// Note: consider setting options.sync = true.
|
||||||
virtual Status Put(const WriteOptions& options,
|
virtual Status Put(const WriteOptions& options,
|
||||||
const Slice& key,
|
const Slice& key,
|
||||||
const Slice& value) = 0;
|
const Slice& value,
|
||||||
|
const KeyMetaData * meta=NULL) = 0;
|
||||||
|
|
||||||
// Remove the database entry (if any) for "key". Returns OK on
|
// Remove the database entry (if any) for "key". Returns OK on
|
||||||
// success, and a non-OK status on error. It is not an error if "key"
|
// success, and a non-OK status on error. It is not an error if "key"
|
||||||
|
@ -81,7 +93,11 @@ class DB {
|
||||||
//
|
//
|
||||||
// May return some other Status on an error.
|
// May return some other Status on an error.
|
||||||
virtual Status Get(const ReadOptions& options,
|
virtual Status Get(const ReadOptions& options,
|
||||||
const Slice& key, std::string* value) = 0;
|
const Slice& key, std::string* value,
|
||||||
|
KeyMetaData * meta=NULL) = 0;
|
||||||
|
virtual Status Get(const ReadOptions& options,
|
||||||
|
const Slice& key, Value* value,
|
||||||
|
KeyMetaData * meta=NULL) = 0;
|
||||||
|
|
||||||
// Return a heap-allocated iterator over the contents of the database.
|
// Return a heap-allocated iterator over the contents of the database.
|
||||||
// The result of NewIterator() is initially invalid (caller must
|
// The result of NewIterator() is initially invalid (caller must
|
||||||
|
@ -115,8 +131,6 @@ class DB {
|
||||||
// about the internal operation of the DB.
|
// about the internal operation of the DB.
|
||||||
// "leveldb.sstables" - returns a multi-line string that describes all
|
// "leveldb.sstables" - returns a multi-line string that describes all
|
||||||
// of the sstables that make up the db contents.
|
// of the sstables that make up the db contents.
|
||||||
// "leveldb.approximate-memory-usage" - returns the approximate number of
|
|
||||||
// bytes of memory in use by the DB.
|
|
||||||
virtual bool GetProperty(const Slice& property, std::string* value) = 0;
|
virtual bool GetProperty(const Slice& property, std::string* value) = 0;
|
||||||
|
|
||||||
// For each i in [0,n-1], store in "sizes[i]", the approximate
|
// For each i in [0,n-1], store in "sizes[i]", the approximate
|
||||||
|
@ -142,6 +156,21 @@ class DB {
|
||||||
// db->CompactRange(NULL, NULL);
|
// db->CompactRange(NULL, NULL);
|
||||||
virtual void CompactRange(const Slice* begin, const Slice* end) = 0;
|
virtual void CompactRange(const Slice* begin, const Slice* end) = 0;
|
||||||
|
|
||||||
|
// Riak specific function: Verify that no .sst files overlap
|
||||||
|
// within the levels that expect non-overlapping files. Run
|
||||||
|
// compactions as necessary to correct. Assumes DB opened
|
||||||
|
// with Options.is_repair=true
|
||||||
|
virtual Status VerifyLevels();
|
||||||
|
|
||||||
|
// Riak specific function: Request database check for
|
||||||
|
// available compactions. This is to stimulate retry of
|
||||||
|
// grooming that might have been offered and rejected previously
|
||||||
|
virtual void CheckAvailableCompactions();
|
||||||
|
|
||||||
|
// Riak specific function: Give external code, namely
|
||||||
|
// eleveldb, access to leveldb's logging routines.
|
||||||
|
virtual Logger* GetLogger() const { return NULL; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
DB(const DB&);
|
DB(const DB&);
|
||||||
|
|
|
@ -1,25 +0,0 @@
|
||||||
// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
||||||
|
|
||||||
#ifndef STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
|
|
||||||
#define STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include "leveldb/env.h"
|
|
||||||
#include "leveldb/status.h"
|
|
||||||
|
|
||||||
namespace leveldb {
|
|
||||||
|
|
||||||
// Dump the contents of the file named by fname in text format to
|
|
||||||
// *dst. Makes a sequence of dst->Append() calls; each call is passed
|
|
||||||
// the newline-terminated text corresponding to a single item found
|
|
||||||
// in the file.
|
|
||||||
//
|
|
||||||
// Returns a non-OK result if fname does not name a leveldb storage
|
|
||||||
// file, or if the file cannot be read.
|
|
||||||
Status DumpFile(Env* env, const std::string& fname, WritableFile* dst);
|
|
||||||
|
|
||||||
} // namespace leveldb
|
|
||||||
|
|
||||||
#endif // STORAGE_LEVELDB_INCLUDE_DUMPFILE_H_
|
|
|
@ -13,15 +13,19 @@
|
||||||
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
|
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
|
||||||
#define STORAGE_LEVELDB_INCLUDE_ENV_H_
|
#define STORAGE_LEVELDB_INCLUDE_ENV_H_
|
||||||
|
|
||||||
|
#include <cstdarg>
|
||||||
|
#include <pthread.h>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <stdarg.h>
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include "leveldb/perf_count.h"
|
||||||
#include "leveldb/status.h"
|
#include "leveldb/status.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
class AppendableFile;
|
||||||
class FileLock;
|
class FileLock;
|
||||||
|
struct Options;
|
||||||
class Logger;
|
class Logger;
|
||||||
class RandomAccessFile;
|
class RandomAccessFile;
|
||||||
class SequentialFile;
|
class SequentialFile;
|
||||||
|
@ -40,6 +44,11 @@ class Env {
|
||||||
// The result of Default() belongs to leveldb and must never be deleted.
|
// The result of Default() belongs to leveldb and must never be deleted.
|
||||||
static Env* Default();
|
static Env* Default();
|
||||||
|
|
||||||
|
// Riak specific: Shutdown background work threads and other objects
|
||||||
|
// to get clean environment for valgrind memory test. No restart supported
|
||||||
|
// after this call. Not thread safe.
|
||||||
|
static void Shutdown();
|
||||||
|
|
||||||
// Create a brand new sequentially-readable file with the specified name.
|
// Create a brand new sequentially-readable file with the specified name.
|
||||||
// On success, stores a pointer to the new file in *result and returns OK.
|
// On success, stores a pointer to the new file in *result and returns OK.
|
||||||
// On failure stores NULL in *result and returns non-OK. If the file does
|
// On failure stores NULL in *result and returns non-OK. If the file does
|
||||||
|
@ -67,22 +76,31 @@ class Env {
|
||||||
//
|
//
|
||||||
// The returned file will only be accessed by one thread at a time.
|
// The returned file will only be accessed by one thread at a time.
|
||||||
virtual Status NewWritableFile(const std::string& fname,
|
virtual Status NewWritableFile(const std::string& fname,
|
||||||
WritableFile** result) = 0;
|
WritableFile** result,
|
||||||
|
size_t map_size) = 0;
|
||||||
|
|
||||||
// Create an object that either appends to an existing file, or
|
// Riak specific:
|
||||||
// writes to a new file (if the file does not exist to begin with).
|
// Derived from NewWritableFile. One change: if the file exists,
|
||||||
// On success, stores a pointer to the new file in *result and
|
// move to the end of the file and continue writing.
|
||||||
// returns OK. On failure stores NULL in *result and returns
|
// new file. On success, stores a pointer to the open file in
|
||||||
// non-OK.
|
// *result and returns OK. On failure stores NULL in *result and
|
||||||
|
// returns non-OK.
|
||||||
//
|
//
|
||||||
// The returned file will only be accessed by one thread at a time.
|
// The returned file will only be accessed by one thread at a time.
|
||||||
//
|
|
||||||
// May return an IsNotSupportedError error if this Env does
|
|
||||||
// not allow appending to an existing file. Users of Env (including
|
|
||||||
// the leveldb implementation) must be prepared to deal with
|
|
||||||
// an Env that does not support appending.
|
|
||||||
virtual Status NewAppendableFile(const std::string& fname,
|
virtual Status NewAppendableFile(const std::string& fname,
|
||||||
WritableFile** result);
|
WritableFile** result,
|
||||||
|
size_t map_size) = 0;
|
||||||
|
|
||||||
|
// Riak specific:
|
||||||
|
// Allows for virtualized version of NewWritableFile that enables write
|
||||||
|
// and close operations to execute on background threads
|
||||||
|
// (where platform supported).
|
||||||
|
//
|
||||||
|
// The returned file will only be accessed by one thread at a time.
|
||||||
|
virtual Status NewWriteOnlyFile(const std::string& fname,
|
||||||
|
WritableFile** result,
|
||||||
|
size_t map_size)
|
||||||
|
{return(NewWritableFile(fname, result, map_size));};
|
||||||
|
|
||||||
// Returns true iff the named file exists.
|
// Returns true iff the named file exists.
|
||||||
virtual bool FileExists(const std::string& fname) = 0;
|
virtual bool FileExists(const std::string& fname) = 0;
|
||||||
|
@ -142,7 +160,7 @@ class Env {
|
||||||
|
|
||||||
// Start a new thread, invoking "function(arg)" within the new thread.
|
// Start a new thread, invoking "function(arg)" within the new thread.
|
||||||
// When "function(arg)" returns, the thread will be destroyed.
|
// When "function(arg)" returns, the thread will be destroyed.
|
||||||
virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
|
virtual pthread_t StartThread(void (*function)(void* arg), void* arg) = 0;
|
||||||
|
|
||||||
// *path is set to a temporary directory that can be used for testing. It may
|
// *path is set to a temporary directory that can be used for testing. It may
|
||||||
// or many not have just been created. The directory may or may not differ
|
// or many not have just been created. The directory may or may not differ
|
||||||
|
@ -157,9 +175,16 @@ class Env {
|
||||||
// useful for computing deltas of time.
|
// useful for computing deltas of time.
|
||||||
virtual uint64_t NowMicros() = 0;
|
virtual uint64_t NowMicros() = 0;
|
||||||
|
|
||||||
// Sleep/delay the thread for the prescribed number of micro-seconds.
|
// Sleep/delay the thread for the perscribed number of micro-seconds.
|
||||||
virtual void SleepForMicroseconds(int micros) = 0;
|
virtual void SleepForMicroseconds(int micros) = 0;
|
||||||
|
|
||||||
|
// Riak specific: Get object that is tracking various software counters
|
||||||
|
virtual PerformanceCounters * GetPerformanceCounters() {return(gPerfCounters);};
|
||||||
|
|
||||||
|
// Riak specific: Request size of recovery memory map, potentially using
|
||||||
|
// Options data for the decision. Default 2Mbyte is Google's original size.
|
||||||
|
virtual size_t RecoveryMmapSize(const struct Options *) const {return(2*1024*1024L);};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
Env(const Env&);
|
Env(const Env&);
|
||||||
|
@ -190,14 +215,6 @@ class SequentialFile {
|
||||||
//
|
//
|
||||||
// REQUIRES: External synchronization
|
// REQUIRES: External synchronization
|
||||||
virtual Status Skip(uint64_t n) = 0;
|
virtual Status Skip(uint64_t n) = 0;
|
||||||
|
|
||||||
// Get a name for the file, only for error reporting
|
|
||||||
virtual std::string GetName() const = 0;
|
|
||||||
|
|
||||||
private:
|
|
||||||
// No copying allowed
|
|
||||||
SequentialFile(const SequentialFile&);
|
|
||||||
void operator=(const SequentialFile&);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// A file abstraction for randomly reading the contents of a file.
|
// A file abstraction for randomly reading the contents of a file.
|
||||||
|
@ -218,13 +235,11 @@ class RandomAccessFile {
|
||||||
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
||||||
char* scratch) const = 0;
|
char* scratch) const = 0;
|
||||||
|
|
||||||
// Get a name for the file, only for error reporting
|
// Riak optimization: allows advising Linux page cache
|
||||||
virtual std::string GetName() const = 0;
|
virtual void SetForCompaction(uint64_t file_size) {};
|
||||||
|
|
||||||
private:
|
// Riak addition: size of this structure in bytes
|
||||||
// No copying allowed
|
virtual size_t ObjectSize() {return(sizeof(RandomAccessFile));};
|
||||||
RandomAccessFile(const RandomAccessFile&);
|
|
||||||
void operator=(const RandomAccessFile&);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// A file abstraction for sequential writing. The implementation
|
// A file abstraction for sequential writing. The implementation
|
||||||
|
@ -240,8 +255,10 @@ class WritableFile {
|
||||||
virtual Status Flush() = 0;
|
virtual Status Flush() = 0;
|
||||||
virtual Status Sync() = 0;
|
virtual Status Sync() = 0;
|
||||||
|
|
||||||
// Get a name for the file, only for error reporting
|
// Riak specific:
|
||||||
virtual std::string GetName() const = 0;
|
// Provide hint where key/value data ends and metadata starts
|
||||||
|
// in an .sst table file.
|
||||||
|
virtual void SetMetadataOffset(uint64_t) {};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
|
@ -249,12 +266,30 @@ class WritableFile {
|
||||||
void operator=(const WritableFile&);
|
void operator=(const WritableFile&);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// A file abstraction for sequential writing at end of existing file.
|
||||||
|
class AppendableFile: public WritableFile {
|
||||||
|
public:
|
||||||
|
AppendableFile() { }
|
||||||
|
virtual ~AppendableFile();
|
||||||
|
|
||||||
|
private:
|
||||||
|
// No copying allowed
|
||||||
|
AppendableFile(const AppendableFile&);
|
||||||
|
void operator=(const AppendableFile&);
|
||||||
|
};
|
||||||
|
|
||||||
// An interface for writing log messages.
|
// An interface for writing log messages.
|
||||||
class Logger {
|
class Logger {
|
||||||
public:
|
public:
|
||||||
Logger() { }
|
Logger() { }
|
||||||
virtual ~Logger();
|
virtual ~Logger();
|
||||||
|
|
||||||
|
// Riak specific function for hot backup.
|
||||||
|
// hot_backup.cc assumes that it can rotate the LOG file
|
||||||
|
// via standard Env routines if this function returns a
|
||||||
|
// non-zero value.
|
||||||
|
virtual long LogSize() {return(0);};
|
||||||
|
|
||||||
// Write an entry to the log file with the specified format.
|
// Write an entry to the log file with the specified format.
|
||||||
virtual void Logv(const char* format, va_list ap) = 0;
|
virtual void Logv(const char* format, va_list ap) = 0;
|
||||||
|
|
||||||
|
@ -310,11 +345,14 @@ class EnvWrapper : public Env {
|
||||||
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
|
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
|
||||||
return target_->NewRandomAccessFile(f, r);
|
return target_->NewRandomAccessFile(f, r);
|
||||||
}
|
}
|
||||||
Status NewWritableFile(const std::string& f, WritableFile** r) {
|
Status NewWritableFile(const std::string& f, WritableFile** r, size_t s=0) {
|
||||||
return target_->NewWritableFile(f, r);
|
return target_->NewWritableFile(f, r, s);
|
||||||
}
|
}
|
||||||
Status NewAppendableFile(const std::string& f, WritableFile** r) {
|
Status NewAppendableFile(const std::string& f, WritableFile** r, size_t s=0) {
|
||||||
return target_->NewAppendableFile(f, r);
|
return target_->NewAppendableFile(f, r, s);
|
||||||
|
}
|
||||||
|
Status NewWriteOnlyFile(const std::string& f, WritableFile** r, size_t s=0) {
|
||||||
|
return target_->NewWriteOnlyFile(f, r, s);
|
||||||
}
|
}
|
||||||
bool FileExists(const std::string& f) { return target_->FileExists(f); }
|
bool FileExists(const std::string& f) { return target_->FileExists(f); }
|
||||||
Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
|
Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
|
||||||
|
@ -336,7 +374,7 @@ class EnvWrapper : public Env {
|
||||||
void Schedule(void (*f)(void*), void* a) {
|
void Schedule(void (*f)(void*), void* a) {
|
||||||
return target_->Schedule(f, a);
|
return target_->Schedule(f, a);
|
||||||
}
|
}
|
||||||
void StartThread(void (*f)(void*), void* a) {
|
pthread_t StartThread(void (*f)(void*), void* a) {
|
||||||
return target_->StartThread(f, a);
|
return target_->StartThread(f, a);
|
||||||
}
|
}
|
||||||
virtual Status GetTestDirectory(std::string* path) {
|
virtual Status GetTestDirectory(std::string* path) {
|
||||||
|
@ -355,6 +393,12 @@ class EnvWrapper : public Env {
|
||||||
Env* target_;
|
Env* target_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Riak specific hack to allow runtime change
|
||||||
|
// of mapping size
|
||||||
|
extern volatile size_t gMapSize;
|
||||||
|
|
||||||
|
extern bool gFadviseWillNeed;
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_
|
#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_
|
||||||
|
|
135
src/leveldb/include/leveldb/expiry.h
Normal file
135
src/leveldb/include/leveldb/expiry.h
Normal file
|
@ -0,0 +1,135 @@
|
||||||
|
// -------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// expiry.h: background expiry management for Basho's modified leveldb
|
||||||
|
//
|
||||||
|
// Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// This file is provided to you under the Apache License,
|
||||||
|
// Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain
|
||||||
|
// a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing,
|
||||||
|
// software distributed under the License is distributed on an
|
||||||
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
// KIND, either express or implied. See the License for the
|
||||||
|
// specific language governing permissions and limitations
|
||||||
|
// under the License.
|
||||||
|
//
|
||||||
|
// -------------------------------------------------------------------
|
||||||
|
|
||||||
|
#ifndef EXPIRY_H
|
||||||
|
#define EXPIRY_H
|
||||||
|
|
||||||
|
#include <limits.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include "leveldb/env.h"
|
||||||
|
#include "leveldb/options.h"
|
||||||
|
#include "util/refobject_base.h"
|
||||||
|
|
||||||
|
namespace leveldb {
|
||||||
|
|
||||||
|
class Compaction;
|
||||||
|
class Logger;
|
||||||
|
struct ParsedInternalKey;
|
||||||
|
class Slice;
|
||||||
|
class SstCounters;
|
||||||
|
class Version;
|
||||||
|
class VersionEdit;
|
||||||
|
struct FileMetaData;
|
||||||
|
|
||||||
|
|
||||||
|
enum EleveldbRouterActions_t
|
||||||
|
{
|
||||||
|
eGetBucketProperties=1
|
||||||
|
}; // enum EleveldbRouterActions_t
|
||||||
|
|
||||||
|
|
||||||
|
typedef bool (* EleveldbRouter_t)(EleveldbRouterActions_t Action, int ParamCount, const void ** Params);
|
||||||
|
|
||||||
|
|
||||||
|
class ExpiryModule : public RefObjectBase
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
virtual ~ExpiryModule() {};
|
||||||
|
|
||||||
|
// Print expiry options to LOG file
|
||||||
|
virtual void Dump(Logger * log) const
|
||||||
|
{Log(log," Expiry: (none)");};
|
||||||
|
|
||||||
|
// Quick test to allow manifest logic and such know if
|
||||||
|
// extra expiry logic should be checked
|
||||||
|
virtual bool ExpiryActivated() const {return(false);};
|
||||||
|
|
||||||
|
// db/write_batch.cc MemTableInserter::Put() calls this.
|
||||||
|
// returns false on internal error
|
||||||
|
virtual bool MemTableInserterCallback(
|
||||||
|
const Slice & Key, // input: user's key about to be written
|
||||||
|
const Slice & Value, // input: user's value object
|
||||||
|
ValueType & ValType, // input/output: key type. call might change
|
||||||
|
ExpiryTimeMicros & Expiry) const // input/output: 0 or specific expiry. call might change
|
||||||
|
{return(true);};
|
||||||
|
|
||||||
|
// db/dbformat.cc KeyRetirement::operator() calls this.
|
||||||
|
// db/version_set.cc SaveValue() calls this too.
|
||||||
|
// returns true if key is expired, returns false if key not expired
|
||||||
|
virtual bool KeyRetirementCallback(
|
||||||
|
const ParsedInternalKey & Ikey) const
|
||||||
|
{return(false);};
|
||||||
|
|
||||||
|
// table/table_builder.cc TableBuilder::Add() calls this.
|
||||||
|
// returns false on internal error
|
||||||
|
virtual bool TableBuilderCallback(
|
||||||
|
const Slice & Key, // input: internal key
|
||||||
|
SstCounters & Counters) const // input/output: counters for new sst table
|
||||||
|
{return(true);};
|
||||||
|
|
||||||
|
// db/memtable.cc MemTable::Get() calls this.
|
||||||
|
// returns true if type/expiry is expired, returns false if not expired
|
||||||
|
virtual bool MemTableCallback(
|
||||||
|
const Slice & Key) const // input: leveldb internal key
|
||||||
|
{return(false);};
|
||||||
|
|
||||||
|
// db/version_set.cc VersionSet::Finalize() calls this if no
|
||||||
|
// other compaction selected for a level
|
||||||
|
// returns true if there is an expiry compaction eligible
|
||||||
|
virtual bool CompactionFinalizeCallback(
|
||||||
|
bool WantAll, // input: true - examine all expired files
|
||||||
|
const Version & Ver, // input: database state for examination
|
||||||
|
int Level, // input: level to review for expiry
|
||||||
|
VersionEdit * Edit) const // output: NULL or destination of delete list
|
||||||
|
{return(false);};
|
||||||
|
|
||||||
|
// yep, sometimes we want to expire this expiry module object.
|
||||||
|
// mostly for bucket level properties in Riak EE
|
||||||
|
virtual uint64_t ExpiryModuleExpiryMicros() {return(0);};
|
||||||
|
|
||||||
|
// Creates derived ExpiryModule object that matches compile time
|
||||||
|
// switch for open source or Basho enterprise edition features.
|
||||||
|
static ExpiryModule * CreateExpiryModule(EleveldbRouter_t Router);
|
||||||
|
|
||||||
|
// Cleans up global objects related to expiry
|
||||||
|
// switch for open source or Basho enterprise edition features.
|
||||||
|
static void ShutdownExpiryModule();
|
||||||
|
|
||||||
|
// Riak EE: stash a user created module with settings
|
||||||
|
virtual void NoteUserExpirySettings() {};
|
||||||
|
|
||||||
|
protected:
|
||||||
|
ExpiryModule() {};
|
||||||
|
|
||||||
|
private:
|
||||||
|
ExpiryModule(const ExpiryModule &);
|
||||||
|
ExpiryModule & operator=(const ExpiryModule &);
|
||||||
|
|
||||||
|
}; // ExpiryModule
|
||||||
|
|
||||||
|
|
||||||
|
typedef RefPtr<class ExpiryModule> ExpiryPtr_t;
|
||||||
|
|
||||||
|
} // namespace leveldb
|
||||||
|
|
||||||
|
#endif // ifndef
|
||||||
|
|
|
@ -23,9 +23,21 @@ namespace leveldb {
|
||||||
class Slice;
|
class Slice;
|
||||||
|
|
||||||
class FilterPolicy {
|
class FilterPolicy {
|
||||||
public:
|
protected:
|
||||||
|
mutable const FilterPolicy * m_Next; // used by FilterInventory
|
||||||
|
|
||||||
|
public:
|
||||||
|
FilterPolicy()
|
||||||
|
: m_Next(NULL)
|
||||||
|
{};
|
||||||
|
|
||||||
virtual ~FilterPolicy();
|
virtual ~FilterPolicy();
|
||||||
|
|
||||||
|
// list pointer accessors
|
||||||
|
const FilterPolicy * GetNext() const {return(m_Next);};
|
||||||
|
void SetNext(const FilterPolicy * Next) const {m_Next=Next;};
|
||||||
|
|
||||||
|
|
||||||
// Return the name of this policy. Note that if the filter encoding
|
// Return the name of this policy. Note that if the filter encoding
|
||||||
// changes in an incompatible way, the name returned by this method
|
// changes in an incompatible way, the name returned by this method
|
||||||
// must be changed. Otherwise, old incompatible filters may be
|
// must be changed. Otherwise, old incompatible filters may be
|
||||||
|
@ -47,6 +59,7 @@ class FilterPolicy {
|
||||||
// This method may return true or false if the key was not on the
|
// This method may return true or false if the key was not on the
|
||||||
// list, but it should aim to return false with a high probability.
|
// list, but it should aim to return false with a high probability.
|
||||||
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
|
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Return a new filter policy that uses a bloom filter with approximately
|
// Return a new filter policy that uses a bloom filter with approximately
|
||||||
|
@ -64,7 +77,29 @@ class FilterPolicy {
|
||||||
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
|
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
|
||||||
// trailing spaces in keys.
|
// trailing spaces in keys.
|
||||||
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
|
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
|
||||||
|
extern const FilterPolicy* NewBloomFilterPolicy2(int bits_per_key);
|
||||||
|
|
||||||
}
|
|
||||||
|
class FilterInventory
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
// MUST be static variable so that it initializes before any static objects
|
||||||
|
// have their initializers called
|
||||||
|
static const FilterPolicy * ListHead;
|
||||||
|
|
||||||
|
// This might be called prior to singleton FilterInventory object
|
||||||
|
// being initialized. NOT THREAD SAFE.
|
||||||
|
static void AddFilterToInventory(const FilterPolicy * Filter)
|
||||||
|
{
|
||||||
|
if (NULL!=Filter)
|
||||||
|
{
|
||||||
|
Filter->SetNext(ListHead);
|
||||||
|
ListHead=Filter;
|
||||||
|
} // if
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}; // class FilterInventory
|
||||||
|
|
||||||
|
} // namespace leveldb
|
||||||
|
|
||||||
#endif // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
|
#endif // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
|
|
||||||
#include "leveldb/slice.h"
|
#include "leveldb/slice.h"
|
||||||
#include "leveldb/status.h"
|
#include "leveldb/status.h"
|
||||||
|
#include "leveldb/options.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
@ -37,7 +38,7 @@ class Iterator {
|
||||||
// Valid() after this call iff the source is not empty.
|
// Valid() after this call iff the source is not empty.
|
||||||
virtual void SeekToLast() = 0;
|
virtual void SeekToLast() = 0;
|
||||||
|
|
||||||
// Position at the first key in the source that is at or past target.
|
// Position at the first key in the source that at or past target
|
||||||
// The iterator is Valid() after this call iff the source contains
|
// The iterator is Valid() after this call iff the source contains
|
||||||
// an entry that comes at or past target.
|
// an entry that comes at or past target.
|
||||||
virtual void Seek(const Slice& target) = 0;
|
virtual void Seek(const Slice& target) = 0;
|
||||||
|
@ -61,9 +62,13 @@ class Iterator {
|
||||||
// Return the value for the current entry. The underlying storage for
|
// Return the value for the current entry. The underlying storage for
|
||||||
// the returned slice is valid only until the next modification of
|
// the returned slice is valid only until the next modification of
|
||||||
// the iterator.
|
// the iterator.
|
||||||
// REQUIRES: Valid()
|
// REQUIRES: !AtEnd() && !AtStart()
|
||||||
virtual Slice value() const = 0;
|
virtual Slice value() const = 0;
|
||||||
|
|
||||||
|
// Riak specific: if a database iterator, returns key meta data
|
||||||
|
// REQUIRES: Valid()
|
||||||
|
virtual KeyMetaData & keymetadata() const {return(keymetadata_); };
|
||||||
|
|
||||||
// If an error has occurred, return it. Else return an ok status.
|
// If an error has occurred, return it. Else return an ok status.
|
||||||
virtual Status status() const = 0;
|
virtual Status status() const = 0;
|
||||||
|
|
||||||
|
@ -75,6 +80,10 @@ class Iterator {
|
||||||
typedef void (*CleanupFunction)(void* arg1, void* arg2);
|
typedef void (*CleanupFunction)(void* arg1, void* arg2);
|
||||||
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
|
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
// mutable so reusable by derived classes
|
||||||
|
mutable KeyMetaData keymetadata_;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct Cleanup {
|
struct Cleanup {
|
||||||
CleanupFunction function;
|
CleanupFunction function;
|
||||||
|
|
|
@ -6,15 +6,23 @@
|
||||||
#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
|
#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
class Cache;
|
class Cache;
|
||||||
class Comparator;
|
class Comparator;
|
||||||
class Env;
|
class Env;
|
||||||
|
class ExpiryModule;
|
||||||
class FilterPolicy;
|
class FilterPolicy;
|
||||||
class Logger;
|
class Logger;
|
||||||
class Snapshot;
|
class Snapshot;
|
||||||
|
namespace log
|
||||||
|
{
|
||||||
|
class Writer;
|
||||||
|
} // namespace log
|
||||||
|
|
||||||
// DB contents are stored in a set of blocks, each of which holds a
|
// DB contents are stored in a set of blocks, each of which holds a
|
||||||
// sequence of key,value pairs. Each block may be compressed before
|
// sequence of key,value pairs. Each block may be compressed before
|
||||||
|
@ -24,9 +32,34 @@ enum CompressionType {
|
||||||
// NOTE: do not change the values of existing entries, as these are
|
// NOTE: do not change the values of existing entries, as these are
|
||||||
// part of the persistent format on disk.
|
// part of the persistent format on disk.
|
||||||
kNoCompression = 0x0,
|
kNoCompression = 0x0,
|
||||||
kSnappyCompression = 0x1
|
kSnappyCompression = 0x1,
|
||||||
|
kLZ4Compression = 0x2,
|
||||||
|
kNoCompressionAutomated = 0x3
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Originally located in db/dbformat.h. Now available publicly.
|
||||||
|
// Value types encoded as the last component of internal keys.
|
||||||
|
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
|
||||||
|
// data structures.
|
||||||
|
enum ValueType {
|
||||||
|
kTypeDeletion = 0x0,
|
||||||
|
kTypeValue = 0x1,
|
||||||
|
kTypeValueWriteTime = 0x2,
|
||||||
|
kTypeValueExplicitExpiry = 0x3
|
||||||
|
};
|
||||||
|
|
||||||
|
// Originally located in db/dbformat.h
|
||||||
|
typedef uint64_t SequenceNumber;
|
||||||
|
typedef uint64_t ExpiryTimeMicros;
|
||||||
|
|
||||||
|
}; // namespace leveldb
|
||||||
|
|
||||||
|
//
|
||||||
|
// must follow ValueType declaration
|
||||||
|
#include "leveldb/expiry.h"
|
||||||
|
|
||||||
|
namespace leveldb {
|
||||||
|
|
||||||
// Options to control the behavior of a database (passed to DB::Open)
|
// Options to control the behavior of a database (passed to DB::Open)
|
||||||
struct Options {
|
struct Options {
|
||||||
// -------------------
|
// -------------------
|
||||||
|
@ -56,6 +89,14 @@ struct Options {
|
||||||
// Default: false
|
// Default: false
|
||||||
bool paranoid_checks;
|
bool paranoid_checks;
|
||||||
|
|
||||||
|
// Riak specific: this variable replaces paranoid_checks at one
|
||||||
|
// place in the code. This variable alone controls whether or not
|
||||||
|
// compaction read operations check CRC values. Riak needs
|
||||||
|
// the compaction CRC check, but not other paranoid_checks ... so
|
||||||
|
// this independent control.
|
||||||
|
// Default: true
|
||||||
|
bool verify_compactions;
|
||||||
|
|
||||||
// Use the specified object to interact with the environment,
|
// Use the specified object to interact with the environment,
|
||||||
// e.g. to read/write files, schedule background work, etc.
|
// e.g. to read/write files, schedule background work, etc.
|
||||||
// Default: Env::Default()
|
// Default: Env::Default()
|
||||||
|
@ -85,7 +126,7 @@ struct Options {
|
||||||
// Number of open files that can be used by the DB. You may need to
|
// Number of open files that can be used by the DB. You may need to
|
||||||
// increase this if your database has a large working set (budget
|
// increase this if your database has a large working set (budget
|
||||||
// one open file per 2MB of working set).
|
// one open file per 2MB of working set).
|
||||||
//
|
// RIAK: NO LONGER USED
|
||||||
// Default: 1000
|
// Default: 1000
|
||||||
int max_open_files;
|
int max_open_files;
|
||||||
|
|
||||||
|
@ -105,6 +146,15 @@ struct Options {
|
||||||
// Default: 4K
|
// Default: 4K
|
||||||
size_t block_size;
|
size_t block_size;
|
||||||
|
|
||||||
|
// Riak specific: non-zero value activates code to automatically
|
||||||
|
// increase block_size as needed to ensure maximum number of files
|
||||||
|
// are available in the file cache. The value indicates how many
|
||||||
|
// incremental increases to use between the original block_size
|
||||||
|
// and largest, reasonable block_size.
|
||||||
|
//
|
||||||
|
// Default: 16
|
||||||
|
int block_size_steps;
|
||||||
|
|
||||||
// Number of keys between restart points for delta encoding of keys.
|
// Number of keys between restart points for delta encoding of keys.
|
||||||
// This parameter can be changed dynamically. Most clients should
|
// This parameter can be changed dynamically. Most clients should
|
||||||
// leave this parameter alone.
|
// leave this parameter alone.
|
||||||
|
@ -112,18 +162,6 @@ struct Options {
|
||||||
// Default: 16
|
// Default: 16
|
||||||
int block_restart_interval;
|
int block_restart_interval;
|
||||||
|
|
||||||
// Leveldb will write up to this amount of bytes to a file before
|
|
||||||
// switching to a new one.
|
|
||||||
// Most clients should leave this parameter alone. However if your
|
|
||||||
// filesystem is more efficient with larger files, you could
|
|
||||||
// consider increasing the value. The downside will be longer
|
|
||||||
// compactions and hence longer latency/performance hiccups.
|
|
||||||
// Another reason to increase this parameter might be when you are
|
|
||||||
// initially populating a large database.
|
|
||||||
//
|
|
||||||
// Default: 2MB
|
|
||||||
size_t max_file_size;
|
|
||||||
|
|
||||||
// Compress blocks using the specified compression algorithm. This
|
// Compress blocks using the specified compression algorithm. This
|
||||||
// parameter can be changed dynamically.
|
// parameter can be changed dynamically.
|
||||||
//
|
//
|
||||||
|
@ -140,12 +178,6 @@ struct Options {
|
||||||
// efficiently detect that and will switch to uncompressed mode.
|
// efficiently detect that and will switch to uncompressed mode.
|
||||||
CompressionType compression;
|
CompressionType compression;
|
||||||
|
|
||||||
// EXPERIMENTAL: If true, append to existing MANIFEST and log files
|
|
||||||
// when a database is opened. This can significantly speed up open.
|
|
||||||
//
|
|
||||||
// Default: currently false, but may become true later.
|
|
||||||
bool reuse_logs;
|
|
||||||
|
|
||||||
// If non-NULL, use the specified filter policy to reduce disk reads.
|
// If non-NULL, use the specified filter policy to reduce disk reads.
|
||||||
// Many applications will benefit from passing the result of
|
// Many applications will benefit from passing the result of
|
||||||
// NewBloomFilterPolicy() here.
|
// NewBloomFilterPolicy() here.
|
||||||
|
@ -153,8 +185,84 @@ struct Options {
|
||||||
// Default: NULL
|
// Default: NULL
|
||||||
const FilterPolicy* filter_policy;
|
const FilterPolicy* filter_policy;
|
||||||
|
|
||||||
|
// Riak specific flag used to indicate when database is open
|
||||||
|
// as part of a Repair operation. Default is false
|
||||||
|
bool is_repair;
|
||||||
|
|
||||||
|
// Riak specific flag to mark Riak internal database versus
|
||||||
|
// user database. (User database gets larger cache resources.)
|
||||||
|
bool is_internal_db;
|
||||||
|
|
||||||
|
// Riak replacement for max_open_files and block_cache. This is
|
||||||
|
// TOTAL memory to be used by leveldb across ALL DATABASES.
|
||||||
|
// Most recent value seen upon database open, wins. Zero for default.
|
||||||
|
uint64_t total_leveldb_mem;
|
||||||
|
|
||||||
|
// Riak specific option specifying block cache space that cannot
|
||||||
|
// be released for page cache use. The space may still be
|
||||||
|
// released for file cache.
|
||||||
|
uint64_t block_cache_threshold;
|
||||||
|
|
||||||
|
// Riak option to override most memory modeling and create
|
||||||
|
// smaller memory footprint for developers. Helps when
|
||||||
|
// running large number of databases and multiple VMs. Do
|
||||||
|
// NOT use this option if making performance measurements.
|
||||||
|
// Default: false
|
||||||
|
bool limited_developer_mem;
|
||||||
|
|
||||||
|
// The size of each mmapped file; choose 0 for the default (20M)
|
||||||
|
uint64_t mmap_size;
|
||||||
|
|
||||||
|
// Riak option to adjust aggressive delete behavior.
|
||||||
|
// - zero disables aggressive delete
|
||||||
|
// - positive value indicates how many deletes must exist
|
||||||
|
// in a file for it to be compacted due to deletes
|
||||||
|
uint64_t delete_threshold;
|
||||||
|
|
||||||
|
// Riak specific flag used to indicate when fadvise() management
|
||||||
|
// should default to WILLNEED instead of DONTNEED. Default is false
|
||||||
|
bool fadvise_willneed;
|
||||||
|
|
||||||
|
// *****
|
||||||
|
// Riak specific options for establishing two tiers of disk arrays.
|
||||||
|
// All three tier options must be valid for the option to activate.
|
||||||
|
// When active, leveldb directories are constructed using either
|
||||||
|
// the fast or slow prefix followed by the database name given
|
||||||
|
// in the DB::Open call. (a synonym for "prefix" is "mount")
|
||||||
|
// *****
|
||||||
|
|
||||||
|
// Riak specific option setting the level number at which the
|
||||||
|
// "tiered_slow_prefix" should be used. Default is zero which
|
||||||
|
// disables the option. Valid values are 1 to 6. 3 or 4 recommended.
|
||||||
|
unsigned tiered_slow_level;
|
||||||
|
|
||||||
|
// Riak specific option with the path prefix used for "fast" disk
|
||||||
|
// array. levels 0 to tiered_slow_level-1 use this path prefix
|
||||||
|
std::string tiered_fast_prefix;
|
||||||
|
|
||||||
|
// Riak specific option with the path prefix used for "slow" disk
|
||||||
|
// array. levels tiered_slow_level through 6 use this path prefix
|
||||||
|
std::string tiered_slow_prefix;
|
||||||
|
|
||||||
|
// Riak specific option that writes a list of open table files
|
||||||
|
// to disk on close then automatically opens same files again
|
||||||
|
// upon restart.
|
||||||
|
bool cache_object_warming;
|
||||||
|
|
||||||
|
// Riak specific object that defines expiry policy for data
|
||||||
|
// written to leveldb.
|
||||||
|
ExpiryPtr_t expiry_module;
|
||||||
|
|
||||||
// Create an Options object with default values for all fields.
|
// Create an Options object with default values for all fields.
|
||||||
Options();
|
Options();
|
||||||
|
|
||||||
|
void Dump(Logger * log) const;
|
||||||
|
|
||||||
|
bool ExpiryActivated() const
|
||||||
|
{return(NULL!=expiry_module.get() && expiry_module->ExpiryActivated());};
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Options that control read operations
|
// Options that control read operations
|
||||||
|
@ -171,16 +279,57 @@ struct ReadOptions {
|
||||||
|
|
||||||
// If "snapshot" is non-NULL, read as of the supplied snapshot
|
// If "snapshot" is non-NULL, read as of the supplied snapshot
|
||||||
// (which must belong to the DB that is being read and which must
|
// (which must belong to the DB that is being read and which must
|
||||||
// not have been released). If "snapshot" is NULL, use an implicit
|
// not have been released). If "snapshot" is NULL, use an impliicit
|
||||||
// snapshot of the state at the beginning of this read operation.
|
// snapshot of the state at the beginning of this read operation.
|
||||||
// Default: NULL
|
// Default: NULL
|
||||||
const Snapshot* snapshot;
|
const Snapshot* snapshot;
|
||||||
|
|
||||||
|
// Riak specific flag, currently used within Erlang adaptor
|
||||||
|
// to enable automatic delete and new of fresh snapshot
|
||||||
|
// and database iterator objects for long running iterations
|
||||||
|
// (only supports iterator NEXT operations).
|
||||||
|
// Default: false
|
||||||
|
bool iterator_refresh;
|
||||||
|
|
||||||
ReadOptions()
|
ReadOptions()
|
||||||
: verify_checksums(false),
|
: verify_checksums(true),
|
||||||
fill_cache(true),
|
fill_cache(true),
|
||||||
snapshot(NULL) {
|
snapshot(NULL),
|
||||||
|
iterator_refresh(false),
|
||||||
|
is_compaction(false),
|
||||||
|
env(NULL),
|
||||||
|
info_log(NULL)
|
||||||
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// accessors to the private data
|
||||||
|
bool IsCompaction() const {return(is_compaction);};
|
||||||
|
|
||||||
|
Logger * GetInfoLog() const {return(info_log);};
|
||||||
|
|
||||||
|
const std::string & GetDBName() const {return(dbname);};
|
||||||
|
|
||||||
|
Env * GetEnv() const {return(env);};
|
||||||
|
|
||||||
|
// The items below are internal options, not for external manipulation.
|
||||||
|
// They are populated by VersionSet::MakeInputIterator only during compaction operations
|
||||||
|
private:
|
||||||
|
friend class VersionSet;
|
||||||
|
|
||||||
|
// true when used on background compaction
|
||||||
|
bool is_compaction;
|
||||||
|
|
||||||
|
// Database name for potential creation of bad blocks file
|
||||||
|
std::string dbname;
|
||||||
|
|
||||||
|
// Needed for file operations if creating bad blocks file
|
||||||
|
Env * env;
|
||||||
|
|
||||||
|
// Open log file for error notifications
|
||||||
|
// Only valid when is_compaction==true
|
||||||
|
Logger* info_log;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Options that control write operations
|
// Options that control write operations
|
||||||
|
@ -208,6 +357,22 @@ struct WriteOptions {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// Riak specific object that can return key metadata
|
||||||
|
// during get or iterate operation
|
||||||
|
struct KeyMetaData
|
||||||
|
{
|
||||||
|
ValueType m_Type; // see above
|
||||||
|
SequenceNumber m_Sequence; // output only, leveldb internal
|
||||||
|
ExpiryTimeMicros m_Expiry; // microseconds since Epoch, UTC
|
||||||
|
|
||||||
|
KeyMetaData()
|
||||||
|
: m_Type(kTypeValue), m_Sequence(0), m_Expiry(0)
|
||||||
|
{};
|
||||||
|
}; // struct KeyMetaData
|
||||||
|
|
||||||
|
const char * CompileOptionsString();
|
||||||
|
|
||||||
} // namespace leveldb
|
} // namespace leveldb
|
||||||
|
|
||||||
#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
|
#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
|
||||||
|
|
329
src/leveldb/include/leveldb/perf_count.h
Normal file
329
src/leveldb/include/leveldb/perf_count.h
Normal file
|
@ -0,0 +1,329 @@
|
||||||
|
// -------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// perf_count.h: performance counters LevelDB
|
||||||
|
//
|
||||||
|
// Copyright (c) 2012-2016 Basho Technologies, Inc. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// This file is provided to you under the Apache License,
|
||||||
|
// Version 2.0 (the "License"); you may not use this file
|
||||||
|
// except in compliance with the License. You may obtain
|
||||||
|
// a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing,
|
||||||
|
// software distributed under the License is distributed on an
|
||||||
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
// KIND, either express or implied. See the License for the
|
||||||
|
// specific language governing permissions and limitations
|
||||||
|
// under the License.
|
||||||
|
//
|
||||||
|
// -------------------------------------------------------------------
|
||||||
|
|
||||||
|
#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
|
||||||
|
#define STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string>
|
||||||
|
#include "leveldb/status.h"
|
||||||
|
|
||||||
|
namespace leveldb {
|
||||||
|
|
||||||
|
enum SstCountEnum
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// array index values/names
|
||||||
|
//
|
||||||
|
eSstCountKeys=0, //!< how many keys in this sst
|
||||||
|
eSstCountBlocks=1, //!< how many blocks in this sst
|
||||||
|
eSstCountCompressAborted=2,//!< how many blocks attempted compression and aborted use
|
||||||
|
eSstCountKeySize=3, //!< byte count of all keys
|
||||||
|
eSstCountValueSize=4, //!< byte count of all values
|
||||||
|
eSstCountBlockSize=5, //!< byte count of all blocks (pre-compression)
|
||||||
|
eSstCountBlockWriteSize=6, //!< post-compression size, or BlockSize if no compression
|
||||||
|
eSstCountIndexKeys=7, //!< how many keys in the index block
|
||||||
|
eSstCountKeyLargest=8, //!< largest key in sst
|
||||||
|
eSstCountKeySmallest=9, //!< smallest key in sst
|
||||||
|
eSstCountValueLargest=10, //!< largest value in sst
|
||||||
|
eSstCountValueSmallest=11, //!< smallest value in sst
|
||||||
|
eSstCountDeleteKey=12, //!< tombstone count
|
||||||
|
eSstCountBlockSizeUsed=13, //!< Options::block_size used with this file
|
||||||
|
eSstCountUserDataSize=14, //!< post-compression size of non-metadata (user keys/values/block overhead)
|
||||||
|
eSstCountExpiry1=15, //!< undocumented expiry counter 1
|
||||||
|
eSstCountExpiry2=16, //!< undocumented expiry counter 2
|
||||||
|
eSstCountExpiry3=17, //!< undocumented expiry counter 3
|
||||||
|
eSstCountSequence=18, //!< highest sequence number in file
|
||||||
|
|
||||||
|
// must follow last index name to represent size of array
|
||||||
|
eSstCountEnumSize, //!< size of the array described by the enum values
|
||||||
|
|
||||||
|
eSstCountVersion=1
|
||||||
|
|
||||||
|
}; // enum SstCountEnum
|
||||||
|
|
||||||
|
|
||||||
|
class SstCounters
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
bool m_IsReadOnly; //!< set when data decoded from a file
|
||||||
|
uint32_t m_Version; //!< object revision identification
|
||||||
|
uint32_t m_CounterSize; //!< number of objects in m_Counter
|
||||||
|
|
||||||
|
uint64_t m_Counter[eSstCountEnumSize];
|
||||||
|
|
||||||
|
public:
|
||||||
|
// constructors / destructor
|
||||||
|
SstCounters();
|
||||||
|
|
||||||
|
// Put data into disk form
|
||||||
|
void EncodeTo(std::string & Dst) const;
|
||||||
|
|
||||||
|
// Populate member data from prior EncodeTo block
|
||||||
|
Status DecodeFrom(const Slice& src);
|
||||||
|
|
||||||
|
// increment the counter
|
||||||
|
uint64_t Inc(unsigned Index);
|
||||||
|
|
||||||
|
// add value to the counter
|
||||||
|
uint64_t Add(unsigned Index, uint64_t Amount);
|
||||||
|
|
||||||
|
// return value of a counter
|
||||||
|
uint64_t Value(unsigned Index) const;
|
||||||
|
|
||||||
|
// set a value
|
||||||
|
void Set(unsigned Index, uint64_t);
|
||||||
|
|
||||||
|
// return number of counters
|
||||||
|
uint32_t Size() const {return(m_CounterSize);};
|
||||||
|
|
||||||
|
// printf all values
|
||||||
|
void Dump() const;
|
||||||
|
|
||||||
|
}; // class SstCounters
|
||||||
|
|
||||||
|
|
||||||
|
extern struct PerformanceCounters * gPerfCounters;
|
||||||
|
|
||||||
|
|
||||||
|
enum PerformanceCountersEnum
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// array index values/names
|
||||||
|
// (enum explicitly numbered to allow future edits / moves / inserts)
|
||||||
|
//
|
||||||
|
ePerfROFileOpen=0, //!< PosixMmapReadableFile open
|
||||||
|
ePerfROFileClose=1, //!< closed
|
||||||
|
ePerfROFileUnmap=2, //!< unmap without close
|
||||||
|
|
||||||
|
ePerfRWFileOpen=3, //!< PosixMmapFile open
|
||||||
|
ePerfRWFileClose=4, //!< closed
|
||||||
|
ePerfRWFileUnmap=5, //!< unmap without close
|
||||||
|
|
||||||
|
ePerfApiOpen=6, //!< Count of DB::Open completions
|
||||||
|
ePerfApiGet=7, //!< Count of DBImpl::Get completions
|
||||||
|
ePerfApiWrite=8, //!< Count of DBImpl::Write completions
|
||||||
|
|
||||||
|
ePerfWriteSleep=9, //!< DBImpl::MakeRoomForWrite called sleep
|
||||||
|
ePerfWriteWaitImm=10, //!< DBImpl::MakeRoomForWrite called Wait on Imm compact
|
||||||
|
ePerfWriteWaitLevel0=11,//!< DBImpl::MakeRoomForWrite called Wait on Level0 compact
|
||||||
|
ePerfWriteNewMem=12, //!< DBImpl::MakeRoomForWrite created new memory log
|
||||||
|
ePerfWriteError=13, //!< DBImpl::MakeRoomForWrite saw bg_error_
|
||||||
|
ePerfWriteNoWait=14, //!< DBImpl::MakeRoomForWrite took no action
|
||||||
|
|
||||||
|
ePerfGetMem=15, //!< DBImpl::Get read from memory log
|
||||||
|
ePerfGetImm=16, //!< DBImpl::Get read from previous memory log
|
||||||
|
ePerfGetVersion=17, //!< DBImpl::Get read from Version object
|
||||||
|
|
||||||
|
// code ASSUMES the levels are in numerical order,
|
||||||
|
// i.e. based off of ePerfSearchLevel0
|
||||||
|
ePerfSearchLevel0=18, //!< Version::Get read searched one or more files here
|
||||||
|
ePerfSearchLevel1=19, //!< Version::Get read searched one or more files here
|
||||||
|
ePerfSearchLevel2=20, //!< Version::Get read searched one or more files here
|
||||||
|
ePerfSearchLevel3=21, //!< Version::Get read searched one or more files here
|
||||||
|
ePerfSearchLevel4=22, //!< Version::Get read searched one or more files here
|
||||||
|
ePerfSearchLevel5=23, //!< Version::Get read searched one or more files here
|
||||||
|
ePerfSearchLevel6=24, //!< Version::Get read searched one or more files here
|
||||||
|
|
||||||
|
ePerfTableCached=25, //!< TableCache::FindTable found table in cache
|
||||||
|
ePerfTableOpened=26, //!< TableCache::FindTable had to open table file
|
||||||
|
ePerfTableGet=27, //!< TableCache::Get used to retrieve a key
|
||||||
|
|
||||||
|
ePerfBGCloseUnmap=28, //!< PosixEnv::BGThread started Unmap/Close job
|
||||||
|
ePerfBGCompactImm=29, //!< PosixEnv::BGThread started compaction of Imm
|
||||||
|
ePerfBGNormal=30, //!< PosixEnv::BGThread started normal compaction job
|
||||||
|
ePerfBGCompactLevel0=31,//!< PosixEnv::BGThread started compaction of Level0
|
||||||
|
|
||||||
|
ePerfBlockFiltered=32, //!< Table::BlockReader search stopped due to filter
|
||||||
|
ePerfBlockFilterFalse=33,//!< Table::BlockReader gave a false positive for match
|
||||||
|
ePerfBlockCached=34, //!< Table::BlockReader found block in cache
|
||||||
|
ePerfBlockRead=35, //!< Table::BlockReader read block from disk
|
||||||
|
ePerfBlockFilterRead=36,//!< Table::ReadMeta filter loaded from file
|
||||||
|
ePerfBlockValidGet=37, //!< Table::InternalGet has valid iterator
|
||||||
|
|
||||||
|
ePerfDebug0=38, //!< Developer debug counters, moveable
|
||||||
|
ePerfDebug1=39, //!< Developer debug counters, moveable
|
||||||
|
ePerfDebug2=40, //!< Developer debug counters, moveable
|
||||||
|
ePerfDebug3=41, //!< Developer debug counters, moveable
|
||||||
|
ePerfDebug4=42, //!< Developer debug counters, moveable
|
||||||
|
|
||||||
|
ePerfReadBlockError=43, //!< crc or compression error in ReadBlock (format.cc)
|
||||||
|
|
||||||
|
ePerfIterNew=44, //!< Count of DBImpl::NewDBIterator calls
|
||||||
|
ePerfIterNext=45, //!< Count of DBIter::Next calls
|
||||||
|
ePerfIterPrev=46, //!< Count of DBIter::Prev calls
|
||||||
|
ePerfIterSeek=47, //!< Count of DBIter::Seek calls
|
||||||
|
ePerfIterSeekFirst=48, //!< Count of DBIter::SeekFirst calls
|
||||||
|
ePerfIterSeekLast=49, //!< Count of DBIter::SeekLast calls
|
||||||
|
ePerfIterDelete=50, //!< Count of DBIter::~DBIter
|
||||||
|
|
||||||
|
ePerfElevelDirect=51, //!< eleveldb's FindWaitingThread went direct to thread
|
||||||
|
ePerfElevelQueued=52, //!< eleveldb's FindWaitingThread queued work item
|
||||||
|
ePerfElevelDequeued=53, //!< eleveldb's worker took item from backlog queue
|
||||||
|
|
||||||
|
ePerfElevelRefCreate=54,//!< eleveldb RefObject constructed
|
||||||
|
ePerfElevelRefDelete=55,//!< eleveldb RefObject destructed
|
||||||
|
|
||||||
|
ePerfThrottleGauge=56, //!< current throttle value
|
||||||
|
ePerfThrottleCounter=57,//!< running throttle by seconds
|
||||||
|
|
||||||
|
ePerfThrottleMicros0=58,//!< level 0 micros spent compacting
|
||||||
|
ePerfThrottleKeys0=59, //!< level 0 keys processed
|
||||||
|
ePerfThrottleBacklog0=60,//!< backlog at time of posting (level0)
|
||||||
|
ePerfThrottleCompacts0=61,//!< number of level 0 compactions
|
||||||
|
|
||||||
|
ePerfThrottleMicros1=62,//!< level 1+ micros spent compacting
|
||||||
|
ePerfThrottleKeys1=63, //!< level 1+ keys processed
|
||||||
|
ePerfThrottleBacklog1=64,//!< backlog at time of posting (level1+)
|
||||||
|
ePerfThrottleCompacts1=65,//!< number of level 1+ compactions
|
||||||
|
|
||||||
|
ePerfBGWriteError=66, //!< error in write/close, see syslog
|
||||||
|
|
||||||
|
ePerfThrottleWait=67, //!< milliseconds of throttle wait
|
||||||
|
ePerfThreadError=68, //!< system error on thread related call, no LOG access
|
||||||
|
|
||||||
|
ePerfBGImmDirect=69, //!< count Imm compactions happened directly
|
||||||
|
ePerfBGImmQueued=70, //!< count Imm compactions placed on queue
|
||||||
|
ePerfBGImmDequeued=71, //!< count Imm compactions removed from queue
|
||||||
|
ePerfBGImmWeighted=72, //!< total microseconds item spent on queue
|
||||||
|
|
||||||
|
ePerfBGUnmapDirect=73, //!< count Unmap operations happened directly
|
||||||
|
ePerfBGUnmapQueued=74, //!< count Unmap operations placed on queue
|
||||||
|
ePerfBGUnmapDequeued=75,//!< count Unmap operations removed from queue
|
||||||
|
ePerfBGUnmapWeighted=76,//!< total microseconds item spent on queue
|
||||||
|
|
||||||
|
ePerfBGLevel0Direct=77, //!< count Level0 compactions happened directly
|
||||||
|
ePerfBGLevel0Queued=78, //!< count Level0 compactions placed on queue
|
||||||
|
ePerfBGLevel0Dequeued=79,//!< count Level0 compactions removed from queue
|
||||||
|
ePerfBGLevel0Weighted=80,//!< total microseconds item spent on queue
|
||||||
|
|
||||||
|
ePerfBGCompactDirect=81, //!< count generic compactions happened directly
|
||||||
|
ePerfBGCompactQueued=82, //!< count generic compactions placed on queue
|
||||||
|
ePerfBGCompactDequeued=83,//!< count generic compactions removed from queue
|
||||||
|
ePerfBGCompactWeighted=84,//!< total microseconds item spent on queue
|
||||||
|
|
||||||
|
ePerfFileCacheInsert=85, //!< total bytes inserted into file cache
|
||||||
|
ePerfFileCacheRemove=86, //!< total bytes removed from file cache
|
||||||
|
|
||||||
|
ePerfBlockCacheInsert=87, //!< total bytes inserted into block cache
|
||||||
|
ePerfBlockCacheRemove=88, //!< total bytes removed from block cache
|
||||||
|
|
||||||
|
ePerfApiDelete=89, //!< Count of DB::Delete
|
||||||
|
|
||||||
|
ePerfBGMove=90, //!< compaction was a successful move
|
||||||
|
ePerfBGMoveFail=91, //!< compaction move failed, regular compaction attempted
|
||||||
|
|
||||||
|
ePerfThrottleUnadjusted=92,//!< current unadjusted throttle gauge
|
||||||
|
|
||||||
|
// this one was added to the other ePerfElevelXxx counters above when we backported HotThreadPool to eleveldb
|
||||||
|
ePerfElevelWeighted=93, //!< total microseconds item spent on queue
|
||||||
|
|
||||||
|
ePerfExpiredKeys=94, //!< key physically removed because it expired
|
||||||
|
ePerfExpiredFiles=95, //!< entire file removed because all keys expired
|
||||||
|
|
||||||
|
ePerfSyslogWrite=96, //!< logged message to syslog
|
||||||
|
ePerfBackupStarted=97, //!< hot backup initiated
|
||||||
|
ePerfBackupError=98, //!< hot backup had an error
|
||||||
|
|
||||||
|
ePerfPropCacheHit=99, //!< property cache had data
|
||||||
|
ePerfPropCacheMiss=100, //!< property cache had to look up data
|
||||||
|
ePerfPropCacheError=101, //!< no property cache entry built/located
|
||||||
|
|
||||||
|
// must follow last index name to represent size of array
|
||||||
|
// (ASSUMES previous enum is highest value)
|
||||||
|
ePerfCountEnumSize, //!< size of the array described by the enum values
|
||||||
|
|
||||||
|
ePerfVersion=1, //!< structure versioning
|
||||||
|
ePerfKey=41207 //!< random number as shared memory identifier
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct PerfCounterAttributes
|
||||||
|
{
|
||||||
|
const char * m_PerfCounterName; //!< text description
|
||||||
|
const bool m_PerfDiscretionary; //!< true if ok to disable
|
||||||
|
}; // PerfCounterAttributes
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// Do NOT use virtual functions. This structure will be aligned at different
|
||||||
|
// locations in multiple processes. Things can get messy with virtuals.
|
||||||
|
|
||||||
|
struct PerformanceCounters
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static int m_LastError;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
uint32_t m_Version; //!< object revision identification
|
||||||
|
uint32_t m_CounterSize; //!< number of objects in m_Counter
|
||||||
|
|
||||||
|
volatile uint64_t m_Counter[ePerfCountEnumSize];
|
||||||
|
|
||||||
|
static const PerfCounterAttributes m_PerfCounterAttr[];
|
||||||
|
static int m_PerfSharedId;
|
||||||
|
static volatile uint64_t m_BogusCounter; //!< for out of range GetPtr calls
|
||||||
|
|
||||||
|
public:
|
||||||
|
// only called for local object, not for shared memory
|
||||||
|
PerformanceCounters();
|
||||||
|
|
||||||
|
//!< does executable's idea of version match shared object?
|
||||||
|
bool VersionTest()
|
||||||
|
{return(ePerfCountEnumSize<=m_CounterSize && ePerfVersion==m_Version);};
|
||||||
|
|
||||||
|
//!< mostly for perf_count_test.cc
|
||||||
|
void SetVersion(uint32_t Version, uint32_t CounterSize)
|
||||||
|
{m_Version=Version; m_CounterSize=CounterSize;};
|
||||||
|
|
||||||
|
static PerformanceCounters * Init(bool IsReadOnly);
|
||||||
|
static int Close(PerformanceCounters * Counts);
|
||||||
|
|
||||||
|
uint64_t Inc(unsigned Index);
|
||||||
|
uint64_t Dec(unsigned Index);
|
||||||
|
|
||||||
|
// add value to the counter
|
||||||
|
uint64_t Add(unsigned Index, uint64_t Amount);
|
||||||
|
|
||||||
|
// return value of a counter
|
||||||
|
uint64_t Value(unsigned Index) const;
|
||||||
|
|
||||||
|
// set a value
|
||||||
|
void Set(unsigned Index, uint64_t);
|
||||||
|
|
||||||
|
volatile const uint64_t * GetPtr(unsigned Index) const;
|
||||||
|
|
||||||
|
static const char * GetNamePtr(unsigned Index);
|
||||||
|
|
||||||
|
int LookupCounter(const char * Name);
|
||||||
|
|
||||||
|
void Dump();
|
||||||
|
|
||||||
|
}; // struct PerformanceCounters
|
||||||
|
|
||||||
|
extern PerformanceCounters * gPerfCounters;
|
||||||
|
|
||||||
|
extern volatile bool gPerfCountersDisabled;
|
||||||
|
|
||||||
|
} // namespace leveldb
|
||||||
|
|
||||||
|
#endif // STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
|
|
@ -94,7 +94,7 @@ inline bool operator!=(const Slice& x, const Slice& y) {
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int Slice::compare(const Slice& b) const {
|
inline int Slice::compare(const Slice& b) const {
|
||||||
const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
|
const int min_len = (size_ < b.size_) ? size_ : b.size_;
|
||||||
int r = memcmp(data_, b.data_, min_len);
|
int r = memcmp(data_, b.data_, min_len);
|
||||||
if (r == 0) {
|
if (r == 0) {
|
||||||
if (size_ < b.size_) r = -1;
|
if (size_ < b.size_) r = -1;
|
||||||
|
|
|
@ -60,12 +60,6 @@ class Status {
|
||||||
// Returns true iff the status indicates an IOError.
|
// Returns true iff the status indicates an IOError.
|
||||||
bool IsIOError() const { return code() == kIOError; }
|
bool IsIOError() const { return code() == kIOError; }
|
||||||
|
|
||||||
// Returns true iff the status indicates a NotSupportedError.
|
|
||||||
bool IsNotSupportedError() const { return code() == kNotSupported; }
|
|
||||||
|
|
||||||
// Returns true iff the status indicates an InvalidArgument.
|
|
||||||
bool IsInvalidArgument() const { return code() == kInvalidArgument; }
|
|
||||||
|
|
||||||
// Return a string representation of this status suitable for printing.
|
// Return a string representation of this status suitable for printing.
|
||||||
// Returns the string "OK" for success.
|
// Returns the string "OK" for success.
|
||||||
std::string ToString() const;
|
std::string ToString() const;
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "leveldb/iterator.h"
|
#include "leveldb/iterator.h"
|
||||||
|
#include "leveldb/perf_count.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
@ -40,7 +41,7 @@ class Table {
|
||||||
uint64_t file_size,
|
uint64_t file_size,
|
||||||
Table** table);
|
Table** table);
|
||||||
|
|
||||||
~Table();
|
virtual ~Table();
|
||||||
|
|
||||||
// Returns a new iterator over the table contents.
|
// Returns a new iterator over the table contents.
|
||||||
// The result of NewIterator() is initially invalid (caller must
|
// The result of NewIterator() is initially invalid (caller must
|
||||||
|
@ -55,7 +56,29 @@ class Table {
|
||||||
// be close to the file length.
|
// be close to the file length.
|
||||||
uint64_t ApproximateOffsetOf(const Slice& key) const;
|
uint64_t ApproximateOffsetOf(const Slice& key) const;
|
||||||
|
|
||||||
private:
|
// return a static copy of the table's counters.
|
||||||
|
SstCounters GetSstCounters() const;
|
||||||
|
|
||||||
|
// riak routine to retrieve total memory footprint of an open table
|
||||||
|
// object in memory
|
||||||
|
size_t TableObjectSize();
|
||||||
|
|
||||||
|
// riak routine to retrieve disk size of table file
|
||||||
|
// ("virtual" is for unit test activites)
|
||||||
|
virtual uint64_t GetFileSize();
|
||||||
|
|
||||||
|
// Riak routine to request bloom filter load on
|
||||||
|
// second read operation (not iterator read)
|
||||||
|
bool ReadFilter();
|
||||||
|
|
||||||
|
// access routines for testing tools, not for public use
|
||||||
|
Block * TEST_GetIndexBlock();
|
||||||
|
size_t TEST_TableObjectSize() {return(TableObjectSize());};
|
||||||
|
size_t TEST_FilterDataSize();
|
||||||
|
static Iterator* TEST_BlockReader(void* Ptr, const ReadOptions& ROptions, const Slice& SliceReturn)
|
||||||
|
{return(BlockReader(Ptr, ROptions, SliceReturn));};
|
||||||
|
|
||||||
|
protected: // was private, made protected for unit tests
|
||||||
struct Rep;
|
struct Rep;
|
||||||
Rep* rep_;
|
Rep* rep_;
|
||||||
|
|
||||||
|
@ -69,11 +92,12 @@ class Table {
|
||||||
Status InternalGet(
|
Status InternalGet(
|
||||||
const ReadOptions&, const Slice& key,
|
const ReadOptions&, const Slice& key,
|
||||||
void* arg,
|
void* arg,
|
||||||
void (*handle_result)(void* arg, const Slice& k, const Slice& v));
|
bool (*handle_result)(void* arg, const Slice& k, const Slice& v));
|
||||||
|
|
||||||
|
|
||||||
void ReadMeta(const Footer& footer);
|
void ReadMeta(const Footer& footer);
|
||||||
void ReadFilter(const Slice& filter_handle_value);
|
void ReadFilter(class BlockHandle & filter_handle_value, const class FilterPolicy * policy);
|
||||||
|
void ReadSstCounters(const Slice& sst_counters_handle_value);
|
||||||
|
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
Table(const Table&);
|
Table(const Table&);
|
||||||
|
|
|
@ -74,6 +74,14 @@ class TableBuilder {
|
||||||
// Finish() call, returns the size of the final generated file.
|
// Finish() call, returns the size of the final generated file.
|
||||||
uint64_t FileSize() const;
|
uint64_t FileSize() const;
|
||||||
|
|
||||||
|
// Number of delete tombstones so far.
|
||||||
|
uint64_t NumDeletes() const;
|
||||||
|
|
||||||
|
// Retrieve expiry control values
|
||||||
|
uint64_t GetExpiryWriteLow() const;
|
||||||
|
uint64_t GetExpiryWriteHigh() const;
|
||||||
|
uint64_t GetExpiryExplicitHigh() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool ok() const { return status().ok(); }
|
bool ok() const { return status().ok(); }
|
||||||
void WriteBlock(BlockBuilder* block, BlockHandle* handle);
|
void WriteBlock(BlockBuilder* block, BlockHandle* handle);
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include "leveldb/status.h"
|
#include "leveldb/status.h"
|
||||||
|
#include "leveldb/options.h"
|
||||||
|
|
||||||
namespace leveldb {
|
namespace leveldb {
|
||||||
|
|
||||||
|
@ -34,7 +35,7 @@ class WriteBatch {
|
||||||
~WriteBatch();
|
~WriteBatch();
|
||||||
|
|
||||||
// Store the mapping "key->value" in the database.
|
// Store the mapping "key->value" in the database.
|
||||||
void Put(const Slice& key, const Slice& value);
|
void Put(const Slice& key, const Slice& value, const KeyMetaData * meta=NULL);
|
||||||
|
|
||||||
// If the database contains a mapping for "key", erase it. Else do nothing.
|
// If the database contains a mapping for "key", erase it. Else do nothing.
|
||||||
void Delete(const Slice& key);
|
void Delete(const Slice& key);
|
||||||
|
@ -46,7 +47,8 @@ class WriteBatch {
|
||||||
class Handler {
|
class Handler {
|
||||||
public:
|
public:
|
||||||
virtual ~Handler();
|
virtual ~Handler();
|
||||||
virtual void Put(const Slice& key, const Slice& value) = 0;
|
virtual void Put(const Slice& key, const Slice& value,
|
||||||
|
const ValueType & type, const ExpiryTimeMicros & expiry) = 0;
|
||||||
virtual void Delete(const Slice& key) = 0;
|
virtual void Delete(const Slice& key) = 0;
|
||||||
};
|
};
|
||||||
Status Iterate(Handler* handler) const;
|
Status Iterate(Handler* handler) const;
|
||||||
|
|
|
@ -1,92 +0,0 @@
|
||||||
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
||||||
|
|
||||||
// Test for issue 178: a manual compaction causes deleted data to reappear.
|
|
||||||
#include <iostream>
|
|
||||||
#include <sstream>
|
|
||||||
#include <cstdlib>
|
|
||||||
|
|
||||||
#include "leveldb/db.h"
|
|
||||||
#include "leveldb/write_batch.h"
|
|
||||||
#include "util/testharness.h"
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
const int kNumKeys = 1100000;
|
|
||||||
|
|
||||||
std::string Key1(int i) {
|
|
||||||
char buf[100];
|
|
||||||
snprintf(buf, sizeof(buf), "my_key_%d", i);
|
|
||||||
return buf;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string Key2(int i) {
|
|
||||||
return Key1(i) + "_xxx";
|
|
||||||
}
|
|
||||||
|
|
||||||
class Issue178 { };
|
|
||||||
|
|
||||||
TEST(Issue178, Test) {
|
|
||||||
// Get rid of any state from an old run.
|
|
||||||
std::string dbpath = leveldb::test::TmpDir() + "/leveldb_cbug_test";
|
|
||||||
DestroyDB(dbpath, leveldb::Options());
|
|
||||||
|
|
||||||
// Open database. Disable compression since it affects the creation
|
|
||||||
// of layers and the code below is trying to test against a very
|
|
||||||
// specific scenario.
|
|
||||||
leveldb::DB* db;
|
|
||||||
leveldb::Options db_options;
|
|
||||||
db_options.create_if_missing = true;
|
|
||||||
db_options.compression = leveldb::kNoCompression;
|
|
||||||
ASSERT_OK(leveldb::DB::Open(db_options, dbpath, &db));
|
|
||||||
|
|
||||||
// create first key range
|
|
||||||
leveldb::WriteBatch batch;
|
|
||||||
for (size_t i = 0; i < kNumKeys; i++) {
|
|
||||||
batch.Put(Key1(i), "value for range 1 key");
|
|
||||||
}
|
|
||||||
ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));
|
|
||||||
|
|
||||||
// create second key range
|
|
||||||
batch.Clear();
|
|
||||||
for (size_t i = 0; i < kNumKeys; i++) {
|
|
||||||
batch.Put(Key2(i), "value for range 2 key");
|
|
||||||
}
|
|
||||||
ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));
|
|
||||||
|
|
||||||
// delete second key range
|
|
||||||
batch.Clear();
|
|
||||||
for (size_t i = 0; i < kNumKeys; i++) {
|
|
||||||
batch.Delete(Key2(i));
|
|
||||||
}
|
|
||||||
ASSERT_OK(db->Write(leveldb::WriteOptions(), &batch));
|
|
||||||
|
|
||||||
// compact database
|
|
||||||
std::string start_key = Key1(0);
|
|
||||||
std::string end_key = Key1(kNumKeys - 1);
|
|
||||||
leveldb::Slice least(start_key.data(), start_key.size());
|
|
||||||
leveldb::Slice greatest(end_key.data(), end_key.size());
|
|
||||||
|
|
||||||
// commenting out the line below causes the example to work correctly
|
|
||||||
db->CompactRange(&least, &greatest);
|
|
||||||
|
|
||||||
// count the keys
|
|
||||||
leveldb::Iterator* iter = db->NewIterator(leveldb::ReadOptions());
|
|
||||||
size_t num_keys = 0;
|
|
||||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
||||||
num_keys++;
|
|
||||||
}
|
|
||||||
delete iter;
|
|
||||||
ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";
|
|
||||||
|
|
||||||
// close database
|
|
||||||
delete db;
|
|
||||||
DestroyDB(dbpath, leveldb::Options());
|
|
||||||
}
|
|
||||||
|
|
||||||
} // anonymous namespace
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
return leveldb::test::RunAllTests();
|
|
||||||
}
|
|
|
@ -1,59 +0,0 @@
|
||||||
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
||||||
|
|
||||||
// Test for issue 200: when iterator switches direction from backward
|
|
||||||
// to forward, the current key can be yielded unexpectedly if a new
|
|
||||||
// mutation has been added just before the current key.
|
|
||||||
|
|
||||||
#include "leveldb/db.h"
|
|
||||||
#include "util/testharness.h"
|
|
||||||
|
|
||||||
namespace leveldb {
|
|
||||||
|
|
||||||
class Issue200 { };
|
|
||||||
|
|
||||||
TEST(Issue200, Test) {
|
|
||||||
// Get rid of any state from an old run.
|
|
||||||
std::string dbpath = test::TmpDir() + "/leveldb_issue200_test";
|
|
||||||
DestroyDB(dbpath, Options());
|
|
||||||
|
|
||||||
DB *db;
|
|
||||||
Options options;
|
|
||||||
options.create_if_missing = true;
|
|
||||||
ASSERT_OK(DB::Open(options, dbpath, &db));
|
|
||||||
|
|
||||||
WriteOptions write_options;
|
|
||||||
ASSERT_OK(db->Put(write_options, "1", "b"));
|
|
||||||
ASSERT_OK(db->Put(write_options, "2", "c"));
|
|
||||||
ASSERT_OK(db->Put(write_options, "3", "d"));
|
|
||||||
ASSERT_OK(db->Put(write_options, "4", "e"));
|
|
||||||
ASSERT_OK(db->Put(write_options, "5", "f"));
|
|
||||||
|
|
||||||
ReadOptions read_options;
|
|
||||||
Iterator *iter = db->NewIterator(read_options);
|
|
||||||
|
|
||||||
// Add an element that should not be reflected in the iterator.
|
|
||||||
ASSERT_OK(db->Put(write_options, "25", "cd"));
|
|
||||||
|
|
||||||
iter->Seek("5");
|
|
||||||
ASSERT_EQ(iter->key().ToString(), "5");
|
|
||||||
iter->Prev();
|
|
||||||
ASSERT_EQ(iter->key().ToString(), "4");
|
|
||||||
iter->Prev();
|
|
||||||
ASSERT_EQ(iter->key().ToString(), "3");
|
|
||||||
iter->Next();
|
|
||||||
ASSERT_EQ(iter->key().ToString(), "4");
|
|
||||||
iter->Next();
|
|
||||||
ASSERT_EQ(iter->key().ToString(), "5");
|
|
||||||
|
|
||||||
delete iter;
|
|
||||||
delete db;
|
|
||||||
DestroyDB(dbpath, options);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace leveldb
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
return leveldb::test::RunAllTests();
|
|
||||||
}
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue