Squashed 'src/leveldb/' changes from aca1ffc..ae6c262

ae6c262 Merge branch 'leveldb' into ripple-fork
28fa222 Looks like a bit more delay is needed to smooth the latency.
a18f3e6 Tidy up JobQueue, add ripple_core module
ab82e57 Release leveldb 1.12
02c6259 Release leveldb 1.11
5bbb544 Rate limit compactions with a 25ms pause after each complete file.
8c29c47 LevelDB issue 178 fix: cannot resize a level 0 compaction set
18b245c Added GNU/kFreeBSD kernel name (TARGET_OS)
8be9d12 CondVar::SignalAll was broken, leading to deadlocks on Windows builds. http://code.google.com/p/leveldb/issues/detail?id=149
c9fc070 Upgrade LevelDB to 1.10.0, mostly for better write stall logging.
8215b15 Tweak to variable name to support unity build

git-subtree-dir: src/leveldb
git-subtree-split: ae6c2620b2ef3d5c69e63dc0eda865d6a39fa061
This commit is contained in:
Vinnie Falco 2013-07-01 08:36:32 -07:00
parent c25e98186d
commit adae78ea99
19 changed files with 196 additions and 59 deletions

View file

@ -6,3 +6,6 @@ Google Inc.
# Initial version authors:
Jeffrey Dean <jeff@google.com>
Sanjay Ghemawat <sanjay@google.com>
# Partial list of contributors:
Kevin Regan <kevin.d.regan@gmail.com>

View file

@ -42,6 +42,7 @@ TESTS = \
env_test \
filename_test \
filter_block_test \
issue178_test \
log_test \
memenv_test \
skiplist_test \
@ -69,7 +70,7 @@ SHARED = $(SHARED1)
else
# Update db.h if you change these.
SHARED_MAJOR = 1
SHARED_MINOR = 9
SHARED_MINOR = 12
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
@ -146,6 +147,9 @@ filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
issue178_test: issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

View file

@ -94,6 +94,12 @@ case "$TARGET_OS" in
PLATFORM_LIBS="-lpthread"
PORT_FILE=port/port_posix.cc
;;
GNU/kFreeBSD)
PLATFORM=OS_KFREEBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_KFREEBSD"
PLATFORM_LIBS="-lpthread"
PORT_FILE=port/port_posix.cc
;;
NetBSD)
PLATFORM=OS_NETBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"

View file

@ -35,6 +35,8 @@
namespace leveldb {
const int kNumNonTableCacheFiles = 10;
// Information kept for every waiting writer
struct DBImpl::Writer {
Status status;
@ -92,9 +94,9 @@ Options SanitizeOptions(const std::string& dbname,
Options result = src;
result.comparator = icmp;
result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
ClipToRange(&result.max_open_files, 20, 50000);
ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
ClipToRange(&result.block_size, 1<<10, 4<<20);
ClipToRange(&result.max_open_files, 64 + kNumNonTableCacheFiles, 50000);
ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
ClipToRange(&result.block_size, 1<<10, 4<<20);
if (result.info_log == NULL) {
// Open a log file in the same directory as the db
src.env->CreateDir(dbname); // In case it does not exist
@ -130,12 +132,13 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
log_(NULL),
tmp_batch_(new WriteBatch),
bg_compaction_scheduled_(false),
manual_compaction_(NULL) {
manual_compaction_(NULL),
consecutive_compaction_errors_(0) {
mem_->Ref();
has_imm_.Release_Store(NULL);
// Reserve ten files or so for other uses and give the rest to TableCache.
const int table_cache_size = options.max_open_files - 10;
const int table_cache_size = options.max_open_files - kNumNonTableCacheFiles;
table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
versions_ = new VersionSet(dbname_, &options_, table_cache_,
@ -310,16 +313,24 @@ Status DBImpl::Recover(VersionEdit* edit) {
if (!s.ok()) {
return s;
}
std::set<uint64_t> expected;
versions_->AddLiveFiles(&expected);
uint64_t number;
FileType type;
std::vector<uint64_t> logs;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)
&& type == kLogFile
&& ((number >= min_log) || (number == prev_log))) {
if (ParseFileName(filenames[i], &number, &type)) {
expected.erase(number);
if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
logs.push_back(number);
}
}
if (!expected.empty()) {
char buf[50];
snprintf(buf, sizeof(buf), "%d missing files; e.g.",
static_cast<int>(expected.size()));
return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
}
// Recover in the order in which the logs were generated
std::sort(logs.begin(), logs.end());
@ -611,6 +622,7 @@ void DBImpl::BackgroundCall() {
Status s = BackgroundCompaction();
if (s.ok()) {
// Success
consecutive_compaction_errors_ = 0;
} else if (shutting_down_.Acquire_Load()) {
// Error most likely due to shutdown; do not wait
} else {
@ -622,7 +634,12 @@ void DBImpl::BackgroundCall() {
Log(options_.info_log, "Waiting after background compaction error: %s",
s.ToString().c_str());
mutex_.Unlock();
env_->SleepForMicroseconds(1000000);
++consecutive_compaction_errors_;
int seconds_to_sleep = 1;
for (int i = 0; i < 3 && i < consecutive_compaction_errors_ - 1; ++i) {
seconds_to_sleep *= 2;
}
env_->SleepForMicroseconds(seconds_to_sleep * 1000000);
mutex_.Lock();
}
}
@ -805,6 +822,9 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
(unsigned long long) output_number,
(unsigned long long) current_entries,
(unsigned long long) current_bytes);
// rate-limit compaction file creation with a 100ms pause
env_->SleepForMicroseconds(100000);
}
}
return s;
@ -1268,10 +1288,11 @@ Status DBImpl::MakeRoomForWrite(bool force) {
} else if (imm_ != NULL) {
// We have filled up the current memtable, but the previous
// one is still being compacted, so we wait.
Log(options_.info_log, "Current memtable full; waiting...\n");
bg_cv_.Wait();
} else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
// There are too many level-0 files.
Log(options_.info_log, "waiting...\n");
Log(options_.info_log, "Too many L0 files; waiting...\n");
bg_cv_.Wait();
} else {
// Attempt to switch to a new memtable and trigger compaction of old

View file

@ -163,6 +163,7 @@ class DBImpl : public DB {
// Have we encountered a background error in paranoid mode?
Status bg_error_;
int consecutive_compaction_errors_;
// Per level compaction stats. stats_[level] stores the stats for
// compactions that produced data for the specified "level".

View file

@ -33,8 +33,11 @@ class AtomicCounter {
public:
AtomicCounter() : count_(0) { }
void Increment() {
IncrementBy(1);
}
void IncrementBy(int count) {
MutexLock l(&mu_);
count_++;
count_ += count;
}
int Read() {
MutexLock l(&mu_);
@ -45,6 +48,10 @@ class AtomicCounter {
count_ = 0;
}
};
void DelayMilliseconds(int millis) {
Env::Default()->SleepForMicroseconds(millis * 1000);
}
}
// Special Env used to delay background operations
@ -69,6 +76,7 @@ class SpecialEnv : public EnvWrapper {
AtomicCounter random_read_counter_;
AtomicCounter sleep_counter_;
AtomicCounter sleep_time_counter_;
explicit SpecialEnv(Env* base) : EnvWrapper(base) {
delay_sstable_sync_.Release_Store(NULL);
@ -103,7 +111,7 @@ class SpecialEnv : public EnvWrapper {
Status Flush() { return base_->Flush(); }
Status Sync() {
while (env_->delay_sstable_sync_.Acquire_Load() != NULL) {
env_->SleepForMicroseconds(100000);
DelayMilliseconds(100);
}
return base_->Sync();
}
@ -174,8 +182,9 @@ class SpecialEnv : public EnvWrapper {
virtual void SleepForMicroseconds(int micros) {
sleep_counter_.Increment();
target()->SleepForMicroseconds(micros);
sleep_time_counter_.IncrementBy(micros);
}
};
class DBTest {
@ -461,6 +470,20 @@ class DBTest {
}
return result;
}
bool DeleteAnSSTFile() {
std::vector<std::string> filenames;
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
ASSERT_OK(env_->DeleteFile(TableFileName(dbname_, number)));
return true;
}
}
return false;
}
};
TEST(DBTest, Empty) {
@ -611,7 +634,7 @@ TEST(DBTest, GetEncountersEmptyLevel) {
}
// Step 4: Wait for compaction to finish
env_->SleepForMicroseconds(1000000);
DelayMilliseconds(1000);
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
} while (ChangeOptions());
@ -1295,7 +1318,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_a) {
Reopen();
Reopen();
ASSERT_EQ("(a->v)", Contents());
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
DelayMilliseconds(1000); // Wait for compaction to finish
ASSERT_EQ("(a->v)", Contents());
}
@ -1311,7 +1334,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
Put("","");
Reopen();
Put("","");
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
DelayMilliseconds(1000); // Wait for compaction to finish
Reopen();
Put("d","dv");
Reopen();
@ -1321,7 +1344,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) {
Delete("b");
Reopen();
ASSERT_EQ("(->)(c->cv)", Contents());
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
DelayMilliseconds(1000); // Wait for compaction to finish
ASSERT_EQ("(->)(c->cv)", Contents());
}
@ -1506,6 +1529,30 @@ TEST(DBTest, NoSpace) {
ASSERT_GE(env_->sleep_counter_.Read(), 5);
}
TEST(DBTest, ExponentialBackoff) {
Options options = CurrentOptions();
options.env = env_;
Reopen(&options);
ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
Compact("a", "z");
env_->non_writable_.Release_Store(env_); // Force errors for new files
env_->sleep_counter_.Reset();
env_->sleep_time_counter_.Reset();
for (int i = 0; i < 5; i++) {
dbfull()->TEST_CompactRange(2, NULL, NULL);
}
env_->non_writable_.Release_Store(NULL);
// Wait for compaction to finish
DelayMilliseconds(1000);
ASSERT_GE(env_->sleep_counter_.Read(), 5);
ASSERT_LT(env_->sleep_counter_.Read(), 10);
ASSERT_GE(env_->sleep_time_counter_.Read(), 10e6);
}
TEST(DBTest, NonWritableFileSystem) {
Options options = CurrentOptions();
options.write_buffer_size = 1000;
@ -1519,7 +1566,7 @@ TEST(DBTest, NonWritableFileSystem) {
fprintf(stderr, "iter %d; errors %d\n", i, errors);
if (!Put("foo", big).ok()) {
errors++;
env_->SleepForMicroseconds(100000);
DelayMilliseconds(100);
}
}
ASSERT_GT(errors, 0);
@ -1567,6 +1614,24 @@ TEST(DBTest, ManifestWriteError) {
}
}
TEST(DBTest, MissingSSTFile) {
ASSERT_OK(Put("foo", "bar"));
ASSERT_EQ("bar", Get("foo"));
// Dump the memtable to disk.
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("bar", Get("foo"));
Close();
ASSERT_TRUE(DeleteAnSSTFile());
Options options = CurrentOptions();
options.paranoid_checks = true;
Status s = TryReopen(&options);
ASSERT_TRUE(!s.ok());
ASSERT_TRUE(s.ToString().find("issing") != std::string::npos)
<< s.ToString();
}
TEST(DBTest, FilesDeletedAfterCompaction) {
ASSERT_OK(Put("foo", "v2"));
Compact("a", "z");
@ -1711,13 +1776,13 @@ TEST(DBTest, MultiThreaded) {
}
// Let them run for a while
env_->SleepForMicroseconds(kTestSeconds * 1000000);
DelayMilliseconds(kTestSeconds * 1000);
// Stop the threads and wait for them to finish
mt.stop.Release_Store(&mt);
for (int id = 0; id < kNumThreads; id++) {
while (mt.thread_done[id].Acquire_Load() == NULL) {
env_->SleepForMicroseconds(100000);
DelayMilliseconds(100);
}
}
} while (ChangeOptions());

View file

@ -26,7 +26,7 @@ std::string ParsedInternalKey::DebugString() const {
(unsigned long long) sequence,
int(type));
std::string result = "'";
result += user_key.ToString();
result += EscapeString(user_key.ToString());
result += buf;
return result;
}

View file

@ -70,7 +70,7 @@ TEST(FileNameTest, Parse) {
for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
std::string f = errors[i];
ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
};
}
}
TEST(FileNameTest, Construction) {

View file

@ -1331,14 +1331,19 @@ Compaction* VersionSet::CompactRange(
}
// Avoid compacting too much in one shot in case the range is large.
const uint64_t limit = MaxFileSizeForLevel(level);
uint64_t total = 0;
for (size_t i = 0; i < inputs.size(); i++) {
uint64_t s = inputs[i]->file_size;
total += s;
if (total >= limit) {
inputs.resize(i + 1);
break;
// But we cannot do this for level-0 since level-0 files can overlap
// and we must not pick one file and drop another older file if the
// two files overlap.
if (level > 0) {
const uint64_t limit = MaxFileSizeForLevel(level);
uint64_t total = 0;
for (size_t i = 0; i < inputs.size(); i++) {
uint64_t s = inputs[i]->file_size;
total += s;
if (total >= limit) {
inputs.resize(i + 1);
break;
}
}
}

View file

@ -14,7 +14,7 @@ namespace leveldb {
// Update Makefile if you change these
static const int kMajorVersion = 1;
static const int kMinorVersion = 9;
static const int kMinorVersion = 12;
struct Options;
struct ReadOptions;

View file

@ -109,12 +109,10 @@ void CondVar::Signal() {
void CondVar::SignalAll() {
wait_mtx_.Lock();
for(long i = 0; i < waiting_; ++i) {
::ReleaseSemaphore(sem1_, 1, NULL);
while(waiting_ > 0) {
--waiting_;
::WaitForSingleObject(sem2_, INFINITE);
}
::ReleaseSemaphore(sem1_, waiting_, NULL);
while(waiting_ > 0) {
--waiting_;
::WaitForSingleObject(sem2_, INFINITE);
}
wait_mtx_.Unlock();
}

View file

@ -16,7 +16,7 @@
namespace leveldb {
inline uint32_t Block::NumRestarts() const {
assert(size_ >= 2*sizeof(uint32_t));
assert(size_ >= sizeof(uint32_t));
return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
}
@ -27,11 +27,12 @@ Block::Block(const BlockContents& contents)
if (size_ < sizeof(uint32_t)) {
size_ = 0; // Error marker
} else {
restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
if (restart_offset_ > size_ - sizeof(uint32_t)) {
// The size is too small for NumRestarts() and therefore
// restart_offset_ wrapped around.
size_t max_restarts_allowed = (size_-sizeof(uint32_t)) / sizeof(uint32_t);
if (NumRestarts() > max_restarts_allowed) {
// The size is too small for NumRestarts()
size_ = 0;
} else {
restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
}
}
}
@ -253,7 +254,7 @@ class Block::Iter : public Iterator {
};
Iterator* Block::NewIterator(const Comparator* cmp) {
if (size_ < 2*sizeof(uint32_t)) {
if (size_ < sizeof(uint32_t)) {
return NewErrorIterator(Status::Corruption("bad block contents"));
}
const uint32_t num_restarts = NumRestarts();

View file

@ -228,7 +228,6 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k,
!filter->KeyMayMatch(handle.offset(), k)) {
// Not found
} else {
Slice handle = iiter->value();
Iterator* block_iter = BlockReader(this, options, iiter->value());
block_iter->Seek(k);
if (block_iter->Valid()) {

View file

@ -644,6 +644,36 @@ class Harness {
Constructor* constructor_;
};
// Test empty table/block.
TEST(Harness, Empty) {
for (int i = 0; i < kNumTestArgs; i++) {
Init(kTestArgList[i]);
Random rnd(test::RandomSeed() + 1);
Test(&rnd);
}
}
// Special test for a block with no restart entries. The C++ leveldb
// code never generates such blocks, but the Java version of leveldb
// seems to.
TEST(Harness, ZeroRestartPointsInBlock) {
char data[sizeof(uint32_t)];
memset(data, 0, sizeof(data));
BlockContents contents;
contents.data = Slice(data, sizeof(data));
contents.cachable = false;
contents.heap_allocated = false;
Block block(contents);
Iterator* iter = block.NewIterator(BytewiseComparator());
iter->SeekToFirst();
ASSERT_TRUE(!iter->Valid());
iter->SeekToLast();
ASSERT_TRUE(!iter->Valid());
iter->Seek("foo");
ASSERT_TRUE(!iter->Valid());
delete iter;
}
// Test the empty key
TEST(Harness, SimpleEmptyKey) {
for (int i = 0; i < kNumTestArgs; i++) {

View file

@ -116,7 +116,6 @@ class HandleTable {
LRUHandle* h = list_[i];
while (h != NULL) {
LRUHandle* next = h->next_hash;
Slice key = h->key();
uint32_t hash = h->hash;
LRUHandle** ptr = &new_list[hash & (new_length - 1)];
h->next_hash = *ptr;
@ -160,7 +159,6 @@ class LRUCache {
// mutex_ protects the following state.
port::Mutex mutex_;
size_t usage_;
uint64_t last_id_;
// Dummy head of LRU list.
// lru.prev is newest entry, lru.next is oldest entry.
@ -170,8 +168,7 @@ class LRUCache {
};
LRUCache::LRUCache()
: usage_(0),
last_id_(0) {
: usage_(0) {
// Make empty circular linked list
lru_.next = &lru_;
lru_.prev = &lru_;

View file

@ -109,7 +109,7 @@ TEST(Coding, Varint64) {
values.push_back(power);
values.push_back(power-1);
values.push_back(power+1);
};
}
std::string s;
for (int i = 0; i < values.size(); i++) {

View file

@ -66,7 +66,7 @@ class BytewiseComparatorImpl : public Comparator {
};
} // namespace
static port::OnceType once = LEVELDB_ONCE_INIT;
static port::OnceType once_comparator = LEVELDB_ONCE_INIT;
static const Comparator* bytewise;
static void InitModule() {
@ -74,7 +74,7 @@ static void InitModule() {
}
const Comparator* BytewiseComparator() {
port::InitOnce(&once, InitModule);
port::InitOnce(&once_comparator, InitModule);
return bytewise;
}

View file

@ -386,7 +386,7 @@ class PosixEnv : public Env {
PosixEnv();
virtual ~PosixEnv() {
fprintf(stderr, "Destroying Env::Default()\n");
exit(1);
abort();
}
virtual Status NewSequentialFile(const std::string& fname,
@ -467,7 +467,7 @@ class PosixEnv : public Env {
result = IOError(fname, errno);
}
return result;
};
}
virtual Status CreateDir(const std::string& name) {
Status result;
@ -475,7 +475,7 @@ class PosixEnv : public Env {
result = IOError(name, errno);
}
return result;
};
}
virtual Status DeleteDir(const std::string& name) {
Status result;
@ -483,7 +483,7 @@ class PosixEnv : public Env {
result = IOError(name, errno);
}
return result;
};
}
virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
Status s;
@ -589,7 +589,7 @@ class PosixEnv : public Env {
void PthreadCall(const char* label, int result) {
if (result != 0) {
fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
exit(1);
abort();
}
}

View file

@ -6,6 +6,13 @@
#include "util/coding.h"
#include "util/hash.h"
// The FALLTHROUGH_INTENDED macro can be used to annotate implicit fall-through
// between switch labels. The real definition should be provided externally.
// This one is a fallback version for unsupported compilers.
#ifndef FALLTHROUGH_INTENDED
#define FALLTHROUGH_INTENDED do { } while (0)
#endif
namespace leveldb {
uint32_t Hash(const char* data, size_t n, uint32_t seed) {
@ -28,10 +35,10 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) {
switch (limit - data) {
case 3:
h += data[2] << 16;
// fall through
FALLTHROUGH_INTENDED;
case 2:
h += data[1] << 8;
// fall through
FALLTHROUGH_INTENDED;
case 1:
h += data[0];
h *= m;