Compact serialization for variable-length integers
Variable-length integers: bytes are a MSB base-128 encoding of the number. The high bit in each byte signifies whether another digit follows. To make the encoding is one-to-one, one is subtracted from all but the last digit. Thus, the byte sequence a[] with length len, where all but the last byte has bit 128 set, encodes the number: (a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1)) Properties: * Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes) * Every integer has exactly one encoding * Encoding does not depend on size of original integer type
This commit is contained in:
parent
43b7905e98
commit
4d6144f97f
2 changed files with 138 additions and 1 deletions
|
@ -238,9 +238,75 @@ uint64 ReadCompactSize(Stream& is)
|
||||||
return nSizeRet;
|
return nSizeRet;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Variable-length integers: bytes are a MSB base-128 encoding of the number.
|
||||||
|
// The high bit in each byte signifies whether another digit follows. To make
|
||||||
|
// the encoding is one-to-one, one is subtracted from all but the last digit.
|
||||||
|
// Thus, the byte sequence a[] with length len, where all but the last byte
|
||||||
|
// has bit 128 set, encodes the number:
|
||||||
|
//
|
||||||
|
// (a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1))
|
||||||
|
//
|
||||||
|
// Properties:
|
||||||
|
// * Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes)
|
||||||
|
// * Every integer has exactly one encoding
|
||||||
|
// * Encoding does not depend on size of original integer type
|
||||||
|
// * No redundancy: every (infinite) byte sequence corresponds to a list
|
||||||
|
// of encoded integers.
|
||||||
|
//
|
||||||
|
// 0: [0x00] 256: [0x81 0x00]
|
||||||
|
// 1: [0x01] 16383: [0xFE 0x7F]
|
||||||
|
// 127: [0x7F] 16384: [0xFF 0x00]
|
||||||
|
// 128: [0x80 0x00] 16511: [0x80 0xFF 0x7F]
|
||||||
|
// 255: [0x80 0x7F] 65535: [0x82 0xFD 0x7F]
|
||||||
|
// 2^32: [0x8E 0xFE 0xFE 0xFF 0x00]
|
||||||
|
|
||||||
|
template<typename I>
|
||||||
|
inline unsigned int GetSizeOfVarInt(I n)
|
||||||
|
{
|
||||||
|
int nRet = 0;
|
||||||
|
while(true) {
|
||||||
|
nRet++;
|
||||||
|
if (n <= 0x7F)
|
||||||
|
break;
|
||||||
|
n = (n >> 7) - 1;
|
||||||
|
}
|
||||||
|
return nRet;
|
||||||
|
}
|
||||||
|
|
||||||
#define FLATDATA(obj) REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj)))
|
template<typename Stream, typename I>
|
||||||
|
void WriteVarInt(Stream& os, I n)
|
||||||
|
{
|
||||||
|
unsigned char tmp[(sizeof(n)*8+6)/7];
|
||||||
|
int len=0;
|
||||||
|
while(true) {
|
||||||
|
tmp[len] = (n & 0x7F) | (len ? 0x80 : 0x00);
|
||||||
|
if (n <= 0x7F)
|
||||||
|
break;
|
||||||
|
n = (n >> 7) - 1;
|
||||||
|
len++;
|
||||||
|
}
|
||||||
|
do {
|
||||||
|
WRITEDATA(os, tmp[len]);
|
||||||
|
} while(len--);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Stream, typename I>
|
||||||
|
I ReadVarInt(Stream& is)
|
||||||
|
{
|
||||||
|
I n = 0;
|
||||||
|
while(true) {
|
||||||
|
unsigned char chData;
|
||||||
|
READDATA(is, chData);
|
||||||
|
n = (n << 7) | (chData & 0x7F);
|
||||||
|
if (chData & 0x80)
|
||||||
|
n++;
|
||||||
|
else
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define FLATDATA(obj) REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj)))
|
||||||
|
#define VARINT(obj) REF(WrapVarInt(REF(obj)))
|
||||||
|
|
||||||
/** Wrapper for serializing arrays and POD.
|
/** Wrapper for serializing arrays and POD.
|
||||||
*/
|
*/
|
||||||
|
@ -274,6 +340,32 @@ public:
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<typename I>
|
||||||
|
class CVarInt
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
I &n;
|
||||||
|
public:
|
||||||
|
CVarInt(I& nIn) : n(nIn) { }
|
||||||
|
|
||||||
|
unsigned int GetSerializeSize(int, int) const {
|
||||||
|
return GetSizeOfVarInt<I>(n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Stream>
|
||||||
|
void Serialize(Stream &s, int, int) const {
|
||||||
|
WriteVarInt<Stream,I>(s, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Stream>
|
||||||
|
void Unserialize(Stream& s, int, int) {
|
||||||
|
n = ReadVarInt<Stream,I>(s);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename I>
|
||||||
|
CVarInt<I> WrapVarInt(I& n) { return CVarInt<I>(n); }
|
||||||
|
|
||||||
//
|
//
|
||||||
// Forward declarations
|
// Forward declarations
|
||||||
//
|
//
|
||||||
|
|
45
src/test/serialize_tests.cpp
Normal file
45
src/test/serialize_tests.cpp
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
#include <boost/test/unit_test.hpp>
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "serialize.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE(serialize_tests)
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE(varints)
|
||||||
|
{
|
||||||
|
// encode
|
||||||
|
|
||||||
|
CDataStream ss(SER_DISK, 0);
|
||||||
|
CDataStream::size_type size = 0;
|
||||||
|
for (int i = 0; i < 100000; i++) {
|
||||||
|
ss << VARINT(i);
|
||||||
|
size += ::GetSerializeSize(VARINT(i), 0, 0);
|
||||||
|
BOOST_CHECK(size == ss.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (uint64 i = 0; i < 100000000000ULL; i += 999999937) {
|
||||||
|
ss << VARINT(i);
|
||||||
|
size += ::GetSerializeSize(VARINT(i), 0, 0);
|
||||||
|
BOOST_CHECK(size == ss.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
// decode
|
||||||
|
for (int i = 0; i < 100000; i++) {
|
||||||
|
int j;
|
||||||
|
ss >> VARINT(j);
|
||||||
|
BOOST_CHECK_MESSAGE(i == j, "decoded:" << j << " expected:" << i);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (uint64 i = 0; i < 100000000000ULL; i += 999999937) {
|
||||||
|
uint64 j;
|
||||||
|
ss >> VARINT(j);
|
||||||
|
BOOST_CHECK_MESSAGE(i == j, "decoded:" << j << " expected:" << i);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE_END()
|
Loading…
Reference in a new issue