contrib: make linearize-data.py cope with out-of-order blocks
Make it possible to read blocks in any order. This will be required after headers-first (#4468), so should be merged before that. - Read block header. For expected blocks, continue, else skip. - For in-order blocks: copy block contents directly. Write prior out-of-order blocks if this connects a consecutive span. - For out-of-order blocks, store extents of block data for later retrieval. Cache out-of-order blocks in memory up to 100MB (configurable).
This commit is contained in:
parent
5505a1b13f
commit
aedc74dfa6
2 changed files with 154 additions and 90 deletions
|
@ -15,3 +15,5 @@ output_file=/home/example/Downloads/bootstrap.dat
|
||||||
hashlist=hashlist.txt
|
hashlist=hashlist.txt
|
||||||
split_year=1
|
split_year=1
|
||||||
|
|
||||||
|
# Maxmimum size in bytes of out-of-order blocks cache in memory
|
||||||
|
out_of_order_cache_sz = 100000000
|
||||||
|
|
|
@ -2,11 +2,12 @@
|
||||||
#
|
#
|
||||||
# linearize-data.py: Construct a linear, no-fork version of the chain.
|
# linearize-data.py: Construct a linear, no-fork version of the chain.
|
||||||
#
|
#
|
||||||
# Copyright (c) 2013 The Bitcoin developers
|
# Copyright (c) 2013-2014 The Bitcoin developers
|
||||||
# Distributed under the MIT/X11 software license, see the accompanying
|
# Distributed under the MIT/X11 software license, see the accompanying
|
||||||
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
||||||
#
|
#
|
||||||
|
|
||||||
|
from __future__ import print_function, division
|
||||||
import json
|
import json
|
||||||
import struct
|
import struct
|
||||||
import re
|
import re
|
||||||
|
@ -17,10 +18,10 @@ import sys
|
||||||
import hashlib
|
import hashlib
|
||||||
import datetime
|
import datetime
|
||||||
import time
|
import time
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
settings = {}
|
settings = {}
|
||||||
|
|
||||||
|
|
||||||
def uint32(x):
|
def uint32(x):
|
||||||
return x & 0xffffffffL
|
return x & 0xffffffffL
|
||||||
|
|
||||||
|
@ -78,116 +79,174 @@ def get_block_hashes(settings):
|
||||||
|
|
||||||
return blkindex
|
return blkindex
|
||||||
|
|
||||||
def mkblockset(blkindex):
|
def mkblockmap(blkindex):
|
||||||
blkmap = {}
|
blkmap = {}
|
||||||
for hash in blkindex:
|
for height,hash in enumerate(blkindex):
|
||||||
blkmap[hash] = True
|
blkmap[hash] = height
|
||||||
return blkmap
|
return blkmap
|
||||||
|
|
||||||
def copydata(settings, blkindex, blkset):
|
# Block header and extent on disk
|
||||||
inFn = 0
|
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
|
||||||
inF = None
|
|
||||||
outFn = 0
|
|
||||||
outsz = 0
|
|
||||||
outF = None
|
|
||||||
outFname = None
|
|
||||||
blkCount = 0
|
|
||||||
|
|
||||||
lastDate = datetime.datetime(2000, 1, 1)
|
class BlockDataCopier:
|
||||||
highTS = 1408893517 - 315360000
|
def __init__(self, settings, blkindex, blkmap):
|
||||||
timestampSplit = False
|
self.settings = settings
|
||||||
fileOutput = True
|
self.blkindex = blkindex
|
||||||
setFileTime = False
|
self.blkmap = blkmap
|
||||||
maxOutSz = settings['max_out_sz']
|
|
||||||
if 'output' in settings:
|
|
||||||
fileOutput = False
|
|
||||||
if settings['file_timestamp'] != 0:
|
|
||||||
setFileTime = True
|
|
||||||
if settings['split_timestamp'] != 0:
|
|
||||||
timestampSplit = True
|
|
||||||
|
|
||||||
while True:
|
self.inFn = 0
|
||||||
if not inF:
|
self.inF = None
|
||||||
fname = "%s/blk%05d.dat" % (settings['input'], inFn)
|
self.outFn = 0
|
||||||
print("Input file" + fname)
|
self.outsz = 0
|
||||||
try:
|
self.outF = None
|
||||||
inF = open(fname, "rb")
|
self.outFname = None
|
||||||
except IOError:
|
self.blkCountIn = 0
|
||||||
print "Done"
|
self.blkCountOut = 0
|
||||||
return
|
|
||||||
|
|
||||||
inhdr = inF.read(8)
|
self.lastDate = datetime.datetime(2000, 1, 1)
|
||||||
if (not inhdr or (inhdr[0] == "\0")):
|
self.highTS = 1408893517 - 315360000
|
||||||
inF.close()
|
self.timestampSplit = False
|
||||||
inF = None
|
self.fileOutput = True
|
||||||
inFn = inFn + 1
|
self.setFileTime = False
|
||||||
continue
|
self.maxOutSz = settings['max_out_sz']
|
||||||
|
if 'output' in settings:
|
||||||
|
self.fileOutput = False
|
||||||
|
if settings['file_timestamp'] != 0:
|
||||||
|
self.setFileTime = True
|
||||||
|
if settings['split_timestamp'] != 0:
|
||||||
|
self.timestampSplit = True
|
||||||
|
# Extents and cache for out-of-order blocks
|
||||||
|
self.blockExtents = {}
|
||||||
|
self.outOfOrderData = {}
|
||||||
|
self.outOfOrderSize = 0 # running total size for items in outOfOrderData
|
||||||
|
|
||||||
inMagic = inhdr[:4]
|
def writeBlock(self, inhdr, blk_hdr, rawblock):
|
||||||
if (inMagic != settings['netmagic']):
|
if not self.fileOutput and ((self.outsz + self.inLen) > self.maxOutSz):
|
||||||
print("Invalid magic:" + inMagic)
|
self.outF.close()
|
||||||
return
|
if self.setFileTime:
|
||||||
inLenLE = inhdr[4:]
|
|
||||||
su = struct.unpack("<I", inLenLE)
|
|
||||||
inLen = su[0]
|
|
||||||
rawblock = inF.read(inLen)
|
|
||||||
blk_hdr = rawblock[:80]
|
|
||||||
|
|
||||||
hash_str = calc_hash_str(blk_hdr)
|
|
||||||
if not hash_str in blkset:
|
|
||||||
print("Skipping unknown block " + hash_str)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if blkindex[blkCount] != hash_str:
|
|
||||||
print("Out of order block.")
|
|
||||||
print("Expected " + blkindex[blkCount])
|
|
||||||
print("Got " + hash_str)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
if not fileOutput and ((outsz + inLen) > maxOutSz):
|
|
||||||
outF.close()
|
|
||||||
if setFileTime:
|
|
||||||
os.utime(outFname, (int(time.time()), highTS))
|
os.utime(outFname, (int(time.time()), highTS))
|
||||||
outF = None
|
self.outF = None
|
||||||
outFname = None
|
self.outFname = None
|
||||||
outFn = outFn + 1
|
self.outFn = outFn + 1
|
||||||
outsz = 0
|
self.outsz = 0
|
||||||
|
|
||||||
(blkDate, blkTS) = get_blk_dt(blk_hdr)
|
(blkDate, blkTS) = get_blk_dt(blk_hdr)
|
||||||
if timestampSplit and (blkDate > lastDate):
|
if self.timestampSplit and (blkDate > self.lastDate):
|
||||||
print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
|
print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
|
||||||
lastDate = blkDate
|
lastDate = blkDate
|
||||||
if outF:
|
if outF:
|
||||||
outF.close()
|
outF.close()
|
||||||
if setFileTime:
|
if setFileTime:
|
||||||
os.utime(outFname, (int(time.time()), highTS))
|
os.utime(outFname, (int(time.time()), highTS))
|
||||||
outF = None
|
self.outF = None
|
||||||
outFname = None
|
self.outFname = None
|
||||||
outFn = outFn + 1
|
self.outFn = self.outFn + 1
|
||||||
outsz = 0
|
self.outsz = 0
|
||||||
|
|
||||||
if not outF:
|
if not self.outF:
|
||||||
if fileOutput:
|
if self.fileOutput:
|
||||||
outFname = settings['output_file']
|
outFname = self.settings['output_file']
|
||||||
else:
|
else:
|
||||||
outFname = "%s/blk%05d.dat" % (settings['output'], outFn)
|
outFname = "%s/blk%05d.dat" % (self.settings['output'], outFn)
|
||||||
print("Output file" + outFname)
|
print("Output file" + outFname)
|
||||||
outF = open(outFname, "wb")
|
self.outF = open(outFname, "wb")
|
||||||
|
|
||||||
outF.write(inhdr)
|
self.outF.write(inhdr)
|
||||||
outF.write(rawblock)
|
self.outF.write(blk_hdr)
|
||||||
outsz = outsz + inLen + 8
|
self.outF.write(rawblock)
|
||||||
|
self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)
|
||||||
|
|
||||||
blkCount = blkCount + 1
|
self.blkCountOut = self.blkCountOut + 1
|
||||||
if blkTS > highTS:
|
if blkTS > self.highTS:
|
||||||
highTS = blkTS
|
self.highTS = blkTS
|
||||||
|
|
||||||
if (blkCount % 1000) == 0:
|
if (self.blkCountOut % 1000) == 0:
|
||||||
print("Wrote " + str(blkCount) + " blocks")
|
print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
|
||||||
|
(self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))
|
||||||
|
|
||||||
|
def inFileName(self, fn):
|
||||||
|
return "%s/blk%05d.dat" % (self.settings['input'], fn)
|
||||||
|
|
||||||
|
def fetchBlock(self, extent):
|
||||||
|
'''Fetch block contents from disk given extents'''
|
||||||
|
with open(self.inFileName(extent.fn), "rb") as f:
|
||||||
|
f.seek(extent.offset)
|
||||||
|
return f.read(extent.size)
|
||||||
|
|
||||||
|
def copyOneBlock(self):
|
||||||
|
'''Find the next block to be written in the input, and copy it to the output.'''
|
||||||
|
extent = self.blockExtents.pop(self.blkCountOut)
|
||||||
|
if self.blkCountOut in self.outOfOrderData:
|
||||||
|
# If the data is cached, use it from memory and remove from the cache
|
||||||
|
rawblock = self.outOfOrderData.pop(self.blkCountOut)
|
||||||
|
self.outOfOrderSize -= len(rawblock)
|
||||||
|
else: # Otherwise look up data on disk
|
||||||
|
rawblock = self.fetchBlock(extent)
|
||||||
|
|
||||||
|
self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
while self.blkCountOut < len(self.blkindex):
|
||||||
|
if not self.inF:
|
||||||
|
fname = self.inFileName(self.inFn)
|
||||||
|
print("Input file" + fname)
|
||||||
|
try:
|
||||||
|
self.inF = open(fname, "rb")
|
||||||
|
except IOError:
|
||||||
|
print("Premature end of block data")
|
||||||
|
return
|
||||||
|
|
||||||
|
inhdr = self.inF.read(8)
|
||||||
|
if (not inhdr or (inhdr[0] == "\0")):
|
||||||
|
self.inF.close()
|
||||||
|
self.inF = None
|
||||||
|
self.inFn = self.inFn + 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
inMagic = inhdr[:4]
|
||||||
|
if (inMagic != self.settings['netmagic']):
|
||||||
|
print("Invalid magic:" + inMagic)
|
||||||
|
return
|
||||||
|
inLenLE = inhdr[4:]
|
||||||
|
su = struct.unpack("<I", inLenLE)
|
||||||
|
inLen = su[0] - 80 # length without header
|
||||||
|
blk_hdr = self.inF.read(80)
|
||||||
|
inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
|
||||||
|
|
||||||
|
hash_str = calc_hash_str(blk_hdr)
|
||||||
|
if not hash_str in blkmap:
|
||||||
|
print("Skipping unknown block " + hash_str)
|
||||||
|
self.inF.seek(inLen, os.SEEK_CUR)
|
||||||
|
continue
|
||||||
|
|
||||||
|
blkHeight = self.blkmap[hash_str]
|
||||||
|
self.blkCountIn += 1
|
||||||
|
|
||||||
|
if self.blkCountOut == blkHeight:
|
||||||
|
# If in-order block, just copy
|
||||||
|
rawblock = self.inF.read(inLen)
|
||||||
|
self.writeBlock(inhdr, blk_hdr, rawblock)
|
||||||
|
|
||||||
|
# See if we can catch up to prior out-of-order blocks
|
||||||
|
while self.blkCountOut in self.blockExtents:
|
||||||
|
self.copyOneBlock()
|
||||||
|
|
||||||
|
else: # If out-of-order, skip over block data for now
|
||||||
|
self.blockExtents[blkHeight] = inExtent
|
||||||
|
if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
|
||||||
|
# If there is space in the cache, read the data
|
||||||
|
# Reading the data in file sequence instead of seeking and fetching it later is preferred,
|
||||||
|
# but we don't want to fill up memory
|
||||||
|
self.outOfOrderData[blkHeight] = self.inF.read(inLen)
|
||||||
|
self.outOfOrderSize += inLen
|
||||||
|
else: # If no space in cache, seek forward
|
||||||
|
self.inF.seek(inLen, os.SEEK_CUR)
|
||||||
|
|
||||||
|
print("Done (%i blocks written)" % (self.blkCountOut))
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 2:
|
||||||
print "Usage: linearize-data.py CONFIG-FILE"
|
print("Usage: linearize-data.py CONFIG-FILE")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
f = open(sys.argv[1])
|
f = open(sys.argv[1])
|
||||||
|
@ -216,22 +275,25 @@ if __name__ == '__main__':
|
||||||
settings['split_timestamp'] = 0
|
settings['split_timestamp'] = 0
|
||||||
if 'max_out_sz' not in settings:
|
if 'max_out_sz' not in settings:
|
||||||
settings['max_out_sz'] = 1000L * 1000 * 1000
|
settings['max_out_sz'] = 1000L * 1000 * 1000
|
||||||
|
if 'out_of_order_cache_sz' not in settings:
|
||||||
|
settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
|
||||||
|
|
||||||
settings['max_out_sz'] = long(settings['max_out_sz'])
|
settings['max_out_sz'] = long(settings['max_out_sz'])
|
||||||
settings['split_timestamp'] = int(settings['split_timestamp'])
|
settings['split_timestamp'] = int(settings['split_timestamp'])
|
||||||
settings['file_timestamp'] = int(settings['file_timestamp'])
|
settings['file_timestamp'] = int(settings['file_timestamp'])
|
||||||
settings['netmagic'] = settings['netmagic'].decode('hex')
|
settings['netmagic'] = settings['netmagic'].decode('hex')
|
||||||
|
settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
|
||||||
|
|
||||||
if 'output_file' not in settings and 'output' not in settings:
|
if 'output_file' not in settings and 'output' not in settings:
|
||||||
print("Missing output file / directory")
|
print("Missing output file / directory")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
blkindex = get_block_hashes(settings)
|
blkindex = get_block_hashes(settings)
|
||||||
blkset = mkblockset(blkindex)
|
blkmap = mkblockmap(blkindex)
|
||||||
|
|
||||||
if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkset:
|
if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkmap:
|
||||||
print("not found")
|
print("not found")
|
||||||
else:
|
else:
|
||||||
copydata(settings, blkindex, blkset)
|
BlockDataCopier(settings, blkindex, blkmap).run()
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue