contrib: make linearize-data.py cope with out-of-order blocks

Make it possible to read blocks in any order. This will be required
after headers-first (#4468), so should be merged before that.

- Read block header. For expected blocks, continue, else skip.
- For in-order blocks: copy block contents directly. Write prior
  out-of-order blocks if this connects a consecutive span.
- For out-of-order blocks, store extents of block data for later
  retrieval. Cache out-of-order blocks in memory up to 100MB
  (configurable).
This commit is contained in:
Wladimir J. van der Laan 2014-10-06 17:55:55 +02:00
parent 5505a1b13f
commit aedc74dfa6
2 changed files with 154 additions and 90 deletions

View file

@ -15,3 +15,5 @@ output_file=/home/example/Downloads/bootstrap.dat
hashlist=hashlist.txt
split_year=1
# Maxmimum size in bytes of out-of-order blocks cache in memory
out_of_order_cache_sz = 100000000

View file

@ -2,11 +2,12 @@
#
# linearize-data.py: Construct a linear, no-fork version of the chain.
#
# Copyright (c) 2013 The Bitcoin developers
# Copyright (c) 2013-2014 The Bitcoin developers
# Distributed under the MIT/X11 software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
#
from __future__ import print_function, division
import json
import struct
import re
@ -17,10 +18,10 @@ import sys
import hashlib
import datetime
import time
from collections import namedtuple
settings = {}
def uint32(x):
return x & 0xffffffffL
@ -78,116 +79,174 @@ def get_block_hashes(settings):
return blkindex
def mkblockset(blkindex):
def mkblockmap(blkindex):
blkmap = {}
for hash in blkindex:
blkmap[hash] = True
for height,hash in enumerate(blkindex):
blkmap[hash] = height
return blkmap
def copydata(settings, blkindex, blkset):
inFn = 0
inF = None
outFn = 0
outsz = 0
outF = None
outFname = None
blkCount = 0
# Block header and extent on disk
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
lastDate = datetime.datetime(2000, 1, 1)
highTS = 1408893517 - 315360000
timestampSplit = False
fileOutput = True
setFileTime = False
maxOutSz = settings['max_out_sz']
if 'output' in settings:
fileOutput = False
if settings['file_timestamp'] != 0:
setFileTime = True
if settings['split_timestamp'] != 0:
timestampSplit = True
class BlockDataCopier:
def __init__(self, settings, blkindex, blkmap):
self.settings = settings
self.blkindex = blkindex
self.blkmap = blkmap
while True:
if not inF:
fname = "%s/blk%05d.dat" % (settings['input'], inFn)
print("Input file" + fname)
try:
inF = open(fname, "rb")
except IOError:
print "Done"
return
self.inFn = 0
self.inF = None
self.outFn = 0
self.outsz = 0
self.outF = None
self.outFname = None
self.blkCountIn = 0
self.blkCountOut = 0
inhdr = inF.read(8)
if (not inhdr or (inhdr[0] == "\0")):
inF.close()
inF = None
inFn = inFn + 1
continue
self.lastDate = datetime.datetime(2000, 1, 1)
self.highTS = 1408893517 - 315360000
self.timestampSplit = False
self.fileOutput = True
self.setFileTime = False
self.maxOutSz = settings['max_out_sz']
if 'output' in settings:
self.fileOutput = False
if settings['file_timestamp'] != 0:
self.setFileTime = True
if settings['split_timestamp'] != 0:
self.timestampSplit = True
# Extents and cache for out-of-order blocks
self.blockExtents = {}
self.outOfOrderData = {}
self.outOfOrderSize = 0 # running total size for items in outOfOrderData
inMagic = inhdr[:4]
if (inMagic != settings['netmagic']):
print("Invalid magic:" + inMagic)
return
inLenLE = inhdr[4:]
su = struct.unpack("<I", inLenLE)
inLen = su[0]
rawblock = inF.read(inLen)
blk_hdr = rawblock[:80]
hash_str = calc_hash_str(blk_hdr)
if not hash_str in blkset:
print("Skipping unknown block " + hash_str)
continue
if blkindex[blkCount] != hash_str:
print("Out of order block.")
print("Expected " + blkindex[blkCount])
print("Got " + hash_str)
sys.exit(1)
if not fileOutput and ((outsz + inLen) > maxOutSz):
outF.close()
if setFileTime:
def writeBlock(self, inhdr, blk_hdr, rawblock):
if not self.fileOutput and ((self.outsz + self.inLen) > self.maxOutSz):
self.outF.close()
if self.setFileTime:
os.utime(outFname, (int(time.time()), highTS))
outF = None
outFname = None
outFn = outFn + 1
outsz = 0
self.outF = None
self.outFname = None
self.outFn = outFn + 1
self.outsz = 0
(blkDate, blkTS) = get_blk_dt(blk_hdr)
if timestampSplit and (blkDate > lastDate):
if self.timestampSplit and (blkDate > self.lastDate):
print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
lastDate = blkDate
if outF:
outF.close()
if setFileTime:
os.utime(outFname, (int(time.time()), highTS))
outF = None
outFname = None
outFn = outFn + 1
outsz = 0
self.outF = None
self.outFname = None
self.outFn = self.outFn + 1
self.outsz = 0
if not outF:
if fileOutput:
outFname = settings['output_file']
if not self.outF:
if self.fileOutput:
outFname = self.settings['output_file']
else:
outFname = "%s/blk%05d.dat" % (settings['output'], outFn)
outFname = "%s/blk%05d.dat" % (self.settings['output'], outFn)
print("Output file" + outFname)
outF = open(outFname, "wb")
self.outF = open(outFname, "wb")
outF.write(inhdr)
outF.write(rawblock)
outsz = outsz + inLen + 8
self.outF.write(inhdr)
self.outF.write(blk_hdr)
self.outF.write(rawblock)
self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)
blkCount = blkCount + 1
if blkTS > highTS:
highTS = blkTS
self.blkCountOut = self.blkCountOut + 1
if blkTS > self.highTS:
self.highTS = blkTS
if (blkCount % 1000) == 0:
print("Wrote " + str(blkCount) + " blocks")
if (self.blkCountOut % 1000) == 0:
print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
(self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))
def inFileName(self, fn):
return "%s/blk%05d.dat" % (self.settings['input'], fn)
def fetchBlock(self, extent):
'''Fetch block contents from disk given extents'''
with open(self.inFileName(extent.fn), "rb") as f:
f.seek(extent.offset)
return f.read(extent.size)
def copyOneBlock(self):
'''Find the next block to be written in the input, and copy it to the output.'''
extent = self.blockExtents.pop(self.blkCountOut)
if self.blkCountOut in self.outOfOrderData:
# If the data is cached, use it from memory and remove from the cache
rawblock = self.outOfOrderData.pop(self.blkCountOut)
self.outOfOrderSize -= len(rawblock)
else: # Otherwise look up data on disk
rawblock = self.fetchBlock(extent)
self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)
def run(self):
while self.blkCountOut < len(self.blkindex):
if not self.inF:
fname = self.inFileName(self.inFn)
print("Input file" + fname)
try:
self.inF = open(fname, "rb")
except IOError:
print("Premature end of block data")
return
inhdr = self.inF.read(8)
if (not inhdr or (inhdr[0] == "\0")):
self.inF.close()
self.inF = None
self.inFn = self.inFn + 1
continue
inMagic = inhdr[:4]
if (inMagic != self.settings['netmagic']):
print("Invalid magic:" + inMagic)
return
inLenLE = inhdr[4:]
su = struct.unpack("<I", inLenLE)
inLen = su[0] - 80 # length without header
blk_hdr = self.inF.read(80)
inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
hash_str = calc_hash_str(blk_hdr)
if not hash_str in blkmap:
print("Skipping unknown block " + hash_str)
self.inF.seek(inLen, os.SEEK_CUR)
continue
blkHeight = self.blkmap[hash_str]
self.blkCountIn += 1
if self.blkCountOut == blkHeight:
# If in-order block, just copy
rawblock = self.inF.read(inLen)
self.writeBlock(inhdr, blk_hdr, rawblock)
# See if we can catch up to prior out-of-order blocks
while self.blkCountOut in self.blockExtents:
self.copyOneBlock()
else: # If out-of-order, skip over block data for now
self.blockExtents[blkHeight] = inExtent
if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
# If there is space in the cache, read the data
# Reading the data in file sequence instead of seeking and fetching it later is preferred,
# but we don't want to fill up memory
self.outOfOrderData[blkHeight] = self.inF.read(inLen)
self.outOfOrderSize += inLen
else: # If no space in cache, seek forward
self.inF.seek(inLen, os.SEEK_CUR)
print("Done (%i blocks written)" % (self.blkCountOut))
if __name__ == '__main__':
if len(sys.argv) != 2:
print "Usage: linearize-data.py CONFIG-FILE"
print("Usage: linearize-data.py CONFIG-FILE")
sys.exit(1)
f = open(sys.argv[1])
@ -216,22 +275,25 @@ if __name__ == '__main__':
settings['split_timestamp'] = 0
if 'max_out_sz' not in settings:
settings['max_out_sz'] = 1000L * 1000 * 1000
if 'out_of_order_cache_sz' not in settings:
settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
settings['max_out_sz'] = long(settings['max_out_sz'])
settings['split_timestamp'] = int(settings['split_timestamp'])
settings['file_timestamp'] = int(settings['file_timestamp'])
settings['netmagic'] = settings['netmagic'].decode('hex')
settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
if 'output_file' not in settings and 'output' not in settings:
print("Missing output file / directory")
sys.exit(1)
blkindex = get_block_hashes(settings)
blkset = mkblockset(blkindex)
blkmap = mkblockmap(blkindex)
if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkset:
if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkmap:
print("not found")
else:
copydata(settings, blkindex, blkset)
BlockDataCopier(settings, blkindex, blkmap).run()