#!/usr/bin/env python3 # # linearize-data.py: Construct a linear, no-fork version of the chain. # # Copyright (c) 2013-2016 The Bitcoin Core developers # Distributed under the MIT software license, see the accompanying # file COPYING or http://www.opensource.org/licenses/mit-license.php. # from __future__ import print_function, division try: # Python 3 import http.client as httplib except ImportError: # Python 2 import httplib import json import struct import re import os import os.path import base64 import sys import hashlib import datetime import time from collections import namedtuple from binascii import hexlify, unhexlify settings = {} ##### Switch endian-ness ##### def hex_switchEndian(s): """ Switches the endianness of a hex string (in pairs of hex chars) """ pairList = [s[i:i+2].encode() for i in range(0, len(s), 2)] return b''.join(pairList[::-1]).decode() def uint32(x): return x & 0xffffffff def bytereverse(x): return uint32(( ((x) << 24) | (((x) << 8) & 0x00ff0000) | (((x) >> 8) & 0x0000ff00) | ((x) >> 24) )) def bufreverse(in_buf): out_words = [] for i in range(0, len(in_buf), 4): word = struct.unpack('@I', in_buf[i:i+4])[0] out_words.append(struct.pack('@I', bytereverse(word))) return b''.join(out_words) def wordreverse(in_buf): out_words = [] for i in range(0, len(in_buf), 4): out_words.append(in_buf[i:i+4]) out_words.reverse() return b''.join(out_words) def calc_hdr_hash(blk_hdr): hash1 = hashlib.sha256() hash1.update(blk_hdr) hash1_o = hash1.digest() hash2 = hashlib.sha256() hash2.update(hash1_o) hash2_o = hash2.digest() return hash2_o def calc_hash_str(blk_hdr): hash = calc_hdr_hash(blk_hdr) hash = bufreverse(hash) hash = wordreverse(hash) hash_str = hexlify(hash).decode('utf-8') return hash_str def get_blk_dt(blk_hdr): members = struct.unpack("<I", blk_hdr[68:68+4]) nTime = members[0] dt = datetime.datetime.fromtimestamp(nTime) dt_ym = datetime.datetime(dt.year, dt.month, 1) return (dt_ym, nTime) # When getting the list of block hashes, undo any byte reversals. def get_block_hashes(settings): blkindex = [] f = open(settings['hashlist'], "r") for line in f: line = line.rstrip() if settings['rev_hash_bytes'] == 'true': line = hex_switchEndian(line) blkindex.append(line) print("Read " + str(len(blkindex)) + " hashes") return blkindex # The block map shouldn't give or receive byte-reversed hashes. def mkblockmap(blkindex): blkmap = {} for height,hash in enumerate(blkindex): blkmap[hash] = height return blkmap # Block header and extent on disk BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size']) class BlockDataCopier: def __init__(self, settings, blkindex, blkmap): self.settings = settings self.blkindex = blkindex self.blkmap = blkmap self.inFn = 0 self.inF = None self.outFn = 0 self.outsz = 0 self.outF = None self.outFname = None self.blkCountIn = 0 self.blkCountOut = 0 self.lastDate = datetime.datetime(2000, 1, 1) self.highTS = 1408893517 - 315360000 self.timestampSplit = False self.fileOutput = True self.setFileTime = False self.maxOutSz = settings['max_out_sz'] if 'output' in settings: self.fileOutput = False if settings['file_timestamp'] != 0: self.setFileTime = True if settings['split_timestamp'] != 0: self.timestampSplit = True # Extents and cache for out-of-order blocks self.blockExtents = {} self.outOfOrderData = {} self.outOfOrderSize = 0 # running total size for items in outOfOrderData def writeBlock(self, inhdr, blk_hdr, rawblock): blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock) if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz): self.outF.close() if self.setFileTime: os.utime(outFname, (int(time.time()), highTS)) self.outF = None self.outFname = None self.outFn = self.outFn + 1 self.outsz = 0 (blkDate, blkTS) = get_blk_dt(blk_hdr) if self.timestampSplit and (blkDate > self.lastDate): print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str) lastDate = blkDate if outF: outF.close() if setFileTime: os.utime(outFname, (int(time.time()), highTS)) self.outF = None self.outFname = None self.outFn = self.outFn + 1 self.outsz = 0 if not self.outF: if self.fileOutput: outFname = self.settings['output_file'] else: outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn) print("Output file " + outFname) self.outF = open(outFname, "wb") self.outF.write(inhdr) self.outF.write(blk_hdr) self.outF.write(rawblock) self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock) self.blkCountOut = self.blkCountOut + 1 if blkTS > self.highTS: self.highTS = blkTS if (self.blkCountOut % 1000) == 0: print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' % (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex))) def inFileName(self, fn): return os.path.join(self.settings['input'], "blk%05d.dat" % fn) def fetchBlock(self, extent): '''Fetch block contents from disk given extents''' with open(self.inFileName(extent.fn), "rb") as f: f.seek(extent.offset) return f.read(extent.size) def copyOneBlock(self): '''Find the next block to be written in the input, and copy it to the output.''' extent = self.blockExtents.pop(self.blkCountOut) if self.blkCountOut in self.outOfOrderData: # If the data is cached, use it from memory and remove from the cache rawblock = self.outOfOrderData.pop(self.blkCountOut) self.outOfOrderSize -= len(rawblock) else: # Otherwise look up data on disk rawblock = self.fetchBlock(extent) self.writeBlock(extent.inhdr, extent.blkhdr, rawblock) def run(self): while self.blkCountOut < len(self.blkindex): if not self.inF: fname = self.inFileName(self.inFn) print("Input file " + fname) try: self.inF = open(fname, "rb") except IOError: print("Premature end of block data") return inhdr = self.inF.read(8) if (not inhdr or (inhdr[0] == "\0")): self.inF.close() self.inF = None self.inFn = self.inFn + 1 continue inMagic = inhdr[:4] if (inMagic != self.settings['netmagic']): print("Invalid magic: " + hexlify(inMagic).decode('utf-8')) return inLenLE = inhdr[4:] su = struct.unpack("<I", inLenLE) inLen = su[0] - 80 # length without header blk_hdr = self.inF.read(80) inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen) hash_str = calc_hash_str(blk_hdr) if not hash_str in blkmap: print("Skipping unknown block " + hash_str) self.inF.seek(inLen, os.SEEK_CUR) continue blkHeight = self.blkmap[hash_str] self.blkCountIn += 1 if self.blkCountOut == blkHeight: # If in-order block, just copy rawblock = self.inF.read(inLen) self.writeBlock(inhdr, blk_hdr, rawblock) # See if we can catch up to prior out-of-order blocks while self.blkCountOut in self.blockExtents: self.copyOneBlock() else: # If out-of-order, skip over block data for now self.blockExtents[blkHeight] = inExtent if self.outOfOrderSize < self.settings['out_of_order_cache_sz']: # If there is space in the cache, read the data # Reading the data in file sequence instead of seeking and fetching it later is preferred, # but we don't want to fill up memory self.outOfOrderData[blkHeight] = self.inF.read(inLen) self.outOfOrderSize += inLen else: # If no space in cache, seek forward self.inF.seek(inLen, os.SEEK_CUR) print("Done (%i blocks written)" % (self.blkCountOut)) if __name__ == '__main__': if len(sys.argv) != 2: print("Usage: linearize-data.py CONFIG-FILE") sys.exit(1) f = open(sys.argv[1]) for line in f: # skip comment lines m = re.search('^\s*#', line) if m: continue # parse key=value lines m = re.search('^(\w+)\s*=\s*(\S.*)$', line) if m is None: continue settings[m.group(1)] = m.group(2) f.close() # Force hash byte format setting to be lowercase to make comparisons easier. # Also place upfront in case any settings need to know about it. if 'rev_hash_bytes' not in settings: settings['rev_hash_bytes'] = 'false' settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower() if 'netmagic' not in settings: settings['netmagic'] = 'f9beb4d9' if 'genesis' not in settings: settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f' if 'input' not in settings: settings['input'] = 'input' if 'hashlist' not in settings: settings['hashlist'] = 'hashlist.txt' if 'file_timestamp' not in settings: settings['file_timestamp'] = 0 if 'split_timestamp' not in settings: settings['split_timestamp'] = 0 if 'max_out_sz' not in settings: settings['max_out_sz'] = 1000 * 1000 * 1000 if 'out_of_order_cache_sz' not in settings: settings['out_of_order_cache_sz'] = 100 * 1000 * 1000 settings['max_out_sz'] = int(settings['max_out_sz']) settings['split_timestamp'] = int(settings['split_timestamp']) settings['file_timestamp'] = int(settings['file_timestamp']) settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8')) settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz']) if 'output_file' not in settings and 'output' not in settings: print("Missing output file / directory") sys.exit(1) blkindex = get_block_hashes(settings) blkmap = mkblockmap(blkindex) # Block hash map won't be byte-reversed. Neither should the genesis hash. if not settings['genesis'] in blkmap: print("Genesis block not found in hashlist") else: BlockDataCopier(settings, blkindex, blkmap).run()