contrib/linearize: Add feature to set file's timestamp based on block header time.

This commit is contained in:
Jeff Garzik 2014-08-24 11:37:14 -04:00 committed by Wladimir J. van der Laan
parent 8f5a423344
commit 399cdbc700
No known key found for this signature in database
GPG key ID: 74810B012346C9A6
2 changed files with 37 additions and 18 deletions

View file

@ -27,6 +27,7 @@ output.
Optional config file setting for linearize-data: Optional config file setting for linearize-data:
* "netmagic": network magic number * "netmagic": network magic number
* "max_out_sz": maximum output file size (default 1000*1000*1000) * "max_out_sz": maximum output file size (default 1000*1000*1000)
* "split_year": Split files when a new year is first seen, in addition to * "split_timestamp": Split files when a new month is first seen, in addition to
reaching a maximum file size. reaching a maximum file size.
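* "file_timestamp": When a file is closed, set its last-modified time to the
highest block header timestamp seen so far (a running maximum, since block
times are not strictly increasing), i.e. normally that of the most recent
block in that file.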

View file

@ -10,11 +10,13 @@
import json import json
import struct import struct
import re import re
import os
import base64 import base64
import httplib import httplib
import sys import sys
import hashlib import hashlib
import datetime import datetime
import time
settings = {} settings = {}
@ -60,9 +62,10 @@ def calc_hash_str(blk_hdr):
def get_blk_dt(blk_hdr): def get_blk_dt(blk_hdr):
members = struct.unpack("<I", blk_hdr[68:68+4]) members = struct.unpack("<I", blk_hdr[68:68+4])
dt = datetime.datetime.fromtimestamp(members[0]) nTime = members[0]
dt = datetime.datetime.fromtimestamp(nTime)
dt_ym = datetime.datetime(dt.year, dt.month, 1) dt_ym = datetime.datetime(dt.year, dt.month, 1)
return dt_ym return (dt_ym, nTime)
def get_block_hashes(settings): def get_block_hashes(settings):
blkindex = [] blkindex = []
@ -87,14 +90,19 @@ def copydata(settings, blkindex, blkset):
outFn = 0 outFn = 0
outsz = 0 outsz = 0
outF = None outF = None
outFname = None
blkCount = 0 blkCount = 0
lastDate = datetime.datetime(2000, 1, 1) lastDate = datetime.datetime(2000, 1, 1)
highTS = 1408893517 - 315360000
timestampSplit = False timestampSplit = False
fileOutput = True fileOutput = True
setFileTime = False
maxOutSz = settings['max_out_sz'] maxOutSz = settings['max_out_sz']
if 'output' in settings: if 'output' in settings:
fileOutput = False fileOutput = False
if settings['file_timestamp'] != 0:
setFileTime = True
if settings['split_timestamp'] != 0: if settings['split_timestamp'] != 0:
timestampSplit = True timestampSplit = True
@ -134,34 +142,41 @@ def copydata(settings, blkindex, blkset):
if not fileOutput and ((outsz + inLen) > maxOutSz): if not fileOutput and ((outsz + inLen) > maxOutSz):
outF.close() outF.close()
if setFileTime:
os.utime(outFname, (int(time.time()), highTS))
outF = None outF = None
outFname = None
outFn = outFn + 1 outFn = outFn + 1
outsz = 0 outsz = 0
if timestampSplit: (blkDate, blkTS) = get_blk_dt(blk_hdr)
blkDate = get_blk_dt(blk_hdr) if timestampSplit and (blkDate > lastDate):
if blkDate > lastDate: print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str) lastDate = blkDate
lastDate = blkDate if outF:
if outF: outF.close()
outF.close() if setFileTime:
outF = None os.utime(outFname, (int(time.time()), highTS))
outFn = outFn + 1 outF = None
outsz = 0 outFname = None
outFn = outFn + 1
outsz = 0
if not outF: if not outF:
if fileOutput: if fileOutput:
fname = settings['output_file'] outFname = settings['output_file']
else: else:
fname = "%s/blk%05d.dat" % (settings['output'], outFn) outFname = "%s/blk%05d.dat" % (settings['output'], outFn)
print("Output file" + fname) print("Output file" + outFname)
outF = open(fname, "wb") outF = open(outFname, "wb")
outF.write(inhdr) outF.write(inhdr)
outF.write(rawblock) outF.write(rawblock)
outsz = outsz + inLen + 8 outsz = outsz + inLen + 8
blkCount = blkCount + 1 blkCount = blkCount + 1
if blkTS > highTS:
highTS = blkTS
if (blkCount % 1000) == 0: if (blkCount % 1000) == 0:
print("Wrote " + str(blkCount) + " blocks") print("Wrote " + str(blkCount) + " blocks")
@ -191,6 +206,8 @@ if __name__ == '__main__':
settings['input'] = 'input' settings['input'] = 'input'
if 'hashlist' not in settings: if 'hashlist' not in settings:
settings['hashlist'] = 'hashlist.txt' settings['hashlist'] = 'hashlist.txt'
if 'file_timestamp' not in settings:
settings['file_timestamp'] = 0
if 'split_timestamp' not in settings: if 'split_timestamp' not in settings:
settings['split_timestamp'] = 0 settings['split_timestamp'] = 0
if 'max_out_sz' not in settings: if 'max_out_sz' not in settings:
@ -198,6 +215,7 @@ if __name__ == '__main__':
settings['max_out_sz'] = long(settings['max_out_sz']) settings['max_out_sz'] = long(settings['max_out_sz'])
settings['split_timestamp'] = int(settings['split_timestamp']) settings['split_timestamp'] = int(settings['split_timestamp'])
settings['file_timestamp'] = int(settings['file_timestamp'])
settings['netmagic'] = settings['netmagic'].decode('hex') settings['netmagic'] = settings['netmagic'].decode('hex')
if 'output_file' not in settings and 'output' not in settings: if 'output_file' not in settings and 'output' not in settings: