Created
October 11, 2018 22:27
-
-
Save tsudoko/46faae62bbb7c58d033e3d688aca617e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import collections | |
import os | |
import sys | |
# One opened input index: `sep` is the single byte read from the very start of
# the file and used to split its lines, `fields` is the tuple of field names
# decoded from its header line, `file` is the open binary file object.
CDXFile = collections.namedtuple("CDXFile", ["sep", "fields", "file"])

# CDX field letters whose absence from an input's header triggers the
# "missing fields for warc-dedup" warning in main().
# NOTE(review): presumably the CDX letters for original URL (a), checksum (k)
# and record URN (u) -- confirm against the CDX legend.
essential_fields = frozenset("aku")

# Set to False to silence the warc-dedup warning.
warn_warc_dedup = True
def say(*args):
    """Print a diagnostic to stderr, prefixed with this program's name.

    The positional arguments are forwarded to ``print`` and therefore joined
    with single spaces after the ``<program>:`` prefix.
    """
    # print() already puts a space between its arguments, so the prefix must
    # not end in one of its own (that produced "prog:  message").
    prefix = "{}:".format(os.path.basename(sys.argv[0]))
    print(prefix, *args, file=sys.stderr)
def main(*cdx, dest=sys.stdout.buffer):
    """Merge the CDX index files named in *cdx* into one CDX stream.

    Every input must begin with a one-byte separator, the magic ``CDX `` and
    a header line naming its fields. Only single-character field names that
    occur in every input are kept; they are emitted in the order in which
    the first input's header lists them. The merged header and all entries
    are written to *dest* with single spaces between fields, the inputs
    being appended one after another in the order given. A path passed more
    than once is merged once. With no inputs only an empty header is written.

    Args:
        *cdx: Paths of the CDX files to merge.
        dest: Binary writable stream that receives the merged index. The
            default is the ``sys.stdout.buffer`` object that existed when
            this function was defined.

    Raises:
        SystemExit: With status 1 when an input does not carry the
            ``CDX `` magic.
        OSError: When an input cannot be opened or read.
        KeyError: When an entry line has no value for one of the shared
            fields.
        IndexError: When an entry line has more values than its file's
            header has field names.
    """
    files = {}
    handles = []
    # Start from every single-character name in the 0-255 range and narrow it
    # down with each header; longer or empty names can therefore never be
    # part of the output.
    commonfields = set(chr(x) for x in range(256))
    first_fields = None

    try:
        for filename in cdx:
            if filename in files:
                # Re-opening the same path would only replace (and leak) the
                # handle opened first.
                continue

            handle = open(filename, "rb")
            # Track the handle right away so the finally clause closes it
            # even when the header below turns out to be invalid.
            handles.append(handle)

            sep = handle.read(1)
            if sep != b" ":
                # Having multiple different separators in the input files
                # makes stream processing much harder: it is not known
                # beforehand whether a field in one file contains the
                # separator of another. An escaping mechanism would solve
                # this, but CDX does not appear to have one.
                # NOTE(review): the magic below is compared against b"CDX "
                # literally, so such a file presumably fails that check too.
                say("{}: separator is not space: {}".format(filename, sep))

            magic = handle.read(4)
            if magic != b"CDX ":
                say("{}: invalid cdx magic: {}".format(filename, magic))
                sys.exit(1)

            header = handle.readline().rstrip().split(sep)
            fields = tuple(name.decode() for name in header)
            commonfields.intersection_update(fields)
            if first_fields is None:
                first_fields = fields
            files[filename] = CDXFile(sep=sep, fields=fields, file=handle)

        # Use the first header's order (duplicates dropped) so the output
        # layout is the same on every run instead of following set order.
        orderedfields = []
        for name in first_fields or ():
            if name in commonfields and name not in orderedfields:
                orderedfields.append(name)

        dest.write(b" CDX ")
        dest.write(b" ".join(name.encode() for name in orderedfields))
        dest.write(b"\n")

        for c in files.values():
            # Iterate the file lazily; an index can be far larger than memory.
            for line in c.file:
                stripped = line.rstrip()
                if not stripped:
                    # A blank line carries no entry.
                    continue
                entry = {
                    c.fields[i]: value
                    for i, value in enumerate(stripped.split(c.sep))
                }
                dest.write(b" ".join(entry[name] for name in orderedfields))
                dest.write(b"\n")
        dest.flush()

        if warn_warc_dedup:
            for filename, c in files.items():
                essential_missing = essential_fields - set(c.fields)
                if essential_missing:
                    say("{}: warning: missing fields for warc-dedup: {}".format(filename, set(essential_missing)))
    finally:
        for handle in handles:
            handle.close()
# Script entry point: merge the CDX files named on the command line.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # The usage text goes to stderr because stdout is the default
        # destination of the merged index; sys.exit is used because the bare
        # exit() helper only exists when the site module has been loaded.
        print("usage: {} cdxfile...".format(os.path.basename(sys.argv[0])), file=sys.stderr)
        sys.exit(1)
    main(*sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment