Last active
September 10, 2022 09:02
-
-
Save RuizSerra/e463bf9d79166825379cdba8c6d9b251 to your computer and use it in GitHub Desktop.
PaperSpan to Instapaper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Given exports from PaperSpan, Diigo, and Pocket, format them for import into Instapaper.
Run the script, then browse to https://www.instapaper.com/user and select "Import from Instapaper CSV".
"""
import csv
import datetime
import re
OUTPUT_CSV = '/Users/foobar/Downloads/instapaper-import.csv'
# ----------------------------------------------------------------------------
# PaperSpan: the HTML export is processed line by line; each bookmark is
# expected to sit on its own `<li><a href=... time_added=...>` line.
INPUT_PAPERSPAN = '/Users/foobar/Downloads/ps_export.html'

# Decode explicitly as UTF-8 so non-ASCII titles do not depend on the
# platform's default encoding.
with open(INPUT_PAPERSPAN, 'r', encoding='utf-8') as f:
    # Keep bookmark lines only, skipping any that mention `time_read`
    # (presumably items that were already read -- confirm against an export).
    entries = [
        line.strip() for line in f
        if '<li><a ' in line and 'time_read' not in line
    ]

pat = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)">(.+?)<\/a><\/li>')
output = []
for line in entries:
    result = pat.match(line)
    if not result:
        # Echo bookmark lines the pattern cannot parse so they can be
        # handled by hand instead of being dropped silently.
        print(line)
        continue
    url, timestamp, title = result.groups()
    output.append({'URL': url, 'Title': title, 'Selection': '',
                   'Folder': 'Imported', 'Timestamp': timestamp})
print(f'Formatted {len(output)} entries.')
# ---------------------------------------------------------------------------- | |
# Diigo: CSV export; only its `url` and `title` columns are read.
INPUT_DIIGO = '/Users/jaime/Downloads/diigo-export.csv'

# newline='' is what the csv module requires so quoted fields containing line
# breaks are parsed correctly; utf-8-sig also tolerates a leading BOM.
with open(INPUT_DIIGO, 'r', newline='', encoding='utf-8-sig') as f:
    diigo_rows = [
        {
            'URL': row['url'],
            'Title': row['title'],
            'Selection': '',
            'Folder': 'Imported',
            # No timestamp is taken from the export, so each row is stamped
            # with the current time in milliseconds.
            # NOTE(review): the other sources pass `time_added` through
            # unchanged; confirm whether Instapaper expects seconds here.
            'Timestamp': int(datetime.datetime.now().timestamp() * 1000),
        }
        for row in csv.DictReader(f)
    ]

# Add to the rows already collected in `output` instead of rebinding it to a
# fresh list, which would silently discard the PaperSpan entries.
output.extend(diigo_rows)
print(f'Formatted {len(diigo_rows)} entries.')
# ---------------------------------------------------------------------------- | |
# Pocket: the HTML export is processed line by line. Entries found after the
# `<h1>Read Archive</h1>` heading are filed under 'Archive'; earlier ones
# under 'Imported'.
INPUT_POCKET = '/Users/jaime/Downloads/pocket-export.html'

# Decode explicitly as UTF-8 so non-ASCII titles do not depend on the
# platform's default encoding.
with open(INPUT_POCKET, 'r', encoding='utf-8') as f:
    entries = [line.strip() for line in f]

pat_read = re.compile(r'<h1>Read Archive</h1>')
# The tags group is non-greedy so it stops at the first `">`; a greedy group
# would swallow part of any title that itself contains `">`.
pat = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)" tags="(.*?)">(.+?)<\/a><\/li>')
folder = 'Imported'
pocket_rows = []
for line in entries:
    if pat_read.match(line):
        # Everything from this heading onwards goes to the archive folder.
        folder = 'Archive'
    result = pat.match(line)
    if not result:
        # Report every line that is not a parseable bookmark entry.
        print('NO MATCH', line)
        continue
    # Tags are captured by the pattern but not carried into the output rows.
    url, timestamp, _tags, title = result.groups()
    pocket_rows.append({'URL': url, 'Title': title, 'Selection': '',
                        'Folder': folder, 'Timestamp': timestamp})

# Add to the rows already collected in `output` instead of rebinding it to a
# fresh list, which would silently discard the entries from earlier sources.
output.extend(pocket_rows)
print(f'Formatted {len(pocket_rows)} entries.')
# ---------------------------------------------------------------------------- | |
# Write the collected rows as the CSV to upload to Instapaper.
# The column names are spelled out rather than read from `output[0]`, so an
# empty result still produces a header-only file instead of an IndexError.
FIELDNAMES = ['URL', 'Title', 'Selection', 'Folder', 'Timestamp']

# newline='' lets the csv module control line endings (avoids blank rows on
# Windows); UTF-8 keeps non-ASCII titles intact on every platform.
with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=FIELDNAMES)
    w.writeheader()
    w.writerows(output)
print(f'Formatting complete. See output file {OUTPUT_CSV}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment