RuizSerra · September 10, 2022 09:02
diff --git a/instapaper_import.py b/instapaper_import.py
 """
 Convert bookmark exports from PaperSpan, Diigo, and Pocket into a CSV file that
 Instapaper can import.

 After running the script, browse to https://www.instapaper.com/user and select
 "Import from Instapaper CSV" to upload the generated file.
 """

 import datetime
 import csv
 import re

 # Destination of the converted bookmarks: the last section of this script opens
 # this path for writing and emits the CSV rows there.
 OUTPUT_CSV = '/Users/foobar/Downloads/instapaper-import.csv'

 # ----------------------------------------------------------------------------
 INPUT_PAPERSPAN = '/Users/foobar/Downloads/ps_export.html'

 # Load the PaperSpan HTML export; it is processed one line at a time.
 with open(INPUT_PAPERSPAN, 'r') as f:
    html = f.readlines()

 # Keep only the list-item link lines, skipping any that carry a 'time_read'
 # attribute (presumably articles already read -- confirm against a real export).
 entries = [
    line.strip()
    for line in html
    if '<li><a ' in line and 'time_read' not in line
 ]
 pat = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)">(.+?)<\/a><\/li>')

 output = []
 for line in entries:
    found = pat.match(line)
    if found is None:
        # Echo lines the pattern cannot parse so they can be handled by hand.
        print(line)
        continue
    url, timestamp, title = found.groups()
    row = {
        'URL': url,
        'Title': title,
        'Selection': '',
        'Folder': 'Imported',
        'Timestamp': timestamp,
    }
    output.append(row)

 print(f'Formatted {len(output)} entries.')

 # ----------------------------------------------------------------------------
 INPUT_DIIGO = '/Users/jaime/Downloads/diigo-export.csv'

 # Only the `url` and `title` columns of the Diigo export are used; every row is
 # stamped with the time of this import. The stamp is computed once, outside the
 # loop, so all Diigo rows share the same value.
 # NOTE(review): this stamp is in milliseconds, while the other sections copy the
 # export's `time_added` value verbatim -- confirm which unit Instapaper expects.
 import_timestamp = int(datetime.datetime.now().timestamp() * 1000)

 # newline='' lets the csv module handle line endings embedded in quoted fields.
 with open(INPUT_DIIGO, 'r', newline='') as f:
    diigo_rows = [
        {'URL': r['url'], 'Title': r['title'], 'Selection': '',
         'Folder': 'Imported', 'Timestamp': import_timestamp}
        for r in csv.DictReader(f)
    ]

 # Append to the rows collected so far instead of rebinding `output` to a new
 # list: the final CSV is written from `output`, so resetting it here would
 # silently drop every row gathered before this section.
 output.extend(diigo_rows)
 print(f'Formatted {len(diigo_rows)} entries.')

 # ----------------------------------------------------------------------------
 INPUT_POCKET = '/Users/jaime/Downloads/pocket-export.html'

 # Load the Pocket HTML export; it is scanned one stripped line at a time.
 with open(INPUT_POCKET, 'r') as f:
    html = f.readlines()

 entries = [l.strip() for l in html]
 pat_read = re.compile(r'<h1>Read Archive</h1>')
 # The tags group uses [^"]* rather than a greedy .* so that a title containing
 # `">` cannot be swallowed into the tags capture.
 pat = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)" tags="([^"]*)">(.+?)<\/a><\/li>')

 folder = 'Imported'
 pocket_rows = []
 for l in entries:
    if pat_read.match(l):
        # Every bookmark after this heading is filed under 'Archive'. The
        # heading itself is not a bookmark, so it is not reported as NO MATCH.
        folder = 'Archive'
        continue

    result = pat.match(l)
    if not result:
        print('NO MATCH', l)
        continue
    # Tags are parsed but not carried over: the output rows have no tag column.
    url, timestamp, _tags, title = result.groups()
    pocket_rows.append({'URL': url, 'Title': title, 'Selection': '',
                        'Folder': folder, 'Timestamp': timestamp})

 # Append to the rows collected so far instead of rebinding `output` to a new
 # list: the final CSV is written from `output`, so resetting it here would
 # silently drop every row gathered before this section.
 output.extend(pocket_rows)
 print(f'Formatted {len(pocket_rows)} entries.')

 # ----------------------------------------------------------------------------
 # Column order of the generated CSV. Spelled out explicitly (rather than read
 # from output[0]) so that an empty `output` yields a header-only file instead of
 # an IndexError; the names match the keys of every row dict built above.
 fieldnames = ['URL', 'Title', 'Selection', 'Folder', 'Timestamp']

 # newline='' is required by the csv module so that it controls line endings
 # itself; without it, extra blank rows appear on platforms that translate '\n'.
 with open(OUTPUT_CSV, 'w', newline='') as f:
    w = csv.DictWriter(f, fieldnames)
    w.writeheader()
    w.writerows(output)

 print(f'Formatting complete. See output file {OUTPUT_CSV}')
	"""
	Convert bookmark exports from PaperSpan, Diigo, and Pocket into a CSV file that
	Instapaper can import.

	After running the script, browse to https://www.instapaper.com/user and select
	"Import from Instapaper CSV" to upload the generated file.
	"""

	import datetime
	import csv
	import re

	# Destination of the converted bookmarks: the last section of this script opens
	# this path for writing and emits the CSV rows there.
	OUTPUT_CSV = '/Users/foobar/Downloads/instapaper-import.csv'

	# ----------------------------------------------------------------------------
	INPUT_PAPERSPAN = '/Users/foobar/Downloads/ps_export.html'

	# Load the PaperSpan HTML export; it is processed one line at a time.
	with open(INPUT_PAPERSPAN, 'r') as f:
	    html = f.readlines()

	# Keep only the list-item link lines, skipping any that carry a 'time_read'
	# attribute (presumably articles already read -- confirm against a real export).
	entries = [
	    line.strip()
	    for line in html
	    if '<li><a ' in line and 'time_read' not in line
	]
	pat = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)">(.+?)<\/a><\/li>')

	output = []
	for line in entries:
	    found = pat.match(line)
	    if found is None:
	        # Echo lines the pattern cannot parse so they can be handled by hand.
	        print(line)
	        continue
	    url, timestamp, title = found.groups()
	    row = {
	        'URL': url,
	        'Title': title,
	        'Selection': '',
	        'Folder': 'Imported',
	        'Timestamp': timestamp,
	    }
	    output.append(row)

	print(f'Formatted {len(output)} entries.')

	# ----------------------------------------------------------------------------
	INPUT_DIIGO = '/Users/jaime/Downloads/diigo-export.csv'

	# Only the `url` and `title` columns of the Diigo export are used; every row is
	# stamped with the time of this import. The stamp is computed once, outside the
	# loop, so all Diigo rows share the same value.
	# NOTE(review): this stamp is in milliseconds, while the other sections copy the
	# export's `time_added` value verbatim -- confirm which unit Instapaper expects.
	import_timestamp = int(datetime.datetime.now().timestamp() * 1000)

	# newline='' lets the csv module handle line endings embedded in quoted fields.
	with open(INPUT_DIIGO, 'r', newline='') as f:
	    diigo_rows = [
	        {'URL': r['url'], 'Title': r['title'], 'Selection': '',
	         'Folder': 'Imported', 'Timestamp': import_timestamp}
	        for r in csv.DictReader(f)
	    ]

	# Append to the rows collected so far instead of rebinding `output` to a new
	# list: the final CSV is written from `output`, so resetting it here would
	# silently drop every row gathered before this section.
	output.extend(diigo_rows)
	print(f'Formatted {len(diigo_rows)} entries.')

	# ----------------------------------------------------------------------------
	INPUT_POCKET = '/Users/jaime/Downloads/pocket-export.html'

	# Load the Pocket HTML export; it is scanned one stripped line at a time.
	with open(INPUT_POCKET, 'r') as f:
	    html = f.readlines()

	entries = [l.strip() for l in html]
	pat_read = re.compile(r'<h1>Read Archive</h1>')
	# The tags group uses [^"]* rather than a greedy .* so that a title containing
	# `">` cannot be swallowed into the tags capture.
	pat = re.compile(r'<li><a href="(.+?)" time_added="(\d+?)" tags="([^"]*)">(.+?)<\/a><\/li>')

	folder = 'Imported'
	pocket_rows = []
	for l in entries:
	    if pat_read.match(l):
	        # Every bookmark after this heading is filed under 'Archive'. The
	        # heading itself is not a bookmark, so it is not reported as NO MATCH.
	        folder = 'Archive'
	        continue

	    result = pat.match(l)
	    if not result:
	        print('NO MATCH', l)
	        continue
	    # Tags are parsed but not carried over: the output rows have no tag column.
	    url, timestamp, _tags, title = result.groups()
	    pocket_rows.append({'URL': url, 'Title': title, 'Selection': '',
	                        'Folder': folder, 'Timestamp': timestamp})

	# Append to the rows collected so far instead of rebinding `output` to a new
	# list: the final CSV is written from `output`, so resetting it here would
	# silently drop every row gathered before this section.
	output.extend(pocket_rows)
	print(f'Formatted {len(pocket_rows)} entries.')

	# ----------------------------------------------------------------------------
	# Column order of the generated CSV. Spelled out explicitly (rather than read
	# from output[0]) so that an empty `output` yields a header-only file instead of
	# an IndexError; the names match the keys of every row dict built above.
	fieldnames = ['URL', 'Title', 'Selection', 'Folder', 'Timestamp']

	# newline='' is required by the csv module so that it controls line endings
	# itself; without it, extra blank rows appear on platforms that translate '\n'.
	with open(OUTPUT_CSV, 'w', newline='') as f:
	    w = csv.DictWriter(f, fieldnames)
	    w.writeheader()
	    w.writerows(output)

	print(f'Formatting complete. See output file {OUTPUT_CSV}')
No results found