Last active
September 25, 2018 01:41
-
-
Save pjatx/eeaf842259b618403a9cd649b11014c1 to your computer and use it in GitHub Desktop.
Rename resume pdf as first email address found
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import global broad stuff | |
import os | |
import os.path | |
import shutil | |
import re | |
from optparse import OptionParser | |
# Import PDF Miner specific stuff to use as library | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import TextConverter | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfpage import PDFPage | |
from io import StringIO | |
# Global Variables
# Resumes are read from ./to-process and renamed copies are written to
# ./processed (both relative to the current working directory).
src_dir = os.path.join(os.curdir, 'to-process')
dst_dir = os.path.join(os.curdir, 'processed')

# Regex to find emails.
# Written with raw strings so that '\/', '\.' and '\s' reach the regex engine
# unchanged instead of being (invalid) Python string escapes.
# Group 1 is the whole address; groups 2 and 3 capture the '@' / ' at ' and
# '.' / ' dot ' separators, so obfuscated forms such as
# "name at example dot com" are matched too. The character classes are
# lowercase-only and no IGNORECASE flag is set.
regex = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
    r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
    r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"
)
# Helper Functions
def copy_rename(old_file_name, new_file_name):
    """Copy a file from ``src_dir`` into ``dst_dir`` under a new name.

    The source file is left in place. ``dst_dir`` is created when it does
    not exist yet; without that, ``shutil.copy`` would create a plain file
    called like the directory instead of copying into it.

    The file is copied straight to its final path rather than copied under
    its old name and renamed afterwards, so an unrelated file in ``dst_dir``
    that happens to share the old name is never overwritten. An existing
    file at the final path is replaced.

    Args:
        old_file_name: Name of the file inside ``src_dir``.
        new_file_name: Name the copy gets inside ``dst_dir``.

    Raises:
        OSError: If the directory cannot be created or the copy fails.
    """
    src_file = os.path.join(src_dir, old_file_name)
    new_dst_file_name = os.path.join(dst_dir, new_file_name)
    os.makedirs(dst_dir, exist_ok=True)
    shutil.copy(src_file, new_dst_file_name)
def get_emails(s):
    """Return an iterator over the email addresses found in string ``s``.

    The module-level ``regex`` only matches lowercase characters, so the
    text is lowercased before matching. Otherwise an address such as
    ``John.Doe@Example.com`` would be missed or truncated. Addresses are
    therefore returned in lowercase, in order of appearance.

    Args:
        s: Text to search.

    Returns:
        A generator yielding each matched address as a string.
    """
    matches = re.findall(regex, s.lower())
    # findall returns one tuple per match; element 0 is the full address.
    # Matches starting with '//' are dropped because the pattern mistakenly
    # picks up URLs with credentials, e.g. 'http://user@host.com' is matched
    # as '//user@host.com'.
    return (match[0] for match in matches if not match[0].startswith('//'))
def convert_pdf_to_txt(path, pages=None):
    """Extract the text of a PDF file using pdfminer.

    Args:
        path: Path of the PDF file to read.
        pages: Optional iterable of page numbers; it is converted to a set
            and handed to ``PDFPage.get_pages``. When omitted or empty, an
            empty set is passed (presumably meaning "all pages" -- confirm
            against the pdfminer documentation).

    Returns:
        Everything the ``TextConverter`` wrote, as a single string.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    pagenums = set(pages) if pages else set()

    # Context managers / finally guarantee that the file handle, the
    # converter and the buffer are released even if parsing raises.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the ``with`` block closes it.
        return output.getvalue()
# Iterate through the PDF files in the source directory (to-process),
# parse each one with pdfminer, and file a copy in the destination
# directory (processed) named after the first email address found.
def main():
    """Rename every resume PDF in ``src_dir`` after its first email address.

    For each ``.pdf`` file (extension matched case-insensitively) the text
    is extracted and searched for email addresses. Files without an address
    are skipped and left in place. Otherwise the file is copied into
    ``dst_dir`` as ``<first email>.pdf`` and the original is removed.

    The original is removed only after the copy succeeded, so a failed copy
    never loses the source file. A summary of how many PDFs were filed out
    of how many were seen is printed at the end.
    """
    processed = 0  # PDFs successfully copied and removed from src_dir
    total = 0      # PDFs encountered in src_dir

    for filename in os.listdir(src_dir):
        if not filename.lower().endswith('.pdf'):
            continue
        total += 1

        path = os.path.join(src_dir, filename)
        parsed = convert_pdf_to_txt(path)
        first_email = next(get_emails(parsed), None)
        if first_email is None:
            print(path, '\t', 'No email addresses found', '\t', 'skipped...')
            continue

        print(path, '\t', first_email)
        print("Copying...")
        new_name = first_email + '.pdf'
        print(new_name)

        try:
            copy_rename(filename, new_name)
            # Only reached when the copy succeeded.
            os.remove(path)
        except OSError as err:
            print("OS error: {0}".format(err))
            continue
        except ValueError as err:
            # e.g. a file name the os / shutil functions refuse to handle.
            print("Invalid file name: {0}".format(err))
            continue
        except Exception as err:
            print("Unexpected error:", type(err))
            raise

        print("Done.")
        processed += 1

    print("All set!", '\t', str(processed), '/', str(total), ' resumes processed')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment