Skip to content

Instantly share code, notes, and snippets.

@seiteta
Last active October 18, 2017 09:42
Show Gist options
  • Save seiteta/4b5fbc3d6a7123804d2d4060dba14517 to your computer and use it in GitHub Desktop.
Save seiteta/4b5fbc3d6a7123804d2d4060dba14517 to your computer and use it in GitHub Desktop.
import docx
from subprocess import Popen, PIPE
def doc_to_txt(filename):
    """
    Get the path of a Word document and return the text of this document.

    The extension is matched case-insensitively. ``.doc`` files are converted
    by running the external ``antiword`` command and decoding what it writes
    to standard output; ``.docx`` files are opened with ``docx.Document`` and
    the text of their paragraphs is joined with newlines. For any other
    extension a message is printed and an empty string is returned, so the
    result is always a ``str``.

    :param filename: The filename of the doc or docx document
    :type filename: str
    :return: The text of the document, or ``''`` when the extension is
        neither .doc nor .docx
    :rtype: str

    :Example:

    >>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00097342.doc")
    'This is text from a .doc document'
    >>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00136930.docx")
    'This is text from a .docx document'
    """
    lowered = filename.lower()

    if lowered.endswith(".doc"):
        print("Converting to txt the doc file:" + filename)
        # Argument list (no shell), so the filename is passed to antiword
        # verbatim. The context manager closes the stdout pipe on exit.
        with Popen(['antiword', filename], stdout=PIPE) as process:
            stdout, _ = process.communicate()
        # NOTE(review): decoded with the default codec (UTF-8); presumably
        # antiword emits UTF-8 in the target environment - confirm.
        return stdout.decode()

    if lowered.endswith(".docx"):
        print("Converting to txt the docx file:" + filename)
        document = docx.Document(filename)
        return '\n'.join(para.text for para in document.paragraphs)

    print("Document extension should be either .doc or .docx")
    # Return an empty string (not an empty list) to honour the documented
    # str return type; it is still falsy for callers that test the result.
    return ''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment