Last active
October 18, 2017 09:42
-
-
Save seiteta/4b5fbc3d6a7123804d2d4060dba14517 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import docx | |
from subprocess import Popen, PIPE | |
def doc_to_txt(filename):
    '''
    Get the path of a Word document and return the text of this document.

    A ``.doc`` file is converted by running the external ``antiword``
    command and capturing its standard output. A ``.docx`` file is opened
    with ``docx.Document`` and the text of its paragraphs is joined with
    newlines. The extension check is case-insensitive.

    :param filename: The filename of the doc or docx document
    :type filename: str
    :return: The text of the document, or an empty string when the
        extension is neither .doc nor .docx
    :rtype: str

    :Example:

    >>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00097342.doc")
    'This is text from a .doc document'

    >>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00136930.docx")
    'This is text from a .docx document'
    '''
    lowered = filename.lower()

    if lowered.endswith(".doc"):
        print("Converting to txt the doc file:" + filename)
        # Argument list (no shell), so the filename is never shell-interpreted.
        process = Popen(['antiword', filename], stdout=PIPE)
        # stderr is not piped, so the second element of the pair is None.
        stdout, _ = process.communicate()
        # The converter's output is not guaranteed to be valid UTF-8;
        # replace undecodable bytes instead of raising UnicodeDecodeError.
        return stdout.decode(errors="replace")

    if lowered.endswith(".docx"):
        print("Converting to txt the docx file:" + filename)
        document = docx.Document(filename)
        return '\n'.join(para.text for para in document.paragraphs)

    print("Document extension should be either .doc or .docx")
    # Always return a str, as documented; the previous version leaked the
    # empty list used as an accumulator on this path.
    return ''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment