seiteta · October 18, 2017 09:42
diff --git a/read_word.py b/read_word.py
 import docx
 from subprocess import Popen, PIPE

 def doc_to_txt(filename):
     """
     Return the plain text contained in a Word document.

     The format is chosen from the file extension (case-insensitive):

     * ``.doc`` files are converted by running the external ``antiword``
       command and decoding what it writes to standard output.
     * ``.docx`` files are opened with ``docx.Document`` and the text of
       their paragraphs is joined with newlines.

     A progress message is printed for each conversion, and a warning is
     printed when the extension is not supported.

     :param filename: The path of the .doc or .docx document
     :type filename: str
     :return: The text of the document, or an empty string when the
         extension is neither .doc nor .docx
     :rtype: str

     :Example:

     >>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00097342.doc")  # doctest: +SKIP
     'This is text from a .doc document'
     >>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00136930.docx")  # doctest: +SKIP
     'This is text from a .docx document'

     """
     lowered = filename.lower()

     if lowered.endswith(".doc"):
         print("Converting to txt the doc file:" + filename)
         process = Popen(['antiword', filename], stdout=PIPE)
         # Only stdout is piped, so the second item of the pair is always None.
         stdout, _ = process.communicate()
         return stdout.decode()

     if lowered.endswith(".docx"):
         print("Converting to txt the docx file:" + filename)
         document = docx.Document(filename)
         return '\n'.join(para.text for para in document.paragraphs)

     print("Document extension should be either .doc or .docx")
     # Always hand back a str, as the documented return type promises;
     # an empty string stays falsy for callers that test the result.
     return ''
	import docx
	from subprocess import Popen, PIPE

	def doc_to_txt(filename):
		"""
		Return the plain text contained in a Word document.

		The format is chosen from the file extension (case-insensitive):

		* ``.doc`` files are converted by running the external ``antiword``
		  command and decoding what it writes to standard output.
		* ``.docx`` files are opened with ``docx.Document`` and the text of
		  their paragraphs is joined with newlines.

		A progress message is printed for each conversion, and a warning is
		printed when the extension is not supported.

		:param filename: The path of the .doc or .docx document
		:type filename: str
		:return: The text of the document, or an empty string when the
			extension is neither .doc nor .docx
		:rtype: str

		:Example:

		>>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00097342.doc")  # doctest: +SKIP
		'This is text from a .doc document'
		>>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00136930.docx")  # doctest: +SKIP
		'This is text from a .docx document'

		"""
		lowered = filename.lower()

		if lowered.endswith(".doc"):
			print("Converting to txt the doc file:" + filename)
			process = Popen(['antiword', filename], stdout=PIPE)
			# Only stdout is piped, so the second item of the pair is always None.
			stdout, _ = process.communicate()
			return stdout.decode()

		if lowered.endswith(".docx"):
			print("Converting to txt the docx file:" + filename)
			document = docx.Document(filename)
			return '\n'.join(para.text for para in document.paragraphs)

		print("Document extension should be either .doc or .docx")
		# Always hand back a str, as the documented return type promises;
		# an empty string stays falsy for callers that test the result.
		return ''
No results found