Skip to content

Instantly share code, notes, and snippets.

@askulkarni2
Last active April 24, 2024 23:28
Show Gist options
  • Save askulkarni2/3d1883c43c59a04afacaac7a9ffbfcc4 to your computer and use it in GitHub Desktop.
Save askulkarni2/3d1883c43c59a04afacaac7a9ffbfcc4 to your computer and use it in GitHub Desktop.
import os
import docx
from datetime import datetime
from multiprocessing import Pool
def extract_paragraphs(args):
    """Collect the paragraphs of one Word document that mention a keyword.

    Args:
        args: A ``(docx_file, keywords)`` pair packed into one value so the
            function can be used as a single-argument worker callable.
            ``docx_file`` is passed to ``docx.Document``; ``keywords`` is an
            iterable of substrings to look for.

    Returns:
        A list of ``(paragraph_text, created)`` tuples, one per paragraph
        whose text contains at least one keyword (plain, case-sensitive
        substring test). ``created`` is the document's
        ``core_properties.created`` value.
    """
    path, needles = args
    document = docx.Document(path)
    hits = []
    for paragraph in document.paragraphs:
        body = paragraph.text
        # any() stops at the first matching keyword, so a paragraph is
        # recorded at most once even if several keywords occur in it.
        if any(needle in body for needle in needles):
            hits.append((body, document.core_properties.created))
    return hits
def process_documents(directory, keywords, num_processes):
    """Search every Word document in ``directory`` for keywords in parallel.

    Args:
        directory: Folder whose immediate entries are scanned (no recursion).
        keywords: Substrings forwarded to ``extract_paragraphs`` for each file.
        num_processes: Number of worker processes for the pool.

    Returns:
        A list with one element per processed document, in sorted file-name
        order; each element is the value ``extract_paragraphs`` returned for
        that document. An empty list when there is nothing to process.
    """
    candidates = [
        os.path.join(directory, name)
        # os.listdir order is arbitrary; sort so results are reproducible.
        for name in sorted(os.listdir(directory))
        # Word leaves "~$name.docx" lock files beside open documents. They
        # carry the .docx suffix but are not valid packages, so skip them.
        if name.lower().endswith('.docx') and not name.startswith('~$')
    ]
    # A directory can also be named "*.docx"; keep regular files only.
    docx_files = [path for path in candidates if os.path.isfile(path)]
    if not docx_files:
        # Nothing to do: avoid spawning worker processes for an empty batch.
        return []
    with Pool(processes=num_processes) as pool:
        args = [(file, keywords) for file in docx_files]
        return pool.map(extract_paragraphs, args)
if __name__ == "__main__":
    # Search configuration: the phrases to look for and the folder to scan.
    search_terms = ["EKS Blueprints"]
    source_dir = "2x2s"
    worker_count = 10  # Number of concurrent processes to use

    # One list of (text, created) matches per document.
    per_document = process_documents(source_dir, search_terms, worker_count)

    # Report every match, document by document, in the order returned.
    for matches in per_document:
        for paragraph, creation_date in matches:
            print(f"Paragraph: {paragraph}\nCreation Date: {creation_date}\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment