Last active
April 24, 2024 23:28
-
-
Save askulkarni2/3d1883c43c59a04afacaac7a9ffbfcc4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import docx | |
from datetime import datetime | |
from multiprocessing import Pool | |
def extract_paragraphs(args):
    """Collect the paragraphs of one .docx file that mention any keyword.

    The inputs arrive packed in a single tuple so the function can be handed
    directly to ``Pool.map``, which passes exactly one argument per task.

    Args:
        args: A ``(docx_file, keywords)`` pair - the document to open with
            ``docx.Document`` and the substrings to look for.

    Returns:
        A list of ``(paragraph_text, created)`` tuples, one per paragraph
        whose text contains at least one keyword (plain, case-sensitive
        ``in`` test), in document order. ``created`` is the document's
        ``core_properties.created`` value.
    """
    path, needles = args
    document = docx.Document(path)
    # Read each paragraph's text once, then filter on it.
    texts = (block.text for block in document.paragraphs)
    return [
        (text, document.core_properties.created)
        for text in texts
        if any(needle in text for needle in needles)
    ]
def process_documents(directory, keywords, num_processes):
    """Scan the .docx files of a directory for keyword paragraphs in parallel.

    Only the immediate entries of ``directory`` are considered (no recursion).

    Args:
        directory: Folder containing the documents.
        keywords: Substrings forwarded to ``extract_paragraphs`` for each file.
        num_processes: Number of worker processes used by the pool.

    Returns:
        A list with one entry per processed document, ordered by file name;
        each entry is the list returned by ``extract_paragraphs`` for that
        document. Empty when the directory holds no eligible document.
    """
    candidates = sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        # "~$name.docx" entries are the owner/lock files Word leaves beside an
        # open document; they are not valid packages and would make a worker
        # raise, aborting the whole map.
        if name.endswith('.docx') and not name.startswith('~$')
    )
    # A directory can also be named "*.docx"; only regular files are openable.
    docx_files = [path for path in candidates if os.path.isfile(path)]

    # Nothing to do: skip the cost of spawning worker processes.
    if not docx_files:
        return []

    with Pool(processes=num_processes) as pool:
        args = [(path, keywords) for path in docx_files]
        return pool.map(extract_paragraphs, args)
if __name__ == "__main__":
    # Search settings: where to look, what to look for, and how many workers.
    directory = "2x2s"
    keywords = ["EKS Blueprints"]
    num_processes = 10  # Number of concurrent processes to use

    all_paragraphs = process_documents(directory, keywords, num_processes)

    # The result holds one list of matches per document; walk them in order
    # and report every matching paragraph with its document's creation date.
    for matches in all_paragraphs:
        for paragraph, creation_date in matches:
            print(f"Paragraph: {paragraph}\nCreation Date: {creation_date}\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment