Created
June 2, 2026 20:11
-
-
Save me-suzy/3f0c2f9ded83cf0c58c850b545f388fd to your computer and use it in GitHub Desktop.
convert_articles RO in docx si in pdf 2026
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from fpdf import FPDF, HTMLMixin | |
| from docx import Document | |
| from docx.shared import Pt, RGBColor | |
| from docx.enum.text import WD_ALIGN_PARAGRAPH | |
| from docx.oxml.ns import qn | |
| from docx.oxml import OxmlElement | |
| import copy | |
| import os | |
| import re | |
| from PyPDF2 import PdfMerger | |
| # pip uninstall --yes pypdf && pip install --upgrade fpdf2 | |
| # pip install python-docx | |
def read_text_from_file(file_path):
    """Return the whole content of ``file_path`` decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped (``errors='ignore'``) instead
    of raising, so the returned text can be shorter than the file on disk.
    """
    # The ``with`` statement closes the handle on exit; an explicit close()
    # inside it is redundant.
    with open(file_path, encoding='utf8', errors='ignore') as f:
        return f.read()
def write_to_file(text, file_path):
    """Write ``text`` to ``file_path`` as UTF-8, replacing any existing file.

    Characters that cannot be encoded (for example lone surrogates) are
    dropped because the encoder runs with the ``'ignore'`` error handler.
    """
    # Binary mode keeps the bytes exactly as encoded (no newline translation).
    # The ``with`` statement closes the handle; no explicit close() is needed.
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))
# Substitutions applied to extracted titles and paragraphs before they are
# rendered: each key found in the text is replaced by its value.
#
# The previous literal repeated several keys; in a dict literal the last
# occurrence silently wins, so the table below lists every key once with the
# value that was actually in effect.
#
# NOTE(review): the keys look like already-decoded characters. If the upstream
# table was keyed by HTML entities, restore those keys; as written, the
# identity entries are no-ops, 'â' is flattened to plain 'a', and every '&' is
# removed from the text. Confirm that this is intended.
dict_simboluri = {
    'ă': 'ă',
    'â': 'a',
    'ã': 'ã',
    ' ': ' ',
    'î': 'î',
    'Î': 'Î',
    'ș': 'ș',
    'Ș': 'Ș',
    'Ş': 'Ş',
    'ş': 'ș',  # s with cedilla -> s with comma below
    '&': '',
    'ț': 'ț',
    'ţ': 'ț',  # t with cedilla -> t with comma below
    'Ţ': 'Ţ',
    '”': '"',
    '“': '"',
}
def fix_html(content):
    """Insert a ``</p>`` between a ``</span>`` and a following ``<p``.

    Whitespace between the two tags is kept in front of the inserted
    ``</p>``. Text without that tag sequence is returned unchanged.
    """
    def _with_closing_p(match):
        # group(1) is the whitespace found between "</span>" and "<p".
        return '</span>' + match.group(1) + '</p><p'

    return re.sub(r'</span>(\s*)<p', _with_closing_p, content)
def html_to_docx_paragraph(doc, html_content, bold_paragraph=False):
    """Append one justified paragraph to ``doc`` built from an HTML fragment.

    ``<img>`` tags are dropped, then the fragment is cut at the inline tags
    ``<b>``, ``<strong>``, ``<i>``, ``<em>``, ``<span>`` and ``<a>``. Every
    non-empty piece becomes a 12 pt run:

    * ``<b>`` / ``<strong>`` pieces are bold;
    * ``<i>`` / ``<em>`` pieces are italic;
    * ``<span>`` pieces are bold when they mention ``text_obisnuit2``;
    * ``<a>`` pieces and plain text only inherit ``bold_paragraph``.

    Any tag still left inside a piece is stripped. Returns the new paragraph.
    """
    inline_tags = re.compile(
        r'(<b>.*?</b>|<strong>.*?</strong>|<i>.*?</i>|<em>.*?</em>|<span[^>]*>.*?</span>|<a[^>]*>.*?</a>)',
        flags=re.DOTALL,
    )

    paragraph = doc.add_paragraph()
    without_images = re.sub('<img[^>]*>', '', html_content)

    for fragment in inline_tags.split(without_images):
        if not fragment:
            continue

        bold = bold_paragraph
        italic = False
        if fragment.startswith(('<b>', '<strong>')):
            bold = True
            inner = re.sub(r'</?b>|</?strong>', '', fragment)
        elif fragment.startswith(('<i>', '<em>')):
            italic = True
            inner = re.sub(r'</?i>|</?em>', '', fragment)
        elif fragment.startswith('<span'):
            if 'text_obisnuit2' in fragment:
                bold = True
            inner = re.sub(r'<span[^>]*>|</span>', '', fragment)
        elif fragment.startswith('<a'):
            inner = re.sub(r'<a[^>]*>|</a>', '', fragment)
        else:
            inner = fragment

        # Whatever markup survived the targeted removal above goes here.
        inner = re.sub('<[^>]+>', '', inner)
        if not inner:
            continue

        run = paragraph.add_run(inner)
        run.bold = bold
        run.italic = italic
        run.font.size = Pt(12)

    paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    return paragraph
# Directory holding the Kanit TTF faces registered for every generated PDF.
_KANIT_FONT_DIR = (
    "e:/Carte/BB/17 - Site Leadership/alte/Ionel Balauta/Aryeht/"
    "Task 1 - Traduce tot site-ul/Doar Google Web/Andreea/Meditatii/"
    "Sedinta 20 august 2022/fonts"
)

# (fpdf style, file name) for each registered Kanit face.
_KANIT_FACES = (
    ("", "Kanit-Regular.ttf"),
    ("B", "Kanit-Bold.ttf"),
    ("I", "Kanit-Italic.ttf"),
    ("BI", "Kanit-BoldItalic.ttf"),
)

# (paragraph CSS class, fpdf font style); the first class found in a
# paragraph decides how it is rendered.
_PDF_PARAGRAPH_STYLES = (
    ("text_obisnuit", ""),
    ("text_obisnuit2", "B"),
)


def _clean_pdf_text(raw):
    """Apply ``dict_simboluri``, strip HTML tags and collapse whitespace."""
    for simbol, inlocuitor in dict_simboluri.items():
        raw = raw.replace(simbol, inlocuitor)
    raw = re.sub('<img[^>]*>', '', raw)
    raw = re.sub('<[^>]+>', '', raw)
    return ' '.join(raw.split())


def _render_article_pdf(root, file_name, font_dir):
    """Render one HTML article to a PDF next to it; return True if written."""
    if "webinar" in file_name:
        print(f"Fișierul {file_name} conține 'webinar' în numele său și va fi ignorat.")
        return False

    file_path = os.path.join(root, file_name)
    file_content = read_text_from_file(file_path)
    if "https://pastebin.com" in file_content:
        print(f"Fișierul {file_name} conține 'https://pastebin.com' în conținutul său și va fi ignorat.")
        return False

    file_content = fix_html(file_content)
    if '<!-- ARTICOL START -->' not in file_content:
        return False

    # Defined here so that fpdf is only touched when a PDF is really produced.
    class PDF(FPDF, HTMLMixin):
        pass

    pdf = PDF()
    pdf.add_page()
    for style, ttf_name in _KANIT_FACES:
        pdf.add_font("Kanit", style=style, fname=f"{font_dir}/{ttf_name}", uni=True)
    pdf.set_font("Kanit", size=24)

    # Title: red, bold, 14 pt.
    title = re.search('<h1 class="den_articol" itemprop="name">(.*?)</h1>', file_content)
    if title is None:
        print("Nu am gasit --- denumire articol --- in fisierul --- {} ---.".format(file_path))
    else:
        den_articol = title.group(1)
        for simbol, inlocuitor in dict_simboluri.items():
            den_articol = den_articol.replace(simbol, inlocuitor)
        pdf.set_text_color(204, 0, 0)
        pdf.set_font('Kanit', size=14, style="B")
        pdf.multi_cell(w=190, txt=den_articol, align='J')
        pdf.ln()
    pdf.set_font('Kanit', size=12)

    # Publication date: blue, bold, 8 pt, followed by blank lines.
    date = re.search('<td class="text_dreapta">(.*?), in <a', file_content)
    if date is None:
        print("Nu am gasit --- date --- in fisierul --- {} ---.".format(file_path))
    else:
        pdf.set_text_color(0, 102, 204)
        pdf.set_font('Kanit', size=8, style="B")
        pdf.cell(txt=date.group(1))
        for _ in range(4):
            pdf.ln()
    # Body text is always black 12 pt, whether or not a date was found.
    pdf.set_text_color(0, 0, 0)
    pdf.set_font('Kanit', size=12)

    articol = re.search(r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->', file_content)
    if articol is None:
        print("Nu am gasit --- ARTICOL START/FINAL --- in fisierul --- {} ---.".format(file_path))
    else:
        # The published listing showed the first literal entity-decoded into
        # three consecutive double quotes (a syntax error); "&quot;" is the
        # assumed original. NOTE(review): confirm against the author's copy.
        body = articol.group(1).replace("&quot;", "\"").replace("’", "'")
        pars = re.findall(r'<p class="text_obisnuit.*?">.*?</p>', body)
        if not pars:
            print("Nu am gasit -- paragrafe text_obisnuit -- in fisierul --- {} ---.".format(file_path))
        for par in pars:
            for css_class, font_style in _PDF_PARAGRAPH_STYLES:
                if f'<p class="{css_class}">' not in par:
                    continue
                found = re.search(f'<p class="{css_class}">(.*?)</p>', par)
                if found is None:
                    print("Nu am gasit text in paragraful {}, fisierul {}.".format(par, file_path))
                else:
                    pdf.set_font('Kanit', size=12, style=font_style)
                    pdf.multi_cell(w=190, txt=_clean_pdf_text(found.group(1)), align='J')
                    pdf.ln()
                    pdf.set_font('Kanit', size=12)
                break

    # Footer with a clickable link back to the article.
    # For the English site the URL prefix is https://neculaifantanaru.com/en/
    pdf.ln()
    pdf.ln()
    pdf.set_font('Kanit', size=12, style="B")
    pdf.cell(txt="Source:")
    pdf.set_font('Kanit', size=12)
    pdf.set_text_color(0, 102, 204)
    url = "https://neculaifantanaru.com/{}".format(file_name)
    pdf.cell(w=40, txt=url, link=url)

    # splitext only removes the final ".html"; splitting on the first "."
    # would truncate any path that contains another dot.
    pdf.output(os.path.splitext(file_path)[0] + '.pdf')
    return True


def save_to_pdf(directory_path, font_dir=_KANIT_FONT_DIR):
    """Create a PDF next to every eligible ``.html`` file under ``directory_path``.

    The tree is walked recursively. A file is skipped when its name contains
    ``webinar``, when its content contains ``https://pastebin.com``, or when
    it has no ``<!-- ARTICOL START -->`` marker. For the others the title,
    date and ``text_obisnuit`` / ``text_obisnuit2`` paragraphs are written to
    ``<name>.pdf`` followed by a "Source:" link.

    Args:
        directory_path: root folder to scan.
        font_dir: folder containing the four Kanit ``.ttf`` faces; defaults
            to the location previously hard-coded in this function.

    Any exception raised for one file is printed and the batch continues.
    A summary of the converted files is printed at the end.
    """
    modified_files = []
    for root, _dirs, files in os.walk(directory_path):
        for file_name in files:
            if not file_name.endswith(".html"):
                continue
            try:
                if _render_article_pdf(root, file_name, font_dir):
                    modified_files.append(file_name)
                    print(f"Fișierul {len(modified_files)}: {file_name} a fost modificat.")
            except Exception as e:
                # Deliberate best-effort: one broken article must not stop
                # the rest of the batch.
                print(f"Eroare la procesarea fișierului {file_name}: {e}")

    print(f"\nAu fost modificate {len(modified_files)} fișiere:")
    for i, file_name in enumerate(modified_files, 1):
        print(f"{i}. {file_name}")
# (paragraph CSS class, render whole paragraph bold); the first class found
# in a paragraph decides how it is rendered.
_DOCX_PARAGRAPH_STYLES = (
    ("text_obisnuit", False),
    ("text_obisnuit2", True),
)


def _render_article_docx(root, file_name):
    """Render one HTML article to a DOCX next to it; return True if written."""
    if "webinar" in file_name:
        print(f"Fișierul {file_name} conține 'webinar' în numele său și va fi ignorat (DOCX).")
        return False

    file_path = os.path.join(root, file_name)
    file_content = read_text_from_file(file_path)
    if "https://pastebin.com" in file_content:
        print(f"Fișierul {file_name} conține 'https://pastebin.com' în conținutul său și va fi ignorat (DOCX).")
        return False

    file_content = fix_html(file_content)
    if '<!-- ARTICOL START -->' not in file_content:
        return False

    doc = Document()

    # Title: red, bold, 14 pt (an empty title paragraph is kept when the
    # heading is missing).
    title = re.search('<h1 class="den_articol" itemprop="name">(.*?)</h1>', file_content)
    if title is None:
        print("Nu am gasit --- denumire articol --- in fisierul --- {} --- (DOCX).".format(file_path))
        title_text = ""
    else:
        title_text = title.group(1)
        for simbol, inlocuitor in dict_simboluri.items():
            title_text = title_text.replace(simbol, inlocuitor)
        title_text = re.sub('<[^>]+>', '', title_text)
    title_para = doc.add_paragraph()
    title_para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    title_run = title_para.add_run(title_text)
    title_run.bold = True
    title_run.font.size = Pt(14)
    title_run.font.color.rgb = RGBColor(204, 0, 0)

    # Publication date: blue, bold, 8 pt.
    date = re.search('<td class="text_dreapta">(.*?), in <a', file_content)
    if date is None:
        print("Nu am gasit --- date --- in fisierul --- {} --- (DOCX).".format(file_path))
    else:
        date_para = doc.add_paragraph()
        date_para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        date_run = date_para.add_run(date.group(1))
        date_run.bold = True
        date_run.font.size = Pt(8)
        date_run.font.color.rgb = RGBColor(0, 102, 204)
    doc.add_paragraph()  # spacing

    # Article body.
    articol = re.search(r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->', file_content)
    if articol is None:
        print("Nu am gasit --- ARTICOL START/FINAL --- in fisierul --- {} --- (DOCX).".format(file_path))
    else:
        # The published listing showed the first literal entity-decoded into
        # three consecutive double quotes (a syntax error); "&quot;" is the
        # assumed original. NOTE(review): confirm against the author's copy.
        body = articol.group(1).replace("&quot;", "\"").replace("’", "'")
        pars = re.findall(r'<p class="text_obisnuit.*?">.*?</p>', body)
        if not pars:
            print("Nu am gasit -- paragrafe text_obisnuit -- in fisierul --- {} --- (DOCX).".format(file_path))
        for par in pars:
            for css_class, bold in _DOCX_PARAGRAPH_STYLES:
                if f'<p class="{css_class}">' not in par:
                    continue
                found = re.search(f'<p class="{css_class}">(.*?)</p>', par)
                if found is not None:
                    text = found.group(1)
                    for simbol, inlocuitor in dict_simboluri.items():
                        text = text.replace(simbol, inlocuitor)
                    html_to_docx_paragraph(doc, text, bold_paragraph=bold)
                break

    # Footer naming the article URL.
    doc.add_paragraph()
    source_para = doc.add_paragraph()
    source_para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    bold_run = source_para.add_run("Source: ")
    bold_run.bold = True
    bold_run.font.size = Pt(12)
    url_run = source_para.add_run("https://neculaifantanaru.com/{}".format(file_name))
    url_run.font.size = Pt(12)
    url_run.font.color.rgb = RGBColor(0, 102, 204)

    # splitext only removes the final ".html"; splitting on the first "."
    # would truncate any path that contains another dot.
    doc.save(os.path.splitext(file_path)[0] + '.docx')
    return True


def save_to_docx(directory_path):
    """Create a DOCX next to every eligible ``.html`` file under ``directory_path``.

    The tree is walked recursively. A file is skipped when its name contains
    ``webinar``, when its content contains ``https://pastebin.com``, or when
    it has no ``<!-- ARTICOL START -->`` marker. For the others the title,
    date and ``text_obisnuit`` / ``text_obisnuit2`` paragraphs are written to
    ``<name>.docx`` followed by a "Source:" line.

    Any exception raised for one file is printed and the batch continues.
    A summary of the processed files is printed at the end.
    """
    modified_files = []
    for root, _dirs, files in os.walk(directory_path):
        for file_name in files:
            if not file_name.endswith(".html"):
                continue
            try:
                if _render_article_docx(root, file_name):
                    modified_files.append(file_name)
                    print(f"Fișierul DOCX {len(modified_files)}: {file_name} a fost procesat.")
            except Exception as e:
                # Deliberate best-effort: one broken article must not stop
                # the rest of the batch.
                print(f"Eroare la procesarea DOCX fișierului {file_name}: {e}")

    print(f"\nAu fost procesate {len(modified_files)} fișiere DOCX:")
    for i, file_name in enumerate(modified_files, 1):
        print(f"{i}. {file_name}")
def merge_pdf_files(directory_path):
    """Merge every PDF under ``directory_path`` into ``articles.pdf``.

    The tree is walked recursively and the files of each folder are taken in
    sorted name order. The merged file is written directly inside
    ``directory_path``; once it has been written, the individual PDFs are
    deleted. Files already named ``articles.pdf`` are neither merged nor
    deleted, so a result from an earlier run is not folded into itself.

    When no PDF is found nothing is written and a message is printed.
    """
    output_name = "articles.pdf"
    pdf_files = []
    for root, _dirs, files in os.walk(directory_path):
        for file_name in sorted(files):
            if file_name.endswith(".pdf") and file_name != output_name:
                print("PDF: ", file_name)
                pdf_files.append(os.path.join(root, file_name))

    if not pdf_files:
        print(f"Nu am gasit fisiere PDF in {directory_path}.")
        return

    # Anchor the result to the requested folder, not to whichever folder
    # os.walk happened to visit last.
    output_path = os.path.join(directory_path, output_name)
    merger = PdfMerger()
    try:
        for file_path in pdf_files:
            merger.append(file_path)
        merger.write(output_path)
    finally:
        # Release the source file handles even when merging fails.
        merger.close()

    # Reached only after a successful write, so no input is lost on error.
    for pdf_file in pdf_files:
        os.remove(pdf_file)
        print(f"Fișierul {pdf_file} a fost șters.")
def merge_docx_files(directory_path):
    """Merge every DOCX under ``directory_path`` into ``articles.docx``.

    The tree is walked recursively and the files of each folder are taken in
    sorted name order; files named ``articles.docx`` are ignored. The body of
    each document is copied into a new master document with a page break
    between consecutive articles. The master is saved directly inside
    ``directory_path`` and the individual DOCX files are then deleted.

    Only body XML is copied. NOTE(review): parts referenced from the body
    (images, hyperlink relationships) are not carried over - fine for
    text-only documents, verify for anything richer.

    When no DOCX is found nothing is written and a message is printed.
    """
    output_name = "articles.docx"
    docx_files = []
    for root, _dirs, files in os.walk(directory_path):
        for file_name in sorted(files):
            if file_name.endswith(".docx") and file_name != output_name:
                print("DOCX: ", file_name)
                docx_files.append(os.path.join(root, file_name))

    if not docx_files:
        print(f"Nu am gasit fisiere DOCX in {directory_path}.")
        return

    section_tag = qn('w:sectPr')
    master = Document()
    body = master.element.body

    # Empty the template body but keep its section properties: a body must
    # end with a single w:sectPr, so it is neither removed here nor copied
    # from the source documents below.
    for elem in list(body):
        if elem.tag != section_tag:
            body.remove(elem)
    section_props = body.find(section_tag)

    def add_to_master(elem):
        # New content always goes in front of the trailing w:sectPr.
        if section_props is None:
            body.append(elem)
        else:
            section_props.addprevious(elem)

    for index, file_path in enumerate(docx_files):
        if index:
            # Page break between articles.
            p = OxmlElement('w:p')
            r = OxmlElement('w:r')
            br = OxmlElement('w:br')
            br.set(qn('w:type'), 'page')
            r.append(br)
            p.append(r)
            add_to_master(p)
        for element in Document(file_path).element.body:
            if element.tag == section_tag:
                continue
            add_to_master(copy.deepcopy(element))

    output_path = os.path.join(directory_path, output_name)
    master.save(output_path)
    print(f"\nFișierul articles.docx a fost salvat la: {output_path}")

    for docx_file in docx_files:
        os.remove(docx_file)
        print(f"Fișierul {docx_file} a fost șters.")
# Script entry: convert the articles in one folder to PDF and merge them,
# then do the same for DOCX. The order matters - each merge step consumes the
# files produced by the conversion step just before it.
_target_dir = "c:\\Folder9\\"
for _step in (save_to_pdf, merge_pdf_files, save_to_docx, merge_docx_files):
    _step(_target_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment