Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save me-suzy/3f0c2f9ded83cf0c58c850b545f388fd to your computer and use it in GitHub Desktop.

Select an option

Save me-suzy/3f0c2f9ded83cf0c58c850b545f388fd to your computer and use it in GitHub Desktop.
convert_articles RO in docx si in pdf 2026
from fpdf import FPDF, HTMLMixin
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
import copy
import os
import re
from PyPDF2 import PdfMerger
# pip uninstall --yes pypdf && pip install --upgrade fpdf2
# pip install python-docx
def read_text_from_file(file_path):
    """Return the full contents of *file_path* decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped (``errors='ignore'``) instead
    of raising ``UnicodeDecodeError``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(file_path, encoding='utf8', errors='ignore') as f:
        return f.read()
def write_to_file(text, file_path):
    """Write *text* to *file_path* as UTF-8 bytes, replacing any existing file.

    Characters that cannot be encoded as UTF-8 (for example lone surrogates)
    are dropped rather than raising ``UnicodeEncodeError``.

    Args:
        text: The string to write.
        file_path: Destination path; the file is created or truncated.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Binary mode keeps the bytes exactly as encoded (no newline translation).
    # The context manager closes the file; no explicit close() is needed.
    with open(file_path, 'wb') as f:
        f.write(text.encode('utf8', 'ignore'))
# Substring -> replacement table. save_to_pdf and save_to_docx apply it with
# str.replace to article titles and paragraph text before rendering.
#
# As rendered here, several keys appear more than once. In a dict literal the
# last occurrence of a key wins, so e.g. the final 'â': 'a' entry overrides the
# earlier 'â': 'â' entries. Many entries also map a character to itself, which
# makes them no-ops. The '&': '' entry removes every ampersand.
# NOTE(review): the keys may originally have been HTML entities or mis-encoded
# byte sequences that were decoded when this file was extracted -- confirm
# against the original source before relying on or "cleaning up" this table.
dict_simboluri = {
'ă': 'ă', 'â': 'â', 'ã': 'ã', 'â': 'â', 'ă': 'ă', 'â': 'a', ' ': ' ', 'î': 'î',
'Î': 'Î', 'î': 'î', 'î': 'î', 'Î': 'Î', 'Î': 'Î', ' ': ' ', 'ș': 'ș', 'Ș': 'Ș',
'Ş': 'Ş', 'ș': 'ș', 'ş': 'ș', '&': '', 'ț': 'ț', 'ţ': 'ț', 'Ţ': 'Ţ', 'ț': 'ț',
'”': '"', '“': '"'
}
def fix_html(content):
    """Insert a closing ``</p>`` wherever ``</span>`` is directly followed by ``<p``.

    Any whitespace between the ``</span>`` and the ``<p`` is kept in place,
    ahead of the inserted ``</p>``.

    Args:
        content: HTML text to patch.

    Returns:
        The patched HTML text.
    """
    def close_paragraph(match):
        # Rebuild the matched text with "</p>" placed just before the "<p".
        return '</span>' + match.group(1) + '</p><p'

    return re.sub(r'</span>(\s*)<p', close_paragraph, content)
def html_to_docx_paragraph(doc, html_content, bold_paragraph=False):
    """Append one justified paragraph to *doc*, built from an inline-HTML fragment.

    ``<img>`` tags are dropped. The fragment is split on ``<b>``, ``<strong>``,
    ``<i>``, ``<em>``, ``<span ...>`` and ``<a ...>`` elements; each non-empty
    piece becomes a 12pt run:

    * ``<b>`` / ``<strong>`` pieces are bold,
    * ``<i>`` / ``<em>`` pieces are italic,
    * ``<span>`` pieces are bold when they mention ``text_obisnuit2``,
    * ``<a>`` pieces keep only their text.

    Any other tags left in a piece are stripped.

    Args:
        doc: Document object providing ``add_paragraph()``.
        html_content: Inner HTML of the source paragraph.
        bold_paragraph: Initial bold value for every run in the paragraph.

    Returns:
        The newly added paragraph.
    """
    target = doc.add_paragraph()
    markup = re.sub('<img[^>]*>', '', html_content)

    # The capture group makes re.split keep the inline elements in its result.
    inline_splitter = r'(<b>.*?</b>|<strong>.*?</strong>|<i>.*?</i>|<em>.*?</em>|<span[^>]*>.*?</span>|<a[^>]*>.*?</a>)'
    for fragment in re.split(inline_splitter, markup, flags=re.DOTALL):
        if not fragment:
            continue

        bold = bold_paragraph
        italic = False
        plain = fragment
        if fragment.startswith(('<b>', '<strong>')):
            bold = True
            plain = re.sub(r'</?b>|</?strong>', '', fragment)
        elif fragment.startswith(('<i>', '<em>')):
            italic = True
            plain = re.sub(r'</?i>|</?em>', '', fragment)
        elif fragment.startswith('<span'):
            if 'text_obisnuit2' in fragment:
                bold = True
            plain = re.sub(r'<span[^>]*>|</span>', '', fragment)
        elif fragment.startswith('<a'):
            plain = re.sub(r'<a[^>]*>|</a>', '', fragment)

        # Whatever markup is still left (nested or unknown tags) is removed.
        plain = re.sub('<[^>]+>', '', plain)
        if not plain:
            continue

        run = target.add_run(plain)
        run.bold = bold
        run.italic = italic
        run.font.size = Pt(12)

    target.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    return target
def save_to_pdf(directory_path, font_dir=None, base_url="https://neculaifantanaru.com/"):
    """Render every article ``.html`` file under *directory_path* to a PDF.

    The tree is walked recursively. A file is skipped when its name contains
    ``webinar``, when its content contains ``https://pastebin.com``, or when
    it has no ``<!-- ARTICOL START -->`` marker. For each remaining file the
    title, the date, the ``text_obisnuit`` / ``text_obisnuit2`` paragraphs
    (the latter in bold) and a "Source:" link are written to a PDF saved next
    to the HTML file, with the ``.html`` extension replaced by ``.pdf``.

    Errors raised while processing one file are printed and do not stop the
    run. A summary of the converted files is printed at the end.

    Args:
        directory_path: Root directory to scan.
        font_dir: Directory holding the Kanit ``.ttf`` files. Defaults to the
            location originally hard-coded in this script.
        base_url: Prefix prepended to the file name to build the source link;
            expected to end with ``/``.
    """
    if font_dir is None:
        font_dir = "e:/Carte/BB/17 - Site Leadership/alte/Ionel Balauta/Aryeht/Task 1 - Traduce tot site-ul/Doar Google Web/Andreea/Meditatii/Sedinta 20 august 2022/fonts"

    # Defined once per call instead of once per processed file.
    class PDF(FPDF, HTMLMixin):
        pass

    font_files = (
        ("", "Kanit-Regular.ttf"),
        ("B", "Kanit-Bold.ttf"),
        ("I", "Kanit-Italic.ttf"),
        ("BI", "Kanit-BoldItalic.ttf"),
    )
    # NOTE(review): no re.DOTALL here, so a paragraph whose markup spans
    # several lines is not matched -- confirm whether that is intended.
    paragraph_pattern = re.compile(r'<p class="text_obisnuit.*?">.*?</p>')

    def replace_symbols(text):
        # Apply every substitution from the module-level symbol table.
        for simbol, replacement in dict_simboluri.items():
            text = text.replace(simbol, replacement)
        return text

    modified_files = []
    file_count = 0
    for root, _dirs, files in os.walk(directory_path):
        for file_name in files:
            if not file_name.endswith(".html"):
                continue
            try:
                if "webinar" in file_name:
                    print(f"Fișierul {file_name} conține 'webinar' în numele său și va fi ignorat.")
                    continue
                file_path = os.path.join(root, file_name)
                file_content = read_text_from_file(file_path)
                if "https://pastebin.com" in file_content:
                    print(f"Fișierul {file_name} conține 'https://pastebin.com' în conținutul său și va fi ignorat.")
                    continue
                file_content = fix_html(file_content)
                if '<!-- ARTICOL START -->' not in file_content:
                    continue

                pdf = PDF()
                pdf.add_page()
                for style, font_file in font_files:
                    pdf.add_font("Kanit", style=style, fname=os.path.join(font_dir, font_file), uni=True)
                pdf.set_font("Kanit", size=24)

                # Title: bold, red.
                title_match = re.search('<h1 class="den_articol" itemprop="name">(.*?)</h1>', file_content)
                if title_match is None:
                    print("Nu am gasit --- denumire articol --- in fisierul --- {} ---.".format(file_path))
                else:
                    den_articol = replace_symbols(title_match.group(1))
                    pdf.set_text_color(204, 0, 0)
                    pdf.set_font('Kanit', size=14, style="B")
                    pdf.multi_cell(w=190, txt=den_articol, align='J')
                    pdf.ln()
                    pdf.set_font('Kanit', size=12)

                # Date: small, bold, blue, followed by vertical spacing.
                date_match = re.search('<td class="text_dreapta">(.*?), in <a', file_content)
                if date_match is None:
                    print("Nu am gasit --- date --- in fisierul --- {} ---.".format(file_path))
                else:
                    pdf.set_text_color(0, 102, 204)
                    pdf.set_font('Kanit', size=8, style="B")
                    pdf.cell(txt=date_match.group(1))
                    for _ in range(4):
                        pdf.ln()
                # Reset unconditionally so the body is black 12pt even when
                # the date is missing and the title colour is still active.
                pdf.set_text_color(0, 0, 0)
                pdf.set_font('Kanit', size=12)

                # Article body.
                article_match = re.search(r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->', file_content)
                if article_match is None:
                    print("Nu am gasit --- ARTICOL START/FINAL --- in fisierul --- {} ---.".format(file_path))
                else:
                    articol = article_match.group(1)
                    articol = articol.replace("&quot;", "\"")
                    articol = articol.replace("&rsquo;", "'")
                    pars = paragraph_pattern.findall(articol)
                    if not pars:
                        print("Nu am gasit -- paragrafe text_obisnuit -- in fisierul --- {} ---.".format(file_path))
                    for par in pars:
                        if '<p class="text_obisnuit">' in par:
                            css_class, style = "text_obisnuit", ""
                        elif '<p class="text_obisnuit2">' in par:
                            css_class, style = "text_obisnuit2", "B"
                        else:
                            # Other text_obisnuit* classes are not rendered.
                            continue
                        content = re.findall('<p class="{}">(.*?)</p>'.format(css_class), par)
                        if not content:
                            print("Nu am gasit text in paragraful {}, fisierul {}.".format(par, file_path))
                            continue
                        text = replace_symbols(content[0])
                        # Stripping all tags also removes <img>.
                        text = re.sub('<[^>]+>', '', text)
                        text = ' '.join(text.split())
                        pdf.set_font('Kanit', size=12, style=style)
                        pdf.multi_cell(w=190, txt=text, align='J')
                        pdf.ln()
                        pdf.set_font('Kanit', size=12)

                # Source link footer.
                pdf.ln()
                pdf.ln()
                pdf.set_font('Kanit', size=12, style="B")
                pdf.cell(txt="Source:")
                pdf.set_font('Kanit', size=12)
                pdf.set_text_color(0, 102, 204)
                source_url = f"{base_url}{file_name}"
                pdf.cell(w=40, txt=source_url, link=source_url)

                # splitext only drops the final extension; splitting on the
                # first '.' would truncate paths that contain other dots.
                den_fisier = os.path.splitext(file_path)[0] + '.pdf'
                pdf.output(den_fisier)
                modified_files.append(file_name)
                file_count += 1
                print(f"Fișierul {file_count}: {file_name} a fost modificat.")
            except Exception as e:
                # Per-file boundary: report the failure and go on to the next file.
                print(f"Eroare la procesarea fișierului {file_name}: {e}")
                continue

    print(f"\nAu fost modificate {file_count} fișiere:")
    for i, file_name in enumerate(modified_files, 1):
        print(f"{i}. {file_name}")
def save_to_docx(directory_path, base_url="https://neculaifantanaru.com/"):
    """Render every article ``.html`` file under *directory_path* to a DOCX.

    The tree is walked recursively. A file is skipped when its name contains
    ``webinar``, when its content contains ``https://pastebin.com``, or when
    it has no ``<!-- ARTICOL START -->`` marker. For each remaining file the
    title, the date, the ``text_obisnuit`` / ``text_obisnuit2`` paragraphs
    (the latter in bold) and a "Source:" line are written to a document saved
    next to the HTML file, with the ``.html`` extension replaced by ``.docx``.

    Errors raised while processing one file are printed and do not stop the
    run. A summary of the processed files is printed at the end.

    Args:
        directory_path: Root directory to scan.
        base_url: Prefix prepended to the file name to build the source URL;
            expected to end with ``/``.
    """
    # NOTE(review): no re.DOTALL here, so a paragraph whose markup spans
    # several lines is not matched -- confirm whether that is intended.
    paragraph_pattern = re.compile(r'<p class="text_obisnuit.*?">.*?</p>')

    def replace_symbols(text):
        # Apply every substitution from the module-level symbol table.
        for simbol, replacement in dict_simboluri.items():
            text = text.replace(simbol, replacement)
        return text

    modified_files = []
    file_count = 0
    for root, _dirs, files in os.walk(directory_path):
        for file_name in files:
            if not file_name.endswith(".html"):
                continue
            try:
                if "webinar" in file_name:
                    print(f"Fișierul {file_name} conține 'webinar' în numele său și va fi ignorat (DOCX).")
                    continue
                file_path = os.path.join(root, file_name)
                file_content = read_text_from_file(file_path)
                if "https://pastebin.com" in file_content:
                    print(f"Fișierul {file_name} conține 'https://pastebin.com' în conținutul său și va fi ignorat (DOCX).")
                    continue
                file_content = fix_html(file_content)
                if '<!-- ARTICOL START -->' not in file_content:
                    continue

                doc = Document()

                # Title: bold, red, 14pt. An empty title paragraph is still
                # added when the heading is missing.
                title_match = re.search('<h1 class="den_articol" itemprop="name">(.*?)</h1>', file_content)
                if title_match is None:
                    print("Nu am gasit --- denumire articol --- in fisierul --- {} --- (DOCX).".format(file_path))
                    title_text = ""
                else:
                    title_text = replace_symbols(title_match.group(1))
                    title_text = re.sub('<[^>]+>', '', title_text)
                title_para = doc.add_paragraph()
                title_para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                title_run = title_para.add_run(title_text)
                title_run.bold = True
                title_run.font.size = Pt(14)
                title_run.font.color.rgb = RGBColor(204, 0, 0)

                # Date: bold, blue, 8pt.
                date_match = re.search('<td class="text_dreapta">(.*?), in <a', file_content)
                if date_match is None:
                    print("Nu am gasit --- date --- in fisierul --- {} --- (DOCX).".format(file_path))
                else:
                    date_para = doc.add_paragraph()
                    date_para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                    date_run = date_para.add_run(date_match.group(1))
                    date_run.bold = True
                    date_run.font.size = Pt(8)
                    date_run.font.color.rgb = RGBColor(0, 102, 204)
                doc.add_paragraph()  # spacing

                # Article body.
                article_match = re.search(r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->', file_content)
                if article_match is None:
                    print("Nu am gasit --- ARTICOL START/FINAL --- in fisierul --- {} --- (DOCX).".format(file_path))
                else:
                    articol = article_match.group(1)
                    articol = articol.replace("&quot;", "\"")
                    articol = articol.replace("&rsquo;", "'")
                    pars = paragraph_pattern.findall(articol)
                    if not pars:
                        print("Nu am gasit -- paragrafe text_obisnuit -- in fisierul --- {} --- (DOCX).".format(file_path))
                    for par in pars:
                        if '<p class="text_obisnuit">' in par:
                            css_class, bold = "text_obisnuit", False
                        elif '<p class="text_obisnuit2">' in par:
                            css_class, bold = "text_obisnuit2", True
                        else:
                            # Other text_obisnuit* classes are not rendered.
                            continue
                        content = re.findall('<p class="{}">(.*?)</p>'.format(css_class), par)
                        if content:
                            html_to_docx_paragraph(doc, replace_symbols(content[0]), bold_paragraph=bold)

                # Source line.
                doc.add_paragraph()
                source_para = doc.add_paragraph()
                source_para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                bold_run = source_para.add_run("Source: ")
                bold_run.bold = True
                bold_run.font.size = Pt(12)
                url_run = source_para.add_run(f"{base_url}{file_name}")
                url_run.font.size = Pt(12)
                url_run.font.color.rgb = RGBColor(0, 102, 204)

                # splitext only drops the final extension; splitting on the
                # first '.' would truncate paths that contain other dots.
                den_fisier_docx = os.path.splitext(file_path)[0] + '.docx'
                doc.save(den_fisier_docx)
                modified_files.append(file_name)
                file_count += 1
                print(f"Fișierul DOCX {file_count}: {file_name} a fost procesat.")
            except Exception as e:
                # Per-file boundary: report the failure and go on to the next file.
                print(f"Eroare la procesarea DOCX fișierului {file_name}: {e}")
                continue

    print(f"\nAu fost procesate {file_count} fișiere DOCX:")
    for i, file_name in enumerate(modified_files, 1):
        print(f"{i}. {file_name}")
def merge_pdf_files(directory_path):
    """Merge every ``.pdf`` under *directory_path* into ``articles.pdf``.

    The tree is walked recursively and, within each directory, files are
    appended in sorted name order. The merged file is written to
    ``<directory_path>/articles.pdf``. After a successful write, every source
    PDF that was merged is deleted.

    An ``articles.pdf`` already present at the output location is not used as
    an input, so the merger never reads the file it is about to overwrite.

    Args:
        directory_path: Root directory to scan; also where the merged file is
            written.
    """
    # Build the output path from the argument, not from the os.walk loop
    # variable: the latter points at the last directory visited and is
    # undefined when the walk yields nothing.
    output_path = os.path.join(directory_path, "articles.pdf")
    output_key = os.path.normcase(os.path.abspath(output_path))

    pdf_files = []
    merger = PdfMerger()
    try:
        for root, _dirs, files in os.walk(directory_path):
            for file_name in sorted(files):
                if not file_name.endswith(".pdf"):
                    continue
                file_path = os.path.join(root, file_name)
                if os.path.normcase(os.path.abspath(file_path)) == output_key:
                    continue
                print("PDF: ", file_name)
                pdf_files.append(file_path)
                merger.append(file_path)
        merger.write(output_path)
    finally:
        # Close the merger even if appending or writing fails.
        merger.close()

    # Reached only after a successful write, so sources are never deleted
    # without a merged file existing.
    for pdf_file in pdf_files:
        os.remove(pdf_file)
        print(f"Fișierul {pdf_file} a fost șters.")
def merge_docx_files(directory_path):
    """Merge every ``.docx`` under *directory_path* into ``articles.docx``.

    The tree is walked recursively and, within each directory, files are
    processed in sorted name order; files named ``articles.docx`` are skipped.
    The body elements of each source document are deep-copied into a new
    master document, with a page break inserted between consecutive
    documents. The result is saved as ``<directory_path>/articles.docx`` and
    the merged source files are then deleted.

    Only body XML is copied; parts referenced through relationships (such as
    images) are not carried over.

    Args:
        directory_path: Root directory to scan; also where the merged file is
            written.
    """
    master = Document()
    body = master.element.body
    sect_pr_tag = qn('w:sectPr')

    # Clear the template's content but keep its section properties: a body
    # should end with a single w:sectPr, so that element must stay last.
    for elem in list(body):
        if elem.tag != sect_pr_tag:
            body.remove(elem)
    master_sect_pr = body.find(sect_pr_tag)

    def append_to_body(element):
        # Insert ahead of the trailing w:sectPr when the master has one.
        if master_sect_pr is None:
            body.append(element)
        else:
            master_sect_pr.addprevious(element)

    docx_files = []
    first = True
    for root, _dirs, files in os.walk(directory_path):
        for file_name in sorted(files):
            if not file_name.endswith(".docx") or file_name == "articles.docx":
                continue
            print("DOCX: ", file_name)
            file_path = os.path.join(root, file_name)
            docx_files.append(file_path)
            doc = Document(file_path)
            if not first:
                # Add page break between articles
                p = OxmlElement('w:p')
                r = OxmlElement('w:r')
                br = OxmlElement('w:br')
                br.set(qn('w:type'), 'page')
                r.append(br)
                p.append(r)
                append_to_body(p)
            first = False
            for element in doc.element.body:
                # Skip each source's own body-level section properties;
                # copying them would leave several w:sectPr elements
                # scattered through the merged body.
                if element.tag == sect_pr_tag:
                    continue
                append_to_body(copy.deepcopy(element))

    output_path = os.path.join(directory_path, "articles.docx")
    master.save(output_path)
    print(f"\nFișierul articles.docx a fost salvat la: {output_path}")
    for docx_file in docx_files:
        if docx_file != output_path:
            os.remove(docx_file)
            print(f"Fișierul {docx_file} a fost șters.")
# Script entry: run the four pipeline steps, in order, on one directory.
# PDFs are generated and merged first, then the DOCX files; each merge step
# deletes the per-article files it has combined.
TARGET_DIRECTORY = "c:\\Folder9\\"
for pipeline_step in (save_to_pdf, merge_pdf_files, save_to_docx, merge_docx_files):
    pipeline_step(TARGET_DIRECTORY)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment