Skip to content

Instantly share code, notes, and snippets.

@me-suzy
Created June 2, 2026 20:31
Show Gist options
  • Select an option

  • Save me-suzy/843f8105ffe954e9f1b1c45ade13d0fd to your computer and use it in GitHub Desktop.

Select an option

Save me-suzy/843f8105ffe954e9f1b1c45ade13d0fd to your computer and use it in GitHub Desktop.
merge_leader_articles.py
from __future__ import annotations
import copy
import re
import unicodedata
from dataclasses import dataclass
from difflib import SequenceMatcher
from pathlib import Path
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.text.paragraph import Paragraph
from lxml import html
# Source manuscript; it is only read, the merged result goes to OUT_DOCX.
DOCX_PATH = Path(
    r"e:\Carte\BB\++++carti scrise de bebe\CELE 63 de calitati ale liderului\pentru tiparire.docx"
)

# Folder that holds the published HTML pages listed in HTML_FILES.
HTML_ROOT = Path(r"e:\Carte\BB\17 - Site Leadership\Principal\ro")

# Both outputs are written next to the source manuscript.
OUT_DOCX = DOCX_PATH.parent / "pentru tiparire - actualizat cu articole web.docx"
REPORT_PATH = DOCX_PATH.parent / "raport-inlocuiri-web.txt"
# Web articles to merge into the book, as file names relative to HTML_ROOT.
# The order matters: articles are matched against the book in this order.
HTML_FILES = [
    "calitatile-unui-lider-inspiratia.html",
    "calitatile-unui-lider-responsabilitatea.html",
    "calitatile-unui-lider-credinta.html",
    "calitatile-unui-lider-dorinta-de-autodepasire.html",
    "calitatile-unui-lider-increderea.html",
    "calitatile-unui-lider-perseverenta.html",
    "calitatile-unui-lider-spontaneitatea.html",
    "calitatile-unui-lider-vointa-ferma-de-a-invinge.html",
    "calitatile-unui-lider-rezonanta.html",
    "calitatile-unui-lider-sensibilitatea-sufleteasca.html",
    "calitatile-unui-lider-receptivitatea.html",
    "calitatile-unui-lider-puterea-de-patrundere-psihologica.html",
    "calitatile-unui-lider-puterea-de-persuasiune.html",
    "calitatile-unui-lider-maretia-sufleteasca.html",
    "calitatile-unui-lider-puterea-de-patrundere-a-eu-lui-individual.html",
    "calitatile-unui-lider-luciditatea-si-profunzimea-judecatii.html",
    "calitatile-unui-lider-maretia-spirituala.html",
    "calitatile-unui-lider-integritatea-launtrica.html",
    "calitatile-unui-lider-intuitia-si-viziunea-patrunzatoare.html",
    "calitatile-unui-lider-forta-emotionala.html",
    "calitatile-unui-lider-devotamentul-absolut.html",
    "calitatile-unui-lider-carisma.html",
    "calitatile-unui-lider-creativitatea.html",
]
@dataclass
class HtmlPara:
    """A single paragraph taken from a web article."""

    # "lead" when the paragraph carries the text_obisnuit2 CSS class, else "body".
    kind: str
    # Paragraph text with whitespace runs collapsed.
    text: str
@dataclass
class HtmlArticle:
    """A web article parsed from one HTML file."""

    # Path of the HTML file the article was read from.
    file: Path
    # Text of the first <h1> inside the article body.
    title: str
    # Text of the first kept paragraph, or "" when there is none.
    subtitle: str
    # Kept paragraphs in document order (the subtitle paragraph included).
    paras: list[HtmlPara]
@dataclass
class DocArticle:
    """Location and heading of one numbered article inside the DOCX."""

    # Number read from the "- N -" marker paragraph.
    number: int
    # Index of the marker paragraph in doc.paragraphs.
    marker_idx: int
    # Index of the next marker paragraph, or the paragraph count for the last article.
    end_idx: int
    # First non-empty paragraph found shortly after the marker ("" if none).
    title: str
    # Second non-empty paragraph found shortly after the marker ("" if none).
    subtitle: str
def romanian_key(text: str) -> str:
    """Return a diacritic-free, lowercase ASCII key for comparing titles.

    The text is trimmed and lowercased, decomposed with NFD, stripped of all
    combining marks, and every run of characters outside ``a-z0-9`` is
    collapsed to a single space.

    Args:
        text: Title as it appears in the DOCX or in the HTML page.

    Returns:
        The normalized key; ``""`` when nothing alphanumeric remains.
    """
    decomposed = unicodedata.normalize("NFD", text.strip().lower())
    # Dropping combining marks (category Mn) folds every spelling variant onto
    # its base letter (comma-below and cedilla s/t, circumflex and
    # inverted-breve i, ...), so no per-character replacement table is needed.
    base_letters = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    # The substitution already leaves single spaces between words; only the
    # ends still need trimming.
    return re.sub(r"[^a-z0-9]+", " ", base_letters).strip()
def clean_text(text: str) -> str:
    """Collapse each whitespace run in *text* to one space and trim both ends."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip()
def parse_html_article(path: Path) -> HtmlArticle:
    """Read one HTML page and extract its title and text paragraphs.

    Only ``<p>`` elements whose class contains ``text_obisnuit`` inside the
    ``itemprop="articleBody"`` container are considered. Collection stops at
    the first paragraph starting with "ultimele articole".

    Raises:
        RuntimeError: If the page has no ``articleBody`` container.
    """
    document = html.fromstring(path.read_bytes())
    containers = document.xpath('//div[@itemprop="articleBody"]')
    if not containers:
        raise RuntimeError(f"Nu gasesc articleBody in {path}")
    article_body = containers[0]

    heading = clean_text(article_body.xpath("string(.//h1[1])"))

    collected: list[HtmlPara] = []
    for node in article_body.xpath('.//p[contains(@class, "text_obisnuit")]'):
        content = clean_text("".join(node.itertext()))
        if content.lower().startswith("ultimele articole"):
            # Everything from this paragraph on is skipped.
            break
        if content:
            css_classes = node.get("class") or ""
            para_kind = "lead" if "text_obisnuit2" in css_classes else "body"
            collected.append(HtmlPara(kind=para_kind, text=content))

    first_text = collected[0].text if collected else ""
    return HtmlArticle(file=path, title=heading, subtitle=first_text, paras=collected)
def find_doc_articles(doc: Document) -> list[DocArticle]:
    """Locate every numbered article in the document.

    An article starts at a paragraph whose cleaned text looks like ``- 12 -``
    and runs up to the next such marker (or the end of the document). Its
    title and subtitle are the first two non-empty paragraphs found among the
    nine paragraphs that follow the marker.

    Args:
        doc: Opened python-docx document.

    Returns:
        One ``DocArticle`` per marker, in document order.
    """
    marker_re = re.compile(r"^-\s*(\d+)\s*-$")

    # ``doc.paragraphs`` builds a fresh list of proxies on every access, so
    # take one snapshot and clean each paragraph's text exactly once.
    texts = [clean_text(par.text) for par in doc.paragraphs]

    starts: list[tuple[int, int]] = []
    for idx, text in enumerate(texts):
        found = marker_re.match(text)
        if found:
            starts.append((int(found.group(1)), idx))

    articles: list[DocArticle] = []
    for pos, (number, idx) in enumerate(starts):
        end = starts[pos + 1][1] if pos + 1 < len(starts) else len(texts)
        # Look only a few paragraphs past the marker for the heading lines.
        window = range(idx + 1, min(end, idx + 10))
        heading = [texts[j] for j in window if texts[j]][:2]
        title = heading[0] if heading else ""
        subtitle = heading[1] if len(heading) > 1 else ""
        articles.append(
            DocArticle(
                number=number,
                marker_idx=idx,
                end_idx=end,
                title=title,
                subtitle=subtitle,
            )
        )
    return articles
def para_has_text(par: Paragraph) -> bool:
    """Tell whether *par* holds anything besides whitespace."""
    return clean_text(par.text) != ""
def delete_paragraph(par: Paragraph) -> None:
    """Detach the XML element behind *par* from its parent element."""
    element = par._element
    parent = element.getparent()
    parent.remove(element)
def text_template(par: Paragraph):
    """Snapshot a paragraph so its formatting can be reused for new text.

    Returns:
        A ``(paragraph_element_copy, run_properties_copy)`` pair. The second
        item is ``None`` when the paragraph has no direct ``w:r`` child or
        that first run has no ``w:rPr``.
    """
    source_el = par._element
    leading_run = source_el.find(qn("w:r"))
    props_el = None if leading_run is None else leading_run.find(qn("w:rPr"))
    run_props = copy.deepcopy(props_el) if props_el is not None else None
    return copy.deepcopy(source_el), run_props
def set_p_text(p_el, text: str, r_pr=None):
    """Replace the runs of paragraph element *p_el* with one run holding *text*.

    Direct ``w:r`` and ``w:hyperlink`` children are removed; all other
    children (for example ``w:pPr``) are left in place. When *r_pr* is given,
    a copy of it becomes the new run's properties.
    """
    removable_tags = (qn("w:r"), qn("w:hyperlink"))
    stale_children = [node for node in p_el if node.tag in removable_tags]
    for node in stale_children:
        p_el.remove(node)

    new_run = OxmlElement("w:r")
    if r_pr is not None:
        new_run.append(copy.deepcopy(r_pr))

    text_el = OxmlElement("w:t")
    text_el.text = text
    # Mark the text as space-preserving when it has a leading or trailing space.
    if text != text.strip(" "):
        text_el.set(qn("xml:space"), "preserve")

    new_run.append(text_el)
    p_el.append(new_run)
def insert_after(prev_el, template_el, text: str, r_pr=None):
    """Insert a copy of *template_el* carrying *text* right after *prev_el*.

    Returns:
        The newly inserted element, so calls can be chained.
    """
    clone = copy.deepcopy(template_el)
    set_p_text(clone, text, r_pr)
    prev_el.addnext(clone)
    return clone
def article_templates(doc: Document, article: DocArticle):
    """Collect formatting templates from an article's existing paragraphs.

    The first non-empty paragraph after the marker is the title template, the
    second the subtitle template, the third the body template. The first
    later paragraph containing a bold run is the "bold" template, and the
    first empty paragraph is the "blank" template. Missing roles fall back to
    the body template, which itself falls back to the subtitle, the title and
    finally the marker paragraph.

    Returns:
        Dict with keys ``"title"``, ``"subtitle"``, ``"body"``, ``"bold"`` and
        ``"blank"``; each value is the pair produced by ``text_template``.
    """
    # ``doc.paragraphs`` builds a new list on every access; snapshot it once
    # instead of re-creating it for each index lookup.
    paragraphs = doc.paragraphs
    section = paragraphs[article.marker_idx + 1 : article.end_idx]

    nonempty = []
    blank_par = None
    for par in section:
        if para_has_text(par):
            nonempty.append(par)
        elif blank_par is None:
            blank_par = par

    title_par = nonempty[0] if nonempty else None
    subtitle_par = nonempty[1] if len(nonempty) > 1 else None
    remaining = nonempty[2:]
    body_par = remaining[0] if remaining else None
    bold_body_par = next(
        (par for par in remaining if any(run.bold for run in par.runs)),
        None,
    )

    body_par = body_par or subtitle_par or title_par or paragraphs[article.marker_idx]
    bold_body_par = bold_body_par or body_par
    blank_par = blank_par or body_par

    return {
        "title": text_template(title_par or body_par),
        "subtitle": text_template(subtitle_par or body_par),
        "body": text_template(body_par),
        "bold": text_template(bold_body_par),
        "blank": text_template(blank_par),
    }
def best_doc_match(
    html_article: HtmlArticle,
    doc_articles: list[DocArticle],
    used_numbers: set[int],
    min_score: float = 0.62,
):
    """Pick the book article whose title best matches a web article.

    Args:
        html_article: Web article to place.
        doc_articles: All articles found in the book.
        used_numbers: Article numbers already claimed; they are never returned.
        min_score: Lowest ``SequenceMatcher`` ratio accepted as a fuzzy match.

    Returns:
        ``(article, score, mode)`` where *mode* is ``"exact"``, ``"fuzzy"`` or
        ``"unmatched"``; *article* is ``None`` when unmatched.
    """
    hkey = romanian_key(html_article.title)
    if not hkey:
        # An empty key would "exactly" match any book article whose title is
        # also empty, so a title-less page is never matched.
        return None, 0.0, "unmatched"

    by_key = {romanian_key(a.title): a for a in doc_articles}
    exact = by_key.get(hkey)
    if exact is not None and exact.number not in used_numbers:
        return exact, 1.0, "exact"

    candidates = [
        (SequenceMatcher(None, hkey, romanian_key(candidate.title)).ratio(), candidate)
        for candidate in doc_articles
        if candidate.number not in used_numbers
    ]
    if not candidates:
        # Every book article is taken (or none was found): max() on an empty
        # list would raise ValueError.
        return None, 0.0, "unmatched"

    score, article = max(candidates, key=lambda pair: pair[0])
    if score >= min_score:
        return article, score, "fuzzy"
    return None, score, "unmatched"
def replace_article(doc: Document, article: DocArticle, html_article: HtmlArticle):
    """Swap the body of one book article for the text of a web article.

    Every paragraph between the article's marker and the next marker is
    removed. The web title, first paragraph (as subtitle), a blank line and
    the remaining paragraphs are then inserted after the marker, each using
    formatting captured from the removed paragraphs.

    Args:
        doc: Document being edited in place.
        article: Book article to overwrite; its indices must still be valid.
        html_article: Replacement content.
    """
    # Capture formatting before any paragraph is removed.
    templates = article_templates(doc, article)

    # ``doc.paragraphs`` builds a new list on every access; snapshot it once
    # instead of rebuilding it for each paragraph to delete.
    paragraphs = doc.paragraphs
    marker_el = paragraphs[article.marker_idx]._element
    # NOTE(review): only body-level paragraphs are removed; anything else that
    # sits between the markers (e.g. tables) is presumably left in place.
    for par in paragraphs[article.marker_idx + 1 : article.end_idx]:
        delete_paragraph(par)

    title_el, title_rpr = templates["title"]
    subtitle_el, subtitle_rpr = templates["subtitle"]
    body_el, body_rpr = templates["body"]
    bold_el, bold_rpr = templates["bold"]
    blank_el, blank_rpr = templates["blank"]

    prev = insert_after(marker_el, title_el, html_article.title, title_rpr)
    if not html_article.paras:
        return

    prev = insert_after(prev, subtitle_el, html_article.paras[0].text, subtitle_rpr)
    prev = insert_after(prev, blank_el, "", blank_rpr)
    for hp in html_article.paras[1:]:
        if hp.kind == "lead":
            prev = insert_after(prev, bold_el, hp.text, bold_rpr)
        else:
            prev = insert_after(prev, body_el, hp.text, body_rpr)
def append_unmatched(doc: Document, unmatched: list[HtmlArticle]):
    """Append web articles that found no place in the book to the document end.

    Nothing is added when *unmatched* is empty. Otherwise a heading page is
    added, followed by each article on its own page; lead paragraphs and
    titles are set in bold.
    """
    if not unmatched:
        return

    def add_normal(text, bold=False, size=None):
        # Adds a "Normal"-styled paragraph; formatting goes on its first run.
        par = doc.add_paragraph(text)
        par.style = doc.styles["Normal"]
        if bold and par.runs:
            par.runs[0].bold = True
            if size is not None:
                par.runs[0].font.size = docx_pt(size)

    doc.add_page_break()
    add_normal("Articole de pe site fără corespondent sigur în carte", bold=True, size=18)

    for position, web_article in enumerate(unmatched, 1):
        doc.add_page_break()
        add_normal(f"Web-{position}. {web_article.title}", bold=True)
        for item in web_article.paras:
            add_normal(item.text, bold=item.kind == "lead")
def docx_pt(value: int):
    """Wrap *value* in python-docx's ``Pt`` length type."""
    # Imported here rather than at module level, as in the original.
    from docx import shared

    return shared.Pt(value)
def _match_articles(html_articles, doc_articles):
    """Pair each web article with a book article, exact title matches first."""
    used_numbers: set[int] = set()
    results = [None] * len(html_articles)
    deferred = []

    # Pass 1: claim exact title matches, so that an earlier page's fuzzy match
    # cannot take the book article a later page matches exactly.
    for pos, html_article in enumerate(html_articles):
        doc_article, score, mode = best_doc_match(html_article, doc_articles, used_numbers)
        if mode == "exact":
            used_numbers.add(doc_article.number)
            results[pos] = (html_article, doc_article, score, mode)
        else:
            deferred.append(pos)

    # Pass 2: fuzzy-match the rest, in their original order, among what is left.
    for pos in deferred:
        html_article = html_articles[pos]
        doc_article, score, mode = best_doc_match(html_article, doc_articles, used_numbers)
        if doc_article is not None:
            used_numbers.add(doc_article.number)
        results[pos] = (html_article, doc_article, score, mode)

    return results


def _report_lines(matches, unmatched, html_count, doc_count):
    """Build the lines of the plain-text replacement report."""
    lines = [
        "Raport inlocuire articole web -> DOCX",
        f"Document original: {DOCX_PATH}",
        f"Document rezultat: {OUT_DOCX}",
        f"HTML-uri procesate: {html_count}",
        f"Articole in carte: {doc_count}",
        "",
        "INLOCUIRI",
    ]
    for html_article, doc_article, score, mode in matches:
        if doc_article is None:
            continue
        marker = "exact" if mode == "exact" else f"fuzzy {score:.2f}"
        lines.append(
            f"- {doc_article.number:02d}. {doc_article.title} <= {html_article.title} [{marker}] ({html_article.file.name})"
        )
    lines.append("")
    lines.append("HTML FARA CORESPONDENT SIGUR")
    if unmatched:
        lines.extend(f"- {art.title} ({art.file.name})" for art in unmatched)
    else:
        lines.append("- niciunul")
    lines.extend(
        [
            "",
            "NOTE",
            "- Originalul nu a fost modificat.",
            "- Cuprinsul static din document a fost pastrat; dupa finalizare, page numbers pot necesita actualizare in Word.",
            "- Articolele HTML nepotrivite sigur sunt anexate la final, daca exista.",
        ]
    )
    return lines


def main():
    """Merge the web articles into a copy of the book and write a report.

    Reads ``DOCX_PATH`` and the pages in ``HTML_FILES``, replaces every book
    article that has a matching web article, appends the unmatched web
    articles, saves the result to ``OUT_DOCX`` and writes ``REPORT_PATH``.

    Raises:
        FileNotFoundError: If the manuscript or any listed HTML file is missing.
    """
    if not DOCX_PATH.is_file():
        raise FileNotFoundError(DOCX_PATH)
    missing = [name for name in HTML_FILES if not (HTML_ROOT / name).is_file()]
    if missing:
        raise FileNotFoundError("HTML lipsa: " + ", ".join(missing))

    doc = Document(DOCX_PATH)
    doc_articles = find_doc_articles(doc)
    html_articles = [parse_html_article(HTML_ROOT / name) for name in HTML_FILES]

    matches = _match_articles(html_articles, doc_articles)
    unmatched = [item[0] for item in matches if item[1] is None]

    # Replace from the end so paragraph indices from the original analysis stay valid.
    matched = [item for item in matches if item[1] is not None]
    matched.sort(key=lambda item: item[1].marker_idx, reverse=True)
    for html_article, doc_article, _score, _mode in matched:
        replace_article(doc, doc_article, html_article)

    append_unmatched(doc, unmatched)
    doc.save(OUT_DOCX)

    lines = _report_lines(matches, unmatched, len(html_articles), len(doc_articles))
    REPORT_PATH.write_text("\n".join(lines) + "\n", encoding="utf-8")
    print(OUT_DOCX)
    print(REPORT_PATH)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment