me-suzy · March 5, 2026 07:23
diff --git a/Parsing WEBSITE - FINAL - 3 martie 2026 RO.py b/Parsing WEBSITE - FINAL - 3 martie 2026 RO.py
 import os
 import re
 import sys

 # Make sure diacritics are printed to the console without errors (Windows).
 # Best effort: stdout is switched to UTF-8 with errors="replace"; if the call
 # fails for any reason, stdout is left exactly as it was.
 try:
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
 except Exception:
    # If reconfigure is not available, ignore the error
    pass

 def read_text_from_file(file_path):
    """
    Return the whole content of a file as text.

    The file is opened in text mode and decoded as UTF-8; byte sequences
    that cannot be decoded are silently dropped (errors='ignore').

    file_path: path of the file to read from
    """
    with open(file_path, mode='r', errors='ignore', encoding='utf-8') as source:
        return source.read()


 def write_to_file(text, file_path):
    """
    Write a text to a file as UTF-8 encoded bytes.

    The file is opened in binary write mode, so the text is written
    exactly as encoded (no newline translation). Characters that cannot
    be encoded are dropped ('ignore' error handler).

    text: the text to write
    file_path: path of the file to write into
    """
    with open(file_path, mode='wb') as target:
        encoded = text.encode('utf8', 'ignore')
        target.write(encoded)


 def copiaza_continut_html(cale_fisier_html, cale_fisiere_gata): # astea sunt argumentele functiei, adica cand apelez functia
    # citesti textul din fisierul html
    text_html = read_text_from_file(cale_fisier_html)
    final_text = ''


    # Adăugăm logică suplimentară pentru a elimina spațiile nedorite înainte de "
    # final_text = re.sub(r'^\s+"', '"', final_text, flags=re.MULTILINE)

    # === fisier html vechi articol_categorie ===
    articol_categorie_pattern = re.compile(r'<!-- ARTICOL CATEGORIE START -->([\s\S]*?)<!-- ARTICOL CATEGORIE FINAL -->')
    articol_categorie = re.findall(articol_categorie_pattern, text_html)
    if len(articol_categorie) != 0:
        # === citire fisier model - index2.html ===
        text_html_model = read_text_from_file("e:\\Carte\\BB\\17 - Site Leadership\\alte\\Ionel Balauta\\Aryeht\\Task 1 - Traduce tot site-ul\\Doar Google Web\\Andreea\\Meditatii\\Sedinta 18\\index2.html")
        articol_categorie = articol_categorie[0]
        # Verifică dacă există <td width="\d+"><span class="den_articol"> in fisierul vechi
        width_pattern = re.compile(r'<td width="\d+"><span class="den_articol">')
        if re.search(width_pattern, articol_categorie):
            print(f"ATENTIE !!!!!!!!!!!!!!!  în fișierul: {cale_fisier_html}")
            print("Problema: există cel puțin un tag de formă <td width=\"...\"><span class=\"den_articol\">.")
            print("Soluție: lasă scriptul să înlocuiască automat cu <td><span class=\"den_articol\"> sau corectează manual în fișierul HTML după acest model.")
            # FIND: <td\s+width="\d+">(<span class="den_articol"><a href=".*?">.*?</a></span>)</td>
            # REPLACE BY: <td>\1</td>
            # si
            # FIND:  <td><p class="den_articol">
            # REPLACE BY:  <td><span class="den_articol">
            # Inlocuieste cazurile de genul <td width="123"><span class="den_articol"> cu <td><span class="den_articol">
            articol_categorie = re.sub(r'<td width="\d+"><span class="den_articol">', '<td><span class="den_articol">', articol_categorie)

        # Adăugăm noul FIND and REPLACE simplu, fără regex
        articol_categorie = articol_categorie.replace('<td><p class="den_articol">', '<td><span class="den_articol">')

        # ==== INLOCUIRE <td><span class="den_articol"> CU <td><span class="linkMare"> ==== in articol_categorie VECHI
        span_pattern = re.compile(r'<td><span class="den_articol"><a href=\"(.*?)\" class="linkMare">(.*?)</a></span></td>')
        span_nou = '<td><span class="linkMare"><a href="{}" class="linkMare"><span class="den_articol">{}</span></a></span></td>'
        span = re.findall(span_pattern, articol_categorie)
        lista_span_nou = list()
        for i in range(len(span)):
            lista_span_nou.append(span_nou.format(span[i][0], span[i][1]))
        span_pattern = re.compile(r'<td><span class="den_articol"><a href=\".*?\" class="linkMare">.*?</a></span></td>')
        span = re.findall(span_pattern, articol_categorie)
        for i in range(len(span)):
            articol_categorie = articol_categorie.replace(span[i], lista_span_nou[i])
        # ==== INLOCUIRE <td><span class="den_articol"> CU <td><span class="linkMare"> ====



        # ==== Informatii fisier original ====
        categ_link_title_pattern = re.compile(r'<td><span class="linkMare"><a href="(.*?)" class="linkMare"><span class="den_articol">(.*?)</span></a></span></td>')
        categ_link_title = re.findall(categ_link_title_pattern, articol_categorie)
        print("Total {} ARTICOLE".format(len(categ_link_title)))
        categ_date_link_title_desc_pattern = re.compile(r'<td class="text_dreapta">(.*?)<a href=\"(.*?)\" title=\"(.*?)\" class="external" rel="category tag">(.*?)</a>, by Neculai Fantanaru</td>')
        categ_date_link_title_desc = re.findall(categ_date_link_title_desc_pattern, articol_categorie)
        paragraf_pattern = re.compile(r'<p class="text_obisnuit2"><em>(.*?)</em></p>')
        paragraf = re.findall(paragraf_pattern, articol_categorie)

        print("PARAGRAF", len(paragraf))

        # === citeste mai departe - buton ===
        # Acceptă atât 'citeşte', cât și 'citește', plus varianta cu entitate HTML &#351;
        citeste_buton_pattern = re.compile(
            r'<div align="right" id="external2"><a href=\"(.*?)\">cite(?:ș|ş|&#351;)te mai departe </a>'
        )
        citeste_buton = re.findall(citeste_buton_pattern, articol_categorie)
        read_more_buton_pattern = re.compile(r'<div align="right" id="external2"><a href=\"(.*?)\">read more </a>') # in fisierul categorie nou din c:\Folder1\fisiere_gata\
        read_more_buton = re.findall(read_more_buton_pattern, articol_categorie)

        # === Informatii index2 ===
        articol_categorie_index2_pattern = re.compile(r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->') # c:\Folder1\index2.html
        articol_categorie_index2 = re.findall(articol_categorie_index2_pattern, text_html_model)
        if len(articol_categorie_index2) != 0:
            articol_categorie_index2 = articol_categorie_index2[0]  # trebuie inlocuit cu toate categoriile din fisierul original
            # citire template pentru categorie din index2.html
            template_categorie = read_text_from_file("C:\\Folder1\\template_categorie.txt")

            # h3 => title + description
            h3_pattern = re.compile(r'<h3 class="font-weight-normal" itemprop="name"><a href=\"(.*?)\" class="color-black">(.*?)</a></h3>')
            h3 = re.findall(h3_pattern, template_categorie)
            h3 = h3[0]
            # dates section din index2.html
            dates_section_index2_pattern = re.compile(r'<!--STARTDATES-->([\s\S]*?)<!--FINNISHDATES-->') # c:\Folder1\index2.html
            dates_section_index2 = re.findall(dates_section_index2_pattern, template_categorie)
            dates_section_index2 = dates_section_index2[0]
            # date_index2_pattern = re.compile('<a href="javascript:void\(0\)" class="color-black">(.*?)</a>') # c:\Folder1\index2.html
            date_index2_pattern = re.compile(r'<a href="javascript:void\(0\)" class="color-black">(.*?)</a>') # c:\Folder1\index2.html

            # date
            date_index2 = re.findall(date_index2_pattern, dates_section_index2)
            date_index2 = date_index2[0]
            # link / title / description
            link_title_desc_index2_pattern = re.compile(r'<a href=\"(.*?)\" title=\"(.*?)\" class="color-green font-weight-600 mx-1" id="hidden">(.*?)</a>')
            link_title_desc_index2 = re.findall(link_title_desc_index2_pattern, dates_section_index2)
            link_title_desc_index2 = link_title_desc_index2[0]

            # paragraf
            paragraf_index2_pattern = re.compile(r'<p class="mb-35px color-grey line-height-25px">(.*?)</p>')
            paragraf_index2 = re.findall(paragraf_index2_pattern, template_categorie)
            paragraf_index2 = paragraf_index2[0]

            # === read more ===
            read_more_pattern = re.compile(r'<a href=\"(.*?)\" class="btn-setting color-black btn-hvr-up btn-blue btn-hvr-pink">read more</a>')
            read_more = re.findall(read_more_pattern, template_categorie)
            read_more = read_more[0]

            butoane = list()
            if len(citeste_buton) > 0:
                butoane = citeste_buton
            else:
                butoane = read_more_buton
            print("CATEGORIE", len(categ_link_title))
            # Verificare rapidă dacă numărul de butoane corespunde numărului de articole
            if len(butoane) != len(categ_link_title):
                print(f"[AVERTISMENT] În fișierul {cale_fisier_html} numărul de butoane 'citește mai departe / read more' ({len(butoane)}) "
                      f"nu corespunde cu numărul de articole ({len(categ_link_title)}).")
            for i in range(len(categ_link_title)):
                # Dacă nu există buton pentru articolul curent, afișăm informații de depanare detaliate
                if i >= len(butoane):
                    href_problema = categ_link_title[i][0]
                    title_problema = categ_link_title[i][1]
                    # Căutăm linia din fișierul HTML original unde apare articolul cu problema
                    linii_html = text_html.splitlines()
                    linie_gasita = None
                    continut_linie = ""
                    for idx, linie in enumerate(linii_html, start=1):
                        if href_problema in linie or title_problema in linie:
                            linie_gasita = idx
                            continut_linie = linie.strip()
                            break
                    print("=== EROARE BUTON CATEGORIE ===")
                    print(f"Fișier: {cale_fisier_html}")
                    print(f"Articol index (0-based): {i}")
                    print(f"Href articol: {href_problema}")
                    print(f"Titlu articol: {title_problema}")
                    if linie_gasita is not None:
                        print(f"Linia aproximativă în fișierul HTML original: {linie_gasita}")
                        print(f"Conținut linie: {continut_linie}")
                    # Încercăm să explicăm automat problema de pe linia găsită
                    if "citește mai departe" in continut_linie and "citeşte mai departe" not in continut_linie:
                        print("Problema probabilă: textul butonului folosește caracterul 'ș' (cu virgulă) în 'citește mai departe',")
                        print("dar expresia regulată caută exact 'citeşte mai departe' cu 'ş' (alt cod Unicode).")
                        print("Soluție: înlocuiește în HTML 'citește mai departe' cu 'citeşte mai departe' (sau ajustează regex-ul să accepte ambele variante).")
                    elif "read more" in continut_linie and 'class="btn-setting' not in continut_linie:
                        print("Problema probabilă: există un link 'read more', dar nu cu structura așteptată de template (clasa 'btn-setting ...').")
                        print("Soluție: aliniază structura acestui buton cu cea din template sau adaptează regex-ul.")
                    elif "id=\"external2\"" in continut_linie and "cite" in continut_linie:
                        print("Problema probabilă: există un buton 'citește/citeşte mai departe' în acest <div>,")
                        print("dar textul sau structura nu se potrivește exact cu șablonul folosit în regex (spații, diacritice, sau atribute diferite).")
                    else:
                        print("Nu am putut găsi linia exactă în fișierul HTML original (titlul / href-ul nu apar într-o singură linie).")
                    # Forțăm aceeași eroare, dar cu mesaj explicit, după ce am afișat informațiile de depanare
                    raise IndexError(f"Nu există buton 'read more' pentru articolul cu index {i} din {cale_fisier_html}")
                new_template = template_categorie
                # === facem replace cu informatiile din articolul original ===
                new_template_1 = new_template.replace(date_index2, categ_date_link_title_desc[i][0].replace(', in', '').strip())  # probleme la DATA la unul din articole
                new_template_2 = new_template_1.replace(link_title_desc_index2[0], categ_date_link_title_desc[i][1])
                new_template_3 = new_template_2.replace(link_title_desc_index2[1], categ_date_link_title_desc[i][2])
                new_template_4 = new_template_3.replace(link_title_desc_index2[2], categ_date_link_title_desc[i][3].lstrip())
                new_template_5 = new_template_4.replace(paragraf_index2, paragraf[i]) # lipseste <em> sau </em> de la unul din <p class="text_obisnuit2"> din lista articolelor din CATEGORII
                #  cauta asa:     FIND: <p class="text_obisnuit2">(?!\s*<em>)(?!Latest articles accessed by readers:)(.*?)</p>
                #              REPLACE: <p class="text_obisnuit2"><em>$1</em></p>
                new_template_6 = new_template_5.replace(read_more, butoane[i]) # asta inseamna ca ai o problema la partea "cite&#351;te mai departe" din categorii
                new_template_7 = new_template_6.replace(h3[0], categ_link_title[i][0])
                new_template_8 = new_template_7.replace(h3[1], categ_link_title[i][1])
                final_text = final_text + new_template_8 + '\n'

            text_html_model = text_html_model.replace(articol_categorie_index2, final_text)
            final_text = text_html_model

            # schimbare CATEGORIES index2
            # preluare lista fisier html
            # lista_pattern = re.compile('<ul id="sidebarNavigation">([\s\S]*?)</ul>')
            lista_pattern = re.compile(r'<ul id="sidebarNavigation">([\s\S]*?)</ul>')

            lista = re.findall(lista_pattern, text_html)
            if len(lista) != 0:
                lista = lista[0]
                elemente_lista_pattern = re.compile(r'<li><a href=\"(.*?)\" title=\"(.*?)\">(.*?) \((.*?)\)</a></li>')
                elemente_lista = re.findall(elemente_lista_pattern, lista)
                if elemente_lista != 0:
                    categories_pattern = re.compile(r'<!-- Categories -->([\s\S]*?)<!-- BOOKS START -->')
                    categories = re.findall(categories_pattern, final_text)
                    if len(categories) != 0:
                        categories_section = categories[0]
                        template_category = read_text_from_file('C:\\Folder1\\category-name.txt')

                        # Match complete div blocks - pattern matches from <div class="categories-name"> to its closing </div>
                        # The pattern uses non-greedy match but ensures we get the complete block by matching until </div> followed by newline or next <div
                        elemente_lista_model_full_pattern = re.compile(r'<div class="categories-name">.*?</div>', re.DOTALL)
                        elemente_lista_model_full = elemente_lista_model_full_pattern.findall(categories_section)

                        # Helper function to normalize text for comparison
                        def normalize_text_for_match(text):
                            """Normalize a text for comparison (decode known HTML entities, unify diacritics, collapse whitespace)."""
                            if not text:
                                return ""
                            # Ordered (old, new) pairs; the original notes this table is
                            # identical to the one in "Update Categorii Numere.py".
                            substitutions = (
                                ('&#351;', 'ş'), ('&#350;', 'Ş'),
                                ('&#259;', 'ă'), ('&#258;', 'Ă'),
                                ('&#226;', 'â'), ('&#194;', 'Â'),
                                ('&#238;', 'î'), ('&#206;', 'Î'),
                                ('&#355;', 'ţ'), ('&#354;', 'Ţ'),
                                ('&#730;', '˚'),
                                ('&plus;', '+'),
                                ('ș', 'ş'), ('Ș', 'Ş'),
                                ('ț', 'ţ'), ('Ț', 'Ţ'),
                            )
                            cleaned = text.strip()
                            for source, target in substitutions:
                                cleaned = cleaned.replace(source, target)
                            # Collapse every run of whitespace into a single space
                            return ' '.join(cleaned.split())

                        # Create a dictionary mapping normalized category names to their data for easier lookup
                        elemente_dict = {}
                        for elem in elemente_lista:
                            # Normalize category name for matching
                            cat_name = normalize_text_for_match(elem[2])
                            elemente_dict[cat_name] = elem

                        # Replace categories by matching category names, not positions
                        for div_block in elemente_lista_model_full:
                            # Extract category name and number from the div block
                            cat_name_pattern = re.compile(r'<i class="fa fa-angle-right font-14 color-blue mr-1"></i>\s*([^<]+?)\s*<span>(\d+)</span>', re.DOTALL)
                            cat_match = cat_name_pattern.search(div_block)
                            if cat_match:
                                cat_name_in_div = normalize_text_for_match(cat_match.group(1))
                                number_in_div = cat_match.group(2)  # Păstrăm numărul din index.html/index2.html
                                # Find matching category in elemente_lista
                                if cat_name_in_div in elemente_dict:
                                    elem = elemente_dict[cat_name_in_div]
                                    new_template_category = template_category
                                    a_pattern = re.compile(r'<a href=\"(.*?)\" title=\"(.*?)\">')
                                    a = re.findall(a_pattern, new_template_category)[0]
                                    p_pattern = re.compile(r'<p class="font-16 color-grey text-capitalize"><i class="fa fa-angle-right font-14 color-blue mr-1"></i> (.*?) <span>(.*?)</span> </p>')
                                    p = re.findall(p_pattern, new_template_category)[0]
                                    new_template_category = new_template_category.replace(a[0], elem[0])
                                    new_template_category = new_template_category.replace(a[1], elem[1])
                                    new_template_category = new_template_category.replace(p[0], elem[2])
                                    # Păstrăm numărul din div_block (din index.html/index2.html), nu din fișierul original
                                    new_template_category = new_template_category.replace(f'<span>{p[1]}</span>', f'<span>{number_in_div}</span>')
                                    # Replace the entire div block in the categories section (only first occurrence)
                                    categories_section = categories_section.replace(div_block, new_template_category, 1)

                        # Replace the entire categories section back into final_text
                        final_text = final_text.replace(categories[0], categories_section)

                    else:
                        print("No categories + books start")
                        print("Problema: în modelul index2.html nu există blocul dintre <!-- Categories --> și <!-- BOOKS START -->.")
                        print("Soluție: copiază dintr-un index2.html funcțional secțiunea de categorii (inclusiv comentariile) și insereaz-o în acest fișier model.")
                else:
                    print("Niciun element <li>.")
                    print("Problema: în <ul id=\"sidebarNavigation\"> nu există niciun <li> cu link de categorie.")
                    print("Soluție: verifică în fișierul original lista de categorii din sidebar și adaugă elementele <li> lipsă.")
            else:
                print("Tag <ul> gol.")
                print("Problema: lipsește blocul <ul id=\"sidebarNavigation\">...</ul> din fișierul original.")
                print("Soluție: copiază această structură dintr-o pagină de categorie care funcționează corect.")


            # Shimbare LINK-URI FLAGS

            flags_pattern = re.compile(r'<!-- FLAGS_1 -->([\s\S]*?)<!-- FLAGS -->')
            flags = re.findall(flags_pattern, text_html)
            if len(flags) != 0:
                flags = flags[0]
                links_pattern = re.compile(r'<a href=\"(.*?)\">')
                links = re.findall(links_pattern, flags)
                if len(links) != 0:
                    flags_model = re.findall(flags_pattern, final_text)
                    if len(flags_model) != 0:
                        flags_model = flags_model[0]
                        links_pattern_model = re.compile(r'<li><a cunt_code=\"\+\d+\" href=\"(.*?)\">')
                        links_model = re.findall(links_pattern_model, flags_model)
                    if len(links_model) != 0:
                        new_flags = flags_model
                        for i in range(len(links)):
                            new_flags = new_flags.replace(links_model[i], links[i])
                        final_text = final_text.replace(flags_model, new_flags)
                    else:
                        print("Fara links in flags model")
                        print("Problema: în modelul index2.html, în secțiunea <!-- FLAGS_1 --> nu există linkuri cu atributele așteptate.")
                        print("Soluție: copiază blocul de steaguri (FLAGS) dintr-un index2.html funcțional.")
                else:
                    print("Fara flags model in textul final")
                    print("Problema: în fișierul model nu există blocul comentat <!-- FLAGS_1 -->...<!-- FLAGS -->.")
                    print("Soluție: adaugă acest bloc dintr-o pagină model corectă.")
            else:
                print("Fara linkuri in flags.")
                print("Problema: în fișierul original, în blocul <!-- FLAGS_1 --> nu există niciun <a href=\"...\">.")
                print("Soluție: verifică HTML-ul de steaguri, adaugă linkurile lipsă sau repară tagurile rupte.")

            # STARS - PHP
            stars_php_pattern = re.compile(r'\$item_id = (.*?);')
            stars_php = re.findall(stars_php_pattern, text_html)
            stars_php_model = re.findall(stars_php_pattern, final_text)
            if len(stars_php) != 0:
                stars_php = stars_php[0]
                if len(stars_php_model) != 0:
                    stars_php_model = stars_php_model[0]
                    final_text = final_text.replace(stars_php_model, stars_php) # FACE REPLACE
                else:
                    print("No stars fisier model")
                    print("Problema: în fișierul model (index2.html) nu există linia PHP cu $item_id = ...; pentru stele.")
                    print("Soluție: copiază dintr-un fișier model funcțional linia sau blocul PHP responsabil de rating/stars.")
            else:
                print("No stars fisier original")
                print("Problema: în fișierul original nu există linia PHP de forma $item_id = ...; necesară pentru stele.")
                print("Soluție: adaugă sau corectează această linie la începutul fișierului, după modelul paginilor care afișează corect rating-ul.")

            # TITLE
            title_pattern = re.compile(r'<title>(.*?)</title>')
            text_title = re.findall(title_pattern, text_html)
            text_title_model = re.findall(title_pattern, final_text)
            print("\n=== DEBUG META (CATEGORIE) ===")
            print("Fisier:", cale_fisier_html)
            print("TITLE original list:", text_title)
            print("TITLE model list:", text_title_model)
            if len(text_title) != 0 and len(text_title_model) != 0:
                text_title_val = text_title[0]
                text_title_model_val = text_title_model[0]
                print("TITLE original ales:", text_title_val)
                print("TITLE model ales:", text_title_model_val)
                if text_title_val != text_title_model_val:
                    print("Facem replace TITLE din model cu original.")
                else:
                    print("ATENTIE: TITLE original si model sunt identice, nu se schimba nimic.")
                final_text = final_text.replace(text_title_model_val, text_title_val)
            else:
                print("Fisier html fara tag title SAU fara title in model: {}".format(cale_fisier_html))
                print("Problema: în fișierul original lipsește tagul <title>...</title> din <head> sau în model.")
                print("Soluție: deschide fișierul, caută secțiunea <head> și adaugă un <title>Titlul articolului</title> după modelul din index.html/index2.html.")

            # DESCRIPTION
            description_pattern = re.compile(r'<meta name="description" content="(.*?)">')
            text_description = re.findall(description_pattern, text_html)
            text_description_model = re.findall(description_pattern, final_text)
            print("DESCRIPTION original list:", text_description)
            print("DESCRIPTION model list:", text_description_model)
            if len(text_description) != 0 and len(text_description_model) != 0:
                text_description_val = text_description[0]
                text_description_model_val = text_description_model[0]
                print("DESCRIPTION original ales:", text_description_val)
                print("DESCRIPTION model ales:", text_description_model_val)
                if text_description_val != text_description_model_val:
                    print("Facem replace DESCRIPTION din model cu original.")
                else:
                    print("ATENTIE: DESCRIPTION original si model sunt identice, nu se schimba nimic.")
                final_text = final_text.replace(text_description_model_val, text_description_val)
            else:
                print("Fisier html fara tag description SAU fara description in model: {}".format(cale_fisier_html))
                print("Problema: lipsește meta description: <meta name=\"description\" content=\"...\"> în original sau în model.")
                print("Soluție: adaugă în <head> un astfel de tag, copiat dintr-o pagină funcțională și adaptat pentru acest articol.")

            # CANONICAL
            canonical_pattern = re.compile(r'<link rel="canonical" href="(.*?)" />')
            text_canonical = re.findall(canonical_pattern, text_html)
            text_canonical_model = re.findall(canonical_pattern, final_text)
            print("CANONICAL original list:", text_canonical)
            print("CANONICAL model list:", text_canonical_model)
            if len(text_canonical) != 0 and len(text_canonical_model) != 0:
                text_canonical_val = text_canonical[0]
                text_canonical_model_val = text_canonical_model[0]
                print("CANONICAL original ales:", text_canonical_val)
                print("CANONICAL model ales:", text_canonical_model_val)
                if text_canonical_val != text_canonical_model_val:
                    print("Facem replace CANONICAL din model cu original.")
                else:
                    print("ATENTIE: CANONICAL original si model sunt identice, nu se schimba nimic.")
                final_text = final_text.replace(text_canonical_model_val, text_canonical_val)
            else:
                print("Fisier html fara tag canonical SAU fara canonical in model: {}".format(cale_fisier_html))
                print("Problema: lipsește <link rel=\"canonical\" href=\"...\" /> din <head> în original sau în model.")
                print("Soluție: copiază un tag canonical corect dintr-o pagină similară și actualizează URL-ul la articolul curent.")

            # ULTIMELE ARTICOLE - PĂSTREAZĂ TEMPLATE-UL DIN INDEX.HTML
            print("Template-ul 'Recent Post' din index.html păstrat cu articolele predefinite.")

        if len(articol_categorie_index2) == 0:
            print("Nu exista articol categorie in index2.html")
            print("Problema: în index2.html nu există blocul comentat <!-- ARTICOL START -->...<!-- ARTICOL FINAL --> pentru template-ul de categorie.")
            print("Soluție: copiază acest bloc dintr-un index2.html funcțional (site nou) și asigură-te că markerii de comentariu sunt identici.")
    else:
        # === citire fisier model - index.html ===
        text_html_model = read_text_from_file("e:\\Carte\\BB\\17 - Site Leadership\\alte\\Ionel Balauta\\Aryeht\\Task 1 - Traduce tot site-ul\\Doar Google Web\\Andreea\\Meditatii\\Sedinta 18\\index.html")
        # ARTICOL START - FINAL
        articol_pattern = re.compile(r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->[\s\S]*?')
        text_articol = re.findall(articol_pattern, text_html)
        text_articol_model = re.findall(articol_pattern, text_html_model)
        if len(text_articol) != 0 and len(text_articol_model) != 0:
            text_articol = text_articol[0]
            text_articol_model = text_articol_model[0]
            text_html_model_1 = text_html_model.replace(text_articol_model, text_articol)
            final_text = text_html_model_1
        else:
            print("Fisier html fara ARTICOL START/FINAL: {}".format(cale_fisier_html))
            print("Problema: în fișierul original lipsesc markerii <!-- ARTICOL START --> și/sau <!-- ARTICOL FINAL -->.")
            print("Soluție: învelește conținutul articolului în acești doi markeri, după modelul din index.html (site vechi).")

        # TITLE
        title_pattern = re.compile(r'<title>(.*?)</title>')
        text_title = re.findall(title_pattern, text_html)
        text_title_model = re.findall(title_pattern, text_html_model_1) # inseamna ca ai probleme la partea <!-- ARTICOL START --> ori lipseste, ori este scris gresit
        print("\n=== DEBUG META (ARTICOL SIMPLU) ===")
        print("Fisier:", cale_fisier_html)
        print("TITLE original list:", text_title)
        print("TITLE model list:", text_title_model)
        if len(text_title) != 0 and len(text_title_model) != 0:
            text_title_val = text_title[0]
            text_title_model_val = text_title_model[0]
            print("TITLE original ales:", text_title_val)
            print("TITLE model ales:", text_title_model_val)
            if text_title_val != text_title_model_val:
                print("Facem replace TITLE din model cu original.")
            else:
                print("ATENTIE: TITLE original si model sunt identice, nu se schimba nimic.")
            text_html_model_2 = text_html_model_1.replace(text_title_model_val, text_title_val)
            final_text = text_html_model_2
        else:
            print("Fisier html fara tag title SAU fara title in model: {}".format(cale_fisier_html))
            print("Problema: în fișierul original sau în model lipsește tagul <title>...</title>.")
            print("Soluție: adaugă <title>...</title> în <head>, folosind ca exemplu o pagină generată corect.")

        # DESCRIPTION
        description_pattern = re.compile(r'<meta name="description" content="(.*?)">')
        text_description = re.findall(description_pattern, text_html)
        text_description_model = re.findall(description_pattern, text_html_model_2)
        print("DESCRIPTION original list:", text_description)
        print("DESCRIPTION model list:", text_description_model)
        if len(text_description) != 0 and len(text_description_model) != 0:
            text_description_val = text_description[0]
            text_description_model_val = text_description_model[0]
            print("DESCRIPTION original ales:", text_description_val)
            print("DESCRIPTION model ales:", text_description_model_val)
            if text_description_val != text_description_model_val:
                print("Facem replace DESCRIPTION din model cu original.")
            else:
                print("ATENTIE: DESCRIPTION original si model sunt identice, nu se schimba nimic.")
            text_html_model_3 = text_html_model_2.replace(text_description_model_val, text_description_val)
            final_text = text_html_model_3
        else:
            print("Fisier html fara tag description SAU fara description in model: {}".format(cale_fisier_html))
            print("Problema: nu există meta description în fișierul original sau în model.")
            print("Soluție: adaugă <meta name=\"description\" content=\"...\"> în <head>, după modelul din index.html.")

        # CANONICAL
        canonical_pattern = re.compile(r'<link rel="canonical" href="(.*?)" />')
        text_canonical = re.findall(canonical_pattern, text_html)
        text_canonical_model = re.findall(canonical_pattern, text_html_model_3)  # ASTA INSEAMNA CA Tagul DESCRIPTION nu e inchis bine. Trebuie "> in loc de "/>
        print("CANONICAL original list:", text_canonical)
        print("CANONICAL model list:", text_canonical_model)
        if len(text_canonical) != 0 and len(text_canonical_model) != 0:
            text_canonical_val = text_canonical[0]
            text_canonical_model_val = text_canonical_model[0]
            print("CANONICAL original ales:", text_canonical_val)
            print("CANONICAL model ales:", text_canonical_model_val)
            if text_canonical_val != text_canonical_model_val:
                print("Facem replace CANONICAL din model cu original.")
            else:
                print("ATENTIE: CANONICAL original si model sunt identice, nu se schimba nimic.")
            text_html_model_4 = text_html_model_3.replace(text_canonical_model_val, text_canonical_val)
            final_text = text_html_model_4
        else:
            print("Fisier html fara tag canonical SAU fara canonical in model: {}".format(cale_fisier_html))
            print("Problema: nu există link rel=\"canonical\" nici în fișierul original, nici în model.")
            print("Soluție: adaugă <link rel=\"canonical\" href=\"URL-articol\" /> în <head>, copiat dintr-o pagină corectă.")


        # remove DIV tag and TABLE tag
        text_articol_model = re.findall(articol_pattern, text_html_model_4)
        text_articol_model_old = text_articol_model[0]
        text_articol_model = text_articol_model[0]
        # Eliminăm DOAR wrapper-ul <div align="justify"> care învelește articolul vechi,
        # NU toate tag-urile </div> din blocul articolului (altfel stricăm layout-ul coloanelor)
        text_articol_model = text_articol_model.replace('<div align="justify">', '', 1)
        # Închidere specifică pentru wrapper-ul de mai sus: secvența <!-- SASA-2 --> urmată de </div>
        text_articol_model = re.sub(r'<!-- SASA-2 -->\s*</div>', r'<!-- SASA-2 -->', text_articol_model, count=1)

        table_pattern = re.compile(r'<table[\s\S]*?</table>')
        text_table = re.findall(table_pattern, text_articol_model)
        if len(text_table) != 0:
            text_table = text_table[0]
            text_articol_model = text_articol_model.replace(text_table, '')
            text_html_model_5 = text_html_model_4.replace(text_articol_model_old, text_articol_model)
            final_text = text_html_model_5
        else:
            print("No text table")
            print("Observație: în articol nu există niciun <table>...</table> de eliminat; dacă te așteptai să fie, verifică dacă tagurile <table> sunt închise corect.")

        # schimbare tag-uri ARTICLE TITLE
        article_title_pattern = re.compile(r'<h1 class="den_articol" itemprop="name">(.*?)</h1>') # site vechi
        article_title = re.findall(article_title_pattern, text_articol_model_old)
        if len(article_title) != 0:
            article_title = article_title[0]
            h3_title_pattern = re.compile(r'<h3 class="font-weight-normal" itemprop="name"><a href="javascript:void\(0\)" class="color-black">(.*?)</a></h3>')   # site nou
            h3_title = re.findall(h3_title_pattern, text_html_model_5)
            if len(h3_title) != 0:
                h3_title = h3_title[0]
                text_html_model_6 = text_html_model_5.replace(h3_title, article_title)
                final_text = text_html_model_6
            else:
                print("No h3 title.")
                print("Problema: în index.html (site nou) lipsește titlul în <h3 class=\"font-weight-normal\" ...> sau structura lui este diferită.")
                print("Soluție: verifică <h3> din model și aliniază-l la structura așteptată (cu <a href=\"javascript:void(0)\" class=\"color-black\">...).")
        else:
            print("Nu gaseste titlul articolului in tagul H3.")
            print("Problema: în articolul vechi nu există <h1 class=\"den_articol\" itemprop=\"name\">Titlu</h1>.")
            print("Soluție: adaugă acest <h1> sau verifică dacă regex-ul pentru titlu se potrivește cu structura reală din fișier.")

            #UnboundLocalError: cannot access local variable 'text_html_model_6' where it is not associated with a value
            #  DACA AI EROAREA ASTA INSEAMNA CA AI PROBLEME IN FISIERUL index.html la tagul H3 sau H1

        # schimbare DATE
        date_pattern = re.compile(r'<td class="text_dreapta">(.*?),\s+in\s+<a')
        date = re.findall(date_pattern, text_articol_model_old)
        if len(date) != 0:
            date = date[0]
            # MODIFICARE 09/03
            date_section_pattern = re.compile(r'<!--STARTDATES-->([\s\S]*?)<!--FINNISHDATES-->')
            date_section = re.findall(date_section_pattern, text_html_model_6)
            if len(date_section) > 0:
                date_section = date_section[0]
                date_pattern_model = re.compile(r'<a href="javascript:void\(0\)" class="color-black">(.*?)</a>')
                date_model = re.findall(date_pattern_model, date_section)
                if len(date_model) != 0:
                    date_model = date_model[0]
                    text_html_model_7 = text_html_model_6.replace(date_model, date)
                    final_text = text_html_model_7
                else:
                    print('No date in model.')
                    print("Problema: în secțiunea <!--STARTDATES--><!--FINNISHDATES--> din model nu există linkul cu data (<a class=\"color-black\">...).")
                    print("Soluție: copiază structura de dată dintr-un index.html funcțional.")
            else:
                print("No date section: <!--STARTDATES--><!--FINNISHDATES-->")
                print("Problema: în model nu există deloc comentariile <!--STARTDATES--> și <!--FINNISHDATES-->.")
                print("Soluție: adaugă acest bloc de date dintr-un fișier model corect.")
        else:
            print("No date.")
            print("Problema: în articolul vechi nu există data în <td class=\"text_dreapta\">... , in <a ...>Categoria</a>.")
            print("Soluție: verifică și completează manual data în structura așteptată.")

        # schimbare SECTION
        section_pattern_model = re.compile(r'<a href=\"(.*?)\" title=\"(.*?)\" class="color-green font-weight-600 mx-1" id="hidden">(.*?)</a>')
        section_model = re.findall(section_pattern_model, text_html_model_7) #  VEZI ca ai un Spatiu GOL la DATA, sau Incerci sa rulezi codul dintr-o pagina aflata in Principal 2022, in loc de pagin din Website Vechi
   # daca da aici eroare, inseamna ca nu ai pus unde trebuie  <!--STARTDATES--><!--FINNISHDATES-->
        # print(section_model)
        if len(section_model) != 0:
            section_model = section_model[0]
            section_pattern = re.compile(r'<a href=\"(.*?)\" title=\"(.*?)\" class="external" rel="category tag">(.*?)</a>') # site vechi
            section = re.findall(section_pattern, text_articol_model_old)
            if len(section) != 0:
                section = section[0]
                text_html_model_8 = text_html_model_7.replace(section_model[0], section[0])
                text_html_model_9 = text_html_model_8.replace(section_model[1], section[1])
                text_html_model_10 = text_html_model_9.replace(section_model[2], section[2])
                final_text = text_html_model_10
            else:
                print("No section.")
                print("Problema: în articolul vechi nu există linkul de categorie cu clasa \"external\" și rel=\"category tag\".")
                print("Soluție: verifică <td class=\"text_dreapta\"> și adaugă linkul de categorie după modelul paginilor corecte.")
        else:
            print("No section model.")
            print("Problema: în model (site nou) nu există linkul de categorie cu clasa \"color-green font-weight-600 mx-1\" și id=\"hidden\".")
            print("Soluție: copiază structura acestui link dintr-un index.html funcțional.")

        # schimbare CATEGORIES
        # preluare lista fisier html din c:\Folder1\fisiere_html\
        lista_pattern = re.compile(r'<ul id="sidebarNavigation">([\s\S]*?)</ul>')
        lista = re.findall(lista_pattern, text_html)
        if len(lista) != 0:
            lista = lista[0]
            elemente_lista_pattern = re.compile(r'<li><a href=\"(.*?)\" title=\"(.*?)\">(.*?) \((.*?)\)</a></li>')
            elemente_lista = re.findall(elemente_lista_pattern, lista)
            if elemente_lista:
                categories_pattern = re.compile(r'<!-- Categories -->([\s\S]*?)<!-- BOOKS START -->')
                categories = re.findall(categories_pattern, final_text)
                if len(categories) != 0:
                    categories_section = categories[0]
                    template_category = read_text_from_file('C:\\Folder1\\category-name.txt')

                    # Match complete div blocks - pattern matches from <div class="categories-name"> to its closing </div>
                    # The pattern uses non-greedy match but ensures we get the complete block by matching until </div> followed by newline or next <div
                    elemente_lista_model_full_pattern = re.compile(r'<div class="categories-name">.*?</div>', re.DOTALL)
                    elemente_lista_model_full = elemente_lista_model_full_pattern.findall(categories_section)

                    # Helper function to normalize text for comparison
                    def normalize_text_for_match(text):
                        """Normalizează textul pentru comparație (înlocuiește entitățile HTML și spațiile)"""
                        if not text:
                            return ""
                        text = text.strip()
                        # Normalizează entitățile HTML comune (identic cu Update Categorii Numere.py)
                        replacements = {
                            '&#351;': 'ş', '&#350;': 'Ş',
                            '&#259;': 'ă', '&#258;': 'Ă',
                            '&#226;': 'â', '&#194;': 'Â',
                            '&#238;': 'î', '&#206;': 'Î',
                            '&#355;': 'ţ', '&#354;': 'Ţ',
                            '&#730;': '˚',
                            '&plus;': '+',
                            'ș': 'ş', 'Ș': 'Ş',
                            'ț': 'ţ', 'Ț': 'Ţ',
                        }
                        for old, new in replacements.items():
                            text = text.replace(old, new)
                        # Normalizează spațiile multiple
                        text = ' '.join(text.split())
                        return text

                    # Create a dictionary mapping normalized category names to their data for easier lookup
                    elemente_dict = {}
                    for elem in elemente_lista:
                        # Normalize category name for matching
                        cat_name = normalize_text_for_match(elem[2])
                        elemente_dict[cat_name] = elem

                    # Replace categories by matching category names, not positions
                    for div_block in elemente_lista_model_full:
                        # Extract category name and number from the div block
                        cat_name_pattern = re.compile(r'<i class="fa fa-angle-right font-14 color-blue mr-1"></i>\s*([^<]+?)\s*<span>(\d+)</span>', re.DOTALL)
                        cat_match = cat_name_pattern.search(div_block)
                        if cat_match:
                            cat_name_in_div = normalize_text_for_match(cat_match.group(1))
                            number_in_div = cat_match.group(2)  # Păstrăm numărul din index.html/index2.html
                            # Find matching category in elemente_lista
                            if cat_name_in_div in elemente_dict:
                                elem = elemente_dict[cat_name_in_div]
                                new_template_category = template_category
                                a_pattern = re.compile(r'<a href=\"(.*?)\" title=\"(.*?)\">')
                                a = re.findall(a_pattern, new_template_category)[0]
                                p_pattern = re.compile(r'<p class="font-16 color-grey text-capitalize"><i class="fa fa-angle-right font-14 color-blue mr-1"></i> (.*?) <span>(.*?)</span> </p>')
                                p = re.findall(p_pattern, new_template_category)[0]
                                new_template_category = new_template_category.replace(a[0], elem[0])
                                new_template_category = new_template_category.replace(a[1], elem[1])
                                new_template_category = new_template_category.replace(p[0], elem[2])
                                # Păstrăm numărul din div_block (din index.html/index2.html), nu din fișierul original
                                new_template_category = new_template_category.replace(f'<span>{p[1]}</span>', f'<span>{number_in_div}</span>')
                                # Replace the entire div block in the categories section (only first occurrence)
                                categories_section = categories_section.replace(div_block, new_template_category, 1)

                    # Replace the entire categories section back into final_text
                    final_text = final_text.replace(categories[0], categories_section)
                    # print("==========================")
                    # print(final_text)
                    text_html_model_14 = final_text
                else:
                    print("No categories + books start")
                    print("Problema: în modelul index.html nu există blocul <!-- Categories -->...<!-- BOOKS START -->.")
                    print("Soluție: adaugă această secțiune de categorii copiată dintr-o pagină funcțională.")
            else:
                print("Niciun element <li>.")
                print("Problema: în <ul id=\"sidebarNavigation\"> nu există elemente <li> cu linkuri de categorie.")
                print("Soluție: completează lista de categorii în fișierul original, după modelul unei pagini corecte.")
        else:
            print("Tag <ul> gol.")
            print("Problema: lipsește complet <ul id=\"sidebarNavigation\">...</ul> din articolul vechi.")
            print("Soluție: copiază această structură dintr-un fișier HTML care are sidebarul de categorii.")

        # Shimbare LINK-URI FLAGS
        flags_pattern = re.compile(r'<!-- FLAGS_1 -->([\s\S]*?)<!-- FLAGS -->')
        flags = re.findall(flags_pattern, text_html)
        if len(flags) != 0:
            flags = flags[0]
            links_pattern = re.compile(r'<a href=\"(.*?)\">')
            links = re.findall(links_pattern, flags)
            if len(links) != 0:
                # print("Links: ", links)
                flags_model = re.findall(flags_pattern, text_html_model_14)
                if len(flags_model) != 0:
                    flags_model = flags_model[0]
                    # print("Flags: ", flags_model)
                    links_pattern_model = re.compile(r'<li><a cunt_code=\"\+\d+\" href=\"(.*?)\">')
                    links_model = re.findall(links_pattern_model, flags_model)
                    # print(links_model)
                    text_html_model_15 = text_html_model_14
                    if len(links_model) != 0:
                        for i in range(len(links)):
                            # print(links[i], links_model[i])
                            text_html_model_15 = text_html_model_15.replace(links_model[i], links[i]) # FACE REPLACE
                            final_text = text_html_model_15
                    else:
                        print("Fara links in flags model")
                else:
                    print("Fara links in flags model")
            else:
                print("Fara linkuri in flags.")
        else:
            print("Fara flags in articol original.")

        # STARS - PHP
        stars_php_pattern = re.compile(r'\$item_id = (.*?);')
        stars_php = re.findall(stars_php_pattern, text_html)
        stars_php_model = re.findall(stars_php_pattern, text_html_model_15)
        if len(stars_php) != 0:
            stars_php = stars_php[0]
            if len(stars_php_model) != 0:
                stars_php_model = stars_php_model[0]
                text_html_model_16 = text_html_model_15.replace(stars_php_model, stars_php) # FACE REPLACE
                final_text = text_html_model_16
            else:
                print("No stars fisier model")
        else:
            print("No stars fisier original")  #  lipseste $item_id = la inceputul fisierului

        # ULTIMELE ARTICOLE - PĂSTREAZĂ TEMPLATE-UL DIN INDEX.HTML
        print("Template-ul 'Recent Post' din index.html păstrat cu articolele predefinite.")

    file_path = cale_fisiere_gata + "\\" + os.path.basename(cale_fisier_html)
    write_to_file(final_text, file_path)
    print("Scriere efectuata cu succes.")



 def creare_fisiere_html(cale_folder_html, cale_fisiere_gata):
    """
    Process every eligible ``.html`` file found directly in a folder.

    Each entry of ``cale_folder_html`` whose name ends with ``.html`` and is
    not on the ignore list below is handed to ``copiaza_continut_html``
    together with ``cale_fisiere_gata``. A progress line is printed per file
    and the number of files handed over is printed at the end.

    cale_folder_html: folder whose entries are listed with ``os.listdir``
        (not recursive)
    cale_fisiere_gata: output folder, forwarded unchanged to
        ``copiaza_continut_html``
    """
    # File names that must never be processed. A frozenset gives constant-time
    # membership tests and makes it explicit that the collection is read-only.
    fisiere_de_ignorat = frozenset({
        "webinar-a-black-square-into-the-dazzling-white.html",
        "webinar-a-king-for-my-kingdom.html",
        "webinar-convince-me-that-you-are-alive.html",
        "webinar-in-emptiness-is-hidden-the-fullness.html",
        "webinars.html",
        "webinar-the-circle-that-closes-all-senses.html",
        "webinar-the-dirigible-progress-of-leadership.html",
        "webinar-the-distinctive-color-of-leadership.html",
        "webinar-the-impetus-towards-excellence.html",
        "webinar-the-man-who-made-the-june-26.html",
        "webinar-the-mystery-of-leadership.html",
        "webinar-the-narrow-corridor-towards-the-heights-of-perfection.html",
        "webinar-the-road-of-truth.html",
        "webinar-the-sweet-source-of-perfection.html",
        "webinar-the-too-narrow-ladder-of-leadership.html",
        "webinar-the-unitary-whole-of-leadership.html",
        "webinar-the-weak-construction-of-leadership.html",
        "directory.html",
        "Python - EXEMPLU EXAMPLE.html",
        "python-PROBA-EXEMPLU.html",
        "parteneri.html",
        "y_key_e479323ce281e459.html",
        "webinarii.html",
        "webinar-un-patrat-negru-in-albul-orbitor.html",
        "webinar-un-rege-pentru-regatul-meu.html",
        "webinar-totul-unitar-al-leadershipului.html",
        "webinar-taina-leadershipului.html",
        "webinar-progresul-dirijabil-al-leadershipului.html",
        "webinar-scara-prea-ingusta-a-leadershipului.html",
        "webinar-in-gol-se-ascunde-plinul.html",
        "webinar-omul-care-a-facut-26-iunie.html",
        "webinar-drumul-adevarului.html",
        "webinar-dulcele-izvor-al-perfectiunii.html",
        "webinar-culoarea-distincta-a-leadershipului.html",
        "webinar-culoarul-ingust-spre-culmile-desavarsirii.html",
        "webinar-constructia-subreda-a-leadershipului.html",
        "webinar-convinge-ma-ca-esti-in-viata.html",
        "webinar-cercul-care-inchide-toate-sensurile.html",
        "webinar-avantul-spre-excelenta.html",
        "search.html",
    })

    count = 0
    for f in os.listdir(cale_folder_html):
        # Only names with the exact (case-sensitive) ".html" suffix qualify.
        if not f.endswith('.html'):
            continue
        if f in fisiere_de_ignorat:
            print(f"Ignorăm fișierul: {f}")
            continue
        # os.path.join uses the platform separator and does not double it
        # when the folder path already ends with one, unlike "\\" concatenation.
        cale_fisier_html = os.path.join(cale_folder_html, f)
        print("FISIER CURENT: ", cale_fisier_html)
        copiaza_continut_html(cale_fisier_html, cale_fisiere_gata)
        count += 1
    print("Numarul de fisiere modificate: ", count)

 def main():
    """Run the HTML conversion for the fixed input and output folders."""
    folder_sursa = "C:\\Folder1\\fisiere_html"
    folder_destinatie = "C:\\Folder1\\fisiere_gata"
    creare_fisiere_html(folder_sursa, folder_destinatie)

 if __name__ == "__main__":
    # Start the conversion only when the file is executed as a script,
    # not when it is imported as a module.
    main()
No results found