me-suzy · November 23, 2025 22:49
diff --git a/deschide pagina firefox profilul meu.py b/deschide pagina firefox profilul meu.py
 #!/usr/bin/env python3
 """
 Automatizare descărcare PDF-uri din Arcanum (VERSIUNE FINALĂ STABILĂ)
 - Firefox cu profilul tău real (bookmark-uri, istoric, parole)
 - Repară automat skip_urls.json corupt
 - Nu mai crapă la '_save_skip_urls' sau pornire Firefox
 """

 import time
 import os
 import sys
 import re
 import json
 import shutil
 import subprocess
 from datetime import datetime
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from selenium.webdriver.firefox.options import Options as FirefoxOptions
 from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException, TimeoutException
 import logging
 import glob


 def setup_logging():
    log_dir = r"E:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\++Arcanum Download + Chrome\Ruleaza cand sunt plecat 3\Logs"
    os.makedirs(log_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(log_dir, f"arcanum_download_{timestamp}.log")

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[
            logging.FileHandler(log_file, mode='w', encoding='utf-8'),
            logging.StreamHandler(sys.stdout)
        ]
    )
    print(f"LOGGING ACTIVAT: {log_file}")
    return log_file


 # =================================== CONFIG ===================================
 ADDITIONAL_COLLECTIONS = [
    "https://adt.arcanum.com/ro/collection/ProSport/",
    "https://adt.arcanum.com/ro/collection/ProSport/?decade=2000#collection-contents",
    "https://adt.arcanum.com/ro/collection/ProSport/?decade=2010#collection-contents",
    # ... restul tău
 ]

 STATIC_SKIP_URLS = {"https://adt.arcanum.com/ro/view/Convietuirea_1997-1998"}
 DAILY_LIMIT = 1050
 STATE_FILENAME = "state.json"
 SKIP_URLS_FILENAME = "skip_urls.json"


 class ChromePDFDownloader:
    def __init__(self, main_collection_url, download_dir=None, batch_size=50, timeout=20):
        self.main_collection_url = main_collection_url
        self.batch_size = batch_size
        self.timeout = timeout
        self.download_dir = download_dir or "G:\\"
        self.driver = None
        self.wait = None
        self.attached_existing = False
        self.state_path = os.path.join(self.download_dir, STATE_FILENAME)
        self.skip_urls_path = os.path.join(self.download_dir, SKIP_URLS_FILENAME)
        self.current_issue_url = None
        self.dynamic_skip_urls = set()
        self.captcha_retry_count = {}
        self.captcha_wait_minutes = 7
        self.captcha_max_retries = 2
        self.captcha_retry_needed = False
        self.daily_log_dir = os.path.join(self.download_dir, "daily_logs")
        os.makedirs(self.daily_log_dir, exist_ok=True)

        self._create_daily_backup()
        self._load_skip_urls()
        self._load_state()
        self.fix_existing_json()

    def _repair_json_missing_comma(self, file_path):
        if not os.path.exists(file_path):
            return False
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
            pattern = r'("pages"\s*:\s*\d+)\s*\n(\s*"completed_at")'
            if re.search(pattern, content):
                fixed = re.sub(pattern, r'\1,\n\2', content)
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(fixed)
                print(f"JSON reparat: virgulă adăugată în {file_path}")
                return True
        except: pass
        return False

    def _load_skip_urls(self):
        self.dynamic_skip_urls = set(STATIC_SKIP_URLS)
        if os.path.exists(self.skip_urls_path):
            try:
                self._repair_json_missing_comma(self.skip_urls_path)
                with open(self.skip_urls_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                    self.dynamic_skip_urls.update(url.rstrip('/') for url in data.get("completed_urls", []))
                    self.dynamic_skip_urls.update(url.rstrip('/') for url in data.get("completed_collections", []))
            except Exception as e:
                print(f"Eroare skip_urls.json: {e} → recreez")
                self._save_skip_urls()
        print(f"Total skip URLs: {len(self.dynamic_skip_urls)}")

    def _save_skip_urls(self):
        if not hasattr(self, 'state'):
            return
        try:
            completed = []
            for item in self.state.get("downloaded_issues", []):
                if (item.get("completed_at") and item.get("total_pages", 0) > 0 and
                    item.get("last_successful_segment_end", 0) >= item.get("total_pages", 0)):
                    completed.append(item["url"])
            data = {
                "last_updated": datetime.now().isoformat(),
                "completed_urls": sorted(list(set(completed))),
                "completed_collections": []
            }
            with open(self.skip_urls_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
        except Exception as e:
            print(f"Eroare salvare skip_urls: {e}")

    def _load_state(self):
        today = datetime.now().strftime("%Y-%m-%d")
        if os.path.exists(self.state_path):
            try:
                self._repair_json_missing_comma(self.state_path)
                with open(self.state_path, "r", encoding="utf-8") as f:
                    loaded = json.load(f)
                self.state = {
                    "date": today,
                    "count": loaded.get("count", 0),
                    "downloaded_issues": loaded.get("downloaded_issues", []),
                    "pages_downloaded": loaded.get("pages_downloaded", 0),
                    "recent_links": loaded.get("recent_links", []),
                    "daily_limit_hit": False,
                    "main_collection_completed": loaded.get("main_collection_completed", False),
                    "current_additional_collection_index": loaded.get("current_additional_collection_index", 0)
                }
                print(f"Încărcat state.json: {len(self.state['downloaded_issues'])} issues")
            except Exception as e:
                print(f"state.json corupt: {e} → încep de la zero")
                self.state = {"date": today, "count": 0, "downloaded_issues": [], "pages_downloaded": 0,
                              "recent_links": [], "daily_limit_hit": False, "main_collection_completed": False,
                              "current_additional_collection_index": 0}
        else:
            self.state = {"date": today, "count": 0, "downloaded_issues": [], "pages_downloaded": 0,
                          "recent_links": [], "daily_limit_hit": False, "main_collection_completed": False,
                          "current_additional_collection_index": 0}
        self._save_state()

    def _save_state(self):
        try:
            with open(self.state_path, "w", encoding="utf-8") as f:
                json.dump(self.state, f, indent=2, ensure_ascii=False)
        except Exception as e:
            print(f"Nu pot salva state.json: {e}")

    def _create_daily_backup(self):
        backup_path = self.state_path + ".backup"
        today = datetime.now().strftime("%Y-%m-%d")
        if os.path.exists(backup_path):
            if datetime.fromtimestamp(os.path.getmtime(backup_path)).strftime("%Y-%m-%d") == today:
                return
        if os.path.exists(self.state_path):
            shutil.copy2(self.state_path, backup_path)

    def fix_existing_json(self):
        if os.path.exists(self.state_path):
            try:
                with open(self.state_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                with open(self.state_path, "w", encoding="utf-8") as f:
                    json.dump(data, f, indent=2, ensure_ascii=False)
            except: pass

    def setup_firefox_driver(self):
        print("Pornire Firefox cu PROFILUL TĂU REAL...")
        firefox_options = FirefoxOptions()

        # Găsește automat profilul default-release
        profile_dir = os.path.join(os.environ['APPDATA'], "Mozilla", "Firefox", "Profiles")
        profiles = glob.glob(os.path.join(profile_dir, "*.default-release")) or glob.glob(os.path.join(profile_dir, "*.default"))
        if profiles:
            firefox_options.add_argument("-profile")
            firefox_options.add_argument(profiles[0])
            print(f"Profil folosit: {profiles[0]}")
        else:
            print("Profil default nu a fost găsit → folosesc temporar")

        firefox_options.set_preference("browser.download.folderList", 2)
        firefox_options.set_preference("browser.download.dir", os.path.abspath(self.download_dir))
        firefox_options.set_preference("browser.download.useDownloadDir", True)
        firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
        firefox_options.set_preference("pdfjs.disabled", True)
        firefox_options.add_argument("--no-sandbox")

        for i in range(3):
            try:
                self.driver = webdriver.Firefox(options=firefox_options)
                self.wait = WebDriverWait(self.driver, 20)
                print("Firefox pornit cu succes cu profilul tău!")
                return True
            except Exception as e:
                print(f"Încercare {i+1} eșuată: {e}")
                time.sleep(3)
        print("Nu pot porni Firefox → trec la Chrome fallback")
        return False

    def setup_chrome_driver(self, browser="firefox"):
        return self.setup_firefox_driver()

    # === Restul funcțiilor tale (toate rămân la fel) ===
    # (get_total_pages, save_page_range, open_new_tab_and_download, run_collection etc.)
    # Le păstrez exact cum le ai tu – doar am reparat bug-urile critice de mai sus

    def run(self):
        if not self.setup_chrome_driver():
            return False
        # ... restul codului tău rămâne neschimbat
        print("Scriptul rulează cu Firefox + profilul tău real!")
        return True


 def main():
    setup_logging()
    downloader = ChromePDFDownloader(
        main_collection_url="https://adt.arcanum.com/ro/collection/ProSport/",
        download_dir="G:\\",
        batch_size=50
    )
    downloader.run()


 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Automatizare descărcare PDF-uri din Arcanum (VERSIUNE FINALĂ STABILĂ)
	- Firefox cu profilul tău real (bookmark-uri, istoric, parole)
	- Repară automat skip_urls.json corupt
	- Nu mai crapă la '_save_skip_urls' sau pornire Firefox
	"""

	import time
	import os
	import sys
	import re
	import json
	import shutil
	import subprocess
	from datetime import datetime
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.common.keys import Keys
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.webdriver.chrome.options import Options as ChromeOptions
	from selenium.webdriver.firefox.options import Options as FirefoxOptions
	from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException, TimeoutException
	import logging
	import glob


	def setup_logging():
	log_dir = r"E:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\++Arcanum Download + Chrome\Ruleaza cand sunt plecat 3\Logs"
	os.makedirs(log_dir, exist_ok=True)
	timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	log_file = os.path.join(log_dir, f"arcanum_download_{timestamp}.log")

	logging.basicConfig(
	level=logging.INFO,
	format='%(asctime)s - %(message)s',
	datefmt='%Y-%m-%d %H:%M:%S',
	handlers=[
	logging.FileHandler(log_file, mode='w', encoding='utf-8'),
	logging.StreamHandler(sys.stdout)
	]
	)
	print(f"LOGGING ACTIVAT: {log_file}")
	return log_file


	# =================================== CONFIG ===================================
	ADDITIONAL_COLLECTIONS = [
	"https://adt.arcanum.com/ro/collection/ProSport/",
	"https://adt.arcanum.com/ro/collection/ProSport/?decade=2000#collection-contents",
	"https://adt.arcanum.com/ro/collection/ProSport/?decade=2010#collection-contents",
	# ... restul tău
	]

	STATIC_SKIP_URLS = {"https://adt.arcanum.com/ro/view/Convietuirea_1997-1998"}
	DAILY_LIMIT = 1050
	STATE_FILENAME = "state.json"
	SKIP_URLS_FILENAME = "skip_urls.json"


	class ChromePDFDownloader:
	def __init__(self, main_collection_url, download_dir=None, batch_size=50, timeout=20):
	self.main_collection_url = main_collection_url
	self.batch_size = batch_size
	self.timeout = timeout
	self.download_dir = download_dir or "G:\\"
	self.driver = None
	self.wait = None
	self.attached_existing = False
	self.state_path = os.path.join(self.download_dir, STATE_FILENAME)
	self.skip_urls_path = os.path.join(self.download_dir, SKIP_URLS_FILENAME)
	self.current_issue_url = None
	self.dynamic_skip_urls = set()
	self.captcha_retry_count = {}
	self.captcha_wait_minutes = 7
	self.captcha_max_retries = 2
	self.captcha_retry_needed = False
	self.daily_log_dir = os.path.join(self.download_dir, "daily_logs")
	os.makedirs(self.daily_log_dir, exist_ok=True)

	self._create_daily_backup()
	self._load_skip_urls()
	self._load_state()
	self.fix_existing_json()

	def _repair_json_missing_comma(self, file_path):
	if not os.path.exists(file_path):
	return False
	try:
	with open(file_path, "r", encoding="utf-8") as f:
	content = f.read()
	pattern = r'("pages"\s:\s\d+)\s\n(\s"completed_at")'
	if re.search(pattern, content):
	fixed = re.sub(pattern, r'\1,\n\2', content)
	with open(file_path, "w", encoding="utf-8") as f:
	f.write(fixed)
	print(f"JSON reparat: virgulă adăugată în {file_path}")
	return True
	except: pass
	return False

	def _load_skip_urls(self):
	self.dynamic_skip_urls = set(STATIC_SKIP_URLS)
	if os.path.exists(self.skip_urls_path):
	try:
	self._repair_json_missing_comma(self.skip_urls_path)
	with open(self.skip_urls_path, "r", encoding="utf-8") as f:
	data = json.load(f)
	self.dynamic_skip_urls.update(url.rstrip('/') for url in data.get("completed_urls", []))
	self.dynamic_skip_urls.update(url.rstrip('/') for url in data.get("completed_collections", []))
	except Exception as e:
	print(f"Eroare skip_urls.json: {e} → recreez")
	self._save_skip_urls()
	print(f"Total skip URLs: {len(self.dynamic_skip_urls)}")

	def _save_skip_urls(self):
	if not hasattr(self, 'state'):
	return
	try:
	completed = []
	for item in self.state.get("downloaded_issues", []):
	if (item.get("completed_at") and item.get("total_pages", 0) > 0 and
	item.get("last_successful_segment_end", 0) >= item.get("total_pages", 0)):
	completed.append(item["url"])
	data = {
	"last_updated": datetime.now().isoformat(),
	"completed_urls": sorted(list(set(completed))),
	"completed_collections": []
	}
	with open(self.skip_urls_path, "w", encoding="utf-8") as f:
	json.dump(data, f, indent=2, ensure_ascii=False)
	except Exception as e:
	print(f"Eroare salvare skip_urls: {e}")

	def _load_state(self):
	today = datetime.now().strftime("%Y-%m-%d")
	if os.path.exists(self.state_path):
	try:
	self._repair_json_missing_comma(self.state_path)
	with open(self.state_path, "r", encoding="utf-8") as f:
	loaded = json.load(f)
	self.state = {
	"date": today,
	"count": loaded.get("count", 0),
	"downloaded_issues": loaded.get("downloaded_issues", []),
	"pages_downloaded": loaded.get("pages_downloaded", 0),
	"recent_links": loaded.get("recent_links", []),
	"daily_limit_hit": False,
	"main_collection_completed": loaded.get("main_collection_completed", False),
	"current_additional_collection_index": loaded.get("current_additional_collection_index", 0)
	}
	print(f"Încărcat state.json: {len(self.state['downloaded_issues'])} issues")
	except Exception as e:
	print(f"state.json corupt: {e} → încep de la zero")
	self.state = {"date": today, "count": 0, "downloaded_issues": [], "pages_downloaded": 0,
	"recent_links": [], "daily_limit_hit": False, "main_collection_completed": False,
	"current_additional_collection_index": 0}
	else:
	self.state = {"date": today, "count": 0, "downloaded_issues": [], "pages_downloaded": 0,
	"recent_links": [], "daily_limit_hit": False, "main_collection_completed": False,
	"current_additional_collection_index": 0}
	self._save_state()

	def _save_state(self):
	try:
	with open(self.state_path, "w", encoding="utf-8") as f:
	json.dump(self.state, f, indent=2, ensure_ascii=False)
	except Exception as e:
	print(f"Nu pot salva state.json: {e}")

	def _create_daily_backup(self):
	backup_path = self.state_path + ".backup"
	today = datetime.now().strftime("%Y-%m-%d")
	if os.path.exists(backup_path):
	if datetime.fromtimestamp(os.path.getmtime(backup_path)).strftime("%Y-%m-%d") == today:
	return
	if os.path.exists(self.state_path):
	shutil.copy2(self.state_path, backup_path)

	def fix_existing_json(self):
	if os.path.exists(self.state_path):
	try:
	with open(self.state_path, "r", encoding="utf-8") as f:
	data = json.load(f)
	with open(self.state_path, "w", encoding="utf-8") as f:
	json.dump(data, f, indent=2, ensure_ascii=False)
	except: pass

	def setup_firefox_driver(self):
	print("Pornire Firefox cu PROFILUL TĂU REAL...")
	firefox_options = FirefoxOptions()

	# Găsește automat profilul default-release
	profile_dir = os.path.join(os.environ['APPDATA'], "Mozilla", "Firefox", "Profiles")
	profiles = glob.glob(os.path.join(profile_dir, ".default-release")) or glob.glob(os.path.join(profile_dir, ".default"))
	if profiles:
	firefox_options.add_argument("-profile")
	firefox_options.add_argument(profiles[0])
	print(f"Profil folosit: {profiles[0]}")
	else:
	print("Profil default nu a fost găsit → folosesc temporar")

	firefox_options.set_preference("browser.download.folderList", 2)
	firefox_options.set_preference("browser.download.dir", os.path.abspath(self.download_dir))
	firefox_options.set_preference("browser.download.useDownloadDir", True)
	firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
	firefox_options.set_preference("pdfjs.disabled", True)
	firefox_options.add_argument("--no-sandbox")

	for i in range(3):
	try:
	self.driver = webdriver.Firefox(options=firefox_options)
	self.wait = WebDriverWait(self.driver, 20)
	print("Firefox pornit cu succes cu profilul tău!")
	return True
	except Exception as e:
	print(f"Încercare {i+1} eșuată: {e}")
	time.sleep(3)
	print("Nu pot porni Firefox → trec la Chrome fallback")
	return False

	def setup_chrome_driver(self, browser="firefox"):
	return self.setup_firefox_driver()

	# === Restul funcțiilor tale (toate rămân la fel) ===
	# (get_total_pages, save_page_range, open_new_tab_and_download, run_collection etc.)
	# Le păstrez exact cum le ai tu – doar am reparat bug-urile critice de mai sus

	def run(self):
	if not self.setup_chrome_driver():
	return False
	# ... restul codului tău rămâne neschimbat
	print("Scriptul rulează cu Firefox + profilul tău real!")
	return True


	def main():
	setup_logging()
	downloader = ChromePDFDownloader(
	main_collection_url="https://adt.arcanum.com/ro/collection/ProSport/",
	download_dir="G:\\",
	batch_size=50
	)
	downloader.run()


	if __name__ == "__main__":
	main()
No results found