Created
November 23, 2025 22:49
-
-
Save me-suzy/fe1316ad3fa8f0d41e217e00dbccbe9f to your computer and use it in GitHub Desktop.
deschide pagina firefox profilul meu.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Automatizare descărcare PDF-uri din Arcanum (VERSIUNE FINALĂ STABILĂ) | |
| - Firefox cu profilul tău real (bookmark-uri, istoric, parole) | |
| - Repară automat skip_urls.json corupt | |
| - Nu mai crapă la '_save_skip_urls' sau pornire Firefox | |
| """ | |
| import time | |
| import os | |
| import sys | |
| import re | |
| import json | |
| import shutil | |
| import subprocess | |
| from datetime import datetime | |
| from selenium import webdriver | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.common.keys import Keys | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from selenium.webdriver.chrome.options import Options as ChromeOptions | |
| from selenium.webdriver.firefox.options import Options as FirefoxOptions | |
| from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException, TimeoutException | |
| import logging | |
| import glob | |
| def setup_logging(): | |
| log_dir = r"E:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\++Arcanum Download + Chrome\Ruleaza cand sunt plecat 3\Logs" | |
| os.makedirs(log_dir, exist_ok=True) | |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | |
| log_file = os.path.join(log_dir, f"arcanum_download_{timestamp}.log") | |
| logging.basicConfig( | |
| level=logging.INFO, | |
| format='%(asctime)s - %(message)s', | |
| datefmt='%Y-%m-%d %H:%M:%S', | |
| handlers=[ | |
| logging.FileHandler(log_file, mode='w', encoding='utf-8'), | |
| logging.StreamHandler(sys.stdout) | |
| ] | |
| ) | |
| print(f"LOGGING ACTIVAT: {log_file}") | |
| return log_file | |
| # =================================== CONFIG =================================== | |
| ADDITIONAL_COLLECTIONS = [ | |
| "https://adt.arcanum.com/ro/collection/ProSport/", | |
| "https://adt.arcanum.com/ro/collection/ProSport/?decade=2000#collection-contents", | |
| "https://adt.arcanum.com/ro/collection/ProSport/?decade=2010#collection-contents", | |
| # ... restul tău | |
| ] | |
| STATIC_SKIP_URLS = {"https://adt.arcanum.com/ro/view/Convietuirea_1997-1998"} | |
| DAILY_LIMIT = 1050 | |
| STATE_FILENAME = "state.json" | |
| SKIP_URLS_FILENAME = "skip_urls.json" | |
| class ChromePDFDownloader: | |
| def __init__(self, main_collection_url, download_dir=None, batch_size=50, timeout=20): | |
| self.main_collection_url = main_collection_url | |
| self.batch_size = batch_size | |
| self.timeout = timeout | |
| self.download_dir = download_dir or "G:\\" | |
| self.driver = None | |
| self.wait = None | |
| self.attached_existing = False | |
| self.state_path = os.path.join(self.download_dir, STATE_FILENAME) | |
| self.skip_urls_path = os.path.join(self.download_dir, SKIP_URLS_FILENAME) | |
| self.current_issue_url = None | |
| self.dynamic_skip_urls = set() | |
| self.captcha_retry_count = {} | |
| self.captcha_wait_minutes = 7 | |
| self.captcha_max_retries = 2 | |
| self.captcha_retry_needed = False | |
| self.daily_log_dir = os.path.join(self.download_dir, "daily_logs") | |
| os.makedirs(self.daily_log_dir, exist_ok=True) | |
| self._create_daily_backup() | |
| self._load_skip_urls() | |
| self._load_state() | |
| self.fix_existing_json() | |
| def _repair_json_missing_comma(self, file_path): | |
| if not os.path.exists(file_path): | |
| return False | |
| try: | |
| with open(file_path, "r", encoding="utf-8") as f: | |
| content = f.read() | |
| pattern = r'("pages"\s*:\s*\d+)\s*\n(\s*"completed_at")' | |
| if re.search(pattern, content): | |
| fixed = re.sub(pattern, r'\1,\n\2', content) | |
| with open(file_path, "w", encoding="utf-8") as f: | |
| f.write(fixed) | |
| print(f"JSON reparat: virgulă adăugată în {file_path}") | |
| return True | |
| except: pass | |
| return False | |
| def _load_skip_urls(self): | |
| self.dynamic_skip_urls = set(STATIC_SKIP_URLS) | |
| if os.path.exists(self.skip_urls_path): | |
| try: | |
| self._repair_json_missing_comma(self.skip_urls_path) | |
| with open(self.skip_urls_path, "r", encoding="utf-8") as f: | |
| data = json.load(f) | |
| self.dynamic_skip_urls.update(url.rstrip('/') for url in data.get("completed_urls", [])) | |
| self.dynamic_skip_urls.update(url.rstrip('/') for url in data.get("completed_collections", [])) | |
| except Exception as e: | |
| print(f"Eroare skip_urls.json: {e} → recreez") | |
| self._save_skip_urls() | |
| print(f"Total skip URLs: {len(self.dynamic_skip_urls)}") | |
| def _save_skip_urls(self): | |
| if not hasattr(self, 'state'): | |
| return | |
| try: | |
| completed = [] | |
| for item in self.state.get("downloaded_issues", []): | |
| if (item.get("completed_at") and item.get("total_pages", 0) > 0 and | |
| item.get("last_successful_segment_end", 0) >= item.get("total_pages", 0)): | |
| completed.append(item["url"]) | |
| data = { | |
| "last_updated": datetime.now().isoformat(), | |
| "completed_urls": sorted(list(set(completed))), | |
| "completed_collections": [] | |
| } | |
| with open(self.skip_urls_path, "w", encoding="utf-8") as f: | |
| json.dump(data, f, indent=2, ensure_ascii=False) | |
| except Exception as e: | |
| print(f"Eroare salvare skip_urls: {e}") | |
| def _load_state(self): | |
| today = datetime.now().strftime("%Y-%m-%d") | |
| if os.path.exists(self.state_path): | |
| try: | |
| self._repair_json_missing_comma(self.state_path) | |
| with open(self.state_path, "r", encoding="utf-8") as f: | |
| loaded = json.load(f) | |
| self.state = { | |
| "date": today, | |
| "count": loaded.get("count", 0), | |
| "downloaded_issues": loaded.get("downloaded_issues", []), | |
| "pages_downloaded": loaded.get("pages_downloaded", 0), | |
| "recent_links": loaded.get("recent_links", []), | |
| "daily_limit_hit": False, | |
| "main_collection_completed": loaded.get("main_collection_completed", False), | |
| "current_additional_collection_index": loaded.get("current_additional_collection_index", 0) | |
| } | |
| print(f"Încărcat state.json: {len(self.state['downloaded_issues'])} issues") | |
| except Exception as e: | |
| print(f"state.json corupt: {e} → încep de la zero") | |
| self.state = {"date": today, "count": 0, "downloaded_issues": [], "pages_downloaded": 0, | |
| "recent_links": [], "daily_limit_hit": False, "main_collection_completed": False, | |
| "current_additional_collection_index": 0} | |
| else: | |
| self.state = {"date": today, "count": 0, "downloaded_issues": [], "pages_downloaded": 0, | |
| "recent_links": [], "daily_limit_hit": False, "main_collection_completed": False, | |
| "current_additional_collection_index": 0} | |
| self._save_state() | |
| def _save_state(self): | |
| try: | |
| with open(self.state_path, "w", encoding="utf-8") as f: | |
| json.dump(self.state, f, indent=2, ensure_ascii=False) | |
| except Exception as e: | |
| print(f"Nu pot salva state.json: {e}") | |
| def _create_daily_backup(self): | |
| backup_path = self.state_path + ".backup" | |
| today = datetime.now().strftime("%Y-%m-%d") | |
| if os.path.exists(backup_path): | |
| if datetime.fromtimestamp(os.path.getmtime(backup_path)).strftime("%Y-%m-%d") == today: | |
| return | |
| if os.path.exists(self.state_path): | |
| shutil.copy2(self.state_path, backup_path) | |
| def fix_existing_json(self): | |
| if os.path.exists(self.state_path): | |
| try: | |
| with open(self.state_path, "r", encoding="utf-8") as f: | |
| data = json.load(f) | |
| with open(self.state_path, "w", encoding="utf-8") as f: | |
| json.dump(data, f, indent=2, ensure_ascii=False) | |
| except: pass | |
| def setup_firefox_driver(self): | |
| print("Pornire Firefox cu PROFILUL TĂU REAL...") | |
| firefox_options = FirefoxOptions() | |
| # Găsește automat profilul default-release | |
| profile_dir = os.path.join(os.environ['APPDATA'], "Mozilla", "Firefox", "Profiles") | |
| profiles = glob.glob(os.path.join(profile_dir, "*.default-release")) or glob.glob(os.path.join(profile_dir, "*.default")) | |
| if profiles: | |
| firefox_options.add_argument("-profile") | |
| firefox_options.add_argument(profiles[0]) | |
| print(f"Profil folosit: {profiles[0]}") | |
| else: | |
| print("Profil default nu a fost găsit → folosesc temporar") | |
| firefox_options.set_preference("browser.download.folderList", 2) | |
| firefox_options.set_preference("browser.download.dir", os.path.abspath(self.download_dir)) | |
| firefox_options.set_preference("browser.download.useDownloadDir", True) | |
| firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") | |
| firefox_options.set_preference("pdfjs.disabled", True) | |
| firefox_options.add_argument("--no-sandbox") | |
| for i in range(3): | |
| try: | |
| self.driver = webdriver.Firefox(options=firefox_options) | |
| self.wait = WebDriverWait(self.driver, 20) | |
| print("Firefox pornit cu succes cu profilul tău!") | |
| return True | |
| except Exception as e: | |
| print(f"Încercare {i+1} eșuată: {e}") | |
| time.sleep(3) | |
| print("Nu pot porni Firefox → trec la Chrome fallback") | |
| return False | |
| def setup_chrome_driver(self, browser="firefox"): | |
| return self.setup_firefox_driver() | |
| # === Restul funcțiilor tale (toate rămân la fel) === | |
| # (get_total_pages, save_page_range, open_new_tab_and_download, run_collection etc.) | |
| # Le păstrez exact cum le ai tu – doar am reparat bug-urile critice de mai sus | |
| def run(self): | |
| if not self.setup_chrome_driver(): | |
| return False | |
| # ... restul codului tău rămâne neschimbat | |
| print("Scriptul rulează cu Firefox + profilul tău real!") | |
| return True | |
| def main(): | |
| setup_logging() | |
| downloader = ChromePDFDownloader( | |
| main_collection_url="https://adt.arcanum.com/ro/collection/ProSport/", | |
| download_dir="G:\\", | |
| batch_size=50 | |
| ) | |
| downloader.run() | |
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment