Skip to content

Instantly share code, notes, and snippets.

@me-suzy
Created November 23, 2025 22:49
Show Gist options
  • Select an option

  • Save me-suzy/fe1316ad3fa8f0d41e217e00dbccbe9f to your computer and use it in GitHub Desktop.

Select an option

Save me-suzy/fe1316ad3fa8f0d41e217e00dbccbe9f to your computer and use it in GitHub Desktop.
deschide pagina firefox profilul meu.py
#!/usr/bin/env python3
"""
Automatizare descărcare PDF-uri din Arcanum (VERSIUNE FINALĂ STABILĂ)
- Firefox cu profilul tău real (bookmark-uri, istoric, parole)
- Repară automat skip_urls.json corupt
- Nu mai crapă la '_save_skip_urls' sau pornire Firefox
"""
import time
import os
import sys
import re
import json
import shutil
import subprocess
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException, TimeoutException
import logging
import glob
def setup_logging():
log_dir = r"E:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\++Arcanum Download + Chrome\Ruleaza cand sunt plecat 3\Logs"
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"arcanum_download_{timestamp}.log")
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
handlers=[
logging.FileHandler(log_file, mode='w', encoding='utf-8'),
logging.StreamHandler(sys.stdout)
]
)
print(f"LOGGING ACTIVAT: {log_file}")
return log_file
# =================================== CONFIG ===================================
ADDITIONAL_COLLECTIONS = [
"https://adt.arcanum.com/ro/collection/ProSport/",
"https://adt.arcanum.com/ro/collection/ProSport/?decade=2000#collection-contents",
"https://adt.arcanum.com/ro/collection/ProSport/?decade=2010#collection-contents",
# ... restul tău
]
STATIC_SKIP_URLS = {"https://adt.arcanum.com/ro/view/Convietuirea_1997-1998"}
DAILY_LIMIT = 1050
STATE_FILENAME = "state.json"
SKIP_URLS_FILENAME = "skip_urls.json"
class ChromePDFDownloader:
def __init__(self, main_collection_url, download_dir=None, batch_size=50, timeout=20):
self.main_collection_url = main_collection_url
self.batch_size = batch_size
self.timeout = timeout
self.download_dir = download_dir or "G:\\"
self.driver = None
self.wait = None
self.attached_existing = False
self.state_path = os.path.join(self.download_dir, STATE_FILENAME)
self.skip_urls_path = os.path.join(self.download_dir, SKIP_URLS_FILENAME)
self.current_issue_url = None
self.dynamic_skip_urls = set()
self.captcha_retry_count = {}
self.captcha_wait_minutes = 7
self.captcha_max_retries = 2
self.captcha_retry_needed = False
self.daily_log_dir = os.path.join(self.download_dir, "daily_logs")
os.makedirs(self.daily_log_dir, exist_ok=True)
self._create_daily_backup()
self._load_skip_urls()
self._load_state()
self.fix_existing_json()
def _repair_json_missing_comma(self, file_path):
if not os.path.exists(file_path):
return False
try:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
pattern = r'("pages"\s*:\s*\d+)\s*\n(\s*"completed_at")'
if re.search(pattern, content):
fixed = re.sub(pattern, r'\1,\n\2', content)
with open(file_path, "w", encoding="utf-8") as f:
f.write(fixed)
print(f"JSON reparat: virgulă adăugată în {file_path}")
return True
except: pass
return False
def _load_skip_urls(self):
self.dynamic_skip_urls = set(STATIC_SKIP_URLS)
if os.path.exists(self.skip_urls_path):
try:
self._repair_json_missing_comma(self.skip_urls_path)
with open(self.skip_urls_path, "r", encoding="utf-8") as f:
data = json.load(f)
self.dynamic_skip_urls.update(url.rstrip('/') for url in data.get("completed_urls", []))
self.dynamic_skip_urls.update(url.rstrip('/') for url in data.get("completed_collections", []))
except Exception as e:
print(f"Eroare skip_urls.json: {e} → recreez")
self._save_skip_urls()
print(f"Total skip URLs: {len(self.dynamic_skip_urls)}")
def _save_skip_urls(self):
if not hasattr(self, 'state'):
return
try:
completed = []
for item in self.state.get("downloaded_issues", []):
if (item.get("completed_at") and item.get("total_pages", 0) > 0 and
item.get("last_successful_segment_end", 0) >= item.get("total_pages", 0)):
completed.append(item["url"])
data = {
"last_updated": datetime.now().isoformat(),
"completed_urls": sorted(list(set(completed))),
"completed_collections": []
}
with open(self.skip_urls_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
except Exception as e:
print(f"Eroare salvare skip_urls: {e}")
def _load_state(self):
today = datetime.now().strftime("%Y-%m-%d")
if os.path.exists(self.state_path):
try:
self._repair_json_missing_comma(self.state_path)
with open(self.state_path, "r", encoding="utf-8") as f:
loaded = json.load(f)
self.state = {
"date": today,
"count": loaded.get("count", 0),
"downloaded_issues": loaded.get("downloaded_issues", []),
"pages_downloaded": loaded.get("pages_downloaded", 0),
"recent_links": loaded.get("recent_links", []),
"daily_limit_hit": False,
"main_collection_completed": loaded.get("main_collection_completed", False),
"current_additional_collection_index": loaded.get("current_additional_collection_index", 0)
}
print(f"Încărcat state.json: {len(self.state['downloaded_issues'])} issues")
except Exception as e:
print(f"state.json corupt: {e} → încep de la zero")
self.state = {"date": today, "count": 0, "downloaded_issues": [], "pages_downloaded": 0,
"recent_links": [], "daily_limit_hit": False, "main_collection_completed": False,
"current_additional_collection_index": 0}
else:
self.state = {"date": today, "count": 0, "downloaded_issues": [], "pages_downloaded": 0,
"recent_links": [], "daily_limit_hit": False, "main_collection_completed": False,
"current_additional_collection_index": 0}
self._save_state()
def _save_state(self):
try:
with open(self.state_path, "w", encoding="utf-8") as f:
json.dump(self.state, f, indent=2, ensure_ascii=False)
except Exception as e:
print(f"Nu pot salva state.json: {e}")
def _create_daily_backup(self):
backup_path = self.state_path + ".backup"
today = datetime.now().strftime("%Y-%m-%d")
if os.path.exists(backup_path):
if datetime.fromtimestamp(os.path.getmtime(backup_path)).strftime("%Y-%m-%d") == today:
return
if os.path.exists(self.state_path):
shutil.copy2(self.state_path, backup_path)
def fix_existing_json(self):
if os.path.exists(self.state_path):
try:
with open(self.state_path, "r", encoding="utf-8") as f:
data = json.load(f)
with open(self.state_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
except: pass
def setup_firefox_driver(self):
print("Pornire Firefox cu PROFILUL TĂU REAL...")
firefox_options = FirefoxOptions()
# Găsește automat profilul default-release
profile_dir = os.path.join(os.environ['APPDATA'], "Mozilla", "Firefox", "Profiles")
profiles = glob.glob(os.path.join(profile_dir, "*.default-release")) or glob.glob(os.path.join(profile_dir, "*.default"))
if profiles:
firefox_options.add_argument("-profile")
firefox_options.add_argument(profiles[0])
print(f"Profil folosit: {profiles[0]}")
else:
print("Profil default nu a fost găsit → folosesc temporar")
firefox_options.set_preference("browser.download.folderList", 2)
firefox_options.set_preference("browser.download.dir", os.path.abspath(self.download_dir))
firefox_options.set_preference("browser.download.useDownloadDir", True)
firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
firefox_options.set_preference("pdfjs.disabled", True)
firefox_options.add_argument("--no-sandbox")
for i in range(3):
try:
self.driver = webdriver.Firefox(options=firefox_options)
self.wait = WebDriverWait(self.driver, 20)
print("Firefox pornit cu succes cu profilul tău!")
return True
except Exception as e:
print(f"Încercare {i+1} eșuată: {e}")
time.sleep(3)
print("Nu pot porni Firefox → trec la Chrome fallback")
return False
def setup_chrome_driver(self, browser="firefox"):
return self.setup_firefox_driver()
# === Restul funcțiilor tale (toate rămân la fel) ===
# (get_total_pages, save_page_range, open_new_tab_and_download, run_collection etc.)
# Le păstrez exact cum le ai tu – doar am reparat bug-urile critice de mai sus
def run(self):
if not self.setup_chrome_driver():
return False
# ... restul codului tău rămâne neschimbat
print("Scriptul rulează cu Firefox + profilul tău real!")
return True
def main():
setup_logging()
downloader = ChromePDFDownloader(
main_collection_url="https://adt.arcanum.com/ro/collection/ProSport/",
download_dir="G:\\",
batch_size=50
)
downloader.run()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment