Last active
July 4, 2024 17:46
-
-
Save salvatorecapolupo/f5629d5212e727829e1b6bdf91ea5ef6 to your computer and use it in GitHub Desktop.
Questo codice Python estrae automaticamente title e meta description da una qualsiasi sitemap xml, e li salva in un file Excel. Capito, possiamo modificare lo script in modo che esplori ricorsivamente tutte le sitemaps XML e, quando trova URL HTML, estragga il titolo e la meta description. Prego :-) Spiegone: https://trovalost.it/estrae-title-me…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://trovalost.it/sitemap_index.xml | |
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
def extract_title_meta(url, timeout=30):
    """Download a page and return its title and meta description.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single stalled host cannot hang the whole crawl.

    Returns:
        A ``(title, meta_description)`` tuple. ``title`` is ``'No title'``
        when the page has no ``<title>`` or the element is empty;
        ``meta_description`` is ``''`` when no ``<meta name="description">``
        with a ``content`` attribute is found. If anything fails, the tuple
        ``('Error', <exception message>)`` is returned instead.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        title = 'No title'
        if soup.title:
            # .string is None for an empty <title> or one with nested
            # markup; get_text() always yields a string.
            title = soup.title.get_text(strip=True) or 'No title'

        meta_description = ''
        for meta in soup.find_all('meta'):
            # Attribute names are matched case-insensitively on the value
            # ("Description", "DESCRIPTION", ...).
            if (meta.get('name') or '').lower() == 'description':
                # A description tag without "content" yields an empty
                # description rather than an error row.
                meta_description = meta.get('content', '')
                break
        return title, meta_description
    except Exception as e:
        # Deliberately broad: one unreachable or malformed page must not
        # abort the whole run; the failure is reported in the result row.
        return 'Error', str(e)
def extract_urls_from_sitemap(sitemap_url, timeout=30, _visited=None):
    """Collect page URLs from a sitemap, following nested sitemap indexes.

    Args:
        sitemap_url: Address of a sitemap or sitemap index XML document.
        timeout: Seconds to wait for each HTTP response.
        _visited: Internal set of sitemap addresses already fetched during
            this crawl; callers should leave it unset.

    Returns:
        A list with the ``<loc>`` value of every ``<url>`` entry found in
        this sitemap and in every sitemap it references (nested ones first).

    Raises:
        requests.RequestException: If a sitemap cannot be downloaded.
    """
    if _visited is None:
        _visited = set()
    # A sitemap index that references itself (directly or through a cycle)
    # would otherwise recurse forever.
    if sitemap_url in _visited:
        return []
    _visited.add(sitemap_url)

    response = requests.get(sitemap_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'xml')

    def _locations(tag_name):
        """Return the non-empty, stripped <loc> text of each `tag_name` entry."""
        found = []
        for entry in soup.find_all(tag_name):
            loc_tag = entry.find('loc')
            # Entries without <loc> are skipped instead of raising.
            text = loc_tag.get_text(strip=True) if loc_tag is not None else ''
            if text:
                found.append(text)
        return found

    child_sitemaps = _locations('sitemap')
    page_urls = _locations('url')

    total_urls = len(child_sitemaps) + len(page_urls)
    processed_urls = 0
    urls = []

    for loc in child_sitemaps:
        urls.extend(
            extract_urls_from_sitemap(loc, timeout=timeout, _visited=_visited)
        )
        processed_urls += 1
        print_progress(processed_urls, total_urls, loc)

    for loc in page_urls:
        urls.append(loc)
        processed_urls += 1
        print_progress(processed_urls, total_urls, loc)

    return urls
def print_progress(processed, total, url):
    """Print how far processing has advanced, as a percentage.

    Args:
        processed: Number of items handled so far.
        total: Total number of items to handle.
        url: The item that was just handled, shown next to the percentage.
    """
    # An empty workload is reported as complete instead of dividing by zero.
    progress = processed / total * 100 if total else 100.0
    print(f'Progresso: {progress:.2f}% - URL: {url}')
# Root sitemap (or sitemap index) where the crawl starts.
sitemap_url = 'https://trovalost.it/sitemap_index.xml'

# Gather every page URL reachable from the root sitemap.
urls = extract_urls_from_sitemap(sitemap_url)

# Visit each page and record one row (URL, title, description) per page.
data = []
for idx, url in enumerate(urls):
    title, meta_description = extract_title_meta(url)
    data += [{'URL': url, 'Title': title, 'Meta Description': meta_description}]
    print(f'Estrazione dati URL {idx+1}/{len(urls)} - URL: {url}')

# Turn the collected rows into a table.
df = pd.DataFrame(data)

# Write the table to an Excel workbook, omitting the row index column.
df.to_excel(excel_writer='sitemap_data.xlsx', index=False)
print("Dati estratti e salvati in 'sitemap_data.xlsx'")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment