Created January 16, 2025 07:41
Crawl a website for information: collect page text, extract FAQs, guess the site category, and generate a summary.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from collections import defaultdict
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


class WebsiteCrawler:
    def __init__(self, start_url, max_pages=50):
        self.start_url = start_url
        self.max_pages = max_pages
        self.visited_urls = set()
        self.domain = urlparse(start_url).netloc
        self.pages_content = []
        self.faq_data = []

        # Download required NLTK data
        nltk.download("punkt")
        nltk.download("punkt_tab")  # needed by newer NLTK releases for sent_tokenize
        nltk.download("stopwords")
        nltk.download("averaged_perceptron_tagger")

    def is_valid_url(self, url):
        """Check if URL belongs to the same domain and is a valid web page."""
        parsed = urlparse(url)
        return (
            parsed.netloc == self.domain
            and any(url.lower().endswith(ext) for ext in [".html", ".htm", "/"])
            and not any(
                pattern in url.lower() for pattern in [".pdf", ".jpg", ".png", "#"]
            )
        )

    def extract_text_from_html(self, html):
        """Extract clean text from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        # Remove script, style, and navigation elements
        for element in soup(["script", "style", "nav", "footer"]):
            element.decompose()
        return " ".join(soup.stripped_strings)

    def crawl(self):
        """Main crawling function."""
        queue = [self.start_url]

        while queue and len(self.visited_urls) < self.max_pages:
            url = queue.pop(0)
            if url in self.visited_urls:
                continue

            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    self.visited_urls.add(url)

                    # Store page content
                    content = self.extract_text_from_html(response.text)
                    self.pages_content.append({"url": url, "content": content})

                    # Extract FAQs if present
                    self.extract_faq(soup, url)

                    # Find new URLs to crawl
                    for link in soup.find_all("a", href=True):
                        new_url = urljoin(url, link["href"])
                        if (
                            self.is_valid_url(new_url)
                            and new_url not in self.visited_urls
                        ):
                            queue.append(new_url)
            except Exception as e:
                print(f"Error crawling {url}: {str(e)}")

    def extract_faq(self, soup, url):
        """Extract FAQ content from the page."""
        # Look for common FAQ patterns
        faq_indicators = ["faq", "frequently asked questions", "q&a", "questions"]

        # Check headers and section titles
        headers = soup.find_all(["h1", "h2", "h3", "h4"])
        for header in headers:
            header_text = header.text.lower()
            if any(indicator in header_text for indicator in faq_indicators):
                # Find questions and answers until the next section header
                questions = []
                current = header.find_next()
                while current and current.name not in ["h1", "h2", "h3", "h4"]:
                    if current.name in ["p", "div"]:
                        text = current.text.strip()
                        if text.endswith("?"):
                            questions.append({"question": text, "answer": ""})
                        elif questions and not questions[-1]["answer"]:
                            questions[-1]["answer"] = text
                    current = current.find_next()

                if questions:
                    self.faq_data.append({"url": url, "faqs": questions})

    def analyze_website_category(self):
        """Analyze website category based on content."""
        # Combine all text content
        all_content = " ".join(page["content"] for page in self.pages_content)

        # Create TF-IDF vectors
        vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
        tfidf_matrix = vectorizer.fit_transform([all_content])

        # Get most important terms, ranked by TF-IDF weight (with a single
        # document, idf_ is constant, so the matrix weights are what discriminate)
        feature_names = vectorizer.get_feature_names_out()
        scores = tfidf_matrix.toarray()[0]
        important_terms = sorted(zip(scores, feature_names), reverse=True)[:20]

        # Predefined categories and their associated keywords
        categories = {
            "e-commerce": ["shop", "product", "cart", "price", "shipping"],
            "blog": ["post", "article", "author", "comment", "blog"],
            "corporate": ["company", "service", "business", "client", "solution"],
            "educational": ["course", "student", "learn", "training", "education"],
            "news": ["news", "article", "update", "latest", "report"],
        }

        # Score each category based on keyword presence
        category_scores = defaultdict(int)
        important_words = [term[1] for term in important_terms]

        for category, keywords in categories.items():
            for keyword in keywords:
                if keyword in important_words:
                    category_scores[category] += 1

        return {
            "detected_category": (
                max(category_scores.items(), key=lambda x: x[1])[0]
                if category_scores
                else "unknown"
            ),
            "category_scores": dict(category_scores),
            "important_terms": [term[1] for term in important_terms[:10]],
        }

    def generate_summary(self):
        """Generate useful information about the website."""
        # Combine all text content
        all_content = " ".join(page["content"] for page in self.pages_content)
        sentences = sent_tokenize(all_content)

        # Use TF-IDF to find important sentences
        vectorizer = TfidfVectorizer(stop_words="english")
        tfidf_matrix = vectorizer.fit_transform(sentences)

        # Cluster sentences
        n_clusters = min(5, len(sentences))
        kmeans = KMeans(n_clusters=n_clusters)
        kmeans.fit(tfidf_matrix)

        # Take one representative sentence from each cluster
        central_sentences = []
        for i in range(n_clusters):
            cluster_sentences = [
                s for j, s in enumerate(sentences) if kmeans.labels_[j] == i
            ]
            if cluster_sentences:
                central_sentences.append(cluster_sentences[0])

        return {
            "total_pages": len(self.visited_urls),
            "summary_points": central_sentences,
            "urls_crawled": list(self.visited_urls),
        }
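

# Example usage: a minimal sketch of how the class above might be driven.
# "https://example.com" is a placeholder start URL and max_pages is kept small
# purely for illustration; adjust both for a real crawl.
if __name__ == "__main__":
    crawler = WebsiteCrawler("https://example.com", max_pages=10)
    crawler.crawl()

    print("Category analysis:", crawler.analyze_website_category())
    print("Summary:", crawler.generate_summary())
    for entry in crawler.faq_data:
        print(f"{entry['url']}: {len(entry['faqs'])} FAQ entries")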