@uncledevhq
Created January 16, 2025 07:41
Crawl websites for information: fetch pages within one domain, extract FAQ sections, guess the site's category from keywords, and generate a brief content summary.
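Requires Python 3 with requests, beautifulsoup4, nltk, and scikit-learn installed (for example: pip install requests beautifulsoup4 nltk scikit-learn); the needed NLTK data is downloaded on first use.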
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from collections import defaultdict
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

class WebsiteCrawler:
    def __init__(self, start_url, max_pages=50):
        self.start_url = start_url
        self.max_pages = max_pages
        self.visited_urls = set()
        self.domain = urlparse(start_url).netloc
        self.pages_content = []
        self.faq_data = []
        # Download required NLTK data
        nltk.download("punkt")
        nltk.download("stopwords")
        nltk.download("averaged_perceptron_tagger")

    def is_valid_url(self, url):
        """Check if URL belongs to the same domain and is a valid web page."""
        parsed = urlparse(url)
        return (
            parsed.netloc == self.domain
            and any(url.lower().endswith(ext) for ext in [".html", ".htm", "/"])
            and not any(
                pattern in url.lower() for pattern in [".pdf", ".jpg", ".png", "#"]
            )
        )

    def extract_text_from_html(self, html):
        """Extract clean text from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        # Remove script, style, navigation, and footer elements
        for element in soup(["script", "style", "nav", "footer"]):
            element.decompose()
        return " ".join(soup.stripped_strings)

    def crawl(self):
        """Main crawling function."""
        queue = [self.start_url]
        while queue and len(self.visited_urls) < self.max_pages:
            url = queue.pop(0)
            if url in self.visited_urls:
                continue
            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    self.visited_urls.add(url)
                    # Store page content
                    content = self.extract_text_from_html(response.text)
                    self.pages_content.append({"url": url, "content": content})
                    # Extract FAQs if present
                    self.extract_faq(soup, url)
                    # Find new URLs to crawl
                    for link in soup.find_all("a", href=True):
                        new_url = urljoin(url, link["href"])
                        if (
                            self.is_valid_url(new_url)
                            and new_url not in self.visited_urls
                        ):
                            queue.append(new_url)
            except Exception as e:
                print(f"Error crawling {url}: {str(e)}")

    def extract_faq(self, soup, url):
        """Extract FAQ content from the page."""
        # Look for common FAQ patterns
        faq_indicators = ["faq", "frequently asked questions", "q&a", "questions"]
        # Check headers and section titles
        headers = soup.find_all(["h1", "h2", "h3", "h4"])
        for header in headers:
            header_text = header.text.lower()
            if any(indicator in header_text for indicator in faq_indicators):
                # Find questions and answers
                questions = []
                current = header.find_next()
                while current and current.name not in ["h1", "h2", "h3", "h4"]:
                    if current.name in ["p", "div"]:
                        text = current.text.strip()
                        if text.endswith("?"):
                            questions.append({"question": text, "answer": ""})
                        elif questions and not questions[-1]["answer"]:
                            questions[-1]["answer"] = text
                    current = current.find_next()
                if questions:
                    self.faq_data.append({"url": url, "faqs": questions})

    def analyze_website_category(self):
        """Analyze website category based on content."""
        # Combine all text content
        all_content = " ".join(page["content"] for page in self.pages_content)
        # Create TF-IDF vectors
        vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
        tfidf_matrix = vectorizer.fit_transform([all_content])
        # Get most important terms, ranked by their TF-IDF weight in the combined
        # document (with a single document, idf alone is constant across terms)
        feature_names = vectorizer.get_feature_names_out()
        scores = tfidf_matrix.toarray()[0]
        important_terms = sorted(zip(scores, feature_names), reverse=True)[:20]
        # Predefined categories and their associated keywords
        categories = {
            "e-commerce": ["shop", "product", "cart", "price", "shipping"],
            "blog": ["post", "article", "author", "comment", "blog"],
            "corporate": ["company", "service", "business", "client", "solution"],
            "educational": ["course", "student", "learn", "training", "education"],
            "news": ["news", "article", "update", "latest", "report"],
        }
        # Score each category based on keyword presence
        category_scores = defaultdict(int)
        important_words = [term[1] for term in important_terms]
        for category, keywords in categories.items():
            for keyword in keywords:
                if keyword in important_words:
                    category_scores[category] += 1
        # Guard against the case where no category keyword appears at all
        detected_category = (
            max(category_scores.items(), key=lambda x: x[1])[0]
            if category_scores
            else "unknown"
        )
        return {
            "detected_category": detected_category,
            "category_scores": dict(category_scores),
            "important_terms": [term[1] for term in important_terms[:10]],
        }

    def generate_summary(self):
        """Generate useful information about the website."""
        # Combine all text content
        all_content = " ".join(page["content"] for page in self.pages_content)
        sentences = sent_tokenize(all_content)
        if not sentences:
            return {
                "total_pages": len(self.visited_urls),
                "summary_points": [],
                "urls_crawled": list(self.visited_urls),
            }
        # Use TF-IDF to find important sentences
        vectorizer = TfidfVectorizer(stop_words="english")
        tfidf_matrix = vectorizer.fit_transform(sentences)
        # Cluster sentences
        n_clusters = min(5, len(sentences))
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        kmeans.fit(tfidf_matrix)
        # Take one representative sentence from each cluster
        central_sentences = []
        for i in range(n_clusters):
            cluster_sentences = [
                s for j, s in enumerate(sentences) if kmeans.labels_[j] == i
            ]
            if cluster_sentences:
                central_sentences.append(cluster_sentences[0])
        return {
            "total_pages": len(self.visited_urls),
            "summary_points": central_sentences,
            "urls_crawled": list(self.visited_urls),
        }
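

if __name__ == "__main__":
    # Example usage: a minimal sketch showing how the class is meant to be driven.
    # "https://example.com" is a placeholder start URL; replace it with a site you
    # are allowed to crawl and keep max_pages modest to stay polite.
    crawler = WebsiteCrawler("https://example.com", max_pages=20)
    crawler.crawl()

    category_info = crawler.analyze_website_category()
    print("Detected category:", category_info["detected_category"])
    print("Top terms:", category_info["important_terms"])

    summary = crawler.generate_summary()
    print("Pages crawled:", summary["total_pages"])
    for point in summary["summary_points"]:
        print("-", point)

    for entry in crawler.faq_data:
        print(f"Found {len(entry['faqs'])} FAQ entries on {entry['url']}")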