@uncledevhq
Created January 16, 2025 07:41
Crawl websites for information: fetch pages within one domain, extract FAQ sections, guess the site's category from keywords, and generate a brief content summary.
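Requires Python 3 with requests, beautifulsoup4, nltk, and scikit-learn installed (for example: pip install requests beautifulsoup4 nltk scikit-learn); the needed NLTK data is downloaded on first use.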
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from collections import defaultdict
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

class WebsiteCrawler:
    def __init__(self, start_url, max_pages=50):
        self.start_url = start_url
        self.max_pages = max_pages
        self.visited_urls = set()
        self.domain = urlparse(start_url).netloc
        self.pages_content = []
        self.faq_data = []
        # Download required NLTK data
        nltk.download("punkt")
        nltk.download("stopwords")
        nltk.download("averaged_perceptron_tagger")

    def is_valid_url(self, url):
        """Check if URL belongs to the same domain and is a valid web page."""
        parsed = urlparse(url)
        return (
            parsed.netloc == self.domain
            and any(url.lower().endswith(ext) for ext in [".html", ".htm", "/"])
            and not any(
                pattern in url.lower() for pattern in [".pdf", ".jpg", ".png", "#"]
            )
        )

    def extract_text_from_html(self, html):
        """Extract clean text from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        # Remove script, style, navigation, and footer elements
        for element in soup(["script", "style", "nav", "footer"]):
            element.decompose()
        return " ".join(soup.stripped_strings)

    def crawl(self):
        """Main crawling function."""
        queue = [self.start_url]
        while queue and len(self.visited_urls) < self.max_pages:
            url = queue.pop(0)
            if url in self.visited_urls:
                continue
            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    self.visited_urls.add(url)
                    # Store page content
                    content = self.extract_text_from_html(response.text)
                    self.pages_content.append({"url": url, "content": content})
                    # Extract FAQs if present
                    self.extract_faq(soup, url)
                    # Find new URLs to crawl
                    for link in soup.find_all("a", href=True):
                        new_url = urljoin(url, link["href"])
                        if (
                            self.is_valid_url(new_url)
                            and new_url not in self.visited_urls
                        ):
                            queue.append(new_url)
            except Exception as e:
                print(f"Error crawling {url}: {str(e)}")

    def extract_faq(self, soup, url):
        """Extract FAQ content from the page."""
        # Look for common FAQ patterns
        faq_indicators = ["faq", "frequently asked questions", "q&a", "questions"]
        # Check headers and section titles
        headers = soup.find_all(["h1", "h2", "h3", "h4"])
        for header in headers:
            header_text = header.text.lower()
            if any(indicator in header_text for indicator in faq_indicators):
                # Find questions and answers
                questions = []
                current = header.find_next()
                while current and current.name not in ["h1", "h2", "h3", "h4"]:
                    if current.name in ["p", "div"]:
                        text = current.text.strip()
                        if text.endswith("?"):
                            questions.append({"question": text, "answer": ""})
                        elif questions and not questions[-1]["answer"]:
                            questions[-1]["answer"] = text
                    current = current.find_next()
                if questions:
                    self.faq_data.append({"url": url, "faqs": questions})

    def analyze_website_category(self):
        """Analyze website category based on content."""
        # Combine all text content
        all_content = " ".join(page["content"] for page in self.pages_content)
        # Create TF-IDF vectors
        vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
        tfidf_matrix = vectorizer.fit_transform([all_content])
        # Get most important terms, ranked by their TF-IDF weight in the combined
        # document (with a single document, idf alone is constant across terms)
        feature_names = vectorizer.get_feature_names_out()
        scores = tfidf_matrix.toarray()[0]
        important_terms = sorted(zip(scores, feature_names), reverse=True)[:20]
        # Predefined categories and their associated keywords
        categories = {
            "e-commerce": ["shop", "product", "cart", "price", "shipping"],
            "blog": ["post", "article", "author", "comment", "blog"],
            "corporate": ["company", "service", "business", "client", "solution"],
            "educational": ["course", "student", "learn", "training", "education"],
            "news": ["news", "article", "update", "latest", "report"],
        }
        # Score each category based on keyword presence
        category_scores = defaultdict(int)
        important_words = [term[1] for term in important_terms]
        for category, keywords in categories.items():
            for keyword in keywords:
                if keyword in important_words:
                    category_scores[category] += 1
        # Guard against the case where no category keyword appears at all
        detected_category = (
            max(category_scores.items(), key=lambda x: x[1])[0]
            if category_scores
            else "unknown"
        )
        return {
            "detected_category": detected_category,
            "category_scores": dict(category_scores),
            "important_terms": [term[1] for term in important_terms[:10]],
        }

    def generate_summary(self):
        """Generate useful information about the website."""
        # Combine all text content
        all_content = " ".join(page["content"] for page in self.pages_content)
        sentences = sent_tokenize(all_content)
        if not sentences:
            return {
                "total_pages": len(self.visited_urls),
                "summary_points": [],
                "urls_crawled": list(self.visited_urls),
            }
        # Use TF-IDF to find important sentences
        vectorizer = TfidfVectorizer(stop_words="english")
        tfidf_matrix = vectorizer.fit_transform(sentences)
        # Cluster sentences
        n_clusters = min(5, len(sentences))
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        kmeans.fit(tfidf_matrix)
        # Take one representative sentence from each cluster
        central_sentences = []
        for i in range(n_clusters):
            cluster_sentences = [
                s for j, s in enumerate(sentences) if kmeans.labels_[j] == i
            ]
            if cluster_sentences:
                central_sentences.append(cluster_sentences[0])
        return {
            "total_pages": len(self.visited_urls),
            "summary_points": central_sentences,
            "urls_crawled": list(self.visited_urls),
        }
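

if __name__ == "__main__":
    # Example usage: a minimal sketch showing how the class is meant to be driven.
    # "https://example.com" is a placeholder start URL; replace it with a site you
    # are allowed to crawl and keep max_pages modest to stay polite.
    crawler = WebsiteCrawler("https://example.com", max_pages=20)
    crawler.crawl()

    category_info = crawler.analyze_website_category()
    print("Detected category:", category_info["detected_category"])
    print("Top terms:", category_info["important_terms"])

    summary = crawler.generate_summary()
    print("Pages crawled:", summary["total_pages"])
    for point in summary["summary_points"]:
        print("-", point)

    for entry in crawler.faq_data:
        print(f"Found {len(entry['faqs'])} FAQ entries on {entry['url']}")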