@Tiberriver256
Created February 21, 2025 19:18
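
A small script that uses crawl4ai to crawl a starting page and each internal link it discovers, then writes all of the extracted markdown to a single file.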
# /// script
# dependencies = [
#     "crawl4ai",
# ]
# ///
import asyncio
import os

from crawl4ai import AsyncWebCrawler


async def main():
    # Collect the markdown from every crawled page
    all_markdown = []

    async with AsyncWebCrawler() as crawler:
        # Crawl the main page first
        result = await crawler.arun(
            url="<insert-your-url-here>",
        )

        # Add the main page's markdown with a title header
        all_markdown.append(
            f"# {result.metadata.get('title')}\nURL: {result.url}\n\n{result.markdown}\n"
        )
        print("Main page processed...")

        # Track processed URLs so each page is only crawled once
        processed_urls = {result.url}

        # Crawl every internal link found on the main page
        for link in result.links.get("internal", []):
            href = link.get("href")
            if href in processed_urls:
                continue
            processed_urls.add(href)
            try:
                sub_result = await crawler.arun(url=href)
                # Add the subpage's markdown with a title header
                all_markdown.append(
                    f"# {sub_result.metadata.get('title')}\nURL: {sub_result.url}\n\n{sub_result.markdown}\n"
                )
                print(f"Processed: {href}")
            except Exception as e:
                print(f"Error crawling {href}: {e}")

    # Create the output directory if it doesn't exist
    os.makedirs("markdown", exist_ok=True)

    # Save all markdown to a single file, separated by horizontal rules
    with open("markdown/content.md", "w", encoding="utf-8") as f:
        f.write("\n---\n\n".join(all_markdown))

    print("\nMarkdown content has been saved to markdown/content.md")


if __name__ == "__main__":
    asyncio.run(main())
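
The # /// script header is PEP 723 inline script metadata, so a tool that understands it can resolve the crawl4ai dependency and run the file directly, e.g. uv run crawl.py (with whatever filename the script is saved under).

One caveat with the dedup set above: internal links that differ only by a #fragment or a trailing slash point at the same page but count as distinct URLs, so the crawler can fetch a page more than once. A minimal normalization sketch using only the standard library; the helper name normalize_url is hypothetical, not part of crawl4ai:

from urllib.parse import urldefrag

def normalize_url(url: str) -> str:
    # Drop the #fragment so in-page anchors dedupe to the same URL
    url, _fragment = urldefrag(url)
    # Treat trailing-slash variants as the same page
    return url.rstrip("/")

With this in place, the loop would track normalize_url(href) in processed_urls instead of the raw href.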