@Tiberriver256
Created February 21, 2025 19:18
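
A small script that uses crawl4ai to crawl a starting page and each internal link it discovers, then writes all of the extracted markdown to a single file.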
# /// script
# dependencies = [
#     "crawl4ai",
# ]
# ///
import asyncio
import os

from crawl4ai import AsyncWebCrawler


async def main():
    # Collect the markdown from every crawled page
    all_markdown = []

    async with AsyncWebCrawler() as crawler:
        # Crawl the main page first
        result = await crawler.arun(
            url="<insert-your-url-here>",
        )

        # Add the main page's markdown with a title header
        all_markdown.append(
            f"# {result.metadata.get('title')}\nURL: {result.url}\n\n{result.markdown}\n"
        )
        print("Main page processed...")

        # Track processed URLs so each page is only crawled once
        processed_urls = {result.url}

        # Crawl every internal link found on the main page
        for link in result.links.get("internal", []):
            href = link.get("href")
            if href in processed_urls:
                continue
            processed_urls.add(href)
            try:
                sub_result = await crawler.arun(url=href)
                # Add the subpage's markdown with a title header
                all_markdown.append(
                    f"# {sub_result.metadata.get('title')}\nURL: {sub_result.url}\n\n{sub_result.markdown}\n"
                )
                print(f"Processed: {href}")
            except Exception as e:
                print(f"Error crawling {href}: {e}")

    # Create the output directory if it doesn't exist
    os.makedirs("markdown", exist_ok=True)

    # Save all markdown to a single file, separated by horizontal rules
    with open("markdown/content.md", "w", encoding="utf-8") as f:
        f.write("\n---\n\n".join(all_markdown))

    print("\nMarkdown content has been saved to markdown/content.md")


if __name__ == "__main__":
    asyncio.run(main())
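
The # /// script header is PEP 723 inline script metadata, so a tool that understands it can resolve the crawl4ai dependency and run the file directly, e.g. uv run crawl.py (with whatever filename the script is saved under).

One caveat with the dedup set above: internal links that differ only by a #fragment or a trailing slash point at the same page but count as distinct URLs, so the crawler can fetch a page more than once. A minimal normalization sketch using only the standard library; the helper name normalize_url is hypothetical, not part of crawl4ai:

from urllib.parse import urldefrag

def normalize_url(url: str) -> str:
    # Drop the #fragment so in-page anchors dedupe to the same URL
    url, _fragment = urldefrag(url)
    # Treat trailing-slash variants as the same page
    return url.rstrip("/")

With this in place, the loop would track normalize_url(href) in processed_urls instead of the raw href.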