Last active
October 24, 2025 15:16
-
-
Save mehdinourollah/5345125664d54955219be9520c464b38 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pdfkit | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import urllib.parse | |
def webpage_to_pdf(url, output_file, timeout=30):
    """Render a web page to PDF and save the page's links next to it.

    The page is downloaded once with ``requests`` to collect the ``href`` of
    every ``<a>`` tag (resolved against ``url``), and ``pdfkit.from_url`` is
    then asked to render the same URL into ``output_file``. The links are
    written, one per line under a header line, to a sibling text file named
    ``<pdf name without .pdf>_links.txt``.

    Args:
        url: Address of the page to convert.
        output_file: Path of the PDF to create (a string path).
        timeout: Seconds to wait for the HTTP server before giving up;
            passed to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        Other exceptions raised by ``requests``, ``pdfkit`` or the file
        write are propagated unchanged.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    from pathlib import Path

    # A timeout prevents hanging forever on an unresponsive server, and the
    # status check stops us from harvesting links out of an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # href=True restricts the search to anchors that actually carry an href;
    # urljoin turns relative hrefs into absolute URLs based on the page URL.
    links = [
        urllib.parse.urljoin(url, anchor.get('href'))
        for anchor in soup.find_all('a', href=True)
    ]

    # pdfkit is handed the URL (not the HTML downloaded above), so it
    # retrieves and renders the page itself.
    pdfkit.from_url(url, output_file)

    # Derive the links file from the final path component only. A plain
    # str.replace('.pdf', ...) would leave names without a lowercase ".pdf"
    # untouched -- making the links file overwrite the PDF -- and would also
    # rewrite ".pdf" appearing in a directory name.
    pdf_path = Path(output_file)
    if pdf_path.suffix.lower() == '.pdf':
        base_name = pdf_path.stem
    else:
        base_name = pdf_path.name
    links_path = pdf_path.with_name(f"{base_name}_links.txt")

    # Explicit UTF-8 so URLs with non-ASCII characters are written the same
    # way on every platform.
    with open(links_path, 'w', encoding='utf-8') as f:
        f.write("Links found on the webpage:\n")
        f.writelines(f"{link}\n" for link in links)
# Demo entry point: when executed as a script, convert one sample page and
# write the PDF (plus its links file) into the current working directory.
if __name__ == "__main__":
    webpage_url = "https://cs.yale.edu/homes/aspnes/classes/223/notes.html"
    output_pdf = "output.pdf"
    webpage_to_pdf(url=webpage_url, output_file=output_pdf)
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
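Run via this one-liner (pdfkit is a wrapper around wkhtmltopdf, so the wkhtmltopdf binary must already be installed and on your PATH):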
curl -sSL https://gist.githubusercontent.com/mehdinourollah/5345125664d54955219be9520c464b38/raw/5a96cab89fd1d4b4a2b056611a66994749236ec0/web2pdf.py > web2pdf.py && python3 -m venv venv && source venv/bin/activate && pip install requests beautifulsoup4 pdfkit && python3 web2pdf.py