@muddylemon
Created October 11, 2024 23:24
### First prep script
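A single-page scraper: it launches headless Chromium, presents a desktop Chrome user agent plus the Accept headers a real browser would send, waits a random 0.8–2 s before reading the page, and returns the rendered HTML (or None on failure).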
import asyncio
import json
import random

from playwright.async_api import async_playwright


async def scrape_with_playwright(url):
    async with async_playwright() as p:
        browser = None
        try:
            browser = await p.chromium.launch()
            # Present a realistic desktop Chrome user agent instead of
            # Playwright's default.
            context = await browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/91.0.4472.124 Safari/537.36'
            )
            page = await context.new_page()
            # Headers a real browser would send alongside that user agent.
            await page.set_extra_http_headers({
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
                           'image/avif,image/webp,image/apng,*/*;q=0.8,'
                           'application/signed-exchange;v=b3;q=0.9'),
            })
            await page.goto(url)
            # Pause for a random interval so requests don't fire at a
            # mechanical, bot-like cadence.
            delay = random.uniform(0.8, 2)
            await asyncio.sleep(delay)
            content = await page.content()
            print(f"Successfully scraped {url}")
            return content
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")
            return None
        finally:
            if browser:
                await browser.close()


async def main():
    url = 'https://example.com'
    content = await scrape_with_playwright(url)
    if content:
        with open('result.json', 'w', encoding='utf-8') as f:
            json.dump({'url': url, 'content': content}, f, ensure_ascii=False)
        print('Content saved to result.json')
    else:
        print('No content to save')


asyncio.run(main())
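A failed scrape just returns None above; if the caller wants retries, a thin wrapper with exponential backoff is enough. This is a sketch, not part of the original gist, and it assumes scrape_with_playwright from the script above:

import asyncio

async def scrape_with_retries(url, attempts=3):
    # Retry on failure, backing off exponentially (1 s, 2 s, 4 s, ...).
    for attempt in range(attempts):
        content = await scrape_with_playwright(url)
        if content is not None:
            return content
        await asyncio.sleep(2 ** attempt)
    return None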
### Second script
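A concurrent version: it launches a fixed pool of Chromium browsers, uses a semaphore so at most pool_size scrapes check out a browser at once, gathers all URLs concurrently, and writes the combined results to results.json.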
import asyncio
import json

from playwright.async_api import async_playwright


async def create_browser_pool(pool_size):
    playwright = await async_playwright().start()
    browsers = []
    for _ in range(pool_size):
        browser = await playwright.chromium.launch()
        browsers.append(browser)
    # The semaphore caps concurrent checkouts at the number of browsers,
    # so get_browser never pops from an empty list.
    semaphore = asyncio.Semaphore(pool_size)
    return playwright, browsers, semaphore


async def close_browser_pool(playwright, browsers):
    for browser in browsers:
        await browser.close()
    await playwright.stop()


async def get_browser(browsers, semaphore):
    await semaphore.acquire()
    return browsers.pop()


async def release_browser(browser, browsers, semaphore):
    browsers.append(browser)
    semaphore.release()


async def scrape_url(browsers, semaphore, url):
    browser = await get_browser(browsers, semaphore)
    page = None
    try:
        page = await browser.new_page()
        await page.goto(url)
        content = await page.content()
        print(f"Scraped {url}")
        return {'url': url, 'content': content}
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return {'url': url, 'error': str(e)}
    finally:
        # Close the page so reused browsers don't accumulate open tabs,
        # then hand the browser back to the pool.
        if page:
            await page.close()
        await release_browser(browser, browsers, semaphore)


async def main():
    urls = [
        'https://example.com',
        'https://example.org',
        'https://example.net',
        'https://example.edu',
        'https://example.io',
    ]
    pool_size = 3
    playwright, browsers, semaphore = await create_browser_pool(pool_size)
    try:
        # Scrape every URL concurrently; the semaphore limits how many
        # run at once to the pool size.
        tasks = [scrape_url(browsers, semaphore, url) for url in urls]
        results = await asyncio.gather(*tasks)
    finally:
        await close_browser_pool(playwright, browsers)

    with open('results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False)

    for result in results:
        url = result.get('url')
        content = result.get('content')
        if content:
            print(f"Result for {url}: {content[:50]}...")
        else:
            print(f"No result for {url}, error: {result.get('error')}")


asyncio.run(main())
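The list-plus-semaphore checkout above works because asyncio runs all these coroutines on one thread; an asyncio.Queue expresses the same pool with a single primitive, since get() already waits for a free browser. A sketch of that alternative, not part of the original gist:

import asyncio
from playwright.async_api import async_playwright

async def scrape_url(pool, url):
    browser = await pool.get()  # waits until a browser is free
    try:
        page = await browser.new_page()
        try:
            await page.goto(url)
            return {'url': url, 'content': await page.content()}
        finally:
            await page.close()
    except Exception as e:
        return {'url': url, 'error': str(e)}
    finally:
        pool.put_nowait(browser)  # hand the browser back to the pool

async def main():
    playwright = await async_playwright().start()
    pool = asyncio.Queue()
    for _ in range(3):
        pool.put_nowait(await playwright.chromium.launch())
    try:
        urls = ['https://example.com', 'https://example.org']
        results = await asyncio.gather(*(scrape_url(pool, u) for u in urls))
        print(results)
    finally:
        # Drain the queue and close every browser before stopping Playwright.
        while not pool.empty():
            await pool.get_nowait().close()
        await playwright.stop()

asyncio.run(main())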