### First prep script

This first pass scrapes a single URL: it launches headless Chromium, presents a desktop Chrome user agent and browser-like request headers, waits a short randomized delay, and returns the rendered HTML.
import asyncio
from playwright.async_api import async_playwright
import random
import json


async def scrape_with_playwright(url):
    async with async_playwright() as p:
        browser = None
        try:
            browser = await p.chromium.launch()
            # Present a desktop Chrome user agent instead of the headless default.
            context = await browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/91.0.4472.124 Safari/537.36'
            )
            page = await context.new_page()
            # Mimic the headers a real browser would send.
            await page.set_extra_http_headers({
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
                           'image/avif,image/webp,image/apng,*/*;q=0.8,'
                           'application/signed-exchange;v=b3;q=0.9'),
            })
            await page.goto(url)
            # Short randomized pause so requests don't fire at machine-like speed.
            delay = random.uniform(0.8, 2)
            await asyncio.sleep(delay)
            content = await page.content()
            print(f"Successfully scraped {url}")
            return content
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")
            return None
        finally:
            if browser:
                await browser.close()


async def main():
    url = 'https://example.com'
    content = await scrape_with_playwright(url)
    if content:
        with open('result.json', 'w', encoding='utf-8') as f:
            json.dump({'url': url, 'content': content}, f, ensure_ascii=False)
        print('Content saved to result.json')
    else:
        print('No content to save')


asyncio.run(main())
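Since scrape_with_playwright returns None on failure, one natural extension is a retry wrapper. The sketch below is not part of the original gist: scrape_with_retries is a made-up name, and the attempt count and backoff values are arbitrary illustration choices.

import asyncio
import random


async def scrape_with_retries(url, attempts=3):
    # Reuses the scrape_with_playwright coroutine defined above.
    for attempt in range(1, attempts + 1):
        content = await scrape_with_playwright(url)
        if content is not None:
            return content
        # Back off a little longer after each failed attempt.
        await asyncio.sleep(random.uniform(1.0, 2.0) * attempt)
    return None

Called in place of scrape_with_playwright inside main(), this keeps transient failures (timeouts, flaky responses) from immediately producing an empty result.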
### Second script

The second pass scrapes several URLs concurrently: it pre-launches a small pool of browsers and gates access with a semaphore, so at most pool_size scrapes are in flight at once.
import asyncio
from playwright.async_api import async_playwright
import json


async def create_browser_pool(pool_size):
    # Launch a fixed number of browsers up front; the semaphore caps
    # how many tasks may hold a browser at once.
    playwright = await async_playwright().start()
    browsers = []
    for _ in range(pool_size):
        browser = await playwright.chromium.launch()
        browsers.append(browser)
    semaphore = asyncio.Semaphore(pool_size)
    return playwright, browsers, semaphore


async def close_browser_pool(playwright, browsers):
    for browser in browsers:
        await browser.close()
    await playwright.stop()


async def get_browser(browsers, semaphore):
    # The semaphore guarantees the list is non-empty when we pop.
    await semaphore.acquire()
    return browsers.pop()


async def release_browser(browser, browsers, semaphore):
    browsers.append(browser)
    semaphore.release()


async def scrape_url(browsers, semaphore, url):
    browser = await get_browser(browsers, semaphore)
    page = None
    try:
        page = await browser.new_page()
        await page.goto(url)
        content = await page.content()
        print(f"Scraped {url}")
        return {'url': url, 'content': content}
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return {'url': url, 'error': str(e)}
    finally:
        # Close the page before handing the browser back, since browsers
        # are reused and leaked pages would accumulate across URLs.
        if page:
            await page.close()
        await release_browser(browser, browsers, semaphore)


async def main():
    urls = [
        'https://example.com',
        'https://example.org',
        'https://example.net',
        'https://example.edu',
        'https://example.io'
    ]
    pool_size = 3
    playwright, browsers, semaphore = await create_browser_pool(pool_size)
    try:
        tasks = [scrape_url(browsers, semaphore, url) for url in urls]
        results = await asyncio.gather(*tasks)
    finally:
        await close_browser_pool(playwright, browsers)
    with open('results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False)
    for result in results:
        url = result.get('url')
        content = result.get('content')
        if content:
            print(f"Result for {url}: {content[:50]}...")
        else:
            print(f"No result for {url}, error: {result.get('error')}")


asyncio.run(main())
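As a design note, the list-plus-semaphore bookkeeping can be collapsed into a single asyncio.Queue, whose get() blocks until a browser is free. The sketch below shows the same pool under that approach; it is not how the gist does it, and the name scrape_all is made up for illustration.

import asyncio
from playwright.async_api import async_playwright


async def scrape_all(urls, pool_size=3):
    playwright = await async_playwright().start()
    pool = asyncio.Queue()
    for _ in range(pool_size):
        pool.put_nowait(await playwright.chromium.launch())

    async def scrape(url):
        browser = await pool.get()  # waits until a browser is free
        try:
            page = await browser.new_page()
            try:
                await page.goto(url)
                return {'url': url, 'content': await page.content()}
            finally:
                await page.close()
        except Exception as e:
            return {'url': url, 'error': str(e)}
        finally:
            pool.put_nowait(browser)  # hand the browser back

    try:
        return await asyncio.gather(*(scrape(url) for url in urls))
    finally:
        while not pool.empty():
            await pool.get_nowait().close()
        await playwright.stop()

The queue replaces get_browser/release_browser entirely: acquiring a browser and waiting for one to be available become a single operation, so there is no separate counter to keep in sync with the list.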