Created
August 18, 2024 20:14
-
-
Save jkctech/838ecc20e71c5de30b152fb437d66709 to your computer and use it in GitHub Desktop.
Simple web relay using Flask and Selenium to return the raw content of a webpage. Useful for scraping dynamically loaded webpages.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import logging | |
from flask import Flask | |
from flask import request | |
from flask import Response | |
from selenium import webdriver | |
# SETTINGS
port = 80               # port handed to app.run()
defaultwait = 3         # seconds to pause after navigation when no delay is supplied
defaultlogging = False  # leave the 'werkzeug' logger enabled?
customlogging = True    # print a "GET: <url>" line for every relayed page?
headless = True         # pass --headless to Firefox?

# The 'werkzeug' logger is switched off unless default logging was requested
log = logging.getLogger('werkzeug')
log.disabled = not defaultlogging

# Assemble the Firefox options, then launch the webdriver used by getPage()
options = webdriver.FirefoxOptions()
if headless:
    options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
# Load a page in the webdriver, optionally log it, and hand back its source
def getPage(url, delay=defaultwait):
    """Navigate the module-level webdriver to *url* and return its page source.

    Args:
        url: Address passed to ``driver.get``.
        delay: Seconds to sleep after navigation before the source is read.

    Returns:
        The value of ``driver.page_source`` after the pause.
    """
    driver.get(url)
    # Fixed pause between navigation and reading the source.
    time.sleep(delay)
    html = driver.page_source

    if not customlogging:
        return html

    print("GET: {}".format(url))
    return html
# Define flask app
app = Flask("server")


# Main route
# Args:
# - url (Url to relay, required)
# - delay (Seconds to wait for page to finish loading, must be >= 0)
@app.route('/')
def root():
    """Relay the page source for the address given in the ``url`` query arg.

    Query args:
        url: Address to load in the browser. Required and non-empty.
        delay: Whole seconds to wait after navigation before the page
            source is read. Falls back to ``defaultwait`` when absent or
            not parseable as an integer.

    Returns:
        A ``text/html`` response carrying the page source, or a 400
        ``text/plain`` response when ``url`` is missing/empty or ``delay``
        is negative.
    """
    url = request.args.get('url', default=None, type=str)
    delay = request.args.get('delay', default=defaultwait, type=int)

    # Without a URL there is nothing to load; answer with a client error
    # rather than passing None/"" on to the webdriver.
    if not url:
        return Response(
            "Missing required 'url' query parameter.\n",
            status=400,
            mimetype='text/plain',
        )

    # time.sleep() raises ValueError for negative durations, so reject them
    # up front instead of failing inside getPage().
    if delay < 0:
        return Response(
            "'delay' must be zero or a positive number of seconds.\n",
            status=400,
            mimetype='text/plain',
        )

    # NOTE(review): url is caller-controlled and reaches the browser
    # unfiltered (any scheme the browser accepts) - confirm this service is
    # only reachable by trusted clients.
    return Response(getPage(url, delay), mimetype='text/html')
# Run app until it is stopped, then shut the browser down
try:
    # A single webdriver instance serves every request. Flask's development
    # server handles requests in threads by default, which would let two
    # requests interleave navigations and read each other's page source, so
    # requests are handled one at a time.
    app.run(debug=False, port=port, threaded=False)
finally:
    # End the webdriver session so the browser is not left running after the
    # server exits.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment