-
-
Save hardevine/427ad3969ed77fa1edc0175c6f425b79 to your computer and use it in GitHub Desktop.
How to Capture Network Traffic When Scraping with Selenium & Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# See rkengler.com for the related blog post:
# https://www.rkengler.com/how-to-capture-network-traffic-when-scraping-with-selenium-and-python/
import json | |
import pprint | |
from selenium import webdriver | |
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | |
# Copy the preset before editing it: DesiredCapabilities.CHROME is a shared
# class-level dict, so mutating it in place would leak the logging preference
# into every other user of that preset in the same process.
capabilities = DesiredCapabilities.CHROME.copy()
# Request every performance-log entry. chromedriver 75+ reads the
# vendor-prefixed key below; chromedriver older than ~75 expects the key
# "loggingPrefs" instead.
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
# Start Chrome through the chromedriver executable named here, with the
# logging capability applied.
driver = webdriver.Chrome(
    r"chromedriver.exe",
    desired_capabilities=capabilities,
)
def process_browser_logs_for_network_events(logs):
    """
    Yield the network-related events found in browser performance logs.

    Each entry in ``logs`` is a mapping whose "message" value is a JSON
    string; the inner "message" object of the decoded document is the event.
    Only events whose "method" starts with "Network.response",
    "Network.request", or "Network.webSocket" are yielded, since we're
    interested in the network events specifically. Events that carry no
    "method" are skipped.
    """
    # str.startswith accepts a tuple, so one call checks all three prefixes
    # and matches the "starts with" contract stated above (the previous
    # substring test also matched the prefix text in the middle of a name).
    network_prefixes = (
        "Network.response",
        "Network.request",
        "Network.webSocket",
    )
    for entry in logs:
        log = json.loads(entry["message"])["message"]
        if log.get("method", "").startswith(network_prefixes):
            yield log
# Load the page, pull the performance log the driver recorded, and dump the
# network events to a text file. The try/finally guarantees the browser and
# chromedriver process are shut down even if navigation or writing fails.
try:
    driver.get("https://www.rkengler.com")
    logs = driver.get_log("performance")
    events = process_browser_logs_for_network_events(logs)
    # Explicit UTF-8 so non-ASCII text in an event cannot fail on platforms
    # whose default encoding is narrower.
    with open("log_entries.txt", "wt", encoding="utf-8") as out:
        for event in events:
            pprint.pprint(event, stream=out)
finally:
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment