-
-
Save LeMoussel/bfed69044cb65947a4e63c76f11e6e1f to your computer and use it in GitHub Desktop.
""" | |
Implementation of the Selenium Chrome WebDriver with HTTP Response data | |
included via the ChromeDriver performance logging capability | |
""" | |
import json | |
from requests.structures import CaseInsensitiveDict | |
# https://github.com/SeleniumHQ/selenium | |
from selenium import webdriver | |
from selenium.webdriver.chrome.service import Service | |
# https://github.com/SergeyPirogov/webdriver_manager | |
from webdriver_manager.chrome import ChromeDriverManager | |
class ChromeWebDriverPerfomance:
    """Chrome WebDriver wrapper that exposes HTTP response data.

    Chrome is started with the ChromeDriver "performance" log enabled. After
    each navigation the ``Network.responseReceived`` log entries are read back
    so callers can inspect response dicts (status, headers, ...).

    Every response seen is accumulated in ``self.responses``. Instances can be
    used as context managers so the browser is shut down even on errors.
    """

    def __init__(self, headless=True):
        """Start a Chrome session with performance logging enabled.

        Args:
            headless: When true, pass ``--headless=new`` to Chrome. The flag
                used to be ignored (Chrome was always started headless), so
                the default is ``True`` to keep no-argument callers on the
                behaviour they already get.
        """
        self.options = webdriver.ChromeOptions()
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        if headless:
            self.options.add_argument("--headless=new")
        self.options.add_argument("disable-infobars")
        self.options.add_argument("--disable-extensions")
        # https://developer.chrome.com/docs/chromedriver/logging/performance-log
        self.options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=self.options
        )
        # Every response dict seen so far, across all get() calls.
        self.responses = []

    def __enter__(self):
        """Return the wrapper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Shut the browser down; exceptions from the ``with`` body propagate."""
        self.close()
        return False

    def get(self, url):
        """Navigate to *url* and return the response dict of the document.

        All ``Network.responseReceived`` entries found in the performance log
        are appended to ``self.responses``.

        Args:
            url: Address to load.

        Returns:
            The ``response`` dict of the last logged entry whose type is
            ``"Document"``, or ``None`` when no such entry was logged.
            Callers must check for ``None`` before subscripting the result.

        Raises:
            Whatever ``self.driver.get`` / ``self.driver.get_log`` raise;
            nothing is swallowed here.
        """
        self.driver.get(url)

        document_response = None
        for log_entry in self.driver.get_log("performance"):
            log_message = json.loads(log_entry["message"])["message"]
            # Only HTTP responses are of interest.
            if log_message["method"] != "Network.responseReceived":
                continue
            params = log_message["params"]
            self.responses.append(params["response"])
            # NOTE(review): sub-frame loads presumably also report type
            # "Document"; if so the last one logged wins - confirm.
            if params["type"] == "Document":
                document_response = params["response"]
        return document_response

    def close(self):
        """End the WebDriver session.

        Uses ``quit()`` rather than ``close()``: ``close()`` only closes the
        current window, whereas ``quit()`` ends the whole session.
        """
        self.driver.quit()
if __name__ == "__main__":
    # Demo: load one page and print the HTTP status and headers of its document.
    start_url = "https://zonetuto.fr"
    cwd_perf = ChromeWebDriverPerfomance()
    try:
        response = cwd_perf.get(start_url)
    finally:
        # Release the browser even when navigation raises.
        cwd_perf.close()

    # The ChromeWebDriver response attribute(s) contain a dict with
    # information about the response:
    # {
    #     "connectionId": [Integer],
    #     "connectionReused": [Boolean],
    #     "encodedDataLength": [Integer],
    #     "fromDiskCache": [Boolean],
    #     "fromServiceWorker": [Boolean],
    #     "headers": [dict],               # HTTP Headers as a dict
    #     "headersText": [String],         # HTTP Headers as text
    #     "mimeType": [String],
    #     "protocol": [String],
    #     "remoteIPAddress": [String],
    #     "remotePort": [Integer],
    #     "requestHeaders": [dict],
    #     "requestHeadersText": [String],
    #     "securityDetails": [dict],       # TLS/SSL related information
    #     "securityState": [String],
    #     "status": [Integer],             # HTTP Status Code of the Response
    #     "statusText": [String],
    #     "timing": [dict],
    #     "url": [String]
    # }
    if response is None:
        # get() returns None when no "Document" response was logged;
        # subscripting it would raise TypeError.
        print(f"No document response captured for {start_url}")
    else:
        headers = CaseInsensitiveDict(response["headers"])
        status_code = response["status"]
        print(f"HTTP Status code: {status_code}")
        print(f"Headers: {headers}")
To fix this error, import `TimeoutException` from `selenium.common.exceptions`: the `NameError` is raised because the name is used in an `except` clause without ever being imported. Here's how to solve this specific problem and improve the error handling in your code in general:
from selenium.common.exceptions import TimeoutException
If you're handling several Selenium exceptions, import them together in a single statement:
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
Here's a more Pythonic approach to handle exceptions:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
def get(self, url):
try:
# Your code to get the URL
self.driver.get(url)
# Other logic...
return response
except TimeoutException as e:
print(f"Timeout accessing URL {url}: {e}")
# Handle timeout (return default value, retry, etc.)
except WebDriverException as e:
print(f"WebDriver error accessing URL {url}: {e}")
# Handle other WebDriver errors
except Exception as e:
print(f"Unexpected error: {e}")
# Catch any other unanticipated exception
Thank you for your instructive reply. This corrected the timeout issue I was having 👍🏻 but now I am coming up against DNS resolution errors.
This code, which checks several websites:
if __name__ == "__main__":
urs=["https://www.apple.com","https://www.goxxgle.com","https://www.wsj.com"]
for ur in urs:
cwd_perf = ChromeWebDriverPerfomance()
response = cwd_perf.get(ur)
print (f"URL: {ur}")
status_code = response["status"]
print(f"HTTP Status code: {status_code}")
#headers = CaseInsensitiveDict(response["headers"])
#print(f"Headers: {headers}")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
cwd_perf.close()
fails when it comes across "https://www.goxxgle.com" with
URL: https://www.goxxgle.com
Traceback (most recent call last):
File "/Users/stefan/Dropbox/python/test4.py", line 74, in <module>
status_code = response["status"]
~~~~~~~~^^^^^^^^^^
TypeError: 'NoneType' object is not subscriptable
I thought this would have been caught by the 'unexpected error' line in the error checking code ?
Also, I think I should put the
cwd_perf = ChromeWebDriverPerfomance()
cwd_perf.close()
outside the for loop
BTW, this is how I implemented your suggestion:
def get(self, url):
try:
# Your code to get the URL
self.driver.get(url)
# Other logic...
# Parse the Chrome Performance logs
response = None
for log_entry in self.driver.get_log("performance"):
log_message = json.loads(log_entry["message"])["message"]
# Filter out HTTP responses
if log_message["method"] == "Network.responseReceived":
self.responses.append(log_message["params"]["response"])
if log_message["params"]["type"] == "Document":
response = log_message["params"]["response"]
return response
except TimeoutException as e:
print(f"Timeout accessing URL {url}: {e}")
# Handle timeout (return default value, retry, etc.)
except WebDriverException as e:
print(f"WebDriver error accessing URL {url}: {e}")
# Handle other WebDriver errors
except Exception as e:
print(f"Unexpected error: {e}")
# Catch any other unanticipated exception
Sorry for the stupid formatting error. I fixed the problem by moving the parsing logic and the return statement out one indentation level, so that they sit after the try/except block instead of inside it. Forgive me.
Thank you for this working example. Sometimes though I get a timeout depending on the website. I tried modifying the class thusly:
def get(self, url):
try:
self.driver.get(url)
except TimeoutException:
print("Page load timed out.")
except WebDriverException as e:
if "ERR_NAME_NOT_RESOLVED" in str(e) or "dns" in str(e).lower():
print("DNS resolution failed.")
err=1
else:
print("WebDriverException occurred:", e)
but I get the error:
Traceback (most recent call last):
File "/Users/stefan/Desktop/test2.py", line 66, in
response = cwd_perf.get(url)
File "/Users/stefan/Desktop/test2.py", line 37, in get
except TimeoutException:
^^^^^^^^^^^^^^^^
NameError: name 'TimeoutException' is not defined
Can you help me with the better pythonic way to check for these errors?