Repy GET Request
#begin include httpretrieve.repy
"""
<Program Name>
  httpretrieve.repy

<Started>
  August 19, 2009

<Authors>
  Yafete Yemuru
  Conrad Meyer

<Purpose>
  Provides a method for retrieving content from web servers using the HTTP
  protocol. The content can be accessed as a file-like object, saved to a
  file, or returned as a string.
"""
#begin include urlparse.repy
"""
<Program Name>
  urlparse.repy

<Started>
  May 15, 2009

<Author>
  Michael Phan-Ba

<Purpose>
  Provides utilities for parsing URLs, based on the Python 2.6.1 module
  urlparse.
"""
def urlparse_urlsplit(urlstring, default_scheme="", allow_fragments=True):
  """
  <Purpose>
    Parse a URL into five components, returning a dictionary. This corresponds
    to the general structure of a URL:
    scheme://netloc/path;parameters?query#fragment. The parameters are not
    split from the URL and individual components are not separated.

    Only absolute server-based URIs are currently supported (all URLs will be
    parsed into the components listed, regardless of the scheme).

  <Arguments>
    default_scheme:
      Optional: defaults to the empty string. If specified, gives the default
      addressing scheme, to be used only if the URL does not specify one.

    allow_fragments:
      Optional: defaults to True. If False, fragment identifiers are not
      allowed, even if the URL's addressing scheme normally does support them.

  <Exceptions>
    ValueError on parsing a non-numeric port value.

  <Side Effects>
    None.

  <Returns>
    A dictionary containing:

    Key         Value                               Value if not present
    ============================================================================
    scheme      URL scheme specifier                empty string
    netloc      Network location part               empty string
    path        Hierarchical path                   empty string
    query       Query component                     empty string
    fragment    Fragment identifier                 empty string
    username    User name                           None
    password    Password                            None
    hostname    Host name (lower case)              None
    port        Port number as integer, if present  None
  """

  components = {"scheme": default_scheme, "netloc": "", "path": "", "query": "",
    "fragment": "", "username": None, "password": None, "hostname": None,
    "port": None }

  # Extract the scheme, if present.
  (lpart, rpart) = _urlparse_splitscheme(urlstring)
  if lpart:
    components["scheme"] = lpart

  # Extract the server information, if present.
  if rpart.startswith("//"):
    (lpart, rpart) = _urlparse_splitnetloc(rpart, 2)
    components["netloc"] = lpart

    # Adds a trailing slash to the URL if no path exists.
    if rpart == "":
      rpart = "/"

    (components["username"], components["password"], components["hostname"],
      components["port"]) = _urlparse_splitauthority(lpart)

  # Extract the fragment.
  if allow_fragments:
    (rpart, components["fragment"]) = _urlparse_splitfragment(rpart)

  # Extract the query.
  (components["path"], components["query"]) = _urlparse_splitquery(rpart)

  return components
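
# Illustrative sketch (not called anywhere): what urlparse_urlsplit() returns
# for a representative URL. The URL and the expected component values below
# are assumptions chosen for this example, not taken from the original gist.
def _example_urlparse_urlsplit():
  urldict = urlparse_urlsplit("http://user:pw@example.com:8080/a/b?x=1#top")
  assert urldict["scheme"] == "http"
  assert urldict["username"] == "user"
  assert urldict["password"] == "pw"
  assert urldict["hostname"] == "example.com"
  assert urldict["port"] == 8080
  assert urldict["path"] == "/a/b"
  assert urldict["query"] == "x=1"
  assert urldict["fragment"] == "top"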
def _urlparse_splitscheme(url):
  """Parse the scheme portion of the URL"""
  # The scheme is valid only if it contains these characters.
  scheme_chars = \
    "abcdefghijklmnopqrstuvwxyz0123456789+-."

  scheme = ""
  rest = url

  spart = url.split(":", 1)
  if len(spart) == 2:

    # Normalize the scheme.
    spart[0] = spart[0].lower()

    # A scheme is valid only if it starts with an alpha character.
    if spart[0] and spart[0][0].isalpha():
      for char in spart[0]:
        if char not in scheme_chars:
          break
      else:
        (scheme, rest) = spart

  return scheme, rest
def _urlparse_splitnetloc(url, start=0):
  """Parse the netloc portion of the URL"""

  # By default, the netloc is delimited by the end of the URL.
  delim = len(url)

  # Find the left-most delimiter.
  for char in "/?#":
    xdelim = url.find(char, start)
    if xdelim >= 0:
      delim = min(delim, xdelim)

  # Return the netloc and the rest of the URL.
  return url[start:delim], url[delim:]
def _urlparse_splitauthority(netloc):
  """Parse the authority portion of the netloc"""

  # The authority can have a userinfo portion delimited by "@".
  authority = netloc.split("@", 1)

  # Default values.
  username = None
  password = None
  hostname = None
  port = None

  # Is there a userinfo portion?
  if len(authority) == 2:

    # userinfo can be split into username:password
    userinfo = authority[0].split(":", 1)

    # hostport can be split into hostname:port
    hostport = authority[1].split(":", 1)

    if userinfo[0]:
      username = userinfo[0]
    if len(userinfo) == 2:
      password = userinfo[1]

  # No userinfo portion found.
  else:

    # hostport can be split into hostname:port
    hostport = netloc.split(":", 1)

  # Is there a port value?
  if hostport[0]:
    hostname = hostport[0]
  if len(hostport) == 2:
    port = int(hostport[1], 10)

  # Return the values.
  return username, password, hostname, port
def _urlparse_splitquery(url):
  """Parse the query portion of the url"""

  qpart = url.split("?", 1)
  if len(qpart) == 2:
    query = qpart[1]
  else:
    query = ""

  return qpart[0], query
def _urlparse_splitfragment(url):
  """Parse the fragment portion of the url"""

  fpart = url.split("#", 1)
  if len(fpart) == 2:
    fragment = fpart[1]
  else:
    fragment = ""

  return fpart[0], fragment
#end include urlparse.repy

#begin include sockettimeout.repy
"""
<Author>
  Justin Cappos, Armon Dadgar

  This is a rewrite of the previous version by Richard Jordan

<Start Date>
  26 Aug 2009

<Description>
  A library that causes sockets to time out if a recv / send call would
  block for more than an allotted amount of time.
"""
class SocketTimeoutError(Exception):
  """The socket timed out before receiving a response"""


class _timeout_socket():
  """
  <Purpose>
    Provides a socket-like object which supports custom timeouts
    for send() and recv().
  """
  # Initialize with the socket object and a default timeout
  def __init__(self, socket, timeout=10, checkintv='fibonacci'):
    """
    <Purpose>
      Initializes a timeout socket object.

    <Arguments>
      socket:
        A socket-like object to wrap. Must support send, recv, close, and
        willblock.

      timeout:
        The default timeout for send() and recv().

      checkintv:
        How often socket operations (send, recv) should check if
        they can run. The smaller the interval the more time is
        spent busy waiting.
    """
    # Store the socket, timeout and check interval
    self.socket = socket
    self.timeout = timeout
    self.checkintv = checkintv
  # Allow changing the default timeout
  def settimeout(self, timeout=10):
    """
    <Purpose>
      Allows changing the default timeout interval.

    <Arguments>
      timeout:
        The new default timeout interval. Defaults to 10.
        Use 0 for no timeout. Given in seconds.
    """
    # Update
    self.timeout = timeout

  # Wrap willblock
  def willblock(self):
    """
    See socket.willblock()
    """
    return self.socket.willblock()

  # Wrap close
  def close(self):
    """
    See socket.close()
    """
    return self.socket.close()
  # Provide a recv() implementation
  def recv(self, bytes, timeout=None):
    """
    <Purpose>
      Allows receiving data from the socket object with a custom timeout.

    <Arguments>
      bytes:
        The maximum amount of bytes to read

      timeout:
        (Optional) Defaults to the value given at initialization, or by settimeout.
        If provided, the socket operation will timeout after this amount of time (sec).
        Use 0 for no timeout.

    <Exceptions>
      As with socket.recv(), socket.willblock(). Additionally, SocketTimeoutError is
      raised if the operation times out.

    <Returns>
      The data received from the socket.
    """
    # It's worth noting that this fibonacci backoff begins with a 2ms poll rate, and
    # provides a simple exponential backoff scheme.
    fibonacci_backoff = False
    backoff_cap = 100 # Never use more than 100ms poll rate.

    pre_value = 1.0 # Our iterators for Fibonacci sequence.
    pre_pre_value = 1.0 #

    # Since we want to be able to initialize with static poll rates (backwards
    # compatibility) we specify a string if we're using the fibonacci backoff.
    if type(self.checkintv) is str:
      if self.checkintv == 'fibonacci':
        fibonacci_backoff = True

    # Set the timeout if None
    if timeout is None:
      timeout = self.timeout

    # Get the start time
    starttime = getruntime()

    # Block until we can read
    rblock, wblock = self.socket.willblock()
    while rblock:

      # Check if we should break
      if timeout > 0:
        # Get the elapsed time
        diff = getruntime() - starttime

        # Raise an exception
        if diff > timeout:
          raise SocketTimeoutError, "recv() timed out!"

      if fibonacci_backoff:
        # Iterate the sequence once
        sleep_length = pre_value + pre_pre_value
        pre_pre_value = pre_value
        pre_value = sleep_length

        # Make sure we don't exceed maximum backoff.
        if sleep_length > backoff_cap:
          sleep_length = backoff_cap

        # Unit conversion to seconds
        sleep_length = sleep_length / 1000.0

        # Sleep
        sleep(sleep_length)

      else: # Classic functionality.
        # Sleep
        try:
          sleep(float(self.checkintv))
        except:
          sleep(0.1)
        # If available, move to the next value of checkintv.

      # Update rblock
      rblock, wblock = self.socket.willblock()

    # Do the recv
    return self.socket.recv(bytes)
  # Provide a send() implementation
  def send(self, data, timeout=None):
    """
    <Purpose>
      Allows sending data with the socket object with a custom timeout.

    <Arguments>
      data:
        The data to send

      timeout:
        (Optional) Defaults to the value given at initialization, or by settimeout.
        If provided, the socket operation will timeout after this amount of time (sec).
        Use 0 for no timeout.

    <Exceptions>
      As with socket.send(), socket.willblock(). Additionally, SocketTimeoutError is
      raised if the operation times out.

    <Returns>
      The number of bytes sent.
    """
    # Set the timeout if None
    if timeout is None:
      timeout = self.timeout

    # Get the start time
    starttime = getruntime()

    # Block until we can write
    rblock, wblock = self.socket.willblock()
    while wblock:

      # Check if we should break
      if timeout > 0:
        # Get the elapsed time
        diff = getruntime() - starttime

        # Raise an exception
        if diff > timeout:
          raise SocketTimeoutError, "send() timed out!"

      # Sleep
      # Since switching to the fibonacci backoff, the nature of
      # this field has changed. Rather than implement the backoff
      # for checking block status (seems wasteful) we'll just use
      # a constant value. Ten ms seems appropriate.
      sleep(0.010)

      # Update wblock
      rblock, wblock = self.socket.willblock()

    # Do the send
    return self.socket.send(data)
def timeout_openconn(desthost, destport, localip=None, localport=None, timeout=5):
  """
  <Purpose>
    Wrapper for openconn. Very, very similar.

  <Args>
    Same as Repy openconn

  <Exception>
    Raises the same exceptions as openconn.

  <Side Effects>
    Creates a socket object for the user

  <Returns>
    socket obj on success
  """

  realsocketlikeobject = openconn(desthost, destport, localip, localport, timeout)

  thissocketlikeobject = _timeout_socket(realsocketlikeobject, timeout)
  return thissocketlikeobject
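
# Illustrative sketch (not called anywhere): how a caller might use
# timeout_openconn() and handle SocketTimeoutError. The host, port, and
# request bytes are assumptions chosen for this example.
def _example_timeout_openconn():
  sockobj = timeout_openconn("example.com", 80, timeout=5)
  try:
    sockobj.send("HEAD / HTTP/1.0\r\nHost: example.com\r\n\r\n")
    try:
      # recv() raises SocketTimeoutError if the server stalls for more
      # than the 5 seconds given at initialization.
      firstbytes = sockobj.recv(512)
    except SocketTimeoutError:
      firstbytes = ""
  finally:
    sockobj.close()
  return firstbytes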
def timeout_waitforconn(localip, localport, function, timeout=5):
  """
  <Purpose>
    Wrapper for waitforconn. Essentially does the same thing...

  <Args>
    Same as Repy waitforconn with the addition of a timeout argument.

  <Exceptions>
    Same as Repy waitforconn

  <Side Effects>
    Sets up event listener which calls function on messages.

  <Returns>
    Handle to listener.
  """

  # We use a closure for the callback we pass to waitforconn so that we don't
  # have to map mainch's to callback functions or deal with potential race
  # conditions if we did maintain such a mapping.
  def _timeout_waitforconn_callback(localip, localport, sockobj, ch, mainch):

    # 'timeout' is the free variable 'timeout' that was the argument to
    # timeout_waitforconn.
    thissocketlikeobject = _timeout_socket(sockobj, timeout)

    # 'function' is the free variable 'function' that was the argument to
    # timeout_waitforconn.
    return function(localip, localport, thissocketlikeobject, ch, mainch)

  return waitforconn(localip, localport, _timeout_waitforconn_callback)


# a wrapper for stopcomm
def timeout_stopcomm(commhandle):
  """
  Wrapper for stopcomm. Does the same thing...
  """
  return stopcomm(commhandle)
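
# Illustrative sketch (not called anywhere): a timeout-wrapped echo listener.
# The IP and port are assumptions chosen for this example; the callback takes
# the same five arguments waitforconn callbacks receive.
def _example_timeout_waitforconn():
  def _echo_once(remoteip, remoteport, sockobj, ch, mainch):
    try:
      # Echo back one message; the wrapped sockobj enforces the timeout.
      sockobj.send(sockobj.recv(1024))
    except SocketTimeoutError:
      pass
    sockobj.close()
  listenhandle = timeout_waitforconn("127.0.0.1", 12345, _echo_once, timeout=5)
  # ... and eventually, to stop listening:
  timeout_stopcomm(listenhandle)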
#end include sockettimeout.repy

#begin include urllib.repy
def urllib_quote(inputstring, safestring="/"):
  """
  <Purpose>
    Encode an inputstring such that it can be used safely in a URL or XML
    document.

  <Arguments>
    inputstring:
      The string to urlencode.

    safestring (optional):
      Specifies additional characters that should not be quoted --
      defaults to "/".

  <Exceptions>
    TypeError if the inputstring or safestring parameters aren't strings.

  <Side Effects>
    None.

  <Returns>
    Urlencoded version of the passed string.
  """

  if type(inputstring) is not str:
    raise TypeError("urllib_quote's inputstring parameter must be a string, not '"+str(type(inputstring))+"'")
  if type(safestring) is not str:
    raise TypeError("urllib_quote's safestring parameter must be a string, not '"+str(type(safestring))+"'")

  resultstr = ""

  # We go through each character in the string; if it's not in [0-9a-zA-Z]
  # we wrap it.

  safeset = set(safestring)

  for char in inputstring:
    asciicode = ord(char)
    if (asciicode >= ord("0") and asciicode <= ord("9")) or \
        (asciicode >= ord("A") and asciicode <= ord("Z")) or \
        (asciicode >= ord("a") and asciicode <= ord("z")) or \
        asciicode == ord("_") or asciicode == ord(".") or \
        asciicode == ord("-") or char in safeset:
      resultstr += char
    else:
      resultstr += "%%%02X" % asciicode

  return resultstr
def urllib_quote_plus(inputstring, safestring=""):
  """
  <Purpose>
    Encode a string to go in the query fragment of a URL.

  <Arguments>
    inputstring:
      The string to urlencode.

    safestring (optional):
      Specifies additional characters that should not be quoted --
      defaults to the empty string.

  <Exceptions>
    TypeError if the inputstring or safestring parameters aren't strings.

  <Side Effects>
    None.

  <Returns>
    Urlencoded version of the passed string.
  """

  if type(inputstring) is not str:
    raise TypeError("urllib_quote_plus' inputstring parameter must be a string, not '"+str(type(inputstring))+"'")
  if type(safestring) is not str:
    raise TypeError("urllib_quote_plus' safestring parameter must be a string, not '"+str(type(safestring))+"'")

  return urllib_quote(inputstring, safestring + " ").replace(" ", "+")
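
# Illustrative sketch (not called anywhere): the difference between
# urllib_quote() and urllib_quote_plus(). The sample strings are assumptions
# chosen for this example.
def _example_urllib_quoting():
  # Path-style quoting keeps "/" safe and percent-encodes the space.
  assert urllib_quote("a b/c") == "a%20b/c"
  # Query-style quoting turns the space into "+" and encodes "/" instead.
  assert urllib_quote_plus("a b/c") == "a+b%2Fc"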
def urllib_unquote(inputstring):
  """
  <Purpose>
    Unquote a urlencoded string.

  <Arguments>
    inputstring:
      The string to unquote.

  <Exceptions>
    TypeError if the inputstring isn't a string.

    ValueError thrown if the last wrapped octet isn't a valid wrapped octet
    (i.e. if the string ends in "%" or "%x" rather than "%xx"). Also throws
    ValueError if the nibbles aren't valid hex digits.

  <Side Effects>
    None.

  <Returns>
    The decoded string.
  """

  if type(inputstring) is not str:
    raise TypeError("urllib_unquote's inputstring parameter must be a string, not '"+str(type(inputstring))+"'")

  resultstr = ""

  # We go through the inputstring from end to beginning, looking for wrapped
  # octets. When one is found we add it (unwrapped) and the following
  # string to the resultant string, and shorten the original inputstring.
  while True:
    lastpercentlocation = inputstring.rfind("%")
    if lastpercentlocation < 0:
      break

    wrappedoctetstr = inputstring[lastpercentlocation+1:lastpercentlocation+3]
    if len(wrappedoctetstr) != 2:
      raise ValueError("Quoted string is poorly formed")

    resultstr = \
      chr(int(wrappedoctetstr, 16)) + \
      inputstring[lastpercentlocation+3:] + \
      resultstr
    inputstring = inputstring[:lastpercentlocation]

  resultstr = inputstring + resultstr
  return resultstr
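
# Illustrative sketch (not called anywhere): unquoting reverses quoting, and
# a truncated escape raises ValueError. The sample strings are assumptions
# chosen for this example.
def _example_urllib_unquote():
  assert urllib_unquote("a%20b/c") == "a b/c"
  # A truncated escape at the end of the string ("%2" instead of "%xx"):
  try:
    urllib_unquote("broken%2")
  except ValueError:
    pass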
def urllib_unquote_plus(inputstring):
  """
  <Purpose>
    Unquote the urlencoded query fragment of a URL.

  <Arguments>
    inputstring:
      The string to unquote.

  <Exceptions>
    TypeError if the inputstring isn't a string.

    ValueError thrown if the last wrapped octet isn't a valid wrapped octet
    (i.e. if the inputstring ends in "%" or "%x" rather than "%xx"). Also
    throws ValueError if the nibbles aren't valid hex digits.

  <Side Effects>
    None.

  <Returns>
    The decoded string.
  """

  if type(inputstring) is not str:
    raise TypeError("urllib_unquote_plus' inputstring parameter must be a string, not '"+str(type(inputstring))+"'")

  return urllib_unquote(inputstring.replace("+", " "))
def urllib_quote_parameters(inputdictionary):
  """
  <Purpose>
    Encode a dictionary of (key, value) pairs into an HTTP query string or
    POST body (same form).

  <Arguments>
    inputdictionary:
      The dictionary to quote.

  <Exceptions>
    TypeError if the inputdictionary isn't a dict.

  <Side Effects>
    None.

  <Returns>
    The quoted dictionary.
  """

  if type(inputdictionary) is not dict:
    raise TypeError("urllib_quote_parameters' inputdictionary parameter must be a dict, not '"+str(type(inputdictionary))+"'")

  quoted_keyvals = []
  for key, val in inputdictionary.items():
    quoted_keyvals.append("%s=%s" % (urllib_quote(key), urllib_quote(val)))

  return "&".join(quoted_keyvals)
def urllib_unquote_parameters(inputstring):
  """
  <Purpose>
    Decode a urlencoded query string or POST body.

  <Arguments>
    inputstring:
      The string to decode.

  <Exceptions>
    TypeError if the inputstring isn't a string.

    ValueError if the inputstring is poorly formed.

  <Side Effects>
    None.

  <Returns>
    A dictionary mapping keys to values.
  """

  if type(inputstring) is not str:
    raise TypeError("urllib_unquote_parameters' inputstring parameter must be a string, not '"+str(type(inputstring))+"'")

  keyvalpairs = inputstring.split("&")
  res = {}

  for quotedkeyval in keyvalpairs:
    # Throw ValueError if there is more or less than one '='.
    quotedkey, quotedval = quotedkeyval.split("=")
    key = urllib_unquote_plus(quotedkey)
    val = urllib_unquote_plus(quotedval)
    res[key] = val

  return res
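
# Illustrative sketch (not called anywhere): round-tripping a form dictionary
# through the quote/unquote pair. The sample data is an assumption chosen for
# this example.
def _example_urllib_parameters():
  formdict = {"q": "repy http", "lang": "en"}
  encoded = urllib_quote_parameters(formdict)   # e.g. "q=repy%20http&lang=en"
  assert urllib_unquote_parameters(encoded) == formdict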
#end include urllib.repy


class HttpConnectionError(Exception):
  """
  Error indicating that the web server has unexpectedly dropped the
  connection.
  """


class HttpBrokenServerError(Exception):
  """
  Error indicating that the web server has sent us complete garbage instead
  of something resembling HTTP.
  """
def httpretrieve_open(url, querydata=None, postdata=None, \
    httpheaders=None, proxy=None, timeout=None):
  """
  <Purpose>
    Returns a file-like object that can be used to read the content from
    an HTTP server. Follows 3xx redirects.

  <Arguments>
    url:
      The URL to perform a GET or POST request on.

    postdata (optional):
      A dictionary of form data or a string to POST to the server.
      Passing a non-None value results in a POST request being sent
      to the server.

    querydata (optional):
      A dictionary of form data or a string to send as the query
      string to the server.

      If postdata is omitted, the URL is retrieved with GET. If
      both postdata and querydata are omitted, there is no query
      string sent in the request.

      For both querydata and postdata, strings are sent *unmodified*.
      This means you probably should encode them first, with
      urllib_quote().

    httpheaders (optional):
      A dictionary of supplemental HTTP request headers to add to the
      request.

    proxy (optional):
      A proxy server 2-tuple to bind to: ('host', port).

    timeout (optional):
      A timeout for establishing a connection to the web server,
      sending headers, and reading the response headers.
      If excluded or None, never times out.

  <Exceptions>
    ValueError if given an invalid URL, or malformed limit or timeout
    values. This is also raised if the user attempts to call a method
    on the file-like object after closing it.

    HttpConnectionError if opening the connection fails, or if the
    connection is closed by the server before we expect.

    SocketTimeoutError if the timeout is exceeded.

    HttpBrokenServerError if the response or the Location response header
    is malformed.

  <Side Effects>
    None

  <Returns>
    Returns a file-like object which can be used to read the body of
    the response from the web server. The protocol version spoken by the
    server, status code, and response headers are available as members of
    the object.
  """

  starttimefloat = getruntime()

  # Check if the URL is valid and get host, path, port and query
  parsedurldict = urlparse_urlsplit(url)
  hoststr = parsedurldict['hostname']
  pathstr = parsedurldict['path']
  portint = parsedurldict.get('port')
  portint = portint or 80

  if parsedurldict['scheme'] != 'http':
    raise ValueError("URL doesn't seem to be for the HTTP protocol.")
  if hoststr is None:
    raise ValueError("Missing hostname.")

  # A check that rejected URLs containing a query string used to live here
  # (it tested parsedurldict['query']); it was removed because it was broken.
  # Typical HTTP sessions consist of (optionally, a series of pairs of) HTTP
  # requests followed by HTTP responses. These happen serially.

  # JAC: Set this up so that we can raise the right error if the
  # timeout_openconn doesn't work.
  sockobj = None

  # Open connection to the web server
  try:
    if proxy is not None:
      # If there is a proxy, open a connection with the proxy instead of the
      # actual server, using the timeout we were given (or None).
      sockobj = timeout_openconn(proxy[0], proxy[1], timeout=timeout)
    else:
      # If there is no proxy, open a connection with the server directly,
      # using the timeout we were given (or None).
      sockobj = timeout_openconn(hoststr, portint, timeout=timeout)
  except Exception, e:
    # If a socket object was created, we want to clean it up.
    if sockobj:
      sockobj.close()
    if repr(e).startswith("timeout("):
      raise HttpConnectionError("Socket timed out connecting to host/port.")
    raise
  try:
    # Build the HTTP request:
    httprequeststr = _httpretrieve_build_request(hoststr, portint, pathstr, \
        querydata, postdata, httpheaders, proxy)

    # Send the full HTTP request to the web server.
    _httpretrieve_sendall(sockobj, httprequeststr)

    # Now, we're done with the HTTP request part of the session, and we need
    # to get the HTTP response.

    # Check if we've timed out (if the user requested a timeout); update the
    # socket timeout to reflect the time taken sending the request.
    if timeout is None:
      sockobj.settimeout(0)
    elif getruntime() - starttimefloat >= timeout:
      raise SocketTimeoutError("Timed out")
    else:
      sockobj.settimeout(timeout - (getruntime() - starttimefloat))

    # Receive the header lines from the web server (a series of CRLF-terminated
    # lines, terminated by an empty line, or by the server closing the
    # connection).
    headersstr = ""
    while not headersstr.endswith("\r\n\r\n"):
      try:
        # This should probably be replaced with page-sized reads in the future,
        # but for now, the behavior is at least correct.
        headersstr += sockobj.recv(1)
      except Exception, e:
        if str(e) == "Socket closed":
          break
        else:
          raise

    httpheaderlist = headersstr.split("\r\n")

    # Ignore (a) trailing blank line(s) (for example, the response header-
    # terminating blank line).
    while len(httpheaderlist) > 0 and httpheaderlist[-1] == "":
      httpheaderlist = httpheaderlist[:-1]

    # Get the status code and status message from the HTTP response.
    statuslinestr, httpheaderlist = httpheaderlist[0], httpheaderlist[1:]

    # The status line should be in the form: "HTTP/1.X NNN SSSSS", where
    # X is 0 or 1, NNN is a 3-digit status code, and SSSSS is a 'user-friendly'
    # string representation of the status code (may contain spaces).
    statuslinelist = statuslinestr.split(' ', 2)
    if len(statuslinelist) < 3:
      raise HttpBrokenServerError("Server returned garbage for HTTP " + \
          "response (status line missing one or more fields).")
    if not statuslinelist[0].startswith('HTTP'):
      raise HttpBrokenServerError("Server returned garbage for HTTP " + \
          "response (invalid response protocol in status line).")

    friendlystatusstr = statuslinelist[2]
    try:
      statusint = int(statuslinelist[1])
    except ValueError, e:
      raise HttpBrokenServerError("Server returned garbage for HTTP " + \
          "response (status code isn't integer).")

    httpheaderdict = _httpretrieve_parse_responseheaders(httpheaderlist)

    # If we got any sort of redirect response, follow the redirect. Note: we
    # do *not* handle the 305 status code (use the proxy as specified in the
    # Location header) at all; I think this is best handled at a higher layer
    # anyway.
    if statusint in (301, 302, 303, 307):
      sockobj.close()
      try:
        redirecturlstr = httpheaderdict["Location"][0]
      except (KeyError, IndexError), ke:
        # When a server returns a redirect status code (3xx) but no Location
        # header, some clients, e.g. Firefox, just show the response body
        # as they would normally for a 2xx or 4xx response. So, I think we
        # should ignore a missing Location header and just return the page
        # to the caller.
        pass
      else:
        # If the server did send a redirect location, let's go there.
        return httpretrieve_open(redirecturlstr)

    # If we weren't requested to redirect, and we didn't, return a read-only
    # file-like object (representing the response body) to the caller.
    return _httpretrieve_filelikeobject(sockobj, httpheaderdict, \
        (statuslinelist[0], statusint, friendlystatusstr))

  except:
    # If any exception occurred after the socket was open, we want to make
    # sure that the socket is cleaned up if it is still open before we
    # raise the exception.
    if sockobj:
      sockobj.close()
    raise
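
# Illustrative sketch (not called anywhere): a typical httpretrieve_open()
# session, checking the status tuple before reading the body. The URL is an
# assumption chosen for this example.
def _example_httpretrieve_open():
  httpobj = httpretrieve_open("http://example.com/", timeout=10)
  try:
    protocolstr, statusint, friendlystr = httpobj.httpstatus
    if statusint == 200:
      bodystr = httpobj.read()
    else:
      bodystr = ""
    # Response headers map names to lists of values, e.g.
    # httpobj.headers.get("Content-Type") might be ["text/html"].
  finally:
    httpobj.close()
  return bodystr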
def httpretrieve_save_file(url, filename, querydata=None, postdata=None, \
    httpheaders=None, proxy=None, timeout=None):
  """
  <Purpose>
    Perform an HTTP request, and save the content of the response to a
    file.

  <Arguments>
    filename:
      The file name to save the response to.

    Other arguments:
      See documentation for httpretrieve_open().

  <Exceptions>
    This function will raise any exception raised by Repy file objects
    in opening, writing to, and closing the file.

    This function will also raise any exception raised by
    httpretrieve_open(), for the same reasons.

  <Side Effects>
    Writes the body of the response to 'filename'.

  <Returns>
    None
  """

  # Open the output file object and http file-like object.
  outfileobj = open(filename, 'w')
  httpobj = httpretrieve_open(url, querydata=querydata, postdata=postdata, \
      httpheaders=httpheaders, proxy=proxy, timeout=timeout)

  # Repeatedly read from the file-like HTTP object into our file, until the
  # response is finished.
  responsechunkstr = None
  while responsechunkstr != '':
    responsechunkstr = httpobj.read(4096)
    outfileobj.write(responsechunkstr)

  outfileobj.close()
  httpobj.close()
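
# Illustrative sketch (not called anywhere): fetching a page straight to disk.
# The URL and filename are assumptions chosen for this example.
def _example_httpretrieve_save_file():
  httpretrieve_save_file("http://example.com/", "example.html", timeout=10)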
def httpretrieve_get_string(url, querydata=None, postdata=None, \
    httpheaders=None, proxy=None, timeout=30):
  """
  <Purpose>
    Performs an HTTP request on the given URL, using POST or GET,
    returning the content of the response as a string. Uses
    httpretrieve_open.

  <Arguments>
    See httpretrieve_open.

  <Exceptions>
    See httpretrieve_open.

  <Side Effects>
    None.

  <Returns>
    Returns the body of the HTTP response (no headers).
  """

  # Open a read-only file-like object for the HTTP request.
  httpobj = httpretrieve_open(url, querydata=querydata, postdata=postdata, \
      httpheaders=httpheaders, proxy=proxy, timeout=timeout)

  # Read all of the response and return it.
  try:
    return httpobj.read()
  finally:
    httpobj.close()
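
# Illustrative sketch (not called anywhere): GET with a query string versus
# POST with a form body. The URL and form data are assumptions chosen for
# this example.
def _example_httpretrieve_get_string():
  # Retrieves http://example.com/search?q=repy with GET.
  pagestr = httpretrieve_get_string("http://example.com/search",
      querydata={"q": "repy"}, timeout=10)
  # Sends the same data as a POST form body instead.
  poststr = httpretrieve_get_string("http://example.com/search",
      postdata={"q": "repy"}, timeout=10)
  return pagestr, poststr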
class _httpretrieve_filelikeobject:
  # This class implements a file-like object used for performing HTTP
  # requests and retrieving responses.

  def __init__(self, sock, headers, httpstatus):
    # The socket-like object connected to the HTTP server. Headers have
    # already been read.
    self._sockobj = sock

    # If this is set, the close() method has already been called, so we
    # don't accept future reads.
    self._fileobjclosed = False

    # This flag is set if we've finished receiving the entire response
    # from the server.
    self._totalcontentisreceived = False

    # This integer represents the number of bytes read so far.
    self._totalread = 0

    # This is the dictionary of HTTP response headers associated with this
    # file-like object.
    self.headers = headers

    # The HTTP status tuple of this response, e.g. ("HTTP/1.0", 200, "OK")
    self.httpstatus = httpstatus
  def read(self, limit=None, timeout=None):
    """
    <Purpose>
      Behaves like Python's file.read(), with the potential to raise
      additional informative exceptions.

    <Arguments>
      limit (optional):
        The maximum amount of data to read. If omitted or None, this
        reads all available data.

      timeout (optional):
        A timeout for the underlying socket reads. If omitted or None,
        never times out.

    <Exceptions>
      See file.read()'s documentation, as well as that of
      httpretrieve_open().

    <Side Effects>
      None.

    <Returns>
      See file.read().
    """
    # Raise an error if the caller has already close()d this object.
    if self._fileobjclosed:
      raise ValueError("I/O operation on closed file")

    # If we've finished reading everything we can from the server, return the
    # empty string.
    if self._totalcontentisreceived:
      return ''

    lefttoread = None
    if limit is not None:
      lefttoread = limit

      # Sanity check type/value of limit.
      if type(limit) is not int:
        raise TypeError("Expected an integer or None for read() limit")
      elif limit < 0:
        raise ValueError("Expected a non-negative integer for read() limit")

    if timeout is None:
      self._sockobj.settimeout(0)
    else:
      self._sockobj.settimeout(timeout)

    # Try to read up to limit, or until there is nothing left.
    httpcontentstr = ''
    while True:
      try:
        contentchunkstr = self._sockobj.recv(lefttoread or 4096)
      except Exception, e:
        if str(e) == "Socket closed":
          self._totalcontentisreceived = True
          break
        else:
          raise

      httpcontentstr += contentchunkstr
      self._totalread += len(contentchunkstr)

      if limit is not None:
        if len(contentchunkstr) == lefttoread:
          break
        else:
          lefttoread -= len(contentchunkstr)

      if contentchunkstr == "":
        self._totalcontentisreceived = True
        break

    return httpcontentstr
  def close(self):
    """
    <Purpose>
      Close the file-like object.

    <Arguments>
      None

    <Exceptions>
      None

    <Side Effects>
      Disconnects from the HTTP server.

    <Returns>
      Nothing
    """
    self._fileobjclosed = True
    self._sockobj.close()
def _httpserver_put_in_headerdict(res, lastheader, lastheader_str):
  # Helper function that tries to put the header into a dictionary of lists,
  # 'res'.
  if lastheader is not None:
    if lastheader not in res:
      res[lastheader] = []
    res[lastheader].append(lastheader_str.strip())
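
# Illustrative sketch (not called anywhere) for
# _httpretrieve_parse_responseheaders(), defined just below: it folds rfc822
# continuation lines and collects repeated headers into lists. The sample
# header lines are assumptions chosen for this example.
def _example_parse_responseheaders():
  headerlines = ["Foo: a", "Bar:", " b", "Bar: c"]
  assert _httpretrieve_parse_responseheaders(headerlines) == \
      {"Foo": ["a"], "Bar": ["b", "c"]}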
def _httpretrieve_parse_responseheaders(headerlines):
  # Parse rfc822-style headers (this could be abstracted out to an rfc822
  # library that would be quite useful for internet protocols). Returns
  # a dictionary mapping headers to arrays of values. E.g.:
  #
  #   Foo: a
  #   Bar:
  #     b
  #   Bar: c
  #
  # Becomes: {"Foo": ["a"], "Bar": ["b", "c"]}

  # These variables represent the key and value of the last header we found,
  # unless we are parsing the very first header. E.g., if we've just read:
  #   Content-Type: text/html
  # Then, lastheaderkeystr == "Content-Type",
  # lastheadervaluestr == "text/html"
  lastheaderkeystr = None
  lastheadervaluestr = ""

  resdict = {}

  if len(headerlines) == 0:
    return {}

  try:
    # Iterate over the request header lines:
    for i in range(len(headerlines)):
      # Lines with leading non-CRLF whitespace characters are part of the
      # previous line (see rfc822 for details).
      if headerlines[i][0] in (" ", "\t") and lastheaderkeystr is not None:
        lastheadervaluestr += headerlines[i]
      else:
        _httpserver_put_in_headerdict(resdict, lastheaderkeystr, lastheadervaluestr)
        lastheaderkeystr, lastheadervaluestr = headerlines[i].split(":", 1)

    # Add the last line to the result dictionary.
    _httpserver_put_in_headerdict(resdict, lastheaderkeystr, lastheadervaluestr)
    return resdict

  except (IndexError, ValueError), e:
    raise HttpBrokenServerError("Server returned garbage for HTTP" + \
        " response. Bad headers.")
def _httpretrieve_build_request(host, port, path, querydata, postdata, \
    httpheaders, proxy):
  # Builds an HTTP request from these parameters, returning it as
  # a string.

  # Sanity checks:
  if path == "":
    raise ValueError("Invalid path -- empty string.")
  if postdata is not None and type(postdata) not in (str, dict):
    raise TypeError("Postdata should be a dict of form-data or a string")
  if querydata is not None and type(querydata) not in (str, dict):
    raise TypeError("Querydata should be a dict of form-data or a string")
  if httpheaders is not None and type(httpheaders) is not dict:
    raise TypeError("Expected HTTP headers as a dictionary.")

  # Type-conversions:
  if type(querydata) is dict:
    querydata = urllib_quote_parameters(querydata)
  elif querydata is None:
    querydata = ""

  if type(postdata) is dict:
    postdata = urllib_quote_parameters(postdata)

  # Default to GET, unless the caller specifies a message body to send.
  methodstr = "GET"
  if postdata is not None:
    methodstr = "POST"

  # Encode the path and querystring part of the request.
  resourcestr = querydata
  if querydata != "":
    resourcestr = "?" + resourcestr

  # Encode the HTTP request line and headers:
  if proxy is not None:
    # A proxy exists, so the request line should include the original
    # requested URL.
    requeststr = methodstr + ' http://' + host + ':' + str(port) + path + resourcestr + ' HTTP/1.0\r\n'
  else:
    # There is no proxy; send a normal HTTP request.
    requeststr = methodstr + ' ' + path + resourcestr + ' HTTP/1.0\r\n'

  if httpheaders is not None:
    # Most servers require a 'Host' header for normal functionality
    # (especially in the case of multiple domains being hosted on a
    # single server).
    if "Host" not in httpheaders:
      requeststr += "Host: " + host + ':' + str(port) + "\r\n"
    for key, val in httpheaders.items():
      requeststr += key + ": " + val + '\r\n'

  # Affix post-data related headers and content:
  if methodstr == "POST":
    requeststr += 'Content-Length: ' + str(len(postdata)) + '\r\n'

  # The empty line terminates HTTP headers.
  requeststr += '\r\n'

  # If we're a POST request, affix any requested data to the message body.
  if methodstr == "POST":
    requeststr += postdata

  return requeststr
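
# Illustrative sketch (not called anywhere): the request string produced for
# a simple GET, spelled out byte for byte. The host, path, and header values
# are assumptions chosen for this example.
def _example_build_request():
  requeststr = _httpretrieve_build_request("example.com", 80, "/index.html",
      None, None, {"Accept": "text/html"}, None)
  # Expected wire format:
  #   GET /index.html HTTP/1.0\r\n
  #   Host: example.com:80\r\n
  #   Accept: text/html\r\n
  #   \r\n
  assert requeststr.startswith("GET /index.html HTTP/1.0\r\n")
  assert requeststr.endswith("\r\n\r\n")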
def _httpretrieve_sendall(sockobj, datastr):
  # Helper function that attempts to dump all of the data in datastr to the
  # socket sockobj (data is any arbitrary bytes).
  while len(datastr) > 0:
    datastr = datastr[sockobj.send(datastr):]

#end include httpretrieve.repy
if callfunc == 'initialize':
  # print "bing"
  # result = httpretrieve_get_string("http://www.bing.com")

  headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/html"}
  url = "http://www.bing.com"
  print url
  print urlparse_urlsplit(url)

  result = httpretrieve_open(url, httpheaders=headers)
  print '----->', result

  resultfile = open("index.html", "w")
  resultfile.write(result.read())
  resultfile.close()
  result.close()