Created
September 11, 2009 22:41
-
-
Save dvj/185634 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# Downloads Google Web History | |
import sys, optparse, datetime, tempfile, os, string, urllib | |
import getpass, uu | |
import httplib2,re | |
import time | |
from Cookie import SimpleCookie | |
import atom | |
#import gdata.service | |
class Scrape:
    """Fetch Google Web History pages and write their links to a file.

    Instance state:
      server   -- httplib2.Http object used for every request
      headers  -- dict of headers sent by getPage (carries the auth cookie)
      _cookies -- SimpleCookie jar used to build the login request
      f        -- output file, opened for writing at sys.argv[1]
      pagetime -- POSIX timestamp written as the header of the next page
      account, password, service, token -- set by the credential/auth methods
    """

    # Quoted value starting with 'h' that directly precedes an href attribute
    # (presumably the target URL stored in the anchor's title -- confirm
    # against the real page markup).
    _TITLE_URL_RE = re.compile(r'"h.+" hre')
    # Everything from href=" up to the last double quote of the anchor.
    _HREF_RE = re.compile(r'href=".*"')
    # Left-over "./url?url=..." redirect fragments.
    _REDIRECT_RE = re.compile(r'\.\/url\?url=".+"')

    def __init__(self):
        """Create the HTTP client and open the output file named by argv[1]."""
        self.server = httplib2.Http(".cache")
        self.headers = {}
        self._cookies = SimpleCookie()
        #self.gdata = gdata.service.GDataService(server="www.google.com")
        self.f = open(sys.argv[1], 'w')
        self.pagetime = time.mktime(datetime.datetime.now().timetuple())

    def close(self):
        """Close the output file opened by __init__."""
        self.f.close()

    def GetUserCredentials(self):
        """Prompts the user for a username and password.

        Stores them on self.account / self.password and returns the
        (email, password) tuple.
        """
        email = raw_input("Email: ")
        password = getpass.getpass("Password for %s: " % email)
        self.account = email
        self.password = password
        return (email, password)

    def authenticate(self, goog_service):
        """Remember the service name and switch to a cache-less HTTP client."""
        self.service = goog_service
        self.server = httplib2.Http()
        #self.gdata.ClientLogin(self.account, self.password, "HOSTED_OR_GOOGLE", self.service)

    def getLinkFromPage(self, content):
        """Return the relative "lookup?hl=..." link of the "Older" pager.

        Returns None when the page has no usable pager link. When the link
        ends in a run of digits, that number (taken to be microseconds) is
        stored in self.pagetime as seconds; otherwise pagetime is left
        unchanged.
        """
        text = str(content)
        # NOTE(review): non-ASCII literal -- under Python 2 the file needs a
        # PEP 263 encoding declaration; confirm the original did not use an
        # HTML entity here.
        marker = text.find(">Older ›", 0)
        if marker < 1:
            return None
        # Clamp the look-behind window: a negative start would make find()
        # count from the end of the string instead of the beginning.
        start = text.find("\"./lookup?hl=", max(marker - 50, 0))
        if start == -1:
            return None
        end = text.find("\"", start + 10)
        if end == -1:
            return None
        link = text[start + 3:end]  # drop the leading '"./'
        # Read the timestamp from the end of the link instead of a fixed
        # offset, so language codes longer than two characters still work.
        stamp = re.search(r"[0-9]+\Z", link)
        if stamp is not None:
            self.pagetime = float(stamp.group(0)) / 1000000.0
        return link

    def parsePage(self, content):
        """Write a timestamp header and every '<a title' anchor to self.f.

        Each anchor has its href (and everything up to the anchor's last
        double quote) replaced by the quoted value found by _TITLE_URL_RE.
        Every record is terminated with "<br>\\n".
        """
        print(self.pagetime)
        mytime = datetime.datetime.fromtimestamp(self.pagetime)
        self.f.write(mytime.ctime() + "<br>\n")

        cursor = 0
        while True:
            start = content.find("<a title", cursor)
            if start == -1:
                break
            end = content.find("</a>", start)
            if end == -1:
                # Truncated anchor: nothing sensible to write.
                break
            snippet = content[start:end + 4]
            match = self._TITLE_URL_RE.search(snippet)
            if match is None:
                url = ""
            else:
                url = match.group(0)[0:-4]  # strip the trailing ' hre'
            # A callable replacement keeps backslashes in the URL literal;
            # a plain string would be processed as a regex template.
            href = "href=" + url
            st = self._HREF_RE.sub(lambda _m: href, snippet)
            st = self._REDIRECT_RE.sub('', st)
            self.f.write(st + "<br>\n")
            cursor = end

    def getPage(self, uri, max_redirects=20):
        """GET uri with self.headers, following 302 responses.

        At most max_redirects hops are followed so that a redirect cycle
        cannot loop forever. Returns the final (response, content) pair.
        """
        self.headers['Content-Type'] = 'text/html'
        #self.headers['Authorization'] = 'GoogleLogin auth=%s' % self.gdata.GetAuthSubToken()
        response, content = self.server.request(uri, 'GET', headers=self.headers)
        hops = 0
        while response['status'] == '302' and hops < max_redirects:
            target = response.get('location')
            if not target:
                # A redirect without a target cannot be followed.
                break
            print("Making request to %s" % target)
            response, content = self.server.request(target, 'GET', headers=self.headers)
            hops += 1
        return response, content

    def makeAuthRequest(self, accountName, password, authUri):
        """POST the login form to authUri and return (response, content).

        Requires self.service (set by authenticate). If the request raises,
        "HTTP request failed" is printed and the exception is re-raised.
        """
        epoch = int(time.time())
        self._cookies['GMAIL_LOGIN'] = "T%s/%s/%s" % (epoch - 2, epoch - 1, epoch)
        cookie_header = self._cookies.output(attrs=[], header='').strip()
        headers = {'Content-Type': 'application/x-www-form-urlencoded',
                   'Cookie': cookie_header,
                   'Host': 'www.google.com',
                   'User-Agent': 'Mozilla/5.0 (Compatible; libgmail-python)'
                   }
        data = urllib.urlencode({
            "service": self.service,
            "Email": accountName,
            "Passwd": password,
            "source": "test-test-0.1",
            "accountType": "HOSTED_OR_GOOGLE",
            "submit": "null"
        })
        try:
            return self.server.request(authUri, 'POST', body=data, headers=headers)
        except Exception:
            # Re-raise: there is no response to hand back to the caller.
            print("HTTP request failed")
            raise

    def getAuthToken(self, accountName, password, auth_uri):
        """Log in and store the "Auth" token from a key=value response body.

        On a malformed body or a missing Auth key the user is prompted for
        new credentials and the login is retried. On success the token is
        stored in self.token and in the Cookie header; the response of the
        successful attempt is returned.
        """
        response, content = self.makeAuthRequest(accountName, password, auth_uri)
        try:
            # Split on the first '=' only: token values may contain '='.
            response_dict = dict(line.split("=", 1)
                                 for line in content.split("\n") if line)
            self.token = response_dict["Auth"]
        except (ValueError, KeyError):
            print("Authorization Failed.")
            print(response)
            (a, p) = self.GetUserCredentials()
            return self.getAuthToken(a, p, auth_uri)
        print("Got Auth Token of: " + self.token[0:20] + "...")
        self.headers['Cookie'] = "Auth=" + self.token
        return response

    def getAuthCookie(self, accountName, password, auth_uri):
        """Log in and keep the returned Set-Cookie value for later requests.

        If the response has no set-cookie header containing "Auth", a
        message is printed and the process exits with status 1. Otherwise
        the cookie string is stored in self.headers['Cookie'] and
        self.token, and returned.
        """
        response, content = self.makeAuthRequest(accountName, password, auth_uri)
        #print response
        #print content[0:150]
        if 'set-cookie' in response and "Auth" in response['set-cookie']:
            print("Authorized " + accountName)
            self.headers['Cookie'] = response['set-cookie']
        else:
            print("Could not get Auth cookie. Check login info")
            sys.exit(1)
        #print "Got Auth Cookie of: " + self.headers['Cookie'][0:20] + "..."
        self.token = self.headers['Cookie']
        return self.headers['Cookie']
def printUsage():
    """Print the one-line command-line usage message to stdout."""
    print("Usage: googleWebHistory.py <out_file>")
def main():
    """Log in, walk the history pages from newest to oldest and save them.

    Requires the output file name as the first command-line argument;
    prints the usage text and exits with status 1 when it is missing.
    Each fetched page is written exactly once, before the pager link
    (which updates the scraper's page timestamp) is followed. When a fetch
    lands on a login page the session cookie is renewed and the page named
    by the "continue" parameter is requested again. The output file is
    closed even if scraping fails part-way.
    """
    if len(sys.argv) < 2:
        printUsage()
        sys.exit(1)

    history_uri = 'http://www.google.com/history/'
    #cookieAuthUri = 'https://www.google.com/accounts/ServiceLoginBoxAuth'
    cookieAuthUri = 'https://www.google.com/accounts/ServiceLoginAuth'
    service = 'hist'

    sc = Scrape()
    try:
        sc.GetUserCredentials()
        sc.authenticate(service)
        sc.getAuthCookie(sc.account, sc.password, cookieAuthUri)

        uri = history_uri
        resp, content = sc.getPage(uri)
        print(resp)
        while uri:
            # The header may be absent; treat that as "not a login page".
            location = resp.get('content-location', '')
            if location.find("Login", 0) > 0:
                print("Getting auth from " + location)
                p1 = location.find("continue=", 0)
                if p1 != -1:
                    p2 = location.find("&", p1)
                    if p2 == -1:
                        # "continue" is the last parameter: take the rest.
                        p2 = len(location)
                    uri = urllib.unquote(location[p1 + 9:p2])
                sc.getAuthCookie(sc.account, sc.password, cookieAuthUri)
            else:
                # Write the page first: getLinkFromPage moves the timestamp
                # on to the next (older) page.
                sc.parsePage(content)
                localuri = sc.getLinkFromPage(content)
                if localuri:
                    uri = history_uri + localuri
                else:
                    uri = None
            if uri:
                resp, content = sc.getPage(uri)
    finally:
        sc.f.close()
    print("All done")
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment