Skip to content

Instantly share code, notes, and snippets.

@dvj
Created September 11, 2009 22:41
Show Gist options
  • Save dvj/185634 to your computer and use it in GitHub Desktop.
Save dvj/185634 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# Downloads Google Web History
import sys, optparse, datetime, tempfile, os, string, urllib
import getpass, uu
import httplib2,re
import time
from Cookie import SimpleCookie
import atom
#import gdata.service
class Scrape(object):
    """Fetches Google Web History pages and writes the visited links to a file.

    The instance keeps an httplib2 client (``server``), the request headers
    that carry the auth cookie (``headers``), the open output file (``f``)
    and the timestamp written as the heading of the next parsed page
    (``pagetime``, seconds since the epoch).
    """

    # Upper bound on the 302 responses getPage follows by hand, so that a
    # redirect cycle cannot loop forever.
    MAX_REDIRECTS = 20

    def __init__(self):
        """Create the HTTP client and open sys.argv[1] for writing."""
        self.server = httplib2.Http(".cache")
        self.headers = {}
        self._cookies = SimpleCookie()
        self.f = open(sys.argv[1], 'w')
        # Until a paging link supplies a timestamp, pages are headed "now".
        self.pagetime = time.mktime(datetime.datetime.now().timetuple())

    def close(self):
        """Close the output file opened by __init__."""
        self.f.close()

    def GetUserCredentials(self):
        """Prompts the user for a username and password.

        Stores them as ``self.account`` / ``self.password`` and returns the
        pair ``(email, password)``.
        """
        email = raw_input("Email: ")
        password = getpass.getpass("Password for %s: " % email)
        self.account = email
        self.password = password
        return (email, password)

    def authenticate(self, goog_service):
        """Remember the service name and replace the HTTP client.

        Note that the new client is created without the ".cache" argument
        used in __init__.
        """
        self.service = goog_service
        self.server = httplib2.Http()

    def getLinkFromPage(self, content):
        """Return the relative link to the next ("Older") page, or None.

        Looks for the ">Older" marker and then for the quoted
        ``"./lookup?hl=...`` attribute that starts at most 50 characters
        before it.  When the link carries a ``max=<digits>`` parameter,
        ``self.pagetime`` is updated to that value divided by 1,000,000;
        otherwise ``self.pagetime`` is left unchanged.
        """
        text = str(content)
        # NOTE: this marker contains a non-ASCII character, so Python 2 only
        # accepts the file with a source-encoding declaration (PEP 263).
        # NOTE(review): the served HTML may spell the arrow as an entity
        # instead - confirm against a real page.
        marker = text.find(">Older ›", 0)
        if marker < 1:
            return None
        # Clamp the start: a negative start would be counted from the end of
        # the string and miss a link near the top of the page.
        start = text.find("\"./lookup?hl=", max(0, marker - 50))
        if start == -1:
            return None
        end = text.find("\"", start + 10)
        if end == -1:
            return None
        link = text[start + 3:end]  # drop the leading '"./'
        stamp = re.search(r"max=(\d+)", link)
        if stamp:
            self.pagetime = float(stamp.group(1)) / 1000000.0
        return link

    def parsePage(self, content):
        """Write the page heading and every ``<a title ...>...</a>`` link to self.f.

        The heading is ``self.pagetime`` formatted with ctime().  For each
        anchor the ``href`` attribute is replaced by the quoted value that
        starts with ``"h`` and precedes `` href`` (the title attribute).
        Scanning stops at an anchor that has no closing ``</a>``.
        """
        print(self.pagetime)
        mytime = datetime.datetime.fromtimestamp(self.pagetime)
        self.f.write(mytime.ctime() + "<br>\n")
        pos = content.find("<a title")
        while pos != -1:
            end = content.find("</a>", pos)
            if end == -1:
                break
            anchor = content[pos:end + 4]
            urlm = re.search(r'"h.+" hre', anchor)
            # group(0) ends with ' hre'; strip it to keep the quoted URL.
            url = urlm.group(0)[0:-4] if urlm else ""
            # A callable replacement inserts the URL verbatim; as a plain
            # string, backslashes in it would be read as template escapes.
            st = re.sub(r'href=".*"', lambda _match: "href=" + url, anchor)
            st = re.sub(r'\.\/url\?url=".+"', '', st)
            self.f.write(st + "<br>\n")
            pos = content.find("<a title", end)

    def getPage(self, uri):
        """GET ``uri`` with the stored headers and return (response, content).

        Responses with status '302' are followed by hand through their
        'location' header, at most MAX_REDIRECTS times.
        """
        self.headers['Content-Type'] = 'text/html'
        response, content = self.server.request(uri, 'GET', headers=self.headers)
        redirects = 0
        while response['status'] == '302' and redirects < self.MAX_REDIRECTS:
            print("Making request to %s" % response['location'])
            response, content = self.server.request(
                response['location'], 'GET', headers=self.headers)
            redirects += 1
        return response, content

    def makeAuthRequest(self, accountName, password, authUri):
        """POST the login form to ``authUri`` and return (response, content).

        A failed request is reported on stdout and the exception is
        re-raised.
        """
        epoch = int(time.time())
        self._cookies['GMAIL_LOGIN'] = "T%s/%s/%s" % (epoch - 2, epoch - 1, epoch)
        cookie_header = self._cookies.output(attrs=[], header='').strip()
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cookie': cookie_header,
            'Host': 'www.google.com',
            'User-Agent': 'Mozilla/5.0 (Compatible; libgmail-python)',
        }
        data = urllib.urlencode({
            "service": self.service,
            "Email": accountName,
            "Passwd": password,
            "source": "test-test-0.1",
            "accountType": "HOSTED_OR_GOOGLE",
            "submit": "null",
        })
        try:
            return self.server.request(authUri, 'POST', body=data, headers=headers)
        except Exception:
            print("HTTP request failed")
            raise

    def _parseAuthResponse(self, content):
        """Parse newline-separated ``key=value`` lines into a dict.

        Only the first '=' separates key from value, so values may contain
        '=' themselves; lines without '=' are skipped.
        """
        fields = {}
        for line in content.split("\n"):
            key, sep, value = line.partition("=")
            if sep:
                fields[key] = value
        return fields

    def getAuthToken(self, accountName, password, auth_uri):
        """Obtain the "Auth" token, re-prompting until one is returned.

        Stores the token in ``self.token`` and as the ``Auth=`` cookie in
        ``self.headers``; returns the response of the successful request.
        """
        response, content = self.makeAuthRequest(accountName, password, auth_uri)
        token = self._parseAuthResponse(content).get("Auth")
        if token is None:
            print("Authorization Failed.")
            print(response)
            (a, p) = self.GetUserCredentials()
            return self.getAuthToken(a, p, auth_uri)
        self.token = token
        print("Got Auth Token of: " + self.token[0:20] + "...")
        self.headers['Cookie'] = "Auth=" + self.token
        return response

    def getAuthCookie(self, accountName, password, auth_uri):
        """Log in and store the returned 'set-cookie' header for later requests.

        Returns the cookie string.  When the response has no 'set-cookie'
        header containing "Auth", prints a message and exits with status 1.
        """
        response, content = self.makeAuthRequest(accountName, password, auth_uri)
        cookie = response['set-cookie'] if 'set-cookie' in response else None
        if cookie is None or cookie.find("Auth", 0) == -1:
            print("Could not get Auth cookie. Check login info")
            sys.exit(1)
        print("Authorized " + accountName)
        self.headers['Cookie'] = cookie
        self.token = cookie
        return cookie
def printUsage():
    """Print the one-line command-line usage message to stdout."""
    # Single-argument parenthesised form prints the same text on Python 2
    # and also parses on Python 3.
    print("Usage: googleWebHistory.py <out_file>")
def main():
    """Log in, walk the Web History pages and write their links to sys.argv[1].

    Exits with status 1 after printing the usage message when no output
    file is given.  The output file is closed on every exit path.
    """
    if len(sys.argv) < 2:
        printUsage()
        sys.exit(1)
    base_uri = 'http://www.google.com/history/'
    cookieAuthUri = 'https://www.google.com/accounts/ServiceLoginAuth'
    service = 'hist'
    sc = Scrape()
    try:
        sc.GetUserCredentials()
        sc.authenticate(service)
        sc.getAuthCookie(sc.account, sc.password, cookieAuthUri)
        uri = base_uri
        # NOTE(review): the first page is fetched and parsed here and then
        # again on the first loop iteration, so its links appear twice in
        # the output - confirm whether that is intended.
        resp, content = sc.getPage(uri)
        print(resp)
        sc.parsePage(content)
        while uri:
            resp, content = sc.getPage(uri)
            # A response without 'content-location' is treated as an
            # ordinary history page.
            location = resp['content-location'] if 'content-location' in resp else ''
            if location.find("Login", 0) > 0:
                # Bounced to the login page: re-authenticate and retry the
                # address carried in the continue= parameter.
                print("Getting auth from " + location)
                p1 = location.find("continue=", 0)
                if p1 != -1:
                    p2 = location.find("&", p1)
                    if p2 == -1:
                        # continue= is the last parameter; keep its final
                        # character instead of slicing it off.
                        p2 = len(location)
                    uri = urllib.unquote(location[p1 + 9:p2])
                sc.getAuthCookie(sc.account, sc.password, cookieAuthUri)
            else:
                localuri = sc.getLinkFromPage(content)
                # No "Older" link means this was the last page.
                uri = base_uri + localuri if localuri else None
            sc.parsePage(content)
    finally:
        sc.f.close()
    print("All done")
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment