dvj · September 11, 2009 22:41
diff --git a/googleWebHistory.py b/googleWebHistory.py
 #!/usr/bin/python
 # Downloads Google Web History
 import sys, optparse, datetime, tempfile, os, string, urllib
 import getpass, uu
 import httplib2,re
 import time
 from Cookie import SimpleCookie

 import atom
 #import gdata.service

 class Scrape:
     """Scraper that logs in to Google and saves Web History links to a file.

     State kept on the instance:
       server    -- httplib2.Http client used for every request
       headers   -- headers sent by getPage(); the auth cookie is stored here
       _cookies  -- SimpleCookie jar used to build the login request
       f         -- output file, opened for writing from sys.argv[1]
       pagetime  -- POSIX timestamp written as the heading by parsePage()
       account, password -- set by GetUserCredentials()
       service   -- set by authenticate()
       token     -- set by getAuthToken() / getAuthCookie()
     """

     # Anchor text of the pagination link used to reach the next page.
     _OLDER_MARKER = ">Older&nbsp;&#8250;"
     # Opening quote plus the start of that pagination link's href value.
     _LOOKUP_MARKER = "\"./lookup?hl="
     # Quoted URL found in an entry's title attribute, up to the next " hre".
     _TITLE_URL_RE = re.compile(r'"h.+" hre')
     # The entry's own href attribute (greedy: runs to the last quote).
     _HREF_RE = re.compile(r'href=".*"')
     # Relative redirect fragment that is stripped from the written entry.
     _REDIRECT_RE = re.compile(r'\./url\?url=".+"')

     def __init__(self):
         """Create the HTTP client and open sys.argv[1] as the output file."""
         self.server = httplib2.Http(".cache")
         self.headers = {}
         self._cookies = SimpleCookie()
         self.f = open(sys.argv[1], 'w')
         # Until a pagination link is parsed, pages are headed with "now".
         self.pagetime = time.mktime(datetime.datetime.now().timetuple())

     def GetUserCredentials(self):
         """Prompts the user for a username and password.

         Stores them as self.account / self.password and returns the
         (email, password) tuple.
         """
         email = raw_input("Email: ")
         password = getpass.getpass("Password for %s: " % email)
         self.account = email
         self.password = password
         return (email, password)

     def authenticate(self, goog_service):
         """Remember the service name and switch to a cache-less HTTP client.

         No request is sent here; the login itself is performed by
         getAuthCookie() or getAuthToken().
         """
         self.service = goog_service
         self.server = httplib2.Http()

     def getLinkFromPage(self, content):
         """Return the relative link to the next page, or None.

         Looks for the "Older" pagination marker and for the quoted
         ./lookup?hl= href that starts at most 50 characters before it.
         When the link carries a max=<digits> parameter, self.pagetime is
         set to that number divided by 1,000,000 (presumably microseconds
         converted to seconds -- confirm against the live page).

         Returns the href without its leading "./", or None when the marker
         or the link cannot be found; self.pagetime is then left unchanged.
         """
         text = str(content)
         marker_at = text.find(self._OLDER_MARKER)
         if marker_at < 1:
             return None
         # Clamp at 0: a negative start would make find() count from the end.
         start = text.find(self._LOOKUP_MARKER, max(marker_at - 50, 0))
         if start < 0:
             return None
         end = text.find("\"", start + 10)
         if end < 0:
             return None
         # Skip the opening quote and the "./" prefix.
         link = content[start + 3:end]
         # Read the timestamp by name instead of by a fixed character offset.
         stamp = re.search(r"max=(\d+)", link)
         if stamp:
             self.pagetime = float(stamp.group(1)) / 1000000.0
         return link

     def parsePage(self, content):
         """Write a time heading and every '<a title' entry of content to self.f.

         The heading is self.pagetime formatted with ctime(). For each
         entry the href attribute is replaced by the quoted URL taken from
         the title attribute, and the entry is written followed by "<br>".
         """
         print(self.pagetime)
         heading = datetime.datetime.fromtimestamp(self.pagetime)
         self.f.write(heading.ctime() + "<br>\n")
         cursor = 0
         while True:
             start = content.find("<a title", cursor)
             if start < 0:
                 break
             close = content.find("</a>", start)
             if close < 0:
                 # Unterminated anchor: nothing sensible left to extract.
                 break
             anchor = content[start:close + 4]
             found = self._TITLE_URL_RE.search(anchor)
             if found:
                 # Drop the trailing ' hre' that the pattern consumed.
                 url = found.group(0)[0:-4]
             else:
                 url = ""
             # A callable replacement keeps backslashes in the scraped URL
             # from being interpreted as regex group references.
             entry = self._HREF_RE.sub(lambda m: "href=" + url, anchor)
             entry = self._REDIRECT_RE.sub('', entry)
             self.f.write(entry + "<br>\n")
             cursor = close

     def getPage(self, uri, max_redirects=20):
         """GET uri with the stored headers and return (response, content).

         Responses whose status is '302' are followed through their
         'location' header, at most max_redirects times, so a redirect loop
         cannot hang the scraper. If the limit is hit the last 302 response
         is returned as-is.
         """
         self.headers['Content-Type'] = 'text/html'
         response, content = self.server.request(uri, 'GET', headers=self.headers)
         hops = 0
         while response['status'] == '302' and hops < max_redirects:
             target = response.get('location')
             if not target:
                 break
             print("Making request to %s" % target)
             response, content = self.server.request(target, 'GET', headers=self.headers)
             hops += 1
         return response, content

     def makeAuthRequest(self, accountName, password, authUri):
         """POST the login form to authUri and return (response, content).

         Sends a GMAIL_LOGIN cookie built from the current time together
         with the url-encoded credentials and self.service. If the request
         itself fails, "HTTP request failed" is printed and the original
         exception is re-raised.
         """
         epoch = int(time.time())
         self._cookies['GMAIL_LOGIN'] = "T%s/%s/%s" % (epoch - 2, epoch - 1, epoch)
         cookie_header = self._cookies.output(attrs=[], header='').strip()
         headers = {
             'Content-Type': 'application/x-www-form-urlencoded',
             'Cookie': cookie_header,
             'Host': 'www.google.com',
             'User-Agent': 'Mozilla/5.0 (Compatible; libgmail-python)',
         }
         data = urllib.urlencode({
             "service": self.service,
             "Email": accountName,
             "Passwd": password,
             "source": "test-test-0.1",
             "accountType": "HOSTED_OR_GOOGLE",
             "submit": "null",
         })
         try:
             response, content = self.server.request(authUri, 'POST', body=data, headers=headers)
         except Exception:
             print("HTTP request failed")
             # Propagate the real error; there is no response to return.
             raise
         return response, content

     def getAuthToken(self, accountName, password, auth_uri):
         """Log in and store the "Auth" value of the response as self.token.

         The response body is read as key=value lines. While it has no
         "Auth" entry the user is prompted for new credentials and the
         login is retried. On success the token is also placed in the
         Cookie header used by getPage().

         Returns the response of the successful login request.
         """
         while True:
             response, content = self.makeAuthRequest(accountName, password, auth_uri)
             # Split on the first "=" only, so a value may itself contain "=".
             fields = dict(line.split("=", 1)
                           for line in content.split("\n") if "=" in line)
             if "Auth" in fields:
                 self.token = fields["Auth"]
                 break
             print("Authorization Failed.")
             print(response)
             accountName, password = self.GetUserCredentials()
         print("Got Auth Token of: " + self.token[0:20] + "...")
         self.headers['Cookie'] = "Auth=" + self.token
         return response

     def getAuthCookie(self, accountName, password, auth_uri):
         """Log in and reuse the returned set-cookie header as the Cookie header.

         The login counts as successful when the response has a
         'set-cookie' header containing "Auth"; otherwise a message is
         printed and the program exits via sys.exit().

         Returns the cookie string, which is also stored as self.token.
         """
         response, _ = self.makeAuthRequest(accountName, password, auth_uri)
         if 'set-cookie' in response and response['set-cookie'].find("Auth", 0) != -1:
             print("Authorized " + accountName)
             self.headers['Cookie'] = response['set-cookie']
         else:
             print("Could not get Auth cookie. Check login info")
             sys.exit()
         self.token = self.headers['Cookie']
         return self.headers['Cookie']

 def printUsage():
     """Print the one-line command-line usage summary to standard output."""
     usage = "Usage: googleWebHistory.py <out_file>"
     print(usage)

 def main():
     """Log in to Google and dump every Web History page to the output file.

     Requires the output path as the first command-line argument (it is
     opened by Scrape). Prompts for credentials, obtains the auth cookie
     and then walks the history pages by following the link returned by
     Scrape.getLinkFromPage() until there is none left.
     """
     if len(sys.argv) < 2:
         printUsage()
         sys.exit()
     history_uri = 'http://www.google.com/history/'
     cookieAuthUri = 'https://www.google.com/accounts/ServiceLoginAuth'
     service = 'hist'
     sc = Scrape()
     try:
         sc.GetUserCredentials()
         sc.authenticate(service)
         sc.getAuthCookie(sc.account, sc.password, cookieAuthUri)
         uri = history_uri
         first_page = True
         while uri:
             resp, content = sc.getPage(uri)
             if first_page:
                 print(resp)
                 first_page = False
             location = resp.get('content-location', '')
             if location.find("Login", 0) > 0:
                 # We were bounced to the login page: authenticate again and
                 # retry the page named by the "continue" parameter. The
                 # login page itself holds no history, so it is not parsed.
                 print("Getting auth from " + location)
                 p1 = location.find("continue=", 0)
                 if p1 >= 0:
                     p2 = location.find("&", p1)
                     if p2 < 0:
                         p2 = len(location)
                     uri = urllib.unquote(location[p1 + 9:p2])
                 sc.getAuthCookie(sc.account, sc.password, cookieAuthUri)
                 continue
             # Write the page before reading its pagination link, because
             # getLinkFromPage() moves sc.pagetime on to the next page.
             sc.parsePage(content)
             localuri = sc.getLinkFromPage(content)
             if localuri is None:
                 uri = None
             else:
                 uri = history_uri + localuri
     finally:
         # Flush and release the output file even if scraping is aborted.
         sc.f.close()
     print("All done")


 # Script entry point. The call is made at module level (there is no
 # __name__ guard), so importing this file also starts the scraper.
 main()
	#!/usr/bin/python
	# Downloads Google Web History
	import sys, optparse, datetime, tempfile, os, string, urllib
	import getpass, uu
	import httplib2,re
	import time
	from Cookie import SimpleCookie

	import atom
	#import gdata.service

	class Scrape:
	    """Scraper that logs in to Google and saves Web History links to a file.

	    State kept on the instance:
	      server    -- httplib2.Http client used for every request
	      headers   -- headers sent by getPage(); the auth cookie is stored here
	      _cookies  -- SimpleCookie jar used to build the login request
	      f         -- output file, opened for writing from sys.argv[1]
	      pagetime  -- POSIX timestamp written as the heading by parsePage()
	      account, password -- set by GetUserCredentials()
	      service   -- set by authenticate()
	      token     -- set by getAuthToken() / getAuthCookie()
	    """

	    # Anchor text of the pagination link used to reach the next page,
	    # spelled with HTML entities exactly as it appears in the page source.
	    _OLDER_MARKER = ">Older&nbsp;&#8250;"
	    # Opening quote plus the start of that pagination link's href value.
	    _LOOKUP_MARKER = "\"./lookup?hl="
	    # Quoted URL found in an entry's title attribute, up to the next " hre".
	    _TITLE_URL_RE = re.compile(r'"h.+" hre')
	    # The entry's own href attribute (greedy: runs to the last quote).
	    _HREF_RE = re.compile(r'href=".*"')
	    # Relative redirect fragment that is stripped from the written entry.
	    _REDIRECT_RE = re.compile(r'\./url\?url=".+"')

	    def __init__(self):
	        """Create the HTTP client and open sys.argv[1] as the output file."""
	        self.server = httplib2.Http(".cache")
	        self.headers = {}
	        self._cookies = SimpleCookie()
	        self.f = open(sys.argv[1], 'w')
	        # Until a pagination link is parsed, pages are headed with "now".
	        self.pagetime = time.mktime(datetime.datetime.now().timetuple())

	    def GetUserCredentials(self):
	        """Prompts the user for a username and password.

	        Stores them as self.account / self.password and returns the
	        (email, password) tuple.
	        """
	        email = raw_input("Email: ")
	        password = getpass.getpass("Password for %s: " % email)
	        self.account = email
	        self.password = password
	        return (email, password)

	    def authenticate(self, goog_service):
	        """Remember the service name and switch to a cache-less HTTP client.

	        No request is sent here; the login itself is performed by
	        getAuthCookie() or getAuthToken().
	        """
	        self.service = goog_service
	        self.server = httplib2.Http()

	    def getLinkFromPage(self, content):
	        """Return the relative link to the next page, or None.

	        Looks for the "Older" pagination marker and for the quoted
	        ./lookup?hl= href that starts at most 50 characters before it.
	        When the link carries a max=<digits> parameter, self.pagetime is
	        set to that number divided by 1,000,000 (presumably microseconds
	        converted to seconds -- confirm against the live page).

	        Returns the href without its leading "./", or None when the marker
	        or the link cannot be found; self.pagetime is then left unchanged.
	        """
	        text = str(content)
	        marker_at = text.find(self._OLDER_MARKER)
	        if marker_at < 1:
	            return None
	        # Clamp at 0: a negative start would make find() count from the end.
	        start = text.find(self._LOOKUP_MARKER, max(marker_at - 50, 0))
	        if start < 0:
	            return None
	        end = text.find("\"", start + 10)
	        if end < 0:
	            return None
	        # Skip the opening quote and the "./" prefix.
	        link = content[start + 3:end]
	        # Read the timestamp by name instead of by a fixed character offset.
	        stamp = re.search(r"max=(\d+)", link)
	        if stamp:
	            self.pagetime = float(stamp.group(1)) / 1000000.0
	        return link

	    def parsePage(self, content):
	        """Write a time heading and every '<a title' entry of content to self.f.

	        The heading is self.pagetime formatted with ctime(). For each
	        entry the href attribute is replaced by the quoted URL taken from
	        the title attribute, and the entry is written followed by "<br>".
	        """
	        print(self.pagetime)
	        heading = datetime.datetime.fromtimestamp(self.pagetime)
	        self.f.write(heading.ctime() + "<br>\n")
	        cursor = 0
	        while True:
	            start = content.find("<a title", cursor)
	            if start < 0:
	                break
	            close = content.find("</a>", start)
	            if close < 0:
	                # Unterminated anchor: nothing sensible left to extract.
	                break
	            anchor = content[start:close + 4]
	            found = self._TITLE_URL_RE.search(anchor)
	            if found:
	                # Drop the trailing ' hre' that the pattern consumed.
	                url = found.group(0)[0:-4]
	            else:
	                url = ""
	            # A callable replacement keeps backslashes in the scraped URL
	            # from being interpreted as regex group references.
	            entry = self._HREF_RE.sub(lambda m: "href=" + url, anchor)
	            entry = self._REDIRECT_RE.sub('', entry)
	            self.f.write(entry + "<br>\n")
	            cursor = close

	    def getPage(self, uri, max_redirects=20):
	        """GET uri with the stored headers and return (response, content).

	        Responses whose status is '302' are followed through their
	        'location' header, at most max_redirects times, so a redirect loop
	        cannot hang the scraper. If the limit is hit the last 302 response
	        is returned as-is.
	        """
	        self.headers['Content-Type'] = 'text/html'
	        response, content = self.server.request(uri, 'GET', headers=self.headers)
	        hops = 0
	        while response['status'] == '302' and hops < max_redirects:
	            target = response.get('location')
	            if not target:
	                break
	            print("Making request to %s" % target)
	            response, content = self.server.request(target, 'GET', headers=self.headers)
	            hops += 1
	        return response, content

	    def makeAuthRequest(self, accountName, password, authUri):
	        """POST the login form to authUri and return (response, content).

	        Sends a GMAIL_LOGIN cookie built from the current time together
	        with the url-encoded credentials and self.service. If the request
	        itself fails, "HTTP request failed" is printed and the original
	        exception is re-raised.
	        """
	        epoch = int(time.time())
	        self._cookies['GMAIL_LOGIN'] = "T%s/%s/%s" % (epoch - 2, epoch - 1, epoch)
	        cookie_header = self._cookies.output(attrs=[], header='').strip()
	        headers = {
	            'Content-Type': 'application/x-www-form-urlencoded',
	            'Cookie': cookie_header,
	            'Host': 'www.google.com',
	            'User-Agent': 'Mozilla/5.0 (Compatible; libgmail-python)',
	        }
	        data = urllib.urlencode({
	            "service": self.service,
	            "Email": accountName,
	            "Passwd": password,
	            "source": "test-test-0.1",
	            "accountType": "HOSTED_OR_GOOGLE",
	            "submit": "null",
	        })
	        try:
	            response, content = self.server.request(authUri, 'POST', body=data, headers=headers)
	        except Exception:
	            print("HTTP request failed")
	            # Propagate the real error; there is no response to return.
	            raise
	        return response, content

	    def getAuthToken(self, accountName, password, auth_uri):
	        """Log in and store the "Auth" value of the response as self.token.

	        The response body is read as key=value lines. While it has no
	        "Auth" entry the user is prompted for new credentials and the
	        login is retried. On success the token is also placed in the
	        Cookie header used by getPage().

	        Returns the response of the successful login request.
	        """
	        while True:
	            response, content = self.makeAuthRequest(accountName, password, auth_uri)
	            # Split on the first "=" only, so a value may itself contain "=".
	            fields = dict(line.split("=", 1)
	                          for line in content.split("\n") if "=" in line)
	            if "Auth" in fields:
	                self.token = fields["Auth"]
	                break
	            print("Authorization Failed.")
	            print(response)
	            accountName, password = self.GetUserCredentials()
	        print("Got Auth Token of: " + self.token[0:20] + "...")
	        self.headers['Cookie'] = "Auth=" + self.token
	        return response

	    def getAuthCookie(self, accountName, password, auth_uri):
	        """Log in and reuse the returned set-cookie header as the Cookie header.

	        The login counts as successful when the response has a
	        'set-cookie' header containing "Auth"; otherwise a message is
	        printed and the program exits via sys.exit().

	        Returns the cookie string, which is also stored as self.token.
	        """
	        response, _ = self.makeAuthRequest(accountName, password, auth_uri)
	        if 'set-cookie' in response and response['set-cookie'].find("Auth", 0) != -1:
	            print("Authorized " + accountName)
	            self.headers['Cookie'] = response['set-cookie']
	        else:
	            print("Could not get Auth cookie. Check login info")
	            sys.exit()
	        self.token = self.headers['Cookie']
	        return self.headers['Cookie']

	def printUsage():
	    """Print the one-line command-line usage summary to standard output."""
	    usage = "Usage: googleWebHistory.py <out_file>"
	    print(usage)

	def main():
	    """Log in to Google and dump every Web History page to the output file.

	    Requires the output path as the first command-line argument (it is
	    opened by Scrape). Prompts for credentials, obtains the auth cookie
	    and then walks the history pages by following the link returned by
	    Scrape.getLinkFromPage() until there is none left.
	    """
	    if len(sys.argv) < 2:
	        printUsage()
	        sys.exit()
	    history_uri = 'http://www.google.com/history/'
	    cookieAuthUri = 'https://www.google.com/accounts/ServiceLoginAuth'
	    service = 'hist'
	    sc = Scrape()
	    try:
	        sc.GetUserCredentials()
	        sc.authenticate(service)
	        sc.getAuthCookie(sc.account, sc.password, cookieAuthUri)
	        uri = history_uri
	        first_page = True
	        while uri:
	            resp, content = sc.getPage(uri)
	            if first_page:
	                print(resp)
	                first_page = False
	            location = resp.get('content-location', '')
	            if location.find("Login", 0) > 0:
	                # We were bounced to the login page: authenticate again and
	                # retry the page named by the "continue" parameter. The
	                # login page itself holds no history, so it is not parsed.
	                print("Getting auth from " + location)
	                p1 = location.find("continue=", 0)
	                if p1 >= 0:
	                    p2 = location.find("&", p1)
	                    if p2 < 0:
	                        p2 = len(location)
	                    uri = urllib.unquote(location[p1 + 9:p2])
	                sc.getAuthCookie(sc.account, sc.password, cookieAuthUri)
	                continue
	            # Write the page before reading its pagination link, because
	            # getLinkFromPage() moves sc.pagetime on to the next page.
	            sc.parsePage(content)
	            localuri = sc.getLinkFromPage(content)
	            if localuri is None:
	                uri = None
	            else:
	                uri = history_uri + localuri
	    finally:
	        # Flush and release the output file even if scraping is aborted.
	        sc.f.close()
	    print("All done")


	# Script entry point. The call is made at module level (there is no
	# __name__ guard), so importing this file also starts the scraper.
	main()