@fohlin
Forked from mjbommar/archiveTwitter.py
Created August 16, 2012 18:16

Revisions

  1. @mjbommar revised this gist Mar 12, 2011. 1 changed file with 4 additions and 3 deletions.
    7 changes: 4 additions & 3 deletions archiveTwitter.py
    @@ -116,8 +116,8 @@ def doSearch(term, resume = True, delay = 10):
                  DEBUG("doSearch: nextPage")
                  nextPageFields = parse_qs(nextPage)
                  queryParameters['max_id'] = nextPageFields['max_id'].pop()
    -             if nextPageFields.has_key('page'):
    -                 queryParameters['page'] = nextPageFields['page'].pop()
    +             if nextPageFields.has_key('?page'):
    +                 queryParameters['page'] = nextPageFields['?page'].pop()
                  else:
                      if queryParameters.has_key('page'):
                          del queryParameters['page']
    @@ -135,7 +135,8 @@ def doSearch(term, resume = True, delay = 10):
                      First time querying!
                      '''
                      DEBUG("doSearch: !nextPage, !maxID")

    +         print queryParameters
              jsonData = doQuery(queryParameters)

              # Set nextPage based on the response.
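    The switch to '?page' suggests that the next_page value returned by the old Search API starts with a literal '?', so parse_qs attaches that character to the first parameter name. A minimal sketch of the behaviour, using a made-up next_page string of the shape the revision implies (page first, then the other parameters):

    from urlparse import parse_qs

    # Hypothetical next_page value; the leading '?' is part of the string.
    nextPage = "?page=2&max_id=41111111111111111&rpp=100&q=%23python"

    fields = parse_qs(nextPage)
    print 'page' in fields       # False
    print '?page' in fields      # True -- the '?' sticks to the first parameter name
    print fields['?page'].pop()  # '2'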
  2. @mjbommar renamed this gist Feb 26, 2011. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. @mjbommar revised this gist Feb 26, 2011. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion archiveTwitter.r
    @@ -169,4 +169,4 @@ def doSearch(term, resume = True, delay = 10):
              time.sleep(delay)

     if __name__ == "__main__":
    -    doSearch("mjbommar")
    +    doSearch("#python")
  4. @mjbommar revised this gist Feb 26, 2011. 1 changed file with 5 additions and 4 deletions.
    9 changes: 5 additions & 4 deletions archiveTwitter.r
    @@ -37,9 +37,10 @@ def getLastMaxID(fileName):
         idList = []

         for line in codecs.open(fileName, 'r', 'utf8'):
             if len(line.strip()) > 0:
    -            field = line.split(',')[0].strip()
    -            idList.append(int(field))
    +            try:
    +                idList.append(int(line.split(',')[0].strip()))
    +            except:
    +                pass

         return min(idList)
    @@ -168,4 +169,4 @@ def doSearch(term, resume = True, delay = 10):
              time.sleep(delay)

     if __name__ == "__main__":
    -    doSearch("#python")
    +    doSearch("mjbommar")
  5. @mjbommar created this gist Feb 26, 2011.
    171 changes: 171 additions & 0 deletions archiveTwitter.r
    @@ -0,0 +1,171 @@
    '''
    @author Michael J Bommarito II
    @date Feb 26, 2011
    @license Simplified BSD, (C) 2011.
    This script demonstrates how to use Python to archive historical tweets.
    '''

    import codecs
    import csv
    import json
    import os
    import sys
    import time
    import urllib
    import urllib2
    import urlparse

    # Set me to True if you want messages about what's going on.
    DEBUG_ENABLED = True

    def DEBUG(message):
        # Just a little debug macro.
        if DEBUG_ENABLED:
            print message

    # Make sure we have parse_qs for <2.6, but prefer urlparse.
    try:
        parse_qs = urlparse.parse_qs
    except:
        import cgi
        parse_qs = cgi.parse_qs

    def getLastMaxID(fileName):
        '''
        Read a tweet list to see what the last max_id should be.
        '''
        idList = []

        for line in codecs.open(fileName, 'r', 'utf8'):
            if len(line.strip()) > 0:
                field = line.split(',')[0].strip()
                idList.append(int(field))

        return min(idList)

    def json2tuple(tweet):
        '''
        Convert a JSON dict to a tweet tuple. If you want to include more or less data, this is where you can change it.
        '''
        # Clean up the geo field
        if tweet['geo'] == None:
            tweet['geo'] = ""
        elif tweet['geo']['type'] == 'Point':
            tweet['geo'] = ','.join(map(repr, tweet['geo']['coordinates']))

        # Clean up the text field
        tweet['text'] = tweet['text'].replace('"','""').replace("\r","").replace("\n","").replace("\t","")

        return ((tweet['id_str'],                        # Tweet ID - DON'T REMOVE ME.
                 tweet['from_user'],                     # User handle
                 tweet['geo'],                           # Geostamp
                 u'"{0}"'.format(tweet['created_at']),   # Timestamp
                 u'"{0}"'.format(tweet['text'])          # Tweet text content - what you see.
                 ))

    def doQuery(queryParameters):
        '''
        Execute a query and return the JSON results.
        '''
        queryURL = "http://search.twitter.com/search.json?" + urllib.urlencode(queryParameters)

        # We want to catch errors here to continue gracefully if possible.
        try:
            DEBUG("doQuery: Fetching {0}".format(queryURL))
            queryResponse = urllib2.urlopen(queryURL, timeout=10)
        except urllib2.HTTPError, E:
            DEBUG("doQuery: Got an HTTP exception {0}".format(E.code))

            if E.code in [400, 420]:
                # 400, 420 => Slow down and be nice. Wait 10 minutes.
                time.sleep(600)
            elif E.code == 503:
                # 503 => Twitter is getting hammered but it's not you, just wait 1 minute.
                time.sleep(60)
            else:
                # We should actually quit if it wasn't one of these.
                sys.exit(-1)

            # Retry the query now that we've waited.
            return doQuery(queryParameters)

        return json.load(queryResponse, encoding="UTF-8")

    def doSearch(term, resume = True, delay = 10):
        '''
        Run a search and download all historical data.
        '''
        # This is the basic query.
        queryParameters = dict(q=term,   # Search term
                               rpp=100)  # Results per page

        # These are some state variables to keep track of what we're doing.
        nextPage = None
        maxID = None

        # Check to see if we should resume.
        fileName = "tweets_{0}.csv".format(term.translate(None, '?/\?%*:|"<>.'))
        if resume and os.path.exists(fileName):
            maxID = getLastMaxID(fileName)

        # Keep going until a condition within breaks.
        while True:
            if nextPage:
                '''
                This means we are currently paging through results.
                '''
                DEBUG("doSearch: nextPage")
                nextPageFields = parse_qs(nextPage)
                queryParameters['max_id'] = nextPageFields['max_id'].pop()
                if nextPageFields.has_key('page'):
                    queryParameters['page'] = nextPageFields['page'].pop()
                else:
                    if queryParameters.has_key('page'):
                        del queryParameters['page']
            else:
                if maxID:
                    '''
                    This means we either just started downloading
                    after resuming from file or that we need to start
                    the paging process over.
                    '''
                    DEBUG("doSearch: !nextPage, maxID={0}".format(maxID))
                    queryParameters['max_id'] = maxID
                else:
                    '''
                    First time querying!
                    '''
                    DEBUG("doSearch: !nextPage, !maxID")

            jsonData = doQuery(queryParameters)

            # Set nextPage based on the response.
            if jsonData.has_key('next_page'):
                nextPage = jsonData['next_page']
            else:
                nextPage = None
                if queryParameters.has_key('max_id'):
                    del queryParameters['max_id']

                if queryParameters.has_key('page'):
                    del queryParameters['page']

            # Process and output the tweets
            tweets = map(json2tuple, jsonData['results'])

            tweetFile = codecs.open(fileName, 'a+', 'utf8')
            for tweet in tweets:
                tweetFile.write(','.join(tweet) + "\n")
            tweetFile.close()

            # Check to see if we've reached the end.
            if len(tweets) <= 1:
                DEBUG("len(tweets) <= 1 => breaking.")
                break

            # Update maxID
            maxID = min([int(tweet[0]) for tweet in tweets])

            # Sleep to be nice to Twitter.
            time.sleep(delay)

    if __name__ == "__main__":
        doSearch("#python")