mutsuda · January 2, 2016 01:29
diff --git a/gsearch.py b/gsearch.py
 import csv
 import sys
 import mechanize
 import cookielib
 import re
 import time

 # Creates a Browser instance
 # and defines its headers
 def createBrowser():
     """Build the mechanize Browser used for the searches.

     The browser gets its own LWP cookie jar, has the equiv, gzip,
     redirect and referer handlers switched on, ignores robots.txt,
     handles refreshes with max_time=1 and sends a desktop Firefox
     User-agent header.

     Returns the configured mechanize.Browser instance.
     """
     browser = mechanize.Browser()

     # Keep cookies between requests.
     browser.set_cookiejar(cookielib.LWPCookieJar())

     # Handlers that are simply switched on, in the original order.
     for enable in (browser.set_handle_equiv,
                    browser.set_handle_gzip,
                    browser.set_handle_redirect,
                    browser.set_handle_referer):
         enable(True)
     # robots.txt is deliberately not honoured.
     browser.set_handle_robots(False)

     # Follow "refresh 0" but do not hang on refresh > 0.
     refresh_processor = mechanize._http.HTTPRefreshProcessor()
     browser.set_handle_refresh(refresh_processor, max_time=1)

     # Present ourselves as a regular desktop browser.
     user_agent = 'Mozilla/5.0 (X11; U; Linux i686; es-ES; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
     browser.addheaders = [('User-agent', user_agent)]
     return browser
   
 # Given a Browser, a keyword and the ids,
 # returns a list containing the ids, the keyword and the
 # Google results
 def search(br, keyword, ids):
     """Run a Google search for *keyword* and collect Facebook links.

     Args:
         br: browser object (as built by createBrowser) used to open
             Google, select the first form, fill it in and submit it.
         keyword: query text written into the form's 'q' field.
         ids: identifier stored as the first element of the result.

     Returns:
         A list ``[ids, keyword, url, ...]`` where each url is the
         ``https://...`` part (up to the first whitespace) of a result
         link whose URL matches the Facebook pattern.  Links that hold
         no ``https://`` URL (for example plain ``http://`` ones) are
         skipped.

     Exceptions raised by the browser calls are not caught here and
     propagate to the caller.
     """
     # Open google.com and submit the query through the first form.
     br.open('http://google.com')
     br.select_form(nr=0)
     br.form['q'] = keyword
     br.submit()

     # First two columns: the ids and the keyword.
     result = [ids, keyword]

     # Raw strings keep the regex escapes (\. and \s) away from
     # Python's own string-escape processing.
     for link in br.links(url_regex=r"(?P<url>https?://(.*)\.facebook[^\s]+)"):
         cleaned = re.search(r"(?P<url>https://[^\s]+)", link.url)
         # An explicit no-match check replaces the former bare
         # ``except: pass``, so unrelated errors are no longer hidden.
         if cleaned is not None:
             result.append(cleaned.group("url"))
     return result

 # Main place where everything takes place
 def main(argv):
     """Search Google for every row of a CSV file and write the results.

     Args:
         argv: sequence where argv[0] is the path of the ';'-delimited
             input CSV and argv[1] is the path of the output CSV.  For
             each input row, row[0] is passed on as the ids and the
             query is "<row[1]> <row[2]> facebook".

     Each result list is printed and written as one ';'-delimited row.
     A search that fails with mechanize.URLError is retried every 30
     seconds until it succeeds.
     """
     br = createBrowser()
     # The output file now lives in a ``with`` block so it is flushed and
     # closed even when a search raises; it used to be left open.  It is
     # still opened before the input file, as before.
     # NOTE(review): the binary modes follow the Python 2 csv convention.
     with open(argv[1], "wb") as ofile:
         writer = csv.writer(ofile, delimiter=';')
         with open(argv[0], 'rb') as csvfile:
             reader = csv.reader(csvfile, delimiter=';', quotechar='|')
             for row in reader:
                 query = row[1] + ' ' + row[2] + " facebook"
                 # Loop until we get some result (which can be empty).
                 while True:
                     try:
                         line = search(br, query, row[0])
                         break
                     except mechanize.URLError as e:
                         # Single-argument print calls behave the same
                         # as the former print statements.
                         print(e.reason)
                         print("Error retrieving information from Google")
                         print("Sleeping 30 seconds and trying again")
                         # Give it a rest.
                         time.sleep(30)
                 print(line)
                 writer.writerow(line)
                 # Pretend to be human.
                 time.sleep(1)

 if __name__ == "__main__":
     # main() indexes argv[0] and argv[1]; check up front so a missing
     # argument gives a usage message instead of an IndexError traceback.
     cli_args = sys.argv[1:]
     if len(cli_args) < 2:
         sys.exit("usage: %s INPUT_CSV OUTPUT_CSV" % sys.argv[0])
     main(cli_args)
	import csv
	import sys
	import mechanize
	import cookielib
	import re
	import time

	# Creates a Browser instance
	# and defines its headers
	def createBrowser():
	    """Build the mechanize Browser used for the searches.

	    The browser gets its own LWP cookie jar, has the equiv, gzip,
	    redirect and referer handlers switched on, ignores robots.txt,
	    handles refreshes with max_time=1 and sends a desktop Firefox
	    User-agent header.

	    Returns the configured mechanize.Browser instance.
	    """
	    browser = mechanize.Browser()

	    # Keep cookies between requests.
	    browser.set_cookiejar(cookielib.LWPCookieJar())

	    # Handlers that are simply switched on, in the original order.
	    for enable in (browser.set_handle_equiv,
	                   browser.set_handle_gzip,
	                   browser.set_handle_redirect,
	                   browser.set_handle_referer):
	        enable(True)
	    # robots.txt is deliberately not honoured.
	    browser.set_handle_robots(False)

	    # Follow "refresh 0" but do not hang on refresh > 0.
	    refresh_processor = mechanize._http.HTTPRefreshProcessor()
	    browser.set_handle_refresh(refresh_processor, max_time=1)

	    # Present ourselves as a regular desktop browser.
	    user_agent = 'Mozilla/5.0 (X11; U; Linux i686; es-ES; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
	    browser.addheaders = [('User-agent', user_agent)]
	    return browser

	# Given a Browser, a keyword and the ids,
	# returns a list containing the ids, the keyword and the
	# Google results
	def search(br, keyword, ids):
	    """Run a Google search for *keyword* and collect Facebook links.

	    Args:
	        br: browser object (as built by createBrowser) used to open
	            Google, select the first form, fill it in and submit it.
	        keyword: query text written into the form's 'q' field.
	        ids: identifier stored as the first element of the result.

	    Returns:
	        A list ``[ids, keyword, url, ...]`` where each url is the
	        ``https://...`` part (up to the first whitespace) of a result
	        link whose URL matches the Facebook pattern.  Links that hold
	        no ``https://`` URL (for example plain ``http://`` ones) are
	        skipped.

	    Exceptions raised by the browser calls are not caught here and
	    propagate to the caller.
	    """
	    # Open google.com and submit the query through the first form.
	    br.open('http://google.com')
	    br.select_form(nr=0)
	    br.form['q'] = keyword
	    br.submit()

	    # First two columns: the ids and the keyword.
	    result = [ids, keyword]

	    # Raw strings keep the regex escapes (\. and \s) away from
	    # Python's own string-escape processing.
	    for link in br.links(url_regex=r"(?P<url>https?://(.*)\.facebook[^\s]+)"):
	        cleaned = re.search(r"(?P<url>https://[^\s]+)", link.url)
	        # An explicit no-match check replaces the former bare
	        # ``except: pass``, so unrelated errors are no longer hidden.
	        if cleaned is not None:
	            result.append(cleaned.group("url"))
	    return result

	# Main place where everything takes place
	def main(argv):
	    """Search Google for every row of a CSV file and write the results.

	    Args:
	        argv: sequence where argv[0] is the path of the ';'-delimited
	            input CSV and argv[1] is the path of the output CSV.  For
	            each input row, row[0] is passed on as the ids and the
	            query is "<row[1]> <row[2]> facebook".

	    Each result list is printed and written as one ';'-delimited row.
	    A search that fails with mechanize.URLError is retried every 30
	    seconds until it succeeds.
	    """
	    br = createBrowser()
	    # The output file now lives in a ``with`` block so it is flushed and
	    # closed even when a search raises; it used to be left open.  It is
	    # still opened before the input file, as before.
	    # NOTE(review): the binary modes follow the Python 2 csv convention.
	    with open(argv[1], "wb") as ofile:
	        writer = csv.writer(ofile, delimiter=';')
	        with open(argv[0], 'rb') as csvfile:
	            # quotechar must be a single character; the former '\|'
	            # (backslash plus pipe) is rejected by csv.reader.
	            reader = csv.reader(csvfile, delimiter=';', quotechar='|')
	            for row in reader:
	                query = row[1] + ' ' + row[2] + " facebook"
	                # Loop until we get some result (which can be empty).
	                while True:
	                    try:
	                        line = search(br, query, row[0])
	                        break
	                    except mechanize.URLError as e:
	                        # Single-argument print calls behave the same
	                        # as the former print statements.
	                        print(e.reason)
	                        print("Error retrieving information from Google")
	                        print("Sleeping 30 seconds and trying again")
	                        # Give it a rest.
	                        time.sleep(30)
	                print(line)
	                writer.writerow(line)
	                # Pretend to be human.
	                time.sleep(1)

	if __name__ == "__main__":
	    # main() indexes argv[0] and argv[1]; check up front so a missing
	    # argument gives a usage message instead of an IndexError traceback.
	    cli_args = sys.argv[1:]
	    if len(cli_args) < 2:
	        sys.exit("usage: %s INPUT_CSV OUTPUT_CSV" % sys.argv[0])
	    main(cli_args)