Last active
January 2, 2016 01:29
-
-
Save mutsuda/8230446 to your computer and use it in GitHub Desktop.
Search for keywords read from a CSV file and write the resulting links that match certain criteria to another CSV file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import sys | |
import mechanize | |
import cookielib | |
import re | |
import time | |
# Creates a Browser instance | |
# and defines its headers | |
def createBrowser():
    """Build and configure the mechanize browser used for the queries.

    The browser is given its own LWP cookie jar, has equiv, gzip, redirect
    and referer handling switched on, has robots.txt handling switched off,
    installs a refresh processor with ``max_time=1`` and sends a desktop
    Firefox ``User-agent`` header.

    Returns:
        The configured ``mechanize.Browser`` instance.
    """
    browser = mechanize.Browser()

    # Dedicated cookie jar so cookies persist between requests.
    browser.set_cookiejar(cookielib.LWPCookieJar())

    # Handlers that are simply switched on, applied in a fixed order.
    for enable in (
        browser.set_handle_equiv,
        browser.set_handle_gzip,
        browser.set_handle_redirect,
        browser.set_handle_referer,
    ):
        enable(True)

    # robots.txt handling is switched off.
    browser.set_handle_robots(False)

    # Immediate refreshes are followed; max_time=1 is meant to keep the
    # browser from hanging on refreshes with a longer delay.
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Present ourselves as a regular desktop Firefox.
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; es-ES; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    return browser
# Given a Browser a keyword and the ids | |
# returns an array containing the ids,the keyword and the | |
# google results | |
def search(br, keyword, ids):
    """Run one Google query and collect the Facebook links on the result page.

    Args:
        br: Browser object exposing ``open``, ``select_form``, ``form``,
            ``submit`` and ``links`` (a ``mechanize.Browser`` in this script).
        keyword: Text entered into the ``q`` field of the first form.
        ids: Value copied unchanged into the first column of the result.

    Returns:
        A list ``[ids, keyword, url, url, ...]``. The URLs are the
        ``https://`` addresses extracted from every result link whose URL
        matches the Facebook pattern; links that contain no ``https://``
        address are left out.
    """
    # Raw strings: "\s" and "\." are regex escapes, not string escapes.
    # Links worth looking at: anything pointing at a *.facebook host.
    facebook_link_pattern = r"(?P<url>https?://(.*)\.facebook[^\s]+)"
    # The part of such a link we keep: the https:// address up to whitespace.
    https_url_pattern = r"(?P<url>https://[^\s]+)"

    # Open Google, fill the first form (index zero) with the query, submit.
    br.open('http://google.com')
    br.select_form(nr=0)
    br.form['q'] = keyword
    br.submit()

    # First column: the ids. Second column: the keyword. Then the links.
    result = [ids, keyword]
    for link in br.links(url_regex=facebook_link_pattern):
        match = re.search(https_url_pattern, link.url)
        if match is None:
            # The link matched the Facebook pattern but holds no https://
            # address (e.g. a plain http link); it does not meet our criteria.
            continue
        result.append(match.group("url"))
    return result
# Main place where everything takes place | |
def main(argv):
    """Search Google for every row of a CSV file and save the links found.

    Each input row is expected to hold at least three ';'-separated fields.
    Field 0 is passed through as the id; fields 1 and 2 are joined with a
    space and suffixed with " facebook" to form the query. One output row
    (id, query, links...) is written per input row.

    Args:
        argv: Command-line arguments without the script name:
            ``argv[0]`` is the input CSV path, ``argv[1]`` the output CSV path.

    Raises:
        SystemExit: If fewer than two paths are supplied.
    """
    if len(argv) < 2:
        raise SystemExit("usage: python SCRIPT INPUT_CSV OUTPUT_CSV")

    br = createBrowser()

    # Both files are managed by the with-statement so the output is flushed
    # and closed even if a search fails half-way through. Binary modes are
    # what the Python 2 csv module (which this script targets) expects.
    with open(argv[0], 'rb') as csvfile, open(argv[1], "wb") as ofile:
        reader = csv.reader(csvfile, delimiter=';', quotechar='|')
        writer = csv.writer(ofile, delimiter=';')

        for row in reader:
            if len(row) < 3:
                # Blank or truncated line: nothing to build a query from.
                sys.stderr.write("Skipping malformed row: %r\n" % (row,))
                continue

            query = row[1] + ' ' + row[2] + " facebook"

            # Keep retrying until the search goes through; the result itself
            # may legitimately contain no links.
            while True:
                try:
                    line = search(br, query, row[0])
                    break
                except mechanize.URLError as e:
                    print(getattr(e, "reason", e))
                    print("Error retrieving information from Google")
                    print("Sleeping 30 seconds and trying again")
                    # Give it a rest
                    time.sleep(30)

            print(line)
            writer.writerow(line)
            # Pretend to be human
            time.sleep(1)
if __name__ == "__main__":
    # Strip the script name; main() receives only the user-supplied arguments.
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment