suriyadeepan · September 13, 2016 08:07
diff --git a/scrape_hashtags.py b/scrape_hashtags.py
 from bs4 import BeautifulSoup
 import requests

 import time
 import sys
 import os

 from selenium import webdriver
 from selenium.webdriver.common.keys import Keys

 import pickle


 # Firefox profile used for scraping. The two permissions.default.* prefs are
 # set to 2 (presumably "block" - confirm against the Firefox preference docs)
 # so stylesheets and images are not fetched.
 firefox_profile = webdriver.FirefoxProfile()
 firefox_profile.set_preference('permissions.default.stylesheet', 2)
 firefox_profile.set_preference('permissions.default.image', 2)
 # This is a boolean preference, so pass a real bool: the string 'false' would
 # be stored as a string-typed pref instead of switching the plugin off.
 firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
 # create driver
 driver = webdriver.Firefox(firefox_profile=firefox_profile)

 def base_url(hashtag):
    """Return the twitter.com hashtag page URL for `hashtag` (no leading '#')."""
    return 'https://twitter.com/hashtag/{}?src=hash'.format(hashtag)

 def get_soup(url):
    """Fetch `url` with requests and parse the response body using lxml."""
    response = requests.get(url)
    markup = response.content
    return BeautifulSoup(markup, 'lxml')

 def crawl_page(url, n):
    """Open `url` in the module-level driver, scroll `n` times and parse the result.

    url -- address handed to driver.get().
    n   -- number of END key presses; each one is followed by a 2 second
           sleep, so the scrolling phase takes about 2*n seconds.

    Returns a BeautifulSoup tree built from the inner HTML of the first <ol>
    element on the page, or an empty tree when the page has no <ol> at all
    (previously this case raised IndexError).
    """
    # open url
    driver.get(url)
    # implicit wait: element lookups below may poll for up to 15 seconds
    driver.implicitly_wait(15)
    # press END n times (presumably this makes the page load further items)
    for i in range(n):
        elem = driver.find_element_by_tag_name('a')
        elem.send_keys(Keys.END)
        time.sleep(2)
        # '\r' rewrites the same stderr line as a progress counter
        sys.stderr.write('\r{0}/{1} complete...'.format(i+1,n))
    # gather the ordered lists on the page
    list_items = driver.find_elements_by_tag_name('ol')
    if not list_items:
        # nothing to parse: hand back an empty tree instead of crashing
        return BeautifulSoup('', 'lxml')
    # get soup from the first list only
    return BeautifulSoup(list_items[0].get_attribute('innerHTML'),'lxml')

 def extract_tweet_ids(soup):
    """List the 'data-item-id' of every <li> whose 'data-item-type' is 'tweet'.

    An <li> of type 'tweet' that has no 'data-item-id' contributes None.
    """
    tweet_ids = []
    for item in soup.findAll('li'):
        # a missing attribute yields None, which never equals 'tweet'
        if item.attrs.get('data-item-type') != 'tweet':
            continue
        tweet_ids.append(item.get('data-item-id'))
    return tweet_ids

 def save_tweets(hashtags, n, group):
    """Crawl every hashtag page and pickle the tweet ids found on it.

    hashtags -- iterable of hashtag names (without the leading '#').
    n        -- scroll count passed straight through to crawl_page.
    group    -- folder name under 'save/'; each hashtag is written to
                'save/<group>/<hashtag>.p'.
    """
    for tag in hashtags:
        print('>> Crawling for #{}'.format(tag))
        # load the page, then pull the ids out of the parsed markup
        page = crawl_page(base_url(tag), n)
        ids = extract_tweet_ids(page)
        print('>> Grabbed {0} tweets from {1}...'.format(len(ids),tag))
        # create the group folder on first use
        folder = 'save/' + group
        if not os.path.exists(folder):
            os.makedirs(folder)
        # one pickle file per hashtag
        target = 'save/{0}/{1}.p'.format(group,tag)
        with open(target, 'wb') as out:
            pickle.dump(ids,out)


 if __name__ == '__main__':
    # Earlier hashtag lists, disabled and kept for reference only:
    #   hashtags = [ 'Ferguson', 'LoveWins', 'BlackLivesMatter',
    #           'IndyRef', 'Sandy', 'IceBucketChallenge',
    #           'BringBackOurGirls', 'PrayForJapan',
    #           'YesAllWomen', 'GivingTuesday']
    #   #hashtags = [ 'CharlieHebdo', 'JeSuisCharlie', 'PrayForParis',
    #   hashtags = [ 'PrayForParis',
    #           'AskRachel', 'DonaldTrump', 'IStandWithAhmed',
    #           'WakeUpAmerica', 'Obama', 'SandraBland', 'tcot' ]

    # Hashtags crawled by this run. 'StopGunViolence' used to be listed twice,
    # which crawled the page a second time and rewrote the same pickle file.
    hashtags = [ 'SaferThanATrumpRally', 'FeelTheBern', 'NeverTrump',
            'ImWithHer', 'MakeDonaldDrumpfAgain', 'StudentLoanDebt',
            'Syria', 'HillaryEmails', 'GunControl', 'StopGunViolence',
            'CampaignZero', 'Election2016',
            'IStandWithPP', 'ClimateChange', 'GlobalWarming',
            'StudentLoanForgiveness']
    try:
        save_tweets(hashtags, n=400, group='set3')
    finally:
        # close the browser even when a crawl fails part-way through
        driver.quit()
	from bs4 import BeautifulSoup
	import requests

	import time
	import sys
	import os

	from selenium import webdriver
	from selenium.webdriver.common.keys import Keys

	import pickle


	# Firefox profile used for scraping. The two permissions.default.* prefs are
	# set to 2 (presumably "block" - confirm against the Firefox preference docs)
	# so stylesheets and images are not fetched.
	firefox_profile = webdriver.FirefoxProfile()
	firefox_profile.set_preference('permissions.default.stylesheet', 2)
	firefox_profile.set_preference('permissions.default.image', 2)
	# This is a boolean preference, so pass a real bool: the string 'false' would
	# be stored as a string-typed pref instead of switching the plugin off.
	firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
	# create driver
	driver = webdriver.Firefox(firefox_profile=firefox_profile)

	def base_url(hashtag):
	    """Return the twitter.com hashtag page URL for `hashtag` (no leading '#')."""
	    return 'https://twitter.com/hashtag/{}?src=hash'.format(hashtag)

	def get_soup(url):
	    """Fetch `url` with requests and parse the response body using lxml."""
	    # body re-indented: the flattened paste left `return` outside the def
	    return BeautifulSoup(requests.get(url).content, 'lxml')

	def crawl_page(url, n):
	    """Open `url` in the module-level driver, scroll `n` times and parse the result.

	    url -- address handed to driver.get().
	    n   -- number of END key presses; each one is followed by a 2 second
	           sleep, so the scrolling phase takes about 2*n seconds.

	    Returns a BeautifulSoup tree built from the inner HTML of the first <ol>
	    element on the page, or an empty tree when the page has no <ol> at all
	    (indexing the empty result list would raise IndexError).
	    """
	    # open url
	    driver.get(url)
	    # implicit wait: element lookups below may poll for up to 15 seconds
	    driver.implicitly_wait(15)
	    # press END n times (presumably this makes the page load further items)
	    for i in range(n):
	        elem = driver.find_element_by_tag_name('a')
	        elem.send_keys(Keys.END)
	        time.sleep(2)
	        # '\r' rewrites the same stderr line as a progress counter
	        sys.stderr.write('\r{0}/{1} complete...'.format(i+1,n))
	    # gather the ordered lists on the page
	    list_items = driver.find_elements_by_tag_name('ol')
	    if not list_items:
	        # nothing to parse: hand back an empty tree instead of crashing
	        return BeautifulSoup('', 'lxml')
	    # get soup from the first list only
	    return BeautifulSoup(list_items[0].get_attribute('innerHTML'),'lxml')

	def extract_tweet_ids(soup):
	    """List the 'data-item-id' of every <li> whose 'data-item-type' is 'tweet'.

	    An <li> of type 'tweet' that has no 'data-item-id' contributes None.
	    """
	    # body re-indented: the flattened paste left `return` outside the def
	    return [
	        tag.get('data-item-id')
	        for tag in soup.findAll('li')
	        # a missing attribute yields None, which never equals 'tweet'
	        if tag.attrs.get('data-item-type') == 'tweet'
	    ]

	def save_tweets(hashtags, n, group):
	    """Crawl every hashtag page and pickle the tweet ids found on it.

	    hashtags -- iterable of hashtag names (without the leading '#').
	    n        -- scroll count passed straight through to crawl_page.
	    group    -- folder name under 'save/'; each hashtag is written to
	                'save/<group>/<hashtag>.p'.
	    """
	    # construct urls and gather tweets
	    for hashtag in hashtags:
	        print('>> Crawling for #{}'.format(hashtag))
	        # crawl page
	        soup = crawl_page(base_url(hashtag), n)
	        # get tweet tags
	        tweet_ids = extract_tweet_ids(soup)
	        print('>> Grabbed {0} tweets from {1}...'.format(len(tweet_ids),hashtag))
	        # check if group folder exists
	        if not os.path.exists('save/' + group):
	            os.makedirs('save/' + group)
	        # write to file
	        with open('save/{0}/{1}.p'.format(group,hashtag), 'wb') as f:
	            pickle.dump(tweet_ids,f)


	if __name__ == '__main__':
	    # Earlier hashtag lists, disabled and kept for reference only:
	    #   hashtags = [ 'Ferguson', 'LoveWins', 'BlackLivesMatter',
	    #           'IndyRef', 'Sandy', 'IceBucketChallenge',
	    #           'BringBackOurGirls', 'PrayForJapan',
	    #           'YesAllWomen', 'GivingTuesday']
	    #   #hashtags = [ 'CharlieHebdo', 'JeSuisCharlie', 'PrayForParis',
	    #   hashtags = [ 'PrayForParis',
	    #           'AskRachel', 'DonaldTrump', 'IStandWithAhmed',
	    #           'WakeUpAmerica', 'Obama', 'SandraBland', 'tcot' ]

	    # Hashtags crawled by this run. 'StopGunViolence' used to be listed twice,
	    # which crawled the page a second time and rewrote the same pickle file.
	    hashtags = [ 'SaferThanATrumpRally', 'FeelTheBern', 'NeverTrump',
	            'ImWithHer', 'MakeDonaldDrumpfAgain', 'StudentLoanDebt',
	            'Syria', 'HillaryEmails', 'GunControl', 'StopGunViolence',
	            'CampaignZero', 'Election2016',
	            'IStandWithPP', 'ClimateChange', 'GlobalWarming',
	            'StudentLoanForgiveness']
	    try:
	        save_tweets(hashtags, n=400, group='set3')
	    finally:
	        # close the browser even when a crawl fails part-way through
	        driver.quit()