@suriyadeepan · Created September 13, 2016 08:07
Selenium-based Twitter Scraper
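Scrolls a Twitter hashtag search page in Firefox via Selenium, collects the tweet IDs from the rendered stream, and pickles them to save/<group>/<hashtag>.p. Requires selenium, beautifulsoup4, lxml and requests, plus a local Firefox installation.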
from bs4 import BeautifulSoup
import requests
import time
import sys
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pickle
# Firefox profile: disable stylesheets, images and the Flash plugin to speed up page loads
firefox_profile = webdriver.FirefoxProfile()
firefox_profile.set_preference('permissions.default.stylesheet', 2)
firefox_profile.set_preference('permissions.default.image', 2)
firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
# create driver
driver = webdriver.Firefox(firefox_profile=firefox_profile)
base_url = lambda hashtag : 'https://twitter.com/hashtag/{}?src=hash'.format(hashtag)
def get_soup(url):
    return BeautifulSoup(requests.get(url).content, 'lxml')
def crawl_page(url, n):
    # open url
    driver.get(url)
    # allow element lookups to wait up to 15 seconds while the page loads
    driver.implicitly_wait(15)
    # scroll to the bottom n times (~2 seconds per scroll) so more tweets lazy-load
    for i in range(n):
        elem = driver.find_element_by_tag_name('a')
        elem.send_keys(Keys.END)
        time.sleep(2)
        sys.stderr.write('\r{0}/{1} complete...'.format(i + 1, n))
    # the tweet stream is rendered as an <ol> of <li> items
    list_items = driver.find_elements_by_tag_name('ol')
    # parse its inner HTML with BeautifulSoup
    soup = BeautifulSoup(list_items[0].get_attribute('innerHTML'), 'lxml')
    return soup
def extract_tweet_ids(soup):
    # keep only the <li> items marked as tweets and return their data-item-id attributes
    return [tag.get('data-item-id') for tag in soup.findAll('li')
            if tag.attrs.get('data-item-type') == 'tweet']
def save_tweets(hashtags, n, group):
    # hashtags -> list of hashtag strings (without the leading '#')
    # construct urls and gather tweets
    for hashtag in hashtags:
        print('>> Crawling for #{}'.format(hashtag))
        # crawl page
        soup = crawl_page(base_url(hashtag), n)
        # get tweet ids
        tweet_ids = extract_tweet_ids(soup)
        print('>> Grabbed {0} tweets from #{1}...'.format(len(tweet_ids), hashtag))
        # check if group folder exists
        if not os.path.exists('save/' + group):
            os.makedirs('save/' + group)
        # pickle the tweet ids to file
        with open('save/{0}/{1}.p'.format(group, hashtag), 'wb') as f:
            pickle.dump(tweet_ids, f)
if __name__ == '__main__':
    '''
    hashtags = [ 'Ferguson', 'LoveWins', 'BlackLivesMatter',
                 'IndyRef', 'Sandy', 'IceBucketChallenge',
                 'BringBackOurGirls', 'PrayForJapan',
                 'YesAllWomen', 'GivingTuesday']
    #hashtags = [ 'CharlieHebdo', 'JeSuisCharlie', 'PrayForParis',
    hashtags = [ 'PrayForParis',
                 'AskRachel', 'DonaldTrump', 'IStandWithAhmed',
                 'WakeUpAmerica', 'Obama', 'SandraBland', 'tcot' ]
    '''
    hashtags = [ 'SaferThanATrumpRally', 'FeelTheBern', 'NeverTrump',
                 'ImWithHer', 'MakeDonaldDrumpfAgain', 'StudentLoanDebt',
                 'Syria', 'HillaryEmails', 'GunControl', 'StopGunViolence',
                 'CampaignZero', 'Election2016', 'StopGunViolence',
                 'IStandWithPP', 'ClimateChange', 'GlobalWarming',
                 'StudentLoanForgiveness']
    save_tweets(hashtags, n=400, group='set3')
    driver.quit()
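
A minimal sketch of reading one of the saved ID lists back (the group and hashtag names below are illustrative, taken from the run above):

import pickle

with open('save/set3/FeelTheBern.p', 'rb') as f:
    tweet_ids = pickle.load(f)
print('{} tweet ids for #FeelTheBern'.format(len(tweet_ids)))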