Created
November 23, 2016 13:24
-
-
Save nperony/0a423cf7da18836d9ac8a2fdc80c313d to your computer and use it in GitHub Desktop.
Crawling profiles in a Facebook (or Workplace) community to get the list of followers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Facebook login to get the list of members for every follower | |
from selenium import webdriver | |
from selenium.webdriver.support import ui | |
from selenium.webdriver.common.keys import Keys | |
def page_is_loaded(driver):
    """Return True once the page has a <body> element.

    Used as a predicate for WebDriverWait.until(): Selenium polls it until
    it returns a truthy value or the wait times out.
    """
    # NOTE(review): find_element_by_tag_name raises NoSuchElementException
    # rather than returning None when the element is absent, so the wait
    # effectively succeeds as soon as the call stops raising.
    # Idiom fix: compare against None with `is not`, never `!=`.
    return driver.find_element_by_tag_name("body") is not None
# Open a Chrome session and start the Workplace login flow.
driver = webdriver.Chrome()
driver.get('https://hyperloop.facebook.com/work/landing/input/')
# Shared 10-second explicit wait, reused by the helper functions below.
wait = ui.WebDriverWait(driver, 10)
wait.until(page_is_loaded);
# Fill in the account email and submit with Enter.
email_field = driver.find_element_by_name('email')
email_field.send_keys('[email protected]')  # redacted placeholder -- put your login email here
email_field.send_keys(Keys.RETURN)
wait.until(page_is_loaded);
# Then do the authentication manually (if 2FA is enabled)
### Helper functions | |
def get_follower_page_source(driver, member_id):
    """Navigate to a member's followers tab and return the page's raw HTML."""
    url = 'https://hyperloop.facebook.com/profile.php?id=%s&sk=followers' % member_id
    driver.get(url)
    # Relies on the module-level `wait` (WebDriverWait) created at login time.
    wait.until(page_is_loaded)
    return driver.page_source
from bs4 import BeautifulSoup | |
def get_follower_ids(follower_page_html, pattern='hyperloop.facebook.com/profile.php?id=', id_length=15):
    """Yield numeric profile IDs of followers found in a followers-page HTML dump.

    Parameters
    ----------
    follower_page_html : str
        Raw HTML of a followers page (see get_follower_page_source).
    pattern : str
        Substring marking a profile link; the numeric ID follows it in the href.
    id_length : int
        Expected number of decimal digits in a profile ID.

    Yields
    ------
    str
        Candidate IDs: exactly `id_length` digits extracted from matching hrefs.
    """
    soup = BeautifulSoup(follower_page_html, 'html.parser')
    followers = soup.findAll('li', {'class': 'fbProfileBrowserListItem'})
    for follower in followers:
        name_block = follower.findNext('div', {'class': 'clearfix _42ef'})
        for link in name_block.findAll('a'):
            href = link.get('href')
            if href is None:  # idiom fix: `is None`, not `== None`
                continue
            if pattern in href:
                # BUG FIX: the original truncated and validated with a
                # hard-coded 15, silently ignoring the id_length parameter.
                putative_id = href.split(pattern, 1)[1][:id_length]
                if putative_id.isdigit() and len(putative_id) == id_length:
                    yield putative_id
# create followers file (list of directed edges)
import datetime
import pandas as pd  # BUG FIX: the original first used `pd` here but only imported pandas further down
import tqdm  # BUG FIX: tqdm was used below but never imported anywhere in the file

# `ids_members` and `driver` are expected to come from the earlier steps
# (members crawler + the Selenium login above) -- TODO confirm before running.
follower_dicts = []
for member_id in tqdm.tqdm_notebook(ids_members):  # renamed from `id`, which shadowed the builtin
    source = get_follower_page_source(driver, member_id)
    follower_ids = list(get_follower_ids(source))
    # One directed edge per (followee, follower) pair.
    follower_dicts.extend([{'id_followee': member_id, 'id_follower': fid} for fid in follower_ids])
print('%d follower relationships' % len(follower_dicts))
# Timestamped filename so repeated runs don't clobber each other.
savefile = datetime.datetime.now().strftime('followers_%Y-%m-%d_%H-%M-%S.csv')
print('Creating DataFrame and saving CSV to %s' % savefile)
dff = pd.DataFrame(follower_dicts)
dff = dff[['id_followee', 'id_follower']]  # fix column order for the CSV
dff.to_csv('data_dumps/' + savefile, index=False, encoding='utf-8')
# Bonus: create follower file with names instead of IDs
import pandas as pd
dfm = pd.read_csv('members.csv') # created by social graph crawler; must have 'id', 'first_name', 'last_name' columns
dff = pd.read_csv('followers_xxx.csv') # created at step above -- 'xxx' is a placeholder: use the timestamped filename
def get_name(dfm, id):
    """Resolve a member ID to a 'First Last' display name via the members table.

    Parameters
    ----------
    dfm : pandas.DataFrame
        Members table with 'id', 'first_name' and 'last_name' columns.
    id : int
        Member ID to look up. (Parameter kept as `id` for backward
        compatibility even though it shadows the builtin.)

    Returns
    -------
    str
        'First Last', with 'FIRST_NAME' / 'LAST_NAME' placeholders for
        missing parts, or a diagnostic string when the ID is absent or
        appears more than once.
    """
    idx = dfm[dfm['id'] == id].index.tolist()
    if len(idx) == 0:
        return 'Name not found'
    elif len(idx) > 1:
        return 'Duplicate ID %d' % id
    else:
        idx = idx[0]
        # BUG FIX: DataFrame.ix was deprecated and removed from pandas;
        # use label-based .loc on the row index instead.
        fn = dfm.loc[idx, 'first_name'] if pd.notnull(dfm.loc[idx, 'first_name']) else 'FIRST_NAME'
        ln = dfm.loc[idx, 'last_name'] if pd.notnull(dfm.loc[idx, 'last_name']) else 'LAST_NAME'
        # BUG FIX: coerce to str so non-string cells (e.g. numeric names)
        # cannot raise TypeError -- the original swallowed that error and
        # fell through, returning None.
        return ' '.join([str(fn), str(ln)])
# Build the name-based edge list.
# BUG FIX: on Python 3 the builtin map() returns a lazy iterator, which
# pandas does not expand into a column (the original relied on Python 2's
# map() returning a list). Series.map works on both and is idiomatic pandas.
dffn = pd.DataFrame()
dffn['follower'] = dff['id_follower'].map(lambda x: get_name(dfm, x))
dffn['followee'] = dff['id_followee'].map(lambda x: get_name(dfm, x))
dffn = dffn.drop_duplicates()  # some people have created several accounts
import csv
savefile = datetime.datetime.now().strftime('followers_withnames_%Y-%m-%d_%H-%M-%S.csv')
# Quote names (non-numeric fields) so commas inside names can't break the CSV.
dffn.to_csv(savefile, index=False, encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC, header=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment