import json
from pprint import pprint
from datetime import datetime

import requests

from facebook_user_crawler import FbBaseCrawler

"""
FbBaseCrawler can be found here: https://gist.github.com/gearazk/7e5a7178dfdee70222bdeb9d8e8d155e
"""
class FbUserListCrawler(FbBaseCrawler):

    pages_crawl = 10
    # _keyword = 'football'
    _fbuser_id = '100003307297044'
    API_URL = 'https://www.facebook.com/ajax/pagelet/generic.php/BrowseScrollingSetPagelet'

    def __init__(self, keyword):
        self.r = requests.Session()
        self.r.cookies.update({
            ...<your fb account cookies>
        })
        self._keyword = keyword
    def crawl_now(self):
        self._next_page_params = {}
        user_list = {}

        # Crawl self.pages_crawl pages
        for i in range(self.pages_crawl):
            # Make the request
            resp = self._get(self.API_URL, params=self._search_keyword_payload(keyword=self._keyword))

            # Extract the JSON data. The response is wrapped in a JS guard
            # prefix, so we have to cut it off manually.
            json_data = json.loads(resp.content[9:])

            # Get the paging cursor data to prepare for the next page.
            self._next_page_params = self._search_cursor_dict(json_data.get('jsmods', {}).get('require'))

            # If no HTML data is returned, it means an error.
            if json_data.get('payload') is None or json_data.get('payload') == []:
                print('response-data-error')
                return

            # Extract the user FB IDs from the JSON response
            _user_list = self._extract_post_info(json_data)

            # For demo purposes
            print(_user_list)
            print('Page %s completed' % (str(i + 1)))

            # If the cursor for the next page is not found, stop crawling
            if not isinstance(self._next_page_params, dict):
                print('Stopped at page %d' % (i + 1))
                print('next-page-error')
                break

            # Gather the results
            user_list = {**user_list, **_user_list}

        # Return a list of FB IDs for the next crawler to get contact info from each profile
        return list(user_list.keys())
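    # Editor's sketch (assumption, not in the original gist): Facebook prefixes
    # its AJAX JSON with the anti-JSON-hijacking guard "for (;;);" (9 bytes),
    # which is what the resp.content[9:] slice above strips. A slightly more
    # defensive variant checks for the prefix instead of hard-coding its length:
    @staticmethod
    def _strip_js_guard(raw):
        text = raw.decode('utf-8', errors='replace')
        prefix = 'for (;;);'
        if text.startswith(prefix):
            text = text[len(prefix):]
        return json.loads(text)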
    def _extract_post_info(self, json_data):
        # Walk the 'require' modules and pull the post owner's FB ID and name
        # out of every UFIController entry.
        post_dict = {}
        attr_list = json_data.get('jsmods', {}).get('require')
        for _list in attr_list:
            if _list[0] == 'UFIController':
                _root = _list[3]
                _id = _root[2].get('feedbacktarget', {}).get('ownerid')
                post_dict[_id] = {
                    'owner_name': _root[1].get('ownerName'),
                }
        return post_dict
    def _search_cursor_dict(self, dict_list):
        # Find the 'pageletComplete' entry that carries the cursor for the next page.
        if dict_list is None:
            return None
        for arr in dict_list:
            if len(arr) >= 4 and arr[1] == 'pageletComplete':
                return arr[3][0]
        return None
    def _search_keyword_payload(self, keyword):
        """
        Basically just prepares the payload data in Facebook's format,
        to replicate the request.
        :return: dict
        """
        sub_query = {
            "bqf": "keywords_blended_posts(%s)" % keyword,
            "vertical": "content",
            "post_search_vertical": None,
            "filters": {
                "filter_author": "stories-feed-friends",
                "filter_author_enabled": "true"
            },
            "has_chrono_sort": False,
            "query_analysis": None,
            "subrequest_disabled": False,
            "token_role": "NONE",
            "preloaded_story_ids": [],
            "extra_data": None,
            "disable_main_browse_unicorn": False,
            "entry_point_scope": None,
            "entry_point_surface": None,
            "squashed_ent_ids": [],
            "source_session_id": None,
            "preloaded_entity_ids": [],
            "preloaded_entity_type": None,
            "query_source": None
        }
        enc_q = {
            "view": "list",
            "encoded_query": json.dumps(sub_query),
            "encoded_title": "",
            "ref": "unknown",
            "logger_source": "www_main",
            "typeahead_sid": "",
            "tl_log": False,
            # "impression_id": "c02624f9",
            "experience_type": "grammar",
            "exclude_ids": None,
            "browse_location": "browse_location:browse",
            "trending_source": None,
            "reaction_surface": None,
            "reaction_session_id": None,
            "ref_path": "/search/str/football/keywords_blended_posts",
            "is_trending": False,
            "topic_id": None,
            "place_id": None,
            "story_id": None,
            "callsite": "browse_ui:init_result_set",
            "has_top_pagelet": True,
            "display_params": {
                "crct": "none",
                "mrss": True
            },
        }
        # Merge in the cursor params from the previous page, if any.
        enc_q.update(self._next_page_params)
        return {
            'dpr': '1',
            'data': json.dumps(enc_q),
            '__user': self._fbuser_id,
            '__a': '1',
            '__be': '1',
            '__pc': 'PHASED:DEFAULT',
        }
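    # Editor's usage sketch (hypothetical values, not from the original gist):
    # for keyword 'football' on the first page, _search_keyword_payload() yields
    # roughly
    #   {'dpr': '1', 'data': '{"view": "list", "encoded_query": "..."}',
    #    '__user': '100003307297044', '__a': '1', '__be': '1', '__pc': 'PHASED:DEFAULT'}
    # and each later page additionally carries the cursor fields merged in by
    # enc_q.update(self._next_page_params).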

keyword = input('What industry do you want to find?: ')
keyword = keyword.strip()

crawler = FbUserListCrawler(keyword=keyword)
ids = crawler.crawl_now()
print(ids)

contact_crawler = FbBaseCrawler(
    email='thienkhiemx',
    password='xxx',
    users_fbid=ids
)
contact_crawler.crawl_now()