Scrape URLs from liked tweets and send them to a Notion page using the unofficial Notion API
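# Dependencies (assumed PyPI package names): tweepy (3.x, for API.favorites),
# beautifulsoup4, urlextract, notion (the unofficial notion-py client) and unshortenit.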
import os
import urllib.request
from bs4 import BeautifulSoup
import tweepy
from urlextract import URLExtract
from notion.client import NotionClient
from notion.block import TodoBlock, BookmarkBlock
from unshortenit import UnshortenIt
class GoldFermi:
    def __init__(self, twitter_details, notion_details, url_file, reverse=True, verbose=False):
        self.verbose = verbose
        if self.verbose:
            print("[*] Authenticate to Twitter")
        self.auth = tweepy.OAuthHandler(twitter_details["consumer_key"], twitter_details["consumer_secret"])
        self.auth.set_access_token(twitter_details["access_token"], twitter_details["access_token_secret"])
        self.twitter_api = tweepy.API(self.auth)
        self.twitter_screen_name = twitter_details["screen_name"]
        self.twitter_tweet_limit = twitter_details["tweet_limit"]
        if self.verbose:
            print("[*] Authenticate to Notion")
        self.notion_client = NotionClient(token_v2=notion_details["token_v2"])
        self.notion_page = self.notion_client.get_block(notion_details["notion_page"])
        if self.verbose:
            print("[*] Target Notion page {}".format(self.notion_page))
        self.url_file = url_file
        if self.verbose:
            print("[*] URL cache file {}".format(self.url_file))
        self.reverse = reverse
        if self.verbose and self.reverse:
            print("[*] Sending new URLs to the top of the Notion page")
    def get_liked_tweets(self, tweet_limit):
        if self.verbose:
            print("[*] Get last {} liked tweets for {}".format(tweet_limit, self.twitter_screen_name))
        # tweepy 3.x: API.favorites returns a user's liked tweets
        return tweepy.Cursor(self.twitter_api.favorites, id=self.twitter_screen_name).items(tweet_limit)
    def expand_tweet_text(self, tweet_id):
        # Fetch the full, untruncated tweet text (or the original tweet's text if it is a retweet)
        status = self.twitter_api.get_status(tweet_id, tweet_mode="extended")
        try:
            tweet_text = str(status.retweeted_status.full_text)
        except AttributeError:  # Not a retweet
            tweet_text = str(status.full_text)
        return tweet_text
    def extract_urls(self, text):
        extractor = URLExtract()
        return extractor.find_urls(text)
    def get_html_title(self, html):
        try:
            soup = BeautifulSoup(html, 'html.parser')
            return soup.find('title').string
        except Exception:
            return ""
    def get_redirect(self, url):
        if self.verbose:
            print("[-] Unshorten {}".format(url))
        unshortener = UnshortenIt(default_timeout=5)
        return unshortener.unshorten(url)
    def get_url_data(self, url):
        redirect_url = self.get_redirect(url)
        url_data = dict()
        try:
            opener = urllib.request.build_opener()
            hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}
            if self.verbose:
                print("[-] Visiting {}".format(redirect_url))
            request = urllib.request.Request(redirect_url, headers=hdr)
            u = opener.open(request)
            url_data["expanded_url"] = str(u.geturl())
            if self.verbose:
                print("[-] Expanded url is {}".format(url_data["expanded_url"]))
            url_data["html"] = u.read().decode()
            url_data["page_title"] = self.get_html_title(url_data["html"])
            if self.verbose:
                print("[-] Page title is {}".format(url_data["page_title"]))
            return url_data
        except Exception:
            # Unreachable or unparseable URL; the caller treats None as "skip this link"
            return None
    def save_link_to_notion_page(self, url, title, description):
        # Add a to-do block with the bookmark as a child, so each saved link can be ticked off
        new_todo = self.notion_page.children.add_new(TodoBlock, title=title)
        new_todo.children.add_new(BookmarkBlock, title=title, link=url, description=description)
        if self.reverse:
            new_todo.move_to(self.notion_page, "first-child")
    def url_exists(self, url):
        # Check the local cache file for the URL; record it if it is new
        if os.path.isfile(self.url_file):
            with open(self.url_file, "r") as url_file_fh:
                urls_in_file = url_file_fh.readlines()
                for url_in_file in urls_in_file:
                    if url == url_in_file.strip():
                        return True
        with open(self.url_file, "a+") as url_file_fh:
            url_file_fh.write("{}\n".format(url))
        return False
    def process_liked_tweets(self):
        for liked_tweet in self.get_liked_tweets(self.twitter_tweet_limit):
            liked_tweet_id = liked_tweet.id
            liked_tweet_text = self.expand_tweet_text(liked_tweet_id)
            liked_tweet_screen_name = liked_tweet.user.screen_name
            liked_tweet_description = "@{} - {}".format(liked_tweet_screen_name, liked_tweet_text)
            if self.verbose:
                print("[*] Tweet: {}".format(liked_tweet_description))
            for url in self.extract_urls(liked_tweet_text):
                try:
                    url_data = self.get_url_data(url)
                    if url_data is None:
                        continue
                    if self.verbose:
                        print("    [*] URL: {} - {}".format(url_data["expanded_url"], url_data["page_title"]))
                    if "twitter.com" in url_data["expanded_url"].lower():
                        if self.verbose:
                            print("    [*] Ignoring as it points to twitter.com")
                        continue
                    if self.url_exists(url_data["expanded_url"]):
                        if self.verbose:
                            print("    [*] Ignoring as already in cache")
                        continue
                    if self.verbose:
                        print("    [*] Adding to Notion")
                    self.save_link_to_notion_page(url_data["expanded_url"], url_data["page_title"], liked_tweet_description)
                except Exception:
                    # Skip URLs that cannot be fetched or saved
                    pass
if __name__ == '__main__':
    url_file_config = "urls.txt"
    twitter_details_config = {
        "consumer_key": "",
        "consumer_secret": "",
        "access_token": "",
        "access_token_secret": "",
        "screen_name": "@<twitter handle>",
        "tweet_limit": 10
    }
    notion_details_config = {
        "token_v2": "<notion cookie>",
        "notion_page": "https://www.notion.so/<page>"
    }
    g = GoldFermi(twitter_details_config, notion_details_config, url_file_config, verbose=True)
    g.process_liked_tweets()
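Usage note: the script keeps its state in urls.txt, so it can be run repeatedly (for example from a scheduler) without creating duplicate bookmarks. Below is a minimal sketch of driving it from environment variables instead of inline secrets; the module name gold_fermi.py and the environment variable names are illustrative assumptions, not part of the gist.

import os
from gold_fermi import GoldFermi  # assumes the file above is saved as gold_fermi.py

twitter_details_config = {
    "consumer_key": os.environ["TWITTER_CONSUMER_KEY"],
    "consumer_secret": os.environ["TWITTER_CONSUMER_SECRET"],
    "access_token": os.environ["TWITTER_ACCESS_TOKEN"],
    "access_token_secret": os.environ["TWITTER_ACCESS_TOKEN_SECRET"],
    "screen_name": os.environ["TWITTER_SCREEN_NAME"],
    "tweet_limit": int(os.environ.get("TWEET_LIMIT", "10")),
}
notion_details_config = {
    "token_v2": os.environ["NOTION_TOKEN_V2"],     # token_v2 cookie from a logged-in notion.so session
    "notion_page": os.environ["NOTION_PAGE_URL"],  # full URL of the target Notion page
}

GoldFermi(twitter_details_config, notion_details_config, "urls.txt", verbose=True).process_liked_tweets()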