import sys
import google
import time
import random
from bs4 import BeautifulSoup
import urllib.request
import http.cookiejar
from selenium import webdriver
import subprocess
import blessings
from operator import itemgetter
from readability.readability import Document
from urllib.parse import urlparse
from titlecase import titlecase
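
# Third-party dependencies assumed by the imports above (PyPI names are a
# best guess for this 2016-era script):
#   pip install google beautifulsoup4 selenium blessings readability-lxml titlecase lxml
# selenium also needs a local Firefox install (plus geckodriver on newer setups).
# random, urllib.request, http.cookiejar, and subprocess are only used by
# commented-out or planned code paths below.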

browser = webdriver.Firefox()
# import Resource

topic = input("Topic: ")
domain_blacklist = set()
t = blessings.Terminal()

# link_type = "resource"  # default for now; could be others in future
# get type of link listing from user (general resource; recent news [time period])
resource_words = ["", "tutorial", "tools", "resources", "library", "app"]  # what else?
url_set = set()
for phrase in resource_words:
    # Join topic and qualifier with a space (the original concatenated them
    # directly, producing e.g. "pythontutorial"); strip() handles the empty
    # qualifier at the head of the list.
    search_phrase = (topic + " " + phrase).strip()
    results = google.search(search_phrase, num=10, start=0, stop=25, pause=2.0)
    for url in results:
        url_set.add(url)
url_list = list(url_set)
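
# (Assumption: `google` here is the 2016-era "google" PyPI package, whose
# search() yields result URLs as a generator; pause=2.0 throttles requests.
# The set above dedupes hits that appear under more than one search phrase.)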

lnk_list = []

cat_msg = """Category
0 - DO NOT USE
1 - Tutorials
2 - Additional Learning Materials
3 - Reference
4 - Online Tools
5 - Local Tools
6 - Community
7 - Books
99 - Scrape links
"""

current_count = 1
for url in url_list:
    try:
        # print(url)  # just checking....
        print(str(current_count) + " of " + str(len(url_list)))
        print(url)
        current_count += 1
        # Skip known-noisy domains and anything already blacklisted.
        if ("youtube" in url) or ("amazon" in url) or ("wikipedia" in url) or (urlparse(url).hostname in domain_blacklist):
            print("NOPE")
            continue
        browser.get(url)
        lnk = {
            'href': browser.current_url,  # may differ from url after redirects
            # 'content': BeautifulSoup(urllib.request.urlopen(url).read(), 'html.parser')
        }
        print(lnk['href'])
        category_input = input(cat_msg)
        if not category_input:
            category_input = 0
        # Readability pulls the page title and the main article content.
        html_source = browser.page_source
        possible_title = Document(html_source).title()
        main_content = Document(html_source).summary()
        content_soup = BeautifulSoup(main_content, 'lxml')
        if int(category_input) == 99:
            # Scrape mode: harvest outbound links from the page's main content
            # and queue unseen ones for later review.
            print("Fetching links...")
            new_link_counter = 0
            try:
                for link in content_soup.find_all('a'):
                    try:
                        new_url = link.get('href')
                        print(new_url)
                        time.sleep(0.1)
                        if ("http" in new_url) and (new_url not in url_set) and (urlparse(new_url).hostname not in domain_blacklist):
                            url_set.add(new_url)
                            url_list.append(new_url)
                            print(t.blue("ADDED " + new_url))
                            new_link_counter += 1
                        else:
                            print(t.red("NOPE"))
                    except Exception:
                        print(t.red("Something happened. New URL not added."))
                        continue
                print(t.blue(str(new_link_counter) + " links added.\n Total Links: " + str(len(url_list))))
                time.sleep(0.3)  # was t.sleep(); blessings' Terminal has no sleep()
            except Exception:
                print("Something happened. No links added.")
                print(sys.exc_info()[0])
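            # (A possible refinement, not in the original: resolve relative
            # hrefs with urllib.parse.urljoin(browser.current_url, href) before
            # the "http" filter above, so same-site relative links are kept.)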
            # Ask again now that scraping is done, so the page itself can
            # still be categorized.
            category_input = input(cat_msg)
        if (not category_input) or (int(category_input) == 99):
            lnk['use'] = False
            if not category_input:
                if input("Add to blacklist?"):
                    domain_blacklist.add(urlparse(url).hostname)
            continue
        # Keep prompting until the category is a valid integer (the original
        # stored the raw retry input as a string, which broke the integer
        # comparison in the final printout).
        while True:
            try:
                lnk['cat'] = int(category_input)
                break
            except ValueError:
                category_input = input("Category must be a number: ")
        while 'quality' not in lnk:
            try:
                lnk['quality'] = int(input("Quality (0-9): "))
            except ValueError:
                continue
print("Possible title: " + possible_title) | |
truncate_at = input("TRUNCATE: ") | |
if not truncate_at: | |
change_title = input("CHANGE: ") | |
if not change_title: | |
lnk['title'] = possible_title | |
else: | |
lnk['title'] = change_title | |
else: | |
lnk['title'] = possible_title.split(truncate_at, 1)[0] | |
print("CURRENT TITLE TO USE: " + lnk['title']) | |
if input("Capitalize it?"): | |
try: | |
lnk['title'] = titlecase(lnk['title']) | |
except: | |
print("Something happened. Not capitalized.") | |
print(sys.exc_info()[0]) | |
print("TITLE TO USE: " + lnk['title']) | |
lnk['desc'] = input("Description: ") | |
link_string = " - [{title}]({href}) {desc}\n".format(**lnk) | |
lnk['string'] = link_string | |
print(t.yellow(lnk['string'])) | |
lnk_list.append(lnk) | |
    except Exception:
        continue

# print(str(url_list))
# print(str(url_set))
# YouTube

# Sort best-quality links first, then print a Markdown section per category.
lnk_list = sorted(lnk_list, key=itemgetter('quality'), reverse=True)
print("-------\n\n")
for i in range(1, 12):  # leaves headroom beyond the 7 categories defined above
    print("## " + str(i) + "\n\n")
    for lnk in lnk_list:
        if lnk['cat'] == i:
            print(lnk['string'])
    print("\n")

# print("Closing browser...")
# browser.quit()
# print("...browser closed.")