import sys
import google
import time
import random
from bs4 import BeautifulSoup
import urllib.request
import http.cookiejar
from selenium import webdriver
import subprocess
import blessings
from operator import itemgetter
from readability.readability import Document
from urllib.parse import urlparse
from titlecase import titlecase
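
# Third-party dependencies assumed by the imports above (PyPI names are a
# best guess for this 2016-era script):
#   pip install google beautifulsoup4 selenium blessings readability-lxml titlecase lxml
# selenium also needs a local Firefox install (plus geckodriver on newer setups).
# random, urllib.request, http.cookiejar, and subprocess are only used by
# commented-out or planned code paths below.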

browser = webdriver.Firefox()
# import Resource

topic = input("Topic: ")
domain_blacklist = set()
t = blessings.Terminal()

# link_type = "resource"  # default for now; could be others in future
# get type of link listing from user (general resource; recent news [time period])
resource_words = ["", "tutorial", "tools", "resources", "library", "app"]  # what else?
url_set = set()
for phrase in resource_words:
    # Join topic and qualifier with a space (the original concatenated them
    # directly, producing e.g. "pythontutorial"); strip() handles the empty
    # qualifier at the head of the list.
    search_phrase = (topic + " " + phrase).strip()
    results = google.search(search_phrase, num=10, start=0, stop=25, pause=2.0)
    for url in results:
        url_set.add(url)
url_list = list(url_set)
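
# (Assumption: `google` here is the 2016-era "google" PyPI package, whose
# search() yields result URLs as a generator; pause=2.0 throttles requests.
# The set above dedupes hits that appear under more than one search phrase.)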

lnk_list = []

cat_msg = """Category
0 - DO NOT USE
1 - Tutorials
2 - Additional Learning Materials
3 - Reference
4 - Online Tools
5 - Local Tools
6 - Community
7 - Books
99 - Scrape links
"""

current_count = 1
for url in url_list:
    try:
        # print(url)  # just checking....
        print(str(current_count) + " of " + str(len(url_list)))
        print(url)
        current_count += 1
        # Skip known-noisy domains and anything already blacklisted.
        if ("youtube" in url) or ("amazon" in url) or ("wikipedia" in url) or (urlparse(url).hostname in domain_blacklist):
            print("NOPE")
            continue
        browser.get(url)
        lnk = {
            'href': browser.current_url,  # may differ from url after redirects
            # 'content': BeautifulSoup(urllib.request.urlopen(url).read(), 'html.parser')
        }
        print(lnk['href'])
        category_input = input(cat_msg)
        if not category_input:
            category_input = 0
        # Readability pulls the page title and the main article content.
        html_source = browser.page_source
        possible_title = Document(html_source).title()
        main_content = Document(html_source).summary()
        content_soup = BeautifulSoup(main_content, 'lxml')
        if int(category_input) == 99:
            # Scrape mode: harvest outbound links from the page's main content
            # and queue unseen ones for later review.
            print("Fetching links...")
            new_link_counter = 0
            try:
                for link in content_soup.find_all('a'):
                    try:
                        new_url = link.get('href')
                        print(new_url)
                        time.sleep(0.1)
                        if ("http" in new_url) and (new_url not in url_set) and (urlparse(new_url).hostname not in domain_blacklist):
                            url_set.add(new_url)
                            url_list.append(new_url)
                            print(t.blue("ADDED " + new_url))
                            new_link_counter += 1
                        else:
                            print(t.red("NOPE"))
                    except Exception:
                        print(t.red("Something happened. New URL not added."))
                        continue
                print(t.blue(str(new_link_counter) + " links added.\n Total Links: " + str(len(url_list))))
                time.sleep(0.3)  # was t.sleep(); blessings' Terminal has no sleep()
            except Exception:
                print("Something happened. No links added.")
                print(sys.exc_info()[0])
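            # (A possible refinement, not in the original: resolve relative
            # hrefs with urllib.parse.urljoin(browser.current_url, href) before
            # the "http" filter above, so same-site relative links are kept.)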
            # Ask again now that scraping is done, so the page itself can
            # still be categorized.
            category_input = input(cat_msg)
        if (not category_input) or (int(category_input) == 99):
            lnk['use'] = False
            if not category_input:
                if input("Add to blacklist?"):
                    domain_blacklist.add(urlparse(url).hostname)
            continue
        # Keep prompting until the category is a valid integer (the original
        # stored the raw retry input as a string, which broke the integer
        # comparison in the final printout).
        while True:
            try:
                lnk['cat'] = int(category_input)
                break
            except ValueError:
                category_input = input("Category must be a number: ")
        while 'quality' not in lnk:
            try:
                lnk['quality'] = int(input("Quality (0-9): "))
            except ValueError:
                continue
print("Possible title: " + possible_title) | |
truncate_at = input("TRUNCATE: ") | |
if not truncate_at: | |
change_title = input("CHANGE: ") | |
if not change_title: | |
lnk['title'] = possible_title | |
else: | |
lnk['title'] = change_title | |
else: | |
lnk['title'] = possible_title.split(truncate_at, 1)[0] | |
print("CURRENT TITLE TO USE: " + lnk['title']) | |
if input("Capitalize it?"): | |
try: | |
lnk['title'] = titlecase(lnk['title']) | |
except: | |
print("Something happened. Not capitalized.") | |
print(sys.exc_info()[0]) | |
print("TITLE TO USE: " + lnk['title']) | |
lnk['desc'] = input("Description: ") | |
link_string = " - [{title}]({href}) {desc}\n".format(**lnk) | |
lnk['string'] = link_string | |
print(t.yellow(lnk['string'])) | |
lnk_list.append(lnk) | |
    except Exception:
        continue

# print(str(url_list))
# print(str(url_set))
# YouTube

# Sort best-quality links first, then print a Markdown section per category.
lnk_list = sorted(lnk_list, key=itemgetter('quality'), reverse=True)
print("-------\n\n")
for i in range(1, 12):  # leaves headroom beyond the 7 categories defined above
    print("## " + str(i) + "\n\n")
    for lnk in lnk_list:
        if lnk['cat'] == i:
            print(lnk['string'])
    print("\n")

# print("Closing browser...")
# browser.quit()
# print("...browser closed.")