Last active
October 22, 2018 16:04
-
-
Save nokados/ba1d5091a00dd7cedf24364470c987d6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json, requests, time | |
import datetime | |
import pickle | |
from collections import Counter | |
from pyquery import PyQuery | |
from readability import Document | |
from lxml.etree import XMLSyntaxError, LxmlError | |
from readability.readability import Unparseable | |
from requests.adapters import MaxRetryError | |
from requests.exceptions import ConnectionError | |
from urllib3.exceptions import NewConnectionError | |
from urllib.parse import urlparse | |
from threading import Thread, Semaphore | |
import http.client as httplib, sys | |
from queue import Queue | |
import sqlite3 | |
import os | |
def get_posts_fast_api(post_urllist, concurrent=70, timeout=30):
    """Download stories concurrently and save their readable text to disk.

    Each story URL is fetched with a plain ``http.client`` GET request
    (redirects are not followed). For responses with status 200 the main
    article text is extracted with readability + PyQuery and written to
    ``posts/<story_id>.txt``.

    Args:
        post_urllist: Mapping of ``story_id`` to a story mapping that has a
            ``'url'`` key.
        concurrent: Number of worker threads; the work queue holds at most
            twice that many pending stories.
        timeout: Socket timeout in seconds for every connection, so a
            stalled server cannot block a worker forever.

    Returns:
        A ``(posts, errors, checked_urls)`` tuple. ``posts`` is always an
        empty dict (the text goes to disk, not into memory), ``errors`` lists
        the URLs whose download failed, and ``checked_urls`` lists every URL
        that was attempted.

    Raises:
        ValueError: If ``concurrent`` is smaller than 1.
    """
    if concurrent < 1:
        raise ValueError('concurrent must be a positive integer')

    posts = {}
    errors = []
    checked_urls = []
    # Used as a mutex: only one thread writes a post file at a time.
    sem = Semaphore()
    q = Queue(concurrent * 2)

    # Without the directory every write (and the final listdir) would fail.
    os.makedirs('posts', exist_ok=True)

    def get_status(ourl):
        """Fetch ``ourl``; return ``(status, body)`` or ``("error", None)``."""
        conn = None
        try:
            url = urlparse(ourl)
            if url.scheme == 'https':
                conn = httplib.HTTPSConnection(url.netloc, timeout=timeout)
            elif url.scheme == 'http':
                conn = httplib.HTTPConnection(url.netloc, timeout=timeout)
            else:
                raise Exception('Incorrect protocol: {}'.format(url.scheme))
            # An empty path is not a valid request target, and the query
            # string is part of the resource being requested.
            target = url.path or '/'
            if url.query:
                target = '{}?{}'.format(target, url.query)
            conn.request("GET", target)
            res = conn.getresponse()
            return res.status, res.read()
        except Exception as e:
            # Best effort: any failure is recorded and the crawl continues.
            print(e, ourl)
            errors.append(ourl)
            return "error", None
        finally:
            if conn is not None:
                conn.close()

    def try_add_post(story_id, data):
        """Extract the article text from ``data`` and write it to disk."""
        try:
            text = PyQuery(Document(data).summary()).text()
        except (Unparseable, LxmlError):
            print('{} has incorrect xml'.format(story_id))
            # Nothing was extracted, so there is no text to inspect or save.
            return
        if not text:
            print('ERR: Could not get the post({}) text'.format(story_id))
            return
        with sem:
            # Explicit encoding: the platform default may not be able to
            # represent every character of a scraped page.
            with open('posts/{}.txt'.format(story_id), 'w',
                      encoding='utf-8') as f:
                f.write(text)

    def do_work():
        """Worker loop: take stories off the queue until the process exits."""
        while True:
            story_id, story = q.get()
            try:
                url = story['url']
                checked_urls.append(url)
                status, data = get_status(url)
                if status == 200:
                    try_add_post(story_id, data)
            except Exception as e:
                # One bad story must not kill the worker thread.
                print(e, story_id)
            finally:
                # Always acknowledge the item, otherwise q.join() never
                # returns.
                q.task_done()

    print('Started at {}'.format(time.strftime('%X %x')))
    for _ in range(concurrent):
        t = Thread(target=do_work)
        t.daemon = True
        t.start()
    try:
        for story_id, story in post_urllist.items():
            q.put((story_id, story))
        q.join()
    except KeyboardInterrupt:
        print('Keyboard interrupt')
    print('Finished at {}'.format(time.strftime('%X %x')))
    # Counts every entry in the directory, including files from earlier runs.
    num_posts = len(os.listdir('posts'))
    print('Got {} posts'.format(num_posts))
    return posts, errors, checked_urls
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment