Created
January 2, 2012 08:32
-
-
Save marians/1549876 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
""" | |
This script acquires statistics on Usenet groups.
It first reads a list of groups from one or more Usenet servers
and then gets monthly post statistics about these groups from
Google Groups.
""" | |
import sys | |
import os | |
import nntplib | |
import urllib2 | |
import re | |
import time | |
from scrapemark import scrape | |
# Path/filename of the cached group list. Written on the first run if it does
# not exist yet; one line per group: "<group name>\t<times seen>".
GROUP_LIST_FILE = 'grouplist.csv'

# Minimum number of appearances. A group that was seen less frequently than
# this in the server listings is not looked up on Google Groups.
MIN_GROUP_COUNT = 10

# Path/filename of the cached results. Written if it does not exist yet;
# one line per group and month: "<group name>\t<month>\t<post count>".
RESULTS_FILE = 'postcount.csv'
def get_servers():
    """Return the NNTP server hostnames listed on newzbot.com.

    The server list page is fetched and parsed by scrapemark's ``scrape``
    using a repeating pattern that captures each server's id and hostname.

    Returns:
        A list of hostnames. The list is empty when the page produced no
        usable result, so callers can always take ``len()`` of it or
        iterate over it.
    """
    # NOTE: the last query parameter is ``notable``. When this URL passes
    # through HTML rendering, ``&not`` is easily mangled into the "¬"
    # character, which silently drops the parameter.
    url = ('http://www.newzbot.com/serverlist.php?since=ALL&orderby=kps'
           '&sortorder=desc&show_maxgroup=on&show_post=on&show_kps=on'
           '&show_created=on&notable=1')
    data = scrape("""
        {*
        <b>News server hostname:</b> <a href='/search.php?t=info&q={{ [id]|int }}'>{{ [name] }}</a>
        *}
        """, url=url)
    # Defensive: presumably scrape() yields nothing usable when the pattern
    # does not match -- TODO confirm against the scrapemark documentation.
    if data and 'name' in data:
        return data['name']
    return []
def get_groups_from_server(servername):
    """Ask one NNTP server for its newsgroups and return their names.

    Args:
        servername: Hostname of the NNTP server to connect to.

    Returns:
        A list with the first whitespace-separated field (the group name)
        of every entry returned by the NEWGROUPS command, or ``None`` when
        the connection or the command failed.
    """
    # ``except Exception`` rather than a bare ``except`` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit still abort this long-running script.
    try:
        s = nntplib.NNTP(servername)
    except Exception:
        return None
    try:
        # Date and time are passed as strings, as the Python 2 nntplib API
        # expects ('yymmdd' / 'hhmmss' -- presumably "since 2000-01-01").
        (response, groups) = s.newgroups('000101', '000000')
    except Exception:
        return None
    finally:
        # Always release the connection, including when NEWGROUPS failed.
        # A failing QUIT must not discard a result we already have.
        try:
            s.quit()
        except Exception:
            pass
    ret = []
    for group in groups:
        parts = group.split()
        if parts:  # skip blank entries instead of raising IndexError
            ret.append(parts[0])
    return ret
def get_group_stats_from_google(group):
    """Fetch the monthly post counts of a group from its Google Groups page.

    Args:
        group: Name of the group; it is inserted into the "about" page URL
            as-is.

    Returns:
        A list of ``(month, count)`` string tuples, one per month link found
        in the page (empty when the page contains none), or ``None`` when
        the page could not be fetched.
    """
    url = "http://groups.google.com/group/%s/about?hl=en" % group
    # A browser-like User-Agent header is sent along with the request.
    request = urllib2.Request(url, None, {'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"})
    # ``except Exception`` rather than a bare ``except`` so that Ctrl-C
    # (KeyboardInterrupt) still aborts the script.
    try:
        handler = urllib2.urlopen(request)
    except Exception:
        return None
    try:
        html = handler.read()
    except Exception:
        # A failure while reading the body is treated like a failed fetch.
        return None
    finally:
        handler.close()
    # re.findall always returns a list, so the result can be returned directly.
    return re.findall(r'href="/group/[^/]+/browse_frm/month/([^\?]+)\?hl=en">([0-9]+)</', html)
# NOTE: every print below is written as ``print(<single expression>)`` so it
# produces the same output as the Python 2 print statement.


def _collect_group_counts():
    """Query every listed NNTP server and count how often each group is seen."""
    counts = {}
    print("Reading NNTP server list...")
    # get_servers() may yield None when the list could not be scraped.
    nntp_servers = get_servers() or []
    print("Got %d servers." % len(nntp_servers))
    for server in nntp_servers:
        print("Getting groups from %s ..." % server)
        groups = get_groups_from_server(server)
        if groups is None:
            continue
        print("... found %d groups." % len(groups))
        for group in groups:
            counts[group] = counts.get(group, 0) + 1
    return counts


def _write_group_list(counts, path):
    """Write one "<group>\\t<count>" line per group to *path*."""
    with open(path, 'w') as f:
        for group in counts:
            f.write(group + "\t" + str(counts[group]) + "\n")


def _read_group_list(path):
    """Read the "<group>\\t<count>" lines of *path* into a dict."""
    counts = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line == "":
                continue
            parts = line.split("\t")
            if len(parts) < 2:
                continue  # not a "<group>\t<count>" line
            counts[parts[0]] = int(parts[1])
    return counts


def _fetch_post_counts(counts, path):
    """Write the monthly post counts of all frequent-enough groups to *path*."""
    with open(path, 'w') as f:
        for group in counts:
            if counts[group] < MIN_GROUP_COUNT:
                continue
            print("Getting post count for group %s ..." % group)
            postcounts = get_group_stats_from_google(group)
            if postcounts is not None:
                for entry in postcounts:
                    f.write("%s\t%s\t%d\n" % (group, entry[0], int(entry[1])))
            # Pause for one second after every request to Google.
            time.sleep(1)


def _sum_posts_by_month(path):
    """Add up the post counts in *path* per month string, across all groups."""
    totals = {}
    with open(path, 'r') as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) == 3:
                totals[parts[1]] = totals.get(parts[1], 0) + int(parts[2])
    return totals


if __name__ == '__main__':
    groupdict = {}
    if not os.path.exists(GROUP_LIST_FILE):
        # get group lists
        groupdict = _collect_group_counts()
        print(groupdict)
        _write_group_list(groupdict, GROUP_LIST_FILE)
    # read grouplist file (also on the first run, right after writing it)
    groupdict.update(_read_group_list(GROUP_LIST_FILE))
    # get group stats from Google, unless a results file already exists
    if not os.path.exists(RESULTS_FILE):
        _fetch_post_counts(groupdict, RESULTS_FILE)
    datedict = _sum_posts_by_month(RESULTS_FILE)
    # Sorted so the output order is deterministic.
    for datestring in sorted(datedict):
        print("%s-01\t%s" % (datestring, datedict[datestring]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment