marians · January 2, 2012 08:32
diff --git a/usenet_group_stats.py b/usenet_group_stats.py
 #!/usr/bin/env python
 # encoding: utf-8
 """
 	This script acquires statistics on usenet groups.
 	
 	It first reads a list of groups from one or more usenet servers
 	and then gets monthly post statistics about these groups from
 	Google Groups.
 """

 import sys
 import os
 import nntplib
 import urllib2
 import re
 import time
 from scrapemark import scrape

 # Path/filename of the cached group list (one "group<TAB>count" line per
 # group). It is created on the first run if it does not exist yet.
 GROUP_LIST_FILE = 'grouplist.csv'

 # Minimum number of appearances of a group in the collected server group
 # lists. Groups seen less often are skipped when post counts are fetched.
 MIN_GROUP_COUNT = 10

 # Path/filename of the cached results (one "group<TAB>month<TAB>count" line
 # per entry). It is created if it does not exist yet.
 RESULTS_FILE = 'postcount.csv'

 def get_servers():
 	"""
 	Scrape the newzbot.com server list page for NNTP server hostnames.

 	Returns a list of hostnames. The list is empty when the scrape yields
 	nothing usable, so callers can always iterate it or take its len().
 	"""
 	url = 'http://www.newzbot.com/serverlist.php?since=ALL&orderby=kps&sortorder=desc&show_maxgroup=on&show_post=on&show_kps=on&show_created=on&notable=1'
 	data = scrape("""
 	{*
 		<b>News server hostname:</b> <a href='/search.php?t=info&q={{ [id]|int }}'>{{ [name] }}</a>
 	*}
 	""", url=url)
 	# Guard against an empty/None scrape result or a missing 'name' key
 	# (e.g. if the page layout changed) instead of returning None.
 	if not data or 'name' not in data:
 		return []
 	return data['name']

 def get_groups_from_server(servername):
 	"""
 	Ask one NNTP server for its groups via the NEWGROUPS command.

 	servername -- hostname of the NNTP server to connect to.

 	Returns a list of group names, or None if the connection or the
 	command failed.
 	"""
 	try:
 		s = nntplib.NNTP(servername)
 	except Exception:
 		# Best effort: a server that cannot be reached is simply skipped.
 		# (Exception, not a bare except, so Ctrl-C still aborts the run.)
 		return None
 	try:
 		# Date/time strings in yymmdd / hhmmss form.
 		(_response, groups) = s.newgroups('000101', '000000')
 	except Exception:
 		return None
 	finally:
 		# Always release the connection, even when the command failed.
 		# A failing QUIT must not discard a result we already have.
 		try:
 			s.quit()
 		except Exception:
 			pass
 	# The group name is the first whitespace-separated field of each
 	# entry; blank entries are skipped rather than raising IndexError.
 	fields = (group.split() for group in groups)
 	return [parts[0] for parts in fields if parts]

 def get_group_stats_from_google(group):
 	"""
 	Fetch per-month post counts of a group from its Google Groups
 	"about" page.

 	group -- group name, inserted into the page URL as-is.

 	Returns a list of (month, count) string tuples as captured from the
 	month links on the page (empty if none are found), or None if the
 	page could not be fetched or read.
 	"""
 	url = "http://groups.google.com/group/%s/about?hl=en" % group
 	request = urllib2.Request(url, None, {'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"})
 	try:
 		handler = urllib2.urlopen(request)
 	except Exception:
 		# Best effort: an unavailable page just yields no statistics.
 		return None
 	try:
 		html = handler.read()
 	except Exception:
 		return None
 	finally:
 		# Release the HTTP connection whether or not the read succeeded.
 		handler.close()
 	# re.findall always returns a list (never None), so return it directly.
 	return re.findall(r'href="/group/[^/]+/browse_frm/month/([^\?]+)\?hl=en">([0-9]+)</', html)

 if __name__ == '__main__':
 	# Script flow:
 	#   1. build GROUP_LIST_FILE from the live NNTP servers (only if missing)
 	#   2. load the group list from that file
 	#   3. fetch per-group post counts into RESULTS_FILE (only if missing)
 	#   4. sum the post counts per month and print them
 	#
 	# print is called with a single pre-formatted argument so the output is
 	# identical to the former print statements and the syntax is valid on
 	# both Python 2 and Python 3.

 	# Maps group name -> number of times it was seen in the server lists.
 	groupdict = {}
 	if not os.path.exists(GROUP_LIST_FILE):
 		# get group lists
 		print("Reading NNTP server list...")
 		# Treat a missing server list (None) as empty instead of crashing.
 		nntp_servers = get_servers() or []
 		print("Got %d servers." % len(nntp_servers))
 		for server in nntp_servers:
 			print("Getting groups from %s ..." % server)
 			groups = get_groups_from_server(server)
 			if groups is not None:
 				print("... found %d groups." % len(groups))
 				for group in groups:
 					groupdict[group] = groupdict.get(group, 0) + 1
 		print(groupdict)
 		with open(GROUP_LIST_FILE, 'w') as f:
 			for group in groupdict:
 				f.write(group + "\t" + str(groupdict[group]) + "\n")

 	# read grouplist file
 	with open(GROUP_LIST_FILE) as f:
 		for line in f:
 			line = line.strip()
 			if line == "":
 				continue
 			parts = line.split("\t")
 			if len(parts) < 2:
 				# Malformed line without a count column: skip it.
 				continue
 			groupdict[parts[0]] = int(parts[1])

 	# get group stats from Google
 	if not os.path.exists(RESULTS_FILE):
 		with open(RESULTS_FILE, 'w') as f:
 			for group in groupdict:
 				if groupdict[group] < MIN_GROUP_COUNT:
 					continue
 				print("Getting post count for group %s ..." % group)
 				postcounts = get_group_stats_from_google(group)
 				if postcounts is not None:
 					for entry in postcounts:
 						f.write("%s\t%s\t%d\n" % (group, entry[0], int(entry[1])))
 				# One-second pause after every request (presumably to
 				# avoid hammering the remote site).
 				time.sleep(1)

 	# Sum the post counts of all groups per month (second column).
 	datedict = {}
 	with open(RESULTS_FILE, 'r') as f:
 		for line in f:
 			parts = line.strip().split("\t")
 			if len(parts) == 3:
 				datedict[parts[1]] = datedict.get(parts[1], 0) + int(parts[2])
 	# Sorted so the output order is deterministic; for "YYYY-MM"-style keys
 	# (assumed from the "-01" suffix) this is also chronological.
 	for datestring in sorted(datedict):
 		print(datestring + "-01\t" + str(datedict[datestring]))
	#!/usr/bin/env python
	# encoding: utf-8
	"""
	This script acquires statistics on usenet groups.

	It first reads a list of groups from one or more usenet servers
	and then gets monthly post statistics about these groups from
	Google Groups.
	"""

	import sys
	import os
	import nntplib
	import urllib2
	import re
	import time
	from scrapemark import scrape

	# Path/filename of the cached group list (one "group<TAB>count" line per
	# group). It is created on the first run if it does not exist yet.
	GROUP_LIST_FILE = 'grouplist.csv'

	# Minimum number of appearances of a group in the collected server group
	# lists. Groups seen less often are skipped when post counts are fetched.
	MIN_GROUP_COUNT = 10

	# Path/filename of the cached results (one "group<TAB>month<TAB>count" line
	# per entry). It is created if it does not exist yet.
	RESULTS_FILE = 'postcount.csv'

	def get_servers():
		"""
		Scrape the newzbot.com server list page for NNTP server hostnames.

		Returns a list of hostnames. The list is empty when the scrape yields
		nothing usable, so callers can always iterate it or take its len().
		"""
		url = 'http://www.newzbot.com/serverlist.php?since=ALL&orderby=kps&sortorder=desc&show_maxgroup=on&show_post=on&show_kps=on&show_created=on&notable=1'
		# The filter is written "[id]|int"; a stray backslash before the
		# pipe (an escaping artifact) has been removed from the pattern.
		data = scrape("""
		{*
			<b>News server hostname:</b> <a href='/search.php?t=info&q={{ [id]|int }}'>{{ [name] }}</a>
		*}
		""", url=url)
		# Guard against an empty/None scrape result or a missing 'name' key
		# (e.g. if the page layout changed) instead of returning None.
		if not data or 'name' not in data:
			return []
		return data['name']

	def get_groups_from_server(servername):
		"""
		Ask one NNTP server for its groups via the NEWGROUPS command.

		servername -- hostname of the NNTP server to connect to.

		Returns a list of group names, or None if the connection or the
		command failed.
		"""
		try:
			s = nntplib.NNTP(servername)
		except Exception:
			# Best effort: a server that cannot be reached is simply skipped.
			# (Exception, not a bare except, so Ctrl-C still aborts the run.)
			return None
		try:
			# Date/time strings in yymmdd / hhmmss form.
			(_response, groups) = s.newgroups('000101', '000000')
		except Exception:
			return None
		finally:
			# Always release the connection, even when the command failed.
			# A failing QUIT must not discard a result we already have.
			try:
				s.quit()
			except Exception:
				pass
		# The group name is the first whitespace-separated field of each
		# entry; blank entries are skipped rather than raising IndexError.
		fields = (group.split() for group in groups)
		return [parts[0] for parts in fields if parts]

	def get_group_stats_from_google(group):
		"""
		Fetch per-month post counts of a group from its Google Groups
		"about" page.

		group -- group name, inserted into the page URL as-is.

		Returns a list of (month, count) string tuples as captured from the
		month links on the page (empty if none are found), or None if the
		page could not be fetched or read.
		"""
		url = "http://groups.google.com/group/%s/about?hl=en" % group
		request = urllib2.Request(url, None, {'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"})
		try:
			handler = urllib2.urlopen(request)
		except Exception:
			# Best effort: an unavailable page just yields no statistics.
			return None
		try:
			html = handler.read()
		except Exception:
			return None
		finally:
			# Release the HTTP connection whether or not the read succeeded.
			handler.close()
		# re.findall always returns a list (never None), so return it directly.
		return re.findall(r'href="/group/[^/]+/browse_frm/month/([^\?]+)\?hl=en">([0-9]+)</', html)

	if __name__ == '__main__':
		# Script flow:
		#   1. build GROUP_LIST_FILE from the live NNTP servers (only if missing)
		#   2. load the group list from that file
		#   3. fetch per-group post counts into RESULTS_FILE (only if missing)
		#   4. sum the post counts per month and print them
		#
		# print is called with a single pre-formatted argument so the output is
		# identical to the former print statements and the syntax is valid on
		# both Python 2 and Python 3.

		# Maps group name -> number of times it was seen in the server lists.
		groupdict = {}
		if not os.path.exists(GROUP_LIST_FILE):
			# get group lists
			print("Reading NNTP server list...")
			# Treat a missing server list (None) as empty instead of crashing.
			nntp_servers = get_servers() or []
			print("Got %d servers." % len(nntp_servers))
			for server in nntp_servers:
				print("Getting groups from %s ..." % server)
				groups = get_groups_from_server(server)
				if groups is not None:
					print("... found %d groups." % len(groups))
					for group in groups:
						groupdict[group] = groupdict.get(group, 0) + 1
			print(groupdict)
			with open(GROUP_LIST_FILE, 'w') as f:
				for group in groupdict:
					f.write(group + "\t" + str(groupdict[group]) + "\n")

		# read grouplist file
		with open(GROUP_LIST_FILE) as f:
			for line in f:
				line = line.strip()
				if line == "":
					continue
				parts = line.split("\t")
				if len(parts) < 2:
					# Malformed line without a count column: skip it.
					continue
				groupdict[parts[0]] = int(parts[1])

		# get group stats from Google
		if not os.path.exists(RESULTS_FILE):
			with open(RESULTS_FILE, 'w') as f:
				for group in groupdict:
					if groupdict[group] < MIN_GROUP_COUNT:
						continue
					print("Getting post count for group %s ..." % group)
					postcounts = get_group_stats_from_google(group)
					if postcounts is not None:
						for entry in postcounts:
							f.write("%s\t%s\t%d\n" % (group, entry[0], int(entry[1])))
					# One-second pause after every request (presumably to
					# avoid hammering the remote site).
					time.sleep(1)

		# Sum the post counts of all groups per month (second column).
		datedict = {}
		with open(RESULTS_FILE, 'r') as f:
			for line in f:
				parts = line.strip().split("\t")
				if len(parts) == 3:
					datedict[parts[1]] = datedict.get(parts[1], 0) + int(parts[2])
		# Sorted so the output order is deterministic; for "YYYY-MM"-style keys
		# (assumed from the "-01" suffix) this is also chronological.
		for datestring in sorted(datedict):
			print(datestring + "-01\t" + str(datedict[datestring]))