jflanaga · May 20, 2019 06:43
diff --git a/hashtag_cooccurrence.py b/hashtag_cooccurrence.py
 #!/usr/bin/env python
 # adapted from https://github.com/derekgreene/twitter-jsonl-tools

 import argparse
 import codecs
 import fileinput
 import itertools
 import logging
 import operator
 import ujson as json

 from collections import defaultdict
 from prettytable import PrettyTable


 def _extract_hashtags(tweet, excluded="brexit"):
     """Return the set of normalised hashtags carried by one tweet object.

     For a retweet (a truthy ``retweeted_status`` entry) the hashtags are read
     from the retweeted status; otherwise they are read from the tweet's own
     ``entities``. Each tag is lower-cased, stripped and prefixed with ``#``.
     Tags whose lower-cased text equals ``excluded`` are dropped.

     Raises whatever the lookups raise (``KeyError``, ``TypeError``, ...) when
     the object does not have the expected structure; the caller treats that
     as a failed line.
     """
     if 'retweeted_status' in tweet and tweet['retweeted_status']:
         entities = tweet['retweeted_status']['entities']
     elif 'entities' in tweet:
         entities = tweet['entities']
     else:
         return set()
     if 'hashtags' not in entities:
         return set()

     tags = set()
     for tag in entities['hashtags']:
         text = tag['text'].lower()
         if text != excluded:
             tags.add("#" + text.strip())
     return tags


 def _count_file_pairs(path, pair_counts):
     """Add the hashtag pairs found in one JSON-lines file to ``pair_counts``.

     ``pair_counts`` maps an alphabetically ordered ``(tag_a, tag_b)`` tuple to
     the number of tweets in which both tags occur; it is updated in place.

     Returns a ``(num_tweets, num_multiple, num_failed)`` tuple: lines parsed
     successfully, how many of those had more than one hashtag, and lines that
     could not be processed.
     """
     num_tweets = num_multiple = num_failed = 0
     # The context manager closes the input even if reading the file fails.
     with fileinput.input(path) as lines:
         for line_number, line in enumerate(lines, start=1):
             try:
                 # Sorting gives every pair one canonical orientation and
                 # makes the pair order independent of set iteration order.
                 tags = sorted(_extract_hashtags(json.loads(line)))
             except Exception as e:
                 # Deliberately broad: one malformed line must not abort a
                 # long run; it is logged and counted instead.
                 logging.error("Failed to parse tweet on line %d: %s",
                               line_number, e)
                 num_failed += 1
                 continue

             if len(tags) > 1:
                 num_multiple += 1
                 for pair in itertools.combinations(tags, 2):
                     pair_counts[pair] += 1
             num_tweets += 1
             if line_number % 50000 == 0:
                 logging.info("Processed %d lines", line_number)
     return num_tweets, num_multiple, num_failed


 def main():
     """Count co-occurring hashtag pairs in JSON-lines tweet files.

     Command-line arguments: any number of input FILEs, ``--outpath`` (where
     the tab-separated pair counts are written) and ``--top`` (how many of the
     most frequent pairs to print as a table via the logger).
     """
     parser = argparse.ArgumentParser()
     parser.add_argument('files',
                         metavar='FILE',
                         nargs='*',
                         help='files to read')
     parser.add_argument('--outpath',
                         default='hashtag_cooccurrences.csv',
                         help='output path for csv file')
     parser.add_argument('--top',
                         type=int,
                         default=10,
                         help='number of co-occurrences (default: 10)')
     args = parser.parse_args()

     logging.basicConfig(level=logging.INFO, format='%(message)s')

     pair_counts = defaultdict(int)
     for path in args.files:
         logging.info("Loading tweets from %s ...", path)
         num_tweets, num_multiple, num_failed = _count_file_pairs(
             path, pair_counts)
         logging.info("Processed %d tweets from file", num_tweets)
         logging.info("%d/%d tweets in file contained more than one hashtag",
                      num_multiple, num_tweets)
         if num_failed:
             logging.info("%d lines in file could not be parsed", num_failed)
     logging.info("Total of %d unique pairs of hashtags", len(pair_counts))

     # Output pairs; the with-block closes the file even if a write fails.
     logging.info("Writing pairs to %s ...", args.outpath)
     with codecs.open(args.outpath, "w", encoding="utf-8",
                      errors="ignore") as fout:
         # Header previously read "Hastag2" (typo).
         fout.write("Hashtag1\tHashtag2\tCount\n")
         for (first, second), count in pair_counts.items():
             fout.write("%s\t%s\t%d\n" % (first, second, count))

     # Display top counts. Slicing shows exactly `top` rows; the previous
     # `i > top` test let one extra row through.
     ranked = sorted(pair_counts.items(),
                     key=operator.itemgetter(1),
                     reverse=True)
     shown = ranked[:max(args.top, 0)]
     logging.info("Top %d co-occurring hashtag pairs:", len(shown))
     tab = PrettyTable(["Hashtag1", "Hashtag2", "Count"])
     tab.align["Hashtag1"] = "l"
     tab.align["Hashtag2"] = "l"
     tab.align["Count"] = "r"
     for (first, second), count in shown:
         tab.add_row([first, second, count])
     logging.info(tab)


 # Run the command-line tool only when this file is executed as a script.
 if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	# adapted from https://github.com/derekgreene/twitter-jsonl-tools

	import argparse
	import codecs
	import fileinput
	import itertools
	import logging
	import operator
	import ujson as json

	from collections import defaultdict
	from prettytable import PrettyTable


	def _extract_hashtags(tweet, excluded="brexit"):
	    """Return the set of normalised hashtags carried by one tweet object.

	    For a retweet (a truthy ``retweeted_status`` entry) the hashtags are read
	    from the retweeted status; otherwise they are read from the tweet's own
	    ``entities``. Each tag is lower-cased, stripped and prefixed with ``#``.
	    Tags whose lower-cased text equals ``excluded`` are dropped.

	    Raises whatever the lookups raise (``KeyError``, ``TypeError``, ...) when
	    the object does not have the expected structure; the caller treats that
	    as a failed line.
	    """
	    if 'retweeted_status' in tweet and tweet['retweeted_status']:
	        entities = tweet['retweeted_status']['entities']
	    elif 'entities' in tweet:
	        entities = tweet['entities']
	    else:
	        return set()
	    if 'hashtags' not in entities:
	        return set()

	    tags = set()
	    for tag in entities['hashtags']:
	        text = tag['text'].lower()
	        if text != excluded:
	            tags.add("#" + text.strip())
	    return tags


	def _count_file_pairs(path, pair_counts):
	    """Add the hashtag pairs found in one JSON-lines file to ``pair_counts``.

	    ``pair_counts`` maps an alphabetically ordered ``(tag_a, tag_b)`` tuple to
	    the number of tweets in which both tags occur; it is updated in place.

	    Returns a ``(num_tweets, num_multiple, num_failed)`` tuple: lines parsed
	    successfully, how many of those had more than one hashtag, and lines that
	    could not be processed.
	    """
	    num_tweets = num_multiple = num_failed = 0
	    # The context manager closes the input even if reading the file fails.
	    with fileinput.input(path) as lines:
	        for line_number, line in enumerate(lines, start=1):
	            try:
	                # Sorting gives every pair one canonical orientation and
	                # makes the pair order independent of set iteration order.
	                tags = sorted(_extract_hashtags(json.loads(line)))
	            except Exception as e:
	                # Deliberately broad: one malformed line must not abort a
	                # long run; it is logged and counted instead.
	                logging.error("Failed to parse tweet on line %d: %s",
	                              line_number, e)
	                num_failed += 1
	                continue

	            if len(tags) > 1:
	                num_multiple += 1
	                for pair in itertools.combinations(tags, 2):
	                    pair_counts[pair] += 1
	            num_tweets += 1
	            if line_number % 50000 == 0:
	                logging.info("Processed %d lines", line_number)
	    return num_tweets, num_multiple, num_failed


	def main():
	    """Count co-occurring hashtag pairs in JSON-lines tweet files.

	    Command-line arguments: any number of input FILEs, ``--outpath`` (where
	    the tab-separated pair counts are written) and ``--top`` (how many of the
	    most frequent pairs to print as a table via the logger).
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument('files',
	                        metavar='FILE',
	                        nargs='*',
	                        help='files to read')
	    parser.add_argument('--outpath',
	                        default='hashtag_cooccurrences.csv',
	                        help='output path for csv file')
	    parser.add_argument('--top',
	                        type=int,
	                        default=10,
	                        help='number of co-occurrences (default: 10)')
	    args = parser.parse_args()

	    logging.basicConfig(level=logging.INFO, format='%(message)s')

	    pair_counts = defaultdict(int)
	    for path in args.files:
	        logging.info("Loading tweets from %s ...", path)
	        num_tweets, num_multiple, num_failed = _count_file_pairs(
	            path, pair_counts)
	        logging.info("Processed %d tweets from file", num_tweets)
	        logging.info("%d/%d tweets in file contained more than one hashtag",
	                     num_multiple, num_tweets)
	        if num_failed:
	            logging.info("%d lines in file could not be parsed", num_failed)
	    logging.info("Total of %d unique pairs of hashtags", len(pair_counts))

	    # Output pairs; the with-block closes the file even if a write fails.
	    logging.info("Writing pairs to %s ...", args.outpath)
	    with codecs.open(args.outpath, "w", encoding="utf-8",
	                     errors="ignore") as fout:
	        # Header previously read "Hastag2" (typo).
	        fout.write("Hashtag1\tHashtag2\tCount\n")
	        for (first, second), count in pair_counts.items():
	            fout.write("%s\t%s\t%d\n" % (first, second, count))

	    # Display top counts. Slicing shows exactly `top` rows; the previous
	    # `i > top` test let one extra row through.
	    ranked = sorted(pair_counts.items(),
	                    key=operator.itemgetter(1),
	                    reverse=True)
	    shown = ranked[:max(args.top, 0)]
	    logging.info("Top %d co-occurring hashtag pairs:", len(shown))
	    tab = PrettyTable(["Hashtag1", "Hashtag2", "Count"])
	    tab.align["Hashtag1"] = "l"
	    tab.align["Hashtag2"] = "l"
	    tab.align["Count"] = "r"
	    for (first, second), count in shown:
	        tab.add_row([first, second, count])
	    logging.info(tab)


	# Run the command-line tool only when this file is executed as a script.
	if __name__ == '__main__':
	    main()