Last active
May 20, 2019 06:43
-
-
Save jflanaga/13bd6fb9e86324ba176dee2473fc71da to your computer and use it in GitHub Desktop.
hashtag_cooccurrence.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# adapted from https://github.com/derekgreene/twitter-jsonl-tools
import argparse | |
import codecs | |
import fileinput | |
import itertools | |
import logging | |
import operator | |
import ujson as json | |
from collections import defaultdict | |
from prettytable import PrettyTable | |
def _extract_hashtags(tweet, excluded="brexit"):
    """Return the set of normalised hashtags attached to a decoded tweet.

    For a retweet (a truthy ``retweeted_status`` value) the hashtags of the
    retweeted status are used; otherwise the tweet's own ``entities`` are
    used. Each tag is lower-cased, stripped and prefixed with ``#``. The tag
    equal to *excluded* is dropped.

    Args:
        tweet: A decoded tweet object (a dict).
        excluded: Normalised tag text (without ``#``) to leave out.

    Returns:
        A set of ``"#tag"`` strings; empty when the tweet has no hashtags.
    """
    source = tweet.get('retweeted_status') or tweet
    # Missing or empty 'entities' / 'hashtags' simply means "no tags" rather
    # than a parse failure.
    entities = source.get('entities') or {}
    tags = set()
    for tag in entities.get('hashtags') or ():
        text = tag['text'].lower().strip()
        if text != excluded:
            tags.add("#" + text)
    return tags


def _count_pairs_in_file(path, pair_counts):
    """Accumulate hashtag pair counts from one JSON-lines file.

    Args:
        path: File to read, one JSON tweet per line.
        pair_counts: Mapping updated in place; keys are ``(tag_a, tag_b)``
            tuples with ``tag_a < tag_b``, values are co-occurrence counts.

    Returns:
        A ``(num_tweets, num_multiple, num_failed)`` tuple: tweets processed,
        tweets with more than one hashtag, and lines that could not be
        processed.
    """
    num_tweets = num_multiple = num_failed = 0
    # The context manager closes the input stream even if an error escapes.
    with fileinput.input(path) as lines:
        for line_number, line in enumerate(lines, start=1):
            try:
                tags = _extract_hashtags(json.loads(line))
            except Exception as e:
                # Deliberately broad: one malformed line must not abort the
                # whole run; it is logged and counted instead.
                logging.error(
                    f"Failed to parse tweet on line {line_number}: {e}")
                num_failed += 1
                continue
            if len(tags) > 1:
                num_multiple += 1
                # Sorting first makes every combination an ordered tuple, so
                # (a, b) and (b, a) can never become separate keys.
                for pair in itertools.combinations(sorted(tags), 2):
                    pair_counts[pair] += 1
            num_tweets += 1
            if line_number % 50000 == 0:
                logging.info("Processed %d lines" % line_number)
    return num_tweets, num_multiple, num_failed


def main():
    """Count co-occurring hashtag pairs in JSON-lines tweet files.

    Command-line arguments:
        FILE ...   input files, one JSON tweet per line
        --outpath  output file (default ``hashtag_cooccurrences.csv``);
                   the content is tab-separated
        --top      number of most frequent pairs to display (default 10)

    Every pair is written to the output file, then a table of the most
    frequent pairs is logged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('files',
                        metavar='FILE',
                        nargs='*',
                        help='files to read')
    parser.add_argument('--outpath',
                        default='hashtag_cooccurrences.csv',
                        help='output path for csv file')
    parser.add_argument('--top',
                        type=int,
                        default=10,
                        help='number of co-occurrences (default: 10)')
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    out_path = args.outpath
    # A negative --top is treated as "show nothing".
    top = max(args.top, 0)

    pair_counts = defaultdict(int)
    for path in args.files:
        logging.info(f"Loading tweets from {path} ...")
        num_tweets, num_multiple, num_failed = _count_pairs_in_file(
            path, pair_counts)
        logging.info("Processed %d tweets from file" % num_tweets)
        if num_failed:
            logging.info("%d lines in file could not be parsed" % num_failed)
        logging.info("%d/%d tweets in file contained more than one hashtag" %
                     (num_multiple, num_tweets))
    logging.info("Total of %d unique pairs of hashtags" % len(pair_counts))

    # Output pairs
    logging.info("Writing pairs to %s ..." % out_path)
    with codecs.open(out_path, "w", encoding="utf-8",
                     errors="ignore") as fout:
        fout.write("Hashtag1\tHashtag2\tCount\n")
        for (first, second), count in pair_counts.items():
            fout.write("%s\t%s\t%d\n" % (first, second, count))

    # Display top counts
    ranked = sorted(pair_counts.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    logging.info("Top %d co-occurring hashtag pairs:" % min(len(ranked), top))
    tab = PrettyTable(["Hashtag1", "Hashtag2", "Count"])
    tab.align["Hashtag1"] = "l"
    tab.align["Hashtag2"] = "l"
    tab.align["Count"] = "r"
    # Slice to exactly `top` rows.
    for (first, second), count in ranked[:top]:
        tab.add_row([first, second, count])
    logging.info(tab)
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment