Skip to content

Instantly share code, notes, and snippets.

@jflanaga
Last active May 20, 2019 06:43
Show Gist options
  • Save jflanaga/13bd6fb9e86324ba176dee2473fc71da to your computer and use it in GitHub Desktop.
Save jflanaga/13bd6fb9e86324ba176dee2473fc71da to your computer and use it in GitHub Desktop.
hashtag_cooccurrence.py
#!/usr/bin/env python
# adapted from https://github.com/derekgreene/twitter-jsonl-tools
import argparse
import codecs
import fileinput
import itertools
import logging
import operator
import ujson as json
from collections import defaultdict
from prettytable import PrettyTable
# Hashtags (lower-cased, without '#') that are never counted in a pair.
_EXCLUDED_TAGS = frozenset({"brexit"})


def _extract_hashtags(tweet):
    """Return the set of '#'-prefixed, lower-cased hashtags of one tweet.

    For a retweet the hashtags of the retweeted status are used; otherwise
    the tweet's own ``entities`` are used.  Tags listed in ``_EXCLUDED_TAGS``
    are skipped.  Malformed tweets raise (KeyError/TypeError/...) and are
    expected to be handled by the caller.
    """
    if 'retweeted_status' in tweet and tweet['retweeted_status']:
        entities = tweet['retweeted_status']['entities']
    elif 'entities' in tweet:
        entities = tweet['entities']
    else:
        return set()
    if 'hashtags' not in entities:
        return set()
    tags = set()
    for tag in entities['hashtags']:
        text = tag['text'].lower()
        if text not in _EXCLUDED_TAGS:
            tags.add("#" + text.strip())
    return tags


def _process_file(path, pair_counts):
    """Add the hashtag pairs of every tweet in *path* to *pair_counts*.

    *path* is read line by line as JSON.  Returns a tuple
    ``(num_tweets, num_multiple, num_failed)``: successfully parsed tweets,
    tweets with more than one distinct hashtag, and lines that failed.
    """
    num_tweets = num_multiple = num_failed = 0
    # The context manager closes the input even if an error escapes.
    with fileinput.input(path) as lines:
        for line_number, line in enumerate(lines, start=1):
            try:
                tweet_tags = _extract_hashtags(json.loads(line))
            except Exception as e:
                # Best effort: a bad line is logged and skipped, not fatal.
                logging.error(
                    f"Failed to parse tweet on line {line_number}: {e}")
                num_failed += 1
            else:
                # tweet_tags is a set, so duplicates are not counted twice.
                if len(tweet_tags) > 1:
                    num_multiple += 1
                    for pair in itertools.combinations(tweet_tags, 2):
                        # frozenset makes the pair order-independent.
                        pair_counts[frozenset(pair)] += 1
                num_tweets += 1
            if line_number % 50000 == 0:
                logging.info("Processed %d lines", line_number)
    return num_tweets, num_multiple, num_failed


def _write_pairs(pair_counts, out_path):
    """Write every hashtag pair and its count to *out_path*, tab-separated."""
    with codecs.open(out_path, "w", encoding="utf-8",
                     errors="ignore") as fout:
        fout.write("Hashtag1\tHashtag2\tCount\n")
        for pair, count in pair_counts.items():
            first, second = sorted(pair)
            fout.write("%s\t%s\t%d\n" % (first, second, count))


def main():
    """Count co-occurring hashtag pairs in line-delimited tweet JSON files.

    Command line: ``[--outpath PATH] [--top N] FILE ...``.  All pairs are
    written to the tab-separated file given by ``--outpath`` and the ``N``
    most frequent pairs are logged as a table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('files',
                        metavar='FILE',
                        nargs='*',
                        help='files to read')
    parser.add_argument('--outpath',
                        default='hashtag_cooccurrences.csv',
                        help='output path for csv file')
    parser.add_argument('--top',
                        type=int,
                        default=10,
                        help='number of co-occurrences (default: 10)')
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    top = args.top
    out_path = args.outpath

    pair_counts = defaultdict(int)
    for path in args.files:
        logging.info(f"Loading tweets from {path} ...")
        num_tweets, num_multiple, num_failed = _process_file(
            path, pair_counts)
        logging.info("Processed %d tweets from file", num_tweets)
        logging.info("%d/%d tweets in file contained more than one hashtag",
                     num_multiple, num_tweets)
        if num_failed:
            logging.warning("%d lines in file could not be parsed",
                            num_failed)
    logging.info("Total of %d unique pairs of hashtags", len(pair_counts))

    # Output pairs
    logging.info("Writing pairs to %s ...", out_path)
    _write_pairs(pair_counts, out_path)

    # Display top counts
    ranked = sorted(pair_counts.items(), key=operator.itemgetter(1),
                    reverse=True)
    # Slice to exactly `top` rows; a negative value shows nothing.
    shown = ranked[:max(top, 0)]
    logging.info("Top %d co-occurring hashtag pairs:", len(shown))
    tab = PrettyTable(["Hashtag1", "Hashtag2", "Count"])
    tab.align["Hashtag1"] = "l"
    tab.align["Hashtag2"] = "l"
    tab.align["Count"] = "r"
    for pair, count in shown:
        first, second = sorted(pair)
        tab.add_row([first, second, count])
    logging.info(tab)
# Run the command-line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment