Created
April 23, 2013 23:21
-
-
Save johnb30/5448269 to your computer and use it in GitHub Desktop.
Code used to pull the unique actors from the GDELT dataset.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter

import pandas as pd
from path import path
def count_actors(lines, counts=None):
    """Tally how often each actor appears in tab-separated records.

    Each line is stripped of newline characters and split on tabs; the
    values in columns 1 and 2 (0-based) are counted as actors.  Rows with
    fewer than three columns contribute whatever actor columns they do
    have, and rows with fewer than two columns contribute nothing.

    Args:
        lines: Iterable of text lines (for example an open file object).
        counts: Optional ``Counter`` to update in place, so that several
            files can be accumulated into a single tally.

    Returns:
        The ``Counter`` mapping actor -> number of occurrences (the same
        object as ``counts`` when one was supplied).
    """
    if counts is None:
        counts = Counter()
    for line in lines:
        fields = line.replace('\n', '').split('\t')
        # Slicing instead of indexing tolerates short rows without having
        # to catch IndexError.
        counts.update(fields[1:3])
    return counts


def longest_actor_length(counts):
    """Return the length of the longest actor key in ``counts``.

    Returns 0 when ``counts`` is empty.
    """
    if not counts:
        return 0
    return max(len(actor) for actor in counts)


def main():
    """Count actors across every ``*.reduced.txt`` file in the working dir.

    Prints the number of distinct actors and the length of the longest
    one, then writes the per-actor counts to ``unique_actors.csv``.
    """
    unique_actors = Counter()
    for in_file in path.getcwd().files('*.reduced.txt'):
        # The context manager guarantees the handle is closed per file.
        with open(in_file, 'r') as data:
            print("%s data read in...subsetting" % in_file)
            count_actors(data, unique_actors)

    print(len(unique_actors))
    print(longest_actor_length(unique_actors))

    pd.Series(dict(unique_actors)).to_csv('unique_actors.csv')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment