import os
import gzip
import json
import re
import string
import pprint
from collections import defaultdict, deque

import esmre
import jsonrpclib
import requests
import translitcodec  # imported for its side effect: registers the 'translit/long' codec
from senti_classifier import senti_classifier

def printTweet(tweet):
    pprint.pprint(tweet)

regiondict = json.loads(open('metadata/state_regions_divisions.json').read())
split_str = '::'
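
# Geo-tag a tweet: record the country code from the 'place' object, resolve US
# tweets to a state (from exact coordinates, or the bounding-box centroid as a
# fallback), then attach the census division/region for that state.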
def tweetGeoTag(tweet):
    states = sorted(list(regiondict.keys()))
    geo = {'world': 'WW'}
    place = tweet.get('place', {}) or {}

    try:
        geo['country'] = place['country_code']
    except KeyError:
        pass

    try:
        coords = tweet['coordinates']['coordinates']
    except (KeyError, TypeError):
        coords = None

    if coords and geo.get('country') == 'US':
        # exact point; GeoJSON order is (longitude, latitude)
        point = {'latitude': coords[1],
                 'longitude': coords[0]}
        geo['state'] = decodeCoordinate(point)

    elif (place and place['bounding_box']['coordinates'][0]
          and geo.get('country') == 'US'):
        # no exact point: use the centroid of the place's bounding box
        bbox = place['bounding_box']['coordinates'][0]
        avgcoord = map(lambda x: x * 1.0 / len(bbox),
                       reduce(lambda x, y: ((y[0] + x[0]),
                                            (y[1] + x[1])),
                              bbox, (0, 0)))
        avgcoord = {'latitude': avgcoord[1],
                    'longitude': avgcoord[0]}
        geo['state'] = decodeCoordinate(avgcoord)
        geo[place['place_type']] = place['name']

    state = geo.get('state', '')

    try:
        rd = regiondict[state]
    except KeyError:
        pass
    else:
        geo['division'] = rd['Division']
        geo['region'] = rd['Region']

    return geo


statecoord = json.loads(open('metadata/state_coordinates.json').read())
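# Map a lat/long point to the nearest state centroid. Squared Euclidean
# distance on raw degrees ignores great-circle geometry, but it is a workable
# approximation for a nearest-neighbour pick at this scale.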
def decodeCoordinate(target):
    longitude = lambda p: p['longitude']
    latitude = lambda p: p['latitude']

    def dist(p1, p2):
        xdist = (longitude(p1) - longitude(p2)) ** 2.0
        ydist = (latitude(p1) - latitude(p2)) ** 2.0
        return xdist + ydist

    dists = map(lambda state: {'delta': dist(state, target),
                               'code': state['state']}, statecoord)
    mdist = min(dists, key=lambda x: x['delta'])
    return mdist['code']
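
# Normalisation pipeline applied before lexicon matching: lower-case,
# transliterate non-ASCII to ASCII, expand slang, then strip URLs, mentions,
# hashtag marks, digits and punctuation.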

def tweetProcessText(tweet):
    # Convert to lower case
    tweet = tweet.lower()
    # Remove the crazy smilies (UTF-16) but keep the normal ones
    tweet = tweet.encode('translit/long').encode('ascii', 'ignore')
    # Substitute the slangs
    tweet = substituteSlangs(tweet)
    # Remove www.* or https?://*
    tweet = re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', '', tweet)
    # Remove retweet markers and @username mentions
    tweet = re.sub(r'(rt)? @[^\s]+', '', tweet)
    # Collapse additional white space
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Remove the numbers
    tweet = re.sub(r'[0-9]+', '', tweet)
    # Remove punctuation
    tweet = re.sub(r'[^\w\s]', '', tweet)
    # Trim quotes and surrounding whitespace
    tweet = tweet.strip('\'"')
    tweet = tweet.strip()

    return tweet
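
# Slang expansion: the tab-separated lookup table is compiled into an esmre
# index for fast multi-pattern scanning; a hit is applied only when it matches
# a whole space-separated token.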
slanglist = [line.strip().split('\t') for line in open("metadata/SlangLookupTable.txt")]
SLANGS = esmre.Index()
for slang, replacement in slanglist:
    SLANGS.enter('%s' % slang, (slang, replacement))

def substituteSlangs(tweet):
    _sl = list(enumerate(tweet.split(' ')))
    for matchl in SLANGS.query(tweet):
        for i, word in _sl:
            if matchl[0] == word:
                _sl[i] = (i, matchl[1])
    return ' '.join([x[1] for x in _sl])
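
# Emoticon scoring: sum the integer scores of table emoticons that appear as
# whole tokens in the raw tweet text; report positive/negative totals and an
# overall polarity flag.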


emot = list(set([tuple(line.strip().split('\t')) for line in open("metadata/EmoticonLookupTable.txt")]))

def tweetEmoticonScore(tweet):
    tweet = tweet['text']
    tweet = tweet.encode('translit/long').encode('ascii', 'ignore')
    pos_emot = 0
    neg_emot = 0
    useful = "F"
    polarity = 0
    tokens = tweet.split()
    emot_score = [x for x in emot if x[0] in tokens]
    for score in emot_score:
        if int(score[1]) > 0:
            pos_emot += int(score[1])
        else:
            neg_emot += int(score[1])
    if pos_emot != 0 or neg_emot != 0:
        useful = "T"
    if pos_emot + neg_emot > 0:
        polarity = "P"
    elif pos_emot + neg_emot < 0:
        polarity = "N"
    emot_details = {'method': 'emoticon_score', 'matches': emot_score,
                    'pos_score': pos_emot, 'neg_score': neg_emot,
                    'useful': useful, 'polarity': polarity}
    return emot_details
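
# Hashtag scoring against the NRC hashtag sentiment lexicon (term, PMI score,
# positive count, negative count per line); only the score column is used.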

hashdet = [line.strip().split('\t') for line in open("metadata/NRC-Hastag-Sentiment-Lexicon-unigrams-pmilexicon.txt")]
hashdict = {}
for term, score, num_pos, num_neg in hashdet:
    hashdict[term] = float(score)

def HASHTAGFit(tweet):
    hashtags = tweet['entities']['hashtags']
    pos_hash = 0
    neg_hash = 0
    useful = "F"
    polarity = 0
    matches = []
    for _hashd in hashtags:
        _hash = '#' + _hashd['text']
        try:
            match = hashdict[_hash]
        except KeyError:
            continue
        else:
            matches.append((_hash, match))
    for score in matches:
        # lexicon scores are floats, so compare the raw value rather than int()
        if score[1] > 0:
            pos_hash += score[1]
        else:
            neg_hash += score[1]
    if pos_hash != 0 or neg_hash != 0:
        useful = "T"
    if pos_hash + neg_hash > 0:
        polarity = "P"
    elif pos_hash + neg_hash < 0:
        polarity = "N"
    hashtags_details = {'method': 'hashtag_score', 'matches': matches,
                        'pos_score': pos_hash, 'neg_score': neg_hash,
                        'useful': useful, 'polarity': polarity}
    return hashtags_details
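
# AFINN scoring: the AFINN-111 word list (word<TAB>score) is compiled into an
# esmre index; candidate matches are confirmed against whole tokens of the
# pre-processed text before their integer scores are summed.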

afinn = [line.strip().split('\t') for line in open("metadata/AFINN-111.txt")]
AFINN = esmre.Index()
for word, score in afinn:
    AFINN.enter('%s' % word, (word, score))

def AFINNFit(tweet):
    tweet = tweet['textProcessed']
    pos_words = 0
    neg_words = 0
    useful = "F"
    polarity = 0
    matches = []
    _st = tweet.split(' ')
    for match in AFINN.query(tweet):
        if match[0] in _st:
            matches.append(match)
    for score in matches:
        if int(score[1]) > 0:
            pos_words += int(score[1])
        else:
            neg_words += int(score[1])
    if pos_words != 0 or neg_words != 0:
        useful = "T"
    if pos_words + neg_words > 0:
        polarity = "P"
    elif pos_words + neg_words < 0:
        polarity = "N"
    afinn_details = {'method': 'afinn_score', 'matches': matches,
                     'pos_score': pos_words, 'neg_score': neg_words,
                     'useful': useful, 'polarity': polarity}
    return afinn_details
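
# SentiWordNet scoring via the senti_classifier package, whose
# polarity_scores() returns aggregate positive and negative scores for a
# list of sentences.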

def SENTIWORDNETFit(tweet):
    useful = "F"
    polarity = 0
    # polarity_scores expects a list of sentences, so wrap the processed text
    pos_score, neg_score = senti_classifier.polarity_scores([tweet['textProcessed']])
    if pos_score != 0 or neg_score != 0:
        useful = "T"
    if pos_score + neg_score > 0:
        polarity = "P"
    elif pos_score + neg_score < 0:
        polarity = "N"
    sentiwordnet_details = {'method': 'senti_classifier_score',
                            'pos_score': pos_score, 'neg_score': neg_score,
                            'useful': useful, 'polarity': polarity}
    return sentiwordnet_details
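
# Thin JSON-RPC client for a Stanford CoreNLP server (e.g. the
# stanford-corenlp-python wrapper) running on the LAN host below.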

class StanfordNLP:
    def __init__(self, port_number=8080):
        self.server = jsonrpclib.Server("http://192.168.1.3:%d" % port_number)

    def parse(self, text):
        return self.server.parse(text)

nlp = StanfordNLP()
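
# The raw CoreNLP parse is stored under 'polarity' as-is; no score is derived
# from it here, so 'useful' stays "F".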

def STANFNLPFit(tweet):
    useful = "F"
    tweet = tweet['textProcessed']
    result = nlp.parse(tweet)
    stanfnlp_details = {'method': 'stanfnlp_score', 'polarity': result, 'useful': useful}
    return stanfnlp_details

# Sentiment: run every local scorer on the tweet and collect the per-method details
def tweetSentiFit(tweet):
    fit_methods = [AFINNFit, SENTIWORDNETFit, tweetEmoticonScore, HASHTAGFit, STANFNLPFit]
    fit = []
    for method in fit_methods:
        fit.append(method(tweet))
    return fit
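
# Sentiment140 bulk API: tweets are buffered and sent in batches of 11 to cut
# HTTP round-trips; each response polarity is appended to the originating
# tweet's sentiment_vector before the batch is handed to `callback`.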


senti140 = deque()
senti140count = 0

def SENTI140Fit(tweet, callback):
    global senti140count

    if senti140count < 10:
        senti140count += 1
        senti140.append(tweet)
    else:
        senti140.append(tweet)
        senti140data = list(senti140)
        senti140.clear()
        senti140count = 0

        data = map(lambda x: {'text': x['text'], 'id': x['id'],
                              'query': 'pepsi'}, senti140data)

        # NOTE: the endpoint path/appid is redacted to '[email protected]' in the published gist
        senti140response = requests.post('http://www.sentiment140.com/api/[email protected]',
                                         data=json.dumps({'data': data}))
        senti140response_json = senti140response.json()
        senti140resp_dict = {}
        for tweets in senti140response_json['data']:
            senti140resp_dict[tweets['id']] = tweets['polarity']

        # print senti140data
        for orig_tweet in senti140data:
            try:
                polarity = senti140resp_dict[orig_tweet['id']]
            except KeyError:
                continue
            else:
                orig_tweet['sentiment_vector'].append({'senti140_score': polarity, 'useful': 'T'})
                # print orig_tweet['sentiment_vector']
        callback(senti140data)
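
# Persist a processed batch as gzipped JSON lines, named after the first
# tweet id in the batch.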


def write_tweets_train(tweets):
    start = tweets[0]['id']
    with gzip.open('/tmp/train_%s.gz' % start, 'wb') as f:
        data = map(lambda x: json.dumps(x), tweets)
        f.write('\n'.join(data))


files = os.listdir("data/testdata/")
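
# Driver: stream gzipped JSON tweets from data/testdata/, normalise the text,
# score each tweet with every local method, then queue it for Sentiment140.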

def main():
    for f in files:
        fp = os.path.join("data/testdata/", f)
        with gzip.open(fp, 'rb') as zf:
            for line in zf:
                tweet = json.loads(line)
                # printTweet(tweet)
                # tweet['geoTag'] = tweetGeoTag(tweet)
                print tweet['text']
                tweet['textProcessed'] = tweetProcessText(tweet['text'])
                # print tweet['textProcessed']
                # tweet['emoticonScore'] = tweetEmoticonScore(tweet)
                # tweet['hashtagScore'] = HASHTAGFit(tweet)
                tweet['sentiment_vector'] = tweetSentiFit(tweet)
                print tweet['sentiment_vector']
                SENTI140Fit(tweet, write_tweets_train)
                # raise SystemExit





if __name__ == '__main__':
    main()
    # print tweetSentiFit(tweetProcessText('Fuck pepsi'))
    # print tweetEmoticonScore(('Fuck pepsi :@ Hail coke :) 8)'))
    # print tweetProcessText('Fuck pepsi :@ Hail coke :) 8) rofl lmao g')
    # print tweetSentiFit('fuck Pepsi :@ hail coke :) 8) rofl lmao g')
    # print HASHTAGFit(['#lovedit','#foul'])