Skip to content

Instantly share code, notes, and snippets.

@bostwick
Created February 11, 2012 04:00

Revisions

  1. bostwick revised this gist Feb 11, 2012. 1 changed file with 6 additions and 0 deletions.
    6 changes: 6 additions & 0 deletions FacebookWordCloud.py
    Original file line number Diff line number Diff line change
    @@ -118,6 +118,12 @@ def word_html(word, size):
    # Collect the text of every top-level post that carries a 'message' field.
    messages = [post['message'] for page in pages
                for post in page['data']
                if 'message' in post]
    # Also collect the text of every comment on those posts.
    # BUG FIX: the original condition was `if 'if message' in comment` — a
    # typo'd key that can never appear in a comment dict, so every comment
    # was silently dropped.  The intended key is 'message'.
    comments = [comment['message'] for page in pages
                for post in page['data']
                if 'comments' in post and 'data' in post['comments']
                for comment in post['comments']['data']
                if 'message' in comment]
    messages.extend(comments)

    # Tokenize all collected text and tally word frequencies.
    tokens = [token for msg in messages for token in word_tokenize(msg)]
    counts = word_count(tokens)
  2. bostwick revised this gist Feb 11, 2012. 1 changed file with 11 additions and 6 deletions.
    17 changes: 11 additions & 6 deletions FacebookWordCloud.py
    Original file line number Diff line number Diff line change
    @@ -20,7 +20,7 @@

    def get_json(url):
    """ Return the parsed JSON from a url. """
    # print "GET %s" % url
    print "GET %s" % url
    f = urllib2.urlopen(url)
    return json.loads(f.read())

    @@ -32,18 +32,23 @@ def fb_feed_url(access_token):
    return "https://graph.facebook.com/me/home?access_token=%s" % access_token


    def fb_news_feed(access_token, pages):
    def fb_news_feed(access_token):
    """ Returns a set number of pages from a facebook news feed as a
    list of parsed json.
    """
    feed_url = fb_feed_url(access_token)
    page_json = []

    for x in range(0, pages):
    for x in range(0, 100):
    json = get_json(feed_url)
    page_json.append(json)
    feed_url = json['paging']['next']


    # Break if there's not enough data to continue
    if 'paging' in json and 'next' in json['paging']:
    feed_url = json['paging']['next']
    else:
    break

    return page_json


    @@ -109,7 +114,7 @@ def word_html(word, size):
    if __name__ == "__main__":
    access_token = sys.argv[1]

    pages = fb_news_feed(access_token, 6)
    pages = fb_news_feed(access_token)
    messages = [post['message'] for page in pages
    for post in page['data']
    if 'message' in post]
  3. bostwick revised this gist Feb 11, 2012. 1 changed file with 0 additions and 5 deletions.
    5 changes: 0 additions & 5 deletions get_json
    Original file line number Diff line number Diff line change
    @@ -1,5 +0,0 @@
    def get_json(url):
    """ Return the parsed JSON from a url. """
    # print "GET %s" % url
    f = urllib2.urlopen(url)
    return json.loads(f.read())
  4. bostwick revised this gist Feb 11, 2012. 2 changed files with 125 additions and 20 deletions.
    125 changes: 125 additions & 0 deletions FacebookWordCloud.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,125 @@
    #!/usr/bin/env python

    import sys
    import urllib2
    import json
    from collections import defaultdict

    from nltk import word_tokenize

    IGNORED_WORDS = ["!", ".", ",", "(", ")", "'s", ":", "?", "...", "$",
    "<", ">", "''", "``", "-", "c", "'", "--", "&",
    "and", "the", "or", "not", "i", "you", "to", "this",
    "of", "in", "for", "a", "an", "and", "your", "with",
    "me", "my", "be", "these", "that", "do", "at", "no", "so",
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
    "http", "@", "is", "am", "are", "it", "if", "n't", "'em",
    "from", "one", "on", "up", "like", "we", "their", "they",
    "'ll", "'d", "'m", "//www.youtube.com/watch", "by", "have",
    "just", "will", "as"]

    def get_json(url):
    """ Return the parsed JSON from a url. """
    # print "GET %s" % url
    f = urllib2.urlopen(url)
    return json.loads(f.read())


    def fb_feed_url(access_token):
    """ The graph api url for your facebook feed. access_token should be
    a valid facebook access token as a string.
    """
    return "https://graph.facebook.com/me/home?access_token=%s" % access_token


    def fb_news_feed(access_token, pages):
    """ Returns a set number of pages from a facebook news feed as a
    list of parsed json.
    """
    feed_url = fb_feed_url(access_token)
    page_json = []

    for x in range(0, pages):
    json = get_json(feed_url)
    page_json.append(json)
    feed_url = json['paging']['next']

    return page_json


    def word_count(words):
    """ Count the occurrences of each word. Words should be a list of strings
    that you might get from str.split() or nltk.word_tokenize().
    """
    seen = defaultdict(lambda: 0)

    for w in words:
    if w.lower() not in IGNORED_WORDS:
    seen[w] += 1

    return seen


    def word_cloud_sizes(counts):
    max_count = max(counts.values())
    min_size, max_size = (1, 5)
    word_sizes = defaultdict(lambda : 1)

    for word in counts:
    if counts[word] > 1:
    size = int(float(counts[word]) * (max_size - 1) / max_count) + 1
    word_sizes[word] = size

    return word_sizes

    def word_cloud_header():
    return """
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    <html>
    <head>
    <title>My Facebook Word Cloud</title>
    <style type=\"text/css\">
    .size-1 { font-size: 10pt; }
    .size-2 { font-size: 16pt; }
    .size-3 { font-size: 28pt; }
    .size-4 { font-size: 36pt; font-weight: bold; }
    .size-5 { font-size: 48pt; font-weight: bold; }
    .word-cloud { margin: 0 auto; width: 600px; padding-top: 10px; }
    </style>
    </head>
    <body>
    <div class="word-cloud">
    """

    def word_cloud_footer():
    return """
    </div>
    </body>
    </html>
    """

    def word_cloud_html(sizes):
    def word_html(word, size):
    return "<span class=\"size-%s\">%s</span>" % (size, word)

    return ' '.join([word_html(w, s) for w, s in sizes.items()])


    if __name__ == "__main__":
    access_token = sys.argv[1]

    pages = fb_news_feed(access_token, 6)
    messages = [post['message'] for page in pages
    for post in page['data']
    if 'message' in post]

    tokens = [token for msg in messages for token in word_tokenize(msg)]
    counts = word_count(tokens)

    sizes = word_cloud_sizes(counts)

    print ''.join([
    word_cloud_header(),
    word_cloud_html(sizes),
    word_cloud_footer()])
    20 changes: 0 additions & 20 deletions fb_news_feed
    Original file line number Diff line number Diff line change
    @@ -1,20 +0,0 @@
    def fb_feed_url(access_token):
    """ The graph api url for your facebook feed. access_token should be
    a valid facebook access token as a string.
    """
    return "https://graph.facebook.com/me/home?access_token=%s" % access_token


    def fb_news_feed(access_token, pages):
    """ Returns a set number of pages from a facebook news feed as a
    list of parsed json.
    """
    feed_url = fb_feed_url(access_token)
    page_json = []

    for x in range(0, pages):
    json = get_json(feed_url)
    page_json.append(json)
    feed_url = json['paging']['next']

    return page_json
  5. bostwick renamed this gist Feb 11, 2012. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  6. bostwick revised this gist Feb 11, 2012. 1 changed file with 20 additions and 0 deletions.
    20 changes: 20 additions & 0 deletions gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,20 @@
    def fb_feed_url(access_token):
    """ The graph api url for your facebook feed. access_token should be
    a valid facebook access token as a string.
    """
    return "https://graph.facebook.com/me/home?access_token=%s" % access_token


    def fb_news_feed(access_token, pages):
    """ Returns a set number of pages from a facebook news feed as a
    list of parsed json.
    """
    feed_url = fb_feed_url(access_token)
    page_json = []

    for x in range(0, pages):
    json = get_json(feed_url)
    page_json.append(json)
    feed_url = json['paging']['next']

    return page_json
  7. bostwick created this gist Feb 11, 2012.
    5 changes: 5 additions & 0 deletions get_json
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,5 @@
    def get_json(url):
    """ Return the parsed JSON from a url. """
    # print "GET %s" % url
    f = urllib2.urlopen(url)
    return json.loads(f.read())