Instantly share code, notes, and snippets.
Created
January 18, 2017 10:03
-
Star
2
(2)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
Save mizvol/eb24770ac3d5d598463f972e2a669f03 to your computer and use it in GitHub Desktop.
LDA topic analysis of Instagram hashtags for clustering. Analysis + Visualization in D3JS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Clustering Instagram users using hashtags. Topic analysis and visualization in D3JS" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import pymongo as pm\n", | |
"import unicodedata" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Reading the data from Mongo" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"client = pm.MongoClient()\n", | |
"db = client.instagram\n", | |
"tagsDB = db.tags" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Extracting tags data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"rawTags = []\n", | |
"for user in tagsDB.find():\n", | |
" rawTags.extend(user['tags'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"424113" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(rawTags)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[u'contiki',\n", | |
" u'swissalps',\n", | |
" u'newfriends',\n", | |
" u'freezingmynutsoff',\n", | |
" u'walkabout',\n", | |
" u'jungfraujoch',\n", | |
" u'yolo',\n", | |
" u'travel',\n", | |
" u'noregrets',\n", | |
" u'goodtimes']" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"rawTags[:10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"tagsRDD = sc.parallelize(rawTags)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"424113" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tagsRDD.count()" | |
] | |
}, | |
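{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Cleaning\n", | |
"Note: the normalization below (NFKD decomposition followed by ASCII encoding with `ignore`) drops every non-ASCII character, so tags written entirely in non-ASCII characters collapse to an empty string. If you want to keep language-specific characters and words, you have to clean the data in a different way." | |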
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"countsRDD = (\n", | |
" tagsRDD\n", | |
" .map(lambda tag: (unicodedata.normalize('NFKD', tag).encode('ascii','ignore'), 1))\n", | |
" .reduceByKey(lambda a, b: a + b)\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"106083" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"countsRDD.count()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Explore the data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"ordered = countsRDD.takeOrdered(500, lambda (key, value): -value)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('', 9146),\n", | |
" ('switzerland', 8973),\n", | |
" ('zurich', 3990),\n", | |
" ('love', 2605),\n", | |
" ('swiss', 2593),\n", | |
" ('easter', 2526),\n", | |
" ('mountains', 1999),\n", | |
" ('travel', 1978),\n", | |
" ('spring', 1963),\n", | |
" ('snow', 1809),\n", | |
" ('sun', 1754),\n", | |
" ('nature', 1742),\n", | |
" ('lake', 1565),\n", | |
" ('geneva', 1516),\n", | |
" ('beautiful', 1485),\n", | |
" ('schweiz', 1462),\n", | |
" ('happy', 1408),\n", | |
" ('instagood', 1403),\n", | |
" ('photooftheday', 1265),\n", | |
" ('picoftheday', 1256),\n", | |
" ('suisse', 1238),\n", | |
" ('friends', 1199),\n", | |
" ('alps', 1165),\n", | |
" ('happyeaster', 1096),\n", | |
" ('ski', 1085),\n", | |
" ('fun', 1044),\n", | |
" ('basel', 994),\n", | |
" ('landscape', 928),\n", | |
" ('sky', 923),\n", | |
" ('skiing', 860),\n", | |
" ('sunset', 839),\n", | |
" ('swissalps', 820),\n", | |
" ('family', 805),\n", | |
" ('bern', 800),\n", | |
" ('nofilter', 799),\n", | |
" ('luzern', 770),\n", | |
" ('amazing', 755),\n", | |
" ('view', 754),\n", | |
" ('europe', 737),\n", | |
" ('instadaily', 736),\n", | |
" ('weekend', 713),\n", | |
" ('geneve', 701),\n", | |
" ('fashion', 700),\n", | |
" ('art', 680),\n", | |
" ('holiday', 678),\n", | |
" ('sunnyday', 666),\n", | |
" ('me', 659),\n", | |
" ('mountain', 657),\n", | |
" ('food', 648),\n", | |
" ('lausanne', 589),\n", | |
" ('instalike', 583),\n", | |
" ('smile', 577),\n", | |
" ('style', 572),\n", | |
" ('like4like', 565),\n", | |
" ('lucerne', 565),\n", | |
" ('followme', 563),\n", | |
" ('clouds', 561),\n", | |
" ('architecture', 546),\n", | |
" ('blue', 544),\n", | |
" ('wanderlust', 540),\n", | |
" ('zermatt', 537),\n", | |
" ('instatravel', 536),\n", | |
" ('selfie', 535),\n", | |
" ('instamood', 533),\n", | |
" ('life', 532),\n", | |
" ('winter', 529),\n", | |
" ('city', 519),\n", | |
" ('ostern', 515),\n", | |
" ('trip', 515),\n", | |
" ('sunny', 513),\n", | |
" ('photography', 512),\n", | |
" ('flowers', 501),\n", | |
" ('blackandwhite', 497),\n", | |
" ('baselworld2016', 496),\n", | |
" ('travelgram', 492),\n", | |
" ('bluesky', 492),\n", | |
" ('sunshine', 491),\n", | |
" ('instagram', 485),\n", | |
" ('girl', 478),\n", | |
" ('foodporn', 476),\n", | |
" ('home', 471),\n", | |
" ('verbier', 471),\n", | |
" ('party', 462),\n", | |
" ('igers', 462),\n", | |
" ('music', 458),\n", | |
" ('vscocam', 451),\n", | |
" ('beauty', 443),\n", | |
" ('tbt', 436),\n", | |
" ('montreux', 435),\n", | |
" ('vsco', 429),\n", | |
" ('luxury', 426),\n", | |
" ('instapic', 424),\n", | |
" ('baselworld', 422),\n", | |
" ('fitness', 422),\n", | |
" ('follow', 422),\n", | |
" ('svizzera', 420),\n", | |
" ('chocolate', 418),\n", | |
" ('visitswitzerland', 407),\n", | |
" ('lifestyle', 397),\n", | |
" ('snowboarding', 396),\n", | |
" ('night', 395),\n", | |
" ('relax', 385),\n", | |
" ('matterhorn', 380),\n", | |
" ('lacleman', 378),\n", | |
" ('photo', 372),\n", | |
" ('water', 370),\n", | |
" ('lugano', 365),\n", | |
" ('holidays', 362),\n", | |
" ('interlaken', 357),\n", | |
" ('myswitzerland', 356),\n", | |
" ('vacation', 354),\n", | |
" ('design', 349),\n", | |
" ('switzerlandwonderland', 348),\n", | |
" ('summer', 348),\n", | |
" ('goodtimes', 346),\n", | |
" ('stmoritz', 342),\n", | |
" ('morning', 341),\n", | |
" ('day', 340),\n", | |
" ('cute', 340),\n", | |
" ('2016', 333),\n", | |
" ('traveling', 332),\n", | |
" ('enjoy', 328),\n", | |
" ('tagsforlikes', 326),\n", | |
" ('sunday', 322),\n", | |
" ('tree', 322),\n", | |
" ('saturday', 320),\n", | |
" ('green', 319),\n", | |
" ('bestoftheday', 314),\n", | |
" ('goodmorning', 314),\n", | |
" ('loveit', 309),\n", | |
" ('travelling', 309),\n", | |
" ('instafood', 297),\n", | |
" ('river', 295),\n", | |
" ('happiness', 294),\n", | |
" ('white', 294),\n", | |
" ('hiking', 292),\n", | |
" ('nice', 291),\n", | |
" ('germany', 287),\n", | |
" ('snowboard', 285),\n", | |
" ('coffee', 281),\n", | |
" ('france', 281),\n", | |
" ('konstanz', 279),\n", | |
" ('inlovewithswitzerland', 276),\n", | |
" ('black', 276),\n", | |
" ('ticino', 274),\n", | |
" ('follow4follow', 271),\n", | |
" ('valais', 269),\n", | |
" ('healthy', 265),\n", | |
" ('instacool', 264),\n", | |
" ('work', 264),\n", | |
" ('adventure', 263),\n", | |
" ('watch', 257),\n", | |
" ('sport', 257),\n", | |
" ('trees', 256),\n", | |
" ('likeforlike', 256),\n", | |
" ('zuri', 255),\n", | |
" ('bodensee', 255),\n", | |
" ('awesome', 254),\n", | |
" ('watches', 253),\n", | |
" ('_', 251),\n", | |
" ('springtime', 250),\n", | |
" ('light', 246),\n", | |
" ('italy', 246),\n", | |
" ('paques', 246),\n", | |
" ('fruhling', 244),\n", | |
" ('yummy', 243),\n", | |
" ('street', 243),\n", | |
" ('breakfast', 242),\n", | |
" ('graubunden', 242),\n", | |
" ('train', 241),\n", | |
" ('naturelovers', 241),\n", | |
" ('dinner', 240),\n", | |
" ('explore', 240),\n", | |
" ('davos', 239),\n", | |
" ('best', 236),\n", | |
" ('swissmade', 236),\n", | |
" ('girls', 234),\n", | |
" ('red', 234),\n", | |
" ('peace', 230),\n", | |
" ('laax', 229),\n", | |
" ('travelingram', 229),\n", | |
" ('sunrise', 227),\n", | |
" ('chill', 226),\n", | |
" ('like', 224),\n", | |
" ('workout', 223),\n", | |
" ('panorama', 223),\n", | |
" ('switzerlandpictures', 221),\n", | |
" ('cool', 221),\n", | |
" ('gopro', 219),\n", | |
" ('ootd', 217),\n", | |
" ('delicious', 216),\n", | |
" ('beautifulday', 215),\n", | |
" ('zurichsee', 215),\n", | |
" ('sweet', 215),\n", | |
" ('model', 214),\n", | |
" ('throwback', 213),\n", | |
" ('ig_switzerland', 211),\n", | |
" ('photographer', 210),\n", | |
" ('car', 210),\n", | |
" ('dog', 209),\n", | |
" ('suiza', 205),\n", | |
" ('beautifuldestinations', 205),\n", | |
" ('see', 205),\n", | |
" ('colorful', 205),\n", | |
" ('walk', 204),\n", | |
" ('colors', 204),\n", | |
" ('lunch', 204),\n", | |
" ('new', 202),\n", | |
" ('training', 202),\n", | |
" ('live', 201),\n", | |
" ('gym', 201),\n", | |
" ('foodie', 200),\n", | |
" ('forest', 200),\n", | |
" ('motivation', 198),\n", | |
" ('cold', 197),\n", | |
" ('world', 195),\n", | |
" ('beer', 194),\n", | |
" ('ischgl', 194),\n", | |
" ('familytime', 192),\n", | |
" ('castle', 192),\n", | |
" ('pasqua', 190),\n", | |
" ('running', 190),\n", | |
" ('fit', 188),\n", | |
" ('switzerland_vacations', 187),\n", | |
" ('restaurant', 187),\n", | |
" ('good', 187),\n", | |
" ('pink', 186),\n", | |
" ('bunny', 186),\n", | |
" ('roadtrip', 186),\n", | |
" ('homesweethome', 186),\n", | |
" ('time', 185),\n", | |
" ('my', 184),\n", | |
" ('l4l', 184),\n", | |
" ('picture', 183),\n", | |
" ('memories', 183),\n", | |
" ('lakegeneva', 181),\n", | |
" ('nike', 181),\n", | |
" ('alpes', 180),\n", | |
" ('inspiration', 180),\n", | |
" ('nikon', 179),\n", | |
" ('tb', 178),\n", | |
" ('instalove', 177),\n", | |
" ('sonne', 176),\n", | |
" ('grindelwald', 173),\n", | |
" ('church', 171),\n", | |
" ('canon', 171),\n", | |
" ('weather', 171),\n", | |
" ('travelphotography', 168),\n", | |
" ('engelberg', 168),\n", | |
" ('repost', 167),\n", | |
" ('concert', 166),\n", | |
" ('goodlife', 166),\n", | |
" ('tattoo', 165),\n", | |
" ('neverstopexploring', 164),\n", | |
" ('engadin', 164),\n", | |
" ('bridge', 164),\n", | |
" ('with', 163),\n", | |
" ('pretty', 163),\n", | |
" ('iloveswitzerland', 163),\n", | |
" ('instaphoto', 162),\n", | |
" ('lovely', 162),\n", | |
" ('watchporn', 162),\n", | |
" ('jungfrau', 161),\n", | |
" ('passion', 161),\n", | |
" ('wallis', 160),\n", | |
" ('airport', 159),\n", | |
" ('perfect', 158),\n", | |
" ('hotel', 158),\n", | |
" ('tourism', 157),\n", | |
" ('shopping', 156),\n", | |
" ('friendship', 156),\n", | |
" ('funny', 155),\n", | |
" ('monday', 154),\n", | |
" ('easterweekend', 153),\n", | |
" ('swag', 152),\n", | |
" ('instamoment', 152),\n", | |
" ('flower', 152),\n", | |
" ('berge', 150),\n", | |
" ('the', 150),\n", | |
" ('froheostern', 150),\n", | |
" ('thun', 150),\n", | |
" ('iphoneonly', 149),\n", | |
" ('mylove', 147),\n", | |
" ('pic', 147),\n", | |
" ('skyporn', 147),\n", | |
" ('bar', 147),\n", | |
" ('brunch', 146),\n", | |
" ('neuchatel', 145),\n", | |
" ('loveswitzerlandcontest', 145),\n", | |
" ('powder', 144),\n", | |
" ('fresh', 143),\n", | |
" ('evening', 143),\n", | |
" ('makeup', 143),\n", | |
" ('boy', 142),\n", | |
" ('hair', 142),\n", | |
" ('vegan', 142),\n", | |
" ('hot', 142),\n", | |
" ('wonderful', 141),\n", | |
" ('in', 141),\n", | |
" ('color', 140),\n", | |
" ('house', 140),\n", | |
" ('tourist', 140),\n", | |
" ('instafollow', 140),\n", | |
" ('eurotrip', 139),\n", | |
" ('swizerland', 139),\n", | |
" ('top', 139),\n", | |
" ('friday', 139),\n", | |
" ('lago', 139),\n", | |
" ('traveller', 138),\n", | |
" ('suica', 138),\n", | |
" ('pictureoftheday', 138),\n", | |
" ('instago', 138),\n", | |
" ('eggs', 138),\n", | |
" ('gold', 138),\n", | |
" ('potd', 138),\n", | |
" ('rhein', 138),\n", | |
" ('polymanga', 137),\n", | |
" ('wine', 137),\n", | |
" ('stgallen', 136),\n", | |
" ('rolex', 136),\n", | |
" ('mood', 136),\n", | |
" ('austria', 136),\n", | |
" ('dance', 136),\n", | |
" ('swan', 136),\n", | |
" ('fribourg', 135),\n", | |
" ('goodday', 134),\n", | |
" ('alpen', 134),\n", | |
" ('igdaily', 134),\n", | |
" ('printemps', 134),\n", | |
" ('swisslife', 134),\n", | |
" ('lamborghini', 134),\n", | |
" ('paris', 133),\n", | |
" ('apresski', 133),\n", | |
" ('march', 133),\n", | |
" ('boat', 132),\n", | |
" ('schnee', 132),\n", | |
" ('portrait', 132),\n", | |
" ('oldtown', 132),\n", | |
" ('crazy', 132),\n", | |
" ('lac', 132),\n", | |
" ('birthday', 132),\n", | |
" ('urban', 131),\n", | |
" ('f4f', 131),\n", | |
" ('arosa', 130),\n", | |
" ('tflers', 130),\n", | |
" ('latergram', 130),\n", | |
" ('swissmountains', 129),\n", | |
" ('winterwonderland', 128),\n", | |
" ('vaud', 128),\n", | |
" ('jetdeau', 128),\n", | |
" ('streetart', 128),\n", | |
" ('cat', 128),\n", | |
" ('bmw', 127),\n", | |
" ('ig_europe', 126),\n", | |
" ('titlis', 126),\n", | |
" ('look', 125),\n", | |
" ('fitfam', 125),\n", | |
" ('enjoylife', 125),\n", | |
" ('and', 124),\n", | |
" ('traveltheworld', 124),\n", | |
" ('blessed', 124),\n", | |
" ('paradise', 124),\n", | |
" ('montagne', 123),\n", | |
" ('outdoors', 123),\n", | |
" ('ig_swiss', 122),\n", | |
" ('vevey', 122),\n", | |
" ('dessert', 122),\n", | |
" ('couple', 121),\n", | |
" ('sunglasses', 121),\n", | |
" ('bike', 121),\n", | |
" ('zug', 120),\n", | |
" ('winterthur', 120),\n", | |
" ('hiphop', 120),\n", | |
" ('cars', 119),\n", | |
" ('baby', 119),\n", | |
" ('club', 118),\n", | |
" ('animal', 118),\n", | |
" ('ferrari', 117),\n", | |
" ('vintage', 117),\n", | |
" ('natur', 116),\n", | |
" ('friend', 116),\n", | |
" ('museum', 116),\n", | |
" ('qualitytime', 116),\n", | |
" ('carporn', 116),\n", | |
" ('goodvibes', 116),\n", | |
" ('loveyou', 115),\n", | |
" ('wood', 115),\n", | |
" ('igtravel', 115),\n", | |
" ('lindt', 115),\n", | |
" ('instagramers', 115),\n", | |
" ('italia', 114),\n", | |
" ('goodtime', 114),\n", | |
" ('buonapasqua', 114),\n", | |
" ('nature_perfection', 114),\n", | |
" ('fly', 113),\n", | |
" ('nofilterneeded', 113),\n", | |
" ('today', 113),\n", | |
" ('audi', 113),\n", | |
" ('bw', 112),\n", | |
" ('eat', 112),\n", | |
" ('shooting', 112),\n", | |
" ('watchesofinstagram', 111),\n", | |
" ('walking', 111),\n", | |
" ('supercar', 110),\n", | |
" ('igerssuisse', 110),\n", | |
" ('lakezurich', 110),\n", | |
" ('garden', 110),\n", | |
" ('likes', 110),\n", | |
" ('great', 110),\n", | |
" ('dj', 109),\n", | |
" ('traveler', 109),\n", | |
" ('super_switzerland', 109),\n", | |
" ('yellow', 109),\n", | |
" ('artist', 109),\n", | |
" ('porsche', 109),\n", | |
" ('landscape_lovers', 108),\n", | |
" ('drinks', 108),\n", | |
" ('happyday', 108),\n", | |
" ('handmade', 108),\n", | |
" ('run', 108),\n", | |
" ('naturephotography', 108),\n", | |
" ('goodnight', 107),\n", | |
" ('vierwaldstattersee', 107),\n", | |
" ('people', 106),\n", | |
" ('blonde', 106),\n", | |
" ('visitzurich', 106),\n", | |
" ('london', 105),\n", | |
" ('cheese', 105),\n", | |
" ('easterbunny', 105),\n", | |
" ('outdoor', 105),\n", | |
" ('fondue', 105),\n", | |
" ('ascona', 104),\n", | |
" ('followforfollow', 104),\n", | |
" ('watchoftheday', 104),\n", | |
" ('leman', 103),\n", | |
" ('lagomaggiore', 102),\n", | |
" ('streetphotography', 102),\n", | |
" ('reflection', 102),\n", | |
" ('lights', 101),\n", | |
" ('building', 101),\n", | |
" ('ice', 101),\n", | |
" ('iphone', 100),\n", | |
" ('genevalake', 100),\n", | |
" ('health', 100),\n", | |
" ('freeride', 100),\n", | |
" ('bodybuilding', 99),\n", | |
" ('igersgeneva', 99),\n", | |
" ('champagne', 99),\n", | |
" ('waterfall', 99),\n", | |
" ('beard', 99),\n", | |
" ('chilling', 98),\n", | |
" ('cloudporn', 98),\n", | |
" ('sister', 98),\n", | |
" ('primavera', 97),\n", | |
" ('dream', 97),\n", | |
" ('starbucks', 97),\n", | |
" ('instafashion', 97),\n", | |
" ('aviation', 97),\n", | |
" ('springbreak', 97),\n", | |
" ('rheinfall', 97),\n", | |
" ('dogsofinstagram', 97),\n", | |
" ('polymanga2016', 96),\n", | |
" ('liechtenstein', 96),\n", | |
" ('zurichcity', 96),\n", | |
" ('igersswitzerland', 95),\n", | |
" ('blogger', 95),\n", | |
" ('instanature', 95),\n", | |
" ('scenery', 95),\n", | |
" ('schaffhausen', 95),\n", | |
" ('outfit', 95),\n", | |
" ('horology', 94),\n", | |
" ('liveauthentic', 94),\n", | |
" ('shoes', 94),\n", | |
" ('nightlife', 94),\n", | |
" ('animals', 94),\n", | |
" ('adidas', 93),\n", | |
" ('interiordesign', 93),\n", | |
" ('instatraveling', 93),\n", | |
" ('jewelry', 92),\n", | |
" ('homemade', 92),\n", | |
" ('cake', 92),\n", | |
" ('tasty', 92),\n", | |
" ('nightout', 92),\n", | |
" ('wow', 91),\n", | |
" ('zurichairport', 91),\n", | |
" ('gstaad', 91),\n", | |
" ('mylife', 91),\n", | |
" ('rigi', 91),\n", | |
" ('video', 91),\n", | |
" ('mercedes', 91),\n", | |
" ('a', 91),\n", | |
" ('all_shots', 90),\n", | |
" ('flying', 90),\n", | |
" ('moment', 90),\n", | |
" ('deutschland', 89),\n", | |
" ('zurisee', 89),\n", | |
" ('mytravelgram', 89),\n", | |
" ('forever', 89),\n", | |
" ('beach', 89),\n", | |
" ('park', 88)]" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ordered" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" , switzerland , zurich , love , swiss , easter , mountains , travel , spring , snow , sun , nature , lake , geneva , beautiful , schweiz , happy , instagood , photooftheday , picoftheday , suisse , friends , alps , happyeaster , ski , fun , basel , landscape , sky , skiing , sunset , swissalps , family , bern , nofilter , luzern , amazing , view , europe , instadaily , weekend , geneve , fashion , art , holiday , sunnyday , me , mountain , food , lausanne , instalike , smile , style , like4like , lucerne , followme , clouds , architecture , blue , wanderlust , zermatt , instatravel , selfie , instamood , life , winter , city , ostern , trip , sunny , photography , flowers , blackandwhite , baselworld2016 , travelgram , bluesky , sunshine , instagram , girl , foodporn , home , verbier , party , igers , music , vscocam , beauty , tbt , montreux , vsco , luxury , instapic , baselworld , fitness , follow , svizzera , chocolate , visitswitzerland , lifestyle , snowboarding , night , relax , matterhorn , lacleman , photo , water , lugano , holidays , interlaken , myswitzerland , vacation , design , switzerlandwonderland , summer , goodtimes , stmoritz , morning , day , cute , 2016 , traveling , enjoy , tagsforlikes , sunday , tree , saturday , green , bestoftheday , goodmorning , loveit , travelling , instafood , river , happiness , white , hiking , nice , germany , snowboard , coffee , france , konstanz , inlovewithswitzerland , black , ticino , follow4follow , valais , healthy , instacool , work , adventure , watch , sport , trees , likeforlike , zuri , bodensee , awesome , watches , _ , springtime , light , italy , paques , fruhling , yummy , street , breakfast , graubunden , train , naturelovers , dinner , explore , davos , best , swissmade , girls , red , peace , laax , travelingram , sunrise , chill , like , workout , panorama , switzerlandpictures , cool , gopro , ootd , delicious , beautifulday , zurichsee , sweet , model , throwback , ig_switzerland , 
photographer , car , dog , suiza , beautifuldestinations , see , colorful , walk , colors , lunch , new , training , live , gym , foodie , forest , motivation , cold , world , beer , ischgl , familytime , castle , pasqua , running , fit , switzerland_vacations , restaurant , good , pink , bunny , roadtrip , homesweethome , time , my , l4l , picture , memories , lakegeneva , nike , alpes , inspiration , nikon , tb , instalove , sonne , grindelwald , church , canon , weather , travelphotography , engelberg , repost , concert , goodlife , tattoo , neverstopexploring , engadin , bridge , with , pretty , iloveswitzerland , instaphoto , lovely , watchporn , jungfrau , passion , wallis , airport , perfect , hotel , tourism , shopping , friendship , funny , monday , easterweekend , swag , instamoment , flower , berge , the , froheostern , thun , iphoneonly , mylove , pic , skyporn , bar , brunch , neuchatel , loveswitzerlandcontest , powder , fresh , evening , makeup , boy , hair , vegan , hot , wonderful , in , color , house , tourist , instafollow , eurotrip , swizerland , top , friday , lago , traveller , suica , pictureoftheday , instago , eggs , gold , potd , rhein , polymanga , wine , stgallen , rolex , mood , austria , dance , swan , fribourg , goodday , alpen , igdaily , printemps , swisslife , lamborghini , paris , apresski , march , boat , schnee , portrait , oldtown , crazy , lac , birthday , urban , f4f , arosa , tflers , latergram , swissmountains , winterwonderland , vaud , jetdeau , streetart , cat , bmw , ig_europe , titlis , look , fitfam , enjoylife , and , traveltheworld , blessed , paradise , montagne , outdoors , ig_swiss , vevey , dessert , couple , sunglasses , bike , zug , winterthur , hiphop , cars , baby , club , animal , ferrari , vintage , natur , friend , museum , qualitytime , carporn , goodvibes , loveyou , wood , igtravel , lindt , instagramers , italia , goodtime , buonapasqua , nature_perfection , fly , nofilterneeded , today , audi , bw , 
eat , shooting , watchesofinstagram , walking , supercar , igerssuisse , lakezurich , garden , likes , great , dj , traveler , super_switzerland , yellow , artist , porsche , landscape_lovers , drinks , happyday , handmade , run , naturephotography , goodnight , vierwaldstattersee , people , blonde , visitzurich , london , cheese , easterbunny , outdoor , fondue , ascona , followforfollow , watchoftheday , leman , lagomaggiore , streetphotography , reflection , lights , building , ice , iphone , genevalake , health , freeride , bodybuilding , igersgeneva , champagne , waterfall , beard , chilling , cloudporn , sister , primavera , dream , starbucks , instafashion , aviation , springbreak , rheinfall , dogsofinstagram , polymanga2016 , liechtenstein , zurichcity , igersswitzerland , blogger , instanature , scenery , schaffhausen , outfit , horology , liveauthentic , shoes , nightlife , animals , adidas , interiordesign , instatraveling , jewelry , homemade , cake , tasty , nightout , wow , zurichairport , gstaad , mylife , rigi , video , mercedes , a , all_shots , flying , moment , deutschland , zurisee , mytravelgram , forever , beach , park ,\n" | |
] | |
} | |
], | |
"source": [ | |
"for order in ordered:\n", | |
" print order[0],\",\"," | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"with open('/home/volodymyrmiz/Desktop/rawTags.txt', 'w') as f:\n", | |
" for tag in ordered:\n", | |
" if tag[0] != '':\n", | |
" f.write((tag[0] + ' ')*(tag[1] / 10))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from matplotlib import pyplot as plt\n", | |
"import matplotlib\n", | |
"matplotlib.style.use('ggplot')\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"frequentTags = [tag[0] for tag in ordered]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"frequency = [tag[1] for tag in ordered]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"y_pos = np.arange(len(frequentTags))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#plt.barh(y_pos, frequency, alpha=0.5)\n", | |
"#plt.yticks(y_pos, frequentTags)\n", | |
"#plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Find word co-occurrences" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"userTags = []\n", | |
"for user in tagsDB.find():\n", | |
" userTags.append([unicodedata.normalize('NFKD', tag).encode('ascii','ignore') \n", | |
" for tag in user['tags'] \n", | |
" if unicodedata.normalize('NFKD', tag).encode('ascii','ignore') != ''])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['contiki',\n", | |
" 'swissalps',\n", | |
" 'newfriends',\n", | |
" 'freezingmynutsoff',\n", | |
" 'walkabout',\n", | |
" 'jungfraujoch',\n", | |
" 'yolo',\n", | |
" 'travel',\n", | |
" 'noregrets']" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"userTags[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Co-occurrence for train:\n", | |
"train\n", | |
"switzerland\n", | |
"travel\n", | |
"zurich\n", | |
"mountains\n", | |
"swiss\n", | |
"easter\n", | |
"snow\n", | |
"nature\n", | |
"lake\n", | |
"photooftheday\n", | |
"alps\n", | |
"sky\n", | |
"beautiful\n", | |
"europe\n", | |
"spring\n", | |
"sbb\n", | |
"view\n", | |
"clouds\n", | |
"love\n", | |
"instagood\n" | |
] | |
} | |
], | |
"source": [ | |
"from collections import Counter\n", | |
"search_word = \"train\"\n", | |
"count_search = Counter()\n", | |
"for tag in userTags:\n", | |
" if search_word in tag:\n", | |
" count_search.update(tag)\n", | |
"print(\"Co-occurrence for %s:\" % search_word)\n", | |
"for word in count_search.most_common(21):\n", | |
" print word[0]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Topic analysis using LDA" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Latent Dirichlet Allocation (LDA).**\n", | |
"Many\n", | |
"clustering models restrict a document to being associated with a single topic. LDA,\n", | |
"on the other hand, involves three levels, and notably the topic node is sampled repeatedly within the\n", | |
"document. Under this model, documents can be associated with multiple topics." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from pyspark.mllib.clustering import LDA, LDAModel\n", | |
"from pyspark.mllib.linalg import Vectors" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"tagsList = []\n", | |
"for tag in tagsDB.find():\n", | |
" tagsList.append((str(tag['_id']), [unicodedata.normalize('NFKD', t).encode('ascii','ignore') \n", | |
" for t in tag['tags']\n", | |
" if unicodedata.normalize('NFKD', t).encode('ascii','ignore') != '']))" | |
] | |
}, | |
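{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Filter the tag list of each user. Remove rarely used tags (and, optionally, the most common ones)\n", | |
"Only tags that appear in `frequentTags` (the 500 most frequent tags) are kept. As written, `frequentTags[:]` keeps the most common tags as well; slice the list (for example `frequentTags[10:]`) to drop them too." | |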
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"filteredList = []\n", | |
"for tag in tagsList:\n", | |
" filteredList.append((tag[0], list(set(tag[1]).intersection(frequentTags[:]))))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"tagsListDF = sc.parallelize(filteredList).toDF([\"id\", \"tokens\"])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Vectorize tags arrays for each user" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from pyspark.ml.feature import CountVectorizer" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"vectorizer = CountVectorizer(inputCol=\"tokens\", outputCol=\"features\").fit(tagsListDF)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"countVectors = vectorizer.transform(tagsListDF).select(\"id\", \"features\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[Row(id=u'234933728', features=SparseVector(499, {6: 1.0, 30: 1.0}))]" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"countVectors.take(1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Compute TF-IDF weights for each tag instead of using raw bag-of-words counts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from pyspark.mllib.feature import IDF" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"frequencyVectors = countVectors.map(lambda vector: vector[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[SparseVector(499, {6: 1.0, 30: 1.0}), SparseVector(499, {113: 1.0, 210: 1.0})]" | |
] | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"frequencyVectors.take(2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"frequencyVectors.cache()\n", | |
"idf = IDF().fit(frequencyVectors)\n", | |
"tfidf = idf.transform(frequencyVectors)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[SparseVector(499, {6: 2.8768, 30: 3.7561})]" | |
] | |
}, | |
"execution_count": 36, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tfidf.take(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#just in case, if ids are needed\n", | |
"tfidf_with_ids = countVectors.map(lambda vector: int(vector[0])).zip(tfidf).map(lambda pair: [pair[0], pair[1]])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[[234933728, SparseVector(499, {6: 2.8768, 30: 3.7561})]]" | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tfidf_with_ids.take(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"corpus = tfidf.map(lambda x: [1, x]).cache()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[[1, SparseVector(499, {6: 2.8768, 30: 3.7561})],\n", | |
" [1, SparseVector(499, {113: 4.6173, 210: 5.1634})],\n", | |
" [1, SparseVector(499, {})],\n", | |
" [1, SparseVector(499, {22: 3.4672})],\n", | |
" [1,\n", | |
" SparseVector(499, {2: 2.6026, 8: 2.9656, 13: 3.1635, 16: 3.2196, 17: 3.3231, 18: 3.3302, 20: 3.3766, 23: 3.4764, 35: 3.8386, 43: 3.946, 45: 3.9744, 63: 4.1881, 76: 4.2805, 85: 4.3708, 89: 4.4099, 109: 4.5946, 127: 4.7141, 327: 5.5614, 401: 5.7571})],\n", | |
" [1, SparseVector(499, {418: 5.7753})],\n", | |
" [1, SparseVector(499, {})],\n", | |
" [1, SparseVector(499, {2: 2.6026, 24: 3.5149, 114: 4.6289, 176: 5.0243})],\n", | |
" [1, SparseVector(499, {158: 4.9412})],\n", | |
" [1, SparseVector(499, {})]]" | |
] | |
}, | |
"execution_count": 40, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"corpus.take(10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Build Latent Dirichlet Allocation model for clustering" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"ldaModel = LDA.train(corpus, k = 15, maxIterations=100, optimizer=\"online\", docConcentration=2.0, topicConcentration=3.0)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Note: LDA does not perform well with the EMLDAOptimizer which is used by default. In the case of EMLDAOptimizer we have significant bies to the most popular hashtags. I used the OnlineLDAOptimizer instead. The Optimizer implements the Online variational Bayes LDA algorithm, which processes a subset of the corpus on each iteration, and updates the term-topic distribution adaptively." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"499" | |
] | |
}, | |
"execution_count": 43, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(ldaModel.topicsMatrix())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"([8, 23, 5, 64, 28],\n", | |
" [0.06482984277484072,\n", | |
" 0.045950294558274096,\n", | |
" 0.039156100706073844,\n", | |
" 0.031679928472898536,\n", | |
" 0.030337389898223453])" | |
] | |
}, | |
"execution_count": 45, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"topicIndices[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"vocablist = vectorizer.vocabulary" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<bound method LDAModel.vocabSize of <pyspark.mllib.clustering.LDAModel object at 0x7f959069d210>>" | |
] | |
}, | |
"execution_count": 47, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ldaModel.vocabSize" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# from operator import itemgetter \n", | |
"# for topic in topicIndices:\n", | |
"# text = itemgetter(*topic[0])(vocablist)\n", | |
"# print \"TOPIC\"\n", | |
"# for tag in text:\n", | |
"# print tag, topic[1][text.index(tag)]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Visualization" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"topicsRDD = sc.parallelize(topicIndices)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import operator\n", | |
"termsRDD = topicsRDD.map(lambda topic: (zip(operator.itemgetter(*topic[0])(vocablist), topic[1])))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[[(u'snow', 0.06482984277484072),\n", | |
" (u'ski', 0.045950294558274096),\n", | |
" (u'mountains', 0.039156100706073844),\n", | |
" (u'winter', 0.031679928472898536),\n", | |
" (u'skiing', 0.030337389898223453)],\n", | |
" [(u'nature', 0.030827512537037274),\n", | |
" (u'lake', 0.02478979431897514),\n", | |
" (u'spring', 0.024462219076508293),\n", | |
" (u'landscape', 0.0224125857946134),\n", | |
" (u'flowers', 0.02067768513339612)],\n", | |
" [(u'luzern', 0.036862489880800375),\n", | |
" (u'switzerland', 0.0338325739317431),\n", | |
" (u'verbier', 0.02953757773230965),\n", | |
" (u'zurich', 0.02188718968235265),\n", | |
" (u'swiss', 0.02123679980747416)],\n", | |
" [(u'art', 0.03698756198224295),\n", | |
" (u'zurich', 0.03313016298308508),\n", | |
" (u'switzerland', 0.03157849816347215),\n", | |
" (u'easter', 0.025158555831459168),\n", | |
" (u'family', 0.0223036711631496)],\n", | |
" [(u'travel', 0.06901255967841895),\n", | |
" (u'instatravel', 0.037299454043090645),\n", | |
" (u'europe', 0.034330202644613936),\n", | |
" (u'travelgram', 0.03368921018190022),\n", | |
" (u'trip', 0.03304379062370829)],\n", | |
" [(u'switzerland', 0.034456317666373186),\n", | |
" (u'goodtimes', 0.03152461878027823),\n", | |
" (u'zurich', 0.02950826562264097),\n", | |
" (u'weekend', 0.022231218091606136),\n", | |
" (u'tb', 0.018888255116828026)],\n", | |
" [(u'geneva', 0.05857098842284779),\n", | |
" (u'car', 0.031332107870771786),\n", | |
" (u'switzerland', 0.0276575550779648),\n", | |
" (u'lamborghini', 0.02190996776182064),\n", | |
" (u'ferrari', 0.02052769480440934)],\n", | |
" [(u'visitswitzerland', 0.03794345847628957),\n", | |
" (u'vscocam', 0.034333155243237684),\n", | |
" (u'switzerlandwonderland', 0.03226477690795611),\n", | |
" (u'vsco', 0.03213684652910808),\n", | |
" (u'myswitzerland', 0.028912085943440736)],\n", | |
" [(u'fitness', 0.050251802705119475),\n", | |
" (u'healthy', 0.02958433988432552),\n", | |
" (u'sport', 0.029353669207932892),\n", | |
" (u'workout', 0.028909345997833683),\n", | |
" (u'motivation', 0.028554499225599026)],\n", | |
" [(u'suisse', 0.04139109858365735),\n", | |
" (u'montreux', 0.03704523460681258),\n", | |
" (u'lacleman', 0.03250671538638928),\n", | |
" (u'lausanne', 0.02801144795578456),\n", | |
" (u'switzerland', 0.02688108605034323)],\n", | |
" [(u'instagood', 0.03645624632489141),\n", | |
" (u'picoftheday', 0.03076251964509694),\n", | |
" (u'photooftheday', 0.029703059421505945),\n", | |
" (u'instadaily', 0.028632883460075267),\n", | |
" (u'instalike', 0.026769381245962745)],\n", | |
" [(u'music', 0.040575034449173424),\n", | |
" (u'party', 0.03921196577945221),\n", | |
" (u'friends', 0.027131120772706),\n", | |
" (u'konstanz', 0.024898402612021024),\n", | |
" (u'bodensee', 0.02318591615707108)],\n", | |
" [(u'baselworld2016', 0.056018990170552174),\n", | |
" (u'baselworld', 0.05048503702936801),\n", | |
" (u'basel', 0.04590056891456354),\n", | |
" (u'luxury', 0.041513618850625066),\n", | |
" (u'watches', 0.03597274460743156)],\n", | |
" [(u'swiss', 0.02934004343949627),\n", | |
" (u'nofilter', 0.02346935547247407),\n", | |
" (u'switzerland', 0.021960428794446603),\n", | |
" (u'selfie', 0.020904185010864756),\n", | |
" (u'love', 0.019373152419980853)],\n", | |
" [(u'food', 0.030915742768915375),\n", | |
" (u'foodporn', 0.03030777683238159),\n", | |
" (u'day', 0.02158525753702872),\n", | |
" (u'instafood', 0.020540403766244577),\n", | |
" (u'yummy', 0.019046245223605044)]]" | |
] | |
}, | |
"execution_count": 51, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"termsRDD.take(25)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"indexedTermsRDD = termsRDD.zipWithIndex()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"termsRDD = indexedTermsRDD.flatMap(lambda term: [(t[0], t[1], term[1]) for t in term[0]])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"termDF = termsRDD.toDF(['term', 'probability', 'topicId'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[Row(term=u'snow', probability=0.06482984277484072, topicId=0),\n", | |
" Row(term=u'ski', probability=0.045950294558274096, topicId=0),\n", | |
" Row(term=u'mountains', probability=0.039156100706073844, topicId=0),\n", | |
" Row(term=u'winter', probability=0.031679928472898536, topicId=0),\n", | |
" Row(term=u'skiing', probability=0.030337389898223453, topicId=0),\n", | |
" Row(term=u'nature', probability=0.030827512537037274, topicId=1),\n", | |
" Row(term=u'lake', probability=0.02478979431897514, topicId=1),\n", | |
" Row(term=u'spring', probability=0.024462219076508293, topicId=1),\n", | |
" Row(term=u'landscape', probability=0.0224125857946134, topicId=1),\n", | |
" Row(term=u'flowers', probability=0.02067768513339612, topicId=1)]" | |
] | |
}, | |
"execution_count": 55, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"termDF.take(10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"rawJson = termDF.toJSON().collect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from IPython.core.display import display, HTML\n", | |
"from IPython.display import Javascript\n", | |
"\n", | |
"s = \"\"\n", | |
"for line in rawJson:\n", | |
" s += (str(line) +',')\n", | |
"stringJson = s[:-1]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'{\"term\":\"snow\",\"probability\":0.06482984277484072,\"topicId\":0},{\"term\":\"ski\",\"probability\":0.045950294558274096,\"topicId\":0},{\"term\":\"mountains\",\"probability\":0.039156100706073844,\"topicId\":0},{\"term\":\"winter\",\"probability\":0.031679928472898536,\"topicId\":0},{\"term\":\"skiing\",\"probability\":0.030337389898223453,\"topicId\":0},{\"term\":\"nature\",\"probability\":0.030827512537037274,\"topicId\":1},{\"term\":\"lake\",\"probability\":0.02478979431897514,\"topicId\":1},{\"term\":\"spring\",\"probability\":0.024462219076508293,\"topicId\":1},{\"term\":\"landscape\",\"probability\":0.0224125857946134,\"topicId\":1},{\"term\":\"flowers\",\"probability\":0.02067768513339612,\"topicId\":1},{\"term\":\"luzern\",\"probability\":0.036862489880800375,\"topicId\":2},{\"term\":\"switzerland\",\"probability\":0.0338325739317431,\"topicId\":2},{\"term\":\"verbier\",\"probability\":0.02953757773230965,\"topicId\":2},{\"term\":\"zurich\",\"probability\":0.02188718968235265,\"topicId\":2},{\"term\":\"swiss\",\"probability\":0.02123679980747416,\"topicId\":2},{\"term\":\"art\",\"probability\":0.03698756198224295,\"topicId\":3},{\"term\":\"zurich\",\"probability\":0.03313016298308508,\"topicId\":3},{\"term\":\"switzerland\",\"probability\":0.03157849816347215,\"topicId\":3},{\"term\":\"easter\",\"probability\":0.025158555831459168,\"topicId\":3},{\"term\":\"family\",\"probability\":0.0223036711631496,\"topicId\":3},{\"term\":\"travel\",\"probability\":0.06901255967841895,\"topicId\":4},{\"term\":\"instatravel\",\"probability\":0.037299454043090645,\"topicId\":4},{\"term\":\"europe\",\"probability\":0.034330202644613936,\"topicId\":4},{\"term\":\"travelgram\",\"probability\":0.03368921018190022,\"topicId\":4},{\"term\":\"trip\",\"probability\":0.03304379062370829,\"topicId\":4},{\"term\":\"switzerland\",\"probability\":0.034456317666373186,\"topicId\":5},{\"term\":\"goodtimes\",\"probability\":0.03152461878027823,\"topicId\":5},{\"term\":\"zurich\",\"p
robability\":0.02950826562264097,\"topicId\":5},{\"term\":\"weekend\",\"probability\":0.022231218091606136,\"topicId\":5},{\"term\":\"tb\",\"probability\":0.018888255116828026,\"topicId\":5},{\"term\":\"geneva\",\"probability\":0.05857098842284779,\"topicId\":6},{\"term\":\"car\",\"probability\":0.031332107870771786,\"topicId\":6},{\"term\":\"switzerland\",\"probability\":0.0276575550779648,\"topicId\":6},{\"term\":\"lamborghini\",\"probability\":0.02190996776182064,\"topicId\":6},{\"term\":\"ferrari\",\"probability\":0.02052769480440934,\"topicId\":6},{\"term\":\"visitswitzerland\",\"probability\":0.03794345847628957,\"topicId\":7},{\"term\":\"vscocam\",\"probability\":0.034333155243237684,\"topicId\":7},{\"term\":\"switzerlandwonderland\",\"probability\":0.03226477690795611,\"topicId\":7},{\"term\":\"vsco\",\"probability\":0.03213684652910808,\"topicId\":7},{\"term\":\"myswitzerland\",\"probability\":0.028912085943440736,\"topicId\":7},{\"term\":\"fitness\",\"probability\":0.050251802705119475,\"topicId\":8},{\"term\":\"healthy\",\"probability\":0.02958433988432552,\"topicId\":8},{\"term\":\"sport\",\"probability\":0.029353669207932892,\"topicId\":8},{\"term\":\"workout\",\"probability\":0.028909345997833683,\"topicId\":8},{\"term\":\"motivation\",\"probability\":0.028554499225599026,\"topicId\":8},{\"term\":\"suisse\",\"probability\":0.04139109858365735,\"topicId\":9},{\"term\":\"montreux\",\"probability\":0.03704523460681258,\"topicId\":9},{\"term\":\"lacleman\",\"probability\":0.03250671538638928,\"topicId\":9},{\"term\":\"lausanne\",\"probability\":0.02801144795578456,\"topicId\":9},{\"term\":\"switzerland\",\"probability\":0.02688108605034323,\"topicId\":9},{\"term\":\"instagood\",\"probability\":0.03645624632489141,\"topicId\":10},{\"term\":\"picoftheday\",\"probability\":0.03076251964509694,\"topicId\":10},{\"term\":\"photooftheday\",\"probability\":0.029703059421505945,\"topicId\":10},{\"term\":\"instadaily\",\"probability\":0.028632883460075267,\"topicId\
":10},{\"term\":\"instalike\",\"probability\":0.026769381245962745,\"topicId\":10},{\"term\":\"music\",\"probability\":0.040575034449173424,\"topicId\":11},{\"term\":\"party\",\"probability\":0.03921196577945221,\"topicId\":11},{\"term\":\"friends\",\"probability\":0.027131120772706,\"topicId\":11},{\"term\":\"konstanz\",\"probability\":0.024898402612021024,\"topicId\":11},{\"term\":\"bodensee\",\"probability\":0.02318591615707108,\"topicId\":11},{\"term\":\"baselworld2016\",\"probability\":0.056018990170552174,\"topicId\":12},{\"term\":\"baselworld\",\"probability\":0.05048503702936801,\"topicId\":12},{\"term\":\"basel\",\"probability\":0.04590056891456354,\"topicId\":12},{\"term\":\"luxury\",\"probability\":0.041513618850625066,\"topicId\":12},{\"term\":\"watches\",\"probability\":0.03597274460743156,\"topicId\":12},{\"term\":\"swiss\",\"probability\":0.02934004343949627,\"topicId\":13},{\"term\":\"nofilter\",\"probability\":0.02346935547247407,\"topicId\":13},{\"term\":\"switzerland\",\"probability\":0.021960428794446603,\"topicId\":13},{\"term\":\"selfie\",\"probability\":0.020904185010864756,\"topicId\":13},{\"term\":\"love\",\"probability\":0.019373152419980853,\"topicId\":13},{\"term\":\"food\",\"probability\":0.030915742768915375,\"topicId\":14},{\"term\":\"foodporn\",\"probability\":0.03030777683238159,\"topicId\":14},{\"term\":\"day\",\"probability\":0.02158525753702872,\"topicId\":14},{\"term\":\"instafood\",\"probability\":0.020540403766244577,\"topicId\":14},{\"term\":\"yummy\",\"probability\":0.019046245223605044,\"topicId\":14}'" | |
] | |
}, | |
"execution_count": 58, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"stringJson" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"html_code = \"\"\"\n", | |
"<!DOCTYPE html>\n", | |
"<meta charset=\"utf-8\">\n", | |
"<style>\n", | |
"\n", | |
"circle {\n", | |
" fill: rgb(31, 119, 180);\n", | |
" fill-opacity: 0.5;\n", | |
" stroke: rgb(31, 119, 180);\n", | |
" stroke-width: 1px;\n", | |
"}\n", | |
"\n", | |
".leaf circle {\n", | |
" fill: #ff7f0e;\n", | |
" fill-opacity: 1;\n", | |
"}\n", | |
"\n", | |
"text {\n", | |
" font: 14px sans-serif;\n", | |
"}\n", | |
"\n", | |
"</style>\n", | |
"<body>\n", | |
"<script src=\"https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js\"></script>\n", | |
"\n", | |
"<script>\n", | |
"\n", | |
"var json = {\n", | |
" \"name\": \"data\",\n", | |
" \"children\": [\n", | |
" {\n", | |
" \"name\": \"topics\",\n", | |
" \"children\": [\n", | |
" %s\n", | |
" ]\n", | |
" }\n", | |
" ]\n", | |
"};\n", | |
"\n", | |
"var r = 1500,\n", | |
" format = d3.format(\",d\"),\n", | |
" fill = d3.scale.category20c();\n", | |
"\n", | |
"var bubble = d3.layout.pack()\n", | |
" .sort(null)\n", | |
" .size([r, r])\n", | |
" .padding(1.5);\n", | |
"\n", | |
"var vis = d3.select(\"body\").append(\"svg\")\n", | |
" .attr(\"width\", r)\n", | |
" .attr(\"height\", r)\n", | |
" .attr(\"class\", \"bubble\");\n", | |
"\n", | |
" \n", | |
"var node = vis.selectAll(\"g.node\")\n", | |
" .data(bubble.nodes(classes(json))\n", | |
" .filter(function(d) { return !d.children; }))\n", | |
" .enter().append(\"g\")\n", | |
" .attr(\"class\", \"node\")\n", | |
" .attr(\"transform\", function(d) { return \"translate(\" + d.x + \",\" + d.y + \")\"; })\n", | |
" color = d3.scale.category20();\n", | |
" \n", | |
" node.append(\"title\")\n", | |
" .text(function(d) { return d.className + \": \" + format(d.value); });\n", | |
"\n", | |
" node.append(\"circle\")\n", | |
" .attr(\"r\", function(d) { return d.r; })\n", | |
" .style(\"fill\", function(d) {return color(d.topicName);});\n", | |
"\n", | |
"var text = node.append(\"text\")\n", | |
" .attr(\"text-anchor\", \"middle\")\n", | |
" .attr(\"dy\", \".3em\")\n", | |
" .text(function(d) { return d.className.substring(0, d.r / 3)});\n", | |
" \n", | |
" text.append(\"tspan\")\n", | |
" .attr(\"dy\", \"1.2em\")\n", | |
" .attr(\"x\", 0)\n", | |
" .text(function(d) {return Math.ceil(d.value * 10000) /10000; });\n", | |
"\n", | |
"// Returns a flattened hierarchy containing all leaf nodes under the root.\n", | |
"function classes(root) {\n", | |
" var classes = [];\n", | |
"\n", | |
" function recurse(term, node) {\n", | |
" if (node.children) node.children.forEach(function(child) { recurse(node.term, child); });\n", | |
" else classes.push({topicName: node.topicId, className: node.term, value: node.probability});\n", | |
" }\n", | |
"\n", | |
" recurse(null, root);\n", | |
" return {children: classes};\n", | |
"}\n", | |
"\n", | |
"</script>\"\"\" % stringJson" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 60, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<!DOCTYPE html>\n", | |
"<meta charset=\"utf-8\">\n", | |
"<style>\n", | |
"\n", | |
"circle {\n", | |
" fill: rgb(31, 119, 180);\n", | |
" fill-opacity: 0.5;\n", | |
" stroke: rgb(31, 119, 180);\n", | |
" stroke-width: 1px;\n", | |
"}\n", | |
"\n", | |
".leaf circle {\n", | |
" fill: #ff7f0e;\n", | |
" fill-opacity: 1;\n", | |
"}\n", | |
"\n", | |
"text {\n", | |
" font: 14px sans-serif;\n", | |
"}\n", | |
"\n", | |
"</style>\n", | |
"<body>\n", | |
"<script src=\"https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js\"></script>\n", | |
"\n", | |
"<script>\n", | |
"\n", | |
"var json = {\n", | |
" \"name\": \"data\",\n", | |
" \"children\": [\n", | |
" {\n", | |
" \"name\": \"topics\",\n", | |
" \"children\": [\n", | |
" {\"term\":\"snow\",\"probability\":0.06482984277484072,\"topicId\":0},{\"term\":\"ski\",\"probability\":0.045950294558274096,\"topicId\":0},{\"term\":\"mountains\",\"probability\":0.039156100706073844,\"topicId\":0},{\"term\":\"winter\",\"probability\":0.031679928472898536,\"topicId\":0},{\"term\":\"skiing\",\"probability\":0.030337389898223453,\"topicId\":0},{\"term\":\"nature\",\"probability\":0.030827512537037274,\"topicId\":1},{\"term\":\"lake\",\"probability\":0.02478979431897514,\"topicId\":1},{\"term\":\"spring\",\"probability\":0.024462219076508293,\"topicId\":1},{\"term\":\"landscape\",\"probability\":0.0224125857946134,\"topicId\":1},{\"term\":\"flowers\",\"probability\":0.02067768513339612,\"topicId\":1},{\"term\":\"luzern\",\"probability\":0.036862489880800375,\"topicId\":2},{\"term\":\"switzerland\",\"probability\":0.0338325739317431,\"topicId\":2},{\"term\":\"verbier\",\"probability\":0.02953757773230965,\"topicId\":2},{\"term\":\"zurich\",\"probability\":0.02188718968235265,\"topicId\":2},{\"term\":\"swiss\",\"probability\":0.02123679980747416,\"topicId\":2},{\"term\":\"art\",\"probability\":0.03698756198224295,\"topicId\":3},{\"term\":\"zurich\",\"probability\":0.03313016298308508,\"topicId\":3},{\"term\":\"switzerland\",\"probability\":0.03157849816347215,\"topicId\":3},{\"term\":\"easter\",\"probability\":0.025158555831459168,\"topicId\":3},{\"term\":\"family\",\"probability\":0.0223036711631496,\"topicId\":3},{\"term\":\"travel\",\"probability\":0.06901255967841895,\"topicId\":4},{\"term\":\"instatravel\",\"probability\":0.037299454043090645,\"topicId\":4},{\"term\":\"europe\",\"probability\":0.034330202644613936,\"topicId\":4},{\"term\":\"travelgram\",\"probability\":0.03368921018190022,\"topicId\":4},{\"term\":\"trip\",\"probability\":0.03304379062370829,\"topicId\":4},{\"term\":\"switzerland\",\"probability\":0.034456317666373186,\"topicId\":5},{\"term\":\"goodtimes\",\"probability\":0.03152461878027823,\"topicId\":5},{\"term\":\"zurich\",\"p
robability\":0.02950826562264097,\"topicId\":5},{\"term\":\"weekend\",\"probability\":0.022231218091606136,\"topicId\":5},{\"term\":\"tb\",\"probability\":0.018888255116828026,\"topicId\":5},{\"term\":\"geneva\",\"probability\":0.05857098842284779,\"topicId\":6},{\"term\":\"car\",\"probability\":0.031332107870771786,\"topicId\":6},{\"term\":\"switzerland\",\"probability\":0.0276575550779648,\"topicId\":6},{\"term\":\"lamborghini\",\"probability\":0.02190996776182064,\"topicId\":6},{\"term\":\"ferrari\",\"probability\":0.02052769480440934,\"topicId\":6},{\"term\":\"visitswitzerland\",\"probability\":0.03794345847628957,\"topicId\":7},{\"term\":\"vscocam\",\"probability\":0.034333155243237684,\"topicId\":7},{\"term\":\"switzerlandwonderland\",\"probability\":0.03226477690795611,\"topicId\":7},{\"term\":\"vsco\",\"probability\":0.03213684652910808,\"topicId\":7},{\"term\":\"myswitzerland\",\"probability\":0.028912085943440736,\"topicId\":7},{\"term\":\"fitness\",\"probability\":0.050251802705119475,\"topicId\":8},{\"term\":\"healthy\",\"probability\":0.02958433988432552,\"topicId\":8},{\"term\":\"sport\",\"probability\":0.029353669207932892,\"topicId\":8},{\"term\":\"workout\",\"probability\":0.028909345997833683,\"topicId\":8},{\"term\":\"motivation\",\"probability\":0.028554499225599026,\"topicId\":8},{\"term\":\"suisse\",\"probability\":0.04139109858365735,\"topicId\":9},{\"term\":\"montreux\",\"probability\":0.03704523460681258,\"topicId\":9},{\"term\":\"lacleman\",\"probability\":0.03250671538638928,\"topicId\":9},{\"term\":\"lausanne\",\"probability\":0.02801144795578456,\"topicId\":9},{\"term\":\"switzerland\",\"probability\":0.02688108605034323,\"topicId\":9},{\"term\":\"instagood\",\"probability\":0.03645624632489141,\"topicId\":10},{\"term\":\"picoftheday\",\"probability\":0.03076251964509694,\"topicId\":10},{\"term\":\"photooftheday\",\"probability\":0.029703059421505945,\"topicId\":10},{\"term\":\"instadaily\",\"probability\":0.028632883460075267,\"topicId\
":10},{\"term\":\"instalike\",\"probability\":0.026769381245962745,\"topicId\":10},{\"term\":\"music\",\"probability\":0.040575034449173424,\"topicId\":11},{\"term\":\"party\",\"probability\":0.03921196577945221,\"topicId\":11},{\"term\":\"friends\",\"probability\":0.027131120772706,\"topicId\":11},{\"term\":\"konstanz\",\"probability\":0.024898402612021024,\"topicId\":11},{\"term\":\"bodensee\",\"probability\":0.02318591615707108,\"topicId\":11},{\"term\":\"baselworld2016\",\"probability\":0.056018990170552174,\"topicId\":12},{\"term\":\"baselworld\",\"probability\":0.05048503702936801,\"topicId\":12},{\"term\":\"basel\",\"probability\":0.04590056891456354,\"topicId\":12},{\"term\":\"luxury\",\"probability\":0.041513618850625066,\"topicId\":12},{\"term\":\"watches\",\"probability\":0.03597274460743156,\"topicId\":12},{\"term\":\"swiss\",\"probability\":0.02934004343949627,\"topicId\":13},{\"term\":\"nofilter\",\"probability\":0.02346935547247407,\"topicId\":13},{\"term\":\"switzerland\",\"probability\":0.021960428794446603,\"topicId\":13},{\"term\":\"selfie\",\"probability\":0.020904185010864756,\"topicId\":13},{\"term\":\"love\",\"probability\":0.019373152419980853,\"topicId\":13},{\"term\":\"food\",\"probability\":0.030915742768915375,\"topicId\":14},{\"term\":\"foodporn\",\"probability\":0.03030777683238159,\"topicId\":14},{\"term\":\"day\",\"probability\":0.02158525753702872,\"topicId\":14},{\"term\":\"instafood\",\"probability\":0.020540403766244577,\"topicId\":14},{\"term\":\"yummy\",\"probability\":0.019046245223605044,\"topicId\":14}\n", | |
" ]\n", | |
" }\n", | |
" ]\n", | |
"};\n", | |
"\n", | |
"var r = 1500,\n", | |
" format = d3.format(\",d\"),\n", | |
" fill = d3.scale.category20c();\n", | |
"\n", | |
"var bubble = d3.layout.pack()\n", | |
" .sort(null)\n", | |
" .size([r, r])\n", | |
" .padding(1.5);\n", | |
"\n", | |
"var vis = d3.select(\"body\").append(\"svg\")\n", | |
" .attr(\"width\", r)\n", | |
" .attr(\"height\", r)\n", | |
" .attr(\"class\", \"bubble\");\n", | |
"\n", | |
" \n", | |
"var node = vis.selectAll(\"g.node\")\n", | |
" .data(bubble.nodes(classes(json))\n", | |
" .filter(function(d) { return !d.children; }))\n", | |
" .enter().append(\"g\")\n", | |
" .attr(\"class\", \"node\")\n", | |
" .attr(\"transform\", function(d) { return \"translate(\" + d.x + \",\" + d.y + \")\"; })\n", | |
" color = d3.scale.category20();\n", | |
" \n", | |
" node.append(\"title\")\n", | |
" .text(function(d) { return d.className + \": \" + format(d.value); });\n", | |
"\n", | |
" node.append(\"circle\")\n", | |
" .attr(\"r\", function(d) { return d.r; })\n", | |
" .style(\"fill\", function(d) {return color(d.topicName);});\n", | |
"\n", | |
"var text = node.append(\"text\")\n", | |
" .attr(\"text-anchor\", \"middle\")\n", | |
" .attr(\"dy\", \".3em\")\n", | |
" .text(function(d) { return d.className.substring(0, d.r / 3)});\n", | |
" \n", | |
" text.append(\"tspan\")\n", | |
" .attr(\"dy\", \"1.2em\")\n", | |
" .attr(\"x\", 0)\n", | |
" .text(function(d) {return Math.ceil(d.value * 10000) /10000; });\n", | |
"\n", | |
"// Returns a flattened hierarchy containing all leaf nodes under the root.\n", | |
"function classes(root) {\n", | |
" var classes = [];\n", | |
"\n", | |
" function recurse(term, node) {\n", | |
" if (node.children) node.children.forEach(function(child) { recurse(node.term, child); });\n", | |
" else classes.push({topicName: node.topicId, className: node.term, value: node.probability});\n", | |
" }\n", | |
"\n", | |
" recurse(null, root);\n", | |
" return {children: classes};\n", | |
"}\n", | |
"\n", | |
"</script>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"display(HTML(html_code))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment