Created
February 1, 2017 19:29
-
-
Save anonymous/e8d82127e250e821225bc0d874ef4ef3 to your computer and use it in GitHub Desktop.
FNC1 - Data Handling Resources
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "'''\nauthor: brian mcmahan\n\nparser and pandas data manager for the fake news challenge first's dataset: fnc1\n\nthanks to napsternxg to converting it to json first\n'''\nimport urllib\nimport pandas as pd\nimport ujson as json", | |
"execution_count": 121, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "fpath = 'https://raw.githubusercontent.com/napsternxg/fnc-1/8836b0b51b0826435f59152013cced8950267595/train_combined.json'\nfnc1 = Dataset.from_jsonurl(fpath)", | |
"execution_count": 111, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "print(fnc1)", | |
"execution_count": 112, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[FNC-I][][1683 datums]\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "print(fnc1.mode)\nprint(fnc1[0])\nfnc1[1:15]", | |
"execution_count": 113, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "list\n[Story0][36 stances]\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": "[[Story4][1 stances],\n [Story5][56 stances],\n [Story6][1 stances],\n [Story7][102 stances],\n [Story8][45 stances],\n [Story9][2 stances],\n [Story10][46 stances],\n [Story11][71 stances],\n [Story13][15 stances],\n [Story14][1 stances],\n [Story15][26 stances],\n [Story16][1 stances],\n [Story17][52 stances],\n [Story18][1 stances]]" | |
}, | |
"metadata": {}, | |
"execution_count": 113 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "fnc1[100].agree", | |
"execution_count": 114, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": " headline label\n0 'Nasa Confirms Earth Will Experience 6 Days of... agree\n1 ’6 Days Darkness in December 2014′ 100% Fake; ... agree", | |
"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>headline</th>\n <th>label</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>'Nasa Confirms Earth Will Experience 6 Days of...</td>\n <td>agree</td>\n </tr>\n <tr>\n <th>1</th>\n <td>’6 Days Darkness in December 2014′ 100% Fake; ...</td>\n <td>agree</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {}, | |
"execution_count": 114 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "fnc1[104].discuss", | |
"execution_count": 115, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": " headline label\n0 Heartbroken girl spends week in KFC after gett... discuss\n1 Comfort eating? Chinese woman, 26, spends an e... discuss", | |
"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>headline</th>\n <th>label</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Heartbroken girl spends week in KFC after gett...</td>\n <td>discuss</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Comfort eating? Chinese woman, 26, spends an e...</td>\n <td>discuss</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {}, | |
"execution_count": 115 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "fnc1[155].disagree", | |
"execution_count": 119, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": " headline label\n2 Batmobile Stolen From \"Batman v Superman: Dawn... disagree", | |
"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>headline</th>\n <th>label</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2</th>\n <td>Batmobile Stolen From \"Batman v Superman: Dawn...</td>\n <td>disagree</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {}, | |
"execution_count": 119 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "class Story(object):\n def __init__(self, body_id, article_body, stances):\n self.body_id = body_id\n self.article_body = article_body\n self.stances = stances\n \n \n @classmethod\n def from_raw(cls, body_info, stances):\n return cls(body_info['Body ID'], body_info['articleBody'], pd.DataFrame(stances))\n \n @property\n def unrelated(self):\n return self.stances[self.stances.label=='unrelated']\n\n @property\n def agree(self):\n return self.stances[self.stances.label=='agree']\n \n @property\n def disagree(self):\n return self.stances[self.stances.label=='disagree']\n \n @property\n def discuss(self):\n return self.stances[self.stances.label=='discuss']\n \n def __str__(self):\n return \"[Story{}][{} stances]\".format(self.body_id, len(self.stances))\n \n def __repr__(self):\n return str(self)\n \nclass Dataset(object):\n def __init__(self, data):\n self.data = data\n self.split = \"\"\n self.mode = \"list\"\n self._index = [d.body_id for d in data.values()]\n\n @classmethod\n def from_dict(cls, data):\n stances = {d['Body ID']:[] for d in data['bodies']}\n for stance in data['stances']:\n stances[stance['Body ID']].append({'headline': stance['Headline'],\n 'label': stance['Stance']})\n bodies = {d['Body ID']:Story.from_raw(body_info=d, stances=stances[d['Body ID']]) \n for d in data['bodies']}\n return cls(bodies)\n \n @classmethod\n def from_jsonfile(cls, jsonfile):\n with open(jsonfile) as fp:\n return cls.from_dict(json.load(fp))\n \n @classmethod\n def from_jsonurl(cls, jsonurl):\n return cls.from_dict(json.load(urllib.request.urlopen(fpath)))\n \n def __getitem__(self, k):\n if isinstance(k, int) and self.mode == \"list\":\n return self.data[self._index[k]]\n elif isinstance(k, int) and self.mode == \"dict\":\n return self.data[k]\n elif isinstance(k, slice):\n assert self.mode == \"list\"\n return [self.data[i] for i in self._index[k]]\n else:\n raise AttributeError(\"bad index or key\")\n \n def __len__(self):\n return len(self.data)\n \n def __str__(self):\n return \"[FNC-I][{}][{} datums]\".format(self.split, len(self.data))\n \n def __repr__(self):\n return str(self)\n", | |
"execution_count": 110, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "conda-env-tf-py", | |
"display_name": "Python [conda env:tf]", | |
"language": "python" | |
}, | |
"anaconda-cloud": {}, | |
"language_info": { | |
"mimetype": "text/x-python", | |
"nbconvert_exporter": "python", | |
"name": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2", | |
"file_extension": ".py", | |
"codemirror_mode": { | |
"version": 3, | |
"name": "ipython" | |
} | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "FNC1 - Data Handling Resources", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment