Skip to content

Instantly share code, notes, and snippets.

Created February 1, 2017 19:29
Show Gist options
  • Save anonymous/e8d82127e250e821225bc0d874ef4ef3 to your computer and use it in GitHub Desktop.
Save anonymous/e8d82127e250e821225bc0d874ef4ef3 to your computer and use it in GitHub Desktop.
FNC1 - Data Handling Resources
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "'''\nauthor: brian mcmahan\n\nparser and pandas data manager for the fake news challenge first's dataset: fnc1\n\nthanks to napsternxg to converting it to json first\n'''\nimport urllib\nimport pandas as pd\nimport ujson as json",
"execution_count": 121,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "fpath = 'https://raw.githubusercontent.com/napsternxg/fnc-1/8836b0b51b0826435f59152013cced8950267595/train_combined.json'\nfnc1 = Dataset.from_jsonurl(fpath)",
"execution_count": 111,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "print(fnc1)",
"execution_count": 112,
"outputs": [
{
"output_type": "stream",
"text": "[FNC-I][][1683 datums]\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "print(fnc1.mode)\nprint(fnc1[0])\nfnc1[1:15]",
"execution_count": 113,
"outputs": [
{
"output_type": "stream",
"text": "list\n[Story0][36 stances]\n",
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": "[[Story4][1 stances],\n [Story5][56 stances],\n [Story6][1 stances],\n [Story7][102 stances],\n [Story8][45 stances],\n [Story9][2 stances],\n [Story10][46 stances],\n [Story11][71 stances],\n [Story13][15 stances],\n [Story14][1 stances],\n [Story15][26 stances],\n [Story16][1 stances],\n [Story17][52 stances],\n [Story18][1 stances]]"
},
"metadata": {},
"execution_count": 113
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "fnc1[100].agree",
"execution_count": 114,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": " headline label\n0 'Nasa Confirms Earth Will Experience 6 Days of... agree\n1 ’6 Days Darkness in December 2014′ 100% Fake; ... agree",
"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>headline</th>\n <th>label</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>'Nasa Confirms Earth Will Experience 6 Days of...</td>\n <td>agree</td>\n </tr>\n <tr>\n <th>1</th>\n <td>’6 Days Darkness in December 2014′ 100% Fake; ...</td>\n <td>agree</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"execution_count": 114
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "fnc1[104].discuss",
"execution_count": 115,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": " headline label\n0 Heartbroken girl spends week in KFC after gett... discuss\n1 Comfort eating? Chinese woman, 26, spends an e... discuss",
"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>headline</th>\n <th>label</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Heartbroken girl spends week in KFC after gett...</td>\n <td>discuss</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Comfort eating? Chinese woman, 26, spends an e...</td>\n <td>discuss</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"execution_count": 115
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "fnc1[155].disagree",
"execution_count": 119,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": " headline label\n2 Batmobile Stolen From \"Batman v Superman: Dawn... disagree",
"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>headline</th>\n <th>label</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2</th>\n <td>Batmobile Stolen From \"Batman v Superman: Dawn...</td>\n <td>disagree</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"execution_count": 119
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "class Story(object):\n def __init__(self, body_id, article_body, stances):\n self.body_id = body_id\n self.article_body = article_body\n self.stances = stances\n \n \n @classmethod\n def from_raw(cls, body_info, stances):\n return cls(body_info['Body ID'], body_info['articleBody'], pd.DataFrame(stances))\n \n @property\n def unrelated(self):\n return self.stances[self.stances.label=='unrelated']\n\n @property\n def agree(self):\n return self.stances[self.stances.label=='agree']\n \n @property\n def disagree(self):\n return self.stances[self.stances.label=='disagree']\n \n @property\n def discuss(self):\n return self.stances[self.stances.label=='discuss']\n \n def __str__(self):\n return \"[Story{}][{} stances]\".format(self.body_id, len(self.stances))\n \n def __repr__(self):\n return str(self)\n \nclass Dataset(object):\n def __init__(self, data):\n self.data = data\n self.split = \"\"\n self.mode = \"list\"\n self._index = [d.body_id for d in data.values()]\n\n @classmethod\n def from_dict(cls, data):\n stances = {d['Body ID']:[] for d in data['bodies']}\n for stance in data['stances']:\n stances[stance['Body ID']].append({'headline': stance['Headline'],\n 'label': stance['Stance']})\n bodies = {d['Body ID']:Story.from_raw(body_info=d, stances=stances[d['Body ID']]) \n for d in data['bodies']}\n return cls(bodies)\n \n @classmethod\n def from_jsonfile(cls, jsonfile):\n with open(jsonfile) as fp:\n return cls.from_dict(json.load(fp))\n \n @classmethod\n def from_jsonurl(cls, jsonurl):\n return cls.from_dict(json.load(urllib.request.urlopen(fpath)))\n \n def __getitem__(self, k):\n if isinstance(k, int) and self.mode == \"list\":\n return self.data[self._index[k]]\n elif isinstance(k, int) and self.mode == \"dict\":\n return self.data[k]\n elif isinstance(k, slice):\n assert self.mode == \"list\"\n return [self.data[i] for i in self._index[k]]\n else:\n raise AttributeError(\"bad index or key\")\n \n def __len__(self):\n return len(self.data)\n \n def __str__(self):\n return \"[FNC-I][{}][{} datums]\".format(self.split, len(self.data))\n \n def __repr__(self):\n return str(self)\n",
"execution_count": 110,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "conda-env-tf-py",
"display_name": "Python [conda env:tf]",
"language": "python"
},
"anaconda-cloud": {},
"language_info": {
"mimetype": "text/x-python",
"nbconvert_exporter": "python",
"name": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2",
"file_extension": ".py",
"codemirror_mode": {
"version": 3,
"name": "ipython"
}
},
"gist": {
"id": "",
"data": {
"description": "FNC1 - Data Handling Resources",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment