Last active
July 7, 2020 16:06
-
-
Save lawlesst/175f99d06712432c3d16aa3056e586f3 to your computer and use it in GitHub Desktop.
tdm-pilot.org gists
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
datasets/
.ipynb*
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 120, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import gzip\n", | |
"import random\n", | |
"from pprint import pprint\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 121, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Reading the dataset ...\n", | |
"Adding http://www.jstor.org/stable/10.1086/491498 to sample\n", | |
"Adding http://www.jstor.org/stable/10.1086/432295 to sample\n", | |
"Adding http://www.jstor.org/stable/10.1086/379413 to sample\n", | |
"Adding http://www.jstor.org/stable/228664 to sample\n", | |
"Adding http://www.jstor.org/stable/236768 to sample\n", | |
"Adding http://www.jstor.org/stable/227706 to sample\n", | |
"Adding http://www.jstor.org/stable/231357 to sample\n", | |
"Adding http://www.jstor.org/stable/3080697 to sample\n", | |
"Adding http://www.jstor.org/stable/229231 to sample\n", | |
"Adding http://www.jstor.org/stable/230556 to sample\n", | |
"Adding http://www.jstor.org/stable/10.1086/670902 to sample\n", | |
"Adding http://www.jstor.org/stable/228263 to sample\n", | |
"Adding http://www.jstor.org/stable/229843 to sample\n", | |
"Adding http://www.jstor.org/stable/10.1086/678012 to sample\n", | |
"Adding http://www.jstor.org/stable/230061 to sample\n", | |
"Adding http://www.jstor.org/stable/10.1086/376025 to sample\n", | |
"Adding http://www.jstor.org/stable/10.1086/653929 to sample\n", | |
"Adding http://www.jstor.org/stable/226119 to sample\n", | |
"Adding http://www.jstor.org/stable/10.1086/491505 to sample\n", | |
"Adding http://www.jstor.org/stable/235887 to sample\n", | |
"Adding http://www.jstor.org/stable/10.1086/682793 to sample\n", | |
"Adding http://www.jstor.org/stable/227572 to sample\n", | |
"Adding http://www.jstor.org/stable/10.1086/386402 to sample\n", | |
"Adding http://www.jstor.org/stable/223695 to sample\n", | |
"Adding http://www.jstor.org/stable/235969 to sample\n", | |
"Dataset reading complete. 25 total documents.\n" | |
] | |
} | |
], | |
"source": [ | |
"sample_doc_numbers = random.sample(range(0, 19000), 25)\n", | |
"sample_docs = []\n", | |
"\n", | |
"print(\"Reading the dataset ...\")\n", | |
"\n", | |
"with gzip.open(\"./datasets/dset1.jsonl.gz\", \"rb\") as inf:\n", | |
" for row_num, row in enumerate(inf):\n", | |
" doc = json.loads(row)\n", | |
" if row_num not in sample_doc_numbers:\n", | |
" continue\n", | |
" print(f\"Adding {doc['id']} to sample\")\n", | |
" sample_docs.append(doc)\n", | |
"\n", | |
"print(f\"Dataset reading complete. {len(sample_docs)} total documents.\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 122, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"doc1 = sample_docs[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 123, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"to_delete = [\"unigramCount\", \"bigramCount\", \"trigramCount\", \"fullText\"]\n", | |
"for k in to_delete:\n", | |
" if k in doc1.keys():\n", | |
" del doc1[k]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 124, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'creator': ['Joan‐Pau Rubiés'],\n", | |
" 'datePublished': '2005-06-01',\n", | |
" 'docType': 'article',\n", | |
" 'id': 'http://www.jstor.org/stable/10.1086/491498',\n", | |
" 'identifier': [{'name': 'issn', 'value': '00211753'},\n", | |
" {'name': 'oclc', 'value': '49976319'},\n", | |
" {'name': 'lccn', 'value': '2002-227035'},\n", | |
" {'name': 'local_uuid',\n", | |
" 'value': 'd22c16bb-d068-3bdf-9962-8d0db608891e'},\n", | |
" {'name': 'local_doi', 'value': '10.1086/491498'},\n", | |
" {'name': 'journal_id', 'value': 'isis'}],\n", | |
" 'isPartOf': 'Isis',\n", | |
" 'issueNumber': '2',\n", | |
" 'language': ['eng'],\n", | |
" 'outputFormat': ['unigram', 'bigram', 'trigram'],\n", | |
" 'pageCount': 2,\n", | |
" 'pageEnd': '276',\n", | |
" 'pageStart': '275',\n", | |
" 'pagination': 'pp. 275-276',\n", | |
" 'provider': 'jstor',\n", | |
" 'publicationYear': 2005,\n", | |
" 'publisher': 'The University of Chicago Press',\n", | |
" 'sourceCategory': ['History of Science & Technology', 'History'],\n", | |
" 'title': 'Review Article',\n", | |
" 'url': 'http://www.jstor.org/stable/10.1086/491498',\n", | |
" 'volumeNumber': '96',\n", | |
" 'wordCount': 1051}\n" | |
] | |
} | |
], | |
"source": [ | |
"pprint(doc1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 125, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fields_to_keep = [\n", | |
" \"id\",\n", | |
" \"title\",\n", | |
" \"isPartOf\",\n", | |
" \"publicationYear\",\n", | |
" \"creator\",\n", | |
" \"wordCount\",\n", | |
" \"provider\",\n", | |
" \"url\"\n", | |
"]\n", | |
"filtered_sample_docs = []\n", | |
"for doc in sample_docs:\n", | |
" new_doc = {}\n", | |
" for f in fields_to_keep:\n", | |
" value = doc.get(f)\n", | |
" new_doc[f] = value\n", | |
" filtered_sample_docs.append(new_doc)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 126, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'creator': ['Joan‐Pau Rubiés'],\n", | |
" 'id': 'http://www.jstor.org/stable/10.1086/491498',\n", | |
" 'isPartOf': 'Isis',\n", | |
" 'provider': 'jstor',\n", | |
" 'publicationYear': 2005,\n", | |
" 'title': 'Review Article',\n", | |
" 'url': 'http://www.jstor.org/stable/10.1086/491498',\n", | |
" 'wordCount': 1051}\n" | |
] | |
} | |
], | |
"source": [ | |
"pprint(filtered_sample_docs[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 127, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with open(\"datasets/filtered_dset1.json\", \"w\") as of:\n", | |
" json.dump(filtered_sample_docs, of)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Download a dataset archive from the TDM service into ./datasets/.
#
# Usage: <this script> DATASET_ID [FILE_NAME]
#   DATASET_ID  id placed in the service's /nb/dataset/<id>/info URL
#   FILE_NAME   optional base name for the saved file; defaults to DATASET_ID
#
# The archive is written to ./datasets/<FILE_NAME>.jsonl.gz.
set -e

#service=http://localhost:5000/dl
service=https://www.jstor.org/api/tdm/v1

# Fail early with a usage message instead of querying the service with an
# empty dataset id.
if [ -z "$1" ]; then
    echo "Usage: $0 DATASET_ID [FILE_NAME]" >&2
    exit 1
fi

# Use the dataset id as the file name when no second argument is given.
fname=${2:-$1}

mkdir -p datasets

# Extract the download URL from the info response. Only the first match is
# kept so that exactly one URL is handed to wget.
dl=$(curl -s "$service/nb/dataset/$1/info" |
    grep -o 'https://ithaka-labs.*Expires=[0-9]*' |
    head -n 1)

# Report a missing URL explicitly rather than letting a later step fail
# with an unrelated message.
if [ -z "$dl" ]; then
    echo "Could not find a download URL for dataset $1" >&2
    exit 1
fi

dset="./datasets/$fname.jsonl.gz"

# NOTE(review): wget's -L means --relative (follow relative links only), not
# "follow redirects" as in curl; it is kept here unchanged -- confirm intent.
wget -q -L --show-progress \
    -O "$dset" \
    --user-agent "tdm notebooks" \
    "$dl"

# The export only reaches the calling shell if this script is sourced.
export DATASET_FILE=$dset
echo "Your dataset $1 is stored in: $dset"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
jupyter-notebookparams
jupyter_contrib_nbextensions
pandas
matplotlib
seaborn
gensim
wordfreq
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Run one-time setup steps (NLTK corpora, Jupyter notebook extensions) and
# then replace this shell with the command passed as arguments.

# NOTE(review): this runs a bare Python interpreter with no script or
# arguments before the setup steps; it looks unintentional -- confirm
# whether it can be removed.
/opt/conda/bin/python3

# Not referenced anywhere else in this script.
# NOTE(review): possibly a manual version marker -- confirm before removing.
version=0.1

# Download the NLTK "stopwords" and "wordnet" corpora.
# NOTE(review): assumes nltk is already installed in the environment.
python -m nltk.downloader stopwords wordnet

# Install the contrib notebook extensions for the current user, then install
# and enable the toc2 extension and the notebookparams extension.
jupyter contrib nbextension install --user
jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user
jupyter nbextension enable toc2/main
jupyter nbextension enable --py jupyter_notebookparams

# Hand control to the command given on the command line.
exec "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment