tdm-pilot.org gists
Ignore rules:

datasets/
.ipynb*
Dataset download script:

#!/bin/bash
# Usage: <this script> DATASET_ID [FILE_NAME]
# Downloads a dataset from the TDM service into ./datasets/.
set -e

#service=http://localhost:5000/dl
service=https://www.jstor.org/api/tdm/v1

# The output file name defaults to the dataset ID.
fname=$2
if [ -z "${fname}" ]; then
    fname=$1
fi

mkdir -p datasets

# Ask the service for the dataset's signed download URL.
dl=$(curl -s "$service/nb/dataset/$1/info" |
    grep -o 'https://ithaka-labs.*Expires=[0-9]*')

dset="./datasets/${fname}.jsonl.gz"
wget -q -L --show-progress \
    -O "$dset" \
    --user-agent "tdm notebooks" \
    "$dl"

# Note: this export is visible only to processes started by this script,
# not to the calling shell.
export DATASET_FILE=$dset
echo "Your dataset $1 is stored in: $dset"
Python requirements:

jupyter-notebookparams
jupyter_contrib_nbextensions
pandas
matplotlib
seaborn
gensim
wordfreq
Container entrypoint script:

#!/bin/bash
# Entrypoint: fetch NLTK data, install notebook extensions,
# then hand off to the container's command.
set -e

version=0.1

python -m nltk.downloader stopwords wordnet
jupyter contrib nbextension install --user
jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user
jupyter nbextension enable toc2/main
jupyter nbextension enable --py jupyter_notebookparams
exec "$@"
Demo notebook (JSON source):

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# New TDM client demo"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download and filter metadata with Pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parameters:\n",
    "dataset_id = \"943b499d-2d00-e422-095f-97274a8b2121\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the dataset's metadata by its dataset ID\n",
    "import tdm_client\n",
    "\n",
    "dataset_metadata = tdm_client.get_metadata(dataset_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get_metadata returns a CSV of document metadata\n",
    "df = pd.read_csv(dataset_metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_document_count = len(df)\n",
    "print(\"Total documents\", dataset_document_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the pandas option to show all columns\n",
    "pd.set_option(\"display.max_columns\", None)\n",
    "\n",
    "df.head()  # Show the first five rows of our DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "id_list = df['id'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check whether a known document is in the dataset\n",
    "'http://www.jstor.org/stable/2871420' in id_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop each of these named columns\n",
    "df = df.drop(['outputFormat', 'pageEnd', 'pageStart', 'datePublished', 'language'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop articles without an author\n",
    "df = df.dropna(subset=['creator'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Original total\", dataset_document_count)\n",
    "print(\"Filtered total\", len(df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Examples of filtering the data based on the values found under 'title'\n",
    "\n",
    "df = df[df.title != 'Review Article']  # Remove articles titled \"Review Article\"\n",
    "df = df[df.title != 'Front Matter']  # Remove articles titled \"Front Matter\"\n",
    "df = df[df.title != 'Back Matter']  # Remove articles titled \"Back Matter\"\n",
    "\n",
    "# Remove articles with fewer than 3000 words; adjust the threshold\n",
    "# or remove this filter as needed\n",
    "df = df[df.wordCount > 3000]"
   ]
  },
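  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same pattern works for any metadata column. A minimal sketch, assuming the `publicationYear` column used in the charts below; the 1900 cutoff is an arbitrary example value:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example: keep only documents published in or after 1900 (arbitrary cutoff)\n",
    "df = df[df.publicationYear >= 1900]"
   ]
  },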
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print(\"Original total\", dataset_document_count)\n",
    "print(\"Filtered total\", len(df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered_id_list = df[\"id\"].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(['publicationYear'])['id'].agg('count').plot.bar(title='Documents by year', figsize=(20, 5), fontsize=12);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(['publicationYear'])['pageCount'].agg('sum').plot.bar(title='Pages by year', figsize=(20, 5), fontsize=12);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Count word frequencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_json_file = tdm_client.get_dataset(dataset_id)"
   ]
  },
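  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before counting, it can help to look at one record. A minimal sketch, assuming the dataset is gzip-compressed JSON lines with one document per line and the `id` and `unigramCount` fields used below:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "import json\n",
    "\n",
    "# Peek at the first document to see its fields\n",
    "with gzip.open(dataset_json_file, \"rb\") as input_file:\n",
    "    first_document = json.loads(input_file.readline())\n",
    "\n",
    "print(sorted(first_document.keys()))"
   ]
  },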
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import gzip\n",
    "from collections import Counter\n",
    "\n",
    "word_frequency = Counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with gzip.open(dataset_json_file, \"rb\") as input_file:\n",
    "    for row in input_file:\n",
    "        document = json.loads(row)\n",
    "        _id = document[\"id\"]\n",
    "        if _id in filtered_id_list:\n",
    "            # Default to {} so .items() works for documents without unigrams\n",
    "            unigrams = document.get(\"unigramCount\", {})\n",
    "            for gram, count in unigrams.items():\n",
    "                word_frequency[gram] += count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for gram, count in word_frequency.most_common(25):\n",
    "    print(gram.ljust(20), count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stop_words = stopwords.words('english')\n",
    "stop_words[:10]"
   ]
  },
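  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The NLTK list can be extended with corpus-specific noise terms. A minimal sketch; the added words here are placeholders to adapt to your own dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add corpus-specific stop words (placeholder examples)\n",
    "stop_words.extend(['also', 'however', 'thus'])"
   ]
  },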
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "transformed_word_frequency = Counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for document in tdm_client.dataset_reader(dataset_json_file):\n",
    "    _id = document[\"id\"]\n",
    "    if _id in filtered_id_list:\n",
    "        unigrams = document.get(\"unigramCount\", {})\n",
    "        for gram, count in unigrams.items():\n",
    "            clean_gram = gram.lower()\n",
    "            if clean_gram in stop_words:\n",
    "                continue\n",
    "            transformed_word_frequency[clean_gram] += count\n",
    "        # Demo shortcut: stop after the first matching document;\n",
    "        # remove this break to process the whole corpus\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for gram, count in transformed_word_frequency.most_common(25):\n",
    "    print(gram.ljust(20), count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use most_common so the DataFrame holds the 25 most frequent terms,\n",
    "# not an arbitrary 25\n",
    "df = pd.DataFrame(transformed_word_frequency.most_common(25), columns=[\"ngram\", \"count\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.sort_values('count', ascending=True).plot.barh(title='Frequent words', figsize=(20, 10), fontsize=12, x=\"ngram\", y=\"count\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Significant terms\n",
    "\n",
    "Run TF-IDF over the filtered corpus (limited to the first 500 matching documents below)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token filter used while building documents below\n",
    "def process_token(token):\n",
    "    token = token.lower()\n",
    "    if token in stop_words:\n",
    "        return\n",
    "    if len(token) < 4:\n",
    "        return\n",
    "    if not token.isalpha():\n",
    "        return\n",
    "    return token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Limit to n documents. Set to None to process all of them.\n",
    "limit = 500\n",
    "\n",
    "n = 0\n",
    "documents = []\n",
    "for document in tdm_client.dataset_reader(dataset_json_file):\n",
    "    processed_document = []\n",
    "    _id = document[\"id\"]\n",
    "    if _id in filtered_id_list:\n",
    "        unigrams = document.get(\"unigramCount\", {})\n",
    "        for gram, count in unigrams.items():\n",
    "            clean_gram = process_token(gram)\n",
    "            if clean_gram is None:\n",
    "                continue\n",
    "            processed_document.append(clean_gram)\n",
    "        if len(processed_document) > 0:\n",
    "            documents.append(processed_document)\n",
    "            n += 1\n",
    "    if (limit is not None) and (n >= limit):\n",
    "        break\n",
    "\n",
    "dictionary = gensim.corpora.Dictionary(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict(list(dictionary.token2id.items())[0:10])  # Print the first ten tokens and their associated IDs\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bow_corpus = [dictionary.doc2bow(doc) for doc in documents]"
   ]
  },
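  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each bag-of-words entry is a list of `(token_id, count)` pairs. A quick, optional check of the first document's vector:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First ten (token_id, count) pairs of the first document\n",
    "bow_corpus[0][:10]"
   ]
  },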
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = gensim.models.TfidfModel(bow_corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus_tfidf = model[bow_corpus]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rows = []\n",
    "for doc in corpus_tfidf:\n",
    "    for term_id, score in doc:\n",
    "        rows.append([dictionary.get(term_id), score])"
   ]
  },
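  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To see what TF-IDF surfaces for a single document, sort one document's vector by score. A minimal sketch using the `corpus_tfidf` and `dictionary` objects built above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ten highest-weighted terms in the first document\n",
    "for term_id, score in sorted(corpus_tfidf[0], key=lambda pair: pair[1], reverse=True)[:10]:\n",
    "    print(dictionary.get(term_id).ljust(20), round(score, 4))"
   ]
  },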
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(rows, columns=[\"ngram\", \"score\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_50 = df.sort_values(\"score\", ascending=False).head(n=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_50.sort_values(\"score\", ascending=True).plot.barh(title='Significant terms', figsize=(20, 10), fontsize=12, x=\"ngram\", y=\"score\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LDA topic modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc_count = len(documents)\n",
    "num_topics = 7  # Change the number of topics here\n",
    "\n",
    "# Drop terms that appear in fewer than 10% of documents or in more\n",
    "# than 75% of them. gensim's no_below takes an absolute document\n",
    "# count, while no_above takes a fraction.\n",
    "dictionary.filter_extremes(no_below=int(doc_count * 0.10), no_above=0.75)\n"
   ]
  },
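  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An optional sanity check of how aggressively the filter pruned the vocabulary:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vocabulary size remaining after filter_extremes\n",
    "print(len(dictionary), \"terms in the filtered dictionary\")"
   ]
  },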
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebuild the bag-of-words corpus with the filtered dictionary\n",
    "bow_corpus = [dictionary.doc2bow(doc) for doc in documents]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the LDA model\n",
    "model = gensim.models.LdaModel(\n",
    "    corpus=bow_corpus,\n",
    "    id2word=dictionary,\n",
    "    num_topics=num_topics\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for topic_num in range(0, num_topics):\n",
    "    word_ids = model.get_topic_terms(topic_num)\n",
    "    words = []\n",
    "    for wid, weight in word_ids:\n",
    "        # dictionary[wid] also populates id2token, which is built lazily\n",
    "        word = dictionary[wid]\n",
    "        words.append(word)\n",
    "    print(\"Topic {}\".format(str(topic_num).ljust(5)), \" \".join(words))"
   ]
  },
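  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The model can also be read the other way around: which topics does a given document draw on? A minimal sketch using `get_document_topics` on the first bag-of-words vector:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Topic mixture of the first document as (topic_id, probability) pairs\n",
    "model.get_document_topics(bow_corpus[0])"
   ]
  }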
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "223.188px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}