tdm-pilot.org gists
Ignore rules:

datasets/
.ipynb*
Dataset download script:

#!/bin/bash
# Usage: <this script> DATASET_ID [FILE_NAME]
# Downloads a dataset from the TDM service into ./datasets/.
set -e

#service=http://localhost:5000/dl
service=https://www.jstor.org/api/tdm/v1

# The output file name defaults to the dataset ID.
fname=$2
if [ -z "${fname}" ]; then
    fname=$1
fi

mkdir -p datasets

# Ask the service for the dataset's signed download URL.
dl=$(curl -s "$service/nb/dataset/$1/info" |
    grep -o 'https://ithaka-labs.*Expires=[0-9]*')

dset="./datasets/${fname}.jsonl.gz"
wget -q -L --show-progress \
    -O "$dset" \
    --user-agent "tdm notebooks" \
    "$dl"

# Note: this export is visible only to processes started by this script,
# not to the calling shell.
export DATASET_FILE=$dset
echo "Your dataset $1 is stored in: $dset"
Python requirements:

jupyter-notebookparams
jupyter_contrib_nbextensions
pandas
matplotlib
seaborn
gensim
wordfreq
Container entrypoint script:

#!/bin/bash
# Entrypoint: fetch NLTK data, install notebook extensions,
# then hand off to the container's command.
set -e

version=0.1

python -m nltk.downloader stopwords wordnet
jupyter contrib nbextension install --user
jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user
jupyter nbextension enable toc2/main
jupyter nbextension enable --py jupyter_notebookparams
exec "$@"
Demo notebook (JSON source):

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# New TDM client demo"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download and filter metadata with Pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parameters:\n",
    "dataset_id = \"943b499d-2d00-e422-095f-97274a8b2121\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the dataset's metadata by its dataset ID\n",
    "import tdm_client\n",
    "\n",
    "dataset_metadata = tdm_client.get_metadata(dataset_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get_metadata returns a CSV of document metadata\n",
    "df = pd.read_csv(dataset_metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_document_count = len(df)\n",
    "print(\"Total documents\", dataset_document_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the pandas option to show all columns\n",
    "pd.set_option(\"display.max_columns\", None)\n",
    "\n",
    "df.head()  # Show the first five rows of our DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "id_list = df['id'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check whether a known document is in the dataset\n",
    "'http://www.jstor.org/stable/2871420' in id_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop each of these named columns\n",
    "df = df.drop(['outputFormat', 'pageEnd', 'pageStart', 'datePublished', 'language'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop articles without an author\n",
    "df = df.dropna(subset=['creator'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Original total\", dataset_document_count)\n",
    "print(\"Filtered total\", len(df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Examples of filtering the data based on the values found under 'title'\n",
    "\n",
    "df = df[df.title != 'Review Article']  # Remove articles titled \"Review Article\"\n",
    "df = df[df.title != 'Front Matter']  # Remove articles titled \"Front Matter\"\n",
    "df = df[df.title != 'Back Matter']  # Remove articles titled \"Back Matter\"\n",
    "\n",
    "# Remove articles with fewer than 3000 words; adjust the threshold\n",
    "# or remove this filter as needed\n",
    "df = df[df.wordCount > 3000]"
   ]
  },
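  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same pattern works for any metadata column. A minimal sketch, assuming the `publicationYear` column used in the charts below; the 1900 cutoff is an arbitrary example value:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example: keep only documents published in or after 1900 (arbitrary cutoff)\n",
    "df = df[df.publicationYear >= 1900]"
   ]
  },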
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print(\"Original total\", dataset_document_count)\n",
    "print(\"Filtered total\", len(df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered_id_list = df[\"id\"].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(['publicationYear'])['id'].agg('count').plot.bar(title='Documents by year', figsize=(20, 5), fontsize=12);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(['publicationYear'])['pageCount'].agg('sum').plot.bar(title='Pages by year', figsize=(20, 5), fontsize=12);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Count word frequencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_json_file = tdm_client.get_dataset(dataset_id)"
   ]
  },
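  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before counting, it can help to look at one record. A minimal sketch, assuming the dataset is gzip-compressed JSON lines with one document per line and the `id` and `unigramCount` fields used below:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "import json\n",
    "\n",
    "# Peek at the first document to see its fields\n",
    "with gzip.open(dataset_json_file, \"rb\") as input_file:\n",
    "    first_document = json.loads(input_file.readline())\n",
    "\n",
    "print(sorted(first_document.keys()))"
   ]
  },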
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import gzip\n",
    "from collections import Counter\n",
    "\n",
    "word_frequency = Counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with gzip.open(dataset_json_file, \"rb\") as input_file:\n",
    "    for row in input_file:\n",
    "        document = json.loads(row)\n",
    "        _id = document[\"id\"]\n",
    "        if _id in filtered_id_list:\n",
    "            # Default to {} so .items() works for documents without unigrams\n",
    "            unigrams = document.get(\"unigramCount\", {})\n",
    "            for gram, count in unigrams.items():\n",
    "                word_frequency[gram] += count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for gram, count in word_frequency.most_common(25):\n",
    "    print(gram.ljust(20), count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stop_words = stopwords.words('english')\n",
    "stop_words[:10]"
   ]
  },
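  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The NLTK list can be extended with corpus-specific noise terms. A minimal sketch; the added words here are placeholders to adapt to your own dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add corpus-specific stop words (placeholder examples)\n",
    "stop_words.extend(['also', 'however', 'thus'])"
   ]
  },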
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "transformed_word_frequency = Counter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for document in tdm_client.dataset_reader(dataset_json_file):\n",
    "    _id = document[\"id\"]\n",
    "    if _id in filtered_id_list:\n",
    "        unigrams = document.get(\"unigramCount\", {})\n",
    "        for gram, count in unigrams.items():\n",
    "            clean_gram = gram.lower()\n",
    "            if clean_gram in stop_words:\n",
    "                continue\n",
    "            transformed_word_frequency[clean_gram] += count\n",
    "        # Demo shortcut: stop after the first matching document;\n",
    "        # remove this break to process the whole corpus\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for gram, count in transformed_word_frequency.most_common(25):\n",
    "    print(gram.ljust(20), count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use most_common so the DataFrame holds the 25 most frequent terms,\n",
    "# not an arbitrary 25\n",
    "df = pd.DataFrame(transformed_word_frequency.most_common(25), columns=[\"ngram\", \"count\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.sort_values('count', ascending=True).plot.barh(title='Frequent words', figsize=(20, 10), fontsize=12, x=\"ngram\", y=\"count\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Significant terms\n",
    "\n",
    "Run TF-IDF over the filtered corpus (limited to the first 500 matching documents below)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token filter used while building documents below\n",
    "def process_token(token):\n",
    "    token = token.lower()\n",
    "    if token in stop_words:\n",
    "        return\n",
    "    if len(token) < 4:\n",
    "        return\n",
    "    if not token.isalpha():\n",
    "        return\n",
    "    return token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Limit to n documents. Set to None to process all of them.\n",
    "limit = 500\n",
    "\n",
    "n = 0\n",
    "documents = []\n",
    "for document in tdm_client.dataset_reader(dataset_json_file):\n",
    "    processed_document = []\n",
    "    _id = document[\"id\"]\n",
    "    if _id in filtered_id_list:\n",
    "        unigrams = document.get(\"unigramCount\", {})\n",
    "        for gram, count in unigrams.items():\n",
    "            clean_gram = process_token(gram)\n",
    "            if clean_gram is None:\n",
    "                continue\n",
    "            processed_document.append(clean_gram)\n",
    "        if len(processed_document) > 0:\n",
    "            documents.append(processed_document)\n",
    "            n += 1\n",
    "    if (limit is not None) and (n >= limit):\n",
    "        break\n",
    "\n",
    "dictionary = gensim.corpora.Dictionary(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict(list(dictionary.token2id.items())[0:10])  # Print the first ten tokens and their associated IDs\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bow_corpus = [dictionary.doc2bow(doc) for doc in documents]"
   ]
  },
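  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each bag-of-words entry is a list of `(token_id, count)` pairs. A quick, optional check of the first document's vector:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First ten (token_id, count) pairs of the first document\n",
    "bow_corpus[0][:10]"
   ]
  },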
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = gensim.models.TfidfModel(bow_corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus_tfidf = model[bow_corpus]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rows = []\n",
    "for doc in corpus_tfidf:\n",
    "    for term_id, score in doc:\n",
    "        rows.append([dictionary.get(term_id), score])"
   ]
  },
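  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To see what TF-IDF surfaces for a single document, sort one document's vector by score. A minimal sketch using the `corpus_tfidf` and `dictionary` objects built above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ten highest-weighted terms in the first document\n",
    "for term_id, score in sorted(corpus_tfidf[0], key=lambda pair: pair[1], reverse=True)[:10]:\n",
    "    print(dictionary.get(term_id).ljust(20), round(score, 4))"
   ]
  },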
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(rows, columns=[\"ngram\", \"score\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_50 = df.sort_values(\"score\", ascending=False).head(n=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_50.sort_values(\"score\", ascending=True).plot.barh(title='Significant terms', figsize=(20, 10), fontsize=12, x=\"ngram\", y=\"score\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LDA topic modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc_count = len(documents)\n",
    "num_topics = 7  # Change the number of topics here\n",
    "\n",
    "# Drop terms that appear in fewer than 10% of documents or in more\n",
    "# than 75% of them. gensim's no_below takes an absolute document\n",
    "# count, while no_above takes a fraction.\n",
    "dictionary.filter_extremes(no_below=int(doc_count * 0.10), no_above=0.75)\n"
   ]
  },
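  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An optional sanity check of how aggressively the filter pruned the vocabulary:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vocabulary size remaining after filter_extremes\n",
    "print(len(dictionary), \"terms in the filtered dictionary\")"
   ]
  },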
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebuild the bag-of-words corpus with the filtered dictionary\n",
    "bow_corpus = [dictionary.doc2bow(doc) for doc in documents]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the LDA model\n",
    "model = gensim.models.LdaModel(\n",
    "    corpus=bow_corpus,\n",
    "    id2word=dictionary,\n",
    "    num_topics=num_topics\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for topic_num in range(0, num_topics):\n",
    "    word_ids = model.get_topic_terms(topic_num)\n",
    "    words = []\n",
    "    for wid, weight in word_ids:\n",
    "        # dictionary[wid] also populates id2token, which is built lazily\n",
    "        word = dictionary[wid]\n",
    "        words.append(word)\n",
    "    print(\"Topic {}\".format(str(topic_num).ljust(5)), \" \".join(words))"
   ]
  },
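  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The model can also be read the other way around: which topics does a given document draw on? A minimal sketch using `get_document_topics` on the first bag-of-words vector:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Topic mixture of the first document as (topic_id, probability) pairs\n",
    "model.get_document_topics(bow_corpus[0])"
   ]
  }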
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "223.188px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}