tdm-pilot.org gists
.gitignore
datasets/
.ipynb*
getDataset
#!/bin/bash
set -e
#service=http://localhost:5000/dl
service=https://www.jstor.org/api/tdm/v1
fname=$2
if [ -z "${fname}" ]; then
  fname=$1
fi
mkdir -p datasets
# Ask the TDM service for the dataset's info and pull out the signed
# download URL.
dl=$(curl -s "$service/nb/dataset/$1/info" |
  grep -o 'https://ithaka-labs.*Expires\=[0-9]*')
dset="./datasets/$fname.jsonl.gz"
wget -q -L --show-progress \
  -O "$dset" \
  --user-agent "tdm notebooks" \
  "$dl"
export DATASET_FILE=$dset
echo "Your dataset $1 is stored in: $dset"
Notebook (JSON source): Topic modeling journal runs
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Topic modeling journal runs\n",
    "\n",
    "An example notebook that looks at articles from a single journal, in this case *Library History* and its variants.\n",
    "\n",
    "Process:\n",
    "* build the dataset in the corpus builder\n",
    "* download the dataset to the notebook environment\n",
    "* use ngrams to build a topic model\n",
    "* use the model to infer topics for each article\n",
    "* track topic frequency over time\n",
    "* plot the results\n",
    "\n",
    "The Python library gensim is used for LDA topic modeling.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "!bash getDataset 5c54351f-d2fa-749f-3efc-0477720bd176 library-history"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import gzip\n",
    "from collections import Counter\n",
    "from pprint import pprint\n",
    "\n",
    "import gensim\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from wordfreq import simple_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)\n",
    "\n",
    "logging.getLogger('gensim.models').setLevel(logging.WARN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_filename = \"datasets/library-history.jsonl.gz\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Processing\n",
    "\n",
    "Define functions to:\n",
    " - process individual tokens\n",
    " - process ngrams\n",
    " - convert a TDM document to a gensim \"bag of words\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_token(token):\n",
    "    # simple_tokenize lowercases and strips punctuation; drop very short tokens.\n",
    "    token = \" \".join(simple_tokenize(token))\n",
    "    if len(token) < 3:\n",
    "        return\n",
    "    return token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert process_token(\"Title,\") == \"title\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_ngram(token):\n",
    "    # Join the words of an ngram with underscores so it acts as a single token.\n",
    "    token = simple_tokenize(token)\n",
    "    return \"_\".join(token)"
   ]
  },
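  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sanity check: simple_tokenize lowercases (as the process_token assert\n",
    "# above shows), so a two-word phrase becomes one underscore-joined token.\n",
    "assert process_ngram(\"Public Library\") == \"public_library\""
   ]
  },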
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def doc_to_bow(document):\n",
    "    # Flatten a TDM document's unigram/bigram/trigram counts into one token list.\n",
    "    bow_doc = []\n",
    "    ngrams = document.get(\"unigramCount\", {})\n",
    "    for gram, count in ngrams.items():\n",
    "        cg = process_token(gram)\n",
    "        if (cg is None) or len(cg) == 0:\n",
    "            continue\n",
    "        else:\n",
    "            #bow_doc += [cg] * count\n",
    "            bow_doc.append(cg)\n",
    "    for ngram, ngram_len in [(\"bigramCount\", 2), (\"trigramCount\", 3)]:\n",
    "        for gram, count in document.get(ngram, {}).items():\n",
    "            clean_gram = process_ngram(gram)\n",
    "            if (clean_gram is None) or len(clean_gram) == 0:\n",
    "                continue\n",
    "            #bow_doc += [clean_gram] * count\n",
    "            bow_doc.append(clean_gram)\n",
    "    if len(bow_doc) == 0:\n",
    "        return\n",
    "    return bow_doc"
   ]
  },
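  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added check with a minimal stand-in document: unigrams are kept once\n",
    "# (counts are ignored above) and ngrams are underscore-joined.\n",
    "sample = {\"unigramCount\": {\"Libraries,\": 2}, \"bigramCount\": {\"public library\": 1}}\n",
    "assert doc_to_bow(sample) == [\"libraries\", \"public_library\"]"
   ]
  },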
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the corpus\n",
    "\n",
    "Read each document in the dataset, process its ngrams, and collect the results into a list of documents."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Limit to n documents. Set to None to process all of them.\n",
    "limit = None\n",
    "num_docs = 0\n",
    "\n",
    "documents = []\n",
    "metadata = {}\n",
    "\n",
    "with gzip.open(dataset_filename, \"rb\") as inf:\n",
    "    for row in inf:\n",
    "        document = json.loads(row)\n",
    "        _id = document[\"id\"]\n",
    "        bd = doc_to_bow(document)\n",
    "        if bd is None:\n",
    "            print(_id)\n",
    "            continue\n",
    "        # Key metadata by corpus position so it stays aligned with\n",
    "        # bow_corpus even when an empty document is skipped.\n",
    "        metadata[len(documents)] = {\n",
    "            \"year\": document[\"publicationYear\"],\n",
    "            \"id\": _id\n",
    "        }\n",
    "        documents.append(bd)\n",
    "        num_docs += 1\n",
    "        if (limit is not None) and (num_docs >= limit):\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dictionary = gensim.corpora.Dictionary(documents)"
   ]
  },
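  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added peek: a few (token, id) pairs from the dictionary.\n",
    "list(dictionary.token2id.items())[:5]"
   ]
  },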
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Number of unique tokens: %d' % len(dictionary))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Remove terms that appear in fewer than 5 documents or in more than 50% of documents.\n",
    "dictionary.filter_extremes(no_below=5, no_above=0.50)\n",
    "print('Number of unique tokens: %d' % len(dictionary))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bow_corpus = [dictionary.doc2bow(doc) for doc in documents]"
   ]
  },
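  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added peek: (token, count) pairs for the first document, mapped back to strings.\n",
    "[(dictionary[i], n) for i, n in bow_corpus[0][:10]]"
   ]
  },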
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print('Number of documents: %d' % len(bow_corpus))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train the model\n",
    "\n",
    "Run the bag-of-words corpus through the LDA model, then print each identified topic with its top terms."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_topics = 3\n",
    "passes = 50\n",
    "iterations = 700\n",
    "eval_every = None\n",
    "\n",
    "# Train the LDA model.\n",
    "model = gensim.models.LdaModel(\n",
    "    corpus=bow_corpus,\n",
    "    id2word=dictionary,\n",
    "    iterations=iterations,\n",
    "    num_topics=num_topics,\n",
    "    passes=passes,\n",
    "    eval_every=eval_every\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for topic_num in range(0, num_topics):\n",
    "    word_ids = model.get_topic_terms(topic_num)\n",
    "    words = []\n",
    "    for wid, weight in word_ids:\n",
    "        # Index the Dictionary directly; dictionary.id2token may be empty\n",
    "        # until it is first accessed this way.\n",
    "        word = dictionary[wid]\n",
    "        words.append(word)\n",
    "    print(\"Topic {}\".format(str(topic_num + 1).ljust(5)), \" \".join(words))"
   ]
  },
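  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added alternative: gensim can format the top terms per topic directly.\n",
    "pprint(model.print_topics(num_words=10))"
   ]
  },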
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Track topic changes over time\n",
    "\n",
    "Run each document through the model and count the topic assignments per year."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "year_to_topic = {}\n",
    "year_count = Counter()\n",
    "\n",
    "for idx, meta in metadata.items():\n",
    "    year = meta[\"year\"]\n",
    "    cdoc = bow_corpus[idx]\n",
    "    # get_document_topics only returns topics above its minimum_probability threshold.\n",
    "    topics = model.get_document_topics(cdoc)\n",
    "    for topic, score in topics:\n",
    "        cnt = year_to_topic.get(year, Counter())\n",
    "        cnt[topic] += 1\n",
    "        year_to_topic[year] = cnt\n",
    "        year_count[year] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rows = []\n",
    "for yr, cnt in year_to_topic.items():\n",
    "    for topic, count in cnt.items():\n",
    "        rows.append((yr, topic + 1, count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(rows, columns=[\"year\", \"topic_num\", \"n\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def yearly_frequency(row):\n",
    "    # Share of this year's topic assignments that went to this topic.\n",
    "    return row[\"n\"] / year_count[row[\"year\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"tf\"] = df.apply(yearly_frequency, axis=1)"
   ]
  },
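  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added check: within each year the frequencies should sum to 1, since\n",
    "# year_count tallies one entry per (document, topic) assignment.\n",
    "assert (df.groupby(\"year\")[\"tf\"].sum().round(6) == 1.0).all()"
   ]
  },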
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Name the plot g rather than plt so matplotlib.pyplot is not shadowed.\n",
    "g = sns.lmplot(\n",
    "    x=\"year\",\n",
    "    y=\"tf\",\n",
    "    data=df,\n",
    "    hue=\"topic_num\",\n",
    "    ci=None,\n",
    "    palette=sns.color_palette(\"muted\", n_colors=num_topics)\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": null
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "211.188px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
requirements.txt
jupyter-notebookparams
jupyter_contrib_nbextensions
pandas
matplotlib
seaborn
gensim
wordfreq
Environment setup script (entrypoint)
#!/bin/bash
set -e
#/opt/conda/bin/python3
version=0.1
python -m nltk.downloader stopwords wordnet
jupyter contrib nbextension install --user
jupyter nbextension install jupyter_contrib_nbextensions/nbextensions/toc2 --user
jupyter nbextension enable toc2/main
jupyter nbextension enable --py jupyter_notebookparams
exec "$@"