Last active
March 9, 2018 19:46
-
-
Save JasonKessler/583799bffae65694489529004bc5c84c to your computer and use it in GitHub Desktop.
Accept vs. Reject ICLR Reviews LORIDP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Accept vs. Reject ICLR 2018 reviews: build a scattertext frequency explorer
# whose term scores are log-odds ratios with an informative Dirichlet prior
# taken from the Workshop-decision reviews.

# Load the reviews and parse each review's text with spaCy.
reviews_df = pd.read_csv(
    'https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2')
reviews_df['parse'] = reviews_df['review'].apply(spacy.load('en', parser=False))

# Create Corpus based on accept/reject/workshop decision
full_corpus = st.CorpusFromParsedDocuments(
    reviews_df, category_col='decision', parsed_col='parse').build()

# A two-category corpus to use for plotting, with unigrams which only occur in bigrams removed.
# Terms used in <5 documents are removed as well.
corpus = st.CompactTerms(full_corpus.remove_categories(['Workshop']),
                         st.OncePerDocFrequencyRanker, 5).compact()

# Use counts of unigrams and bigrams from the Workshop corpus as the Dirichlet prior.
# Each builder step is its own call: select the Workshop category, align the
# prior to the plotted corpus, then extract the prior counts.
priors = (st.PriorFactory(full_corpus, term_ranker=st.OncePerDocFrequencyRanker)
          .use_categories(['Workshop'])
          .align_to_target(corpus)
          .get_priors())

# The prior is scaled by the mean length of the parsed reviews.
term_scorer = LogOddsRatioInformativeDirichletPrior(
    priors, reviews_df.parse.apply(len).mean(), 'word')  # use the original approach to scaling prior

# Render the explorer as HTML, contrasting Accept against Reject.
html = st.produce_frequency_explorer(corpus,
                                     category='Accept',
                                     not_categories=['Reject'],
                                     term_ranker=st.OncePerDocFrequencyRanker,
                                     term_scorer=term_scorer,
                                     # NOTE(review): 1.96 is presumably a two-sided 95% z-score cutoff — confirm
                                     grey_threshold=1.96,
                                     metadata=corpus.get_df()['metadata'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment