Sample script to generate frequencies of a list of words from Google Ngram Viewer
# Sample script to generate frequencies of a list of words from Google Ngram Viewer
import re
from ast import literal_eval

import numpy as np
import pandas as pd
import requests

# Import the list of words
df = pd.read_csv("wordle_full.csv")
df.rename(columns={'Unnamed: 0': 'word', 'Unnamed: 1': 'frequency'}, inplace=True)

# Google will rate limit, so retry with exponential backoff on HTTP 429.
retry_strategy = requests.packages.urllib3.util.retry.Retry(
    total=10,
    backoff_factor=2,
    status_forcelist=[429],
    allowed_methods=["HEAD", "GET", "OPTIONS"],
)
adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)
http.mount("http://", adapter)

# Query the frequency timeseries for each word
for i in range(len(df)):
    if i % 200 == 0:
        print(i)  # progress indicator
    term = df['word'][i]
    url = f"https://books.google.com/ngrams/graph?content={term}&year_start=1800&year_end=2019&corpus=26&smoothing=0&case_insensitive=true"
    resp = http.get(url)
    if resp.ok:
        # The page embeds the data as a JavaScript literal: "ngrams.data = [...];"
        resp_parsed = literal_eval(re.findall('ngrams.data = (.*?);\\n', resp.text)[0])
        if resp_parsed:
            res = resp_parsed[0]['timeseries']
            df.loc[i, 'frequency'] = sum(res)
        else:
            df.loc[i, 'frequency'] = 0  # the word doesn't exist at Google's end, so frequency is zero
    else:
        raise Exception("Request failed despite retries; rate limiting is likely too aggressive.")

# (Optional) Convert frequency to number of occurrences.
# This is helpful if integer output is needed instead of float.
# Assumption: the number of occurrences is 1 for the lowest non-zero frequency value.
# The assumption is most likely not true, but it doesn't matter for comparing words.
df["frequency"] = np.ceil(df["frequency"] / min(df['frequency'][df['frequency'] > 0])).astype(int)

# Save, sorted by frequency in descending order
df.sort_values(by='frequency', ascending=False).to_csv(r'words.txt', header=None, index=None, sep=' ')
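For reference, a minimal sketch of reading the result back in (assuming the script above has written `words.txt` in the space-separated, headerless layout produced by `to_csv`; the column names below are chosen here for illustration):

import pandas as pd

# Load the space-separated output written by the script; it has no header row.
words = pd.read_csv("words.txt", sep=" ", header=None, names=["word", "frequency"])

# Rows are sorted by frequency in descending order, so the most common words come first.
print(words.head())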