Skip to content

Instantly share code, notes, and snippets.

@onnyyonn
Last active January 16, 2022 07:33
Show Gist options
  • Save onnyyonn/b3a1d426f92e7526bd63fc2df790e3cd to your computer and use it in GitHub Desktop.
Save onnyyonn/b3a1d426f92e7526bd63fc2df790e3cd to your computer and use it in GitHub Desktop.
Sample script to generate frequencies of a list of words from Google Ngram Viewer
# Sample script to generate frequencies of a list of words from Google Ngram Viewer
import pandas as pd, numpy as np
import re, requests
from ast import literal_eval
# Load the word list. The two columns that pandas labelled 'Unnamed: 0' and
# 'Unnamed: 1' are given the names used by the rest of the script.
column_names = {'Unnamed: 0': 'word', 'Unnamed: 1': 'frequency'}
df = pd.read_csv("wordle_full.csv").rename(columns=column_names)
# Google rate-limits these queries, so every request goes through a session
# whose transport adapter retries automatically.
# Policy: at most 10 retries in total, backoff factor of 2 between attempts,
# and HTTP 429 treated as a retryable status for the listed methods.
retry_strategy = requests.packages.urllib3.util.retry.Retry(
    total=10,
    backoff_factor=2,
    status_forcelist=[429],
    allowed_methods=["HEAD", "GET", "OPTIONS"],
)
http = requests.Session()
adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy)
# Install the same retrying adapter for both URL schemes.
for scheme in ("https://", "http://"):
    http.mount(scheme, adapter)
# Query the frequency timeseries for each word and store its sum in
# df['frequency'].
#
# The script expects the response body to contain a line of the form
# "ngrams.data = <literal>;" and parses <literal> with ast.literal_eval.
# The pattern is compiled once, outside the loop.
data_pattern = re.compile(r"ngrams\.data = (.*?);\n")

# Iterate over (index label, word) pairs instead of range(len(df)).
# NOTE(review): the progress check assumes integer index labels, i.e. the
# default index produced by read_csv -- confirm if the loading step changes.
for i, term in df['word'].items():
    if i % 200 == 0:
        print(i)  # Coarse progress indicator

    # Pass the query via `params` so that requests URL-encodes the term
    # instead of splicing it raw into the URL. A timeout keeps one stalled
    # connection from hanging the whole run.
    resp = http.get(
        "https://books.google.com/ngrams/graph",
        params={
            "content": term,
            "year_start": 1800,
            "year_end": 2019,
            "corpus": 26,
            "smoothing": 0,
            "case_insensitive": "true",
        },
        timeout=30,
    )

    # Fail with the real HTTP status and URL. A non-OK response here is not
    # necessarily rate limiting, so a fixed "rate limit" message would mislead.
    resp.raise_for_status()

    match = data_pattern.search(resp.text)
    if match is None:
        # Page layout differs from what the pattern expects; report it
        # explicitly rather than failing with a bare IndexError.
        raise RuntimeError(
            f"No 'ngrams.data' payload found in the response for {term!r}"
        )

    resp_parsed = literal_eval(match.group(1))
    if resp_parsed:
        # Use the first entry's timeseries and collapse it to a single total.
        df.loc[i, 'frequency'] = sum(resp_parsed[0]['timeseries'])
    else:
        # If the word doesn't exist at Google's end, frequency is zero
        df.loc[i, 'frequency'] = 0
# (Optional) Convert frequency to number of occurrences.
# This is helpful if integer output is needed instead of float.
# Assumption: the number of occurrences is 1 for the lowest non-zero
# frequency value. The assumption is most likely not true, but that doesn't
# matter for comparison among words.
positive = df.loc[df['frequency'] > 0, 'frequency']
if positive.empty:
    # No word has a positive frequency, so there is nothing to scale by.
    # Just cast to int instead of failing on min() of an empty selection.
    df["frequency"] = df["frequency"].astype(int)
else:
    # Scale so the smallest positive frequency maps to 1, rounding up so
    # every non-zero frequency stays at least 1.
    df["frequency"] = np.ceil(df["frequency"] / positive.min()).astype(int)
# Save: most frequent words first, written to 'words.txt' as space-separated
# rows with neither a header line nor the index column.
ranked = df.sort_values(by='frequency', ascending=False)
ranked.to_csv(r'words.txt', sep=' ', header=None, index=None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment