Created
November 26, 2024 04:00
-
-
Save tylerneylon/eb48baaa11ef0a17c827de6a9cc37b1a to your computer and use it in GitHub Desktop.
A short Python script to find bigram frequencies based on a source text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"th": 0.03309640905197913, "he": 0.03289597993942979, "pr": 0.002746850084321499, "ro": 0.005424830253317676, "oj": 8.741181560521646e-05, "je": 0.0003752527437597676, "ec": 0.00287222864811888, "ct": 0.002534942652551277, "gu": 0.0005968725994861245, "ut": 0.005476041215995479, "te": 0.009365425536611424, "en": 0.012622619352446206, "nb": 0.00012891035432688487, "be": 0.007778768641231889, "er": 0.023745110677485717, "rg": 0.0005032801504542766, "eb": 0.00018630195043131993, "bo": 0.0023371623828990704, "oo": 0.004796171539066018, "ok": 0.002003408177860971, "of": 0.008637876687533663, "da": 0.0015663490998348887, "av": 0.003626265926167919, "vi": 0.001449800012361267, "id": 0.005190849130738056, "co": 0.006384594329710305, "op": 0.0019883980681105803, "pp": 0.0020528532452740228, "pe": 0.004910071783642512, "rf": 0.0011416512886620695, "fi": 0.0026135249918327343, "ie": 0.003583884439813875, "el": 0.006230961441676894, "ld": 0.005101671419868088, "hi": 0.010678368665954422, "is": 0.009714189851399914, "fo": 0.005046045719028404, "or": 0.010747238581279744, "us": 0.003916755697219598, "se": 0.00892218582515871, "an": 0.020721897984230554, "ny": 0.0014674589650087854, "yo": 0.0044094404760853636, "on": 0.013853448351978244, "ne": 0.008082502626769206, "yw": 4.503032925117211e-05, "wh": 0.006225663755882639, "re": 0.01707709015778274, "in": 0.02471282128256973, "un": 0.004486256920102068, "ni": 0.0025084542235799994, "it": 0.012499889631545953, "ed": 0.013051731901780906, "st": 0.009035203122102828, "ta": 0.003417007337294825, "at": 0.014062706940851337, "es": 0.009348649531596281, "nd": 0.016761877853024538, "mo": 0.0033640304793522697, "os": 0.0022700583628385, "ot": 0.005731213081752121, "pa": 0.0024148617745481517, "ar": 0.008957503730453746, "rt": 0.0032368860202901368, "ts": 0.002024598921037993, "wo": 0.002827198318867708, "rl": 0.0008123118217858499, "no": 0.006335149262297253, "wi": 0.00546279700150984, "al": 0.0062989484093698405, "lm": 
0.0001933655314903273, "tr": 0.003137112937831657, "ri": 0.0045780834738691645, "ic": 0.004307018550729756, "ti": 0.006799579716926989, "io": 0.0037525274375976763, "ns": 0.0026850437500551844, "ha": 0.016014904156034505, "so": 0.004430631219262385, "oe": 0.0002569377610213938, "ev": 0.0027839338848812878, "ve": 0.009450188509319512, "ou": 0.015001280274066945, "ma": 0.004435928905056641, "ay": 0.003371094060411277, "py": 0.0002816602947279197, "gi": 0.001007443248540929, "iv": 0.0014992450797743186, "aw": 0.001912464571726251, "wa": 0.007581871319212057, "de": 0.006209770698499872, "rm": 0.0010251022011884476, "ms": 0.000707241053533115, "li": 0.005689714543030452, "ce": 0.0047061108805636735, "nc": 0.0028342618999267154, "cl": 0.0010489417872625974, "lu": 0.0006004043900156282, "ud": 0.0005174073125722913, "nl": 0.0007805257070203167, "ww": 1.5893057382766628e-05, "if": 0.0023945539790035052, "lo": 0.004300837917303125, "oc": 0.0012228824708406544, "ca": 0.003860247048747539, "il": 0.003567991382431108, "ll": 0.006415497496843462, "to": 0.01338018842102475, "ch": 0.005273846208181393, "ck": 0.0018215209655915309, "la": 0.003508833891061921, "ws": 0.00030285103790494186, "nt": 0.008781797151610938, "ry": 0.002522581385698014, "ef": 0.0015416265661283629, "si": 0.00410040880475379, "ng": 0.012345373795880167, "tl": 0.0022223791906902, "le": 0.008417139779439682, "au": 0.001611379429086061, "ho": 0.006358988848371403, "di": 0.0035344393724008227, "ke": 0.0036863063651694818, "ea": 0.008249379729288256, "as": 0.011538359659888572, "em": 0.003208631696054107, "mb": 0.0007151875822244983, "ly": 0.004605454850472818, "up": 0.0022064861333074334, "pd": 2.6488428971277716e-06, "ju": 0.00032757357161146774, "ul": 0.004284061912287983, "ua": 0.0005421298462788172, "ag": 0.0020131206018171063, "ge": 0.002839559585720971, "gl": 0.0006939968390474761, "sh": 0.005798317101812692, "ig": 0.00252787907149227, "na": 0.001286454700371721, "pu": 0.0008352684602276239, "ub": 
0.0005589058512939598, "bl": 0.002210900871469313, "cr": 0.0012202336279435266, "jo": 0.0003381689431999788, "hu": 0.0006083509187070115, "ur": 0.005917515032183442, "rc": 0.0005491934273378246, "dg": 0.00036112558164175284, "by": 0.0013968231544187115, "af": 0.0009906672435257865, "ff": 0.0011725544557952268, "fe": 0.002406915245856768, "sc": 0.000834385512595248, "ib": 0.0004246978111728193, "mr": 0.002793646308837423, "rs": 0.0038637788392770424, "rd": 0.0021164254748050892, "ki": 0.0019151134146233788, "gh": 0.0037913771334222167, "am": 0.0030523499651235685, "mp": 0.0012811570145774653, "pt": 0.000660444829017191, "ir": 0.0030761895511977186, "rn": 0.0016970253494265255, "ii": 3.70838005597888e-05, "ob": 0.0007540372780490389, "bs": 0.0004008582250986694, "rv": 0.0005659694323529672, "fa": 0.001829467494282914, "sg": 4.679622451592396e-05, "gr": 0.001516904032421837, "ra": 0.004478310391410685, "ac": 0.003139761780728785, "my": 0.0051811367067819206, "ci": 0.0010277510440855752, "cq": 5.915749136918689e-05, "qu": 0.0012370096329586693, "ai": 0.0058221566878868414, "lf": 0.0013915254686244559, "sa": 0.0051361063775307485, "ol": 0.002491678218564857, "ys": 0.0014851179176563038, "sp": 0.0015866568953795351, "ia": 0.0010489417872625974, "ap": 0.001727045568927307, "ft": 0.0010515906301597252, "ix": 0.0001403886735477719, "me": 0.010507076825273493, "ab": 0.0019672073249335583, "bi": 0.00046354750699736, "hd": 3.0903167133157336e-05, "om": 0.0055678677697625755, "eg": 0.001411833264169102, "ov": 0.001425960426287117, "xi": 0.00019248258385795138, "ow": 0.005515773859452396, "wn": 0.0013623881967560504, "cc": 0.0006215951331926503, "do": 0.004307901498362132, "ik": 0.0007858233928145722, "et": 0.004372356675525575, "tt": 0.0035785867540196192, "eq": 0.00019513142675507916, "ue": 0.0006763378863999576, "ak": 0.0017711929505461032, "mi": 0.004241680425933938, "xv": 1.0595371588511086e-05, "nn": 0.0009120849042443292, "ew": 0.0008379173031247516, "oy": 
0.00048032351201250255, "od": 0.001689961768367518, "dy": 0.0006463176668991762, "tu": 0.0018603706614160715, "xx": 2.6488428971277714e-05, "ee": 0.005340067280609587, "hs": 0.00013509098775351633, "ml": 0.0002269175415206124, "eo": 0.0002269175415206124, "pl": 0.001865668347210327, "rr": 0.0011231093883821751, "ss": 0.0038858525300864404, "ip": 0.0006330734524135374, "go": 0.002689458488217064, "ba": 0.0011919793037074972, "ad": 0.006196526484014233, "ls": 0.0006939968390474761, "ty": 0.0019204111004176342, "mm": 0.0006127656568688911, "dd": 0.000868820470257909, "dl": 0.0009694765003487643, "wb": 0.0007690473877994296, "ga": 0.0015142551895247092, "ey": 0.0017067377733826607, "sf": 0.00012361266853262934, "fu": 0.0011769691939571064, "ep": 0.0017447045215748253, "sm": 0.00040262412036342124, "tn": 0.00013862277828302004, "kf": 0.00026665018497752896, "xl": 8.829476323759239e-06, "ek": 0.00018806784569607178, "pi": 0.0011266411789116787, "lv": 0.00018895079332844768, "nv": 0.00024987417996238644, "vo": 0.0005297685794255543, "gg": 0.0009288609092594719, "dr": 0.0009059042708176978, "ru": 0.0009571152334955014, "ex": 0.001415365054698606, "xp": 0.000521822050734171, "lx": 4.414738161879619e-06, "gn": 0.000625126923722154, "tw": 0.0007928869738735796, "ht": 0.002663853006878162, "sy": 0.00020572679834359025, "su": 0.002501390642520992, "uf": 0.0001403886735477719, "fr": 0.0017809053745022382, "po": 0.0027865827277784153, "ui": 0.001054239473056853, "we": 0.0048791686165093546, "yi": 0.0004344102351289545, "nf": 0.0005765648039414783, "va": 0.0005368321604845616, "rp": 0.00033198830977334735, "rh": 0.00018718489806369585, "ps": 0.00047590877385062294, "kn": 0.0014692248602735372, "wf": 4.944506741305173e-05, "ye": 0.0012723275382537061, "im": 0.0038417051484676444, "sk": 0.0005235879459989228, "wy": 2.295663844177402e-05, "wd": 5.915749136918689e-05, "br": 0.0011460660268239492, "oi": 0.0009685935527163884, "wr": 0.0003796674819216472, "rw": 0.00022073690809398095, 
"um": 0.0008635227844636535, "ds": 0.0014586294886850262, "hf": 7.151875822244982e-05, "rk": 0.0009491687048041181, "ks": 0.00041763423011381196, "cy": 0.00013950572591539596, "bu": 0.002514634857006631, "yb": 0.00012096382563550156, "mu": 0.0014701078079059132, "lt": 0.0007151875822244983, "sl": 0.0005032801504542766, "nu": 0.00032845651924384364, "ei": 0.0013014648101221118, "hb": 4.591327688354804e-05, "uc": 0.0015372118279664833, "ky": 6.71040200605702e-05, "hr": 0.0005712671181472227, "ug": 0.001946016581756536, "nh": 8.034823454620906e-05, "nj": 9.535834429659977e-05, "ym": 8.299707744333684e-05, "lc": 5.56257008396832e-05, "dv": 0.0001262615114297571, "ja": 0.00017658952647518476, "fl": 0.0005156414173075395, "gs": 0.0004503032925117211, "mf": 0.00012891035432688487, "du": 0.0004891529883362618, "fp": 8.829476323759239e-06, "hm": 9.800718719372754e-05, "ka": 3.35520100302851e-05, "iu": 5.4742753207307274e-05, "ph": 0.000289606823419303, "oa": 0.0007460907493576556, "xc": 0.0003081487236991974, "xt": 0.0003054998808020696, "bj": 0.00021985396046160502, "lw": 0.0003169782000229566, "wl": 0.00022426869862348465, "gt": 9.447539666422384e-05, "lk": 0.000509460783880908, "hy": 0.0003884969582454065, "ya": 0.00017394068357805698, "sb": 5.827454373681097e-05, "nk": 0.0011416512886620695, "yh": 1.5893057382766628e-05, "ax": 5.20939103101795e-05, "cu": 0.00092709501399472, "tf": 0.00016599415488667367, "lr": 7.151875822244982e-05, "bt": 0.00020307795544646247, "eh": 0.0003311053621409714, "nq": 0.00014303751644489964, "tc": 0.0003222758858172122, "sw": 0.0003134464094934529, "sn": 0.0001712918406809292, "oh": 0.00034346662899423436, "dn": 0.00043264433986420263, "lp": 0.0001589305738276663, "ux": 1.147831922088701e-05, "xu": 1.5010109750390705e-05, "bb": 0.00014303751644489964, "aa": 8.56459203404646e-05, "rb": 0.0001562817309305385, "wu": 2.0307795544646246e-05, "kl": 0.00025075712759476234, "uo": 4.767917214829988e-05, "nw": 6.092338663393874e-05, "og": 
0.00033287125740572325, "iz": 8.299707744333684e-05, "ze": 0.00014303751644489964, "az": 9.2709501399472e-05, "dm": 0.00011478319220887009, "bd": 1.5010109750390705e-05, "yt": 0.00032492472871433994, "nx": 5.827454373681097e-05, "ej": 5.915749136918689e-05, "zi": 4.3264433986420264e-05, "mn": 9.624129192897569e-05, "gm": 1.7658952647518477e-05, "hl": 8.652886797284053e-05, "ah": 0.0002843091376250475, "rj": 7.06358105900739e-06, "fy": 3.6200852927412874e-05, "dw": 5.0328015045427655e-05, "tm": 0.00012449561616500524, "hn": 1.5010109750390705e-05, "dh": 1.7658952647518477e-05, "cs": 2.383958607414994e-05, "za": 2.295663844177402e-05, "gy": 4.3264433986420264e-05, "df": 6.180633426631467e-05, "kc": 5.297685794255543e-06, "nm": 3.35520100302851e-05, "lb": 3.973264345691657e-05, "bm": 1.677600501514255e-05, "ox": 0.00010683666351748678, "vy": 0.00020219500781408655, "yl": 3.35520100302851e-05, "xa": 0.000104187820620359, "wk": 8.829476323759239e-06, "yn": 7.946528691383314e-06, "sq": 4.856211978067581e-05, "ez": 2.383958607414994e-05, "xe": 0.0001262615114297571, "ln": 4.856211978067581e-05, "xh": 2.4722533706525864e-05, "oz": 2.7371376603653637e-05, "lg": 2.383958607414994e-05, "uy": 2.295663844177402e-05, "tb": 6.180633426631466e-06, "yf": 2.4722533706525864e-05, "yp": 1.677600501514255e-05, "hq": 2.6488428971277716e-06, "vu": 7.06358105900739e-06, "dj": 1.412716211801478e-05, "ku": 8.829476323759237e-07, "ji": 9.447539666422384e-05, "pw": 1.677600501514255e-05, "sr": 0.00011213434931174233, "hw": 1.3244214485638857e-05, "dk": 4.149853872166842e-05, "zy": 1.3244214485638857e-05, "eu": 3.0903167133157336e-05, "np": 3.1786114765533255e-05, "wt": 2.9137271868405486e-05, "uv": 1.85419002798944e-05, "kw": 1.85419002798944e-05, "ao": 3.35520100302851e-05, "dc": 4.414738161879619e-06, "ih": 6.180633426631466e-06, "pb": 5.297685794255543e-06, "yr": 4.061559108929249e-05, "nr": 3.1786114765533255e-05, "uz": 7.06358105900739e-06, "zz": 1.5893057382766628e-05, "zl": 
1.2361266853262932e-05, "xo": 7.06358105900739e-06, "wp": 8.829476323759237e-07, "sd": 3.443495766266103e-05, "iq": 9.712423956135162e-06, "bv": 2.6488428971277716e-06, "kh": 1.2361266853262932e-05, "tp": 5.297685794255543e-06, "yg": 1.2361266853262932e-05, "dt": 5.297685794255543e-06, "ko": 9.712423956135162e-06, "wz": 8.829476323759237e-07, "kr": 1.7658952647518475e-06, "aj": 9.712423956135162e-06, "uk": 3.531790529503695e-06, "kg": 8.829476323759239e-06, "yd": 7.946528691383314e-06, "fs": 1.147831922088701e-05, "zo": 7.946528691383314e-06, "yc": 3.531790529503695e-06, "xq": 5.297685794255543e-06, "pk": 1.2361266853262932e-05, "hc": 1.7658952647518475e-06, "db": 4.414738161879619e-06, "kd": 2.6488428971277716e-06, "km": 1.0595371588511086e-05, "cb": 2.6488428971277716e-06, "tz": 1.0595371588511086e-05, "fn": 2.6488428971277716e-06, "ae": 1.3244214485638857e-05, "xf": 1.0595371588511086e-05, "hg": 2.1190743177022172e-05, "mt": 6.180633426631466e-06, "gp": 2.6488428971277716e-06, "wc": 5.297685794255543e-05, "iw": 3.531790529503695e-06, "kp": 1.7658952647518475e-06, "fm": 8.829476323759237e-07, "hh": 3.531790529503695e-06, "oq": 9.712423956135162e-06, "yv": 8.829476323759237e-07, "wm": 1.7658952647518475e-06, "tj": 8.829476323759237e-07, "zu": 1.7658952647518475e-06, "md": 2.6488428971277716e-06, "bn": 2.6488428971277716e-06, "yk": 8.829476323759237e-07, "kb": 3.531790529503695e-06, "uw": 8.829476323759237e-07, "cn": 8.829476323759237e-07, "uq": 3.531790529503695e-06, "tg": 1.7658952647518475e-06, "bw": 1.7658952647518475e-06, "qy": 8.829476323759237e-07, "pm": 4.414738161879619e-06, "nz": 1.7658952647518475e-06, "xb": 8.829476323759237e-07, "xy": 3.531790529503695e-06, "pn": 8.829476323759237e-07, "bh": 1.7658952647518475e-06, "vn": 8.829476323759237e-07, "pf": 1.7658952647518475e-06, "gz": 1.7658952647518475e-06, "gb": 8.829476323759237e-07, "vr": 8.829476323759237e-07, "yy": 8.829476323759237e-07, "pg": 1.7658952647518475e-06, "td": 8.829476323759237e-07} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# coding: utf-8 | |
""" find_bigram_freqs.py | |
Usage: | |
./find_bigram_freqs.py <plain_text_file.txt> | |
This produces the file bigram_freqs.json, which is an object mapping | |
lowercase-only letter bigrams aa, ab, ..., zz to their frequencies in the | |
given source text. The frequencies will add up to 1, modulo floating point | |
imprecision. | |
""" | |
# ______________________________________________________________________ | |
# Imports | |
import json | |
import re | |
import sys | |
from collections import Counter | |
# ______________________________________________________________________ | |
# Functions | |
def load_words(filename):
    """Read a text file and return its whitespace-separated tokens.

    The file is always decoded as UTF-8, independent of the platform's locale
    encoding. Bytes that are not valid UTF-8 are skipped rather than raising,
    which matches the handling below: every non-ASCII character is discarded
    anyway.

    Each token is a maximal run of non-whitespace characters, so punctuation
    and digits are kept. Non-ASCII characters are removed from each token and
    the remainder is lowercased. A token made up solely of non-ASCII
    characters therefore becomes an empty string; it is kept in the list.

    Args:
        filename: Path of the plain-text file to read.

    Returns:
        A list of lowercase, ASCII-only strings in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filename, encoding='utf-8', errors='ignore') as f:
        text = f.read()
    # The original pattern was [\w\S]+; \w is a subset of \S, so \S+ matches
    # exactly the same tokens.
    tokens = re.findall(r'\S+', text)
    return [t.encode('ascii', 'ignore').decode('ascii').lower() for t in tokens]
def find_bigram_freqs(words):
    """Return the relative frequency of each lowercase letter bigram.

    Every pair of adjacent characters inside a single word is considered;
    pairs never span two words. A pair is counted only when both characters
    are ASCII lowercase letters (a-z), so pairs touching punctuation, digits
    or uppercase letters are skipped.

    Args:
        words: An iterable of strings.

    Returns:
        A dict mapping each observed two-letter bigram to its count divided
        by the total number of counted bigrams. Keys appear in order of first
        occurrence. The dict is empty when no bigram qualifies.
    """
    def is_lower_letter(ch):
        return 'a' <= ch <= 'z'

    counts = Counter(
        first + second
        for word in words
        for first, second in zip(word, word[1:])
        if is_lower_letter(first) and is_lower_letter(second)
    )
    n_bigrams = sum(counts.values())
    return {pair: n / n_bigrams for pair, n in counts.items()}
# ______________________________________________________________________ | |
# Main | |
if __name__ == '__main__':

    # Without a source file argument there is nothing to analyze, so show the
    # usage text from the module docstring and stop.
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(0)

    source_path = sys.argv[1]
    freqs = find_bigram_freqs(load_words(source_path))

    # The result always goes to bigram_freqs.json in the current directory.
    with open('bigram_freqs.json', 'w') as out_file:
        json.dump(freqs, out_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I needed to have a list of English-language bigram frequencies (for a cryptogram-solver), and I couldn't immediately find an easy-to-understand online source, so I derived the frequencies from the book David Copperfield by Charles Dickens. This is not the perfect book since it's not the same as modern English, but it'll do for many practical applications. Of course, you can use any source text you like with the given Python script.