Created
March 29, 2022 11:37
-
-
Save shuntaroy/0d3672431379c39ddf192fc6270d3207 to your computer and use it in GitHub Desktop.
Naive implementations of some classical, information theoretic keyword extraction methods
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Gamma Index. | |
Zhou and Slater 2002""" | |
from typing import List | |
import numpy as np | |
import sigma_index as s | |
def avg_sep(spans: List[int]) -> List[float]:
    """Return the midpoint of each pair of adjacent spans.

    e.g. [2, 4, 6] -> [3.0, 5.0]; fewer than two spans yields [].
    """
    adjacent_pairs = zip(spans, spans[1:])
    return [(a + b) / 2 for a, b in adjacent_pairs]
def delta(d: float, mean: float) -> bool:
    """Indicator: True when separation *d* is strictly below *mean*."""
    return d < mean
def nu(d: float, mean: float) -> float:
    """Relative shortfall of separation *d* below *mean*: (mean - d) / mean."""
    shortfall = mean - d
    return shortfall / mean
def gamma(avg_seps: List[float], mean: float) -> float:
    """Gamma index (Zhou & Slater 2002).

    Averages the relative shortfall (mean - d) / mean over the average
    separations *d* that fall strictly below the expected mean
    separation *mean*.  A larger value means the word's occurrences
    cluster more tightly than uniform placement would predict.

    Returns 0.0 when no separation is below the mean.  The previous
    implementation fed an empty list to np.mean in that case, which
    emitted a RuntimeWarning and returned nan, polluting the
    Counter.most_common ranking in the driver script.
    """
    # Inline of delta()/nu(): keep only separations below the mean and
    # score each by its relative shortfall.
    shortfalls = [(mean - d) / mean for d in avg_seps if d < mean]
    if not shortfalls:
        return 0.0
    # TODO: normalise gamma against its expectation for a random text.
    return float(np.mean(shortfalls))
if __name__ == '__main__':
    # CLI: python <script> <json-file>; the JSON must contain a 'body' field.
    import sys
    import json
    from tqdm import tqdm
    from collections import Counter
    with open(sys.argv[1]) as f:
        j = json.load(f)
    text = j['body']
    # Whitespace tokenisation only — no lowercasing or punctuation stripping.
    text = text.split()
    N = len(text)  # total token count
    indices = Counter()
    for word in tqdm(set(text)):
        poslist = s.extract_occurence(text, word)
        spans = s.make_spans(poslist)
        # Boundary spans, apparently to account for the text edges — confirm
        # against Zhou & Slater (2002) whether 0 and N + 1 are the right values.
        spans.insert(0, 0)
        # NOTE(review): insert(-1, ...) places N + 1 BEFORE the last span, not
        # at the end — spans.append(N + 1) was probably intended. Confirm.
        spans.insert(-1, N + 1)
        n = len(poslist)  # number of occurrences of this word
        avg_seps = avg_sep(spans)
        # Expected mean separation if n occurrences were spread uniformly.
        mean = (N + 1) / (n + 1)
        indices[word] = gamma(avg_seps, mean)
    print(indices.most_common(25))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""sigma index. | |
Ortuño, M., Carpena, P., Bernaola-Galván, P., Muñoz, E., & Somoza, A. M. (2002). | |
Keyword detection in natural languages and DNA. Europhysics Letters (EPL), 57, 759–764.""" | |
from typing import Dict, List | |
import numpy as np | |
def extract_occurence(text: List[str], word: str) -> List[int]:
    """Extract positions of occurrences of the input word from the input text.

    `text` is assumed to be normalised, and `word` to follow the same
    normalisation as `text`.
    """
    return [pos for pos, token in enumerate(text) if token == word]
def make_spans(poslist: List[int]) -> List[int]:
    """Distances between consecutive occurrence positions.

    e.g. [6, 8, 14, 20, 30] -> [2, 6, 6, 10]; fewer than two
    positions yields [].
    """
    consecutive = zip(poslist, poslist[1:])  # truncated at the shorter list
    return [later - earlier for earlier, later in consecutive]
def p(x: int, spans: List[int]) -> float:
    """Return the relative frequency of occurrence of a given separation x.

    Returns 0 when x never occurs (this also keeps an empty `spans`
    from raising ZeroDivisionError).
    """
    hits = spans.count(x)
    if hits > 0:
        return hits / len(spans)
    return 0
def P(x: int, spans: List[int], x_i: int=1) -> float:
    """Integrated distribution function of p(x): sum of p over x_i..x."""
    # 0.0 start value keeps the float return type even for an empty range.
    return sum((p(sep, spans) for sep in range(x_i, x + 1)), 0.0)
def Ps(s: float, spans: List[int]) -> float:
    """Integrated distribution function of p(s) where s is normalised x (= x/mean(x))."""
    # De-normalise back to an absolute separation, then truncate to int for P.
    x = np.mean(spans) * s
    print('restored x =', x)
    print(f'execute P({int(x)})')
    return P(int(x), spans)
def Ps_rand():
    """Integrated separation distribution for a random text.

    Unimplemented stub — kept as a placeholder.
    """
    pass
def sigma(n: int, N:int, spans: List[int]) -> float:
    """Sigma index, Herrera and Pury (2008) version.

    Coefficient of variation of the spans: population standard deviation
    of the observed separations divided by the expected mean separation
    (N + 1) / (n + 1) for n occurrences in N tokens.
    """
    expected_mean = (N + 1) / (n + 1)
    return np.std(spans) / expected_mean
def sigma_rand(n: int, N:int) -> float:
    """Expected sigma for n occurrences placed at random among N tokens."""
    occupancy = n / N
    return np.sqrt(1 - occupancy)
def sigma_nor(n: int, N:int, spans: List[int]) -> float:
    """Sigma index normalised by its expectation for a random text."""
    observed = sigma(n, N, spans)
    baseline = sigma_rand(n, N)
    return observed / baseline
if __name__ == '__main__':
    # Scratch example from development, kept commented out for reference:
    # lorem = """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."""
    #
    # poslist = extract_occurence(lorem.lower(), 'in')
    # print(poslist)
    # spans = make_spans(poslist)
    # print(spans)
    # set_s = spans / np.mean(spans)
    # print(set_s)
    # val = P(11, spans)
    # print(val)
    # print(Ps(0.22, spans))
    # CLI: python <script> <json-file>; the JSON must contain a 'body' field.
    import sys
    import json
    from tqdm import tqdm
    from collections import Counter
    with open(sys.argv[1]) as f:
        j = json.load(f)
    text = j['body']
    # Whitespace tokenisation only — no lowercasing or punctuation stripping.
    text = text.split()
    N = len(text)  # total token count
    indices = Counter()
    for word in tqdm(set(text)):
        poslist = extract_occurence(text, word)
        spans = make_spans(poslist)
        # Boundary spans, apparently to account for the text edges — confirm
        # against Herrera and Pury (2008) whether 0 and N + 1 are the right values.
        spans.insert(0, 0)
        # NOTE(review): insert(-1, ...) places N + 1 BEFORE the last span, not
        # at the end — spans.append(N + 1) was probably intended. Confirm.
        spans.insert(-1, N + 1)
        n = len(poslist)  # number of occurrences of this word
        indices[word] = sigma(n, N, spans)
    print(indices.most_common(25))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment