Created
March 29, 2022 11:37
-
-
Save shuntaroy/0d3672431379c39ddf192fc6270d3207 to your computer and use it in GitHub Desktop.
Naive implementations of some classical, information theoretic keyword extraction methods
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Gamma Index. | |
Zhou and Slater 2002""" | |
from typing import List | |
import numpy as np | |
import sigma_index as s | |
def avg_sep(spans: List[int]) -> List[float]:
    """Return the midpoint of each pair of adjacent spans.

    e.g. [2, 4, 6] -> [3.0, 5.0]; fewer than two spans yields [].
    """
    adjacent_pairs = zip(spans, spans[1:])
    return [(a + b) / 2 for a, b in adjacent_pairs]
def delta(d: float, mean: float) -> bool:
    """Indicator: True when separation *d* is strictly below *mean*."""
    return d < mean
def nu(d: float, mean: float) -> float:
    """Relative shortfall of separation *d* below *mean*: (mean - d) / mean."""
    shortfall = mean - d
    return shortfall / mean
def gamma(avg_seps: List[float], mean: float) -> float:
    """Gamma index (Zhou & Slater 2002).

    Averages the relative shortfall (mean - d) / mean over the average
    separations *d* that fall strictly below the expected mean
    separation *mean*.  A larger value means the word's occurrences
    cluster more tightly than uniform placement would predict.

    Returns 0.0 when no separation is below the mean.  The previous
    implementation fed an empty list to np.mean in that case, which
    emitted a RuntimeWarning and returned nan, polluting the
    Counter.most_common ranking in the driver script.
    """
    # Inline of delta()/nu(): keep only separations below the mean and
    # score each by its relative shortfall.
    shortfalls = [(mean - d) / mean for d in avg_seps if d < mean]
    if not shortfalls:
        return 0.0
    # TODO: normalise gamma against its expectation for a random text.
    return float(np.mean(shortfalls))
if __name__ == '__main__':
    # CLI: python <script> <json-file>; the JSON must contain a 'body' field.
    import sys
    import json
    from tqdm import tqdm
    from collections import Counter
    with open(sys.argv[1]) as f:
        j = json.load(f)
    text = j['body']
    # Whitespace tokenisation only — no lowercasing or punctuation stripping.
    text = text.split()
    N = len(text)  # total token count
    indices = Counter()
    for word in tqdm(set(text)):
        poslist = s.extract_occurence(text, word)
        spans = s.make_spans(poslist)
        # Boundary spans, apparently to account for the text edges — confirm
        # against Zhou & Slater (2002) whether 0 and N + 1 are the right values.
        spans.insert(0, 0)
        # NOTE(review): insert(-1, ...) places N + 1 BEFORE the last span, not
        # at the end — spans.append(N + 1) was probably intended. Confirm.
        spans.insert(-1, N + 1)
        n = len(poslist)  # number of occurrences of this word
        avg_seps = avg_sep(spans)
        # Expected mean separation if n occurrences were spread uniformly.
        mean = (N + 1) / (n + 1)
        indices[word] = gamma(avg_seps, mean)
    print(indices.most_common(25))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""sigma index. | |
Ortuño, M., Carpena, P., Bernaola-Galván, P., Muñoz, E., & Somoza, A. M. (2002). | |
Keyword detection in natural languages and DNA. Europhysics Letters (EPL), 57, 759–764.""" | |
from typing import Dict, List | |
import numpy as np | |
def extract_occurence(text: List[str], word: str) -> List[int]:
    """Extract positions of occurrences of the input word from the input text.

    `text` is assumed to be normalised, and `word` to follow the same
    normalisation as `text`.
    """
    return [pos for pos, token in enumerate(text) if token == word]
def make_spans(poslist: List[int]) -> List[int]:
    """Distances between consecutive occurrence positions.

    e.g. [6, 8, 14, 20, 30] -> [2, 6, 6, 10]; fewer than two
    positions yields [].
    """
    consecutive = zip(poslist, poslist[1:])  # truncated at the shorter list
    return [later - earlier for earlier, later in consecutive]
def p(x: int, spans: List[int]) -> float:
    """Return the relative frequency of occurrence of a given separation x.

    Returns 0 when x never occurs (this also keeps an empty `spans`
    from raising ZeroDivisionError).
    """
    hits = spans.count(x)
    if hits > 0:
        return hits / len(spans)
    return 0
def P(x: int, spans: List[int], x_i: int=1) -> float:
    """Integrated distribution function of p(x): sum of p over x_i..x."""
    # 0.0 start value keeps the float return type even for an empty range.
    return sum((p(sep, spans) for sep in range(x_i, x + 1)), 0.0)
def Ps(s: float, spans: List[int]) -> float:
    """Integrated distribution function of p(s) where s is normalised x (= x/mean(x))."""
    # De-normalise back to an absolute separation, then truncate to int for P.
    x = np.mean(spans) * s
    print('restored x =', x)
    print(f'execute P({int(x)})')
    return P(int(x), spans)
def Ps_rand():
    """Integrated separation distribution for a random text.

    Unimplemented stub — kept as a placeholder.
    """
    pass
def sigma(n: int, N:int, spans: List[int]) -> float:
    """Sigma index, Herrera and Pury (2008) version.

    Coefficient of variation of the spans: population standard deviation
    of the observed separations divided by the expected mean separation
    (N + 1) / (n + 1) for n occurrences in N tokens.
    """
    expected_mean = (N + 1) / (n + 1)
    return np.std(spans) / expected_mean
def sigma_rand(n: int, N:int) -> float:
    """Expected sigma for n occurrences placed at random among N tokens."""
    occupancy = n / N
    return np.sqrt(1 - occupancy)
def sigma_nor(n: int, N:int, spans: List[int]) -> float:
    """Sigma index normalised by its expectation for a random text."""
    observed = sigma(n, N, spans)
    baseline = sigma_rand(n, N)
    return observed / baseline
if __name__ == '__main__':
    # Scratch example from development, kept commented out for reference:
    # lorem = """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."""
    #
    # poslist = extract_occurence(lorem.lower(), 'in')
    # print(poslist)
    # spans = make_spans(poslist)
    # print(spans)
    # set_s = spans / np.mean(spans)
    # print(set_s)
    # val = P(11, spans)
    # print(val)
    # print(Ps(0.22, spans))
    # CLI: python <script> <json-file>; the JSON must contain a 'body' field.
    import sys
    import json
    from tqdm import tqdm
    from collections import Counter
    with open(sys.argv[1]) as f:
        j = json.load(f)
    text = j['body']
    # Whitespace tokenisation only — no lowercasing or punctuation stripping.
    text = text.split()
    N = len(text)  # total token count
    indices = Counter()
    for word in tqdm(set(text)):
        poslist = extract_occurence(text, word)
        spans = make_spans(poslist)
        # Boundary spans, apparently to account for the text edges — confirm
        # against Herrera and Pury (2008) whether 0 and N + 1 are the right values.
        spans.insert(0, 0)
        # NOTE(review): insert(-1, ...) places N + 1 BEFORE the last span, not
        # at the end — spans.append(N + 1) was probably intended. Confirm.
        spans.insert(-1, N + 1)
        n = len(poslist)  # number of occurrences of this word
        indices[word] = sigma(n, N, spans)
    print(indices.most_common(25))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment