|
""" |
|
Compute colexifications for dedicated wordlists. |
|
""" |
|
|
|
from cltoolkit import Wordlist |
|
from pycldf import Dataset |
|
from collections import defaultdict |
|
from itertools import combinations |
|
|
|
|
|
def get_colexifications(language, data):
    """
    Compute colexifications and add them to the data dictionary.

    Concepts are grouped by the word form that expresses them; every pair
    of concepts sharing one form counts as a colexification.

    Parameters
    ----------
    language
        Object exposing ``name`` and ``forms``. Each item of ``forms`` must
        expose ``form`` (used as grouping key) and ``concept``; forms with a
        falsy ``concept`` are skipped, otherwise
        ``concept.concepticon_gloss`` is read.
    data
        Nested mapping updated in place, laid out as
        ``data[glossA, glossB][language.name] -> list of forms``. Missing
        keys must be created on access (e.g.
        ``defaultdict(lambda: defaultdict(list))``).

        * ``data[c, c]`` collects every form that expresses concept ``c``,
          so a non-empty list means "``c`` is attested in this language".
        * ``data[a, b]`` (``a`` sorted before ``b``) collects the forms
          that express both concepts.

    Returns
    -------
    None
    """
    # Map each word form to the set of concept glosses it expresses.
    glosses_by_form = defaultdict(set)
    for form in language.forms:
        if form.concept:
            glosses_by_form[form.form].add(form.concept.concepticon_gloss)

    for shared_form, glosses in glosses_by_form.items():
        # Sets have no stable order; sorting gives every concept pair one
        # canonical key, so (A, B) and (B, A) never end up as separate
        # entries. key=str keeps the sort safe should a gloss be None.
        ordered = sorted(glosses, key=str)

        # Record the form for each concept it expresses, including concepts
        # that only ever occur in colexifying forms.
        for gloss in ordered:
            data[gloss, gloss][language.name].append(shared_form)

        # Record the form itself (not a leftover loop variable) for every
        # pair of concepts it colexifies.
        for cA, cB in combinations(ordered, r=2):
            data[cA, cB][language.name].append(shared_form)
|
|
|
|
|
|
|
|
|
# Load both CLDF datasets (in this order) into a single wordlist.
wl = Wordlist([
    Dataset.from_metadata(metadata_path)
    for metadata_path in (
        "ids/cldf/cldf-metadata.json",
        "northeuralex/cldf/cldf-metadata.json",
    )
])
|
|
|
|
|
# cols[conceptA, conceptB][language name] -> list filled by
# get_colexifications; all_languages keeps the analyzed language names in
# the order in which they were processed.
all_languages = []
cols = defaultdict(lambda: defaultdict(list))
for language in wl.languages:
    # Only languages of the two selected families, or with one of the two
    # selected Glottocodes, are analyzed.
    if language.family not in ("Nakh-Daghestanian", "Turkic") and \
            language.glottocode not in ("jude1256", "russ1263"):
        continue
    print("[i] analyzing language {0}".format(language.name))
    all_languages.append(language.name)
    get_colexifications(language, cols)
|
|
|
# keep only the keys that pair two different concepts
colexified = [(first, second) for first, second in cols if first != second]

# build one row per concept pair with one cell per analyzed language:
# "?" unless both self-pair entries hold forms for the language,
# otherwise "1" if the pair is recorded for the language and "0" if not
matrix = []
for cA, cB in colexified:
    row = [cA, cB]
    for language in all_languages:
        if not (cols[cA, cA][language] and cols[cB, cB][language]):
            row.append("?")
        elif language in cols[cA, cB]:
            row.append("1")
        else:
            row.append("0")
    matrix.append(row)
|
|
|
# Write the matrix as a tab-separated table: a header naming the two concept
# columns and one column per language, then one line per concept pair.
# The encoding is fixed to UTF-8 so the output does not depend on the
# platform's default encoding and non-ASCII names cannot break the write.
with open("colexifications.tsv", "w", encoding="utf-8") as f:
    f.write("ConceptA\tConceptB\t" + "\t".join(all_languages) + "\n")
    f.writelines("\t".join(row) + "\n" for row in matrix)