Skip to content

Instantly share code, notes, and snippets.

@LinguList
Last active June 3, 2022 06:51
Show Gist options
  • Save LinguList/af661e8f5254a8bf939ef2c5f8fc6e81 to your computer and use it in GitHub Desktop.
Save LinguList/af661e8f5254a8bf939ef2c5f8fc6e81 to your computer and use it in GitHub Desktop.
How to Compute Colexifications with CL Toolkit

How to Compute Colexifications with CL Toolkit (Supplementary Material)

Please check the original blog post for details on the code provided here.

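To run the code, use the Makefile: `make install` clones the two datasets (IDS and NorthEuraLex) and installs cltoolkit, and `make colexifications` then runs the script.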

make install
make colexifications

Cite the original article as:

Johann-Mattis List, "How to Compute Colexifications with CL Toolkit (How to do X in Linguistics 10)," in Computer-Assisted Language Comparison in Practice, 02/06/2022, https://calc.hypotheses.org/4266.

"""
Compute colexifications for dedicated wordlists.
"""
from cltoolkit import Wordlist
from pycldf import Dataset
from collections import defaultdict
from itertools import combinations
def get_colexifications(language, data):
    """
    Compute colexifications and add them to the data dictionary.

    Forms of the language are grouped by their word form; every group that
    expresses more than one concept yields one colexification per concept
    pair.

    Parameters
    ----------
    language
        Object exposing ``name`` and ``forms``. Each entry of ``forms`` must
        provide ``form`` (the word form) and ``concept`` (falsy when the form
        is not linked to a concept, otherwise an object exposing
        ``concepticon_gloss``).
    data
        Nested mapping updated in place, laid out as
        ``data[gloss_a, gloss_b][language.name] -> list of word forms``,
        e.g. ``defaultdict(lambda: defaultdict(list))``.

    Notes
    -----
    * ``data[gloss, gloss]`` is filled for every concept that has a form in
      the language, so it can serve as an "is this concept attested?" lookup
      regardless of whether the concept colexifies.
    * Pair keys are stored in sorted order, so a given pair of concepts
      always ends up under the same key for every language.
    """
    concepts_by_form = defaultdict(set)
    for form in language.forms:
        if form.concept:
            concepts_by_form[form.form].add(form.concept.concepticon_gloss)

    for word, glosses in concepts_by_form.items():
        # Set iteration order is arbitrary; sorting makes (A, B) the one
        # canonical key for a pair. ``key=str`` keeps the sort from failing
        # should a gloss be missing (``None``).
        ordered = sorted(glosses, key=str)
        # Record attestation for every concept, colexified or not.
        for gloss in ordered:
            data[gloss, gloss][language.name].append(word)
        # ``combinations`` yields nothing when only one concept is present.
        for gloss_a, gloss_b in combinations(ordered, r=2):
            data[gloss_a, gloss_b][language.name].append(word)
# Load both CLDF datasets (expected as local clones next to this script)
# into one wordlist.
wl = Wordlist([
    Dataset.from_metadata("ids/cldf/cldf-metadata.json"),
    Dataset.from_metadata("northeuralex/cldf/cldf-metadata.json"),
])

# cols[conceptA, conceptB][language name] -> list filled by
# get_colexifications
cols = defaultdict(lambda: defaultdict(list))
all_languages = []
for language in wl.languages:
    # restrict the analysis to two families plus two individually selected
    # languages (identified by Glottocode)
    if (language.family in ["Nakh-Daghestanian", "Turkic"]
            or language.glottocode in ["jude1256", "russ1263"]):
        print("[i] analyzing language {0}".format(language.name))
        all_languages.append(language.name)
        get_colexifications(language, cols)

# get all concepts involved in colexifications
colexified = [(cA, cB) for cA, cB in cols if cA != cB]

# write colexifications to file for individual languages
# cell values: "1" = pair colexified, "0" = both concepts attested but not
# colexified, "?" = at least one of the concepts is not attested
matrix = []
for cA, cB in colexified:
    row = [cA, cB]
    for language in all_languages:
        if cols[cA, cA][language] and cols[cB, cB][language]:
            row.append("1" if language in cols[cA, cB] else "0")
        else:
            row.append("?")
    matrix.append(row)

# Explicit UTF-8: language names and glosses may contain non-ASCII
# characters, and the platform default encoding is not guaranteed to
# handle them.
with open("colexifications.tsv", "w", encoding="utf-8") as f:
    f.write("ConceptA\tConceptB\t" + "\t".join(all_languages) + "\n")
    for row in matrix:
        f.write("\t".join(row) + "\n")
# Both targets name actions, not files; declaring them phony keeps make from
# skipping a recipe when a file or directory of the same name exists.
.PHONY: install colexifications

# Fetch the two CLDF datasets and install the Python dependency.
# The `test -d` guards make the target safe to re-run: git clone would
# otherwise fail once the checkout directory exists.
install:
	test -d ids || git clone https://github.com/intercontinental-dictionary-series/ids.git
	test -d northeuralex || git clone https://github.com/lexibank/northeuralex.git
	pip install cltoolkit

# Run the colexification script.
colexifications:
	python colexifications.py
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment