Created
February 4, 2020 15:04
-
-
Save aus10powell/7edf80488e30a4d6a173854968686a1b to your computer and use it in GitHub Desktop.
These are some code snippets for getting started with Snorkel.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# make sure all packages import | |
from snorkel.labeling import labeling_function | |
from snorkel.utils import to_int_label_array | |
from snorkel.labeling import PandasLFApplier | |
@labeling_function()
def child_in_msg(x):
    """Vote positive when the child's name is flagged as present in the message.

    Returns 1 if ``x.pat_first_in_msg`` equals 1, otherwise -1 (unknown).
    """
    if x.pat_first_in_msg == 1:
        return 1
    return -1
@labeling_function()
def child_is_older(x):
    """Vote on the example from the child's age, using fixed cut-offs.

    Returns:
        1 when ``x.age`` is below 15.5, 0 when it is 17 or above, and
        -1 (unknown) for ages in between or when the age is NaN.
    """
    age = x.age
    if age < 15.5:
        return 1
    if age >= 17:
        # NOTE(review): the original wrote this branch as ``16 <= age``, but it
        # was only reachable for ages of 17 and up because the preceding
        # ``15.5 <= age < 17`` branch caught everything below 17. The
        # effective cut-off is kept; confirm whether 16 was intended.
        return 0
    # Either 15.5 <= age < 17, or age is NaN (every comparison with NaN is
    # False). The original fell off the end and returned None for NaN.
    return -1
@labeling_function()
def email_domain(x):
    """Vote negative for Gmail addresses.

    The original author treats Gmail as a strong signal that the text was
    not written by a parent. Returns 0 if "gmail" occurs in
    ``x.email_domain``, otherwise -1 (unknown).
    """
    if "gmail" in x.email_domain:
        return 0
    return -1
@labeling_function()
def is_proxy(x):
    """Vote positive for proxy accounts and negative for everything else.

    Returns 1 if ``x.proxy_account_yn`` equals "Y", otherwise 0. This
    function never returns -1.
    """
    if x.proxy_account_yn == "Y":
        return 1
    return 0
@labeling_function()
def email_domain2(x):
    """Vote positive for Oracle email addresses.

    Per the original author's note, Oracle is not an email domain easily
    accessible to teens, so its presence is taken as a positive signal.

    Returns 1 if "oracle" occurs in ``x.email_domain``, otherwise -1 (unknown).
    """
    # The original condition was inverted (``"oracle" not in ...``): it voted
    # 1 on every address that was NOT an Oracle one and abstained on Oracle
    # addresses, which contradicts the rationale above.
    if "oracle" in x.email_domain:
        return 1
    return -1
# Assign labeling functions
# reading_fog, X_train and X_val are not defined in this snippet and must
# already exist when it runs.
lfs = [
    child_in_msg,
    child_is_older,
    email_domain,
    email_domain2,
    reading_fog,
    is_proxy,
]
applier = PandasLFApplier(lfs=lfs)
# One label matrix per data frame, produced in this order.
L_train, L_dev = (applier.apply(df=frame) for frame in (X_train, X_val))
# Check coverage for different labeling functions
# The labeling functions in this file use -1 for "unknown", so the
# column-wise mean of (L_train != -1) is the fraction of rows each
# function actually labeled.
coverage = (L_train != -1).mean(axis=0)
# Report every function under its own name. The original unpacked exactly six
# values and printed them under the placeholder labels "check_out" / "check",
# which broke as soon as ``lfs`` changed length and did not say which function
# a number belonged to.
for lf, fraction in zip(lfs, coverage):
    print(f"{lf.name} coverage: {fraction * 100:.1f}%")
# Full analysis (including coverage) of how well the labeling functions perform
from snorkel.labeling import LFAnalysis

lf_analysis = LFAnalysis(L_train, lfs)
# Left as a bare expression, as in the original, so a notebook cell displays it.
lf_analysis.lf_summary()
#####################
# Labeling functions that look at the message text
#####################
from snorkel.labeling import LabelingFunction | |
def keyword_lookup(x, keywords, label):
    """Return ``label`` if any keyword occurs in the lower-cased message text.

    Args:
        x: Data point exposing the message as the string ``x.msg_txt``.
        keywords: Iterable of substrings to search for. The text is
            lower-cased before matching, so keywords must be lower-case.
        label: Value to return when at least one keyword matches.

    Returns:
        ``label`` on a match, otherwise -1.

    Matching is by substring, so a short keyword such as "he" also matches
    inside longer words such as "the".
    """
    # Lower-case the message once instead of once per keyword.
    text = x.msg_txt.lower()
    if any(word in text for word in keywords):
        return label
    return -1
def make_keyword_lf(keywords, label=1):
    """Build a LabelingFunction that runs ``keyword_lookup`` for ``keywords``.

    Args:
        keywords: Non-empty sequence of substrings to search for. Only the
            first entry is used in the generated name, so two keyword lists
            that start with the same word produce functions with the same
            name.
        label: Value handed to ``keyword_lookup`` as ``label`` (default 1).

    Returns:
        A LabelingFunction named ``keyword_<first keyword>``.
    """
    lf_name = f"keyword_{keywords[0]}"
    lookup_resources = {"keywords": keywords, "label": label}
    return LabelingFunction(
        name=lf_name,
        f=keyword_lookup,
        resources=lookup_resources,
    )
#######################################################################
# Positive or unknown keywords
# In my case I was interested in giveaways that a parent, rather than the
# teen (who should have been communicating), was writing to the clinician.
#######################################################################
# "my son" mirrors keyword_my_daughter. The original used a bare "son", which
# contradicted the variable name and gave this function the same first keyword
# (and therefore the same generated name) as keyword_son below.
keyword_my_son = make_keyword_lf(keywords=["my son"])
keyword_my_daughter = make_keyword_lf(keywords=["my daughter"])
keyword_my_child = make_keyword_lf(keywords=["my child", "your child"])
keyword_son = make_keyword_lf(keywords=["son", "he", "him"])
keyword_referring = make_keyword_lf(keywords=["for us", "our"])
keyword_daughter = make_keyword_lf(keywords=["daughter", "she", "her"])
keyword_child = make_keyword_lf(keywords=["child", "parent", "child's"])
#######################################################################
# Negative or unknown keywords
#######################################################################
# Mentions of one's own parents vote 0 instead of the default 1.
parent_keywords = ["father", "mother", "mom", "dad", "my mom", "my dad"]
keyword_my_parent = make_keyword_lf(keywords=parent_keywords, label=0)
# Full set of labeling functions. The order fixes the column order of the
# label matrices built from this list.
lfs = [
    child_in_msg,
    # keyword_my_son is left disabled, as in the original list.
    keyword_my_daughter,
    keyword_my_child,
    keyword_son,
    keyword_daughter,
    keyword_child,
    child_is_older,
    reading_fog,
]
# The original list headed the remaining entries "Negative examples".
# NOTE(review): email_domain2 and keyword_referring can only vote 1 or
# abstain, so that heading does not describe every entry here.
lfs += [
    keyword_my_parent,
    email_domain,
    email_domain2,
    keyword_referring,
    is_proxy,
]
applier = PandasLFApplier(lfs=lfs)
# Build one label matrix per split, in this order.
# NOTE(review): the matrix built from X_test is stored as L_valid, while the
# one built from X_val is stored as L_dev; the names are kept as they were.
L_train, L_dev, L_valid = (
    applier.apply(df=frame) for frame in (X_train, X_val, X_test)
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment